In [1]:
import os

In [2]:
# Query the working directory before changing it (the recorded cell output
# shows a single path, which matches the final %pwd below).
%pwd
# NOTE(review): machine-specific absolute path. Presumably this is the project
# root, so that the later `src.*` imports and config file resolve -- confirm,
# and consider making it configurable instead of hard-coding a user directory.
os.chdir("/Users/vanshbansal/Desktop/Road Accidents")
# Query the working directory again to confirm the change took effect.
%pwd

'/Users/vanshbansal/Desktop/Road Accidents'

In [3]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataIngestionConfig:
    """Settings for the data-ingestion step.

    Attributes:
        root_dir: Directory entry of the ingestion section of the config.
        curr_data_path: Path of the CSV file that ``DataIngestion`` reads.
        store_data_path: Path of the CSV file that ``DataIngestion`` writes.

    The fields are annotated as ``Path``; the dataclass itself performs no
    conversion or validation of the values it is given.
    """

    root_dir: Path
    curr_data_path: Path
    store_data_path: Path

In [4]:
from src.utils.common import read_yaml , create_directories
from src.constants import CONFIG_FILE_PATH

class ConfigurationManager:
    """Loads the project configuration and builds per-step config objects."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH):
        """Read the configuration file and prepare the artifacts root.

        Args:
            config_filepath: Location of the YAML configuration to load with
                ``read_yaml``; defaults to ``CONFIG_FILE_PATH``.
        """
        self.config = read_yaml(config_filepath)

        # The top-level artifacts directory is handed to create_directories
        # as soon as the configuration has been loaded.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the ``DataIngestionConfig`` from the ``data_ingestion`` section.

        The section's ``root_dir`` is passed to ``create_directories`` before
        the config object is assembled. Values are forwarded exactly as they
        appear in the loaded configuration.

        Returns:
            A ``DataIngestionConfig`` populated with ``root_dir``,
            ``curr_data_path`` and ``store_data_path``.
        """
        section = self.config.data_ingestion
        create_directories([section.root_dir])

        return DataIngestionConfig(
            root_dir=section.root_dir,
            curr_data_path=section.curr_data_path,
            store_data_path=section.store_data_path,
        )


In [7]:
import os
from src import logger
import pandas as pd

class DataIngestion:
    """Writes a row-limited, shuffled subset of a large CSV to a new CSV file."""

    def __init__(self, config: "DataIngestionConfig"):
        """Keep the ingestion settings for later use.

        Args:
            config: Object exposing ``curr_data_path`` (CSV to read) and
                ``store_data_path`` (CSV to write).
        """
        # A plain attribute assignment needs no try/except wrapper.
        self.config = config

    def store_data(self, sample_size=600000, chunk_size=10000, random_state=None):
        """Copy up to ``sample_size`` rows of the source CSV into the target CSV.

        The source file is streamed in chunks of ``chunk_size`` rows. From each
        chunk ``min(rows still needed, rows in chunk)`` rows are drawn with
        ``DataFrame.sample``; reading stops as soon as the quota is met. The
        collected rows are concatenated and written without the index column.

        NOTE(review): whenever the remaining quota is at least the chunk
        length, the whole chunk is taken (in shuffled order). With the default
        arguments the output is therefore the first ``sample_size`` rows of the
        file, shuffled within each chunk -- not a uniform sample of the whole
        file. Confirm that this is the intended behaviour.

        Args:
            sample_size: Maximum number of rows to write. Defaults to 600000.
            chunk_size: Rows read per chunk; lower it if memory is tight.
                Defaults to 10000.
            random_state: Optional seed (or any value accepted by
                ``DataFrame.sample``) for reproducible shuffling. ``None``
                keeps the unseeded behaviour.

        Raises:
            ValueError: If ``sample_size`` is negative or ``chunk_size`` is
                smaller than 1.
        """
        # Local import: numpy is not imported at file level in this notebook.
        import numpy as np

        if sample_size < 0:
            raise ValueError(f"sample_size must be >= 0, got {sample_size}")
        if chunk_size < 1:
            raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

        source_path = self.config.curr_data_path
        target_path = self.config.store_data_path

        # Wrap an integer seed in one generator shared by all chunks, so that
        # successive chunks do not replay the same random permutation.
        rng = random_state
        if isinstance(random_state, int):
            rng = np.random.RandomState(random_state)

        remaining = sample_size
        sampled_chunks = []

        # The context manager closes the underlying file handle even when the
        # loop stops early after the quota has been reached.
        with pd.read_csv(source_path, chunksize=chunk_size) as reader:
            for chunk in reader:
                picked = chunk.sample(n=min(remaining, len(chunk)), random_state=rng)
                sampled_chunks.append(picked)
                remaining -= len(picked)
                if remaining <= 0:
                    break

        sampled_data = pd.concat(sampled_chunks)

        # Make sure the destination directory exists before writing.
        target_dir = os.path.dirname(os.fspath(target_path))
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        sampled_data.to_csv(target_path, index=False)

In [8]:
# Run the ingestion step end to end: load the configuration, build the
# ingestion settings, then write the sampled CSV.
# No try/except wrapper: catching an exception only to re-raise it changes
# nothing, so failures simply propagate with their original traceback.
config = ConfigurationManager()
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion = DataIngestion(config=data_ingestion_config)
data_ingestion.store_data()

[2025-01-11 03:31:03,999: INFO: common: yaml file: config.yaml loaded successfully]


[2025-01-11 03:31:04,008: INFO: common: created directory at: artifacts]
[2025-01-11 03:31:04,013: INFO: common: created directory at: artifacts/data_ingestion]
