In [1]:
import os

In [2]:
%pwd

'c:\\Users\\syama talari\\OneDrive\\Desktop\\model_deployement_mlops\\research'

In [3]:
# Switch the working directory to the parent of the notebook's folder; the
# relative paths seen later in the log (configs\config.yaml, artifacts) are
# presumably resolved from there.
# NOTE(review): not idempotent — every re-run of this cell climbs one more
# level, so run it exactly once per kernel session.
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\syama talari\\OneDrive\\Desktop\\model_deployement_mlops'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class TrainingPipelineConfig:
    """Immutable bundle of settings consumed by the data-ingestion stage.

    Instances are frozen: assigning to a field after construction raises
    ``dataclasses.FrozenInstanceError``.
    """

    # Working directory of the ingestion stage (created by DataIngestion).
    root_dir: Path
    # Where the data comes from: a remote URL or a path to a local file.
    source_URL: str
    # Destination the fetched/copied file is written to.
    local_data_file: Path
    # Directory zip archives are extracted into.
    unzip_dir: Path

In [12]:
from mlproj.constants import *
from mlproj.utils.common import read_yaml

import os
import shutil
import urllib.request as request
import zipfile
from mlproj import logger
from mlproj.utils.common import get_size

In [7]:
def create_directory(path_to_directory: Path) -> None:
    """Make sure ``path_to_directory`` exists, then log the location.

    Missing parent directories are created as well, and a directory that is
    already present is left untouched (no error is raised for it).

    Args:
        path_to_directory: Directory to create.
    """
    target_dir = Path(path_to_directory)
    target_dir.mkdir(parents=True, exist_ok=True)
    logger.info(f"Directory created at: {path_to_directory}")

In [8]:
class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath: Path = CONFIG_FILE_PATH,
        params_filepath: Path = PARAMS_FILE_PATH,
        schema_filepath: Path = SCHEMA_FILE_PATH,
    ):
        """Read the config, params and schema files and create the artifacts root.

        Args:
            config_filepath: YAML file holding ``artifacts_root`` and the
                ``data_ingestion`` section.
            params_filepath: YAML file stored on ``self.params``.
            schema_filepath: YAML file stored on ``self.schema``.
        """
        # read_yaml presumably returns an attribute-accessible mapping, since
        # the values are read below with dotted access — TODO confirm.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = Path(self.config.artifacts_root)
        create_directory(artifacts_root)

    def get_data_ingestion_config(self) -> TrainingPipelineConfig:
        """Build the ingestion settings from the ``data_ingestion`` config section.

        Returns:
            A ``TrainingPipelineConfig`` whose path-like entries are wrapped in
            ``Path``; ``source_URL`` is passed through unchanged.
        """
        ingestion_section = self.config.data_ingestion

        return TrainingPipelineConfig(
            root_dir=Path(ingestion_section.root_dir),
            source_URL=ingestion_section.source_URL,
            local_data_file=Path(ingestion_section.local_data_file),
            unzip_dir=Path(ingestion_section.unzip_dir),
        )


In [13]:
class DataIngestion:
    """Fetches the raw data file described by a ``TrainingPipelineConfig``.

    The source is either downloaded (HTTP/HTTPS URL) or copied (existing local
    path) to ``config.local_data_file``. Zip archives are then extracted into
    ``config.unzip_dir``; any other file type is left where it was written.
    """

    def __init__(self, config: TrainingPipelineConfig):
        """Store the config and make sure the stage's root directory exists."""
        self.config = config
        # Created here as well so the class works even when nothing else has
        # prepared the directory beforehand.
        os.makedirs(self.config.root_dir, exist_ok=True)

    @staticmethod
    def _is_zip(file_path: Path) -> bool:
        """Return True when the file name ends in ``.zip`` (any letter case)."""
        return Path(file_path).suffix.lower() == ".zip"

    def download_or_copy_data(self) -> Path:
        """Place the source data at ``config.local_data_file``.

        * ``http://`` / ``https://`` sources are downloaded unless the
          destination file already exists.
        * Any other source is treated as a local path and copied (overwriting
          an existing destination).

        Returns:
            The destination path (``config.local_data_file``).

        Raises:
            FileNotFoundError: The source is neither an HTTP(S) URL nor an
                existing local path.
            Exception: Whatever ``urllib.request.urlretrieve`` raises on a
                failed download is propagated unchanged.
        """
        source_url = self.config.source_URL
        dest_path = self.config.local_data_file

        # Match the full scheme prefix: a bare 'http' check would also capture
        # local files such as "http_export.csv".
        if source_url.lower().startswith(("http://", "https://")):
            if not os.path.exists(dest_path):
                os.makedirs(dest_path.parent, exist_ok=True)

                # Download to a sibling ".part" file and move it into place only
                # on success. Otherwise an interrupted transfer would leave a
                # truncated file that later runs treat as "already exists".
                partial_path = dest_path.with_name(dest_path.name + ".part")
                try:
                    request.urlretrieve(url=source_url, filename=partial_path)
                    os.replace(partial_path, dest_path)
                finally:
                    # Only still present when the download or the move failed.
                    if os.path.exists(partial_path):
                        os.remove(partial_path)
                logger.info(f"Remote file: {dest_path} downloaded successfully.")
            else:
                logger.info("Remote file already exists.")

        elif os.path.exists(source_url):
            os.makedirs(dest_path.parent, exist_ok=True)
            shutil.copy(source_url, dest_path)
            logger.info(f"Local file copied from {source_url} to {dest_path}")

        else:
            logger.error(f"Source URL/Path is invalid or does not exist: {source_url}")
            raise FileNotFoundError(f"Data source not found at {source_url}")

        return dest_path

    def extract_if_zip(self, file_path: Path, extract_to: Path) -> None:
        """Extract ``file_path`` into ``extract_to`` when it is a zip archive.

        Files without a ``.zip`` suffix (compared case-insensitively) are left
        untouched and only a log line is written.

        Args:
            file_path: The ingested file.
            extract_to: Directory to extract into; created if missing.
        """
        if self._is_zip(file_path):
            os.makedirs(extract_to, exist_ok=True)
            with zipfile.ZipFile(file_path, 'r') as zip_ref:
                zip_ref.extractall(extract_to)
            logger.info(f"File extracted to: {extract_to}")
        else:
            logger.info(f"File {file_path} is not a zip; skipping extraction step.")

    def initiate_data_ingestion(self) -> Path:
        """Run the full stage: fetch the data, then extract it if it is a zip.

        Returns:
            ``config.unzip_dir`` when an archive was extracted; otherwise the
            path of the ingested file itself (``config.local_data_file``),
            because nothing is written to ``unzip_dir`` in that case.
        """
        ingested_file_path = self.download_or_copy_data()
        self.extract_if_zip(ingested_file_path, self.config.unzip_dir)

        if self._is_zip(ingested_file_path):
            return self.config.unzip_dir
        return ingested_file_path

In [14]:
# Drive the ingestion stage end to end: load the YAML configuration, build the
# stage settings, then fetch the data and extract it when it is a zip archive.
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()

    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.initiate_data_ingestion()
except Exception as e:
    # NOTE(review): the error is logged with its traceback but not re-raised,
    # so this cell finishes "successfully" even when ingestion failed — confirm
    # that later cells are meant to continue in that case.
    logger.exception(e)

[2025-10-28 22:36:40,218: INFO: common]: YAML file: configs\config.yaml loaded successfully]
[2025-10-28 22:36:40,221: INFO: common]: YAML file: params.yaml loaded successfully]
[2025-10-28 22:36:40,224: INFO: common]: YAML file: schema.yaml loaded successfully]
[2025-10-28 22:36:40,227: INFO: 3425958059]: Directory created at: artifacts]
[2025-10-28 22:36:41,629: INFO: 2876475998]: Remote file: artifacts\data_ingestion\data.zip downloaded successfully.]
[2025-10-28 22:36:41,671: INFO: 2876475998]: File extracted to: artifacts\data_ingestion]
