In [1]:
import os
# Step one directory up from the notebook's location and show the result.
# Presumably this makes project-relative paths (config files, artifacts/)
# resolve from the repository root -- confirm against the repo layout.
# NOTE(review): the path is relative, so re-running this cell without
# restarting the kernel moves up one more directory each time.
os.chdir("../")
%pwd

'/Users/valentinmonney/Documents/data_science/python/MachineLearningProjects/Kidney-Disease-Classification-Deep-Learning-Project'

In [3]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable bundle of the settings consumed by the data ingestion stage.

    Instances are frozen: assigning to a field after construction raises.
    The field order below is also the positional constructor order.
    """

    # Working directory of the stage; used as the Kaggle download destination.
    root_dir: Path
    # Dataset reference handed to the Kaggle API when downloading.
    source_url: str
    # Location of the zip archive that is checked for and later opened.
    local_data_file: Path
    # Directory the archive contents are extracted into.
    unzip_dir: Path

In [4]:
from cnn_classifier.constants import *
from cnn_classifier.utils.common import read_yaml, create_directories

In [5]:
class ConfigurationManager:
    """Load the project configuration and build per-stage config objects."""

    def __init__(
        self,
        config_filepath: str = CONFIG_FILE_PATH,
        params_filepath: str = PARAMS_FILE_PATH,
    ) -> None:
        """Read both settings files and create the artifacts root directory.

        Args:
        ----
            config_filepath: Location of the main configuration file.
            params_filepath: Location of the parameters file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # Every stage writes below this directory, so create it up front.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the DataIngestionConfig from the `data_ingestion` section.

        The stage's root directory is created as a side effect before the
        config object is returned.
        """
        ingestion_section = self.config.data_ingestion
        stage_root = ingestion_section.root_dir

        create_directories([stage_root])

        return DataIngestionConfig(
            root_dir=stage_root,
            source_url=ingestion_section.source_url,
            local_data_file=ingestion_section.local_data_file,
            unzip_dir=ingestion_section.unzip_dir,
        )

In [7]:
"""DataIngestion class, which download and extract data for the CNN classifier."""

import zipfile

# from cnn_classifier.entity.config_entity import DataIngestionConfig
from cnn_classifier.utils.common import load_json, get_size
from cnn_classifier import logger
from pathlib import Path
import kaggle

# Read the Kaggle credentials file from the user's home directory.
# The extracted username/key are not referenced again in the visible cells.
kaggle_config_path = Path.home().joinpath(".kaggle", "kaggle.json")
config = load_json(kaggle_config_path)
kaggle_username = config.username
kaggle_key = config.key

[2023-10-18 11:55:17,471: INFO: common] json file loaded succesfully from: /Users/valentinmonney/.kaggle/kaggle.json


In [12]:
class DataIngestion:
    """Fetch the dataset archive through the Kaggle API and unpack it."""

    def __init__(self, config: DataIngestionConfig) -> None:
        """Keep a reference to the ingestion settings used by the methods."""
        self.config = config

    def download_file(self) -> None:
        """Download the dataset archive unless the local zip already exists.

        When `local_data_file` is present, only its size is logged.
        Otherwise the dataset named by `source_url` is downloaded (still
        zipped) into `root_dir`.
        """
        archive = Path(self.config.local_data_file)

        if archive.exists():
            logger.info(f"zip data already exists with size: {get_size(archive)}")
            return

        # NOTE(review): assumes the file Kaggle writes into root_dir is the
        # same path as local_data_file -- confirm against the config.
        kaggle.api.dataset_download_files(
            self.config.source_url,
            path=self.config.root_dir,
            unzip=False,
        )
        logger.info("zip data downloaded!")

    def extract_zip_file(self) -> None:
        """Unpack `local_data_file` into `unzip_dir`.

        The target directory (including missing parents) is created first.
        """
        target_dir = Path(self.config.unzip_dir)
        target_dir.mkdir(parents=True, exist_ok=True)

        with zipfile.ZipFile(self.config.local_data_file, "r") as archive:
            archive.extractall(target_dir)

In [13]:
# Label inserted into the "started"/"completed" log messages for this stage.
STAGE_NAME = "Data Ingestion"


class DataIngestionTrainingPipeline:
    """Pipeline stage that downloads the dataset archive and extracts it."""

    def __init__(self) -> None:
        """Create the stage; it keeps no state of its own."""

    def main(self) -> None:
        """Build the ingestion config, then download and unzip the data."""
        ingestion_settings = ConfigurationManager().get_data_ingestion_config()
        ingestion = DataIngestion(config=ingestion_settings)

        # Order matters: the archive must be present before extraction.
        ingestion.download_file()
        ingestion.extract_zip_file()


# Run the data ingestion stage, logging a banner before and after it.
try:
    logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
    obj = DataIngestionTrainingPipeline()
    obj.main()
    logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
except Exception as e:
    # Record the failure through the project logger, then re-raise so the
    # cell still fails visibly instead of swallowing the error.
    logger.exception(e)
    raise

[2023-10-18 12:17:34,343: INFO: 425422528] >>>>>> stage Data Ingestion started <<<<<<
[2023-10-18 12:17:34,350: INFO: common] yaml file: config/config.yaml loaded successfully
[2023-10-18 12:17:34,353: INFO: common] yaml file: params.yaml loaded successfully
[2023-10-18 12:17:34,354: INFO: common] created directory at: artifacts
[2023-10-18 12:17:34,355: INFO: common] created directory at: artifacts/data_ingestion
[2023-10-18 12:18:15,162: INFO: 1470172067] zip data downloaded!
[2023-10-18 12:18:28,469: INFO: 425422528] >>>>>> stage Data Ingestion completed <<<<<<

