## Data Ingestion

In [1]:
import os

In [2]:
os.chdir('../../')

In [3]:
%pwd

'/home/utpal108/dev/Upwork/Projects/Diabetic-Retinopathy-Prediction'

In [4]:
# Config Entity
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class MLDataIngestionConfig:
    """Immutable settings consumed by the data-ingestion stage.

    Instances are frozen: assigning to a field after construction raises
    ``dataclasses.FrozenInstanceError``.
    """

    # Working directory of the ingestion stage (created by the configuration manager).
    root_dir: Path
    # URL handed to the dataset downloader.
    ml_data_source_url: str
    # Directory the raw download is written to.
    raw_dataset_dir: Path
    # Directory created to hold the staged dataset.
    ml_dataset_dir: Path
    # Destination passed to ``shutil.copy`` for each CSV found in the raw download.
    ml_data_path: Path

In [5]:
from diabeticRetinopathy.constants import *
from diabeticRetinopathy.utils import read_yaml, create_directories

In [6]:
# Configuration Manager
class ConfigurationManager:
    """Read the project's YAML config/params files and build stage configs.

    On construction both YAML files are loaded via ``read_yaml`` and
    ``create_directories`` is called for the configured ``artifacts_root``.
    """

    def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH):
        """Load the config and params files.

        Args:
            config_filepath: Location of the main config YAML.
            params_filepath: Location of the params YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> MLDataIngestionConfig:
        """Build the data-ingestion config from the ``data_ingestion`` section.

        Calls ``create_directories`` for the stage's ``root_dir`` before
        returning.

        Returns:
            An ``MLDataIngestionConfig`` whose directory/file entries are
            wrapped in ``Path``; the source URL is passed through unchanged.
        """
        section = self.config.data_ingestion
        create_directories([section.root_dir])

        return MLDataIngestionConfig(
            root_dir=Path(section.root_dir),
            ml_data_source_url=section.ml_data_source_url,
            raw_dataset_dir=Path(section.raw_dataset_dir),
            ml_dataset_dir=Path(section.ml_dataset_dir),
            ml_data_path=Path(section.ml_data_path),
        )

In [7]:
import shutil
import warnings

import opendatasets as od
from PIL import Image

In [8]:
# Components
class DataIngestion:
    """Download the raw dataset and stage its CSV file(s) for later steps.

    Args:
        config: Paths and source URL for this stage.
    """

    def __init__(self, config: MLDataIngestionConfig):
        self.config = config

    def download_dataset(self):
        """Download the source dataset into ``raw_dataset_dir``.

        The download is skipped when ``ml_dataset_dir`` already exists and
        contains at least one entry, so re-running the stage does not fetch
        the data again.
        """
        dataset_dir = self.config.ml_dataset_dir
        if os.path.exists(dataset_dir) and os.listdir(dataset_dir):
            return

        create_directories([self.config.raw_dataset_dir])
        # Download the dataset
        od.download(self.config.ml_data_source_url,data_dir=self.config.raw_dataset_dir)

    def preprocess_dataset(self):
        """Copy CSV files from the raw download to ``ml_data_path``.

        Does nothing when ``raw_dataset_dir`` does not exist. Otherwise
        ``ml_dataset_dir`` is created, the raw directory tree is walked, and
        every file whose name ends in ``.csv`` (case-insensitive) is copied
        to ``ml_data_path``. The raw directory is removed only after at least
        one CSV has been copied; if none is found a ``RuntimeWarning`` is
        emitted and the raw download is left in place so it is not lost.

        Raises:
            OSError: Propagated unchanged from ``shutil.copy`` /
                ``shutil.rmtree`` (the raw directory is then left untouched
                or partially removed, as decided by those calls).
        """
        import warnings  # local import keeps this block self-contained

        raw_dir = self.config.raw_dataset_dir
        if not os.path.exists(raw_dir):
            return

        create_directories([self.config.ml_dataset_dir])

        copied = 0
        for root, _dirs, files in os.walk(raw_dir):
            for file in files:
                # Only CSV files are staged (extension matched case-insensitively).
                if not file.lower().endswith('.csv'):
                    continue
                # NOTE(review): every CSV goes to the same destination; if
                # ml_data_path is a file path, a later CSV overwrites an
                # earlier one — confirm the source only ships one CSV.
                shutil.copy(os.path.join(root, file), self.config.ml_data_path)
                copied += 1

        if copied == 0:
            # Deleting the download here would discard the only copy of the
            # data while leaving the ML dataset directory empty.
            warnings.warn(
                f"No CSV file found under {raw_dir}; raw dataset directory was kept.",
                RuntimeWarning,
                stacklevel=2,
            )
            return

        shutil.rmtree(raw_dir)

In [9]:
# Pipeline
# Run the ingestion stage end to end: load configuration, download the raw
# dataset (skipped when the ML dataset directory is already populated), then
# stage the CSV data. Any exception propagates to the notebook unchanged, so
# no try/except wrapper is needed.
config = ConfigurationManager()
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion = DataIngestion(config=data_ingestion_config)
data_ingestion.download_dataset()
data_ingestion.preprocess_dataset()

2024-04-18 08:56:10,954 : diabeticRetinopathy.logger - INFO - YAML file: config/config.yaml loaded successfully
2024-04-18 08:56:10,958 : diabeticRetinopathy.logger - INFO - YAML file: params.yaml loaded successfully
2024-04-18 08:56:10,960 : diabeticRetinopathy.logger - INFO - created directory at: artifacts
2024-04-18 08:56:10,961 : diabeticRetinopathy.logger - INFO - created directory at: artifacts/data_ingestion
2024-04-18 08:56:10,963 : diabeticRetinopathy.logger - INFO - created directory at: artifacts/data_ingestion/raw_dataset


Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username:Your Kaggle Key:Dataset URL: https://www.kaggle.com/datasets/mrsimple07/diabetes-prediction
Downloading diabetes-prediction.zip to artifacts/data_ingestion/raw_dataset/diabetes-prediction


100%|██████████| 64.2k/64.2k [00:00<00:00, 345kB/s]
2024-04-18 08:56:23,375 : diabeticRetinopathy.logger - INFO - created directory at: artifacts/data_ingestion/ml_dataset



