In [1]:
import os
# Move the working directory two levels up from the notebook's location.
# NOTE(review): presumably this is meant to land on the project root so that
# relative paths such as config/config.yaml resolve — confirm.
# NOTE(review): a relative chdir is not idempotent; re-running this cell without
# restarting the kernel climbs two more levels each time.
os.chdir('../../')
# Display the resulting working directory as the cell output.
%pwd

'/home/utpal108/dev/Upwork/Projects/Diabetic-Retinopathy-Prediction'

In [2]:
from dataclasses import dataclass
from pathlib import Path

In [3]:
@dataclass(frozen=True)
class MLDataPreprocessingConfig:
    """Immutable (frozen) pair of paths consumed by the data preprocessing stage."""
    # CSV file that the preprocessing stage loads with pd.read_csv.
    data_path: Path
    # Destination passed to save_object for the fitted preprocessor.
    preprocessor_path: Path

In [4]:
import pandas as pd
from diabeticRetinopathy.constants import *
from diabeticRetinopathy.utils import read_yaml, save_object, create_directories

In [5]:
# Configuration Manager
class ConfigurationManager:
    """Load the project's YAML settings and hand out per-stage config objects."""

    def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH):
        """Read the config and params YAML files.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

    def get_ml_data_preprocessing_config(self) -> MLDataPreprocessingConfig:
        """Assemble the settings for the data preprocessing stage.

        Ensures the preprocessor's root directory exists (via
        ``create_directories``) before building the config object.

        Returns:
            MLDataPreprocessingConfig holding the dataset path (taken from the
            ``data_ingestion`` section) and the preprocessor output path
            (taken from the ``data_preprocessor`` section).
        """
        preprocessor_section = self.config.data_preprocessor

        # Make sure the artifact directory is there before anything is saved into it.
        create_directories([preprocessor_section.root_dir])

        return MLDataPreprocessingConfig(
            data_path=Path(self.config.data_ingestion.ml_data_path),
            preprocessor_path=Path(preprocessor_section.preprocessor_path),
        )

In [6]:
from sklearn.impute import SimpleImputer # For Handling Missing Values
from sklearn.preprocessing import StandardScaler # For Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # For Ordinal Encoding
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [7]:
# Components
class DataPreprocessing:
    """Split the dataset, fit and save the feature preprocessor, and balance the training data.

    The preprocessor is fitted on the training split only, so the object handed
    to ``save_object`` carries training statistics and can be reused unchanged
    on unseen data.
    """

    def __init__(self, config: MLDataPreprocessingConfig):
        """Store the stage configuration (dataset path and preprocessor output path)."""
        self.config = config

    def _data_preprocessor(self, numerical_features, categorical_features):
        """Build the unfitted column-wise preprocessing transformer.

        Args:
            numerical_features: Column names routed through median imputation
                followed by standard scaling.
            categorical_features: Column names routed through most-frequent
                imputation, ordinal encoding and standard scaling.

        Returns:
            A ``ColumnTransformer`` combining both pipelines.
        """
        # Numerical Pipeline
        num_pipeline = Pipeline(
            steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler()),
            ]
        )

        # Categorical Pipeline
        cat_pipeline = Pipeline(
            steps=[
                ('imputer', SimpleImputer(strategy='most_frequent')),
                # The encoder is fitted on the training split only, so held-out data
                # may contain categories it never saw; map those to -1 instead of raising.
                ('ordinalencoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
                ('scaler', StandardScaler()),
            ]
        )

        return ColumnTransformer([
            ('num_pipeline', num_pipeline, numerical_features),
            ('cat_pipeline', cat_pipeline, categorical_features),
        ])

    def initiate_data_preprocessing(self, test_size=0.30, random_state=42) -> tuple:
        """Run the preprocessing stage end to end.

        Steps: read the CSV at ``config.data_path``; treat the last column as
        the target and all others as features; split into train/test; fit the
        preprocessor on the training split and apply it to both splits; save
        the fitted preprocessor to ``config.preprocessor_path``; oversample the
        training split with SMOTE.

        Args:
            test_size: Fraction of rows held out for testing.
            random_state: Seed used for both the split and SMOTE so that
                repeated runs give the same result.

        Returns:
            ``(X_train, X_test, y_train, y_test)`` where ``X_train``/``y_train``
            are preprocessed and oversampled, and ``X_test``/``y_test`` are
            preprocessed but otherwise untouched.

        Raises:
            Any exception from reading the file, fitting, resampling or saving
            propagates unchanged.
        """
        df = pd.read_csv(self.config.data_path)

        # Split into independent and dependent features
        X = df.iloc[:, :-1]
        y = df.iloc[:, -1]

        # Segregating Numerical and Categorical features
        numerical_features = [feature for feature in X.columns if X[feature].dtype != object]
        categorical_features = [feature for feature in X.columns if X[feature].dtype == object]

        preprocessor = self._data_preprocessor(
            numerical_features=numerical_features,
            categorical_features=categorical_features,
        )

        # Split BEFORE any fitting or resampling so nothing learned from the
        # held-out rows can influence the training data.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state, shuffle=True
        )

        # Fit on the training split only; the test split is merely transformed.
        # Re-fitting on the test split would overwrite the learned statistics and
        # make the saved preprocessor reflect test data.
        X_train = preprocessor.fit_transform(X_train)
        X_test = preprocessor.transform(X_test)

        save_object(self.config.preprocessor_path, preprocessor)

        # Data over sampling: training split only, after imputation/encoding, so
        # the test set keeps its real class distribution and contains no synthetic rows.
        oversample = SMOTE(random_state=random_state)
        X_train, y_train = oversample.fit_resample(X_train, y_train)

        return X_train, X_test, y_train, y_test
    

In [8]:
# Pipeline
# Run the preprocessing stage: load the configuration, build the component,
# then execute it. Any exception raised along the way propagates unchanged.
config = ConfigurationManager()
data_preprocessing_config = config.get_ml_data_preprocessing_config()
data_preprocessing = DataPreprocessing(config=data_preprocessing_config)
X_train, X_test, y_train, y_test = data_preprocessing.initiate_data_preprocessing()

2024-04-17 21:40:08,447 : diabeticRetinopathy.logger - INFO - YAML file: config/config.yaml loaded successfully
2024-04-17 21:40:08,453 : diabeticRetinopathy.logger - INFO - YAML file: params.yaml loaded successfully
2024-04-17 21:40:08,455 : diabeticRetinopathy.logger - INFO - created directory at: artifacts/preprocessor
