In [2]:
import os

In [3]:
# Work from the project root so that relative paths (config.yaml, artifacts/, the
# src package) resolve. The location can be overridden with the ROAD_ACCIDENTS_ROOT
# environment variable so the notebook is not tied to a single machine; the
# original path is kept as the default.
os.chdir(os.environ.get("ROAD_ACCIDENTS_ROOT", "/Users/vanshbansal/Desktop/Road Accidents"))

In [4]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class PreprocessingConfig:
    """Filesystem locations consumed and produced by the preprocessing stage.

    The field order is part of the interface (positional construction works),
    so it must not be rearranged.

    NOTE(review): ConfigurationManager passes these values through from the
    YAML configuration without conversion, so at runtime they may be plain
    strings rather than ``Path`` objects -- confirm against ``read_yaml``.
    """

    root_dir: Path              # stage output directory
    preprocesser_obj: Path      # where the fitted feature pipeline is dumped
    train_file_path: Path       # input CSV holding the training split
    test_file_path: Path        # input CSV holding the test split
    preprocessed_x_train: Path  # np.save target: transformed training features
    preprocessed_x_test: Path   # np.save target: transformed test features
    preprocessed_y_train: Path  # np.save target: encoded training labels
    preprocessed_y_test: Path   # np.save target: encoded test labels

In [8]:
from src.utils.common import read_yaml , create_directories
from src.constants import CONFIG_FILE_PATH

class ConfigurationManager:
    """Reads the project configuration file and builds stage-specific configs."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH):
        # Parse the YAML once; getters read from the parsed object.
        self.config = read_yaml(config_filepath)

    def get_preprocessing_config(self) -> PreprocessingConfig:
        """Build the ``PreprocessingConfig`` from the ``preprocessing`` section.

        The section's ``root_dir`` is handed to ``create_directories`` before
        the config object is assembled.
        """
        section = self.config.preprocessing
        create_directories([section.root_dir])

        return PreprocessingConfig(
            root_dir=section.root_dir,
            preprocesser_obj=section.preprocesser_obj,
            train_file_path=section.train_file_path,
            test_file_path=section.test_file_path,
            preprocessed_x_train=section.preprocessed_x_train,
            preprocessed_x_test=section.preprocessed_x_test,
            preprocessed_y_train=section.preprocessed_y_train,
            preprocessed_y_test=section.preprocessed_y_test,
        )

In [10]:
import pandas as pd
import numpy as np
from src import logger
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
import joblib

class Preprocessing:
    """Fit the feature and target preprocessing on the training split and persist the results.

    The feature pipeline (impute + one-hot encode, Yeo-Johnson power transform,
    standard scaling, PCA) is fitted on the training features only and then
    applied to the test features. Transformed arrays are written with
    ``np.save`` and the fitted pipeline with ``joblib.dump`` to the locations
    given by the config.
    """

    def __init__(self, config: "PreprocessingConfig"):
        """Store the configuration holding the input and output paths."""
        self.config = config

    @staticmethod
    def _column_groups(x_train):
        """Return ``(numeric columns to impute, all categorical columns)`` for ``x_train``.

        Both lists have a deterministic order: the numeric list follows the
        order of the weather column list below, the categorical list follows
        the column order of ``x_train``.
        """
        num_cols = x_train.select_dtypes(include=['number']).columns.tolist()
        cat_cols = x_train.select_dtypes(exclude=['number']).columns.tolist()

        # Weather columns whose missing values are imputed explicitly.
        keep_cols = [
            "Temperature(F)",
            "Humidity(%)",
            "Pressure(in)",
            "Visibility(mi)",
            "Wind_Direction",
            "Wind_Speed(mph)",
            "Weather_Condition"
        ]

        # A list comprehension (rather than a set intersection) keeps the column
        # order stable across interpreter runs, so the fitted pipeline is reproducible.
        numeric = set(num_cols)
        null_num_cols = [col for col in keep_cols if col in numeric]
        return null_num_cols, cat_cols

    def transform_x(self, x_train, x_test):
        """Fit the feature pipeline on ``x_train``, transform both splits and save everything.

        Args:
            x_train: training features as a DataFrame (columns are selected by name).
            x_test: test features with the same columns as ``x_train``.

        Side effects:
            Saves the transformed arrays to ``preprocessed_x_train`` /
            ``preprocessed_x_test`` and the fitted pipeline to ``preprocesser_obj``.
        """
        # Column groups are derived from the features actually being fitted, not
        # from a second read of the training CSV (which still contains the target).
        null_num_cols, cat_cols = self._column_groups(x_train)

        imp_enc = ColumnTransformer(
            transformers=[
                # Median-impute the numeric weather columns.
                ("num_missing", SimpleImputer(strategy="median"), null_num_cols),
                # Mode-impute, then one-hot encode every non-numeric column.
                ("cat_imputer_ohe", Pipeline(steps=[
                    ("cat_imputer", SimpleImputer(strategy="most_frequent")),
                    ("ohe_trf", OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
                ]), cat_cols),
            ],
            remainder='passthrough'  # remaining columns are forwarded untouched
        )

        # NOTE(review): the recorded run of this notebook printed NumPy warnings
        # from a squared-sum computation; the power transform on unscaled
        # passthrough columns is a plausible source -- confirm.
        yj_trf = PowerTransformer()

        # NOTE(review): this ColumnTransformer uses the default remainder='drop',
        # so only the first 40 columns survive this step and every later column
        # is discarded before PCA -- confirm that this is intended.
        scaler_trf = ColumnTransformer([
            ("scaler_trf", StandardScaler(), slice(0, 40))
        ])

        pca = PCA(n_components=15)

        pre_pipe = Pipeline([
            ("preprocessor", imp_enc),
            ("yj_trf", yj_trf),
            ("scaler_trf", scaler_trf),
            ("pca", pca)
        ])

        # Fit on the training split only; the test split is transformed with the fitted pipeline.
        x_train_trf = pre_pipe.fit_transform(x_train)
        x_test_trf = pre_pipe.transform(x_test)

        np.save(self.config.preprocessed_x_train, x_train_trf)
        np.save(self.config.preprocessed_x_test, x_test_trf)
        joblib.dump(pre_pipe, self.config.preprocesser_obj)

    def transform_y(self, y_train, y_test):
        """Label-encode the targets (encoder fitted on ``y_train``) and save both arrays.

        NOTE(review): the fitted LabelEncoder itself is not persisted here, so
        mapping predictions back to the original labels needs a re-fit -- confirm
        whether that is acceptable downstream.
        """
        le = LabelEncoder()
        le.fit(y_train)

        y_train_enc = le.transform(y_train)
        y_test_enc = le.transform(y_test)

        np.save(self.config.preprocessed_y_train, y_train_enc)
        np.save(self.config.preprocessed_y_test, y_test_enc)

    def start_preprocessing(self):
        """Load the train/test CSVs, split off the ``Severity`` target and run both transforms."""
        train = pd.read_csv(self.config.train_file_path)
        test = pd.read_csv(self.config.test_file_path)

        x_train = train.drop(columns=['Severity'])
        y_train = train['Severity']
        x_test = test.drop(columns=['Severity'])
        y_test = test['Severity']

        self.transform_x(x_train, x_test)
        self.transform_y(y_train, y_test)

        



In [11]:
# Run the preprocessing stage end to end: load the configuration, build the
# stage object and execute it. Exceptions propagate unchanged, so the notebook
# shows the original traceback (the former `except ... raise e` added nothing).
config = ConfigurationManager()
preprocessing_obj = config.get_preprocessing_config()
preprocessing = Preprocessing(config=preprocessing_obj)
preprocessing.start_preprocessing()

[2025-01-11 11:09:33,253: INFO: common: yaml file: config.yaml loaded successfully]
[2025-01-11 11:09:33,264: INFO: common: created directory at: artifacts/preprocessing]


  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)
