In [1]:
import os

In [2]:
%pwd

'c:\\Users\\Vincent\\Desktop\\Race-Prediction-Trials\\notebook'

In [3]:
# Move from the notebook/ folder up to the project root so that the `src`
# package and the relative config/output paths resolve.
# Guarded on the current folder name so that re-running this cell does not
# keep climbing further up the directory tree.
if os.path.basename(os.getcwd()) == "notebook":
    os.chdir("../")

In [4]:
%pwd

'c:\\Users\\Vincent\\Desktop\\Race-Prediction-Trials'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable bundle of the paths used by the data-transformation stage.

    Instances are frozen, so fields cannot be reassigned after construction.
    """
    # Working directory of the stage.
    root_dir: Path
    # CSV file holding the training split (features plus target column).
    train_data_path: Path
    # CSV file holding the test split (features plus target column).
    test_data_path: Path
    # Destination for the transformed training array.
    train_array_path: Path
    # Destination for the transformed test array.
    test_array_path: Path
    # Destination for the serialized, fitted preprocessor object.
    preprocessor_obj_file_path: Path

In [6]:
from src.constants import *
from src.utils.common import read_yaml, create_directories

In [7]:
## Update the configuration manager in src config

class ConfigurationManager:
    """Reads the project YAML configuration and hands out per-stage configs."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH):
        """Load the YAML file and create the top-level output directory.

        Args:
            config_filepath: Location of the YAML configuration file.
                Defaults to ``CONFIG_FILE_PATH``.
        """
        self.config = read_yaml(config_filepath)
        create_directories([self.config.output_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the config for the data-transformation stage.

        Creates the stage's root directory as a side effect, then copies the
        ``data_transformation`` section of the loaded configuration into a
        ``DataTransformationConfig``.

        Returns:
            DataTransformationConfig: Paths for the stage's inputs and outputs.
        """
        section = self.config.data_transformation
        create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=section.root_dir,
            train_data_path=section.train_data_path,
            test_data_path=section.test_data_path,
            train_array_path=section.train_array_path,
            test_array_path=section.test_array_path,
            preprocessor_obj_file_path=section.preprocessor_obj_file_path,
        )

In [8]:
import sys
import numpy as np 
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from feature_engine.encoding import OrdinalEncoder
from src.exception import CustomException
from src import logger
from src.utils.common import save_object
import warnings
warnings.filterwarnings("ignore")

In [9]:
## 5. Update the components

class DataTransformation:
    """Builds the feature preprocessor and applies it to the train/test CSVs.

    The supplied config provides the input CSV paths and the output locations
    for the transformed arrays and the fitted preprocessor object.
    """

    def __init__(self, config: DataTransformationConfig):
        self.config = config

    def get_data_transformer_object(self):
        """Build the (unfitted) preprocessor for the input features.

        Numerical columns are median-imputed and standard-scaled.
        Categorical columns are imputed with the most frequent value,
        ordinal-encoded with an arbitrary integer mapping, and then scaled
        without centering.

        Returns:
            ColumnTransformer: Unfitted transformer combining both pipelines.

        Raises:
            CustomException: Wraps any error raised while building the object.
        """
        try:
            numerical_columns = [
                'grid', 'position', 'laps', 'fastestLap', 'rank',
                'fastestLapSpeed', 'round', 'race_age',
            ]
            categorical_columns = ['drivername', 'location', 'constructorname', 'status']

            num_pipeline = Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="median")),
                    ("scaler", StandardScaler()),
                ]
            )

            # NOTE(review): categories that appear only in the test set may not
            # map to a valid integer code -- confirm the encoder's handling of
            # unseen categories before relying on the test array.
            cat_pipeline = Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="most_frequent")),
                    ("ordinal_encoder", OrdinalEncoder(encoding_method='arbitrary')),
                    ("scaler", StandardScaler(with_mean=False)),
                ]
            )

            logger.info(f"Categorical columns: {categorical_columns}")
            logger.info(f"Numerical columns: {numerical_columns}")

            preprocessor = ColumnTransformer(
                [
                    ("num_pipeline", num_pipeline, numerical_columns),
                    ("cat_pipelines", cat_pipeline, categorical_columns),
                ]
            )

            return preprocessor

        except Exception as e:
            raise CustomException(e, sys)

    def initiate_data_transformation(self):
        """Fit the preprocessor on the training data and persist the results.

        Reads the train and test CSVs, separates the ``points`` target column,
        fits the preprocessor on the training features only, transforms both
        splits, appends the target as the last column of each array, and saves
        the two arrays and the fitted preprocessor to the configured paths.

        Returns:
            tuple: ``(train_array_path, test_array_path,
            preprocessor_obj_file_path)`` exactly as given in the config.

        Raises:
            CustomException: Wraps any error raised during the process.
        """
        try:
            train_df = pd.read_csv(self.config.train_data_path)
            test_df = pd.read_csv(self.config.test_data_path)

            logger.info("Read train and test data completed")

            logger.info("Obtaining preprocessing object")

            preprocessing_obj = self.get_data_transformer_object()

            target_column_name = "points"

            input_feature_train_df = train_df.drop(columns=[target_column_name])
            target_feature_train_df = train_df[target_column_name]

            input_feature_test_df = test_df.drop(columns=[target_column_name])
            target_feature_test_df = test_df[target_column_name]

            logger.info(
                "Applying preprocessing object on training dataframe and testing dataframe."
            )

            # Fit on the training split only, then reuse the fitted object for
            # the test split so no test-set statistics leak into the fit.
            input_feature_train_arr = preprocessing_obj.fit_transform(
                input_feature_train_df, target_feature_train_df
            )
            input_feature_test_arr = preprocessing_obj.transform(input_feature_test_df)

            # Target goes in the last column of each saved array.
            train_arr = np.c_[
                input_feature_train_arr, np.array(target_feature_train_df)
            ]
            test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]

            # NOTE: np.save appends ".npy" when the target name lacks that
            # suffix, so the configured array paths should already end in
            # ".npy" for the returned paths to match the files on disk.
            np.save(self.config.train_array_path, train_arr)
            np.save(self.config.test_array_path, test_arr)

            save_object(
                file_path=self.config.preprocessor_obj_file_path,
                obj=preprocessing_obj
            )

            # Logged only after the save has actually succeeded.
            logger.info("Saved preprocessing object.")

            return (
                self.config.train_array_path,
                self.config.test_array_path,
                self.config.preprocessor_obj_file_path,
            )
        except Exception as e:
            raise CustomException(e, sys)


In [10]:
## 6. Update the pipeline

# Run the data-transformation stage end to end.
# initiate_data_transformation() builds its own preprocessor internally, so a
# separate get_data_transformer_object() call here only duplicated the column
# log lines and discarded its result.
# No try/except wrapper: a handler that only re-raises adds nothing, and the
# notebook already shows the full traceback on failure.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
data_transformation.initiate_data_transformation()

[2024-07-12 17:49:51,512: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-07-12 17:49:51,512: INFO: common: created directory at: output]
[2024-07-12 17:49:51,512: INFO: common: created directory at: output/data_transformation]
[2024-07-12 17:49:51,512: INFO: 431699284: Categorical columns: ['drivername', 'location', 'constructorname', 'status']]
[2024-07-12 17:49:51,512: INFO: 431699284: Numerical columns: ['grid', 'position', 'laps', 'fastestLap', 'rank', 'fastestLapSpeed', 'round', 'race_age']]
[2024-07-12 17:49:51,565: INFO: 431699284: Read train and test data completed]
[2024-07-12 17:49:51,566: INFO: 431699284: Obtaining preprocessing object]
[2024-07-12 17:49:51,566: INFO: 431699284: Categorical columns: ['drivername', 'location', 'constructorname', 'status']]
[2024-07-12 17:49:51,567: INFO: 431699284: Numerical columns: ['grid', 'position', 'laps', 'fastestLap', 'rank', 'fastestLapSpeed', 'round', 'race_age']]
[2024-07-12 17:49:51,568: INFO: 431699284: Ap

