In [1]:
import os

In [2]:
%pwd

'/Users/kanayojustice/Documents/Data_scientist_projects/AutoPredict/research'

In [3]:
# Step up one directory so relative paths resolve from the project root
# (the %pwd output above shows the kernel starting in research/).
# Not idempotent: re-running this cell moves up another level.
os.chdir(os.pardir)

In [4]:
%pwd

'/Users/kanayojustice/Documents/Data_scientist_projects/AutoPredict'

In [5]:
import os
from dataclasses import dataclass
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import joblib
from Autopredictor.src.utils.common import read_yaml, create_directories
from Autopredictor.src.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH, SCHEMA_FILE_PATH
from Autopredictor.src.logging import logger

In [6]:
@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable path settings for the data-transformation stage.

    ``frozen=True`` makes instances read-only after construction.
    """
    # Directory the stage writes its CSV outputs into.
    root_dir: Path
    # CSV file read as the stage's input dataset.
    data_path: Path
    # File the preprocessing pipeline is serialized to / loaded from with joblib.
    transformer_path: Path

In [7]:
class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage config objects."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH, schema_filepath=SCHEMA_FILE_PATH):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main config YAML.
            params_filepath: Path of the params YAML.
            schema_filepath: Path of the schema YAML.
        """
        # Same load order as always: config, then params, then schema.
        yaml_sources = (
            ("config", config_filepath),
            ("params", params_filepath),
            ("schema", schema_filepath),
        )
        for attribute, filepath in yaml_sources:
            setattr(self, attribute, read_yaml(filepath))

        artifacts_root = self.config['artifacts_root']
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the config object for the data-transformation stage.

        Reads the ``data_transformation`` section of the loaded config,
        creates its ``root_dir`` and wraps every entry in a ``Path``.

        Returns:
            DataTransformationConfig: the stage's path settings.
        """
        section = self.config['data_transformation']
        create_directories([section['root_dir']])

        path_fields = ('root_dir', 'data_path', 'transformer_path')
        return DataTransformationConfig(
            **{field: Path(section[field]) for field in path_fields}
        )

In [8]:
class DataTransformation:
    """Splits the raw dataset and applies a scikit-learn preprocessing pipeline.

    Intended call order: ``create_pipeline()`` then ``transform_data()``.
    """

    def __init__(self, config: DataTransformationConfig):
        """Store the stage configuration; the preprocessor is built later.

        Args:
            config: Paths for the input CSV, the output directory and the
                serialized preprocessor.
        """
        self.config = config
        self.preprocessor = None

    def train_test_spliting(self, test_size=0.25, random_state=42):
        """Read the input CSV, split it and write both parts to ``root_dir``.

        Args:
            test_size: Forwarded to ``train_test_split``; defaults to the
                previously hard-coded 0.25.
            random_state: Seed forwarded to ``train_test_split``; defaults to
                the previously hard-coded 42.

        Returns:
            tuple: ``(train, test)`` DataFrames, also saved as ``train.csv``
            and ``test.csv`` under ``config.root_dir``.
        """
        data = pd.read_csv(self.config.data_path)
        # Later steps address the target as 'fail'; rename a 'failed' column
        # if that is what the file uses.
        data = data.rename(columns={'failed': 'fail'})
        train, test = train_test_split(data, test_size=test_size, random_state=random_state)
        train.to_csv(self.config.root_dir / "train.csv", index=False)
        test.to_csv(self.config.root_dir / "test.csv", index=False)
        logger.info("Split data into training and test sets")
        logger.info(train.shape)
        logger.info(test.shape)
        return train, test

    def create_pipeline(self):
        """Build the (unfitted) preprocessor and store it on the instance.

        Numerical columns are median-imputed and standardised; categorical
        columns are imputed with the most frequent value and one-hot encoded,
        ignoring categories unseen during fitting. The unfitted object is
        also written to ``config.transformer_path``; ``transform_data``
        later overwrites that file with the fitted version.
        """
        numerical_features = ['year', 'price', 'mileage', 'tax', 'mpg', 'enginesize']
        categorical_features = ['model', 'transmission', 'fueltype', 'manufacturer']

        numerical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])

        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, numerical_features),
                ('cat', categorical_transformer, categorical_features)
            ]
        )

        joblib.dump(self.preprocessor, self.config.transformer_path)

    @staticmethod
    def _to_frame(matrix):
        """Wrap a transformer output (sparse or dense) in a DataFrame."""
        from scipy import sparse  # local import: the notebook's import cell does not load scipy

        # ColumnTransformer returns a dense array when the stacked output's
        # density exceeds its sparse_threshold; from_spmatrix would fail then.
        if sparse.issparse(matrix):
            return pd.DataFrame.sparse.from_spmatrix(matrix)
        return pd.DataFrame(matrix)

    def transform_data(self):
        """Split the data, fit the preprocessor on train, transform both parts.

        Writes ``train_transformed.csv`` and ``test_transformed.csv`` (target
        column ``fail`` first, then the transformed features) to
        ``config.root_dir`` and saves the *fitted* preprocessor to
        ``config.transformer_path`` so it can be reused for inference.

        Raises:
            ValueError: If ``create_pipeline()`` has not been called.
        """
        # Fail fast, before any split files are computed or written.
        if self.preprocessor is None:
            raise ValueError("Preprocessor not created. Call create_pipeline() first.")

        train, test = self.train_test_spliting()

        self.preprocessor = joblib.load(self.config.transformer_path)
        # Fit on the training features only; the test set is just transformed.
        X_train_transformed = self.preprocessor.fit_transform(train.drop('fail', axis=1))
        X_test_transformed = self.preprocessor.transform(test.drop('fail', axis=1))

        # Persist the fitted state; otherwise the file on disk stays the
        # unfitted pipeline written by create_pipeline().
        joblib.dump(self.preprocessor, self.config.transformer_path)

        train_transformed_df = self._to_frame(X_train_transformed)
        test_transformed_df = self._to_frame(X_test_transformed)
        # reset_index aligns the target rows with the freshly built frames,
        # which carry a default 0..n-1 index.
        train_df = pd.concat([train[['fail']].reset_index(drop=True), train_transformed_df], axis=1)
        test_df = pd.concat([test[['fail']].reset_index(drop=True), test_transformed_df], axis=1)
        train_df.to_csv(self.config.root_dir / "train_transformed.csv", index=False)
        test_df.to_csv(self.config.root_dir / "test_transformed.csv", index=False)


In [9]:
# Run the data-transformation stage end to end: load the configuration,
# build the preprocessing pipeline, then split and transform the dataset.
try:
    config_manager = ConfigurationManager()
    data_transformation_config = config_manager.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.create_pipeline()
    data_transformation.transform_data()
except Exception:
    # logger.exception already attaches the traceback. Re-raise so a failed
    # stage is visible in the notebook instead of looking like a clean run.
    logger.exception("Exception occurred during data transformation:")
    raise


[2024-05-30 15:17:35,084: INFO:common: yaml file: /Users/kanayojustice/Documents/Data_scientist_projects/AutoPredict/config/config.yaml loaded successfully]
[2024-05-30 15:17:35,088: INFO:common: yaml file: /Users/kanayojustice/Documents/Data_scientist_projects/AutoPredict/params.yaml loaded successfully]
[2024-05-30 15:17:35,092: INFO:common: yaml file: /Users/kanayojustice/Documents/Data_scientist_projects/AutoPredict/schema.yaml loaded successfully]
[2024-05-30 15:17:35,093: INFO:common: created directory at: artifacts]
[2024-05-30 15:17:35,095: INFO:common: created directory at: artifacts/data_transformation]


[2024-05-30 15:17:35,858: INFO:716117572: Split data into training and test sets]
[2024-05-30 15:17:35,859: INFO:716117572: (73284, 11)]
[2024-05-30 15:17:35,860: INFO:716117572: (24428, 11)]
