In [1]:
import os

In [2]:
%pwd

'/Users/kanayojustice/Documents/Data_scientist_projects/AutoPredict/research'

In [3]:
# Step up to the project root only while still inside the notebook folder
# ("research", per the working directory shown by the cell above). Without this
# guard every re-run of the cell climbs one directory higher, and later relative
# paths then point at the wrong place.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'/Users/kanayojustice/Documents/Data_scientist_projects/AutoPredict'

In [5]:
from dataclasses import dataclass
from pathlib import Path

In [6]:
@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable bundle of settings for the model-training stage.

    Instances are frozen, so attributes cannot be reassigned after creation.
    The field order below is also the positional-argument order of the
    generated constructor.
    """

    # --- Filesystem locations -------------------------------------------
    # Directory that receives the training artifacts.
    root_dir: Path
    # CSV files holding the training and test splits.
    train_data_path: Path
    test_data_path: Path
    # File name of the serialized model (joined onto ``root_dir`` on save).
    model_name: str

    # --- Hyperparameters --------------------------------------------------
    # Forwarded to LogisticRegression as C, max_iter and penalty.
    classifier__C: float
    classifier__max_iter: int
    classifier__penalty: str
    # Strategy handed to the SimpleImputer used for numerical columns.
    preprocessor__num__imputer__strategy: str

    # --- Data schema ------------------------------------------------------
    # Name of the label column to split off from the feature columns.
    target_column: str

In [7]:
from Autopredictor.src.constants import *
from Autopredictor.src.utils.common import read_yaml, create_directories

In [8]:
class ConfigurationManager:
    """Reads the project's YAML settings and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Load the config, params and schema files and create the artifacts root.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the hyperparameter YAML.
            schema_filepath: Location of the data-schema YAML.
        """
        # Files are read in the same order as the parameters are declared.
        self.config, self.params, self.schema = (
            read_yaml(yaml_path)
            for yaml_path in (config_filepath, params_filepath, schema_filepath)
        )

        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Assemble the ``ModelTrainerConfig`` for the training stage.

        Pulls paths from the ``model_trainer`` config section, hyperparameters
        from the ``LogisticRegression`` params section and the target column
        name from the schema's ``TARGET_COLUMN`` entry. The stage's root
        directory is created before the config object is built.

        Returns:
            A populated ``ModelTrainerConfig``.
        """
        trainer_section = self.config.model_trainer
        lr_params = self.params.LogisticRegression  # Adjust based on the model used
        target_entry = self.schema.TARGET_COLUMN

        create_directories([trainer_section.root_dir])

        return ModelTrainerConfig(
            root_dir=Path(trainer_section.root_dir),
            train_data_path=Path(trainer_section.train_data_path),
            test_data_path=Path(trainer_section.test_data_path),
            model_name=trainer_section.model_name,
            classifier__C=lr_params.classifier__C,
            classifier__max_iter=lr_params.classifier__max_iter,
            classifier__penalty=lr_params.classifier__penalty,
            preprocessor__num__imputer__strategy=lr_params.preprocessor__num__imputer__strategy,
            target_column=target_entry.name,
        )

In [9]:
import pandas as pd
import os
from Autopredictor.src.logging import logger
from sklearn.linear_model import LogisticRegression
import joblib

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer

In [11]:
class ModelTrainer:
    """Fits a preprocessing + SMOTE + logistic-regression pipeline and saves it."""

    def __init__(self, config: ModelTrainerConfig):
        """Keep the training configuration for later use by ``train``."""
        self.config = config

    def _build_pipeline(self, numerical_features, categorical_features):
        """Build the unfitted pipeline for the given column lists.

        Args:
            numerical_features: Column names to impute and standard-scale.
            categorical_features: Column names to impute and one-hot encode.

        Returns:
            An imbalanced-learn pipeline: preprocessing, SMOTE oversampling,
            then the logistic-regression classifier.
        """
        numerical_pipeline = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy=self.config.preprocessor__num__imputer__strategy)),
            ('scaler', StandardScaler())
        ])

        categorical_pipeline = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            # Categories unseen during fit are ignored instead of raising.
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        # Columns listed in neither group are discarded by ColumnTransformer's
        # default remainder='drop'.
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_pipeline, numerical_features),
                ('cat', categorical_pipeline, categorical_features)
            ]
        )

        return ImbPipeline(steps=[
            ('preprocessor', preprocessor),
            ('smote', SMOTE(random_state=42)),
            ('classifier', LogisticRegression(
                C=self.config.classifier__C,
                max_iter=self.config.classifier__max_iter,
                penalty=self.config.classifier__penalty,
                solver='saga',
                random_state=42
            ))
        ])

    def train(self):
        """Fit the pipeline on the training CSV and save it with joblib.

        Only the training split is read. The test split is not needed to fit
        the model, so it is no longer loaded here. The fitted pipeline is
        written to ``root_dir/model_name``.

        Raises:
            KeyError: If the target column is absent from the training data.
        """
        train_data = pd.read_csv(self.config.train_data_path)

        # Separate features (X) and target variable (y)
        train_x = train_data.drop(columns=[self.config.target_column])
        train_y = train_data[self.config.target_column]

        # Only int64/float64 and object columns are selected; any other dtype
        # (e.g. bool) ends up in neither list and is dropped by the preprocessor.
        # NOTE(review): pandas versions that infer a dedicated string dtype may
        # not report text columns as 'object' -- confirm against the pinned version.
        numerical_features = train_x.select_dtypes(include=['int64', 'float64']).columns.tolist()
        categorical_features = train_x.select_dtypes(include=['object']).columns.tolist()

        pipeline = self._build_pipeline(numerical_features, categorical_features)
        pipeline.fit(train_x, train_y)

        # Save the trained model
        joblib.dump(pipeline, os.path.join(self.config.root_dir, self.config.model_name))


In [12]:
# Run the training stage end to end: load configuration, build the trainer,
# fit and save the model.
try:
    config_manager = ConfigurationManager()
    model_trainer_config = config_manager.get_model_trainer_config()

    model_trainer = ModelTrainer(config=model_trainer_config)
    model_trainer.train()

except Exception as e:
    # Record the failure (with traceback) in the project log, then re-raise the
    # same exception so the notebook cell still fails visibly.
    logger.exception(e)
    raise

[2024-05-30 15:47:21,619: INFO:common: yaml file: /Users/kanayojustice/Documents/Data_scientist_projects/AutoPredict/config/config.yaml loaded successfully]
[2024-05-30 15:47:21,626: INFO:common: yaml file: /Users/kanayojustice/Documents/Data_scientist_projects/AutoPredict/params.yaml loaded successfully]
[2024-05-30 15:47:21,635: INFO:common: yaml file: /Users/kanayojustice/Documents/Data_scientist_projects/AutoPredict/schema.yaml loaded successfully]
[2024-05-30 15:47:21,639: INFO:common: created directory at: artifacts]
[2024-05-30 15:47:21,640: INFO:common: created directory at: artifacts/model_trainer]


