In [1]:
import os

In [2]:
%pwd

'c:\\Users\\Vincent\\Desktop\\Race-Prediction-Trials\\notebook'

In [3]:
# Move from the notebook folder up to the project root; the later cells
# presumably rely on this for the `src` imports and relative config paths.
# The guard keeps the cell idempotent: re-running it no longer climbs one
# directory further each time.
if os.path.basename(os.getcwd()) == "notebook":
    os.chdir("../")

In [4]:
%pwd

'c:\\Users\\Vincent\\Desktop\\Race-Prediction-Trials'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable settings for the model-training stage.

    Instances are built by ``ConfigurationManager.get_model_trainer_config``
    from the ``model_trainer`` section of the loaded configuration and are
    consumed by ``ModelTrainer``.
    """
    # Working directory of the stage; it is created before this object is built.
    root_dir: Path
    # Destination handed to ``save_object`` for the selected model.
    trained_model_file_path: Path
    # Array files read with ``np.load``; the last column is used as the target
    # and all preceding columns as features.
    train_array_path: Path
    test_array_path: Path

In [6]:
from src.constants import *
from src.utils.common import read_yaml, create_directories

In [7]:
## Update the configuration manager in src config

class ConfigurationManager:
    """Load the project configuration and build per-stage config objects.

    Args:
        config_filepath: Location of the YAML configuration file. Defaults to
            the module-level ``CONFIG_FILE_PATH``.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH):

        self.config = read_yaml(config_filepath)

        # Ensure the top-level output directory exists before any stage runs.
        create_directories([self.config.output_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Build the configuration for the model-training stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``ModelTrainerConfig`` populated from the ``model_trainer``
            section of the loaded configuration.
        """
        config = self.config.model_trainer

        create_directories([config.root_dir])

        # The loader's values are presumably plain strings; wrap them in Path
        # so the fields actually match the types ModelTrainerConfig declares.
        model_trainer_config = ModelTrainerConfig(
            root_dir=Path(config.root_dir),
            trained_model_file_path=Path(config.trained_model_file_path),
            train_array_path=Path(config.train_array_path),
            test_array_path=Path(config.test_array_path)
        )

        return model_trainer_config

In [8]:
import sys
from dataclasses import dataclass
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from pgbm.sklearn import HistGradientBoostingRegressor
from sklearn.metrics import r2_score
from src.exception import CustomException
from src import logger
from src.utils.common import save_object,evaluate_models
import warnings
warnings.filterwarnings("ignore")
import numpy as np

In [9]:
## 5. Update the components

class ModelTrainer:
    """Evaluate the candidate regressors, keep the best-scoring one and save it.

    Args:
        config: Paths of the train/test arrays and of the output model file.
    """

    def __init__(self, config:ModelTrainerConfig):
        self.config=config

    @staticmethod
    def _select_best_model(model_report: dict):
        """Return ``(name, score)`` of the highest-scoring entry in ``model_report``.

        Ties resolve to the first entry in dict order.

        Raises:
            ValueError: If ``model_report`` is empty.
        """
        if not model_report:
            raise ValueError("Model report is empty; no model to select")
        best_model_name = max(model_report, key=model_report.get)
        return best_model_name, model_report[best_model_name]

    def initiate_model_trainer(self, min_score: float = 0.6):
        """Run model selection and persist the winning model.

        Loads the train/test arrays, hands the candidate models and their
        parameter grids to ``evaluate_models``, selects the entry with the
        highest reported score, saves that model and scores it on the test set.

        Args:
            min_score: Lowest acceptable best score reported by
                ``evaluate_models``; below it no model is saved.

        Returns:
            Tuple ``(best_model_name, r2_square)`` where ``r2_square`` is the
            R-squared of the selected model's predictions on the test features.

        Raises:
            CustomException: Wraps any error raised during the run, including
                the case where no model reaches ``min_score``.
        """
        try:
            train_array=np.load(self.config.train_array_path)
            test_array=np.load(self.config.test_array_path)
            logger.info("Split training and test input data")
            # Every column but the last is a feature; the last one is the target.
            X_train, y_train = train_array[:, :-1], train_array[:, -1]
            X_test, y_test = test_array[:, :-1], test_array[:, -1]

            models = {
                "Hist Gradient Boosting": HistGradientBoostingRegressor(),
                "XGBRegressor": XGBRegressor(),
                "CatBoosting Regressor": CatBoostRegressor(verbose=False),
            }
            # Single-value grids: one fixed configuration per model.
            params = {
                "Hist Gradient Boosting": {
                    'learning_rate': [0.1],
                    'max_depth': [10]
                },
                "XGBRegressor": {
                    'learning_rate': [0.1],
                    'n_estimators': [256]
                },
                "CatBoosting Regressor": {
                    'depth': [8],
                    'iterations': [100],
                    'learning_rate': [0.1]
                },
            }

            # NOTE(review): presumably returns {model name: score} and fits the
            # estimators in `models` in place -- the predict() call below
            # depends on that; confirm against src.utils.common.
            model_report:dict=evaluate_models(X_train=X_train,y_train=y_train,X_test=X_test,y_test=y_test,
                                             models=models,param=params)

            best_model_name, best_model_score = self._select_best_model(model_report)
            best_model = models[best_model_name]

            # `not >=` (rather than `<`) also rejects a NaN score.
            if not best_model_score >= min_score:
                # Raise a builtin here: assuming CustomException derives from
                # Exception, the handler below wraps it exactly once, with the
                # same (error, sys) call shape used everywhere else.
                raise ValueError("No best model found")
            logger.info("Best found model on both training and testing dataset")

            save_object(
                file_path=self.config.trained_model_file_path,
                obj=best_model
            )

            predicted=best_model.predict(X_test)

            r2_square = r2_score(y_test, predicted)

            logger.info(f"best model: {best_model_name}; best R-squared score: {r2_square}")

            return best_model_name, r2_square

        except Exception as e:
            raise CustomException(e,sys) from e

In [10]:
## 6. Update the pipeline

# Run the model-training stage end to end. Errors propagate unchanged, so no
# catch-and-re-raise wrapper is needed around these calls.
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()
model_trainer = ModelTrainer(config=model_trainer_config)
# Keep the returned (model name, test R-squared) pair for later inspection.
best_model_name, r2_square = model_trainer.initiate_model_trainer()

[2024-07-12 18:08:18,868: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-07-12 18:08:18,868: INFO: common: created directory at: output]
[2024-07-12 18:08:18,868: INFO: common: created directory at: output/model_trainer]
[2024-07-12 18:08:18,885: INFO: 2005102743: Split training and test input data]
[2024-07-12 18:08:26,505: INFO: 2005102743: Best found model on both training and testing dataset]
