<a href="https://colab.research.google.com/github/truc-nmt/tiki_discount_rate/blob/main/notebooks/4.%20Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Task 4: Modeling - TrucNMT**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [16]:
%%capture
!pip install dill catboost xgboost

In [21]:

import os
import sys
import pandas as pd
import numpy as np
from dataclasses import dataclass

# Packages for logger
import logging
from datetime import datetime

# Packages for model evaluation
import dill
import pickle
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV, train_test_split


# Packages for data transformation
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder

# Packages for model trainer
from catboost import CatBoostRegressor
from sklearn.ensemble import (
    AdaBoostRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor
)

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, accuracy_score, f1_score, recall_score, precision_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

## **Logger**

In [7]:
LOG_FILE = f"{datetime.now().strftime('%m_%d_%Y_%H_%M_%S')}.log"
logs_path = os.path.join(os.getcwd(), "logs", LOG_FILE)
os.makedirs(logs_path, exist_ok=True)

LOG_FILE_PATH = os.path.join(logs_path, LOG_FILE)

logging.basicConfig(
    filename=LOG_FILE_PATH,
    format="[ %(asctime)s ] %(lineno)d %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

## **Exception**

In [8]:
def error_message_detail(error, error_detail:sys):

    _, _, exc_tb = error_detail.exc_info()          # Get info about traceback(Info about error position)
    file_name = exc_tb.tb_frame.f_code.co_filename  # Get name of file script Python from "exc_tb"
    error_message="Error occured in python script name [{0}] line number [{1}] error message[{2}]".format(
    file_name,exc_tb.tb_lineno,str(error))

    return error_message


class CustomException(Exception):

    def __init__(self, error_message, error_detail:sys):
        super().__init__(error_message)
        self.error_message = error_message_detail(error_message, error_detail=error_detail)

    def __str__(self):
        return self.error_message


## **Utils**

In [11]:
def save_object(file_path, obj):
    try:
        dir_path = os.path.dirname(file_path) #  is used to get the directory path of the file_path.
        os.makedirs(dir_path, exist_ok=True)

        with open(file_path, "wb") as file_obj:
            pickle.dump(obj, file_obj)

    except Exception as e:
        raise CustomException(e, sys)


def load_object(file_path):
    try:
        with open(file_path, "rb") as file_obj:
            return pickle.load(file_obj)

    except Exception as e:
        raise CustomException(e, sys)

def evaluate_models(X_train, y_train, X_test, y_test, models, param):
    try:
        report = {}

        for model_name, model in models.items():
            para = param[model_name]

            gs = GridSearchCV(model, para, cv=3)
            gs.fit(X_train, y_train)

            model.set_params(**gs.best_params_)
            model.fit(X_train, y_train)

            y_train_pred = model.predict(X_train)
            y_test_pred = model.predict(X_test)

            train_r2_score = r2_score(y_train, y_train_pred)
            test_r2_score = r2_score(y_test, y_test_pred)
            mse = mean_squared_error(y_test, y_test_pred)
            mae = mean_absolute_error(y_test, y_test_pred)

            report[model_name] = (train_r2_score, test_r2_score, mse, mae)

        return report

    except Exception as e:
        raise CustomException(e, sys)

##**1. Data Ingestion**

In [13]:
@dataclass
class DataIngestionConfig:
    train_path: str = os.path.join('/content/data', 'train.csv')
    test_path: str = os.path.join('/content/data', 'test.csv')
    raw_path: str = os.path.join('/content/data', 'raw.csv')

class DataIngestion:

    def __init__(self):
        self.ingestion_config = DataIngestionConfig()

    def initiate_data_ingestion(self):
        logging.info("Entered the data ingestion method or component")

        try:
            df = pd.read_csv('/content/Data_Tiki_Cleaned.csv')
            logging.info("Read the dataset as dataframe")

            os.makedirs(os.path.dirname(self.ingestion_config.train_path), exist_ok= True)

            df.to_csv(self.ingestion_config.raw_path, index = False, header = True)

            logging.info("Train test split initiated")

            # Make dataset
            train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)
            train_set.to_csv(self.ingestion_config.train_path, index = False, header = True)
            test_set.to_csv(self.ingestion_config.test_path, index = False, header = True)


            logging.info("Data Ingestion Completed!")

            return(
                self.ingestion_config.train_path,
                self.ingestion_config.test_path,
            )

        except Exception as e:
            raise CustomException(e, sys)

##**2. Data Transformation**

In [19]:
class MultiColumnLabelEncoder:

    def __init__(self, columns=None):
        self.columns = columns if columns else []

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_ = X.copy()
        for col in self.columns:
            le = LabelEncoder()
            X_[col] = le.fit_transform(X_[col])
        return X_

    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)

@dataclass
class DataTransformationConfig:
    preprocessor_obj_file_path = os.path.join("data", "preprocessor.pkl")

class DataTransformation:

    def __init__(self):
        self.data_transformation_config = DataTransformationConfig()

    def get_data_transformation_object(self):
        '''

        This function is responsible for data transformation

        '''

        '''
            Information of columns data types after cleaned and filled null values!
            RangeIndex: 1916 entries, 0 to 1915
            Data columns (total 24 columns):
            #   Column             Non-Null Count  Dtype
            ---  ------             --------------  -----
            0   Name               1916 non-null   object
            1   Link Product       1916 non-null   object
            2   Store              1916 non-null   object
            3   Type               1916 non-null   object
            4   Author Name        1916 non-null   object
            5   Short Description  1916 non-null   object
            6   Publisher          1916 non-null   object
            7   Translators        1916 non-null   object
            8   Categories         1916 non-null   object
            9   Width              1916 non-null   float64
            10  Length             1916 non-null   float64
            11  Height             1916 non-null   float64
            12  Product ID         1916 non-null   int64
            13  Price              1916 non-null   int64
            14  Original Price     1916 non-null   int64
            15  Discount           1916 non-null   int64
            16  Discount Rate      1916 non-null   int64
            17  Rating             1916 non-null   float64
            18  Review Count       1916 non-null   int64
            19  Quantity Sold      1916 non-null   float64
            20  Number of page     1916 non-null   float64
            21  Range Price        1916 non-null   object
            22  Publication Date   1916 non-null   object
            23  Publication Year   1916 non-null   float64
            dtypes: float64(7), int64(6), object(11)
        '''
        try:

            # numerical_columns = df.select_dtypes(include=['number']).columns.tolist()

            numerical_columns = [
                                'Width',
                                'Length',
                                'Height',
                                'Product_ID',
                                'Price',
                                'Original_Price',
                                'Discount',
                                # 'Discount_Rate', # target
                                'Rating',
                                'Review_Count',
                                'Quantity_Sold',
                                'Number_of_page',
                                'Publication_Year'
                                ]
            # categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
            categorical_columns = [
                                'Store',
                                'Type',
                                'Author_Name',
                                'Publisher',
                                'Translators',
                                'Categories',
                                'Range_Price',
                            ]

            # Pipeline

            num_pipeline = Pipeline(
                steps = [
                    # ("imputer", SimpleImputer(strategy="median")),
                    ("scaler", StandardScaler())
                ]
            )

            cate_pipeline = Pipeline(
                steps=[
                    ("label_encoder", MultiColumnLabelEncoder(columns=categorical_columns)),
                    ("scaler", StandardScaler(with_mean=False))
                ]
            )

            # Log info

            logging.info(f"Categorical columns: {categorical_columns}")
            logging.info(f"Numerical columns: {numerical_columns}")

            # Preprocessor

            preprocessor = ColumnTransformer(
                [
                    ("num_pipeline", num_pipeline, numerical_columns),
                    ("cate_pipeline", cate_pipeline, categorical_columns),
                ]
            )

            return preprocessor

        except Exception as e:
            raise CustomException(e, sys)

    def initiate_data_transformation(self, train_path, test_path):

        try:
            train_df = pd.read_csv(train_path)
            test_df = pd.read_csv(test_path)

            # print(train_df.columns)
            # print(test_df.columns)

            logging.info("read train and test data completed!")
            logging.info("Obtaining preprocessing object")

            preprocessing_obj = self.get_data_transformation_object()

            target_column_name = "Discount_Rate"
            numerical_columns = train_df.select_dtypes(include=['number']).columns.tolist()
            # print(numerical_columns)


            input_feature_train_df = train_df.drop(columns=[target_column_name], axis = 1)
            # print(input_feature_train_df.info)
            target_feature_train_df = train_df[target_column_name]

            input_feature_test_df = test_df.drop(columns=[target_column_name], axis = 1)
            target_feature_test_df = test_df[target_column_name]

            logging.info(f"Applying preprocessing object on training dataframe and testing dataframe")

            input_feature_train_arr = preprocessing_obj.fit_transform(input_feature_train_df)
            input_feature_test_arr = preprocessing_obj.transform(input_feature_test_df)

            train_arr = np.c_[input_feature_train_arr, np.array(target_feature_train_df)]
            test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]

            logging.info(f"Saved preprocessing object.")

            save_object(
                file_path= self.data_transformation_config.preprocessor_obj_file_path,
                obj=preprocessing_obj
            )

            return(train_arr, test_arr, self.data_transformation_config.preprocessor_obj_file_path)

        except Exception as e:
            raise CustomException(e, sys)

##**3. Model Trainer**

In [44]:
@dataclass
class ModelTrainerConfig:
    train_model_file_path = os.path.join("models", "model.pkl")


class ModelTrainer:

    def __init__(self):
        self.model_trainer_config = ModelTrainerConfig()


    def initiate_model_trainer(self, train_array, test_array):
        try:
            logging.info("Splitting training and test input data")
            X_train, y_train, X_test, y_test = (
                train_array[:, :-1],
                train_array[:, -1],
                test_array[:, :-1],
                test_array[:, -1]
            )

            models = {
                "Random Forest": RandomForestRegressor(),
                "Decision Tree": DecisionTreeRegressor(),
                "Gradient Boosting": GradientBoostingRegressor(),
                "Linear Regression": LinearRegression(),
                "XGBRegressor": XGBRegressor(),
                "CatBoosting Regressor": CatBoostRegressor(verbose=True),
                "AdaBoost Regressor": AdaBoostRegressor(),
            }

            # Fine-tune Hyper Parameter
            params = {
                "Decision Tree": {
                    'criterion': ['poisson', 'absolute_error', 'friedman_mse', 'squared_error'],
                    'splitter': ['best', 'random'],
                    'max_features': ['sqrt', 'log2'],
                },
                "Random Forest": {
                    'criterion': ['squared_error', 'friedman_mse', 'poisson', 'absolute_error'],
                    'max_features': ['sqrt', 'log2', None],
                    'n_estimators': [8, 16, 32, 64, 128, 256]
                },
                "Gradient Boosting": {
                    'loss': ['squared_error', 'absolute_error', 'quantile', 'huber'],
                    'learning_rate': [0.1, 0.01, 0.05, 0.001],
                    'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9],
                    'max_features': ['sqrt', 'log2'],
                    'n_estimators': [8, 16, 32, 64, 128, 256]
                },
                "Linear Regression": {},
                "XGBRegressor": {
                    'learning_rate': [0.1, 0.01, 0.05, 0.001],
                    'n_estimators': [8, 16, 32, 64, 128, 256]
                },
                "CatBoosting Regressor": {
                    'depth': [6, 8, 10],
                    'learning_rate': [0.01, 0.05, 0.1],
                    'iterations': [30, 50, 100]
                },
                "AdaBoost Regressor": {
                    'learning_rate': [0.1, 0.01, 0.5, 0.001],
                    'loss': ['linear', 'square', 'exponential'],
                    'n_estimators': [8, 16, 32, 64, 128, 256]
                }
            }

            model_report = evaluate_models(X_train, y_train, X_test, y_test, models, params)

            # Print all model results
            logging.info("Model results:")

            model_results_df = self.create_dataframe_from_results(model_report)
            print(model_results_df)

            # Find the best model based on test R2 score
            best_model_name = max(model_report, key=lambda k: model_report[k][1])  # Choose based on test R2 score
            best_model_scores = model_report[best_model_name]

            # Check if the test R2 score of the best model meets the threshold
            if best_model_scores[1] < 0.6:
                raise CustomException("No best model found")

            # Retrieve the best model
            best_model = models[best_model_name]

            # Save the best model
            save_object(file_path=self.model_trainer_config.train_model_file_path, obj=best_model)

            # Return the test R2 score, MSE, and MAE of the best model
            print("Final Best Model after Fine-tune with Hyper Paramaters")
            return best_model_scores[0], best_model_scores[1], best_model_scores[2], best_model_scores[3]

        except Exception as e:
            raise CustomException(e, sys)

    def create_dataframe_from_results(self, model_report):
      data = []
      for model_name, scores in model_report.items():
          logging.info(f"{model_name}: Train R2 Score - {scores[0]}, Test R2 Score - {scores[1]}, MSE - {scores[2]}, MAE - {scores[3]}") # Save in log file
          data.append([model_name, scores[0], scores[1], scores[2], scores[3]])
      df = pd.DataFrame(data, columns=['Model', 'Train R2 Score', 'Test R2 Score', 'MSE', 'MAE'])
      return df



## **4. Results**

In [45]:
obj = DataIngestion()
train_data, test_data = obj.initiate_data_ingestion()

data_transformation = DataTransformation()
train_arr, test_arr, _ = data_transformation.initiate_data_transformation(train_data, test_data)

modeltrainer = ModelTrainer()
print(modeltrainer.initiate_model_trainer(train_arr, test_arr))

[1;30;43mKết quả truyền trực tuyến bị cắt bớt đến 5000 dòng cuối.[0m
10:	learn: 11.4946196	total: 52ms	remaining: 89.9ms
11:	learn: 11.4207493	total: 55.7ms	remaining: 83.6ms
12:	learn: 11.3461089	total: 62.2ms	remaining: 81.4ms
13:	learn: 11.2666259	total: 75ms	remaining: 85.7ms
14:	learn: 11.1885251	total: 78.6ms	remaining: 78.6ms
15:	learn: 11.1209663	total: 82.2ms	remaining: 72ms
16:	learn: 11.0508081	total: 85.1ms	remaining: 65.1ms
17:	learn: 10.9822952	total: 88.2ms	remaining: 58.8ms
18:	learn: 10.9096954	total: 93.3ms	remaining: 54ms
19:	learn: 10.8414087	total: 99.6ms	remaining: 49.8ms
20:	learn: 10.7837768	total: 105ms	remaining: 45.1ms
21:	learn: 10.7162213	total: 111ms	remaining: 40.4ms
22:	learn: 10.6542976	total: 116ms	remaining: 35.4ms
23:	learn: 10.5938525	total: 121ms	remaining: 30.2ms
24:	learn: 10.5250411	total: 129ms	remaining: 25.8ms
25:	learn: 10.4568657	total: 137ms	remaining: 21ms
26:	learn: 10.3955711	total: 147ms	remaining: 16.3ms
27:	learn: 10.3337178	total:

## **Push to github**

In [23]:
!git config --global user.email "ehthanhtruc@gmail.com"
!git config --global user.name "truc-nmt"