# Train Gradient Boost


In [2]:
# Imports
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../../')))

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from code_files.train import train, train_in_batches, grid_search, random_search, save_model
from code_files.data_preperation import prepare_for_train
import pandas as pd
import numpy as np

In [4]:
# Load Dataset
df_amazon = pd.read_csv("../../dataset/eda_amazon_sales_report.csv")
df_amazon.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117123 entries, 0 to 117122
Data columns (total 24 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   Unnamed: 0                           117123 non-null  int64  
 1   Size                                 117123 non-null  int64  
 2   Qty                                  117123 non-null  int64  
 3   Amount                               117123 non-null  float64
 4   promotion-ids                        117123 non-null  int64  
 5   B2B                                  117123 non-null  int64  
 6   Status_Cancelled                     117123 non-null  bool   
 7   Status_Shipped                       117123 non-null  bool   
 8   Status_Shipped - Delivered to Buyer  117123 non-null  bool   
 9   Fulfilment_Amazon                    117123 non-null  bool   
 10  Fulfilment_Merchant                  117123 non-null  bool   
 11  ship-service-

In [5]:
# Split and Prepare for train
dftrain, dftest = train_test_split(df_amazon, test_size=0.1, random_state=42)
Xtrain_prepared, ytrain_prepared, Xtest_prepared, ytest_prepared = prepare_for_train(dftrain, dftest)

In [None]:
# Grid Search
search = random_search(
    Xtrain_prepared,
    ytrain_prepared,
    GradientBoostingRegressor(),
    params={
        "loss": [
            "squared_error",
            "absolute_error",
            "huber",
            "quantile",
        ],  # Different loss functions
        "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3],  # Shrinkage rate
        "n_estimators": [50, 100, 200, 300, 500],  # Number of boosting stages
        "subsample": [0.6, 0.8, 1.0],  # Fraction of samples used for training
        "criterion": [
            "friedman_mse",
            "squared_error",
        ],  # Function to measure split quality
        "min_samples_split": [
            2,
            5,
            10,
            20,
        ],  # Minimum samples required to split an internal node
        "min_samples_leaf": [1, 2, 5, 10],  # Minimum samples required to be a leaf node
        "min_weight_fraction_leaf": [
            0.0,
            0.01,
            0.05,
        ],  # Minimum fraction of weight in a leaf node
        "max_depth": [3, 5, 7, 10],  # Maximum depth of individual trees
        "min_impurity_decrease": [
            0.0,
            0.01,
            0.1,
        ],  # Minimum impurity decrease to split a node
        "max_features": [
            None,
            "sqrt",
            "log2",
            0.6,
            0.8,
        ],  # Number of features to consider at each split
        "alpha": [
            0.7,
            0.85,
            0.9,
            0.95,
        ],  # Quantile loss parameter (relevant for `loss="quantile"`)
        "max_leaf_nodes": [None, 10, 20, 50],  # Maximum number of leaf nodes in a tree
        "warm_start": [False, True],  # Reuse previous solution to add more estimators
        "validation_fraction": [
            0.1,
            0.2,
            0.3,
        ],  # Fraction of data for validation (used with early stopping)
        "n_iter_no_change": [None, 5, 10, 20],  # Early stopping rounds
        "tol": [1e-4, 1e-3, 1e-2],  # Tolerance for stopping criteria
        "ccp_alpha": [
            0.0,
            0.01,
            0.1,
        ],  # Complexity parameter for Minimal Cost-Complexity Pruning
    },
    cv=2,
)

save_model(search.best_estimator_)

Fitting 2 folds for each of 100 candidates, totalling 200 fits
[CV] END alpha=0.7, ccp_alpha=0.0, criterion=friedman_mse, learning_rate=0.3, loss=absolute_error, max_depth=5, max_features=log2, max_leaf_nodes=10, min_impurity_decrease=0.0, min_samples_leaf=2, min_samples_split=20, min_weight_fraction_leaf=0.05, n_estimators=50, n_iter_no_change=5, subsample=1.0, tol=0.0001, validation_fraction=0.3, warm_start=False; total time=   2.5s
[CV] END alpha=0.7, ccp_alpha=0.0, criterion=friedman_mse, learning_rate=0.3, loss=absolute_error, max_depth=5, max_features=log2, max_leaf_nodes=10, min_impurity_decrease=0.0, min_samples_leaf=2, min_samples_split=20, min_weight_fraction_leaf=0.05, n_estimators=50, n_iter_no_change=5, subsample=1.0, tol=0.0001, validation_fraction=0.3, warm_start=False; total time=   3.6s
[CV] END alpha=0.85, ccp_alpha=0.0, criterion=friedman_mse, learning_rate=0.05, loss=squared_error, max_depth=10, max_features=sqrt, max_leaf_nodes=50, min_impurity_decrease=0.1, min_sa

AttributeError: 'function' object has no attribute 'best_estimator_'

In [None]:
# Show results
df_grid_results = pd.DataFrame(search.cv_results_)
columns_to_show = ["params", "rank_test_score", "mean_train_score", "mean_test_score"]
df_shown_results = df_grid_results[columns_to_show]

print(search.best_params_)
df_shown_results.sort_values("rank_test_score", ascending = True)

{'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


Unnamed: 0,params,rank_test_score,mean_train_score,mean_test_score
1,"{'bootstrap': True, 'max_depth': 10, 'min_samp...",1,-194.972173,-200.131273
3,"{'bootstrap': True, 'max_depth': 10, 'min_samp...",2,-195.226022,-200.134551
5,"{'bootstrap': True, 'max_depth': 10, 'min_samp...",3,-195.513447,-200.145811
0,"{'bootstrap': True, 'max_depth': 10, 'min_samp...",4,-195.015079,-200.146579
2,"{'bootstrap': True, 'max_depth': 10, 'min_samp...",5,-195.286946,-200.149347
7,"{'bootstrap': True, 'max_depth': 10, 'min_samp...",6,-195.503966,-200.159432
4,"{'bootstrap': True, 'max_depth': 10, 'min_samp...",7,-195.490744,-200.201737
6,"{'bootstrap': True, 'max_depth': 10, 'min_samp...",8,-195.559578,-200.20432
31,"{'bootstrap': False, 'max_depth': 10, 'min_sam...",9,-197.073748,-202.058976
30,"{'bootstrap': False, 'max_depth': 10, 'min_sam...",10,-197.0738,-202.063383


In [None]:
# Train
model, scores = train(search.best_estimator_, Xtrain_prepared, ytrain_prepared, Xtest_prepared, ytest_prepared)
print(f"mae: {scores[0]}, rmse: {scores[1]}, r2: {scores[2]}")

mae: 202.62332578391408, rmse: 248.49653493418916, r2: 248.49653493418916


Best Param:
{'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}