## Model 5: Boosting Methods

In [1]:
import json
import time
from pathlib import Path

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import roc_curve, auc, RocCurveDisplay
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Figure-export settings. NOTE(review): neither name is read anywhere in the
# cells shown here -- presumably kept for parity with sibling notebooks.
figure_path = "../figures/models/"
save_figures = False

# Only the fully preprocessed data set is loaded; every experiment below draws
# its own stratified splits from it. The other prepared files in ../data/
# (init, before_split, prep_train, prep_val, prep_test, prep_tt_train,
# prep_tt_test) are not loaded by this notebook.
df_all = pd.read_parquet(
    "../data/prep_all.parquet"
)

In [2]:
# Preview the first five rows to sanity-check the encoded columns.
df_all.head(n=5)

Unnamed: 0,Year_Y2015,Year_Y2016,Country_England,Country_Wales,Supermarket_Asda,Supermarket_Tesco Extra,Supermarket_Tesco Metro,Supermarket_Waitrose,Time_Evening,Time_Morning,...,YearCountryAge_Y2016EnglandAge_g2,YearCountryAge_Y2016EnglandAge_g3,YearCountryAge_Y2016WalesAge_g1,YearCountryAge_Y2016WalesAge_g2,YearCountryAge_Y2016WalesAge_g3,YearCountryAge_nan,ObsSize,FemaleN,MaleN,y
0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,-0.471136,-1.156159,0.871226,1
1,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,-0.471136,-1.156159,0.871226,1
2,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,-0.471136,0.583858,-0.993069,1
3,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.811174,2.323876,-0.993069,1
4,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,-0.471136,0.583858,-0.993069,1


In [3]:
# Split the prepared frame into the feature matrix and the target vector.
# The target is removed *by name* (not by position), so X can never contain
# "y" and never loses a real feature, whatever the column order of the
# parquet file is. With "y" as the last column this yields the same array as
# slicing off the final column.
X_all = df_all.drop(columns="y").to_numpy()
y_all = df_all["y"].to_numpy()

### AdaBoost, Grid Search

In [4]:
# Seed for the boosting ensemble itself (the data splits use their own seeds).
random_state = 132

# Weak learner. Its depth is left unset here; the grid search in the next cell
# tunes it through the `base_estimator__max_depth` key.
Dtree = DecisionTreeClassifier()

# NOTE(review): newer scikit-learn releases rename `base_estimator` to
# `estimator`. Switching the keyword here also requires renaming the
# `base_estimator__max_depth` key of the parameter grid.
Ada_grid = AdaBoostClassifier(
    base_estimator=Dtree,
    random_state=random_state,
)
Ada_grid

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), random_state=132)

In [5]:
# Search space for AdaBoost: 9 learning rates x 9 ensemble sizes x 4 tree
# depths = 324 candidates.
params_Ada = {
    'learning_rate': [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.1],
    'n_estimators': list(range(8, 41, 4)),           # 8, 12, ..., 40
    'base_estimator__max_depth': list(range(1, 5)),  # 1, 2, 3, 4
}

# Seeds for the ten repeated stratified train/test splits.
random_states_split = [
    123, 456, 789, 741, 852,
    963, 159, 753, 951, 357,
]

# 5-fold CV scored by balanced accuracy; the best candidate is refitted on all
# data passed to `fit`. n_jobs=-2 asks joblib for all CPUs but one.
gridCV_Ada = GridSearchCV(
    estimator=Ada_grid,
    param_grid=params_Ada,
    scoring='balanced_accuracy',
    n_jobs=-2,
    refit=True,
    cv=5,
    verbose=1,
)
gridCV_Ada

GridSearchCV(cv=5,
             estimator=AdaBoostClassifier(base_estimator=DecisionTreeClassifier(),
                                          random_state=132),
             n_jobs=-2,
             param_grid={'base_estimator__max_depth': [1, 2, 3, 4],
                         'learning_rate': [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1,
                                           1.1],
                         'n_estimators': [8, 12, 16, 20, 24, 28, 32, 36, 40]},
             scoring='balanced_accuracy', verbose=1)

In [6]:
%%time

BalAccuracy_ada = []

for each_rs in random_states_split:
    start_time = time.time()
    X_other_temp, X_test_temp, y_other_temp, y_test_temp = \
        train_test_split(X_all, y_all, 
                         test_size = 0.1, 
                         random_state = each_rs, 
                         stratify = y_all)
    
    n_each_class = [np.sum(y_other_temp == each_c) 
                    for each_c 
                    in np.unique(y_other_temp)]
    weight_each_class = [1/each_n 
                         for each_n 
                         in n_each_class]
    balanced_weights = np.array([weight_each_class[each_y] 
                                 for each_y 
                                 in y_other_temp])
    
    
    gridCV_Ada.fit(X_other_temp, y_other_temp, 
                   groups = None, 
                   sample_weight = balanced_weights)
    
    BalAccuracy_ada.append(gridCV_Ada.best_score_)
    
    print(gridCV_Ada.best_score_, flush = True)
    print(gridCV_Ada.best_params_, flush = True)
    
    end_time = time.time()
    print(f"Iteration {each_rs}: {end_time - start_time:.3f} seconds", flush = True)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits
0.6881407535138544
{'base_estimator__max_depth': 1, 'learning_rate': 1, 'n_estimators': 16}
Iteration 123: 40.285 seconds
Fitting 5 folds for each of 324 candidates, totalling 1620 fits
0.6906000464831343
{'base_estimator__max_depth': 1, 'learning_rate': 0.3, 'n_estimators': 36}
Iteration 456: 40.097 seconds
Fitting 5 folds for each of 324 candidates, totalling 1620 fits
0.6893166795552951
{'base_estimator__max_depth': 2, 'learning_rate': 0.6, 'n_estimators': 24}
Iteration 789: 40.792 seconds
Fitting 5 folds for each of 324 candidates, totalling 1620 fits
0.6879047287105728
{'base_estimator__max_depth': 1, 'learning_rate': 0.4, 'n_estimators': 36}
Iteration 741: 42.010 seconds
Fitting 5 folds for each of 324 candidates, totalling 1620 fits
0.693455539258397
{'base_estimator__max_depth': 1, 'learning_rate': 0.8, 'n_estimators': 12}
Iteration 852: 40.950 seconds
Fitting 5 folds for each of 324 candidates, totalling 1620 fits

In [7]:
# Mean and (population) standard deviation of the per-split best CV scores.
print("{:.4f} ({:.4f})".format(np.mean(BalAccuracy_ada), np.std(BalAccuracy_ada)))

0.6928 (0.0036)


### Gradient Boosting, Grid Search

In [8]:
# Seed for the gradient-boosting model (the data splits use their own seeds).
random_state = 132

# `loss` is deliberately left at its default. The former spelling 'deviance'
# was the default when this notebook was first run (see the echoed repr below)
# but has since been deprecated and removed from scikit-learn, where passing it
# raises an error. Omitting the argument selects the same logistic loss on both
# old and new releases.
Gradient_grid = GradientBoostingClassifier(random_state=random_state,
                                           n_estimators=50,
                                           min_samples_split=2)
Gradient_grid

GradientBoostingClassifier(n_estimators=50, random_state=132)

In [9]:
# Search space for gradient boosting: 6 learning rates x 10 feature fractions
# x 5 tree depths = 300 candidates.
params_Gradient = {
    'learning_rate': [0.05, 0.1, 0.15, 0.2, 0.25, 0.3],
    'max_features': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'max_depth': list(range(2, 7)),  # 2, 3, 4, 5, 6
}

# Seeds for the ten repeated stratified train/test splits.
random_states_split = [
    123, 456, 789, 741, 852,
    963, 159, 753, 951, 357,
]

# 5-fold CV scored by balanced accuracy; the best candidate is refitted on all
# data passed to `fit`. n_jobs=-2 asks joblib for all CPUs but one.
gridCV_Gradient = GridSearchCV(
    estimator=Gradient_grid,
    param_grid=params_Gradient,
    scoring='balanced_accuracy',
    n_jobs=-2,
    refit=True,
    cv=5,
    verbose=1,
)
gridCV_Gradient

GridSearchCV(cv=5,
             estimator=GradientBoostingClassifier(n_estimators=50,
                                                  random_state=132),
             n_jobs=-2,
             param_grid={'learning_rate': [0.05, 0.1, 0.15, 0.2, 0.25, 0.3],
                         'max_depth': [2, 3, 4, 5, 6],
                         'max_features': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7,
                                          0.8, 0.9, 1.0]},
             scoring='balanced_accuracy', verbose=1)

In [10]:
%%time

BalAccuracy_grad = []

for each_rs in random_states_split:
    start_time = time.time()
    X_other_temp, X_test_temp, y_other_temp, y_test_temp = \
        train_test_split(X_all, y_all, 
                         test_size = 0.1, 
                         random_state = each_rs, 
                         stratify = y_all)
    
    n_each_class = [np.sum(y_other_temp == each_c) 
                    for each_c 
                    in np.unique(y_other_temp)]
    weight_each_class = [1/each_n 
                         for each_n 
                         in n_each_class]
    balanced_weights = np.array([weight_each_class[each_y] 
                                 for each_y 
                                 in y_other_temp])
    
    
    gridCV_Gradient.fit(X_other_temp, y_other_temp, 
                   groups = None, 
                   sample_weight = balanced_weights)
    
    BalAccuracy_grad.append(gridCV_Gradient.best_score_)
    
    print(gridCV_Gradient.best_score_, flush = True)
    print(gridCV_Gradient.best_params_, flush = True)
    
    end_time = time.time()
    print(f"Iteration {each_rs}: {end_time - start_time:.3f} seconds", flush = True)

Fitting 5 folds for each of 300 candidates, totalling 1500 fits
0.6924520052847376
{'learning_rate': 0.1, 'max_depth': 3, 'max_features': 0.3}
Iteration 123: 47.110 seconds
Fitting 5 folds for each of 300 candidates, totalling 1500 fits
0.6913559355461067
{'learning_rate': 0.1, 'max_depth': 2, 'max_features': 0.4}
Iteration 456: 48.559 seconds
Fitting 5 folds for each of 300 candidates, totalling 1500 fits
0.6934289181241501
{'learning_rate': 0.2, 'max_depth': 4, 'max_features': 0.1}
Iteration 789: 48.528 seconds
Fitting 5 folds for each of 300 candidates, totalling 1500 fits
0.6856184576743652
{'learning_rate': 0.2, 'max_depth': 3, 'max_features': 0.1}
Iteration 741: 48.419 seconds
Fitting 5 folds for each of 300 candidates, totalling 1500 fits
0.6928683205470905
{'learning_rate': 0.15, 'max_depth': 2, 'max_features': 0.2}
Iteration 852: 48.238 seconds
Fitting 5 folds for each of 300 candidates, totalling 1500 fits
0.7001775498857763
{'learning_rate': 0.15, 'max_depth': 3, 'max_featur

In [11]:
# Mean and (population) standard deviation of the per-split best CV scores.
print("{:.4f} ({:.4f})".format(np.mean(BalAccuracy_grad), np.std(BalAccuracy_grad)))

0.6929 (0.0034)


### XGB, Grid Search

In [12]:
# Seeds for the ten repeated stratified train/test splits.
random_states_split = [
    123, 456, 789, 741, 852,
    963, 159, 753, 951, 357,
]

# Search space for XGBoost: 7 learning rates x 5 depths x 8 gamma values
# x 8 ensemble sizes = 2240 candidates.
params_XGB = dict(
    learning_rate=[0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55],
    max_depth=list(range(2, 7)),                      # 2, 3, 4, 5, 6
    gamma=[tenths / 10 for tenths in range(5, 13)],   # 0.5, 0.6, ..., 1.2
    n_estimators=[10, 12, 16, 20, 24, 30, 35, 40],
)


In [13]:
%%time

BalAccuracy_xgb = []

for each_rs in random_states_split:
    start_time = time.time()
    X_other_temp, X_test_temp, y_other_temp, y_test_temp = \
        train_test_split(X_all, y_all, 
                         test_size = 0.1, 
                         random_state = each_rs, 
                         stratify = y_all)
    
    n_each_class = [np.sum(y_other_temp == each_c) 
                    for each_c 
                    in np.unique(y_other_temp)]

    xgb_pos_scale = n_each_class[0]/n_each_class[1]
    
    
    xbg_random_state = 132
    XGB_grid = XGBClassifier(random_state= xbg_random_state, 
                             use_label_encoder=False,  
                             scale_pos_weight = xgb_pos_scale, 
                             eval_metric = 'auc')
    
    gridCV_XGB = GridSearchCV(estimator = XGB_grid, 
                           param_grid = params_XGB, 
                           scoring = 'balanced_accuracy', 
                           n_jobs = -2,
                           refit = True, 
                           cv = 5, verbose = 1)
    
    gridCV_XGB.fit(X_other_temp, y_other_temp)
    
    BalAccuracy_xgb.append(gridCV_XGB.best_score_)
    
    print(gridCV_XGB.best_params_, flush = True)
    
    end_time = time.time()
    print(f"Iteration {each_rs}: {end_time - start_time:.3f} seconds", flush = True)

Fitting 5 folds for each of 2240 candidates, totalling 11200 fits
{'gamma': 1.0, 'learning_rate': 0.5, 'max_depth': 3, 'n_estimators': 12}
Iteration 123: 266.052 seconds
Fitting 5 folds for each of 2240 candidates, totalling 11200 fits
{'gamma': 0.6, 'learning_rate': 0.45, 'max_depth': 2, 'n_estimators': 16}
Iteration 456: 259.737 seconds
Fitting 5 folds for each of 2240 candidates, totalling 11200 fits
{'gamma': 0.5, 'learning_rate': 0.55, 'max_depth': 3, 'n_estimators': 20}
Iteration 789: 262.148 seconds
Fitting 5 folds for each of 2240 candidates, totalling 11200 fits
{'gamma': 0.7, 'learning_rate': 0.55, 'max_depth': 5, 'n_estimators': 16}
Iteration 741: 261.579 seconds
Fitting 5 folds for each of 2240 candidates, totalling 11200 fits
{'gamma': 0.5, 'learning_rate': 0.25, 'max_depth': 3, 'n_estimators': 20}
Iteration 852: 265.376 seconds
Fitting 5 folds for each of 2240 candidates, totalling 11200 fits
{'gamma': 0.7, 'learning_rate': 0.35, 'max_depth': 3, 'n_estimators': 12}
Iterat

In [14]:
# Mean and (population) standard deviation of the per-split best CV scores.
print("{:.4f} ({:.4f})".format(np.mean(BalAccuracy_xgb), np.std(BalAccuracy_xgb)))

0.6944 (0.0037)


In [15]:
# Persist the per-split best CV scores of the three boosting models.
# Scores are converted to plain Python floats so the JSON encoder never has to
# deal with NumPy scalar types.
boost_scores = {
    "Ada": [float(score) for score in BalAccuracy_ada],
    "Gradient": [float(score) for score in BalAccuracy_grad],
    "XGB": [float(score) for score in BalAccuracy_xgb],
}

# Create the output directory first: a missing ../results/ would otherwise
# raise FileNotFoundError only after all the grid searches have finished.
results_file = Path("../results/boost_scores.json")
results_file.parent.mkdir(parents=True, exist_ok=True)

with results_file.open("w") as outfile:
    json.dump(boost_scores, outfile)

# To read the scores back:
# with open("../results/boost_scores.json", "r") as readfile:
#     dict_data = json.load(readfile)