# Extreme Gradient Boosting Tree

In [1]:
import os
import math
import pandas as pd
from joblib import dump, load
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
import sklearn
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, f1_score, classification_report
from sklearn.model_selection import GridSearchCV, KFold, cross_validate
import xgboost as xgb
from helper import plot_confusion_matrix

In [3]:
ROOT_PATH = '.'

## Read in dataset from csv

In [4]:
X_train = pd.read_csv(os.path.join(ROOT_PATH, "data", "split_train_values.csv"))
y_train = np.ravel(pd.read_csv(os.path.join(ROOT_PATH, "data", "split_train_labels.csv")))
X_test = pd.read_csv(os.path.join(ROOT_PATH, "data", "split_test_values.csv"))
y_test = np.ravel(pd.read_csv(os.path.join(ROOT_PATH, "data", "split_test_labels.csv")))

X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

display(X_train.head())

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,...,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_a,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w
0,8,678,11750,3,35,5,7,0,1,0,...,0,0,0,0,0,0,0,0,1,0
1,17,21,3613,2,10,6,4,0,1,0,...,0,0,0,0,0,0,0,0,1,0
2,20,190,11721,2,10,8,4,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,17,811,11295,2,10,9,4,0,1,0,...,0,0,0,0,0,0,0,0,1,0
4,9,772,10140,2,0,4,4,0,1,0,...,0,0,0,0,0,0,0,0,1,0


## Hyperparameter tuning

In [11]:
hp_space1 = {
    'xgbclassifier__max_depth':range(3,10,2),
    'xgbclassifier__min_child_weight':range(1,6,2)
} 

xgb1 = xgb.XGBClassifier(random_state=2000, n_jobs=6, reg_lambda=0, learning_rate=0.2, objective='multi:softmax',
                       max_depth=5, subsample=0.8, colsample_bytree=0.8, seed=100)
pipeline1 = make_pipeline(StandardScaler(), xgb1)
gscv = GridSearchCV(pipeline1, hp_space1, cv=5, scoring='f1_micro', n_jobs=5, return_train_score=True, verbose=True)
print(gscv)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('xgbclassifier',
                                        XGBClassifier(base_score=0.5,
                                                      booster='gbtree',
                                                      colsample_bylevel=1,
                                                      colsample_bynode=1,
                                                      colsample_bytree=0.8,
                                                      gamma=0,
                                                      learning_rate=0.2,
                                                      max_delta_step=0,
    

In [None]:
gscv.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:  2.1min


In [5]:
def save_cv(cv_results_, filepath):
    cv_results = pd.DataFrame(cv_results_)
    cv_results.to_csv(filepath)

In [21]:
print("Finished training XGB with grid search CV. Saving model...")
print("All ")
print("Best params: {}".format(gscv.best_params_))
print("Best Micro F1 score: {}".format(gscv.best_score_))
print("Summary of CV_results: {}".format(gscv.cv_results_.keys()))
save_cv(gscv.cv_results_, os.path.join(ROOT_PATH, "results", "xgb_grid1.csv"))

Finished training SVC with grid search CV. Saving model...
All 
Best params: {'xgbclassifier__max_depth': 3, 'xgbclassifier__min_child_weight': 3}
Best Micro F1 score: 0.633
Summary of CV_results: dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_xgbclassifier__max_depth', 'param_xgbclassifier__min_child_weight', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score', 'split0_train_score', 'split1_train_score', 'split2_train_score', 'split3_train_score', 'split4_train_score', 'mean_train_score', 'std_train_score'])


##### Intermediate best param: {'max_depth': 9, 'min_child_weight': 3}

In [28]:
hp_space2 = {
    'xgbclassifier__max_depth': [8, 9, 10],
    'xgbclassifier__min_child_weight': [2, 3, 4]
} 
xgb2 = xgb.XGBClassifier(random_state=2000, n_jobs=7, reg_lambda=0, learning_rate=0.2, objective='multi:softmax',
                       max_depth=9, subsample=0.8, colsample_bytree=0.8, min_child_weight=3)
pipeline2 = make_pipeline(StandardScaler(), xgb2)

gscv2 = GridSearchCV(pipeline2, hp_space2, cv=5, scoring='f1_micro', return_train_score=True, verbose=True)
print(gscv2)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('xgbclassifier',
                                        XGBClassifier(base_score=0.5,
                                                      booster='gbtree',
                                                      colsample_bylevel=1,
                                                      colsample_bynode=1,
                                                      colsample_bytree=0.8,
                                                      gamma=0,
                                                      learning_rate=0.2,
                                                      max_delta_step=0,
    

In [29]:
gscv2.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=T

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:   12.5s finished
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('xgbclassifier',
                                        XGBClassifier(base_score=0.5,
                                                      booster='gbtree',
                                                      colsample_bylevel=1,
                                                      colsample_bynode=1,
                                                      colsample_bytree=0.8,
                                                      gamma=0,
                                                      learning_rate=0.2,
                                                      max_delta_step=0,
    

In [30]:
print("Finished training XGB with grid search CV 2. Saving model...")
print("All ")
print("Best params: {}".format(gscv2.best_params_))
print("Best Micro F1 score: {}".format(gscv2.best_score_))
print("Summary of CV_results: {}".format(gscv2.cv_results_.keys()))    
save_cv(gscv2.cv_results_, os.path.join(ROOT_PATH, "results", "xgb_grid2.csv"))

Finished training XGB with grid search CV 2. Saving model...
All 
Best params: {'xgbclassifier__max_depth': 9, 'xgbclassifier__min_child_weight': 2}
Best Micro F1 score: 0.629
Summary of CV_results: dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_xgbclassifier__max_depth', 'param_xgbclassifier__min_child_weight', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score', 'split0_train_score', 'split1_train_score', 'split2_train_score', 'split3_train_score', 'split4_train_score', 'mean_train_score', 'std_train_score'])


##### Intermediate best param: {'max_depth': 10, 'min_child_weight': 3}

In [32]:
hp_space3 = {
    'xgbclassifier__gamma': np.linspace(0, 1.0, 10)
} 
xgb3 = xgb.XGBClassifier(random_state=2000, n_jobs=7, reg_lambda=0, learning_rate=0.2, objective='multi:softmax',
                       max_depth=10, subsample=0.8, colsample_bytree=0.8, min_child_weight=3)
pipeline3 = make_pipeline(StandardScaler(), xgb3)

gscv3 = GridSearchCV(pipeline3, hp_space3, cv=5, scoring='f1_micro', return_train_score=True, verbose=True)
print(gscv3)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=0.8, gamma=0,
                                     learning_rate=0.2, max_delta_step=0,
                                     max_depth=10, min_child_weight=3,
                                     missing=None, n_estimators=100, n_jobs=7,
                                     nthread=None, objective='multi:softmax',
                                     random_state=2000, reg_alpha=0,
                                     reg_lambda=0, scale_pos_weight=1,
                                     seed=None, silent=None, subsample=0.8,
                                     verbosity=1),
             iid='warn', n_jobs=None,
             param_grid={'xgbclassifier__gamma': array([0.        , 0.11111111, 0.22222222, 0.33333333, 0.44444444,
       0

In [36]:
gscv3.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   14.5s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=0.8, gamma=0,
                                     learning_rate=0.2, max_delta_step=0,
                                     max_depth=10, min_child_weight=3,
                                     missing=None, n_estimators=100, n_jobs=7,
                                     nthread=None, objective='multi:softmax',
                                     random_state=2000, reg_alpha=0,
                                     reg_lambda=0, scale_pos_weight=1,
                                     seed=None, silent=None, subsample=0.8,
                                     verbosity=1),
             iid='warn', n_jobs=None,
             param_grid={'xgbclassifier__gamma': array([0.        , 0.11111111, 0.22222222, 0.33333333, 0.44444444,
       0

In [38]:
print("Finished training XGB with grid search CV 3. Saving model...")
print("All ")
print("Best params: {}".format(gscv3.best_params_))
print("Best Micro F1 score: {}".format(gscv3.best_score_))
print("Summary of CV_results: {}".format(gscv3.cv_results_.keys()))    
save_cv(gscv3.cv_results_, os.path.join(ROOT_PATH, "results", "xgb_grid3.csv"))

Finished training SVC with grid search CV 3. Saving model...
All 
Best params: {'xgbclassifier__gamma': 0.0}
Best Micro F1 score: 0.624
Summary of CV_results: dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_xgbclassifier__gamma', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score', 'split0_train_score', 'split1_train_score', 'split2_train_score', 'split3_train_score', 'split4_train_score', 'mean_train_score', 'std_train_score'])


##### Intermediate best param: {'max_depth': 10, 'min_child_weight': 3, 'gamma': 0}

In [45]:
hp_space4 = {
    'xgbclassifier__subsample' :[0.6, 0.7, 0.8, 0.9],
    'xgbclassifier__colsample_bytree' : [0.6, 0.7, 0.8, 0.9]
} 

xgb4 = xgb.XGBClassifier(random_state=2000, n_jobs=7, reg_lambda=0, learning_rate=0.2, objective='multi:softmax',
                       max_depth=10, subsample=0.8, colsample_bytree=0.8, min_child_weight=3, gamma=0)
pipeline4 = make_pipeline(StandardScaler(), xgb4)

gscv4 = GridSearchCV(pipeline4, hp_space4, cv=5, scoring='f1_micro', return_train_score=True, verbose=True)

In [46]:
gscv4.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=T

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:   22.4s finished
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('xgbclassifier',
                                        XGBClassifier(base_score=0.5,
                                                      booster='gbtree',
                                                      colsample_bylevel=1,
                                                      colsample_bynode=1,
                                                      colsample_bytree=0.8,
                                                      gamma=0,
                                                      learning_rate=0.2,
                                                      max_delta_step=0,
    

In [47]:
print("Finished training XGB with grid search CV 4. Saving model...")
print("All ")
print("Best params: {}".format(gscv4.best_params_))
print("Best Micro F1 score: {}".format(gscv4.best_score_))
print("Summary of CV_results: {}".format(gscv4.cv_results_.keys()))    
save_cv(gscv4.cv_results_, os.path.join(ROOT_PATH, "results", "xgb_grid4.csv"))

Finished training XGB with grid search CV 4. Saving model...
All 
Best params: {'xgbclassifier__colsample_bytree': 0.7, 'xgbclassifier__subsample': 0.9}
Best Micro F1 score: 0.623
Summary of CV_results: dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_xgbclassifier__colsample_bytree', 'param_xgbclassifier__subsample', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score', 'split0_train_score', 'split1_train_score', 'split2_train_score', 'split3_train_score', 'split4_train_score', 'mean_train_score', 'std_train_score'])


##### Intermediate best param: {'max_depth': 10, 'min_child_weight': 3, 'gamma': 0, 'subsample': 0.8, 'colsample_by_tree': 0.8}

In [48]:
hp_space5 = {
    'xgbclassifier__reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
} 
xgb5 = xgb.XGBClassifier(random_state=2000, n_jobs=15, reg_lambda=0, learning_rate=0.2, objective='multi:softmax',
                       max_depth=10, subsample=0.8, colsample_bytree=0.8, min_child_weight=3)
pipeline5 = make_pipeline(StandardScaler(), xgb5)

gscv5 = GridSearchCV(pipeline5, hp_space5, cv=5, scoring='f1_micro', return_train_score=True,verbose=True)

In [49]:
gscv5.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=T

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    5.5s finished
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('xgbclassifier',
                                        XGBClassifier(base_score=0.5,
                                                      booster='gbtree',
                                                      colsample_bylevel=1,
                                                      colsample_bynode=1,
                                                      colsample_bytree=0.8,
                                                      gamma=0,
                                                      learning_rate=0.2,
                                                      max_delta_step=0,
    

In [50]:
print("Finished training XGB with grid search CV 5. Saving model...")
print("All ")
print("Best params: {}".format(gscv5.best_params_))
print("Best Micro F1 score: {}".format(gscv5.best_score_))
print("Summary of CV_results: {}".format(gscv5.cv_results_.keys()))    
save_cv(gscv5.cv_results_, os.path.join(ROOT_PATH, "results", "xgb_grid5.csv"))

Finished training XGB with grid search CV 5. Saving model...
All 
Best params: {'xgbclassifier__reg_alpha': 0.1}
Best Micro F1 score: 0.629
Summary of CV_results: dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_xgbclassifier__reg_alpha', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score', 'split0_train_score', 'split1_train_score', 'split2_train_score', 'split3_train_score', 'split4_train_score', 'mean_train_score', 'std_train_score'])


##### Intermediate best param: {'max_depth': 10, 'min_child_weight': 3, 'gamma': 0, 'subsample': 0.8, 'colsample_by_tree': 0.8, "reg_alpha': 1}

In [6]:
hp_space6 = {
    'xgbclassifier__learning_rate':[0.1, 0.2, 0.3, 0.4]
}  

xgb6 = xgb.XGBClassifier(random_state=2000, n_jobs=15, reg_lambda=0, learning_rate=0.2, objective='multi:softmax',
                       max_depth=10, subsample=0.8, colsample_bytree=0.8, min_child_weight=3, reg_alpha=1)
pipeline6 = make_pipeline(StandardScaler(), xgb6)

gscv6 = GridSearchCV(pipeline6, hp_space6, cv=5, scoring='f1_micro', return_train_score=True, verbose=True)

In [7]:
gscv6.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


KeyboardInterrupt: 

In [8]:
print("Finished training XGB with grid search CV 6. Saving model...")
print("All ")
print("Best params: {}".format(gscv6.best_params_))
print("Best Micro F1 score: {}".format(gscv6.best_score_))
print("Summary of CV_results: {}".format(gscv6.cv_results_.keys()))    
save_cv(gscv6.cv_results_, os.path.join(ROOT_PATH, "results", "xgb_grid6.csv"))

Finished training XGB with grid search CV 6. Saving model...
All 


AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [16]:
hp_space7 = {'xgbclassifier__n_estimators':[50, 100, 200]} 

xgb7 = xgb.XGBClassifier(random_state=2000, n_jobs=15, reg_lambda=0, learning_rate=0.2, objective='multi:softmax',
                       max_depth=10, subsample=0.8, colsample_bytree=0.8, min_child_weight=3, reg_alpha=1)
pipeline7 = make_pipeline(StandardScaler(), xgb7)

gscv7 = GridSearchCV(pipeline7, hp_space7, cv=5, scoring='f1_micro', return_train_score=True, verbose=True)

In [17]:
gscv7.fit(X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    4.1s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('xgbclassifier',
                                        XGBClassifier(base_score=0.5,
                                                      booster='gbtree',
                                                      colsample_bylevel=1,
                                                      colsample_bynode=1,
                                                      colsample_bytree=0.8,
                                                      gamma=0,
                                                      learning_rate=0.2,
                                                      max_delta_step=0,
    

In [18]:
print("Finished training XGB with grid search CV 7. Saving model...")
print("All ")
print("Best params: {}".format(gscv7.best_params_))
print("Best Micro F1 score: {}".format(gscv7.best_score_))
print("Summary of CV_results: {}".format(gscv7.cv_results_.keys()))    
save_cv(gscv7.cv_results_, os.path.join(ROOT_PATH, "results", "xgb_grid7.csv"))

Finished training XGB with grid search CV 7. Saving model...
All 
Best params: {'xgbclassifier__n_estimators': 50}
Best Micro F1 score: 0.625
Summary of CV_results: dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_xgbclassifier__n_estimators', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score', 'split0_train_score', 'split1_train_score', 'split2_train_score', 'split3_train_score', 'split4_train_score', 'mean_train_score', 'std_train_score'])


### Save best model from hyperparameter tuning

In [19]:
from joblib import dump

dump(gscv7.best_estimator_, os.path.join(ROOT_PATH, "models", "xgb.joblib"))

['./models/xgb.joblib']

In [21]:
clf = load(os.path.join(ROOT_PATH, 'models', 'xgb.joblib'))
print(clf)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('xgbclassifier',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=0.8, gamma=0, learning_rate=0.2,
                               max_delta_step=0, max_depth=10,
                               min_child_weight=3, missing=nan, n_estimators=50,
                               n_jobs=15, nthread=None,
                               objective='multi:softprob', random_state=2000,
                               reg_alpha=1, reg_lambda=0, scale_pos_weight=1,
                               seed=None, silent=None, subsample=0.8,
                               verbosity=1))],
         verbose=False)


In [22]:
y_pred = clf.predict(X_test)
print("Micro-f1 score: {}".format(f1_score(y_test, y_pred, average='micro')))

Micro-f1 score: 0.6273358658531906


## Feature engineering

In [23]:
# Dictionary to hold all feature engineering trials
fe_trials = {}
fe_trials['original'] = X_train.copy()

In [24]:
def remove_zero_var(df_X):
    # remove predetermined features that had zero variance
    return df_X[['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
       'count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_timber',
       'count_families', 'has_secondary_use', 'land_surface_condition_n',
       'land_surface_condition_t', 'foundation_type_r', 'roof_type_n',
       'roof_type_q', 'ground_floor_type_f', 'other_floor_type_j',
       'other_floor_type_q', 'other_floor_type_x', 'position_s', 'position_t']].copy()

In [25]:
def make_ratio(df_):
    df = df_.copy()
    # Commented means not good
#     df['area_height_ratio'] = df['area_percentage'] / df['height_percentage']
    df['count_floors_height_ratio'] = df['count_floors_pre_eq'] / df['height_percentage']
#     df['age_count_floors_pre_eq_ratio'] = df['age'] / df['count_floors_pre_eq']
    return df

In [26]:
def combine_secondaries(df_):
    df = df_.copy()
    features_sec_use = [col for col in df.columns if 'has_secondary_use_' in col]
    df['has_secondary_uses'] = df[features_sec_use[0]]
    for i in range(1, len(features_sec_use)):
        df['has_secondary_uses'] |= df[features_sec_use[i]]
    df.drop(columns=['has_secondary_use'], inplace=True)
    return df.copy()

In [38]:
fe_trials['Remove_zero_var'] = remove_zero_var(fe_trials['original'])
fe_trials['combine_secondaries'] = combine_secondaries(fe_trials['original'])
fe_trials['make_ratio'] = make_ratio(fe_trials['original'])
fe_trials['combine_secondaries&make_ratio'] = combine_secondaries(make_ratio(fe_trials['original']))

In [39]:
clf = xgb.XGBClassifier(random_state=2000, n_jobs=15, reg_lambda=0, learning_rate=0.2, objective='multi:softmax',
                       max_depth=10, subsample=0.8, colsample_bytree=0.8, min_child_weight=3, reg_alpha=1, n_estimators=200)
scl = StandardScaler()
pipeline = make_pipeline(scl, clf)
results = {} 

for trial in fe_trials:
    print("Starting trial {}...".format(trial))
    results[trial]  = cross_validate(pipeline, fe_trials[trial], y_train,
                                     cv=5, scoring=['precision_micro', 'precision_weighted', 'precision_macro',
                                                    'recall_micro', 'recall_weighted', 'recall_macro', 
                                                    'f1_micro', 'f1_weighted', 'f1_macro'],
                                    return_train_score=True, verbose=True, n_jobs=1)

Starting trial original...


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Starting trial Remove_zero_var...


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Starting trial combine_secondaries...


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Starting trial make_ratio...


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Starting trial combine_secondaries&make_ratio...


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.5s finished


In [42]:
import sys
with open(os.path.join(ROOT_PATH, 'results', 'feature_engineering.txt'), 'a+') as fp:
    original = sys.stdout
    sys.stdout = fp
    print("***************************************************************")
    for trial in results:
        print("For trial {}".format(trial))
        print("Mean test precision micro': {}".format(results[trial]['test_precision_micro'].mean()))
        print("Mean test precision macro': {}".format(results[trial]['test_precision_macro'].mean()))
        print("Mean test precision weighted': {}".format(results[trial]['test_precision_weighted'].mean()))
        print("Mean test recall micro': {}".format(results[trial]['test_recall_micro'].mean()))
        print("Mean test recall macro': {}".format(results[trial]['test_recall_macro'].mean()))
        print("Mean test recall weighted': {}".format(results[trial]['test_recall_weighted'].mean()))
        print("Mean test f1 micro': {}".format(results[trial]['test_f1_micro'].mean()))
        print("Mean test f1 macro': {}".format(results[trial]['test_f1_macro'].mean()))
        print("Mean test f1 weighted': {}".format(results[trial]['test_f1_weighted'].mean()))
        print("*********************************************************************")
    sys.stdout = original
print("Finished writing results for all trials")

Finished writing results for all trials


## Choose best to train and store

In [5]:
clf = xgb.XGBClassifier(random_state=2000, n_jobs=15, reg_lambda=0, learning_rate=0.2, objective='multi:softmax',
                       max_depth=10, subsample=0.8, colsample_bytree=0.8, min_child_weight=3, reg_alpha=1, n_estimators=200)
scl = StandardScaler()
pipeline = make_pipeline(scl, clf)

In [6]:
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('xgbclassifier',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=0.8, gamma=0, learning_rate=0.2,
                               max_delta_step=0, max_depth=10,
                               min_child_weight=3, missing=None,
                               n_estimators=200, n_jobs=15, nthread=None,
                               objective='multi:softprob', random_state=2000,
                               reg_alpha=1, reg_lambda=0, scale_pos_weight=1,
                               seed=None, silent=None, subsample=0.8,
                               verbosity=1))],
         verbose=False)

In [7]:
y_pred = pipeline.predict(X_test)
print("F1-Score: {}".format(f1_score(y_test, y_pred, average='micro')))
print("Classification report: \n{}".format(classification_report(y_test, y_pred)))
print('Confusion matrix:')
plot_confusion_matrix(y_test, y_pred, ['Grade 1', 'Grade 2', 'Grade 3'], cmap='Orange')

F1-Score: 0.6184336748397989
Classification report: 
              precision    recall  f1-score   support

           1       0.45      0.34      0.39      2519
           2       0.65      0.74      0.69     14830
           3       0.58      0.50      0.54      8712

    accuracy                           0.62     26061
   macro avg       0.56      0.53      0.54     26061
weighted avg       0.61      0.62      0.61     26061

Confusion matrix:


TypeError: only integer scalar arrays can be converted to a scalar index

## References

https://arxiv.org/abs/1603.02754