## 10. Ensembling

#### Import libraries

In [1]:
import os
import numpy as np
import pandas as pd
import pickle

from sklearn.preprocessing import StandardScaler # to standardize features 

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Relative locations of the raw inputs, the intermediate pickles and the
# generated outputs; every path below is resolved against the working directory.
dirRawData, dirPData, dirPOutput = "../RawData/", "../PData/", "../POutput/"

# Show the working directory so the relative paths above can be checked.
print(os.getcwd())

C:\Users\munch\Documents\Cass MSc\Term 3\Machine Learning\Coursework\PCode


#### Load the data

In [3]:
# Read the pickled dictionary and keep only its 'df_all_onehot' entry.
# NOTE: pickle.load can execute arbitrary code; only load files created by this project.
f_name = dirPData + '02_df.pickle'
with open(f_name, "rb") as f:
    df_all_onehot = pickle.load(f)['df_all_onehot']

# Drop the temporary path variable so it does not linger in the kernel.
del f_name

In [4]:
# Load the pickled variable-name dictionary and unpack the four entries used
# here: the dependent variable plus the numeric, categorical and one-hot
# independent-variable lists.
f_name = dirPData + '02_vars.pickle'
with open(f_name, "rb") as f:
    dict_ = pickle.load(f)

var_dep, vars_ind_num, vars_ind_categorical, vars_ind_onehot = (
    dict_[key]
    for key in ('var_dep', 'vars_ind_num', 'vars_ind_categorical', 'vars_ind_onehot')
)

# Remove the temporaries from the notebook namespace.
del dict_, f_name

#### Preparing the data

In [5]:
# Positional row indexes for each data split, derived from the 'fold' column:
# folds 0-7 -> train, 8-9 -> validation, 0-9 -> design, 10 -> test.
idx_train  = np.flatnonzero(df_all_onehot['fold'].isin(np.arange(8)))
idx_val    = np.flatnonzero(df_all_onehot['fold'].isin([8, 9]))
idx_design = np.flatnonzero(df_all_onehot['fold'].isin(np.arange(10)))
idx_test   = np.flatnonzero(df_all_onehot['fold'].isin([10]))

In [6]:
# Sanity check: the train and validation rows together should account for
# exactly as many rows as the design set.
idx_train.size + idx_val.size == idx_design.size

True

In [7]:
# Number of rows in the train, validation and test splits.
n_train, n_val, n_test = map(len, (idx_train, idx_val, idx_test))

In [8]:
# Full list of independent variables: the numeric ones followed by the
# one-hot encoded ones (vars_ind_categorical is not included).
vars_ind = vars_ind_num + vars_ind_onehot

In [9]:
# Feature matrix for every row (design and test), as a plain numpy array.
X = df_all_onehot[vars_ind].values

# Estimate the per-feature mean and standard deviation on the training rows
# only, so that no validation/test information enters the scaling.
X_train = X[idx_train, :]
standardScaler_ = StandardScaler().fit(X_train)

# Apply the training-set scaling to all rows (mean 0, stdev 1 on the train set).
X = standardScaler_.transform(X)

In [10]:
# Slice the standardised feature matrix into the four data splits.
X_design, X_train, X_val, X_test = (
    X[rows, :] for rows in (idx_design, idx_train, idx_val, idx_test)
)

In [11]:
# Dependent variable for each split, flattened to a 1-D numpy array.
y_design = np.ravel(df_all_onehot[var_dep].iloc[idx_design].copy().values)
y_train  = np.ravel(df_all_onehot[var_dep].iloc[idx_train].copy().values)
y_val    = np.ravel(df_all_onehot[var_dep].iloc[idx_val].copy().values)
y_test   = np.ravel(df_all_onehot[var_dep].iloc[idx_test].copy().values)

# Load the base-model predictions

###### 1. Xgboost

Kaggle Score: 0.87807

In [13]:
# Load the saved XGBoost probability arrays ('xgbopt_prob' is used as the
# design-set feature in the stack below, 'xgbopt_prob_test' as the test-set one).
f_name = dirPData + 'xgb.pickle'
with open(f_name, "rb") as f:
    dict_ = pickle.load(f)
xgb1_train, xgb1_test = dict_['xgbopt_prob'], dict_['xgbopt_prob_test']

# Remove the temporaries from the notebook namespace.
del dict_, f_name

In [14]:
# Keep only column 1 of each probability array (assumed to be the usual
# two-column predict_proba output -- TODO confirm against the XGBoost notebook).
# The ndim guard makes this cell safe to re-run: an unconditional `[:,1]`
# raises IndexError once the arrays have already been reduced to 1-D.
if xgb1_train.ndim == 2:
    xgb1_train = xgb1_train[:,1]
if xgb1_test.ndim == 2:
    xgb1_test = xgb1_test[:,1]

###### 2. Logistic Regression

Kaggle score: 0.86802

In [15]:
# Load the saved logistic-regression probability arrays.
f_name = dirPData + 'logreg.pickle'
with open(f_name, "rb") as f:
    dict_ = pickle.load(f)
log_reg_design, log_reg_test = dict_['lropt_prob'], dict_['lropt_prob_test']

# Remove the temporaries from the notebook namespace.
del dict_, f_name

In [16]:
# Keep column 1 of each logistic-regression probability array; the loaded
# arrays are left untouched, so this cell can be re-run safely.
lr_train, lr_test = log_reg_design[:, 1], log_reg_test[:, 1]

###### 3. Elastic Net

Kaggle score: 0.85632

In [17]:
# Load the saved Elastic Net probability arrays.
f_name = dirPData + 'EN.pickle'
with open(f_name, "rb") as f:
    dict_ = pickle.load(f)
EN_train, EN_test = dict_['ENopt_prob'], dict_['ENopt_prob_test']

# Remove the temporaries from the notebook namespace.
del dict_, f_name

In [18]:
# Keep only column 1 of each probability array (assumed to be the usual
# two-column predict_proba output -- TODO confirm against the Elastic Net notebook).
# The ndim guard makes this cell safe to re-run: an unconditional `[:,1]`
# raises IndexError once the arrays have already been reduced to 1-D.
if EN_train.ndim == 2:
    EN_train = EN_train[:,1]
if EN_test.ndim == 2:
    EN_test = EN_test[:,1]

**4. Decision Tree**

Kaggle Score: 0.78208

In [19]:
# Load the saved decision-tree probability arrays (design and test sets).
f_name = dirPData + 'dt.pickle'
with open(f_name, "rb") as f:
    dict_ = pickle.load(f)
dt_train, dt_test = dict_['dt_prob_design'], dict_['dt_prob_test']

# Remove the temporaries from the notebook namespace.
del dict_, f_name

In [20]:
# Obtain the probabilities of crash (column 1 of each probability array).
# The ndim guard makes this cell safe to re-run: an unconditional `[:,1]`
# raises IndexError once the arrays have already been reduced to 1-D.
if dt_train.ndim == 2:
    dt_train = dt_train[:,1]
if dt_test.ndim == 2:
    dt_test = dt_test[:,1]

**5. KNN**

Kaggle Score:  0.67657

In [53]:
# Load the saved KNN probability arrays (design and test sets).
f_name = dirPData + 'knn.pickle'
with open(f_name, "rb") as f:
    dict_ = pickle.load(f)
knn_train, knn_test = dict_['knn_prob_design'], dict_['knn_prob_test']

# Remove the temporaries from the notebook namespace.
del dict_, f_name

In [56]:
# Keep only column 1 of each probability array (assumed to be the usual
# two-column predict_proba output -- TODO confirm against the KNN notebook).
# The ndim guard makes this cell safe to re-run: an unconditional `[:,1]`
# raises IndexError once the arrays have already been reduced to 1-D.
if knn_train.ndim == 2:
    knn_train = knn_train[:,1]
if knn_test.ndim == 2:
    knn_test = knn_test[:,1]

# Display the resulting shape as a sanity check (expected to be 1-D).
knn_train.shape

(11962,)

###### 6. lightGBM model

Kaggle Score: 0.87285

In [45]:
# Load the saved LightGBM probability arrays (design and test sets).
f_name = dirPData + 'lgbm.pickle'
with open(f_name, "rb") as f:
    dict_ = pickle.load(f)
lgbm_train, lgbm_test = dict_['lgbm_prob_design'], dict_['lgbm_prob_test']

# Remove the temporaries from the notebook namespace.
del dict_, f_name

In [50]:
# Display the shape of the LightGBM design-set predictions. Unlike the earlier
# base models, no `[:,1]` column selection is applied to this array; the
# recorded output shows it is already 1-D.
lgbm_train.shape

(11962,)

###### 7. Random Forest

Kaggle Score: 0.85673

In [47]:
# Load the saved random-forest probability arrays (design and test sets).
f_name = dirPData + 'rf.pickle'
with open(f_name, "rb") as f:
    dict_ = pickle.load(f)
rf_train, rf_test = dict_['rf_prob_design'], dict_['rf_prob_test']

# Remove the temporaries from the notebook namespace.
del dict_, f_name

In [51]:
# Display the shape of the random-forest design-set predictions. No `[:,1]`
# column selection is applied to this array; the recorded output shows it is
# already 1-D.
rf_train.shape

(11962,)

**Combine train and test ensemble data into numpy arrays**

In [85]:
# Build the stacking feature matrix for the design set: one column per base
# model, in the order ElasticNet, LogReg, KNN, XGBoost, LightGBM, RandomForest,
# DecisionTree.
combined = np.hstack([
    preds.reshape(-1,1)
    for preds in (EN_train, lr_train, knn_train, xgb1_train, lgbm_train, rf_train, dt_train)
])

In [86]:
# Display the shape of the stacked design matrix: rows x number of base models.
combined.shape

(11962, 7)

In [88]:
# Build the stacking feature matrix for the test set, using the same column
# order as the design-set matrix: ElasticNet, LogReg, KNN, XGBoost, LightGBM,
# RandomForest, DecisionTree.
combined_test = np.hstack([
    preds.reshape(-1,1)
    for preds in (EN_test, lr_test, knn_test, xgb1_test, lgbm_test, rf_test, dt_test)
])

**XGBoost on stacked ensemble data**

Stacking is an ensemble learning technique that uses predictions from multiple models (for example decision tree, knn or svm) to build a new model. This model is used for making predictions on the test set. 

In [79]:
# Baseline meta-model: an XGBoost classifier with default settings, fitted on
# the stacked base-model predictions for the design set.
import xgboost as xgb
xgb_ = xgb.XGBClassifier()
xgb_.fit(combined, np.ravel(y_design))

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [81]:
from sklearn.model_selection import GridSearchCV

# Candidate values for the three hyper-parameters, searched jointly.
min_child_weight = [1, 3, 5, 7]
gamma = [0, 1, 2, 3]
max_depth = [1, 2, 3, 4, 5, 6]
param_grid = {'gamma': gamma, 'min_child_weight': min_child_weight, 'max_depth': max_depth}

# 10-fold cross-validated grid search scored by ROC AUC, using all CPU cores.
xgb_ = xgb.XGBClassifier()
grid = GridSearchCV(estimator=xgb_, param_grid=param_grid, scoring='roc_auc', cv=10, n_jobs=-1)
grid_result = grid.fit(combined, np.ravel(y_design))

# Report the best mean cross-validated score and the parameters that achieved it.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.993996 using {'gamma': 2, 'max_depth': 4, 'min_child_weight': 1}


In [82]:
# Predict class probabilities for the test-set stack. `grid` must be the
# XGBoost grid search fitted in the previous cell (the name is reused later in
# the notebook); with GridSearchCV's default refit=True this uses the best
# estimator refitted on the data passed to fit.
ensemble_proba2 = grid.predict_proba(combined_test)

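This is an almost perfect cross-validated AUC, yet the Kaggle score was only 0.78648. A gap this large points to overfitting: the meta-model is trained on the base models' probabilities for the design set, and if those probabilities were produced by models fitted on that same design data, they are in-sample and far more accurate than the corresponding test-set probabilities. Using out-of-fold base-model predictions as the stacking features would avoid this.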

**Save data**

In [84]:
# Write the Kaggle submission file: one row per test observation, holding the
# id and column 1 of the predicted class probabilities.
import csv
f_name = dirPOutput + 'ensemble_pred2.csv'

df_test = pd.read_csv(dirRawData + 'test.csv')
test_ids = df_test[df_test.columns[0]]  # the first column of test.csv is used as the id

# zip() stops at the shorter input, so a row-count mismatch would otherwise
# produce a silently truncated submission file.
if len(test_ids) != len(ensemble_proba2):
    raise ValueError("test.csv has %d rows but ensemble_proba2 has %d predictions"
                     % (len(test_ids), len(ensemble_proba2)))

with open(f_name, 'w',newline='') as csvfile:
    writer=csv.writer(csvfile,delimiter=',')
    writer.writerow(["id", "target"])
    writer.writerows(zip(test_ids, ensemble_proba2[:,1]))

**Logistic regression on ensemble data**

In [89]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Candidate regularisation strengths and penalty types for the meta-model.
C = [0.05, 0.1, 1.0, 1.5, 2.0]
penalty = ["l1", "l2"]
param_grid = {'penalty': penalty, 'C': C}

# liblinear is used as the solver for both the l1 and l2 penalties.
lr_ = LogisticRegression(solver='liblinear')

# 10-fold cross-validated grid search scored by ROC AUC, using all CPU cores.
grid = GridSearchCV(estimator=lr_, param_grid=param_grid, scoring='roc_auc', cv=10, n_jobs=-1)

In [90]:
# Run the grid search on the stacked design-set predictions, then report the
# best mean cross-validated score and the parameters that achieved it.
grid_result = grid.fit(X=combined, y=np.ravel(y_design))
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.984980 using {'C': 1.0, 'penalty': 'l2'}


In [91]:
# Predict class probabilities for the test-set stack. `grid` must be the
# logistic-regression grid search fitted in the previous cell (the name is
# reused elsewhere in the notebook); with GridSearchCV's default refit=True
# this uses the best estimator refitted on the data passed to fit.
ensemble_proba3 = grid.predict_proba(combined_test)

array([[0.95660045, 0.04339955],
       [0.97117894, 0.02882106],
       [0.97926052, 0.02073948],
       ...,
       [0.89066184, 0.10933816],
       [0.99468429, 0.00531571],
       [0.12847104, 0.87152896]])

In [92]:
# Write the Kaggle submission file: one row per test observation, holding the
# id and column 1 of the predicted class probabilities.
import csv
f_name = dirPOutput + 'ensemble_pred3.csv'

df_test = pd.read_csv(dirRawData + 'test.csv')
test_ids = df_test[df_test.columns[0]]  # the first column of test.csv is used as the id

# zip() stops at the shorter input, so a row-count mismatch would otherwise
# produce a silently truncated submission file.
if len(test_ids) != len(ensemble_proba3):
    raise ValueError("test.csv has %d rows but ensemble_proba3 has %d predictions"
                     % (len(test_ids), len(ensemble_proba3)))

with open(f_name, 'w',newline='') as csvfile:
    writer=csv.writer(csvfile,delimiter=',')
    writer.writerow(["id", "target"])
    writer.writerows(zip(test_ids, ensemble_proba3[:,1]))

The cross-validated AUC is still high, but the Kaggle score was only 0.7702. This again suggests that the stacked model is overfitting the design data.

**Plot correlation matrix**

In [101]:
# Collect every base model's design-set predictions in one data frame
# (one column per model) so they can be compared side by side.
base_predictions_train = pd.DataFrame(dict(
    RandomForest=rf_train.ravel(),
    DecisionTree=dt_train.ravel(),
    LogisticReg=lr_train.ravel(),
    XGB=xgb1_train.ravel(),
    LGBM=lgbm_train.ravel(),
    ElasticNet=EN_train.ravel(),
    KNN=knn_train.ravel(),
))

# Preview the first rows.
base_predictions_train.head()




Unnamed: 0,RandomForest,DecisionTree,LogisticReg,XGB,LGBM,ElasticNet,KNN
0,0.112704,0.28,0.22604,0.12978,0.132454,0.062931,0.285714
1,0.071387,0.067156,0.043165,0.053123,0.074115,0.000972,0.0
2,0.496627,0.120879,0.298983,0.363683,0.386933,0.200365,0.142857
3,0.46153,0.120879,0.355301,0.471014,0.525788,0.468819,0.285714
4,0.093595,0.067156,0.142167,0.076524,0.140744,0.065601,0.142857


In [99]:
# Pairwise Pearson correlation between the base models' predictions,
# rendered as a colour-graded table.
corr = base_predictions_train.corr(method='pearson')
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,RandomForest,DecisionTree,LogisticReg,XGB,LGBM,ElasticNet,KNN
RandomForest,1.0,0.785593,0.813701,0.923658,0.938472,0.717921,0.641376
DecisionTree,0.785593,1.0,0.676288,0.793398,0.787262,0.602641,0.508306
LogisticReg,0.813701,0.676288,1.0,0.878046,0.858271,0.903061,0.620729
XGB,0.923658,0.793398,0.878046,1.0,0.971576,0.809184,0.628839
LGBM,0.938472,0.787262,0.858271,0.971576,1.0,0.782701,0.629821
ElasticNet,0.717921,0.602641,0.903061,0.809184,0.782701,1.0,0.533818
KNN,0.641376,0.508306,0.620729,0.628839,0.629821,0.533818,1.0


Random Forest is highly correlated with XGB and LGBM, so we should keep only one of the three. ElasticNet and LogisticReg are also highly correlated, which is no surprise. The remaining models would then be Decision Tree, XGB, LogisticReg and KNN. Alternatively, we could average the predictions of the correlated models.

In [100]:
# Repeat the stacked XGBoost search using only one model from each correlated
# group: LogReg, KNN, XGBoost and DecisionTree.
# NOTE: this overwrites `combined`, `combined_test` and `grid` from the earlier cells.

# Stacking features for the design set and, in the same column order, the test set.
combined = np.hstack([
    preds.reshape(-1,1) for preds in (lr_train, knn_train, xgb1_train, dt_train)
])
combined_test = np.hstack([
    preds.reshape(-1,1) for preds in (lr_test, knn_test, xgb1_test, dt_test)
])

# Same hyper-parameter grid as the full-stack search.
min_child_weight = [1, 3, 5, 7]
gamma = [0, 1, 2, 3]
max_depth = [1, 2, 3, 4, 5, 6]
param_grid = {'gamma': gamma, 'min_child_weight': min_child_weight, 'max_depth': max_depth}

# 10-fold cross-validated grid search scored by ROC AUC, using all CPU cores.
xgb_ = xgb.XGBClassifier()
grid = GridSearchCV(estimator=xgb_, param_grid=param_grid, scoring='roc_auc', cv=10, n_jobs=-1)
grid_result = grid.fit(combined, np.ravel(y_design))

# Report the best mean cross-validated score and the parameters that achieved it.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.980926 using {'gamma': 1, 'max_depth': 4, 'min_child_weight': 1}
Best: 0.980926 using {'gamma': 1, 'max_depth': 4, 'min_child_weight': 1}


In [104]:
# Predict class probabilities for the reduced test-set stack. `grid` and
# `combined_test` must be the ones created in the previous cell (both names
# are reused from earlier in the notebook).
ensemble_proba4=grid.predict_proba(combined_test)

In [105]:
# Write the Kaggle submission file: one row per test observation, holding the
# id and column 1 of the predicted class probabilities.
import csv
f_name = dirPOutput + 'ensemble_pred4.csv'

df_test = pd.read_csv(dirRawData + 'test.csv')
test_ids = df_test[df_test.columns[0]]  # the first column of test.csv is used as the id

# zip() stops at the shorter input, so a row-count mismatch would otherwise
# produce a silently truncated submission file.
if len(test_ids) != len(ensemble_proba4):
    raise ValueError("test.csv has %d rows but ensemble_proba4 has %d predictions"
                     % (len(test_ids), len(ensemble_proba4)))

with open(f_name, 'w',newline='') as csvfile:
    writer=csv.writer(csvfile,delimiter=',')
    writer.writerow(["id", "target"])
    writer.writerows(zip(test_ids, ensemble_proba4[:,1]))

Unfortunately, the Kaggle score was still only 0.77169.