In [1]:
# Import libraries
import os
import sys

# cpu_count returns the number of CPUs in the system.
from multiprocessing import cpu_count

import numpy as np
import pandas as pd

# Import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

# Import preprocessing methods from sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures

# Import PCA
from sklearn.decomposition import PCA

# Import RFE
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV

# Import models from sklearn
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# Import XGBClassifier
from xgboost.sklearn import XGBClassifier

# Import from sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
#from sklearn.pipeline import make_pipeline
from sklearn.externals import joblib
from sklearn.base import TransformerMixin
from sklearn.base import BaseEstimator


# Import plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Raise pandas' display limits to 150 columns and 150 rows
pd.set_option("display.max_columns", 150)
pd.set_option("display.max_rows", 150)
%matplotlib inline



### Create paths to data file, append `src` directory to sys.path

In [2]:
# Project root, expressed relative to the directory the notebook runs from
# (the parent directory).
proj_root = os.pardir

# Path of the processed data set: <proj_root>/data/processed/booster_processed_data.csv
xgboost_data_file = os.path.join(
    proj_root, "data", "processed", "booster_processed_data.csv")

In [5]:
# Make the project's 'src' directory importable from this notebook.
src_dir = os.path.join(proj_root, "src")

# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

### Read in the xgboost data

In [6]:
# Load the processed data set; the first CSV column becomes the row index.
df = pd.read_csv(xgboost_data_file, index_col=0)

# Preview the first rows
df.head()

Unnamed: 0_level_0,y,limit_bal,age,bill_amt1,bill_amt2,bill_amt3,bill_amt4,bill_amt5,bill_amt6,pay_amt1,pay_amt2,pay_amt3,pay_amt4,pay_amt5,pay_amt6,bl_ratio_1,bl_ratio_2,bl_ratio_3,bl_ratio_4,bl_ratio_5,bl_ratio_6,blpl_ratio_1,blpl_ratio_2,blpl_ratio_3,blpl_ratio_4,blpl_ratio_5,blpl_ratio_6,limit_bal_log,age_log,bill_amt1_log,bill_amt2_log,bill_amt3_log,bill_amt4_log,bill_amt5_log,bill_amt6_log,pay_amt1_log,pay_amt2_log,pay_amt3_log,pay_amt4_log,pay_amt5_log,pay_amt6_log,sex_1,sex_2,edu_1,edu_2,edu_3,edu_4,marriage_0,marriage_1,marriage_2,marriage_3,pay_1_-1,pay_1_-2,pay_1_0,pay_1_1,pay_1_2,pay_1_3,pay_1_4,pay_1_5,pay_1_6,pay_1_7,pay_1_8,pay_2_-1,pay_2_-2,pay_2_0,pay_2_1,pay_2_2,pay_2_3,pay_2_4,pay_2_5,pay_2_6,pay_2_7,pay_2_8,pay_3_-1,pay_3_-2,pay_3_0,pay_3_1,pay_3_2,pay_3_3,pay_3_4,pay_3_5,pay_3_6,pay_3_7,pay_3_8,pay_4_-1,pay_4_-2,pay_4_0,pay_4_1,pay_4_2,pay_4_3,pay_4_4,pay_4_5,pay_4_6,pay_4_7,pay_4_8,pay_5_-1,pay_5_-2,pay_5_0,pay_5_2,pay_5_3,pay_5_4,pay_5_5,pay_5_6,pay_5_7,pay_5_8,pay_6_-1,pay_6_-2,pay_6_0,pay_6_2,pay_6_3,pay_6_4,pay_6_5,pay_6_6,pay_6_7,pay_6_8
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1
1,1,20000,24,3913,3102,689,0,0,0,0,689,0,0,0,0,0.19565,0.1551,0.03445,0.0,0.0,0.0,0.19565,0.12065,0.03445,0.0,0.0,0.0,9.903488,3.178054,8.272315,8.040125,6.536692,0.0,0.0,0.0,0.0,6.536692,0.0,0.0,0.0,0.0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,1,120000,26,2682,1725,2682,3272,3455,3261,0,1000,1000,1000,0,2000,0.02235,0.014375,0.02235,0.027267,0.028792,0.027175,0.02235,0.006042,0.014017,0.018933,0.028792,0.010508,11.695247,3.258097,7.894691,7.453562,7.894691,8.093462,8.147867,8.090096,0.0,6.908755,6.908755,6.908755,0.0,7.601402,0,1,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,0,90000,34,29239,14027,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000,0.324878,0.155856,0.150656,0.159233,0.166089,0.172767,0.308011,0.139189,0.139544,0.148122,0.154978,0.117211,11.407565,3.526361,10.283293,9.548811,9.51488,9.57025,9.6124,9.651816,7.325808,7.313887,6.908755,6.908755,6.908755,8.517393,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,0,50000,37,46990,48233,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000,0.9398,0.96466,0.98582,0.56628,0.57918,0.59094,0.8998,0.92428,0.96182,0.54428,0.5578,0.57094,10.819778,3.610918,10.757711,10.783819,10.805517,10.251147,10.273671,10.293771,7.601402,7.610853,7.09091,7.003974,6.975414,6.908755,0,1,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
5,0,50000,57,8617,5670,35835,20940,19146,19131,2000,36681,10000,9000,689,679,0.17234,0.1134,0.7167,0.4188,0.38292,0.38262,0.13234,-0.62022,0.5167,0.2388,0.36914,0.36904,10.819778,4.043051,9.061608,8.643121,10.486708,9.949464,9.859901,9.859118,7.601402,10.510041,9.21044,9.105091,6.536692,6.522093,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


### Train test split

In [8]:
# Separate the target column 'y' from the remaining feature columns,
# both as plain NumPy arrays.
y = df['y'].values
X = df.drop('y', axis=1).values

# Hold out 33% of the rows as a test set; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

### Preprocessing

In [14]:
# Transformer configured for degree-2 interaction terms only
# (interaction_only=True) and without a bias column (include_bias=False).
create_interactions = PolynomialFeatures(
    degree=2, interaction_only=True, include_bias=False)

In [17]:
# Fit the interaction transformer on the training features and expand them
# in one step. Only the training split is transformed here.
X_train_interactions = create_interactions.fit_transform(X_train)

In [18]:
# Shape of the raw training features as (rows, columns)
X_train.shape

(20100, 114)

In [19]:
# Shape after adding the interaction terms: same rows, many more columns
X_train_interactions.shape

(20100, 6555)

### GridSearchCV

In [26]:
# Number of worker processes for the parallel searches: leave one core free
# for the rest of the system, but never drop below 1. On a single-core
# machine `cpu_count() - 1` would be 0, which is not a valid n_jobs value.
cores_used = max(1, cpu_count() - 1)
cores_used

3

In [29]:
# Recursive feature elimination with 5-fold cross-validation, scored by
# ROC AUC. step=1 removes one feature per iteration, and every iteration
# refits a 1000-tree XGBoost model, so this search is expensive.
rfecv_instance = RFECV(
    estimator=XGBClassifier(max_depth=4,
                            learning_rate=0.01,
                            n_estimators=1000,
                            objective='binary:logistic'),
    step=1,
    cv=5,
    scoring='roc_auc',
    verbose=100,
    n_jobs=cores_used)

In [30]:
# Run the cross-validated feature elimination on the training split.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt, so
# the run was aborted and `rfecv_instance` was presumably left unfitted.
rfecv_instance.fit(X_train, y_train)

Fitting estimator with 114 features.
Fitting estimator with 114 features.
Fitting estimator with 114 features.
Fitting estimator with 113 features.
Fitting estimator with 113 features.
Fitting estimator with 113 features.
Fitting estimator with 112 features.
Fitting estimator with 112 features.
Fitting estimator with 112 features.
Fitting estimator with 111 features.
Fitting estimator with 111 features.
Fitting estimator with 111 features.
Fitting estimator with 110 features.
Fitting estimator with 110 features.
Fitting estimator with 110 features.
Fitting estimator with 109 features.
Fitting estimator with 109 features.
Fitting estimator with 109 features.
Fitting estimator with 108 features.
Fitting estimator with 108 features.
Fitting estimator with 108 features.
Fitting estimator with 107 features.
Fitting estimator with 107 features.
Fitting estimator with 107 features.
Fitting estimator with 106 features.
Fitting estimator with 106 features.
Fitting estimator with 106 features.
F

Fitting estimator with 38 features.
Fitting estimator with 38 features.
Fitting estimator with 39 features.
Fitting estimator with 37 features.
Fitting estimator with 37 features.
Fitting estimator with 38 features.
Fitting estimator with 36 features.
Fitting estimator with 36 features.
Fitting estimator with 37 features.
Fitting estimator with 35 features.
Fitting estimator with 35 features.
Fitting estimator with 36 features.
Fitting estimator with 34 features.
Fitting estimator with 34 features.
Fitting estimator with 35 features.
Fitting estimator with 33 features.
Fitting estimator with 33 features.
Fitting estimator with 34 features.
Fitting estimator with 32 features.
Fitting estimator with 32 features.
Fitting estimator with 33 features.
Fitting estimator with 31 features.
Fitting estimator with 31 features.
Fitting estimator with 32 features.
Fitting estimator with 30 features.
Fitting estimator with 30 features.
Fitting estimator with 31 features.
Fitting estimator with 29 fe

Fitting estimator with 57 features.
Fitting estimator with 55 features.
Fitting estimator with 56 features.
Fitting estimator with 54 features.
Fitting estimator with 55 features.
Fitting estimator with 53 features.
Fitting estimator with 54 features.
Fitting estimator with 52 features.
Fitting estimator with 53 features.
Fitting estimator with 51 features.
Fitting estimator with 52 features.
Fitting estimator with 50 features.
Fitting estimator with 51 features.
Fitting estimator with 49 features.
Fitting estimator with 50 features.
Fitting estimator with 48 features.
Fitting estimator with 49 features.
Fitting estimator with 47 features.
Fitting estimator with 48 features.
Fitting estimator with 46 features.
Fitting estimator with 47 features.
Fitting estimator with 45 features.
Fitting estimator with 46 features.
Fitting estimator with 44 features.
Fitting estimator with 45 features.
Fitting estimator with 43 features.
Fitting estimator with 44 features.
Fitting estimator with 42 fe

KeyboardInterrupt: 

In [None]:
# 5-fold cross-validated ROC AUC of the XGBoost model on the RFE-selected
# training features.
# NOTE(review): `X_train_rfe` is only assigned in a cell further down this
# notebook (from `selector.transform`), so on a fresh top-to-bottom run this
# cell fails unless that cell has been executed first.
scores_rfe_roc_auc = cross_val_score(
    estimator=XGBClassifier(max_depth=4,
                            learning_rate=0.01,
                            n_estimators=1000,
                            objective='binary:logistic'),
    X=X_train_rfe,
    y=y_train,
    scoring='roc_auc',
    cv=5)

# Average score across the folds
scores_rfe_roc_auc.mean()

In [12]:
# Number of worker processes for the parallel searches: leave one core free
# for the rest of the system, but never drop below 1. On a single-core
# machine `cpu_count() - 1` would be 0, which is not a valid n_jobs value.
cores_used = max(1, cpu_count() - 1)
cores_used

3

In [13]:
# Baseline model: a dummy classifier, searched over two prediction strategies.
baseline_mf = DummyClassifier(strategy='most_frequent', random_state=42)
param_grid = {'strategy': ['most_frequent', 'stratified']}

# 5-fold grid search scored by ROC AUC, run on `cores_used` workers.
grid = GridSearchCV(estimator=baseline_mf,
                    param_grid=param_grid,
                    cv=5,
                    scoring='roc_auc',
                    verbose=0,
                    n_jobs=cores_used)
grid.fit(X_train, y_train)

# Report the winning strategy and its cross-validated score
print("Best params:\n{}\n".format(grid.best_params_))
print("Best cross-validation score: {:.2f}".format(grid.best_score_))

Best params:
{'strategy': 'stratified'}

Best cross-validation score: 0.50


#### XGBoost Gridsearch

In [None]:
# Recursive feature elimination around an XGBoost classifier: drop one
# feature per iteration (step=1) until 20 features remain.
selector = RFE(
    estimator=XGBClassifier(max_depth=4,
                            learning_rate=0.01,
                            n_estimators=1000,
                            objective='binary:logistic'),
    n_features_to_select=20,
    step=1,
    verbose=100)

In [None]:
# Fit the feature selector on the training split only
selector.fit(X_train, y_train)

In [None]:
# Shape of the full training feature matrix, before feature selection
X_train.shape

In [None]:
# Reduce the training features to the columns the fitted selector retained
X_train_rfe = selector.transform(X_train)

In [None]:
# Shape of the reduced training feature matrix
X_train_rfe.shape

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# 5-fold cross-validated ROC AUC of the XGBoost model on the RFE-selected
# training features.
scores_rfe_roc_auc = cross_val_score(
    estimator=XGBClassifier(max_depth=4,
                            learning_rate=0.01,
                            n_estimators=1000,
                            objective='binary:logistic'),
    X=X_train_rfe,
    y=y_train,
    scoring='roc_auc',
    cv=5)

# Average score across the folds
scores_rfe_roc_auc.mean()

In [None]:
# NOTE(review): this looks like the documented signature of cross_val_score
# pasted in as a call. No `estimator` variable is assigned in the cells visible
# here, so running the cell as-is would presumably raise NameError. Confirm,
# then remove it or replace it with a real call.
cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv=None, n_jobs=1, verbose=0, fit_params=None, pre_dispatch='2*n_jobs')

In [None]:
# NOTE(review): scratch assignment. No cell visible here reads a variable named
# `scoring` (the calls pass scoring='roc_auc' as a keyword argument), so this
# is likely safe to delete.
scoring=None

In [None]:
# IPython help lookup (a trailing `?` shows the docstring); not plain Python.
# NOTE(review): leftover from interactive exploration; consider removing it.
cross_val_score?

    >>> clf = svm.SVC(kernel='linear', C=1)
    >>> scores = cross_val_score(clf, iris.data, iris.target, cv=5)
    >>> scores     

In [None]:
# 5-fold grid search, scored by ROC AUC, over the `strategy` values listed in
# `param_grid`.
# The estimator is the bare classifier `baseline_mf`, not the already-built
# `grid` search object: `param_grid` names the classifier's own `strategy`
# parameter, which a GridSearchCV wrapper does not expose under that name.
grid_rfe = GridSearchCV(baseline_mf, param_grid, cv=5,
                        scoring='roc_auc',
                        verbose=100,
                        n_jobs=cores_used)

# Fit the search object created in this cell (rather than refitting `grid`).
# NOTE(review): the `_rfe` suffix suggests this search may have been meant to
# run on the RFE-selected features (`X_train_rfe`); the original training
# matrix is kept here. Confirm which was intended.
grid_rfe.fit(X_train, y_train)

In [None]:

# Build an unfitted XGBoost classifier and display its repr; the instance is
# not assigned to a name.
XGBClassifier(
    max_depth=4,
    learning_rate=0.01,
    n_estimators=1000,
    objective='binary:logistic',
)
              


In [None]:
# Two-step pipeline. The classifier set here is only a placeholder: each
# sub-grid below supplies its own 'classifier', and 'preprocessing' is None
# in every configuration.
pipe = Pipeline(steps=[
    ('preprocessing', None),
    ('classifier', DummyClassifier(strategy='most_frequent',
                                   random_state=42)),
])

# Candidate configurations, given as two independent sub-grids
param_grid = [
    # Baseline: dummy classifier using the 'most_frequent' strategy
    {
        'classifier': [DummyClassifier(strategy='most_frequent',
                                       random_state=42)],
        'preprocessing': [None],
    },

    # XGBoost: tree depths 1-7 x five learning rates, fixed at 1000 trees
    {
        'classifier': [XGBClassifier(objective='binary:logistic')],
        'preprocessing': [None],
        'classifier__max_depth': list(range(1, 8)),
        'classifier__learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.3],
        'classifier__n_estimators': [1000],
    },
]

In [None]:
# Number of worker processes for the parallel searches: leave one core free
# for the rest of the system, but never drop below 1. On a single-core
# machine `cpu_count() - 1` would be 0, which is not a valid n_jobs value.
cores_used = max(1, cpu_count() - 1)
cores_used

In [None]:
# Override the cpu_count()-based value and run the search in a single process.
# NOTE(review): the reason is not recorded. Presumably it avoids parallel
# workers competing with XGBoost's own threads; confirm.
cores_used = 1
cores_used

In [None]:
# Exhaustive 5-fold search over `param_grid`, scored by ROC AUC, run on
# `cores_used` workers; verbose=100 requests detailed progress output.
grid = GridSearchCV(estimator=pipe,
                    param_grid=param_grid,
                    cv=5,
                    scoring='roc_auc',
                    verbose=100,
                    n_jobs=cores_used)
grid.fit(X_train, y_train)

# Report the winning configuration and its cross-validated score
print("Best params:\n{}\n".format(grid.best_params_))
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
# Test-set evaluation is left disabled:
#print("Test-set score: {:.2f}".format(grid.score(X_test, y_test)))

In [None]:
# Best mean cross-validated score found by the search, at full precision
grid.best_score_

In [None]:
# Parameter combination that achieved the best cross-validated score
grid.best_params_

### Try 1 with dropped cols

In [None]:
# Best mean cross-validated score of whichever search `grid` currently holds
grid.best_score_

In [None]:
# Best parameter combination of whichever search `grid` currently holds
grid.best_params_

In [None]:
# Pasted numeric result kept for reference, presumably the `grid.best_score_`
# of this try (TODO confirm). As code, this cell only displays the literal.
0.77904315990602091

    {'classifier': XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
            gamma=0, learning_rate=0.01, max_delta_step=0, max_depth=4,
            min_child_weight=1, missing=None, n_estimators=1000, nthread=-1,
            objective='binary:logistic', reg_alpha=0, reg_lambda=1,
            scale_pos_weight=1, seed=0, silent=True, subsample=1),
     'classifier__learning_rate': 0.01,
     'classifier__max_depth': 4,
     'classifier__n_estimators': 1000,
     'preprocessing': None}

# Try 2



    {'classifier': XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
            gamma=0, learning_rate=0.01, max_delta_step=0, max_depth=4,
            min_child_weight=1, missing=None, n_estimators=1000, nthread=-1,
            objective='binary:logistic', reg_alpha=0, reg_lambda=1,
            scale_pos_weight=1, seed=0, silent=True, subsample=1),
     'classifier__learning_rate': 0.01,
     'classifier__max_depth': 4,
     'classifier__n_estimators': 1000,
     'preprocessing': None}

# Try lower learning rate (note: the grid below keeps the same learning rates and only narrows `max_depth` to 3, 5, 7)

In [None]:
# Two-step pipeline. The classifier set here is only a placeholder: each
# sub-grid below supplies its own 'classifier', and 'preprocessing' is None
# in every configuration.
pipe = Pipeline(steps=[
    ('preprocessing', None),
    ('classifier', DummyClassifier(strategy='most_frequent',
                                   random_state=42)),
])

# Candidate configurations, given as two independent sub-grids
param_grid = [
    # Baseline: dummy classifier using the 'most_frequent' strategy
    {
        'classifier': [DummyClassifier(strategy='most_frequent',
                                       random_state=42)],
        'preprocessing': [None],
    },

    # XGBoost: tree depths 3, 5, 7 x five learning rates, fixed at 1000 trees
    {
        'classifier': [XGBClassifier(objective='binary:logistic')],
        'preprocessing': [None],
        'classifier__max_depth': [3, 5, 7],
        'classifier__learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.3],
        'classifier__n_estimators': [1000],
    },
]

0.77860250373216644

    {'classifier': XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
            gamma=0, learning_rate=0.01, max_delta_step=0, max_depth=5,
            min_child_weight=1, missing=None, n_estimators=1000, nthread=-1,
            objective='binary:logistic', reg_alpha=0, reg_lambda=1,
            scale_pos_weight=1, seed=0, silent=True, subsample=1),
     'classifier__learning_rate': 0.01,
     'classifier__max_depth': 5,
     'classifier__n_estimators': 1000,
     'preprocessing': None}

In [None]:
# Best parameter combination of whichever search `grid` currently holds
grid.best_params_