### Import libraries and modify notebook settings

In [32]:
# Import libraries
import os
import sys
from multiprocessing import cpu_count
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
#from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

# Note: The xgboost package uses an older version of sklearn. 
# When you run "from xgboost.sklearn import XGBClassifier,"
# a DeprecationWarning is raised. You can ignore the warning.

from xgboost.sklearn import XGBClassifier

# Modify notebook settings
%matplotlib inline
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

### Create paths to data folders and files

In [3]:
# Create a variable for the project root directory
proj_root = os.path.join(os.pardir)

# Save the path to the folder that will contain the final,
# processed data: /data/processed
processed_data_dir = os.path.join(proj_root,
                                "data",
                                "processed")

# Save the path to final, processed Instacart data file.
final_csv_name = 'instacart_final.csv'

final_csv_path = os.path.join(processed_data_dir,
                              final_csv_name)

# Save path to the `models` folder, where we will save the
# pickled pipeline and grid search objects
models_folder = os.path.join(proj_root,
                             "models")

### Read in data

In [4]:
df = pd.read_csv(final_csv_path, index_col=0)

`total_buy_ratio_n5` is a linear combination of `total_buy_n5`. Therefore, we drop `total_buy_ratio_n5` from our feature set.

In [5]:
df = df.drop('total_buy_ratio_n5', axis=1)

### Prepare the train set and test set

In [6]:
# Extract X and y from df
X = df.drop('y', axis=1).values
y = df['y'].values

# Train test split
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.3, random_state=42)

In [7]:
# Check the number of samples in each set.
print('{:,}'.format(len(y)))
print('{:,}'.format(len(y_train)))
print('{:,}'.format(len(y_test)))

13,307,953
9,315,567
3,992,386


## Create pipeline

In [8]:
# Create pipeline
pipe = Pipeline([('sampler', SMOTE()),
                 ('classifier', XGBClassifier())])

In [9]:
# Create parameter grid
param_grid = [
    {'classifier': [XGBClassifier(objective='binary:logistic',
                                  n_estimators=100)], 
     'sampler': [None, SMOTE(random_state=42)],
     'classifier__n_estimators': [100],
     'classifier__learning_rate': [0.1],
     'classifier__gamma': [0.01],
     'classifier__max_delta_step': [0, 1],
     'classifier__max_depth': [3, 5],
     'classifier__subsample': [1],
     'classifier__reg_lambda': [1.0],
     'classifier__reg_alpha': [0.1]}]

In [10]:
# Set the number of cores to be used
cores_used = cpu_count() - 1
cores_used
cores_used = 1

In [11]:
# Set verbosity
verbosity = 1

# Execute Grid search
grid = GridSearchCV(pipe, param_grid, cv=3, scoring='roc_auc',
                    verbose=verbosity, n_jobs=cores_used)

grid.fit(X_train, y_train)

print("Best params:\n{}\n".format(grid.best_params_))
print("Best cross-validation ROC AUC score: {:.2f}".format(grid.best_score_))

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed: 164.7min finished


Best params:
{'classifier': XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0.01, learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0.1, reg_lambda=1.0,
       scale_pos_weight=1, seed=0, silent=True, subsample=1), 'classifier__gamma': 0.01, 'classifier__learning_rate': 0.1, 'classifier__max_delta_step': 0, 'classifier__max_depth': 5, 'classifier__n_estimators': 100, 'classifier__reg_alpha': 0.1, 'classifier__reg_lambda': 1.0, 'classifier__subsample': 1, 'sampler': None}

Best cross-validation ROC AUC score: 0.79


#### Save the grid search object as a pickle file

In [12]:
full_gridsearch_file_name = 'gridsearch_pickle.pkl'

full_gridsearch_path = os.path.join(models_folder,
                                    full_gridsearch_file_name)

joblib.dump(grid, full_gridsearch_path)

['../models/gridsearch_pickle.pkl']

In [13]:
best_pipeline_file_name = 'pipeline_pickle.pkl'

best_pipeline_path = os.path.join(models_folder, 
                                  best_pipeline_file_name)

joblib.dump(grid.best_estimator_, best_pipeline_path)

['../models/pipeline_pickle.pkl']

#### Search for optimal decision threshold
We want to find a decision threshold that will maximize the f1 score of our classifier.

In [18]:
clf = grid.best_estimator_

In [38]:
def f1_by_threshold(threshold, y, x, clf):
    f1_val = f1_score(y,
                      (clf.predict_proba(x)[:,1] < \
                       threshold).astype(int))
    
    if f1_val == 0:
        f1_val = np.nan
        
    return f1_val

In [39]:
# Threshold values over which we will search
thresholds = np.arange(1,10) / 10
thresholds

kf = KFold(n_splits=3)#5)
kf.get_n_splits(X_train)

best_thresholds = np.zeros(kf.n_splits)
best_f1s = np.zeros(kf.n_splits)

for i, (kf_train_index, kf_test_index) in enumerate(kf.split(X_train)):
    print("Processing fold {}...".format(str(i+1)))
    X_kf_train = X_train[kf_train_index,:]
    X_kf_test = X_train[kf_test_index]
    y_kf_train = y_train[kf_train_index]
    y_kf_test = y_train[kf_test_index]    
    
    clf.fit(X_kf_train, y_kf_train)
    
    f1_vals = np.zeros(len(thresholds))
    
    for (j, t) in enumerate(thresholds):
        
        #f1_vals[j] = f1_by_threshold(t)
        
        f1_vals[j] = f1_by_threshold(t, y_kf_test, X_kf_test, clf)
    
    best_thresholds[i] = thresholds[np.nanargmax(f1_vals)]
    best_f1s[i] = np.nanmax(f1_vals)

best_threshold = best_thresholds.mean()
best_f1s_mean = best_f1s.mean()

print("Best threshold:")
print("Mean best cross-validated threshold:\t{:.2f}".format(best_threshold))
print("Mean best cross-validated f1 score:\t{:.2f}".format(best_f1s_mean))

Processing fold 1...
Processing fold 2...
Processing fold 3...
Best threshold:
Mean best cross-validated threshold:	0.60
Mean best cross-validated f1 score:	0.12


### Use the optimal score to try classifier on the test set

In [40]:
clf = grid.best_estimator_

test_f1_score = f1_by_threshold(best_threshold, y_test, X_test, clf)

In [49]:
test_roc_auc_score = roc_auc_score(y_test,
                                   clf.predict_proba(X_test)[:,1])

In [53]:
print("Best cross-validation ROC AUC score:\t{:.2f}".format(grid.best_score_))
print("Test ROC AUC score:\t\t\t{:.2f}".format(test_roc_auc_score))
print("Mean best cross-validated f1 score:\t{:.2f}".format(best_f1s_mean))
print("Test f1 score:\t\t\t\t{:.2f}".format(test_f1_score))

Best cross-validation ROC AUC score:	0.79
Test ROC AUC score:			0.78
Mean best cross-validated f1 score:	0.12
Test f1 score:				0.12
