<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Adjusting-threshold" data-toc-modified-id="Adjusting-threshold-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Adjusting threshold</a></span><ul class="toc-item"><li><span><a href="#Random-Forests" data-toc-modified-id="Random-Forests-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Random Forests</a></span></li><li><span><a href="#Load-other-models-and-make-predictions" data-toc-modified-id="Load-other-models-and-make-predictions-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Load other models and make predictions</a></span></li><li><span><a href="#Stacking" data-toc-modified-id="Stacking-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Stacking</a></span><ul class="toc-item"><li><span><a href="#Logistic-regression" data-toc-modified-id="Logistic-regression-1.3.1"><span class="toc-item-num">1.3.1&nbsp;&nbsp;</span>Logistic regression</a></span></li><li><span><a href="#Random-forests" data-toc-modified-id="Random-forests-1.3.2"><span class="toc-item-num">1.3.2&nbsp;&nbsp;</span>Random forests</a></span></li></ul></li><li><span><a href="#Adjust-the-decision-threshold" data-toc-modified-id="Adjust-the-decision-threshold-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Adjust the decision threshold</a></span><ul class="toc-item"><li><span><a href="#Linear-regression" data-toc-modified-id="Linear-regression-1.4.1"><span class="toc-item-num">1.4.1&nbsp;&nbsp;</span>Linear regression</a></span></li><li><span><a href="#SVM" data-toc-modified-id="SVM-1.4.2"><span class="toc-item-num">1.4.2&nbsp;&nbsp;</span>SVM</a></span></li><li><span><a href="#Other" data-toc-modified-id="Other-1.4.3"><span class="toc-item-num">1.4.3&nbsp;&nbsp;</span>Other</a></span></li></ul></li></ul></li><li><span><a href="#XGBoost" data-toc-modified-id="XGBoost-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>XGBoost</a></span></li></ul></div>

In [1]:
import pdb 
import glob
import copy
import math
import pickle

import numpy as np
import pandas as pd
import scipy as sp

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
 
import missingno  # for visualizing missing data
 
import xgboost as xgb

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, \
    GridSearchCV, ShuffleSplit

from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression, \
    LogisticRegressionCV, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, classification_report, \
    precision_score, recall_score, \
    precision_recall_curve, average_precision_score, f1_score, \
    roc_curve, auc, roc_auc_score, make_scorer,\
    accuracy_score, balanced_accuracy_score

from sklearn.externals import joblib
from sklearn.utils import resample
from sklearn.utils.fixes import signature


# Set up pandas table display (wider text output, up to 100 columns shown,
# HTML rendering of data frames in the notebook)
pd.set_option('display.width', 120)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)

# Set plotting options
sns.set() # Use seaborn defaults for plotting
%matplotlib inline 


# Number of CPU cores to use. This value is passed to the estimators below
# as n_jobs / nthread; adjust it to the machine at hand.
n_jobs=1

In [2]:
# Load the preprocessed "small" training and test sets (features and targets)
# from disk. Feature names are NOT loaded: that line is disabled below.
# NOTE: joblib.load unpickles the file, so only load files from a trusted source.
X_train_small = joblib.load('data_processed/X_train_small.joblib')
X_test_small = joblib.load('data_processed/X_test_small.joblib')
y_train_small = joblib.load('data_processed/y_train_small.joblib')
y_test_small = joblib.load('data_processed/y_test_small.joblib')
# feature_names_small = joblib.load('data_processed/feature_names_small')

In [3]:
# Dictionaries to store results for SMALL data set
# NOTE(review): none of these is filled in by the cells visible in this
# notebook section - confirm they are still needed.
average_precision_1 = {}
classification_reports_1 = {}
most_important_features_1 = {} 

### Adjusting threshold

#### Random Forests

In [4]:
# Baseline random forest (100 trees, fixed seed for reproducibility).
# class_weight='balanced_subsample' re-weights the classes based on each
# tree's bootstrap sample.
rf = RandomForestClassifier(n_estimators=100, 
        oob_score=False, n_jobs=n_jobs, random_state=1,
        class_weight='balanced_subsample')
rf.fit(X_train_small, y_train_small)

RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=1,
            verbose=0, warm_start=False)

#### Load other models and make predictions

In [5]:
# Load previously fitted models from disk.
# Judging by the file names these are grid-searched logistic regression and
# linear / RBF / polynomial SVM models - TODO confirm against the notebook
# that saved them.
# NOTE: joblib.load unpickles the file, so only load files from a trusted source.
lr_gs_1 = joblib.load('saved_models/lr_gs_1.joblib')
svm_lin_gs_1 = joblib.load('saved_models/svm_lin_gs_1.joblib')
svm_rbf_gs_1 = joblib.load('saved_models/svm_rbf_gs_1.joblib')
svm_poly_gs_1 = joblib.load('saved_models/svm_poly_gs_1.joblib')

In [6]:
# # Split the test set into validation set and proper test set for stacking
# X_val, X_t, y_val, y_t = train_test_split(
#     X_test_small, y_test_small, train_size=0.5, test_size=0.5,
#     stratify=y_test_small, random_state=1)

In [7]:
## Predicted probability or distance from separating hyperplane 
# (column 1 of predict_proba for rf / lr, decision_function for the SVMs)

# For TRAINING data:
# NOTE(review): rf was fitted on X_train_small above, so its scores here are
# in-sample predictions (presumably the same holds for the loaded models -
# TODO confirm). Used as stacking features they look far more informative
# than they will be on unseen data; out-of-fold predictions (e.g. via
# sklearn.model_selection.cross_val_predict) would avoid this.
y_proba_rf_tr = rf.predict_proba(X_train_small)[:, 1]
y_proba_lr_tr = lr_gs_1.predict_proba(X_train_small)[:, 1]
y_dist_svm_lin_tr = svm_lin_gs_1.decision_function(X_train_small)
y_dist_svm_rbf_tr = svm_rbf_gs_1.decision_function(X_train_small)
y_dist_svm_poly_tr = svm_poly_gs_1.decision_function(X_train_small)

In [8]:
# For TEST data:
# Same scores as for the training data: column 1 of predict_proba for the
# random forest and lr_gs_1, decision_function values for the three SVMs.
y_proba_rf_1 = rf.predict_proba(X_test_small)[:, 1]
y_proba_lr_1 = lr_gs_1.predict_proba(X_test_small)[:, 1]
y_dist_svm_lin_1 = svm_lin_gs_1.decision_function(X_test_small)
y_dist_svm_rbf_1 = svm_rbf_gs_1.decision_function(X_test_small)
y_dist_svm_poly_1 = svm_poly_gs_1.decision_function(X_test_small)

#### Stacking

In [9]:
## Add predictions as additional feature
# For TRAINING data
X_train_small_stack = np.hstack(
   (X_train_small,
    y_proba_rf_tr[:, np.newaxis],
    y_proba_lr_tr[:, np.newaxis],
    y_dist_svm_lin_tr[:, np.newaxis],
    y_dist_svm_poly_tr[:, np.newaxis],
    y_dist_svm_poly_tr[:, np.newaxis]))

In [10]:
# For TEST data
X_test_small_stack = np.hstack(
   (X_test_small,
    y_proba_rf_1[:, np.newaxis],
    y_proba_lr_1[:, np.newaxis],
    y_dist_svm_lin_1[:, np.newaxis],
    y_dist_svm_poly_1[:, np.newaxis],
    y_dist_svm_poly_1[:, np.newaxis]))

##### Logistic regression

In [16]:
# Fit stacked model to training data
# Meta-learner: logistic regression with built-in cross-validation over a
# grid of 10 values for C (10 folds, scored by average precision), using
# balanced class weights. It sees the original features plus the appended
# base-model predictions.
stacker_lr = LogisticRegressionCV(Cs=10, class_weight='balanced', cv=10,
                                  scoring='average_precision',
                                   max_iter=10000)
stacker_lr.fit(X_train_small_stack, y_train_small)

# Make predictions with stacked model for test data
y_stack_lr = stacker_lr.predict_proba(X_test_small_stack)[:, 1]

# Average precision of the stacked model on the test set
average_precision_score(y_test_small, y_stack_lr)

0.350441783918724

In [28]:
# View coefficients from individual learners
# (the last 5 columns of the stacked matrix are the appended base-model
# predictions)
stacker_lr.coef_[:,-5:]

array([[10.28210531,  0.02022592,  0.05572553,  0.0443575 ,  0.0443575 ]])

In [34]:
# View most important coefficients
# The values are sorted ascending by their signed value, so the coefficients
# with the largest magnitude sit at both ends of the listing. The index is
# the column position in the stacked feature matrix.
pd.Series(stacker_lr.coef_.flatten()) \
    .sort_values()

83     -0.095091
84     -0.089276
7      -0.079436
16     -0.064119
105    -0.055367
53     -0.049490
15     -0.046306
43     -0.040108
48     -0.039527
9      -0.031394
115    -0.031268
98     -0.028940
95     -0.028152
42     -0.026458
67     -0.025667
18     -0.024552
74     -0.024404
57     -0.023492
49     -0.022145
31     -0.020425
61     -0.019768
116    -0.018894
52     -0.018333
3      -0.018222
81     -0.017477
113    -0.017167
69     -0.016378
47     -0.015466
44     -0.014811
64     -0.013956
         ...    
58      0.015882
111     0.016119
0       0.016336
72      0.016427
20      0.017826
24      0.018468
78      0.019052
89      0.019080
60      0.019684
120     0.020226
59      0.020955
26      0.022169
71      0.022821
28      0.025442
99      0.028855
39      0.031170
91      0.031838
6       0.031891
11      0.035510
122     0.044357
123     0.044357
19      0.044403
100     0.047347
14      0.048003
121     0.055726
90      0.057813
21      0.057827
93      0.0895

In [18]:
# Sanity check: the stacked test matrix should have 5 more columns than
# X_test_small (one per base model).
X_test_small_stack.shape

(10000, 124)

In [19]:
# Shape of the original test matrix, for comparison with the stacked one
X_test_small.shape

(10000, 119)

##### Random forests

In [15]:
# Fit stacked model to training data
# Meta-learner: random forest with the same settings as the baseline `rf`,
# but trained on the stacked matrix (original features + base-model
# predictions).
stacker_rf = RandomForestClassifier(n_estimators=100, 
        oob_score=False, n_jobs=n_jobs, random_state=1,
        class_weight='balanced_subsample')
stacker_rf.fit(X_train_small_stack, y_train_small)

# Make predictions with stacked model for test data
y_stack_rf = stacker_rf.predict_proba(X_test_small_stack)[:, 1]

# Average precision of the stacked model on the test set
average_precision_score(y_test_small, y_stack_rf)

0.3187370524374364

In [14]:
# Average precision of each individual base model on the test set, for
# comparison with the stacked models.
# Printed in this order: lr, rf, svm_lin, svm_rbf, svm_poly.
predictions = [y_proba_lr_1, y_proba_rf_1, y_dist_svm_lin_1, 
               y_dist_svm_rbf_1, y_dist_svm_poly_1]
for prediction in predictions:
    print(average_precision_score(y_test_small, prediction))

0.3754287919433627
0.3408734704057727
0.37711824602853794
0.38893719706733143
0.3809588272345418


In [None]:
## Make prediction with stacked model for test set
# First we need the predictions of the individual models
# NOTE(review): X_t is only created by the validation/test split cell near
# the top of the notebook, which is currently commented out, so on a fresh
# kernel this cell raises NameError. It looks like a leftover of an earlier
# stacking approach - confirm whether it is still needed.
y_proba_rf_te = rf.predict_proba(X_t)[:, 1]
y_proba_lr_te = lr_gs_1.predict_proba(X_t)[:, 1]
y_dist_svm_lin_te = svm_lin_gs_1.decision_function(X_t)
y_dist_svm_rbf_te = svm_rbf_gs_1.decision_function(X_t)
y_dist_svm_poly_te = svm_poly_gs_1.decision_function(X_t)

In [None]:
# Concatenate the base-model predictions for the test set, one column per
# model. The variable names match the `*_te` arrays computed in the cell
# above (the previous `*_t` names were never defined), and the 'svm_rbf'
# column now holds the RBF-SVM scores instead of a copy of the polynomial
# ones.
X_stack_te = pd.DataFrame({
    'rf': y_proba_rf_te,
    'lr': y_proba_lr_te,
    'svm_lin': y_dist_svm_lin_te,
    'svm_rbf': y_dist_svm_rbf_te,
    'svm_poly': y_dist_svm_poly_te})

In [None]:
# NOTE(review): stacker_lr was fitted on X_train_small_stack (original
# features plus 5 prediction columns), while X_t would come from the
# commented-out split of X_test_small and so presumably lacks those columns.
# As written this likely fails (undefined X_t / feature-count mismatch) -
# TODO confirm the intended input.
y_st_lr = stacker_lr.predict(X_t)

#### Adjust the decision threshold

In [None]:
# Define function to make a prediction with custom threshold
def custom_prediction(proba_or_dist, threshold):
    """
    Makes predictions for binary classification from probabilities or distance
    to separating hyperplane, given a custom threshold.

    Parameters
    ----------
    proba_or_dist: array-like
        Probabilities or distances from the separating hyperplane. NumPy
        arrays and pandas Series are used as they are (so a Series comes back
        as a Series); plain sequences such as lists or tuples are converted
        to a NumPy array first.
    threshold: float
        Observations with a value strictly greater than this threshold are
        assigned to class 1, all others to class 0.

    Returns
    -------
        Integer array (or Series) of 0/1 predictions with the same shape as
        the input.
    """

    # Lists and tuples support neither element-wise comparison with a scalar
    # nor ``astype``; convert them. Objects that already provide ``astype``
    # are left untouched so that their return type does not change.
    if not hasattr(proba_or_dist, 'astype'):
        proba_or_dist = np.asarray(proba_or_dist)

    return (proba_or_dist > threshold).astype(int)


# Define function to plot effect of threshold on precision and recall
def plot_threshold(proba_or_dist, y_true, thresholds=None):
    """
    Plot precision and recall as a function of decision threshold for an array of
    probabilities or distances to separating hyperplane.

    If no custom list of thresholds is supplied, it defaults to a grid of length 
    100, spanning between the minimum and maximum probability/distance. (In
    order to avoid thresholds that assign all observations to a single class, 
    the smallest and largest thresholds are offset by 0.01 from the minimum and 
    maximum probability/distance.)

    Parameters
    ----------
    proba_or_dist: array-like
        Probabilities or distance from separating hyperplane. Converted to a
        NumPy array internally, so lists and pandas Series are accepted too.
    y_true: array-like
        True values of target variable.
    thresholds: list-like
        Custom thresholds. Optional.  
        
    Returns
    -------
        None
    """

    # Work on a NumPy array so that .min()/.max() and the element-wise
    # comparison also work for plain Python sequences.
    scores = np.asarray(proba_or_dist)

    # If no custom thresholds were passed, make grid between minimum and maximum
    # values for probability or distance.
    if thresholds is None:
        thresholds = np.linspace(scores.min() + 0.01,
                                 scores.max() - 0.01, 100)

    # Precision and recall per threshold. The predictions themselves are only
    # needed inside one iteration, so they are not kept around.
    precisions = {}
    recalls = {}
    for threshold in thresholds:
        # Classify as 1 if probability/distance is greater than threshold, else 0
        y_pred = custom_prediction(scores, threshold)

        precisions[threshold] = precision_score(y_true, y_pred)
        recalls[threshold] = recall_score(y_true, y_pred)

    # Combine precision and recall into a data frame, indexed by threshold
    pr_rec = pd.DataFrame({'precision': precisions, 'recall': recalls})

    # Plot results (threshold on the x axis, one line per metric)
    ax = pr_rec.plot()
    ax.set_title('Effect of threshold on Precision and Recall')
    ax.set_xlabel('Threshold')
    ax.set_ylabel('Precision\nRecall')
    plt.show()

##### Linear regression

In [None]:
# Plot effect of the probability threshold on precision and recall for the
# logistic regression model (y_proba_lr_1 holds its predicted probabilities
# on the test set)
plot_threshold(proba_or_dist=y_proba_lr_1, y_true=y_test_small)

In [None]:
# Classify with a custom probability threshold of 0.4 and show the resulting
# per-class precision / recall / F1 on the test set.
y_lr_custom = custom_prediction(y_proba_lr_1, 0.4)
print(classification_report(y_test_small, y_lr_custom))

In [None]:
y_lr_custom = custom_prediction(y_proba_lr_1, 0.4)
def plot_confusion_matrix(y_test, y_pred, digits=3):
    """
    Print the confusion matrix as proportions of all observations.

    Rows are the actual classes, columns the predicted classes; in both
    cases class 1 is labelled 'True' and class 0 'False'. Each cell is the
    count divided by the number of observations, rounded to `digits`
    decimals. Despite the name, nothing is plotted: the table is printed.
    """
    class_labels = ['True', 'False']
    counts = confusion_matrix(y_test, y_pred, labels=[1,0])
    table = pd.DataFrame(counts, index=class_labels, columns=class_labels)
    table.index.name = 'Actual'
    table.columns.name = 'Predicted'
    shares = table / len(y_test)
    print(shares.round(digits))

# Show the confusion matrix for the custom-threshold predictions
plot_confusion_matrix(y_test_small, y_lr_custom)

##### SVM

In [None]:
# Plot effect of the decision threshold on precision and recall for the
# linear SVM (y_dist_svm_lin_1 holds its decision_function values on the
# test set, i.e. signed distances rather than probabilities)
plot_threshold(proba_or_dist=y_dist_svm_lin_1, y_true=y_test_small)

In [None]:
# Classify with a custom decision threshold of -0.5 and show the resulting
# per-class precision / recall / F1 on the test set.
y_svm_lin_custom = custom_prediction(y_dist_svm_lin_1, -0.5)
print(classification_report(y_test_small, y_svm_lin_custom))

In [None]:
# NOTE(review): the threshold here is -0.4, whereas the classification report
# in the previous cell uses -0.5 - confirm which value is intended so that
# both summaries describe the same predictions.
y_svm_lin_custom = custom_prediction(y_dist_svm_lin_1, -0.4)

# Show the confusion matrix. plot_confusion_matrix is already defined earlier
# in this notebook, so the identical second definition that used to live in
# this cell has been dropped.
plot_confusion_matrix(y_test_small, y_svm_lin_custom)

##### Other

In [None]:
#     def plot_roc(y_test, y_pred, model_name=None):
#         false_positive_rate, true_positive_rate, thresholds = \
#             roc_curve(y_test, y_pred)
#         roc_auc = auc(false_positive_rate, true_positive_rate)
#         plt.plot(false_positive_rate, true_positive_rate, 'b',
#                  label='AUC = {0:.3f}'.format(roc_auc))
#         plt.legend(loc='lower right')
#         plt.plot([0, 1],[0, 1], 'r--')
#         plt.xlabel('False Positive Rate')
#         plt.ylabel('True Positive Rate')
#         title='ROC Curve'
#         # Add custom title, if specified
#         if model_name is not None:
#             title = ', '.join([title, model_name])
#         plt.title(title)
#         plt.show();

#     # Plot ROC curve for random forests
#     y_proba_rf = rf.predict_proba(X_test_p)[:, 1]
#     plot_roc(y_test, y_proba_rf, 'Random Forests')


#     def plot_precision_recall(y_test, y_pred):
#         """Plots precision-recall curve."""

#         average_precision = average_precision_score(y_test, y_pred)
#         precision, recall, _ = precision_recall_curve(y_test, y_pred)
#         # pdb.set_trace()
#         step_kwargs = ({'step': 'post'}
#                        if 'step' in signature(plt.fill_between).parameters
#                        else {})
#         plt.step(recall, precision, color='b', alpha=0.2,
#                  where='post')
#         plt.figtext(0.2, 0.2, 'Average Precision={0:0.3f}' \
#                                         .format(average_precision))
#         plt.fill_between(recall, precision, alpha=0.2, color='b', **step_kwargs)

#         plt.xlabel('Recall')
#         plt.ylabel('Precision')
#         plt.ylim([0.0, 1.05])
#         plt.xlim([0.0, 1.0])
#         plt.title('Precision-Recall curve')
#         plt.show();

#     # Plot precision recall curve for random forests classifier
#     plot_precision_recall(y_test, y_proba_rf)

#     # Get a list of feature names
#     cat_names = preprocessor.named_transformers_['cat'] \
#                     .named_steps['onehot'].get_feature_names()
#     feature_names = list(numeric_features) + list(cat_names)
#     # Compute feature importance and sort
#     feature_importances = pd.Series(
#                                 rf.feature_importances_,
#                                 index=feature_names) \
#                             .sort_values(ascending=False)
#     print(feature_importances)

### XGBoost

In [None]:
# Wrap the training features and labels in a DMatrix, the data container
# used by xgboost's native API (consumed by xgb.cv further below).
data_xgb_train = xgb.DMatrix(data=X_train_small, label=y_train_small)
# X_train_small_dmat = xgb.DMatrix(X_train_small)
# X_test_small_dmat = xgb.DMatrix(X_test_small)
# y_train_small_dmat = xgb.DMatrix(y_train_small[:, np.newaxis])
# y_test_small_dmat = xgb.DMatrix(y_test_small)

In [None]:
# # specify parameters via map
# param = {'objective':'binary:logistic', 'eval_metric':'map',
#          'scale_pos_weight':5  # Balance class weight
#          'seed':0}
# num_round = 2

# train(param, X_train_small, num_round)
# y_pred = xgb_.predict(X_test_small)

In [None]:
# Base XGBoost classifier; its hyper-parameters are tuned by the grid search
# below.
xgb_ = xgb.XGBClassifier(objective='binary:logistic', eval_metric='map',
                         scale_pos_weight=5,  # Balance class weight
                         seed=0, nthread=n_jobs)
# xgb_.fit(X_train_small, y_train_small)

# Parameters to search over
# TODO(review): the candidate values for 'min_child_weight' and 'gamma' were
# left blank in the original cell; the values below are placeholders chosen
# to make the cell runnable - adjust as needed.
param_grid = {
    'max_depth': [3, 5, 7, 10],  # Control complexity
    'min_child_weight': [1, 3, 5],  # The higher, the more regularization
    'gamma': [0, 0.1, 0.2],  # Higher value leads to fewer splits (i.e. more regularization)
    'subsample': [0.5, 0.75, 1],  # Fraction of observations per tree
    'colsample_bytree': [0.5, 0.75, 1]}  # Fraction of features per tree
# Grid search (the estimator is `xgb_` defined above; `xgb_1` never existed)
xgb_gs_1 = GridSearchCV(xgb_, param_grid=param_grid,
                       scoring='average_precision',
                       return_train_score=True,
                       n_jobs=n_jobs, cv=3, verbose=5)
xgb_gs_1.fit(X_train_small, y_train_small)

# Save model
joblib.dump(xgb_gs_1, 'saved_models/xgb_gs_1.joblib')

# Predictions
# Use the fitted grid search (which by default refits the best estimator on
# the whole training set) rather than `xgb_`, which itself is never fitted.
y_proba_xgb = xgb_gs_1.predict_proba(X_test_small)[:, 1]
average_precision_score(y_test_small, y_proba_xgb)

In [None]:
# XGBoost classifier with fixed, hand-picked hyper-parameters.
# NOTE(review): the original cell called an undefined helper
# `modelfit(xgb1, train, predictors)` with undefined `train` / `predictors`
# and used the bare name `XGBClassifier`, which is not imported (only
# `xgboost as xgb` is). The model is now created via `xgb.XGBClassifier` and
# fitted directly on the small training set - confirm this matches the
# original intent.
xgb1 = xgb.XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=n_jobs,  # use the notebook-wide CPU setting instead of a hard-coded 4
    scale_pos_weight=1,
    seed=27)
xgb1.fit(X_train_small, y_train_small)

In [None]:
# 3-fold cross-validation with xgboost's native API for 50 boosting rounds.
# With as_pandas=True the result is requested as a pandas data frame of
# per-round evaluation results, not a fitted model.
params = {'objective':'binary:logistic', 'eval_metric':'map',
         'scale_pos_weight':5,  # Balance class weight
         'seed':0}
xgb_cv = xgb.cv(dtrain=data_xgb_train, params=params, nfold=3,
                num_boost_round=50, as_pandas=True)

In [None]:
# xgb.cv only returns the per-round evaluation history (a data frame), which
# has no predict method. To score the test set, train a booster with the same
# parameters and as many boosting rounds as the CV run recorded, then predict
# on a DMatrix built from the test features.
booster_cv = xgb.train(params=params, dtrain=data_xgb_train,
                       num_boost_round=len(xgb_cv))
y_xgb_cv = booster_cv.predict(xgb.DMatrix(X_test_small))
average_precision_score(y_test_small, y_xgb_cv)

In [None]:
# Display the cross-validation results returned by xgb.cv
xgb_cv