<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Preprocessing" data-toc-modified-id="Preprocessing-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Preprocessing</a></span></li><li><span><a href="#Predictive-models" data-toc-modified-id="Predictive-models-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Predictive models</a></span><ul class="toc-item"><li><span><a href="#Random-Forests" data-toc-modified-id="Random-Forests-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Random Forests</a></span></li><li><span><a href="#Logistic-regression" data-toc-modified-id="Logistic-regression-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Logistic regression</a></span></li><li><span><a href="#Support-Vector-Machine" data-toc-modified-id="Support-Vector-Machine-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Support Vector Machine</a></span><ul class="toc-item"><li><span><a href="#Linear-SVM" data-toc-modified-id="Linear-SVM-2.3.1"><span class="toc-item-num">2.3.1&nbsp;&nbsp;</span>Linear SVM</a></span></li><li><span><a href="#SVM-with-RBF-Kernel" data-toc-modified-id="SVM-with-RBF-Kernel-2.3.2"><span class="toc-item-num">2.3.2&nbsp;&nbsp;</span>SVM with RBF-Kernel</a></span></li><li><span><a href="#SVM-with-polynomial-kernel" data-toc-modified-id="SVM-with-polynomial-kernel-2.3.3"><span class="toc-item-num">2.3.3&nbsp;&nbsp;</span>SVM with polynomial kernel</a></span></li></ul></li></ul></li><li><span><a href="#Performance-comparison" data-toc-modified-id="Performance-comparison-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Performance comparison</a></span></li></ul></div>

In [62]:
import pdb 
import glob
import copy
import math
import pickle

import numpy as np
import pandas as pd
import scipy as sp

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
 
import missingno  # for visualizing missing data

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV, ShuffleSplit

from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression, \
    LogisticRegressionCV, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, classification_report, \
    precision_recall_curve, average_precision_score, f1_score, \
    roc_curve, auc, roc_auc_score, make_scorer,\
    accuracy_score, balanced_accuracy_score

from sklearn.externals import joblib
from sklearn.utils import resample
from sklearn.utils.fixes import signature


# Set up pandas table display
pd.set_option('display.width', 120)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)

# Set plotting options
sns.set() # Use seaborn defaults for plotting
%matplotlib inline 

In [61]:
# Adjust number of CPU cores to use
n_jobs=2

In [31]:
# Load original data
with open('data_processed/all_data.pickle', 'rb') as pickled_file: 
    all_data = pickle.load(pickled_file) 

# TEMPORARY:
# ---------
# Reduce number of observations to speed up computations
n_samples=100000
all_data = resample(all_data, replace=False, 
                    n_samples=n_samples, random_state=1)


## Preprocessing

In [58]:
# Find categorical variables with more than 50 unique values and drop them
unique_values_cat = all_data.select_dtypes(include='object') \
                        .nunique() \
                        .sort_values(ascending=False) 
# Drop categorical variables with more than 50 categories
all_data = all_data.drop(unique_values_cat[unique_values_cat > 50].index,
                 axis='columns')

# Train-test split (for now only select 1% for training)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = \
    train_test_split(all_data.drop('default', axis='columns'),
                     all_data.default,
#                      train_size=0.9,
                     train_size=20000, test_size=1000,
                     random_state=1,
                     shuffle=True, stratify=all_data.default) 

# Imputation and standardization for numeric features
numeric_features = X_train.select_dtypes(include=[np.number]).columns
numeric_transformer = Pipeline(steps =[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())]) 

# Imputation and one-hot encoding for categorical features
categorical_features = X_train.select_dtypes(include=[object]).columns
categorical_transformer = Pipeline(steps =[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combining preprocessing for both kinds of features
# (Features of other dtypes – in our case, boolean – will be
# appended at the end without transformation.)
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric_transformer', 
             numeric_transformer, numeric_features),
        ('categorical_transformer', 
             categorical_transformer, categorical_features)],
    remainder='passthrough', n_jobs=n_jobs)

# Print dtypes of untransformed data
print('data types of columns that were not transformed:\n {}'
        .format(X_train.select_dtypes(exclude=[np.number, object]) \
                .dtypes.unique()))

# Apply preprocessing
X_train_p = preprocessor.fit_transform(X_train)
X_test_p = preprocessor.transform(X_test)

data types of columns that were not transformed:
 [dtype('bool')]


## Predictive models

In [6]:
# Dictionaries to store results
average_precision = {}
classification_reports = {}
most_important_features = {}

### Random Forests

In [59]:
rf = RandomForestClassifier(n_estimators=100, 
        oob_score=False, n_jobs=n_jobs, random_state=1,
        class_weight='balanced_subsample')
rf.fit(X_train_p, y_train)

# Predictions of class and probability
y_pred_rf = rf.predict(X_test_p) 
y_pred_proba_rf = rf.predict_proba(X_test_p)[:, 1]

# Save results
average_precision['random forests'] = \
    average_precision_score(y_test, y_pred_proba_rf)
classification_reports['random forests'] = \
    classification_report(y_test, y_pred_rf)

# Save most important features
# First get a list of feature names for each dtype
categorical_names = preprocessor \
    .named_transformers_['categorical_transformer'] \
    .named_steps['onehot'] \
    .get_feature_names()
other_names = X_train \
    .select_dtypes(exclude=[np.number, object]) \
    .columns
# Concatenate feature names
feature_names = \
    list(numeric_features) + list(categorical_names) + \
        list(other_names) 
# Compute feature importance and sort
most_important_features['random forests'] = \
    pd.Series(rf.feature_importances_, index=feature_names) \
            .sort_values(ascending=False) \
            .iloc[: 10]

### Logistic regression

In [52]:
# Logistic regression (Elastic Net) 
# --------------------------------
lr = SGDClassifier(loss='log', penalty='elasticnet', 
                      class_weight='balanced', 
                      max_iter=1000, tol=1E-3, # those are defaults for sklearn 0.21+
                      random_state=1, n_jobs=n_jobs) 

# Parameters to search over
param_grid = {'l1_ratio': np.linspace(0, 1, 3),
              'alpha': np.logspace(-10, 1, 3)}
# Grid search
lr_gs = GridSearchCV(lr, param_grid=param_grid, 
                     scoring='average_precision',
                     return_train_score=True,
                     n_jobs=n_jobs, cv=3)
lr_gs.fit(X_train_p, y_train) 

# Predictions
y_pred_lr = lr_gs.predict(X_test_p)    
y_pred_proba_lr = lr_gs.predict_proba(X_test_p)[:, 1]

In [53]:
# Save results
average_precision['logistic regression'] = \
    average_precision_score(y_test, y_pred_proba_lr)
classification_reports['logistic regression'] = \
    classification_report(y_test, y_pred_lr)

# Compute feature importance and sort
most_important_features['logistic regression'] = \
    pd.Series(lr_gs.best_estimator_.coef_[0], 
          index=feature_names) \
    .sort_values(ascending=False) \
    .iloc[: 10] 

In [54]:
# Score of best model
lr_gs.best_score_

0.3453702376955112

### Support Vector Machine
#### Linear SVM

In [None]:
# Linear SVM
# ----------
# Elastic net, logistic regression (Loss='hinge')
lr_en = SGDClassifier(loss='log', penalty='elasticnet', random_state=1,
                   class_weight='balanced', n_jobs=n_jobs) 

# Parameters to search over
grid={'l1_ratio': np.linspace(0, 1, 4),
      'alpha': np.logspace(-10, 1, 11)}


svm_lin = LinearSVC(penalty='l2', class_weight='balanced', 
                  dual=False)
param_grid = {'C': np.logspace(-3, 3, 5)}
svm_lin_gs = GridSearchCV(svm_lin, param_grid=param_grid,
                      scoring=make_scorer(average_precision_score),
                                       #   needs_proba=True),
                     n_jobs=n_jobs, cv=4)
svm_lin_gs.fit(X_train_p, y_train)

# Prediction of class
y_pred_svm = svm_lin_gs.predict(X_test_p)
# Distance from separating hyperplane
y_pred_distance_svm = svm_lin_gs.decision_function(X_test_p)

# Save results
average_precision[('SVM', df_name)] = \
    average_precision_score(y_test, y_pred_distance_svm)
classification_reports[('SVM', df_name)] = \
    classification_report(y_test, y_pred_svm)

#### SVM with RBF-Kernel

In [None]:
svm_rbf = SVC(kernel='rbf', probability=False, 
              cache_size=5000, class_weight='balanced')
# Parameters to search over
param_grid = {'C': np.logspace(-3, 1, 2),
              'gamma': np.logspace(-3, 2, 3)}
# Define indices (for train-test split instead of proper cross-validation)
split_indices = ShuffleSplit(n_splits=1, test_size=.2, random_state=1)

svm_rbf_gs = GridSearchCV(svm_rbf, param_grid=param_grid,
                          scoring='average_precision', cv=split_indices)
svm_rbf_gs.fit(X_train_p, y_train)

In [37]:
# Prediction of class
y_pred_svm = svm_rbf_gs.predict(X_test_p)

In [38]:
# Distance from separating hyperplane
y_pred_distance_svm = svm_rbf_gs.decision_function(X_test_p)

In [39]:
# Save results
average_precision['SVM (RBF Kernel)'] = \
    average_precision_score(y_test, y_pred_distance_svm)

In [40]:
classification_reports['SVM (RBF Kernel)'] = \
    classification_report(y_test, y_pred_svm)

In [45]:
svm_rbf_gs.best_score_

0.3893497755284131

In [50]:
pd.DataFrame(svm_rbf_gs.cv_results_) \
    .set_index('params') \
    .loc[:,['mean_test_score', 'mean_train_score',
           'split0_test_score', 'split1_test_score','split2_test_score',
           'split0_train_score', 'split1_train_score','split2_train_score']] 

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


Unnamed: 0_level_0,mean_test_score,mean_train_score,split0_test_score,split1_test_score,split2_test_score,split0_train_score,split1_train_score,split2_train_score
params,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
"{'C': 0.001, 'gamma': 0.001}",0.34866,0.354288,0.332209,0.365111,,0.356518,0.352057,
"{'C': 0.001, 'gamma': 0.31622776601683794}",0.262435,1.0,0.260647,0.264222,,0.999999,1.0,
"{'C': 0.001, 'gamma': 100.0}",0.2045,1.0,0.2045,0.2045,,1.0,1.0,
"{'C': 10.0, 'gamma': 0.001}",0.38935,0.437067,0.376807,0.401892,,0.448363,0.425772,
"{'C': 10.0, 'gamma': 0.31622776601683794}",0.257872,1.0,0.257478,0.258266,,1.0,1.0,
"{'C': 10.0, 'gamma': 100.0}",0.2045,1.0,0.2045,0.2045,,1.0,1.0,


#### SVM with polynomial kernel 

## Performance comparison

In [60]:
average_precision

{'random forests': 0.3567486213824882,
 'SVM (RBF Kernel)': 0.3904120052613318,
 'logistic regression': 0.3486049688125562}

In [None]:
average_precision = 
plt.title('Impact of Dropping Columns on Classifier Performance')
plt.ylabel('Average Precision');