In [16]:
# Import

# Basic
from time import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Viz
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# ML
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer

---
# Objective:

- Train a model to predict whether a user will default or not
- Evaluate the performance of model


---

# My plan

1. Model the 6 selected features on a few other models and tune hyper parameters
2. Select the best model.
3. Tune probability threshold to improve accuracy
4. Show model performance with k fold cross validation.


---
# 1. Load model and set up

In [6]:
df_trans = pd.read_pickle("df_trans.pkl")
feature_names = ['N_PAY_DULY', 'N_DELAYED', 'UTILIZE_PTG1', 'PAY_PTG1', 'PAY_1_delayed']

In [8]:
from sklearn.model_selection import train_test_split

# split the data
df_train,df_test = train_test_split(df_trans,test_size=0.2,random_state=2020)

In [9]:
# Check training data
d_rate = df_train.groupby(['DEFAULT_PAY']).size().reset_index(name='N_USERS')
d_rate['PTG'] = d_rate['N_USERS']/sum(d_rate['N_USERS'])
d_rate

Unnamed: 0,DEFAULT_PAY,N_USERS,PTG
0,0,6211,0.776375
1,1,1789,0.223625


In [10]:
# Check testing data
d_rate = df_test.groupby(['DEFAULT_PAY']).size().reset_index(name='N_USERS')
d_rate['PTG'] = d_rate['N_USERS']/sum(d_rate['N_USERS'])
d_rate

Unnamed: 0,DEFAULT_PAY,N_USERS,PTG
0,0,1551,0.7755
1,1,449,0.2245


In [17]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score
from sklearn import metrics

# get feature and labels
features_train, features_test = \
    df_train[feature_names].values, df_test[feature_names].values
labels_train, labels_test = \
    df_train['DEFAULT_PAY'].values,df_test['DEFAULT_PAY'].values

clf = GradientBoostingClassifier(n_estimators=35, 
                                 max_depth = 3, 
                                 min_samples_split = 100, 
                                 min_samples_leaf = 50,
                                 #max_features=10,
                                 random_state = 0)

# Train the model using the training sets
t0 = time()
clf.fit(features_train, labels_train)
training_time = round(time() - t0, 3)

# Predict the response for test dataset
t0 = time()
predictions = clf.predict(features_test)
prediction_time = round(time() - t0, 3)

# evaluate result
training_score = f1_score(y_true = labels_train, y_pred = clf.predict(features_train))
score = f1_score(y_true = labels_test, y_pred = predictions)

# Print results
print ("F1 training score:", training_score)
print ("F1 testing score:", score)
print ("Training time:", training_time, "s")
print ("Prediction time:", prediction_time, "s")

# quickly check the accuracy score as well
print ("Accuracy training score:", metrics.accuracy_score(labels_train, clf.predict(features_train)))
print ("Accuracy testing score:", metrics.accuracy_score(labels_test, predictions))

F1 training score: 0.4904480722473081
F1 testing score: 0.4524137931034483
Training time: 0.176 s
Prediction time: 0.001 s
Accuracy training score: 0.816625
Accuracy testing score: 0.8015


---
# 2. Try a few more models

## 2.1. Gradient Boosted Tree

In [24]:
from sklearn.ensemble import GradientBoostingClassifier

param_grid = {
    "loss":["deviance"],
    "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    "min_samples_split": [100],
    "min_samples_leaf": [50],
    "max_depth":[3,5,8],
    "max_features":["log2","sqrt"],
    "n_estimators":[35]
    }

acc_scorer = make_scorer(f1_score)
grid_clf = GridSearchCV(GradientBoostingClassifier(), param_grid, scoring = acc_scorer, cv=5)
%time grid_clf = grid_clf.fit(features_train, labels_train)
best_clf = grid_clf.best_estimator_
print(grid_clf.best_estimator_)
print(grid_clf.best_score_)

CPU times: user 25.5 s, sys: 92 ms, total: 25.6 s
Wall time: 25.6 s
GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.075, loss='deviance', max_depth=3,
                           max_features='log2', max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=50, min_samples_split=100,
                           min_weight_fraction_leaf=0.0, n_estimators=35,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
0.47640251636018816


In [25]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score
from sklearn import metrics

# get feature and labels
features_train, features_test = \
    df_train[feature_names].values, df_test[feature_names].values
labels_train, labels_test = \
    df_train['DEFAULT_PAY'].values,df_test['DEFAULT_PAY'].values

clf = best_clf

# Train the model using the training sets
t0 = time()
clf.fit(features_train, labels_train)
training_time = round(time() - t0, 3)

# Predict the response for test dataset
t0 = time()
predictions = clf.predict(features_test)
prediction_time = round(time() - t0, 3)

# evaluate result
training_score = f1_score(y_true = labels_train, y_pred = clf.predict(features_train))
score = f1_score(y_true = labels_test, y_pred = predictions)

# Print results
print ("F1 training score:", training_score)
print ("F1 testing score:", score)
print ("Training time:", training_time, "s")
print ("Prediction time:", prediction_time, "s")

# quickly check the accuracy score as well
print ("Accuracy training score:", metrics.accuracy_score(labels_train, clf.predict(features_train)))
print ("Accuracy testing score:", metrics.accuracy_score(labels_test, predictions))

F1 training score: 0.48299791811242193
F1 testing score: 0.45604395604395603
Training time: 0.106 s
Prediction time: 0.001 s
Accuracy training score: 0.81375
Accuracy testing score: 0.802


---
## 2.2. Random Forest

In [45]:
np.linspace(100, 200, 11)

array([100., 110., 120., 130., 140., 150., 160., 170., 180., 190., 200.])

In [47]:
from sklearn.ensemble import RandomForestClassifier

param_grid = {
    "min_samples_split": [100, 150, 200],
    "min_samples_leaf": [50],
    "max_depth":[3,5,8],
    "max_features":["log2","sqrt"],
    "n_estimators":[35,100]
    }

acc_scorer = make_scorer(f1_score)
grid_clf = GridSearchCV(RandomForestClassifier(), param_grid, scoring = acc_scorer, cv=5)
%time grid_clf = grid_clf.fit(features_train, labels_train)
best_clf = grid_clf.best_estimator_
print(grid_clf.best_estimator_)
print(grid_clf.best_score_)

CPU times: user 34.5 s, sys: 190 ms, total: 34.7 s
Wall time: 34.8 s
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=3, max_features='log2',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=50, min_samples_split=200,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
0.49663023307403076


In [48]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn import metrics

# get feature and labels
features_train, features_test = \
    df_train[feature_names].values, df_test[feature_names].values
labels_train, labels_test = \
    df_train['DEFAULT_PAY'].values,df_test['DEFAULT_PAY'].values

clf = RandomForestClassifier(n_estimators=35, 
                                 max_depth = 3, 
                                 min_samples_split = 100, 
                                 min_samples_leaf = 50,
                                 max_features='log2',
                                 random_state = 0)

# Train the model using the training sets
t0 = time()
clf.fit(features_train, labels_train)
training_time = round(time() - t0, 3)

# Predict the response for test dataset
t0 = time()
predictions = clf.predict(features_test)
prediction_time = round(time() - t0, 3)

# evaluate result
training_score = f1_score(y_true = labels_train, y_pred = clf.predict(features_train))
score = f1_score(y_true = labels_test, y_pred = predictions)

# Print results
print ("F1 training score:", training_score)
print ("F1 testing score:", score)
print ("Training time:", training_time, "s")
print ("Prediction time:", prediction_time, "s")

# quickly check the accuracy score as well
print ("Accuracy training score:", metrics.accuracy_score(labels_train, clf.predict(features_train)))
print ("Accuracy testing score:", metrics.accuracy_score(labels_test, predictions))

F1 training score: 0.4966532797858099
F1 testing score: 0.46586345381526106
Training time: 0.104 s
Prediction time: 0.006 s
Accuracy training score: 0.812
Accuracy testing score: 0.8005


---
## 2.3. XG Boost

In [52]:
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
from sklearn import metrics

# get feature and labels
features_train, features_test = \
    df_train[feature_names].values, df_test[feature_names].values
labels_train, labels_test = \
    df_train['DEFAULT_PAY'].values,df_test['DEFAULT_PAY'].values

clf = XGBClassifier(learning_rate = 0.05, n_estimators=300, max_depth=5)

# Train the model using the training sets
t0 = time()
clf.fit(features_train, labels_train)
training_time = round(time() - t0, 3)

# Predict the response for test dataset
t0 = time()
predictions = clf.predict(features_test)
prediction_time = round(time() - t0, 3)

# evaluate result
training_score = f1_score(y_true = labels_train, y_pred = clf.predict(features_train))
score = f1_score(y_true = labels_test, y_pred = predictions)

# Print results
print ("F1 training score:", training_score)
print ("F1 testing score:", score)
print ("Training time:", training_time, "s")
print ("Prediction time:", prediction_time, "s")

# quickly check the accuracy score as well
print ("Accuracy training score:", metrics.accuracy_score(labels_train, clf.predict(features_train)))
print ("Accuracy testing score:", metrics.accuracy_score(labels_test, predictions))

XGBoostError: XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed (vcomp140.dll or libgomp-1.dll for Windows, libomp.dylib for Mac OSX, libgomp.so for Linux and other UNIX-like OSes). Mac OSX users: Run `brew install libomp` to install OpenMP runtime.
  * You are running 32-bit Python on a 64-bit OS
Error message(s): ['dlopen(/Users/yickminglee/.virtualenvs/ming-ds/lib/python3.6/site-packages/xgboost/lib/libxgboost.dylib, 6): Library not loaded: /usr/local/opt/libomp/lib/libomp.dylib\n  Referenced from: /Users/yickminglee/.virtualenvs/ming-ds/lib/python3.6/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: image not found']


---
# 3. Try tuning threshold

In [None]:
clf_prob = clf.predict_proba(features_test)[:,1]

df_test_result = pd.DataFrame({'actual_values':labels_test, 'prediction': pd.Series(clf_prob),'prediction-yn': predictions})

df_test_result.head(5)


In [None]:
#Compute Classifier's Accuracy for various thresholds 

def _compute_accuracy_for_thresholds(data: pd.core.frame.DataFrame):

    thresholds = np.linspace(0, 1, 100)
    accuracies = []
    for threshold in thresholds:
        preds = df_test_result['prediction'].map(lambda x: 1 if x > threshold else 0)
        accuracy_ = metrics.accuracy_score(df_test_result['actual_values'].values, preds)
        accuracies.append(accuracy_)
    return accuracies, thresholds

accuracies, threshold = _compute_accuracy_for_thresholds(df_test_result)

max_value = np.amax(accuracies)
max_position = np.where(accuracies == max_value)
print("maximum accuracies: {}; maximum threshold: {}".format(max_value, threshold[max_position]))


In [None]:
plt.figure(figsize=(10,7))

ax = sns.lineplot(x=threshold, y=accuracies,
                  markers=True, dashes=False)

axes = ax.axes
axes.set_ylim(0.7,0.85)


The highest point is around 0.5, not too far off from the default setting. So, I am not going to make any changes here.



---
# 4. Show model performance