In [None]:
import pandas as pd
import numpy as np
import csv
import sqlite3
import re
import pickle
import itertools


from sklearn.dummy import DummyClassifier

from imblearn.under_sampling import RandomUnderSampler
from sklearn.datasets import load_wine
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report


from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Apply the seaborn look to every figure in the notebook.
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8' and the
# bare 'seaborn' name was removed afterwards, so pick whichever name this
# installation actually provides.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')

In [None]:
# Location of the pickled loan DataFrame.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
pkl_filename= '/Users/xzhou/github/project_files/lending_club/loan.pkl'

# pd.read_pickle opens and closes the file itself when given a path, so the
# extra open() handle the cell used to create (and never read from) is gone.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
df=pd.read_pickle(pkl_filename)

In [None]:
# Show the (rows, columns) dimensions of the loaded DataFrame.
df.shape

In [None]:
# List the column labels of the loaded DataFrame.
df.columns

In [None]:
# Non-null counts of every column, grouped by loan_status.
# Author's observation: loan_status is highly imbalanced, good vs. bad is about 12 : 1.

df.groupby('loan_status').count()

In [None]:
# Preview the first rows of the DataFrame.
df.head()

In [None]:
# Binary-encode the target in place: rows labelled 'bad' become 1, every other row becomes 0.

is_bad = df['loan_status'].eq('bad')
df['loan_status'] = is_bad.astype(int)

In [None]:
# Renumber the rows 0..n-1.
# drop=True discards the old index instead of inserting it as a new column:
# that extra column would otherwise end up in the feature matrix as a spurious
# feature, and re-running this cell would no longer be idempotent.
# NOTE(review): assumes the old index carries nothing needed later - confirm.
df=df.reset_index(drop=True)

In [None]:
# Summary statistics for every column, numeric and non-numeric alike (include='all').
df.describe(include = 'all')

In [None]:
# Separate the feature matrix (everything except the target) from the binary target column.
X = df.drop(labels=['loan_status'], axis='columns')
y = df['loan_status']

In [None]:
# Hold out 10% of the rows for testing, with a fixed seed for a repeatable split.
# Undersampling is applied to the training portion only; per the author's note,
# even a 10% hold-out still leaves a test set of roughly 87K rows.

split_parts = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=99,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Learn the scaling parameters from the training split only, then apply the
# same fitted transformation to both the training and the test features.
ssX = StandardScaler()
ssX.fit(X_train)
X_train_scaled = ssX.transform(X_train)
X_test_scaled = ssX.transform(X_test)

In [None]:
# Balance the training set by randomly discarding majority-class rows.
# The fixed seed makes the retained subset (and every model trained on it)
# reproducible across kernel restarts.
runs=RandomUnderSampler(random_state=99)

# imbalanced-learn renamed fit_sample to fit_resample and later removed the old
# name; call whichever one this installation provides.
resample = runs.fit_resample if hasattr(runs, 'fit_resample') else runs.fit_sample
X_train_scaled_under, y_train_under=resample(X_train_scaled, y_train)

In [None]:
# Dimensions of the undersampled, scaled training features.
X_train_scaled_under.shape

In [None]:
# Dimensions of the undersampled training labels.
y_train_under.shape

In [None]:
# Candidate classifiers for the ROC comparison, in the same order as model_names.
# - The bagging base learner is passed positionally: its keyword was renamed from
#   `base_estimator` to `estimator` in scikit-learn 1.2, and the positional form
#   works with both spellings.
# - Every model is given a fixed seed so the comparison is reproducible.
# NOTE(review): scikit-learn >= 1.1 spells the SGD logistic loss 'log_loss';
# 'log' is kept here for the library version this notebook was written against.
models = [
    RandomForestClassifier(n_estimators=100, n_jobs=-1, max_depth=10, random_state=99),
    DecisionTreeClassifier(max_depth=None, random_state=99),
    BaggingClassifier(DecisionTreeClassifier(), n_estimators=100, n_jobs=-1, random_state=99),
    SVC(probability=True, random_state=99),
    LogisticRegression(C=0.1, random_state=99),
    SGDClassifier(loss='log', random_state=42, alpha=0.01),
]
model_names = ["Random Forest", "Decision Tree",'Bagging Classifier','SVC','Logistic','SGD-Log']

# The two lists are consumed with zip(), which silently truncates on a length mismatch.
assert len(models) == len(model_names), "models and model_names must line up one-to-one"

In [None]:
# Pickle file used as the cache for the model-comparison ROC results.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
model_pkl_filename= '/Users/xzhou/github/project_files/lending_club/loan_model1.pkl'

In [None]:
# Load the cached ROC data for the model comparison if the cache file exists.
# Otherwise fit every candidate model on the undersampled training data, score
# it on the held-out test set, and cache the results for the next run.
# Each entry of roc_plotting is a tuple (name, tpr, fpr, auc_score).
# NOTE: pickle.load can execute arbitrary code; only use a cache file you created.

roc_plotting=[]

try:
    with open (model_pkl_filename, 'rb') as pklfile2:
        roc_plotting = pickle.load(pklfile2)

except (FileNotFoundError, EOFError, pickle.UnpicklingError):
    # Only a missing or unreadable cache triggers retraining. A bare `except:`
    # would also swallow KeyboardInterrupt and genuine bugs, silently kicking
    # off a long refit.
    roc_plotting = []
    for clf, name in zip(models, model_names):
        clf.fit(X_train_scaled_under, y_train_under)
        # Column 1 of predict_proba is the probability of the positive class (label 1).
        y_pred = clf.predict_proba(X_test_scaled)
        fpr, tpr, _ = roc_curve (y_test, y_pred[:, 1])
        auc_score = auc(fpr, tpr)
        roc_plotting.append((name, tpr, fpr, auc_score))

    # Write the cache once, after ALL models have finished. Dumping inside the
    # loop left a partial file behind whenever a run was interrupted, and the
    # next run would load that incomplete list as if it were the full comparison.
    with open (model_pkl_filename, 'wb') as pklfile2:
        pickle.dump(roc_plotting, pklfile2)

In [None]:
# ROC curves of all candidate models on one set of axes for comparison.
# Author's observation: Random Forest produced the best result, with an AUC of 0.77.

fig, ax = plt.subplots(dpi=250)
for name, tpr, fpr, auc_score in roc_plotting:
    curve_label = name + ' (auc: %.2f)' % auc_score
    ax.plot(fpr, tpr, label=curve_label)
# Dashed diagonal from (0, 0) to (1, 1) as the chance-level reference.
ax.plot([0,1], [0,1], color='k', linestyle='--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend(loc='lower right')
ax.set_title('ROC Curves')

In [None]:
# Hyper-parameter grid for the random-forest search (2 x 2 x 3 x 2 = 24 combinations).
# 'sqrt' replaces the former 'auto': for classifiers the two select the same
# number of features, but 'auto' was deprecated and then removed in newer
# scikit-learn releases, whereas 'sqrt' is accepted by old and new versions alike.
param_grid = {
    'n_estimators': [100, 300],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [10, 15, 20],
    'criterion': ['gini', 'entropy']
}

In [None]:
# Base random forest for the hyper-parameter search; the fixed random_state keeps its fits repeatable.
rfc = RandomForestClassifier(random_state=99)

In [None]:
# Exhaustive 4-fold cross-validated search over param_grid, run on all
# available cores and fitted on the undersampled training data.
rf = GridSearchCV(
    rfc,
    param_grid,
    n_jobs=-1,
    cv=4,
    verbose=2,
)
rf.fit(X_train_scaled_under, y_train_under)

In [None]:
plt.figure()