In [31]:
#import necessary modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import probplot
from scipy.stats.mstats import zscore
import statsmodels.stats.api as sms

pd.set_option('max_columns', None)

import nltk
import collections as co
from wordcloud import WordCloud, STOPWORDS

%matplotlib inline

In [32]:
#read loans.csv as a dataframe
loans_df = pd.read_csv('~/Downloads/tanay/data_springboard/loan.csv',low_memory=False, engine='c')

In [33]:
loans_df.columns

Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title',
       'emp_length', 'home_ownership', 'annual_inc', 'verification_status',
       'issue_d', 'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose',
       'title', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs',
       'earliest_cr_line', 'inq_last_6mths', 'mths_since_last_delinq',
       'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal',
       'revol_util', 'total_acc', 'initial_list_status', 'out_prncp',
       'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
       'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'last_pymnt_d', 'last_pymnt_amnt',
       'next_pymnt_d', 'last_credit_pull_d', 'collections_12_mths_ex_med',
       'mths_since_last_major_derog', 'policy_code', 'application_type',
       'annual_inc_joint', 'dti_joint', 'verification_status_joint',
    

In [34]:
#define a function to classify loan status into one of the following bins ('Fully Paid', 'Default', 'Current')
def loan_status_bin(text):
    if text in ('Fully Paid', 'Does not meet the credit policy. Status:Fully Paid'):
        return 'Fully Paid'
    elif text in ('Current', 'Issued'):
        return 'Current'
    elif text in ('Charged Off', 'Default', 'Does not meet the credit policy. Status:Charged Off'):
        return 'Default'
    elif text in ('Late (16-30 days)', 'Late (31-120 days)', 'In Grace Period'):
        return 'Late'
    else:
        'UNKNOWN BIN'

In [35]:
#create a new attribute 'loan_status_bin' in the dataframe
loans_df['loan_status_bin']=loans_df['loan_status'].apply(loan_status_bin)
loans_df['loan_status_bin'].unique()

array(['Fully Paid', 'Default', 'Current', 'Late'], dtype=object)

In [36]:
loans_df.fillna(loans_df.median()['annual_inc'], inplace=True)

In [37]:
loans_df[loans_df['annual_inc'].isnull()==True]['annual_inc'].count()

0

In [38]:
loans_df_fp=loans_df[loans_df['loan_status_bin']=='Fully Paid']

In [39]:
loans_df_def=loans_df[loans_df['loan_status_bin']=='Default']

In [40]:
print('For Default loans, mean annual income is {0}, standard deviation is {1}, size of dataframe is {2}'.format(loans_df_def['annual_inc'].mean(), loans_df_def['annual_inc'].std(), len(loans_df_def['annual_inc'])))

For Default loans, mean annual income is 65199.76680867284, standard deviation is 56955.15545104668, size of dataframe is 47228


In [41]:
print('For Fully Paid loans, mean annual income is {0}, standard deviation is {1}, size of dataframe is {2}'.format(loans_df_fp['annual_inc'].mean(), loans_df_fp['annual_inc'].std(), len(loans_df_fp['annual_inc'])))

For Fully Paid loans, mean annual income is 74142.5024192341, standard deviation is 59205.29202398379, size of dataframe is 209711


In [42]:
def_mean=loans_df_def['annual_inc'].mean()
def_std=loans_df_def['annual_inc'].std()

fp_mean=loans_df_fp['annual_inc'].mean()
fp_std=loans_df_fp['annual_inc'].std()

In [43]:
h0_mean = 0
mean_diff = abs(def_mean-fp_mean)
sigma_diff = np.sqrt((fp_std**2)/len(loans_df_fp) + (def_std**2)/len(loans_df_def))
mean_diff, sigma_diff

(8942.7356105612562, 292.23360521799054)

In [44]:
z = (mean_diff - h0_mean) / sigma_diff
z

30.601325278420518

In [45]:
p = (1-stats.norm.cdf(z))*2
p

0.0

In [46]:
#define a function to convert grade into numerical values
def credit_grade(grade):
    if grade in ('A'):
        return 1
    elif grade in ('B'):
        return 2
    elif grade in ('C'):
        return 3
    elif grade in ('D'):
        return 4
    elif grade in ('E'):
        return 5
    elif grade in ('F'):
        return 6
    elif grade in ('G'):
        return 7
    else:
        99

In [47]:
#create a new attribute 'loan_status_bin' in the dataframe
loans_df['credit_grade']=loans_df['grade'].apply(credit_grade)
loans_df['credit_grade'].unique()

array([2, 3, 1, 5, 6, 4, 7])

In [48]:
loans_df['application_type'].unique()

array(['INDIVIDUAL', 'JOINT'], dtype=object)

In [49]:
def derived_income(x, y, z):
    if x == 'INDIVIDUAL':
        return y
    elif x == 'JOINT':
        return z
    else:
        0

In [50]:
loans_df['derived_income']=loans_df.apply(lambda x: derived_income(x['application_type'], x['annual_inc'], x['annual_inc_joint']), axis=1)

In [51]:
def derived_dti(x, y, z):
    if x == 'INDIVIDUAL':
        return y
    elif x == 'JOINT':
        return z
    else:
        0

In [52]:
loans_df['derived_dti']=loans_df.apply(lambda x: derived_dti(x['application_type'], x['dti'], x['dti_joint']), axis=1)

In [53]:
loans_df['inst_inc_ratio']=loans_df['installment']/ (loans_df['derived_income'] /12)

In [54]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Features: 
* loan_amount
* credit_grade 
* interest_rate 
* derived_inc
* derived_dti 
* inst_inc_ratio

### Training and Test Datasets

When fitting models, we would like to ensure two things:

* We have found the best model (in terms of model parameters).
* The model is highly likely to generalize i.e. perform well on unseen data.

<br/>
<div class="span5 alert alert-success">
<h4>Purpose of splitting data into Training/testing sets</h4>
<ul>
  <li> We built our model with the requirement that the model fit the data well. </li>
  <li> As a side-effect, the model will fit <b>THIS</b> dataset well. What about new data? </li>
    <ul>
      <li> We wanted the model for predictions, right?</li>
    </ul>
  <li> One simple solution, leave out some data (for <b>testing</b>) and <b>train</b> the model on the rest </li>
  <li> This also leads directly to the idea of cross-validation, next section. </li>  
</ul>
</div>

First, we try a basic Logistic Regression:

* Split the data into a training and test (hold-out) set
* Train on the training set, and test for accuracy on the testing set

In [55]:
# Split the data into a training and test set.
Xlr, Xtestlr, ylr, ytestlr = train_test_split(loans_df[['loan_amnt', 'credit_grade', 'int_rate', 'derived_income', 'derived_dti', 'inst_inc_ratio']].values,
                                              (loans_df.loan_status_bin).values,
                                              random_state=5)

In [56]:
clf = LogisticRegression()
# Fit the model on the trainng data.
clf.fit(Xlr, ylr)
# Print the accuracy from the testing data.
print(accuracy_score(clf.predict(Xtestlr), ytestlr))

0.686339561405


### Tuning the Model

The model has some hyperparameters we can tune for hopefully better performance. For tuning the parameters of your model, you will use a mix of *cross-validation* and *grid search*. In Logistic Regression, the most important parameter to tune is the *regularization parameter* `C`. Note that the regularization parameter is not always part of the logistic regression model. 

The regularization parameter is used to control for unlikely high regression coefficients, and in other cases can be used when data is sparse, as a method of feature selection.

You will now implement some code to perform model tuning and selecting the regularization parameter $C$.

We use the following `cv_score` function to perform K-fold cross-validation and apply a scoring function to each test fold. In this incarnation we use accuracy score as the default scoring function.


In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

def cv_score(clf, x, y, score_func=accuracy_score):
    result = 0
    nfold = 5
    for train, test in KFold(nfold).split(x): # split data into train/test groups, 5 times
        clf.fit(x[train], y[train]) # fit
        result += score_func(clf.predict(x[test]), y[test]) # evaluate score function on held-out data
    return result / nfold # average

Below is an example of using the `cv_score` function for a basic logistic regression model without regularization.

In [None]:
clf1 = LogisticRegression()
score = cv_score(clf1, Xlr, ylr)
print(score)

<div class="span5 alert alert-info">
<h3>Checkup Exercise Set II</h3>

<b>Exercise:</b> Implement the following search procedure to find a good model
<ul>
<li> You are given a list of possible values of `C` below
<li> For each C:
  <ol>
  <li> Create a logistic regression model with that value of C
  <li> Find the average score for this model using the `cv_score` function **only on the training set** `(Xlr, ylr)`
  </ol>
<li> Pick the C with the highest average score
</ul>
Your goal is to find the best model parameters based *only* on the training set, without showing the model test set at all (which is why the test set is also called a *hold-out* set).
</div>

In [None]:
#the grid of parameters to search over
Cs = [0.001, 0.1, 1, 10, 100]
max_score=0

for C in Cs:
    clf2 = LogisticRegression(C=C)
    score = cv_score(clf2, Xlr, ylr)
    if score > max_score:
        max_score = score
        best_C =C
print ('max_score: ',max_score, 'best_C: ', best_C)

# your turn

<div class="span5 alert alert-info">
<h3>Checkup Exercise Set III</h3>
**Exercise:** Now you want to estimate how this model will predict on unseen data in the following way:
<ol>
<li> Use the C you obtained from the procedure earlier and train a Logistic Regression on the training data
<li> Calculate the accuracy on the test data
</ol>

<p>You may notice that this particular value of `C` may or may not do as well as simply running the default model on a random train-test split. </p>

<ul>
<li> Do you think that's a problem? 
<li> Why do we need to do this whole cross-validation and grid search stuff anyway?
</ul>

</div>

In [None]:
clf3=LogisticRegression(C=best_C)
clf3.fit(Xlr, ylr)
ypred=clf3.predict(Xtestlr)
print('accuracy score: ', accuracy_score(ypred, ytestlr), '\n')
print('I don\'t think there is a problem, since model accuracy has '
      'increased with addition of a regularization parameter')
print('We perform cross-validation and grid search to tune hyperparameters of our model')

### Black Box Grid Search in `sklearn`

<div class="span5 alert alert-info">
<h3>Checkup Exercise Set IV</h3>

<b>Exercise:</b> Use scikit-learn's [GridSearchCV](http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.GridSearchCV.html) tool to perform cross validation and grid search. 

* Instead of writing your own loops above to iterate over the model parameters, can you use GridSearchCV to find the best model over the training set? 
* Does it give you the same best value of `C`?
* How does this model you've obtained perform on the test set?


In [None]:
# your turn
from sklearn.model_selection import GridSearchCV

clf4=LogisticRegression()
parameters = {"C": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]}
fitmodel = GridSearchCV(clf4, param_grid=parameters, cv=5, scoring="accuracy", return_train_score=True)
fitmodel.fit(Xlr, ylr)
fitmodel.best_estimator_, fitmodel.best_params_, fitmodel.best_score_, fitmodel.cv_results_

clf5=LogisticRegression(C=fitmodel.best_params_['C'])
clf5.fit(Xlr, ylr)
ypred=clf5.predict(Xtestlr)

print('accuracy score: ', accuracy_score(ypred, ytestlr), '\n')
print('No, the new value of the C is: ', fitmodel.best_params_['C'], '\n')

In [None]:
# Decision Tree Classifier
from sklearn import datasets
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
# fit a CART model to the data
clf_dt = DecisionTreeClassifier()
clf_dt.fit(Xlr, ylr)
print(clf_dt)
# make predictions
ypred = clf_dt.predict(Xtestlr)
# summarize the fit of the model
print(metrics.classification_report(ytestlr, ypred))
print(metrics.confusion_matrix(ytestlr, ypred))

## Over Sampling using SMOTE

### Training Logistic Regression using SMOTE sampled data

In [57]:
from imblearn.over_sampling import SMOTE, ADASYN
from collections import Counter

In [58]:
X_resampled, y_resampled = SMOTE().fit_sample(Xlr, ylr)
print(sorted(Counter(y_resampled).items()))

[('Current', 457983), ('Default', 457983), ('Fully Paid', 457983), ('Late', 457983)]


In [59]:
X_test_resampled, y_test_resampled = SMOTE().fit_sample(Xtestlr, ytestlr)
print(sorted(Counter(y_test_resampled).items()))

[('Current', 152256), ('Default', 152256), ('Fully Paid', 152256), ('Late', 152256)]


In [60]:
clf_smote = LogisticRegression().fit(X_resampled, y_resampled)
print(clf_smote)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


In [61]:
# make predictions
ypred = clf_smote.predict(X_test_resampled)
# summarize the fit of the model
print(metrics.classification_report(y_test_resampled, ypred))
print(metrics.confusion_matrix(y_test_resampled, ypred))

             precision    recall  f1-score   support

    Current       0.39      0.45      0.41    152256
    Default       0.33      0.51      0.40    152256
 Fully Paid       0.36      0.23      0.28    152256
       Late       0.29      0.19      0.23    152256

avg / total       0.34      0.34      0.33    609024

[[68098 42591 22300 19267]
 [24314 78263 21123 28556]
 [47090 47991 35390 21785]
 [36710 66505 20803 28238]]


In [62]:
print('accuracy score: ', accuracy_score(ypred, y_test_resampled), '\n')

accuracy score:  0.344795935792 



### Training Decision tree (CART) using SMOTE sampled data

In [None]:
# Decision Tree Classifier
from sklearn import datasets
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
# fit a CART model to the data
clf_dt_smote = DecisionTreeClassifier()
clf_dt_smote.fit(X_resampled, y_resampled)
print(clf_dt_smote)
# make predictions
ypred = clf_dt_smote.predict(X_test_resampled)
# summarize the fit of the model
print(metrics.classification_report(y_test_resampled, ypred))
print(metrics.confusion_matrix(y_test_resampled, ypred))

## Training Random forest using SMOTE sampled data

In [63]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
clf_rf_1 = RandomForestClassifier(max_depth=5, random_state=0)

In [64]:
clf_rf_1.fit(X_resampled, y_resampled)
print(clf_rf_1.feature_importances_)

[ 0.02463866  0.52023963  0.36261194  0.01070639  0.06651044  0.01529294]


In [65]:
print(clf_rf_1)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)


In [66]:
# make predictions
ypred = clf_rf_1.predict(X_test_resampled)
# summarize the fit of the model
print(metrics.classification_report(y_test_resampled, ypred))
print(metrics.confusion_matrix(y_test_resampled, ypred))

             precision    recall  f1-score   support

    Current       0.58      0.66      0.62    152256
    Default       0.40      0.43      0.41    152256
 Fully Paid       0.48      0.29      0.36    152256
       Late       0.41      0.48      0.44    152256

avg / total       0.47      0.46      0.46    609024

[[100343  16404  13641  21868]
 [ 13631  64941  19660  54024]
 [ 41394  36803  44733  29326]
 [ 18456  45895  14915  72990]]


### Hyperparameter tuning for Random Forest - Attempt I

In [67]:
import matplotlib.pyplot as plt

from collections import OrderedDict
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

In [70]:
print(y_resampled)

['Current' 'Current' 'Fully Paid' ..., 'Late' 'Late' 'Late']


In [None]:
print(__doc__)

RANDOM_STATE = 123

# Generate a binary classification dataset.
X, y = X_resampled, y_resampled

#Xtestlr, ytestlr

# NOTE: Setting the `warm_start` construction parameter to `True` disables
# support for parallelized ensembles but is necessary for tracking the OOB
# error trajectory during training.
ensemble_clfs = [
    ("RandomForestClassifier, max_features='sqrt'",
        RandomForestClassifier(warm_start=True, oob_score=True,
                               max_features="sqrt",
                               random_state=RANDOM_STATE)),
    ("RandomForestClassifier, max_features='log2'",
        RandomForestClassifier(warm_start=True, max_features='log2',
                               oob_score=True,
                               random_state=RANDOM_STATE)),
    ("RandomForestClassifier, max_features=None",
        RandomForestClassifier(warm_start=True, max_features=None,
                               oob_score=True,
                               random_state=RANDOM_STATE))
]

# Map a classifier name to a list of (<n_estimators>, <error rate>) pairs.
error_rate = OrderedDict((label, []) for label, _ in ensemble_clfs)

# Range of `n_estimators` values to explore.
min_estimators = 15
max_estimators = 175

for label, clf in ensemble_clfs:
    for i in range(min_estimators, max_estimators + 1):
        clf.set_params(n_estimators=i)
        clf.fit(X, y)

        # Record the OOB error for each `n_estimators=i` setting.
        oob_error = 1 - clf.oob_score_
        error_rate[label].append((i, oob_error))

# Generate the "OOB error rate" vs. "n_estimators" plot.
for label, clf_err in error_rate.items():
    xs, ys = zip(*clf_err)
    plt.plot(xs, ys, label=label)

plt.xlim(min_estimators, max_estimators)
plt.xlabel("n_estimators")
plt.ylabel("OOB error rate")
plt.legend(loc="upper right")
plt.show()

### Hyperparameter tuning for Random Forest - II

from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)

# Use the random grid to search for best hyperparameters
# First create the base model to tune
clf_rf_2 = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = clf_rf_2, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_resampled, y_resampled)

rf_random.be

def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy

### Training SVM on resampled data (SMOTE)

In [None]:
from sklearn.svm import SVC
clf_svm_smote = SVC()
clf_svm_smote.fit(X_resampled, y_resampled)
print(clf_svm_smote)

In [None]:
# make predictions
#ypred = clf_svm_smote.predict(X_test_resampled)
# summarize the fit of the model
#print(metrics.classification_report(y_test_resampled, ypred))
#print(metrics.confusion_matrix(y_test_resampled, ypred))

## Over Sampling using of ADASYN

In [None]:
X_resampled, y_resampled = ADASYN().fit_sample(Xlr, ylr)
print(sorted(Counter(y_resampled).items()))

In [None]:
X_test_resampled, y_test_resampled = ADASYN().fit_sample(Xtestlr, ytestlr)
print(sorted(Counter(y_test_resampled).items()))

### Logistic Regression using ADASYN sampled data

In [None]:
clf_adasyn = LogisticRegression().fit(X_resampled, y_resampled)
print(clf_adasyn)

In [None]:
# make predictions
ypred = clf_adasyn.predict(X_test_resampled)
# summarize the fit of the model
print(metrics.classification_report(y_test_resampled, ypred))
print(metrics.confusion_matrix(y_test_resampled, ypred))

## Undersampling using imblearn

In [None]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_sample(X_resampled, y_resampled)
print(sorted(Counter(y_resampled).items()))

In [None]:
clf_rus = LogisticRegression(C=0).fit(X_resampled, y_resampled)

In [None]:
print(clf_rus)

In [None]:
X_test_resampled, y_test_resampled = rus.fit_sample(Xtestlr, ytestlr)
print(sorted(Counter(y_test_resampled).items()))

In [None]:
# make predictions
ypred = clf_smote.predict(X_test_resampled)
# summarize the fit of the model
print(metrics.classification_report(y_test_resampled, ypred))
print(metrics.confusion_matrix(y_test_resampled, ypred))