- Is there a difference between the annual income of default and full paid loans?
- Is the difference statistically significant?

In [1]:
#import necessary modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import probplot
from scipy.stats.mstats import zscore
import statsmodels.stats.api as sms

pd.set_option('max_columns', None)

import nltk
import collections as co
from wordcloud import WordCloud, STOPWORDS

%matplotlib inline

  from pandas.core import datetools


In [2]:
#read loans.csv as a dataframe
loans_df = pd.read_csv('
                       ',low_memory=False, engine='c')

In [3]:
loans_df.columns

Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title',
       'emp_length', 'home_ownership', 'annual_inc', 'verification_status',
       'issue_d', 'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose',
       'title', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs',
       'earliest_cr_line', 'inq_last_6mths', 'mths_since_last_delinq',
       'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal',
       'revol_util', 'total_acc', 'initial_list_status', 'out_prncp',
       'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
       'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'last_pymnt_d', 'last_pymnt_amnt',
       'next_pymnt_d', 'last_credit_pull_d', 'collections_12_mths_ex_med',
       'mths_since_last_major_derog', 'policy_code', 'application_type',
       'annual_inc_joint', 'dti_joint', 'verification_status_joint',
    

In [4]:
#define a function to classify loan status into one of the following bins ('Fully Paid', 'Default', 'Current')
def loan_status_bin(text):
    if text in ('Fully Paid', 'Does not meet the credit policy. Status:Fully Paid'):
        return 'Fully Paid'
    elif text in ('Current', 'Issued'):
        return 'Current'
    elif text in ('Charged Off', 'Default', 'Does not meet the credit policy. Status:Charged Off'):
        return 'Default'
    elif text in ('Late (16-30 days)', 'Late (31-120 days)', 'In Grace Period'):
        return 'Late'
    else:
        'UNKNOWN BIN'

In [5]:
#create a new attribute 'loan_status_bin' in the dataframe
loans_df['loan_status_bin']=loans_df['loan_status'].apply(loan_status_bin)
loans_df['loan_status_bin'].unique()

array(['Fully Paid', 'Default', 'Current', 'Late'], dtype=object)

In [6]:
loans_df.fillna(loans_df.median()['annual_inc'], inplace=True)

In [7]:
loans_df[loans_df['annual_inc'].isnull()==True]['annual_inc'].count()

0

In [8]:
loans_df_fp=loans_df[loans_df['loan_status_bin']=='Fully Paid']

In [9]:
loans_df_def=loans_df[loans_df['loan_status_bin']=='Default']

In [10]:
print('For Default loans, mean annual income is {0}, standard deviation is {1}, size of dataframe is {2}'.format(loans_df_def['annual_inc'].mean(), loans_df_def['annual_inc'].std(), len(loans_df_def['annual_inc'])))

For Default loans, mean annual income is 65199.76680867284, standard deviation is 56955.15545104668, size of dataframe is 47228


In [11]:
print('For Fully Paid loans, mean annual income is {0}, standard deviation is {1}, size of dataframe is {2}'.format(loans_df_fp['annual_inc'].mean(), loans_df_fp['annual_inc'].std(), len(loans_df_fp['annual_inc'])))

For Fully Paid loans, mean annual income is 74142.5024192341, standard deviation is 59205.29202398379, size of dataframe is 209711


In [12]:
def_mean=loans_df_def['annual_inc'].mean()
def_std=loans_df_def['annual_inc'].std()

fp_mean=loans_df_fp['annual_inc'].mean()
fp_std=loans_df_fp['annual_inc'].std()

In [13]:
h0_mean = 0
mean_diff = abs(def_mean-fp_mean)
sigma_diff = np.sqrt((fp_std**2)/len(loans_df_fp) + (def_std**2)/len(loans_df_def))
mean_diff, sigma_diff

(8942.7356105612562, 292.23360521799054)

In [14]:
z = (mean_diff - h0_mean) / sigma_diff
z

30.601325278420518

In [15]:
p = (1-stats.norm.cdf(z))*2
p

0.0

In [16]:
#define a function to convert grade into numerical values
def credit_grade(grade):
    if grade in ('A'):
        return 1
    elif grade in ('B'):
        return 2
    elif grade in ('C'):
        return 3
    elif grade in ('D'):
        return 4
    elif grade in ('E'):
        return 5
    elif grade in ('F'):
        return 6
    elif grade in ('G'):
        return 7
    else:
        99

In [17]:
#create a new attribute 'loan_status_bin' in the dataframe
loans_df['credit_grade']=loans_df['grade'].apply(credit_grade)
loans_df['credit_grade'].unique()

array([2, 3, 1, 5, 6, 4, 7])

In [18]:
loans_df['application_type'].unique()

array(['INDIVIDUAL', 'JOINT'], dtype=object)

In [19]:
def derived_income(x, y, z):
    if x == 'INDIVIDUAL':
        return y
    elif x == 'JOINT':
        return z
    else:
        0

In [20]:
loans_df['derived_income']=loans_df.apply(lambda x: derived_income(x['application_type'], x['annual_inc'], x['annual_inc_joint']), axis=1)

In [21]:
def derived_dti(x, y, z):
    if x == 'INDIVIDUAL':
        return y
    elif x == 'JOINT':
        return z
    else:
        0

In [22]:
loans_df['derived_dti']=loans_df.apply(lambda x: derived_dti(x['application_type'], x['dti'], x['dti_joint']), axis=1)

In [23]:
loans_df['inst_inc_ratio']=loans_df['installment']/ (loans_df['derived_income'] /12)

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Features: 
* loan_amount
* credit_grade 
* interest_rate 
* derived_inc
* derived_dti 
* inst_inc_ratio

### Training and Test Datasets

When fitting models, we would like to ensure two things:

* We have found the best model (in terms of model parameters).
* The model is highly likely to generalize i.e. perform well on unseen data.

<br/>
<div class="span5 alert alert-success">
<h4>Purpose of splitting data into Training/testing sets</h4>
<ul>
  <li> We built our model with the requirement that the model fit the data well. </li>
  <li> As a side-effect, the model will fit <b>THIS</b> dataset well. What about new data? </li>
    <ul>
      <li> We wanted the model for predictions, right?</li>
    </ul>
  <li> One simple solution, leave out some data (for <b>testing</b>) and <b>train</b> the model on the rest </li>
  <li> This also leads directly to the idea of cross-validation, next section. </li>  
</ul>
</div>

First, we try a basic Logistic Regression:

* Split the data into a training and test (hold-out) set
* Train on the training set, and test for accuracy on the testing set

In [25]:
# Split the data into a training and test set.
Xlr, Xtestlr, ylr, ytestlr = train_test_split(loans_df[['loan_amnt', 'credit_grade', 'int_rate', 'derived_income', 'derived_dti', 'inst_inc_ratio']].values,
                                              (loans_df.loan_status_bin).values,
                                              random_state=5)
##Xlr, Xtestlr, ylr, ytestlr = train_test_split(dflog[['Height','Weight']].values, (dflog.Gender == "Male").values, random_state=5)
 
clf = LogisticRegression()
# Fit the model on the trainng data.
clf.fit(Xlr, ylr)
# Print the accuracy from the testing data.
print(accuracy_score(clf.predict(Xtestlr), ytestlr))

0.686339561405


### Tuning the Model

The model has some hyperparameters we can tune for hopefully better performance. For tuning the parameters of your model, you will use a mix of *cross-validation* and *grid search*. In Logistic Regression, the most important parameter to tune is the *regularization parameter* `C`. Note that the regularization parameter is not always part of the logistic regression model. 

The regularization parameter is used to control for unlikely high regression coefficients, and in other cases can be used when data is sparse, as a method of feature selection.

You will now implement some code to perform model tuning and selecting the regularization parameter $C$.

We use the following `cv_score` function to perform K-fold cross-validation and apply a scoring function to each test fold. In this incarnation we use accuracy score as the default scoring function.


In [27]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

def cv_score(clf, x, y, score_func=accuracy_score):
    result = 0
    nfold = 5
    for train, test in KFold(nfold).split(x): # split data into train/test groups, 5 times
        clf.fit(x[train], y[train]) # fit
        result += score_func(clf.predict(x[test]), y[test]) # evaluate score function on held-out data
    return result / nfold # average

Below is an example of using the `cv_score` function for a basic logistic regression model without regularization.

In [28]:
clf1 = LogisticRegression()
score = cv_score(clf1, Xlr, ylr)
print(score)

0.688176711924


<div class="span5 alert alert-info">
<h3>Checkup Exercise Set II</h3>

<b>Exercise:</b> Implement the following search procedure to find a good model
<ul>
<li> You are given a list of possible values of `C` below
<li> For each C:
  <ol>
  <li> Create a logistic regression model with that value of C
  <li> Find the average score for this model using the `cv_score` function **only on the training set** `(Xlr, ylr)`
  </ol>
<li> Pick the C with the highest average score
</ul>
Your goal is to find the best model parameters based *only* on the training set, without showing the model test set at all (which is why the test set is also called a *hold-out* set).
</div>

In [32]:
#the grid of parameters to search over
Cs = [0.001, 0.1, 1, 10, 100]
max_score=0

for C in Cs:
    clf2 = LogisticRegression(C=C)
    score = cv_score(clf2, Xlr, ylr)
    if score > max_score:
        max_score = score
        best_C =C
print ('max_score: ',max_score, 'best_C: ', best_C)

# your turn

max_score:  0.688224793543 best_C:  10


<div class="span5 alert alert-info">
<h3>Checkup Exercise Set III</h3>
**Exercise:** Now you want to estimate how this model will predict on unseen data in the following way:
<ol>
<li> Use the C you obtained from the procedure earlier and train a Logistic Regression on the training data
<li> Calculate the accuracy on the test data
</ol>

<p>You may notice that this particular value of `C` may or may not do as well as simply running the default model on a random train-test split. </p>

<ul>
<li> Do you think that's a problem? 
<li> Why do we need to do this whole cross-validation and grid search stuff anyway?
</ul>

</div>

In [33]:
clf3=LogisticRegression(C=best_C)
clf3.fit(Xlr, ylr)
ypred=clf3.predict(Xtestlr)
print('accuracy score: ', accuracy_score(ypred, ytestlr), '\n')
print('I don\'t think there is a problem, since model accuracy has '
      'increased with addition of a regularization parameter')
print('We perform cross-validation and grid search to tune hyperparameters of our model')

accuracy score:  0.68632603845 

I don't think there is a problem, since model accuracy has increased with addition of a regularization parameter
We perform cross-validation and grid search to tune hyperparameters of our model


### Black Box Grid Search in `sklearn`

<div class="span5 alert alert-info">
<h3>Checkup Exercise Set IV</h3>

<b>Exercise:</b> Use scikit-learn's [GridSearchCV](http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.GridSearchCV.html) tool to perform cross validation and grid search. 

* Instead of writing your own loops above to iterate over the model parameters, can you use GridSearchCV to find the best model over the training set? 
* Does it give you the same best value of `C`?
* How does this model you've obtained perform on the test set?


In [35]:
# your turn
from sklearn.model_selection import GridSearchCV

clf4=LogisticRegression()
parameters = {"C": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]}
fitmodel = GridSearchCV(clf4, param_grid=parameters, cv=5, scoring="accuracy", return_train_score=True)
fitmodel.fit(Xlr, ylr)
fitmodel.best_estimator_, fitmodel.best_params_, fitmodel.best_score_, fitmodel.cv_results_

clf5=LogisticRegression(C=fitmodel.best_params_['C'])
clf5.fit(Xlr, ylr)
ypred=clf5.predict(Xtestlr)

print('accuracy score: ', accuracy_score(ypred, ytestlr), '\n')
print('No, the new value of the C is: ', fitmodel.best_params_['C'], '\n')

accuracy score:  0.686339561405 

No, the new value of the C is:  0.0001 



In [None]:
from sklearn import svm

# Split the data into a training and test set.
Xlr, Xtestlr, ylr, ytestlr = train_test_split(loans_df[['loan_amnt', 'credit_grade', 'int_rate', 'derived_income', 'derived_dti', 'inst_inc_ratio']].values,
                                              (loans_df.loan_status_bin).values,
                                              random_state=5)
 
clf6 = svm.SVC()
# Fit the model on the trainng data.
clf6.fit(Xlr, ylr)
# Print the accuracy from the testing data.
print(accuracy_score(clf6.predict(Xtestlr), ytestlr))
