# Lending Club Modeling Code

Part 2 of 3: This notebook takes the cleaned data from Part 1 and builds a model. Future versions will contain a process for updating the model and checking it against past iterations of the model. This code is largely taken from a Python coding notebook put together by Aaron Mcguire and Summer Li, with additions and customization by Tim Horan.

### Import necessary Python Packages

In [1]:
import json 

from bokeh.plotting import figure, output_file, show
from bokeh.io import output_notebook

from sklearn.ensemble import GradientBoostingClassifier

import numpy as np
import pandas as pd

import pickle
from sklearn.externals import joblib

### Load Build and Validation datasets from Part 1

In [2]:
# Folder holding the Part 1 outputs; an empty string means the current working directory.
writeLocation = ''

# Read the build and validation samples through the public pd.read_csv entry
# point (rather than the internal pd.io.parsers module path), indexing each
# frame by the loan id column.
build = pd.read_csv(writeLocation + 'build_step1.csv', sep=',', index_col='id')
validation = pd.read_csv(writeLocation + 'validation_step1.csv', sep=',', index_col='id')
build.head()

Unnamed: 0_level_0,member_id,loan_amnt,int_rate,installment,CO18M,loan_age,annual_inc,dti,delinq_2yrs,fico_range_low,...,grade_imputed,sub_grade_imputed,emp_length_imputed,home_ownership_imputed,verification_status_imputed,purpose_imputed,zip_code_imputed,addr_state_imputed,application_type_imputed,disbursement_method_imputed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1077501,,5000.0,0.1065,162.87,0,50.0,24000.0,27.65,0.0,735.0,...,0,0,0,0,0,0,0,0,0,0
1077430,,2500.0,0.1527,59.83,0,50.0,30000.0,1.61,0.0,740.0,...,0,0,0,0,0,0,0,0,0,0
1077175,,2400.0,0.1596,84.33,0,50.0,18980.0,8.72,0.0,735.0,...,0,0,0,0,0,0,0,0,0,0
1076863,,10000.0,0.1349,339.31,0,50.0,49200.0,20.0,0.0,690.0,...,0,0,0,0,0,0,0,0,0,0
1075358,,3000.0,0.1269,67.79,0,50.0,80000.0,17.94,0.0,695.0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Print every column name of the first build row followed by its value, one
# pair per paragraph. Iterating (label, value) pairs avoids integer lookups
# on a string-labelled Series, which newer pandas versions deprecate.
first_row = build.iloc[0]
for column, value in first_row.items():
    print(column)
    print(value)
    print()

member_id
nan

loan_amnt
5000.0

int_rate
0.1065

installment
162.87

CO18M
0.0

loan_age
50.0

annual_inc
24000.0

dti
27.65

delinq_2yrs
0.0

fico_range_low
735.0

fico_range_high
739.0

inq_last_6mths
1.0

mths_since_last_delinq
34.337096036853396

mths_since_last_record
76.78658947368422

open_acc
3.0

pub_rec
0.0

revol_bal
13648.0

revol_util
0.8370000000000001

total_acc
9.0

collections_12_mths_ex_med
0.0

mths_since_last_major_derog
41.97770934154568

acc_now_delinq
0.0

tot_coll_amt
178.11351379132012

tot_cur_bal
138462.01129709394

total_rev_hi_lim
29782.106916057164

acc_open_past_24mths
4.114945787402169

avg_cur_bal
13651.211684928383

bc_open_to_buy
8229.570125226335

bc_util
65.84251531650042

chargeoff_within_12_mths
0.0

delinq_amnt
0.0

mo_sin_old_il_acct
127.03380450327603

mo_sin_old_rev_tl_op
181.94297149538414

mo_sin_rcnt_rev_tl_op
13.596868796536016

mo_sin_rcnt_tl
8.50598903375637

mort_acc
1.846621326135147

mths_since_recent_bc
25.11319704975648

mths_since

### Set up X/Y structure for SKLearn

In [4]:
# sklearn's modeling methods expect predictors and target as separate objects.
#   Here the predictor frames are built by dropping the columns we cannot
#   model on: member_id and the target itself.

target = 'CO18M'
exclude = ['member_id', target]

# Keep every build column that is not excluded, preserving the original column order.
relevantVariable = [name for name in build.columns if name not in exclude]

xTrain, xTest = build[relevantVariable], validation[relevantVariable]
xTrain.head()

Unnamed: 0_level_0,loan_amnt,int_rate,installment,loan_age,annual_inc,dti,delinq_2yrs,fico_range_low,fico_range_high,inq_last_6mths,...,grade_imputed,sub_grade_imputed,emp_length_imputed,home_ownership_imputed,verification_status_imputed,purpose_imputed,zip_code_imputed,addr_state_imputed,application_type_imputed,disbursement_method_imputed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1077501,5000.0,0.1065,162.87,50.0,24000.0,27.65,0.0,735.0,739.0,1.0,...,0,0,0,0,0,0,0,0,0,0
1077430,2500.0,0.1527,59.83,50.0,30000.0,1.61,0.0,740.0,744.0,4.0,...,0,0,0,0,0,0,0,0,0,0
1077175,2400.0,0.1596,84.33,50.0,18980.0,8.72,0.0,735.0,739.0,2.0,...,0,0,0,0,0,0,0,0,0,0
1076863,10000.0,0.1349,339.31,50.0,49200.0,20.0,0.0,690.0,694.0,1.0,...,0,0,0,0,0,0,0,0,0,0
1075358,3000.0,0.1269,67.79,50.0,80000.0,17.94,0.0,695.0,699.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Target column for the build and validation samples. The former empty
# pd.DataFrame() placeholders were overwritten immediately, so they are dropped.
yTrain = build[target]
yTest = validation[target]
yTrain.head()

id
1077501    0
1077430    0
1077175    0
1076863    0
1075358    0
Name: CO18M, dtype: int64

In [6]:
# Pull the underlying arrays out of the pandas objects for sklearn.
# cols keeps the predictor labels so array positions can be mapped back to names.
cols = xTrain.columns

predictorsTrain, targetTrain = xTrain.values, yTrain.values
predictorsTest, targetTest = xTest.values, yTest.values

### Gradient Boosting in SKLearn

In [8]:
# Fit a gradient boosting classifier on the build sample.
# The hyperparameters are collected in one place so they are easy to review and adjust.
gbaParams = dict(n_estimators=500, learning_rate=0.01, max_depth=4, random_state=0)
gbaModel = GradientBoostingClassifier(**gbaParams)
gbaModel.fit(predictorsTrain, targetTrain)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.01, loss='deviance', max_depth=4,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=500,
              n_iter_no_change=None, presort='auto', random_state=0,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [9]:
# Read the importances once and reuse them, instead of re-reading the
# attribute on every loop iteration.
importances = gbaModel.feature_importances_

# indices records the position of each predictor in the model build sample,
# ordered from most important to least important ([::-1] reverses the
# ascending argsort).
indices = np.argsort(importances)[::-1]

# Number of top predictors to report; change it to suit your need.
topN = 15

# Print the feature ranking: name, column position, importance.
print("Feature ranking:")
for position in indices[:topN]:  # slicing also copes with fewer than topN predictors
    print(cols[position], "\t", position, "\t", importances[position])

Feature ranking:
sub_grade 	 234 	 0.20613454508655563
int_rate 	 1 	 0.2000016758499546
zip_code 	 239 	 0.11923786993794987
annual_inc 	 4 	 0.09252066030389137
acc_open_past_24mths 	 23 	 0.04762868603227222
grade 	 233 	 0.0370005641427235
purpose 	 238 	 0.036150558551570254
inq_last_6mths 	 9 	 0.024732943711813834
installment 	 2 	 0.018737775022505232
dti 	 5 	 0.017795079229267905
mths_since_recent_bc 	 34 	 0.01636934825235888
pct_tl_nvr_dlq 	 52 	 0.015529335345790865
mths_since_recent_inq 	 36 	 0.012236536189308402
mo_sin_old_rev_tl_op 	 30 	 0.011201819178239657
home_ownership 	 236 	 0.0104562804347689


### Score GB Model on validation sample

In [10]:
# Score both samples with the probability of the positive class (column 1 of
# predict_proba). The model was fitted on the plain arrays, so it is scored on
# the matching arrays (predictorsTest / predictorsTrain are xTest.values /
# xTrain.values) rather than on the labelled DataFrames.
validation['GB_Score'] = gbaModel.predict_proba(predictorsTest)[:, 1]
build['GB_Score'] = gbaModel.predict_proba(predictorsTrain)[:, 1]

### Add back grades to enable graphing review

In [11]:
#Load the dictionary that enables mapping back to character grades
writeLocation = ''
charVarsPath = writeLocation + 'd_treatment_charVars.json'

with open(charVarsPath, 'r') as fp:
    d_treatment_charVars = json.load(fp)

In [12]:
#Define function to reverse the char treatment dictionary and add back grade chars
def dictReverse(output, dictionary):
    """
    Find the dictionary key that maps to a given value.

    Input: output - a post treatment value; dictionary - the mapping that defines it
    Output: The first key (in the dictionary's iteration order) whose value
            equals output, or None when no key maps to it
    """
    matchingKeys = (key for key in dictionary if dictionary[key] == output)
    return next(matchingKeys, None)
#Apply function to the sub_grade variable
# Only the sub_grade column is needed, so the function is applied to that
# column directly instead of building a full row object per loan with
# DataFrame.apply(axis=1). The lookup dictionary is fetched once up front.
subGradeLookup = d_treatment_charVars['sub_grade']
validation['sub_grade_group'] = validation['sub_grade'].apply(lambda value: dictReverse(value, subGradeLookup))
build['sub_grade_group'] = build['sub_grade'].apply(lambda value: dictReverse(value, subGradeLookup))

### Graphically review model results on validation sample

In [13]:
# Have Bokeh outputs occur in notebook
# (called once here, before any of the show() calls in the cells below)
output_notebook()

In [14]:
#Calculate means by sub_grade_group
# The columns are selected with a list: selecting several groupby columns
# with a bare tuple is rejected by newer pandas versions.
graph_validation = validation.groupby(['sub_grade_group'])[['CO18M', 'GB_Score', 'sub_grade']].mean()

#Define x and y metrics
x = list(graph_validation.index)
y1 = list(graph_validation['CO18M'])
y2 = list(graph_validation['GB_Score'])
y3 = list(graph_validation['sub_grade'])
# value_counts() orders its result by frequency, so it is re-ordered to the
# sub-grade order in x; otherwise counts are plotted against the wrong labels.
y4 = list(validation['sub_grade_group'].value_counts().reindex(x))

#Create graph figures
p = figure(x_range=x, plot_width=900, title="Actual and Predicted 18M CO by Lending Club Grades")
p2 = figure(x_range=x, plot_width=900, title="Lending Club Loan Distribution")

#Add lines to figures
p.line(x, y1, color='blue', legend='Actual 18M CO')
p.line(x, y2, color='green', legend='GB Model')
p.line(x, y3, color='orange', legend='Lending Club Model')
p2.line(x, y4, color='orange')

#Add labeling to figure
p.xaxis.axis_label = 'Lending Club Sub-Grades'
p.yaxis.axis_label = '18M CO%'
p2.xaxis.axis_label = 'Lending Club Sub-Grades'
p2.yaxis.axis_label = 'Loan Count'

#Plot figure
show(p)
show(p2)

In [15]:
#Create risk adjusted return metric (CO/APR)
# Each source column is divided by the loan's interest rate and stored as
# <column>_RAR, first on the validation sample and then on the build sample.
for frame in (validation, build):
    for source in ('sub_grade', 'GB_Score', 'CO18M'):
        frame[source + '_RAR'] = frame[source] / frame['int_rate']

In [16]:
#Calculate means by GB model 50-tile
# argsort().argsort() turns each GB_Score_RAR value into its rank; scaling the
# rank by 50/len(validation) and truncating assigns every loan a bucket 0-49.
validation['graph_RAR_decile'] = (validation['GB_Score_RAR'].argsort().argsort()/float(len(validation))*50).astype(int)
# The columns are selected with a list: selecting several groupby columns
# with a bare tuple is rejected by newer pandas versions.
graph_validation = validation.groupby(['graph_RAR_decile'])[['sub_grade_RAR', 'GB_Score_RAR', 'CO18M_RAR']].mean()
# To review the build sample instead, run the same two steps on `build`.

#Define x and y metrics
x = list(graph_validation.index)
y1 = list(graph_validation['CO18M_RAR'])
y2 = list(graph_validation['GB_Score_RAR'])
y3 = list(graph_validation['sub_grade_RAR'])

#Create graph figure
p = figure(x_range=(0, 50), plot_width=900, title="18M CO/Interest Rate by GB Model 50-tiles")

#Add lines to figure
p.line(x, y1, color='blue', legend='Actual 18M CO/Interest Rate')
p.line(x, y2, color='green', legend='GB Model/Interest Rate')
p.line(x, y3, color='orange', legend='Lending Club Model/Interest Rate')

#Add labeling to figure
p.xaxis.axis_label = 'GB Model/Interest Rate 50-tiles'
p.yaxis.axis_label = '18M CO/Interest Rate'


#Plot figure
show(p)

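#### Ideal loans are those that have a better than expected risk adjusted return (lower is better). The orange line represents the expected risk adjusted return implied by the Lending Club sub-grade, and the green line represents the GB model's prediction; where the green line sits below the orange line, the model expects a better than expected return. The blue line (actual outcomes) helps validate that insights are well grounded.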

### Export model into pickle file for usage in scoring

In [17]:
# Persist the fitted model to disk so the notebook can be closed and the
# exact same model reloaded later, without repeating an intensive modeling run.

writeLocation = ''
modelPath = writeLocation + 'gbaModel.pkl'

joblib.dump(gbaModel, modelPath)

['gbaModel.pkl']