In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import warnings

# Suppress every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides library diagnostics
# (e.g. pandas chained-assignment warnings or optimiser convergence warnings)
# raised by later cells — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')
# IPython magic: draw matplotlib figures inline in the cell output.
%matplotlib inline


# Load the loan table from a CSV in the kernel's working directory (relative path).
# The result is rebound to `loan` several times further down as rows are filtered.
loan = pd.read_csv('accepted_2007_to_2018Q4.csv')

# Three-letter English month abbreviation -> month number (1-12).
return_number = {
    'Jan': 1,
    'Feb': 2,
    'Mar': 3,
    'Apr': 4,
    'May': 5,
    'Jun': 6,
    'Jul': 7,
    'Aug': 8,
    'Sep': 9,
    'Oct': 10,
    'Nov': 11,
    'Dec': 12
}

def convert_date(d):
    """Convert a 'Mon-YYYY' string (e.g. 'Dec-2015') to the first day of that month.

    The first three characters are looked up in ``return_number`` and everything
    from index 4 onward is parsed as the year.

    Parameters
    ----------
    d : str
        Month/year label. Non-string values (such as NaN from a missing CSV
        cell) are tolerated.

    Returns
    -------
    datetime.date or None
        ``datetime.date(year, month, 1)``, or ``None`` when ``d`` cannot be
        parsed (not sliceable, unknown month abbreviation, non-numeric or
        out-of-range year).
    """
    try:
        return datetime.date(year=int(d[4:]), month=return_number[d[:3]], day=1)
    except (TypeError, KeyError, ValueError, OverflowError):
        # TypeError: d is not sliceable (NaN / None); KeyError: unknown month;
        # ValueError / OverflowError: year is not a usable integer.
        # Only these are swallowed so that unrelated failures (and
        # KeyboardInterrupt) still propagate.
        return None

# ---------------------------------------------------------------------------
# Clean the raw loan table and derive the modelling features.
# ---------------------------------------------------------------------------

# Parse the month/year labels into datetime.date objects (None when unparseable).
loan['issue_d'] = loan['issue_d'].apply(convert_date)
loan['earliest_cr_line'] = loan['earliest_cr_line'].apply(convert_date)

# Row filters. None of them depends on a derived column, so they are applied
# together before any feature is computed. Rows with a missing value in a
# filtered column compare False and are dropped.
loan = loan[loan['issue_d'] < datetime.date(2015, 7, 1)]     # issued before Jul 2015
loan = loan[loan['issue_d'] >= datetime.date(2010, 1, 1)]    # ...and from Jan 2010 on
loan = loan[loan['term'] == ' 36 months']                    # note the leading space
loan = loan[loan['loan_status'].isin(['Fully Paid', 'Charged Off'])]
loan = loan[loan['annual_inc'] < 1000000]
loan = loan[loan['revol_util'] < 150]

# Take an owned copy so the column assignments below write to this frame rather
# than to a slice of the original (avoids pandas' chained-assignment ambiguity).
loan = loan.copy()

# Replace earliest_cr_line with the number of days between the first credit
# line and the loan's issue date. Vectorised datetime arithmetic instead of a
# row-wise apply; a missing earliest_cr_line yields NaN instead of raising.
loan['earliest_cr_line'] = (
    pd.to_datetime(loan['issue_d']) - pd.to_datetime(loan['earliest_cr_line'])
).dt.days

# Log transforms.
# NOTE(review): only revol_bal is shifted by +1 before the log; a zero in
# annual_inc, earliest_cr_line or open_acc would produce -inf — confirm such
# rows cannot occur or shift those columns as well.
loan['ln_annual_inc'] = np.log(loan['annual_inc'])
loan['ln_revol_bal'] = np.log(loan['revol_bal'] + 1)
loan['ln_earliest_cr_line'] = np.log(loan['earliest_cr_line'])
loan['ln_open_acc'] = np.log(loan['open_acc'])

# Keep the raw counts under a num_ prefix and reuse the original names for
# "at least one" indicator flags.
loan = loan.rename(columns={
    'delinq_2yrs': 'num_delinq_2yrs',
    'pub_rec': 'num_pub_rec',
    'inq_last_6mths': 'num_inq_last_6mths',
})
loan['delinq_2yrs'] = (loan['num_delinq_2yrs'] >= 1)
loan['pub_rec'] = (loan['num_pub_rec'] >= 1)
loan['inq_last_6mths'] = (loan['num_inq_last_6mths'] >= 1)

# Binary outcome: True when the loan was fully repaid, False when charged off.
loan['target'] = (loan['loan_status'] == 'Fully Paid')

In [2]:
# Numeric columns of interest, one per line so additions and removals diff cleanly.
col_list = [
    'loan_amnt',
    'int_rate',
    'ln_annual_inc',
    'dti',
    'fico_range_high',
    'num_delinq_2yrs',
    'ln_earliest_cr_line',
    'num_inq_last_6mths',
    'mths_since_last_delinq',
    'mths_since_last_record',
    'ln_open_acc',
    'num_pub_rec',
    'ln_revol_bal',
    'revol_util',
    'total_acc',
]


In [17]:
# Hold-out design: loans issued before 2014 (i.e. 2010-2013 after the earlier
# date filter) are used for model tuning; 2014 through June 2015 is kept aside
# for walk-forward testing.
is_tuning_period = loan['issue_d'] < datetime.date(2014, 1, 1)
train = loan[is_tuning_period]

In [6]:
# (rows, columns) of the filtered loan table.
# NOTE(review): the saved output shows 158 columns, while train.shape reports 159
# even though train is only a row filter of loan. The execution counts are out of
# order, so this output looks stale — re-run the notebook top to bottom.
loan.shape

(449486, 158)

In [18]:
# (rows, columns) of the tuning subset (loans issued before 2014).
train.shape

(166313, 159)

In [16]:
# Number of training loans per issue year.
# The years are tallied in a single pass over the column and then printed in a
# loop, instead of re-scanning the whole column once per copy-pasted statement.
year_counts = train['issue_d'].map(lambda issued: issued.year).value_counts()
for year in range(2010, 2014):
    # .get(..., 0) keeps a "year: 0" line for a year with no loans.
    print('{}: {}'.format(year, year_counts.get(year, 0)))

2010: 8445
2011: 14092
2012: 43423
2013: 100353


In [47]:
import statsmodels.api as sm

# Outcome and three nested regressor sets for the probit fits below.
y = train['target']

# X1: borrower / credit-file characteristics only.
# NOTE(review): no constant column is added here and statsmodels does not add an
# intercept on its own — confirm that fitting X1 without one is intended.
base_features = [
    'loan_amnt',
    'ln_annual_inc',
    'dti',
    'fico_range_high',
    'delinq_2yrs',
    'num_delinq_2yrs',
    'ln_earliest_cr_line',
    'inq_last_6mths',
    'num_inq_last_6mths',
    'ln_open_acc',
    'pub_rec',
    'num_pub_rec',
    'ln_revol_bal',
    'revol_util',
    'total_acc',
]
X1 = train[base_features]

# X2: X1 plus the interest rate and one indicator column per grade.
X2 = pd.concat([X1, train['int_rate'], pd.get_dummies(train['grade'])], axis=1)

# X3: X1 plus the interest rate and one indicator column per sub-grade.
X3 = pd.concat([X1, train['int_rate'], pd.get_dummies(train['sub_grade'])], axis=1)

In [32]:
# Inspection only: show the container type of the target.
type(y)

pandas.core.series.Series

In [33]:
# Inspection only: show the container type of the baseline design matrix.
type(X1)

pandas.core.frame.DataFrame

In [36]:
# (rows, columns) of the baseline design matrix.
# NOTE(review): the saved output reports 17 columns, but the cell that builds X1
# selects 15 names and carries a later execution count — this output appears to
# predate the current definition of X1; re-run in order to refresh it.
X1.shape

(166313, 17)

In [56]:
# Specification 1: probit of repayment on the baseline regressors (X1).
# The result is stored under the name `logit` although the estimator is a probit.
logit = sm.Probit(endog=y, exog=X1.astype(float)).fit()

# Average marginal effects (dy/dx evaluated over the whole sample).
logit_me = logit.get_margeff(at='overall', method='dydx')
logit_me.summary()

Optimization terminated successfully.
         Current function value: 0.363638
         Iterations 6


0,1
Dep. Variable:,target
Method:,dydx
At:,overall

Unnamed: 0,dy/dx,std err,z,P>|z|,[0.025,0.975]
loan_amnt,-6.524e-07,1.26e-07,-5.195,0.0,-8.99e-07,-4.06e-07
ln_annual_inc,0.0235,0.002,13.166,0.0,0.02,0.027
dti,-0.0023,0.0,-18.336,0.0,-0.002,-0.002
fico_range_high,0.0003,2.65e-05,9.721,0.0,0.0,0.0
delinq_2yrs,-0.0009,0.004,-0.261,0.794,-0.008,0.006
num_delinq_2yrs,-0.003,0.002,-1.635,0.102,-0.007,0.001
ln_earliest_cr_line,-0.0197,0.002,-10.648,0.0,-0.023,-0.016
inq_last_6mths,-0.0117,0.003,-4.51,0.0,-0.017,-0.007
num_inq_last_6mths,-0.017,0.001,-13.759,0.0,-0.019,-0.015
ln_open_acc,-0.0399,0.003,-15.119,0.0,-0.045,-0.035


In [57]:
# Specification 2: baseline regressors plus interest rate and grade indicators (X2).
# The result is stored under the name `logit` although the estimator is a probit.
logit = sm.Probit(endog=y, exog=X2.astype(float)).fit()

# Average marginal effects (dy/dx evaluated over the whole sample).
logit_me = logit.get_margeff(at='overall', method='dydx')
logit_me.summary()

Optimization terminated successfully.
         Current function value: 0.356357
         Iterations 6


0,1
Dep. Variable:,target
Method:,dydx
At:,overall

Unnamed: 0,dy/dx,std err,z,P>|z|,[0.025,0.975]
loan_amnt,-1.761e-06,1.34e-07,-13.134,0.0,-2.02e-06,-1.5e-06
ln_annual_inc,0.0568,0.002,27.824,0.0,0.053,0.061
dti,-0.0012,0.0,-9.379,0.0,-0.001,-0.001
fico_range_high,0.0004,4.44e-05,9.824,0.0,0.0,0.001
delinq_2yrs,0.0079,0.004,2.199,0.028,0.001,0.015
num_delinq_2yrs,-0.001,0.002,-0.534,0.593,-0.005,0.003
ln_earliest_cr_line,-0.0032,0.002,-1.689,0.091,-0.007,0.001
inq_last_6mths,0.0015,0.003,0.593,0.553,-0.004,0.007
num_inq_last_6mths,-0.0123,0.001,-9.929,0.0,-0.015,-0.01
ln_open_acc,-0.0201,0.003,-7.623,0.0,-0.025,-0.015


In [58]:
# Specification 3: baseline regressors plus interest rate and sub-grade indicators (X3).
# The result is stored under the name `logit` although the estimator is a probit.
logit = sm.Probit(y, X3.astype(float)).fit()

# The saved output for this cell stops at 35 iterations (the default cap) with no
# "Optimization terminated successfully" line, i.e. the optimiser gave up before
# converging. Warnings are globally silenced in this notebook, so surface that
# condition explicitly instead of reporting the estimates as if they were final.
fit_info = getattr(logit, 'mle_retvals', None) or {}
if not fit_info.get('converged', True):
    print('WARNING: probit on X3 did not converge; the marginal effects below '
          'come from a non-converged fit. Check for sparse or collinear '
          'sub_grade indicators before relying on them.')

logit_me = logit.get_margeff()
logit_me.summary()

         Current function value: 0.356124
         Iterations: 35


0,1
Dep. Variable:,target
Method:,dydx
At:,overall

Unnamed: 0,dy/dx,std err,z,P>|z|,[0.025,0.975]
loan_amnt,-1.691e-06,1.34e-07,-12.576,0.0,-1.95e-06,-1.43e-06
ln_annual_inc,0.0567,0.002,27.783,0.0,0.053,0.061
dti,-0.0012,0.0,-9.386,0.0,-0.001,-0.001
fico_range_high,0.0004,4.57e-05,8.109,0.0,0.0,0.0
delinq_2yrs,0.0078,0.004,2.193,0.028,0.001,0.015
num_delinq_2yrs,-0.001,0.002,-0.554,0.58,-0.005,0.003
ln_earliest_cr_line,-0.004,0.002,-2.128,0.033,-0.008,-0.0
inq_last_6mths,0.0022,0.003,0.849,0.396,-0.003,0.007
num_inq_last_6mths,-0.0121,0.001,-9.73,0.0,-0.015,-0.01
ln_open_acc,-0.021,0.003,-7.958,0.0,-0.026,-0.016


In [72]:
# Per-grade averages of pricing, borrower and outcome columns on the tuning set.
# `target` is boolean, so its mean is the share of loans that were fully paid.
profile_cols = ['int_rate', 'annual_inc', 'fico_range_high', 'dti', 'loan_amnt', 'target']
sum_stats = train.groupby(['grade'])[profile_cols].mean()
sum_stats

Unnamed: 0_level_0,int_rate,annual_inc,fico_range_high,dti,loan_amnt,target
grade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A,7.533552,78061.632801,742.191457,14.213065,12528.419975,0.943643
B,11.784927,68131.638415,701.306904,16.463607,12044.910911,0.89452
C,15.17503,65099.059213,686.804995,17.200045,11744.074976,0.842224
D,18.259644,64625.170046,680.909519,17.079463,11350.114301,0.797595
E,20.96796,69862.437261,678.605408,16.816023,12137.651132,0.774016
F,23.139601,69501.044426,678.23821,15.984619,11426.995163,0.740024
G,23.226515,121153.817576,673.090909,17.18,22742.045455,0.727273


grade
A    1.014733
B    0.999938
C    0.970032
D    0.943233
E    0.936312
F    0.911263
G    0.896193
dtype: float64