In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import warnings

warnings.filterwarnings('ignore')
%matplotlib inline


# Read the raw loan-level CSV into a DataFrame. The path is relative, so the
# file must sit in the notebook's current working directory.
loan = pd.read_csv('accepted_2007_to_2018Q4.csv')

# Lookup from a three-letter month abbreviation to its calendar month number.
return_number = {
    'Jan': 1,
    'Feb': 2,
    'Mar': 3,
    'Apr': 4,
    'May': 5,
    'Jun': 6,
    'Jul': 7,
    'Aug': 8,
    'Sep': 9,
    'Oct': 10,
    'Nov': 11,
    'Dec': 12
}

def convert_date(d):
    """Convert a month-year string into a date on the first of that month.

    The first three characters of ``d`` are looked up in ``return_number``
    and everything from the fifth character onward is parsed as the year,
    so ``'Dec-2015'`` becomes ``datetime.date(2015, 12, 1)``.

    Parameters
    ----------
    d : str
        Month-year text such as ``'Dec-2015'``.

    Returns
    -------
    datetime.date or None
        The parsed date, or ``None`` when ``d`` cannot be parsed (for example
        a float NaN standing in for a missing value, an unknown month
        abbreviation, or a non-numeric year).
    """
    try:
        return datetime.date(year=int(d[4:]), month=return_number[d[:3]], day=1)
    except (TypeError, ValueError, KeyError, OverflowError):
        # TypeError: d is not sliceable (None, float NaN, ...).
        # ValueError: the year text is not an integer or is out of range.
        # KeyError: the month abbreviation is not in return_number.
        # OverflowError: the year is too large for datetime.date.
        # Anything else (KeyboardInterrupt, genuine bugs) is allowed to surface.
        return None

# Parse both date columns into datetime.date objects; entries that
# convert_date cannot parse come back as None.
for date_col in ('issue_d', 'earliest_cr_line'):
    loan[date_col] = loan[date_col].apply(convert_date)

# Keep loans issued from January 2010 up to (not including) July 2015.
first_issue = datetime.date(2010, 1, 1)
issue_cutoff = datetime.date(2015, 7, 1)
loan = loan[(loan['issue_d'] >= first_issue) & (loan['issue_d'] < issue_cutoff)]

# Keep 36-month loans only (the matched literal starts with a space).
loan = loan[loan['term'] == ' 36 months']

# Replace the earliest-credit-line date with the number of days between it
# and the issue date.
loan['earliest_cr_line'] = loan.apply(
    lambda row: (row['issue_d'] - row['earliest_cr_line']).days,
    axis=1,
)

# Keep only loans whose status is one of the two outcomes modelled below.
loan = loan[loan['loan_status'].isin(['Fully Paid', 'Charged Off'])]

# Drop incomes of 1,000,000 or more, then log-transform income.
loan = loan[loan['annual_inc'] < 1000000]
loan['ln_annual_inc'] = np.log(loan['annual_inc'])

# Drop revolving utilisation of 150 or more, then add the remaining log features.
loan = loan[loan['revol_util'] < 150]
loan['ln_revol_bal'] = np.log(loan['revol_bal'] + 1)  # +1 keeps a zero balance finite
loan['ln_earliest_cr_line'] = np.log(loan['earliest_cr_line'])
loan['ln_open_acc'] = np.log(loan['open_acc'])

# Move each raw count to a num_* column and reuse the original name for a
# boolean "at least one" indicator.
flag_to_count = {
    'delinq_2yrs': 'num_delinq_2yrs',
    'pub_rec': 'num_pub_rec',
    'inq_last_6mths': 'num_inq_last_6mths',
}
loan = loan.rename(columns=flag_to_count)
for flag_name, count_name in flag_to_count.items():
    loan[flag_name] = loan[count_name].ge(1)

# Response variable: True when the loan was fully paid.
loan['target'] = loan['loan_status'].eq('Fully Paid')

In [3]:
# Column names gathered in one place; no cell visible in this notebook reads
# this list.
col_list = [
    'loan_amnt',
    'int_rate',
    'ln_annual_inc',
    'dti',
    'fico_range_high',
    'num_delinq_2yrs',
    'ln_earliest_cr_line',
    'num_inq_last_6mths',
    'mths_since_last_delinq',
    'mths_since_last_record',
    'ln_open_acc',
    'num_pub_rec',
    'ln_revol_bal',
    'revol_util',
    'total_acc',
]


In [4]:
# Models are tuned on loans issued 2010-2013. Loans issued from January 2014
# through June 2015 are held back for walk-forward testing.
wf_start = datetime.date(2014, 1, 1)
wf_end = datetime.date(2015, 7, 1)

train = loan[loan['issue_d'] < wf_start]
wf = loan[(loan['issue_d'] < wf_end) & (loan['issue_d'] >= wf_start)]


In [5]:
# Response: the boolean 'target' column of the training frame.
y = train['target']

# X1: borrower and credit-file attributes only.
base_features = [
    'loan_amnt',
    'ln_annual_inc',
    'dti',
    'fico_range_high',
    'delinq_2yrs',
    'num_delinq_2yrs',
    'ln_earliest_cr_line',
    'inq_last_6mths',
    'num_inq_last_6mths',
    'ln_open_acc',
    'pub_rec',
    'num_pub_rec',
    'ln_revol_bal',
    'revol_util',
    'total_acc',
]
X1 = train[base_features]

# X2 / X3: X1 plus the interest rate and one-hot encoded grade / sub-grade.
X1_with_rate = pd.concat([X1, train['int_rate']], axis=1)
X2 = pd.concat([X1_with_rate, pd.get_dummies(train['grade'])], axis=1)
X3 = pd.concat([X1_with_rate, pd.get_dummies(train['sub_grade'])], axis=1)


In [15]:
from sklearn.neighbors import KNeighborsClassifier

# One 100-neighbour classifier per feature set, fitted in order X1, X2, X3.
feature_sets = (X1, X2, X3)
neighbors1, neighbors2, neighbors3 = [
    KNeighborsClassifier(n_neighbors=100) for _ in feature_sets
]

# n1..n3 hold whatever fit() returns for the matching classifier.
n1, n2, n3 = [
    model.fit(features, y)
    for model, features in zip((neighbors1, neighbors2, neighbors3), feature_sets)
]



In [10]:
# NOTE(review): this cell originally read from a frame named `test`, which no
# visible cell creates, so it raised NameError on a fresh kernel. The
# walk-forward frame `wf` is the only held-out data defined in this notebook,
# so it is used here -- swap in a different split if that was the intent.

# Same base columns, in the same order, as the training matrix X1.
X1_test = wf[X1.columns]

# get_dummies only emits columns for categories present in the frame it is
# given, so the hold-out dummies can differ from the training ones. Reindexing
# to the training columns adds any missing dummy as all-zero, drops any
# category the model never saw, and fixes the column order the fitted
# classifiers expect.
X2_test = pd.concat(
    [X1_test, wf['int_rate'], pd.get_dummies(wf['grade'])], axis=1
).reindex(columns=X2.columns, fill_value=0)
X3_test = pd.concat(
    [X1_test, wf['int_rate'], pd.get_dummies(wf['sub_grade'])], axis=1
).reindex(columns=X3.columns, fill_value=0)

y_test = wf['target']

In [16]:
# Preview predicted class probabilities for the first ten hold-out rows.
# Only those ten rows are passed to the model: each query row is scored
# independently of the others, so this shows the same values as slicing the
# full prediction array while skipping the neighbour search for every other row.
n1.predict_proba(X1_test.iloc[:10])

array([[0.02, 0.98],
       [0.15, 0.85],
       [0.15, 0.85],
       [0.17, 0.83],
       [0.12, 0.88],
       [0.18, 0.82],
       [0.11, 0.89],
       [0.06, 0.94],
       [0.11, 0.89],
       [0.2 , 0.8 ]])

In [17]:
from scipy.stats import zscore
# Standardise each training feature column (DataFrame.apply passes one column
# at a time to zscore by default). The result is only displayed; X1 itself is
# not reassigned, so the fitted models above still use unscaled features.
X1.apply(zscore)

Unnamed: 0,loan_amnt,ln_annual_inc,dti,fico_range_high,delinq_2yrs,num_delinq_2yrs,ln_earliest_cr_line,inq_last_6mths,num_inq_last_6mths,ln_open_acc,pub_rec,num_pub_rec,ln_revol_bal,revol_util,total_acc
1611879,-0.965863,-1.736506,1.502039,1.120775,-0.417327,-0.328159,1.445670,1.035797,0.215256,-2.598591,-0.314991,-0.254705,0.295397,1.137754,-1.314237
1611881,-1.325031,-3.013950,-0.989311,1.120775,-0.417327,-0.328159,-0.604724,1.035797,1.202619,-3.502361,-0.314991,-0.254705,-0.972233,1.751469,-1.224002
1611882,-0.275155,-0.372662,0.495234,-0.293574,-0.417327,-0.328159,0.337583,1.035797,0.215256,0.085030,-0.314991,-0.254705,-0.443122,-1.462242,1.212357
1611884,-0.965863,-0.966151,-0.662922,0.963625,-0.417327,-0.328159,-1.342511,1.035797,2.189982,-0.149815,-0.314991,-0.254705,-0.151103,-1.159531,-1.043531
1611886,-1.242146,-0.419576,-1.432832,-1.236474,-0.417327,-0.328159,-2.104860,1.035797,1.202619,-1.957356,-0.314991,-0.254705,-0.124679,1.295330,-1.765415
1611890,0.001128,0.428337,-0.718197,-0.765024,-0.417327,-0.328159,1.040452,-0.965440,-0.772108,0.491420,-0.314991,-0.254705,0.739939,0.449398,0.941650
1611891,-0.413297,-1.312549,-0.810323,0.335025,-0.417327,-0.328159,-1.176499,1.035797,0.215256,-1.957356,-0.314991,-0.254705,0.074294,1.469492,-1.314237
1611892,-1.242146,-2.629479,-0.483934,0.177875,-0.417327,-0.328159,-0.980882,1.035797,1.202619,0.297474,-0.314991,-0.254705,-0.220534,-0.545816,-1.133766
1611893,-0.275155,0.974913,-1.207781,0.649325,-0.417327,-0.328159,0.886070,1.035797,1.202619,0.835017,-0.314991,-0.254705,0.188543,-0.031622,0.490473
1611894,-1.518429,-1.443630,0.536032,-1.079324,-0.417327,-0.328159,-2.408584,1.035797,0.215256,0.297474,-0.314991,-0.254705,-0.316273,1.046526,-0.050940


Need to investigate:

1. How to save the fitted transformations so they can be re-applied to the testing dataset or to live data (for example, an income of $100,000 won't necessarily have the same z-score in the testing dataset as it does in the training dataset).

2. Should the dummy variables be left alone (i.e., excluded from the z-score transformation)?