# Identifying Safe Loans with Decision Trees

In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the raw LendingClub loan records from the CSV next to the notebook.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. NOTE(review): the recorded run emitted a warning
# at load time, presumably pandas' mixed-type DtypeWarning, which this option
# avoids -- confirm against a fresh run.
data = pd.read_csv('lending-club-data.csv', low_memory=False)
data.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,sub_grade_num,delinq_2yrs_zero,pub_rec_zero,collections_12_mths_zero,short_emp,payment_inc_ratio,final_d,last_delinq_none,last_record_none,last_major_derog_none
0,1077501,1296599,5000,5000,4975,36 months,10.65,162.87,B,B2,...,0.4,1.0,1.0,1.0,0,8.1435,20141201T000000,1,1,1
1,1077430,1314167,2500,2500,2500,60 months,15.27,59.83,C,C4,...,0.8,1.0,1.0,1.0,1,2.3932,20161201T000000,1,1,1
2,1077175,1313524,2400,2400,2400,36 months,15.96,84.33,C,C5,...,1.0,1.0,1.0,1.0,0,8.25955,20141201T000000,1,1,1
3,1076863,1277178,10000,10000,10000,36 months,13.49,339.31,C,C1,...,0.2,1.0,1.0,1.0,0,8.27585,20141201T000000,0,1,1
4,1075269,1311441,5000,5000,5000,36 months,7.9,156.46,A,A4,...,0.8,1.0,1.0,1.0,0,5.21533,20141201T000000,1,1,1


In [3]:
# List every column label of the freshly loaded frame.
data.columns

Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title',
       'emp_length', 'home_ownership', 'annual_inc', 'is_inc_v', 'issue_d',
       'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose', 'title',
       'zip_code', 'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line',
       'inq_last_6mths', 'mths_since_last_delinq', 'mths_since_last_record',
       'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
       'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d',
       'collections_12_mths_ex_med', 'mths_since_last_major_derog',
       'policy_code', 'not_compliant', 'status', 'inactive_loans', 'bad_loans',
       'emp_length_num', 'grade_num', 'sub_gra

In [4]:
# Recode the label: +1 when bad_loans == 0 (safe loan), -1 for anything else.
# A vectorised comparison followed by a map replaces the per-row Python lambda
# and yields the same integer column.
data['safe_loans'] = data['bad_loans'].eq(0).map({True: 1, False: -1})

# Count each class by summing the boolean mask with the Series' own .sum()
# rather than iterating over it with the builtin sum().
print('# safe loans:', (data['safe_loans'] == 1).sum())
print('# bad loans:', (data['safe_loans'] == -1).sum())

# safe loans: 99457
# bad loans: 23150


In [5]:
# Predictor columns kept for modelling.
features = [
    'grade',
    'sub_grade',
    'short_emp',
    'emp_length_num',
    'home_ownership',
    'dti',
    'purpose',
    'term',
    'last_delinq_none',
    'last_major_derog_none',
    'revol_util',
    'total_rec_late_fee',
]
# Label column created from bad_loans.
target = 'safe_loans'

# Narrow the frame down to the predictors followed by the label.
data = data[[*features, target]]
data.head()

Unnamed: 0,grade,sub_grade,short_emp,emp_length_num,home_ownership,dti,purpose,term,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans
0,B,B2,0,11,RENT,27.65,credit_card,36 months,1,1,83.7,0.0,1
1,C,C4,1,1,RENT,1.0,car,60 months,1,1,9.4,0.0,-1
2,C,C5,0,11,RENT,8.72,small_business,36 months,1,1,98.5,0.0,1
3,C,C1,0,11,RENT,20.0,other,36 months,0,1,21.0,16.97,1
4,A,A4,0,4,RENT,11.2,wedding,36 months,1,1,28.3,0.0,1


In [6]:
# Inspect the dtype of every remaining column; the object-typed ones are the
# categorical variables that get one-hot encoded next.
data.dtypes

grade                     object
sub_grade                 object
short_emp                  int64
emp_length_num             int64
home_ownership            object
dti                      float64
purpose                   object
term                      object
last_delinq_none           int64
last_major_derog_none      int64
revol_util               float64
total_rec_late_fee       float64
safe_loans                 int64
dtype: object

In [7]:
# One-hot encode the string-valued columns: each category level becomes its own
# indicator column named "<column>_<level>".
categorical_variables = [
    'grade',
    'sub_grade',
    'home_ownership',
    'purpose',
    'term',
]
data = pd.get_dummies(data=data, columns=categorical_variables)
data.head()

Unnamed: 0,short_emp,emp_length_num,dti,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans,grade_A,grade_B,...,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_small_business,purpose_vacation,purpose_wedding,term_ 36 months,term_ 60 months
0,0,11,27.65,1,1,83.7,0.0,1,0,1,...,0,0,0,0,0,0,0,0,1,0
1,1,1,1.0,1,1,9.4,0.0,-1,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,11,8.72,1,1,98.5,0.0,1,0,0,...,0,0,0,0,0,1,0,0,1,0
3,0,11,20.0,0,1,21.0,16.97,1,0,0,...,0,0,0,0,1,0,0,0,1,0
4,0,4,11.2,1,1,28.3,0.0,1,1,0,...,0,0,0,0,0,0,0,1,1,0


In [8]:
# Split the encoded data into training and validation sets, using the row
# positions stored in the two index files.
train_index = list(pd.read_json('module-5-assignment-1-train-idx.json')[0])
val_index = list(pd.read_json('module-5-assignment-1-validation-idx.json')[0])

train, val = data.iloc[train_index], data.iloc[val_index]

# Separate the predictors (every column except the label) from the label,
# as plain NumPy arrays.
train_X, train_y = train.drop('safe_loans', axis=1).values, train['safe_loans'].values
val_X, val_y = val.drop('safe_loans', axis=1).values, val['safe_loans'].values

In [9]:
# Train two decision trees that differ only in their maximum depth.
# random_state is pinned so that re-running the notebook rebuilds the same
# trees: scikit-learn permutes the candidate features at each split, so equally
# good splits can otherwise be resolved differently from run to run.
model = DecisionTreeClassifier(max_depth=6, random_state=0).fit(X=train_X, y=train_y)
small_model = DecisionTreeClassifier(max_depth=2, random_state=0).fit(X=train_X, y=train_y)

In [10]:
# Pick the first two safe and the first two risky loans of the validation set
# to inspect the model's predictions by hand.
sample_safe = val[val['safe_loans'] == 1].iloc[:2]
sample_bad = val[val['safe_loans'] == -1].iloc[:2]

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows in the
# same order (safe loans first, then risky ones) and keeps the original index.
sample = pd.concat([sample_safe, sample_bad])
sample

Unnamed: 0,short_emp,emp_length_num,dti,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans,grade_A,grade_B,...,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_small_business,purpose_vacation,purpose_wedding,term_ 36 months,term_ 60 months
19,0,11,11.18,1,1,82.4,0.0,1,0,1,...,0,0,0,0,0,0,0,0,1,0
79,0,10,16.85,1,1,96.4,0.0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
24,0,3,13.97,0,1,59.5,0.0,-1,0,0,...,0,0,0,0,1,0,0,0,0,1
41,0,11,16.33,1,1,62.1,0.0,-1,1,0,...,0,0,0,0,0,0,0,0,1,0


In [11]:
# Feature matrix for the sample loans: the label column is removed and the
# remaining columns are exposed as a NumPy array.
sample_features = sample.drop(labels='safe_loans', axis='columns')
sample_X = sample_features.values

In [12]:
# Predicted labels (+1 safe / -1 risky) for the sample loans
model.predict(sample_X)

array([ 1, -1, -1,  1], dtype=int64)

In [13]:
# True labels of the same sample loans, for comparison with the predictions
np.array(sample['safe_loans'])

array([ 1,  1, -1, -1], dtype=int64)

In [14]:
# Class probabilities for each sample loan: one row per loan, one column per
# class, ordered as in model.classes_.
model.predict_proba(sample_X)

array([[ 0.34156543,  0.65843457],
       [ 0.53630646,  0.46369354],
       [ 0.64750958,  0.35249042],
       [ 0.20789474,  0.79210526]])

In [15]:
# Mean accuracy of the depth-6 tree on the validation set
model.score(val_X, val_y)

0.63614821197759586

In [16]:
# Train a deeper tree (max_depth=10) to compare against the depth-6 model.
# The fit call uses the same keyword arguments as the other trees, and
# random_state is pinned so the tree is rebuilt identically on every run.
big_model = DecisionTreeClassifier(max_depth=10, random_state=0).fit(X=train_X, y=train_y)

In [17]:
# Mean accuracy of the depth-10 tree on the validation set
big_model.score(val_X, val_y)

0.62688496337785438

In [18]:
# Predict every validation loan and line the predictions up with the true labels.
model_estimation = model.predict(val_X)
df = pd.DataFrame({
    'actual y': val_y,
    'estimated y': model_estimation,
})
df.head()

Unnamed: 0,actual y,estimated y
0,-1,-1
1,-1,1
2,-1,-1
3,-1,-1
4,-1,1


In [19]:
# Cost charged for each kind of mistake (previously hard-coded in the formula).
FALSE_NEGATIVE_COST = 10000  # actual +1 (safe) predicted as -1
FALSE_POSITIVE_COST = 20000  # actual -1 (risky) predicted as +1

# Count the mistakes by summing boolean masks instead of building filtered
# copies of the frame; int() keeps the counts as plain Python integers.
false_negative = int(((df['actual y'] == 1) & (df['estimated y'] == -1)).sum())
false_positive = int(((df['actual y'] == -1) & (df['estimated y'] == 1)).sum())

# Total cost of the depth-6 model's mistakes on the validation set.
FALSE_NEGATIVE_COST * false_negative + FALSE_POSITIVE_COST * false_positive

50390000