In [12]:
import pandas as pd
import numpy as np

from tabulate import tabulate

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

Load source data from Github

In [13]:
url = 'https://raw.githubusercontent.com/vicky727luo/Maxlulu/main/Lending%20Club%20Project/loan_raw_data.csv'
LoanData = pd.read_csv(url)
LoanData.head()

Unnamed: 0,id,loan_amnt,funded_amnt,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,annual_inc,verification_status,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,out_prncp,out_prncp_inv,policy_code,loan_status
0,1077501,5000,5000,36 months,10.65,162.87,B,B2,10+ years,3,24000.0,3,27.65,0,31048,1,,,3,0,13648,83.7,9,0.0,0.0,1,2
1,1077430,2500,2500,60 months,15.27,59.83,C,C4,< 1 year,3,30000.0,2,1.0,0,36251,5,,,3,0,1687,9.4,4,0.0,0.0,1,1
2,1077175,2400,2400,36 months,15.96,84.33,C,C5,10+ years,3,12252.0,1,8.72,0,37196,2,,,2,0,2956,98.5,10,0.0,0.0,1,2
3,1076863,10000,10000,36 months,13.49,339.31,C,C1,10+ years,3,49200.0,2,20.0,0,35096,1,35.0,,10,0,5598,21.0,37,0.0,0.0,1,2
4,1075269,5000,5000,36 months,7.9,156.46,A,A4,3 years,3,36000.0,2,11.2,0,38292,3,,,9,0,7963,28.3,12,0.0,0.0,1,2


Clean Data

In [15]:
# Clean raw data
CleanData = LoanData.select_dtypes(include=[np.number]).interpolate().dropna()

yPredict = CleanData.loan_status
XClean = CleanData.drop(["loan_status"], axis=1)

CleanData.head()

Unnamed: 0,id,loan_amnt,funded_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,out_prncp,out_prncp_inv,policy_code,loan_status
61,1069093,12500,8925,17.27,223.11,3,30000.0,2,13.16,0,34790,0,47.4,113.0,7,1,9844,70.8,28,0.0,0.0,1,1
62,1069030,16425,16425,14.27,563.53,3,44544.0,2,22.71,0,37530,1,47.52,111.0,10,0,15747,83.4,18,0.0,0.0,1,2
63,1068906,8200,8200,21.28,223.14,3,75000.0,2,12.48,0,36526,3,47.64,109.0,6,0,9012,73.9,11,0.0,0.0,1,1
64,1069073,15000,15000,14.65,517.42,2,61000.0,3,11.88,0,38139,1,47.76,107.0,8,0,19397,79.5,18,0.0,0.0,1,2
65,1069043,20975,13575,17.58,341.63,1,44000.0,3,18.79,0,36617,0,47.88,105.0,13,1,20657,79.4,21,0.0,0.0,1,2


Setup Training

In [16]:
# RandomForest
X_train, X_test, y_train, y_test = train_test_split(XClean, yPredict, random_state=42, test_size=.33)
clf = RandomForestRegressor(n_jobs=2, n_estimators=1000)
model = clf.fit(X_train, y_train)

Feature Importance

In [17]:
# Feature Importance ranked by score
headers = ["name", "score"]
values = sorted(zip(X_train.columns, model.feature_importances_), key=lambda x: x[1] * -1)
print(tabulate(values, headers, tablefmt="plain"))

name                          score
int_rate                0.112652
dti                     0.0795894
mths_since_last_record  0.0789419
id                      0.0788009
annual_inc              0.0773945
earliest_cr_line        0.0772323
mths_since_last_delinq  0.076061
revol_bal               0.0756117
revol_util              0.0746663
installment             0.0559532
total_acc               0.0535592
open_acc                0.0417665
loan_amnt               0.0309885
funded_amnt             0.0305165
inq_last_6mths          0.0198885
verification_status     0.012644
home_ownership          0.0105908
pub_rec                 0.00656479
delinq_2yrs             0.00643488
out_prncp_inv           7.72085e-05
out_prncp               6.58455e-05
policy_code             0
