In [1]:
pip install imblearn

Note: you may need to restart the kernel to use updated packages.


In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [4]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))


In [5]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,low_risk,n,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,321143,20000.0,0.124,448.95,MORTGAGE,197000.0,Source Verified,low_risk,n,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,11778,11778,3000.0,0.124,100.22,RENT,45000.0,Not Verified,low_risk,n,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,169382,169382,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,low_risk,n,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [6]:
# Convert categorical data to numeric and separate target feature for training data
train_df = train_df.dropna(axis='columns', how='all')

train_df = train_df.dropna()

#remove loan status of issued, fully paid and charged off
issued_df = train_df['loan_status'] !='Issued'
train_df = train_df.loc[issued_df]
fully_df =train_df['loan_status'] !='Fully Paid'
train_df = train_df.loc[fully_df]
chargeoff_df = train_df['loan_status'] !='Charged Off'
train_df = train_df.loc[chargeoff_df]

#convert the interest rate to numeric
train_df['int_rate'] = train_df['int_rate'].replace('%', '')
train_df['int_rate'] = train_df['int_rate'].astype('float')

x = {'Current': 'low_risk'}
train_df = train_df.replace(x)

x = dict.fromkeys(['Late (31-120 days)', 'Late (16-30 days)', 'Default', 'In Grace Period'], 'high_risk')    
train_df = train_df.replace(x)

train_df.reset_index(inplace=True, drop=True)

train_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,low_risk,n,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,321143,20000.0,0.124,448.95,MORTGAGE,197000.0,Source Verified,low_risk,n,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,11778,11778,3000.0,0.124,100.22,RENT,45000.0,Not Verified,low_risk,n,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,169382,169382,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,low_risk,n,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [7]:
test_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,67991,40000.0,0.0819,814.7,MORTGAGE,140000.0,Not Verified,low_risk,n,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,25429,25429,6000.0,0.1524,208.7,RENT,55000.0,Not Verified,low_risk,n,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
2,38496,38496,3600.0,0.1695,128.27,RENT,42000.0,Not Verified,low_risk,n,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
3,19667,19667,20000.0,0.1524,478.33,RENT,100000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
4,37505,37505,3600.0,0.124,120.27,RENT,50000.0,Not Verified,low_risk,n,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N


In [8]:
# Convert categorical data to numeric and separate target feature for testing data

test_df = test_df.dropna(axis='columns', how='all')

test_df =test_df.dropna()

#remove loan status of issued, fully paid and charged off
issued_df = test_df['loan_status'] !='Issued'
test_df = test_df.loc[issued_df]
fully_df =test_df['loan_status'] !='Fully Paid'
test_df =test_df.loc[fully_df]
chargeoff_df =test_df['loan_status'] !='Charged Off'
test_df = test_df.loc[chargeoff_df]

#convert the interest rate to numeric
test_df['int_rate'] = test_df['int_rate'].replace('%', '')
test_df['int_rate'] = test_df['int_rate'].astype('float')

x = {'Current': 'low_risk'}
test_df = test_df.replace(x)

x = dict.fromkeys(['Late (31-120 days)', 'Late (16-30 days)', 'Default', 'In Grace Period'], 'high_risk')    
test_df = test_df.replace(x)

test_df.reset_index(inplace=True, drop=True)

test_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,67991,40000.0,0.0819,814.7,MORTGAGE,140000.0,Not Verified,low_risk,n,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,25429,25429,6000.0,0.1524,208.7,RENT,55000.0,Not Verified,low_risk,n,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
2,38496,38496,3600.0,0.1695,128.27,RENT,42000.0,Not Verified,low_risk,n,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
3,19667,19667,20000.0,0.1524,478.33,RENT,100000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
4,37505,37505,3600.0,0.124,120.27,RENT,50000.0,Not Verified,low_risk,n,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N


In [9]:
train_df['train'] =1
test_df['train'] = 0

In [10]:
combined = pd.concat([train_df, test_df])

In [11]:
comb_df=pd.get_dummies(combined['loan_status'])

In [12]:
train_df = combined[combined["train"]==1]
test_df = combined[combined["train"]==0]
train_df.drop(["train"], axis=1, inplace=True)
test_df.drop(["train"], axis=1, inplace=True)

In [13]:
#Prediction for modeling - My prediction is the Logistic Regression model will be a better predictor as it performs better when
#the noise variables are less than the number of explanatory variables.  The variables impacting the target in the dataset are few.

In [14]:
X = train_df
y = test_df

In [15]:
X.describe()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
count,12180.0,12180.0,12180.0,12180.0,12180.0,12180.0,12180.0,12180.0,12180.0,12180.0,...,12180.0,12180.0,12180.0,12180.0,12180.0,12180.0,12180.0,12180.0,12180.0,12180.0
mean,187893.694745,187893.694745,17399.579228,0.140899,509.150241,89893.2,22.589516,0.256979,0.566502,12.680296,...,0.0633,2.36289,94.510148,33.05413,0.11601,0.0,198980.1,62404.81798,28038.013136,56222.15
std,99710.342179,99710.342179,10238.160884,0.052679,294.617726,168437.0,23.903223,0.803027,0.814674,6.154663,...,0.425228,1.987853,8.88965,33.889522,0.3296,0.0,192389.2,58987.382738,25331.908015,52074.85
min,24.0,24.0,1000.0,0.06,30.89,500.0,0.1,0.0,0.0,2.0,...,0.0,0.0,22.2,0.0,0.0,0.0,3300.0,1275.0,200.0,127.0
25%,104269.25,104269.25,10000.0,0.1033,286.9025,50000.0,14.5,0.0,0.0,8.0,...,0.0,1.0,92.3,0.0,0.0,0.0,62600.0,26863.5,10800.0,23298.75
50%,190128.0,190128.0,15000.0,0.1308,438.53,73000.0,20.375,0.0,0.0,11.0,...,0.0,2.0,100.0,25.0,0.0,0.0,128134.5,46287.0,20700.0,42644.5
75%,274595.75,274595.75,25000.0,0.1774,682.23,103585.0,27.29,0.0,1.0,16.0,...,0.0,3.0,100.0,54.5,0.0,0.0,285407.0,77730.0,36900.0,73132.75
max,355312.0,355312.0,40000.0,0.3084,1671.88,9682505.0,999.0,15.0,5.0,65.0,...,15.0,20.0,100.0,100.0,4.0,0.0,3137619.0,917986.0,284800.0,1319104.0


In [16]:
y.value_counts()

Unnamed: 0  index  loan_amnt  int_rate  installment  home_ownership  annual_inc  verification_status  loan_status  pymnt_plan  dti    delinq_2yrs  inq_last_6mths  open_acc  pub_rec  revol_bal  total_acc  initial_list_status  out_prncp  out_prncp_inv  total_pymnt  total_pymnt_inv  total_rec_prncp  total_rec_int  total_rec_late_fee  recoveries  collection_recovery_fee  last_pymnt_amnt  collections_12_mths_ex_med  policy_code  application_type  acc_now_delinq  tot_coll_amt  tot_cur_bal  open_acc_6m  open_act_il  open_il_12m  open_il_24m  mths_since_rcnt_il  total_bal_il  il_util  open_rv_12m  open_rv_24m  max_bal_bc  all_util  total_rev_hi_lim  inq_fi  total_cu_tl  inq_last_12m  acc_open_past_24mths  avg_cur_bal  bc_open_to_buy  bc_util  chargeoff_within_12_mths  delinq_amnt  mo_sin_old_il_acct  mo_sin_old_rev_tl_op  mo_sin_rcnt_rev_tl_op  mo_sin_rcnt_tl  mort_acc  mths_since_recent_bc  mths_since_recent_inq  num_accts_ever_120_pd  num_actv_bc_tl  num_actv_rev_tl  num_bc_sats  num_bc_tl  

In [19]:
# Train the Logistic Regression model on the unscaled data and print the model score
ros = RandomUnderSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

NameError: name 'X_train' is not defined

In [None]:
# Train a Random Forest Classifier model and print the model score

In [None]:
# Scale the data

In [None]:
# Train the Logistic Regression model on the scaled data and print the model score

In [None]:
# Train a Random Forest Classifier model on the scaled data and print the model score