In [118]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

In [61]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [106]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,low_risk,n,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,321143,20000.0,0.124,448.95,MORTGAGE,197000.0,Source Verified,low_risk,n,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,11778,11778,3000.0,0.124,100.22,RENT,45000.0,Not Verified,low_risk,n,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,169382,169382,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,low_risk,n,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [105]:
test_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,67991,40000.0,0.0819,814.7,MORTGAGE,140000.0,Not Verified,low_risk,n,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,25429,25429,6000.0,0.1524,208.7,RENT,55000.0,Not Verified,low_risk,n,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
2,38496,38496,3600.0,0.1695,128.27,RENT,42000.0,Not Verified,low_risk,n,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
3,19667,19667,20000.0,0.1524,478.33,RENT,100000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
4,37505,37505,3600.0,0.124,120.27,RENT,50000.0,Not Verified,low_risk,n,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N


In [95]:
a = train_df['home_ownership'].unique()
print(a)

['MORTGAGE' 'RENT' 'OWN' 'ANY']


In [96]:
a = train_df['verification_status'].unique()
print(a)

['Not Verified' 'Source Verified' 'Verified']


In [97]:
a = train_df['loan_status'].unique()
print(a)

['low_risk' 'high_risk']


In [98]:
a = train_df['initial_list_status'].unique()
print(a)

['w' 'f']


In [99]:
a = train_df['application_type'].unique()
print(a)

['Individual' 'Joint App']


In [100]:
a = train_df['hardship_flag'].unique()
print(a)

['N' 'Y']


In [101]:
a = train_df['debt_settlement_flag'].unique()
print(a)

['N' 'Y']


In [102]:
a = test_df['debt_settlement_flag'].unique()
print(a)

['N']


In [65]:
# Convert categorical data to numeric and separate target feature for training data
home_ownership_dict = {'MORTGAGE': 0, 'RENT': 1, 'OWN': 2, 'ANY': 3}
home_ownership_train_df = train_df.replace({'home_ownership': home_ownership_dict})
home_ownership_train_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,0,223000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,141451,21000.0,0.1308,478.68,0,123000.0,Source Verified,low_risk,n,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,321143,20000.0,0.124,448.95,0,197000.0,Source Verified,low_risk,n,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,11778,11778,3000.0,0.124,100.22,1,45000.0,Not Verified,low_risk,n,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,169382,169382,30000.0,0.1612,1056.49,0,133000.0,Source Verified,low_risk,n,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [74]:
home_ownership_train_df['home_ownership'].unique()

array([0, 1, 2, 3])

In [66]:
verification_status_dict = {'Not Verified': 0, 'Source Verified': 1, 'Verified': 2}
verification_status_train_df = home_ownership_train_df.replace({'verification_status': verification_status_dict})
verification_status_train_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,0,223000.0,0,low_risk,n,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,141451,21000.0,0.1308,478.68,0,123000.0,1,low_risk,n,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,321143,20000.0,0.124,448.95,0,197000.0,1,low_risk,n,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,11778,11778,3000.0,0.124,100.22,1,45000.0,0,low_risk,n,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,169382,169382,30000.0,0.1612,1056.49,0,133000.0,1,low_risk,n,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [73]:
verification_status_train_df['verification_status'].unique()

array([0, 1, 2])

In [69]:
loan_status_dict = {'low_risk': 0, 'high_risk': 1}
loan_status_train_df = verification_status_train_df.replace({'loan_status': loan_status_dict})
loan_status_train_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,0,223000.0,0,0,n,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,141451,21000.0,0.1308,478.68,0,123000.0,1,0,n,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,321143,20000.0,0.124,448.95,0,197000.0,1,0,n,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,11778,11778,3000.0,0.124,100.22,1,45000.0,0,0,n,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,169382,169382,30000.0,0.1612,1056.49,0,133000.0,1,0,n,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [72]:
loan_status_train_df['loan_status'].unique()

array([0, 1])

In [70]:
initial_list_status_dict = {'w': 0, 'f': 1}
initial_list_status_train_df = loan_status_train_df.replace({'initial_list_status': initial_list_status_dict})
initial_list_status_train_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,0,223000.0,0,0,n,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,141451,21000.0,0.1308,478.68,0,123000.0,1,0,n,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,321143,20000.0,0.124,448.95,0,197000.0,1,0,n,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,11778,11778,3000.0,0.124,100.22,1,45000.0,0,0,n,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,169382,169382,30000.0,0.1612,1056.49,0,133000.0,1,0,n,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [71]:
initial_list_status_train_df['initial_list_status'].unique()

array([0, 1])

In [75]:
application_type_dict = {'Individual': 0, 'Joint App': 1}
application_type_train_df = initial_list_status_train_df.replace({'application_type': application_type_dict})
application_type_train_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,0,223000.0,0,0,n,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,141451,21000.0,0.1308,478.68,0,123000.0,1,0,n,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,321143,20000.0,0.124,448.95,0,197000.0,1,0,n,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,11778,11778,3000.0,0.124,100.22,1,45000.0,0,0,n,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,169382,169382,30000.0,0.1612,1056.49,0,133000.0,1,0,n,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [76]:
application_type_train_df['application_type'].unique()

array([0, 1])

In [77]:
hardship_flag_dict = {'N': 0, 'Y': 1}
hardship_flag_train_df = application_type_train_df.replace({'hardship_flag': hardship_flag_dict})
hardship_flag_train_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,0,223000.0,0,0,n,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,0,N
1,141451,141451,21000.0,0.1308,478.68,0,123000.0,1,0,n,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,0,N
2,321143,321143,20000.0,0.124,448.95,0,197000.0,1,0,n,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,0,N
3,11778,11778,3000.0,0.124,100.22,1,45000.0,0,0,n,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,0,N
4,169382,169382,30000.0,0.1612,1056.49,0,133000.0,1,0,n,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,0,N


In [78]:
hardship_flag_train_df['hardship_flag'].unique()

array([0, 1])

In [79]:
debt_settlement_flag_dict = {'N': 0, 'Y': 1}
debt_settlement_flag_train_df = hardship_flag_train_df.replace({'debt_settlement_flag': debt_settlement_flag_dict})
debt_settlement_flag_train_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,0,223000.0,0,0,n,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,0,0
1,141451,141451,21000.0,0.1308,478.68,0,123000.0,1,0,n,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,0,0
2,321143,321143,20000.0,0.124,448.95,0,197000.0,1,0,n,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,0,0
3,11778,11778,3000.0,0.124,100.22,1,45000.0,0,0,n,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,0,0
4,169382,169382,30000.0,0.1612,1056.49,0,133000.0,1,0,n,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,0,0


In [80]:
debt_settlement_flag_train_df['debt_settlement_flag'].unique()

array([0, 1])

In [81]:
# Convert categorical data to numeric and separate target feature for testing data
home_ownership_dict = {'MORTGAGE': 0, 'RENT': 1, 'OWN': 2, 'ANY': 3}
home_ownership_test_df = test_df.replace({'home_ownership': home_ownership_dict})
home_ownership_test_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,67991,40000.0,0.0819,814.7,0,140000.0,Not Verified,low_risk,n,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,25429,25429,6000.0,0.1524,208.7,1,55000.0,Not Verified,low_risk,n,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
2,38496,38496,3600.0,0.1695,128.27,1,42000.0,Not Verified,low_risk,n,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
3,19667,19667,20000.0,0.1524,478.33,1,100000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
4,37505,37505,3600.0,0.124,120.27,1,50000.0,Not Verified,low_risk,n,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N


In [82]:
home_ownership_test_df['home_ownership'].unique()

array([0, 1, 2, 3])

In [83]:
verification_status_dict = {'Not Verified': 0, 'Source Verified': 1, 'Verified': 2}
verification_status_test_df = home_ownership_test_df.replace({'verification_status': verification_status_dict})
verification_status_test_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,67991,40000.0,0.0819,814.7,0,140000.0,0,low_risk,n,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,25429,25429,6000.0,0.1524,208.7,1,55000.0,0,low_risk,n,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
2,38496,38496,3600.0,0.1695,128.27,1,42000.0,0,low_risk,n,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
3,19667,19667,20000.0,0.1524,478.33,1,100000.0,0,low_risk,n,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
4,37505,37505,3600.0,0.124,120.27,1,50000.0,0,low_risk,n,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N


In [84]:
verification_status_test_df['verification_status'].unique()

array([0, 2, 1])

In [85]:
loan_status_dict = {'low_risk': 0, 'high_risk': 1}
loan_status_test_df = verification_status_test_df.replace({'loan_status': loan_status_dict})
loan_status_test_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,67991,40000.0,0.0819,814.7,0,140000.0,0,0,n,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,25429,25429,6000.0,0.1524,208.7,1,55000.0,0,0,n,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
2,38496,38496,3600.0,0.1695,128.27,1,42000.0,0,0,n,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
3,19667,19667,20000.0,0.1524,478.33,1,100000.0,0,0,n,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
4,37505,37505,3600.0,0.124,120.27,1,50000.0,0,0,n,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N


In [86]:
loan_status_test_df['loan_status'].unique()

array([0, 1])

In [87]:
initial_list_status_dict = {'w': 0, 'f': 1}
initial_list_status_test_df = loan_status_test_df.replace({'initial_list_status': initial_list_status_dict})
initial_list_status_test_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,67991,40000.0,0.0819,814.7,0,140000.0,0,0,n,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,25429,25429,6000.0,0.1524,208.7,1,55000.0,0,0,n,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
2,38496,38496,3600.0,0.1695,128.27,1,42000.0,0,0,n,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
3,19667,19667,20000.0,0.1524,478.33,1,100000.0,0,0,n,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
4,37505,37505,3600.0,0.124,120.27,1,50000.0,0,0,n,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N


In [88]:
initial_list_status_test_df['initial_list_status'].unique()

array([0, 1])

In [89]:
application_type_dict = {'Individual': 0, 'Joint App': 1}
application_type_test_df = initial_list_status_test_df.replace({'application_type': application_type_dict})
application_type_test_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,67991,40000.0,0.0819,814.7,0,140000.0,0,0,n,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,25429,25429,6000.0,0.1524,208.7,1,55000.0,0,0,n,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
2,38496,38496,3600.0,0.1695,128.27,1,42000.0,0,0,n,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
3,19667,19667,20000.0,0.1524,478.33,1,100000.0,0,0,n,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
4,37505,37505,3600.0,0.124,120.27,1,50000.0,0,0,n,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N


In [90]:
application_type_test_df['application_type'].unique()

array([0, 1])

In [91]:
hardship_flag_dict = {'N': 0, 'Y': 1}
hardship_flag_test_df = application_type_test_df.replace({'hardship_flag': hardship_flag_dict})
hardship_flag_test_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,67991,40000.0,0.0819,814.7,0,140000.0,0,0,n,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,0,N
1,25429,25429,6000.0,0.1524,208.7,1,55000.0,0,0,n,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,0,N
2,38496,38496,3600.0,0.1695,128.27,1,42000.0,0,0,n,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,0,N
3,19667,19667,20000.0,0.1524,478.33,1,100000.0,0,0,n,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,0,N
4,37505,37505,3600.0,0.124,120.27,1,50000.0,0,0,n,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,0,N


In [92]:
hardship_flag_test_df['hardship_flag'].unique()

array([0, 1])

In [93]:
debt_settlement_flag_dict = {'N': 0, 'Y': 1}
debt_settlement_flag_test_df = hardship_flag_test_df.replace({'debt_settlement_flag': debt_settlement_flag_dict})
debt_settlement_flag_test_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,67991,40000.0,0.0819,814.7,0,140000.0,0,0,n,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,0,0
1,25429,25429,6000.0,0.1524,208.7,1,55000.0,0,0,n,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,0,0
2,38496,38496,3600.0,0.1695,128.27,1,42000.0,0,0,n,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,0,0
3,19667,19667,20000.0,0.1524,478.33,1,100000.0,0,0,n,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,0,0
4,37505,37505,3600.0,0.124,120.27,1,50000.0,0,0,n,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,0,0


In [94]:
debt_settlement_flag_test_df['debt_settlement_flag'].unique()

array([0])

In [115]:
clean_test_df = debt_settlement_flag_test_df.drop(columns=['Unnamed: 0','index','pymnt_plan' ])

In [116]:
clean_train_df = debt_settlement_flag_train_df.drop(columns=['Unnamed: 0','index','pymnt_plan' ])

In [121]:
X = clean_train_df.drop('loan_status', axis=1)
X

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,dti,delinq_2yrs,inq_last_6mths,open_acc,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,13375.0,0.1797,483.34,0,223000.0,0,29.99,0.0,0.0,15.0,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,0,0
1,21000.0,0.1308,478.68,0,123000.0,1,11.26,2.0,0.0,16.0,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,0,0
2,20000.0,0.1240,448.95,0,197000.0,1,11.28,0.0,0.0,12.0,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,0,0
3,3000.0,0.1240,100.22,1,45000.0,0,18.08,0.0,0.0,12.0,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,0,0
4,30000.0,0.1612,1056.49,0,133000.0,1,27.77,0.0,2.0,13.0,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12175,19975.0,0.2565,801.09,1,28000.0,0,28.42,0.0,0.0,15.0,...,100.0,16.7,0.0,0.0,50055.0,28192.0,18700.0,19055.0,0,0
12176,15000.0,0.1774,540.34,1,50000.0,2,23.43,4.0,0.0,16.0,...,90.5,11.1,0.0,0.0,70324.0,57025.0,13300.0,54824.0,0,0
12177,3600.0,0.1862,131.28,1,60000.0,0,28.80,0.0,1.0,14.0,...,100.0,0.0,0.0,0.0,83765.0,55156.0,14800.0,53065.0,0,0
12178,15000.0,0.0881,475.68,0,62000.0,1,11.44,0.0,0.0,5.0,...,100.0,0.0,0.0,0.0,189930.0,23748.0,7000.0,32930.0,0,0


In [120]:
y_label = LabelEncoder().fit_transform(clean_train_df['loan_status'])
y_label

array([0, 0, 0, ..., 1, 1, 1])

In [112]:
# add missing dummy variables to testing set
train_objs_num = len(clean_train_df)
dataset = pd.concat(objs=[clean_train_df, clean_test_df], axis=0)
dataset_preprocessed = pd.get_dummies(dataset)
train_preprocessed = dataset_preprocessed[:train_objs_num]
test_preprocessed = dataset_preprocessed[train_objs_num:]


In [122]:
X_train, X_test, y_train, y_test = train_test_split(X, y_label, random_state=1)

In [125]:
# Train the Logistic Regression model on the unscaled data and print the model score
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.6526546250684182
Testing Data Score: 0.6476190476190476


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Train a Random Forest Classifier model and print the model score

In [None]:
# Scale the data

In [None]:
# Train the Logistic Regression model on the scaled data and print the model score

In [None]:
# Train a Random Forest Classifier model on the scaled data and print the model score