In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
#Predicting Credit Risk
#
#LendingClub is a peer-to-peer lending services company that allows individual investors to partially fund personal loans as well as buy and sell notes backing the loans on a secondary market. LendingClub offers their previous data through an API.
#You will be using this data to create machine learning models to classify the risk level of given loans. Specifically, you will be comparing the Logistic Regression model and Random Forest Classifier.

In [3]:
# Loading the csv provided to train the dataset

train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [4]:
# Display the dataframe for visualization

train_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,low_risk,n,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,321143,20000.0,0.124,448.95,MORTGAGE,197000.0,Source Verified,low_risk,n,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,11778,11778,3000.0,0.124,100.22,RENT,45000.0,Not Verified,low_risk,n,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,169382,169382,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,low_risk,n,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [5]:
#Display the test dataframe for visualization

test_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,67991,40000.0,0.0819,814.7,MORTGAGE,140000.0,Not Verified,low_risk,n,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,25429,25429,6000.0,0.1524,208.7,RENT,55000.0,Not Verified,low_risk,n,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
2,38496,38496,3600.0,0.1695,128.27,RENT,42000.0,Not Verified,low_risk,n,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
3,19667,19667,20000.0,0.1524,478.33,RENT,100000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
4,37505,37505,3600.0,0.124,120.27,RENT,50000.0,Not Verified,low_risk,n,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N


In [6]:
# Convert and separate the data for training

y_train = train_df["loan_status"]
X_train = train_df.drop(columns = ["loan_status"])
X_train = pd.get_dummies(X_train)
X_train.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,57107,57107,13375.0,0.1797,483.34,223000.0,29.99,0.0,0.0,15.0,...,0,1,0,1,1,0,1,0,1,0
1,141451,141451,21000.0,0.1308,478.68,123000.0,11.26,2.0,0.0,16.0,...,0,1,0,1,1,0,1,0,1,0
2,321143,321143,20000.0,0.124,448.95,197000.0,11.28,0.0,0.0,12.0,...,0,1,0,1,1,0,1,0,1,0
3,11778,11778,3000.0,0.124,100.22,45000.0,18.08,0.0,0.0,12.0,...,0,1,0,1,1,0,1,0,1,0
4,169382,169382,30000.0,0.1612,1056.49,133000.0,27.77,0.0,2.0,13.0,...,0,1,0,1,1,0,1,0,1,0


In [7]:
y_test = test_df["loan_status"]
X_test = test_df.drop(columns = ["loan_status"])
X_test = pd.get_dummies(X_test)
X_test.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,67991,67991,40000.0,0.0819,814.7,140000.0,19.75,0.0,1.0,18.0,...,0,0,1,0,1,1,0,1,0,1
1,25429,25429,6000.0,0.1524,208.7,55000.0,11.52,2.0,0.0,8.0,...,0,0,1,0,1,1,0,1,0,1
2,38496,38496,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,...,0,0,1,0,1,1,0,1,0,1
3,19667,19667,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,...,0,0,1,0,1,1,0,1,0,1
4,37505,37505,3600.0,0.124,120.27,50000.0,16.08,0.0,3.0,6.0,...,0,0,1,0,1,1,0,1,0,1


In [8]:
# add missing dummy variables to testing set (in training only)
for column in X_train.columns:
    if column not in X_test.columns:
        X_test[column] = 0

In [9]:
# Scale the data
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [24]:
# Train the Logistic Regression model on the scaled data and print the model score
logisticRegr_scaled = LogisticRegression(
    solver='lbfgs',
    max_iter=500,
    random_state=0
)
logisticRegr_scaled.fit(X_train_scaled, y_train)
print("Logistic Regressin scaled score: ", logisticRegr_scaled.score(X_test_scaled, y_test))

Logistic Regressin scaled score:  0.6599319438536793


In [23]:
# Train a Random Forest Classifier model and print the model score
randomForestClass = RandomForestClassifier(random_state=0)
randomForestClass.fit(X_train, y_train)
print("Random Forest Classifier score: ", randomForestClass.score(X_test,y_test))

Random Forest Classifier score:  0.6210123351765207


In [22]:
# Train a Random Forest Classifier model on the scaled data and print the model score
randomForestClass_scaled = RandomForestClassifier(random_state=0)
randomForestClass_scaled.fit(X_train_scaled, y_train)
print("Random Forest Classifier scaled score: ", randomForestClass_scaled.score(X_test_scaled, y_test))

Random Forest Classifier scaled score:  0.5646533390046788


In [13]:
# Convert categorical data to numeric and separate target feature for training data
# Data is exported to csv file 
Yes_No_dict = {'Y':1,'N':0}
train_df2 = train_df.replace({'hardship_flag':Yes_No_dict, 'debt_settlement_flag':Yes_No_dict})
Home_ownership_dict = {'ANY':0,'RENT':1,'MORTGAGE':2,'OWN':3}
train_df3 = train_df2.replace({'home_ownership':Home_ownership_dict})
verification_dict = {'Not Verified':0,'Source Verified':1,'Verified':1}
train_df4 = train_df3.replace({'verification_status':verification_dict})
Loan_status_dict = {'low_risk':1,'high_risk':0}
train_df5 = train_df4.replace({'loan_status':Loan_status_dict})
Initial_list_status_dict = {'w':0,'f':1}
train_df6 = train_df5.replace({'initial_list_status':Initial_list_status_dict})
Application_Type_dict = {'Individual':1,'Joint App':0}
train_df7 = train_df6.replace({'application_type':Application_Type_dict})

train_df8 = train_df7.drop(['index','pymnt_plan'],axis='columns')

file_path = Path('Resources/cleaned2019creditdata.csv')
train_df8.to_csv(file_path, index=False)

In [14]:
train_df9 = train_df8.drop(['Unnamed: 0'],axis='columns')
train_df9.head()

X_train = train_df9.drop('loan_status', axis=1)
y_train = train_df9['loan_status'].values
print(X_train.select_dtypes(include=[object]))

Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, ...]

[12180 rows x 0 columns]


In [15]:
# Convert categorical data to numeric and separate target feature for testing data
# Data is exported to csv file 
testdf2 = test_df.replace({'hardship_flag':Yes_No_dict, 'debt_settlement_flag':Yes_No_dict})
testdf3 = testdf2.replace({'home_ownership':Home_ownership_dict})
testdf4 = testdf3.replace({'verification_status':verification_dict})
testdf5 = testdf4.replace({'loan_status':Loan_status_dict})
testdf6 = testdf5.replace({'initial_list_status':Initial_list_status_dict})
testdf7 = testdf6.replace({'application_type':Application_Type_dict})

testdf8 = testdf7.drop(['index','pymnt_plan'],axis='columns')

file_path = Path('Resources/cleaned2020creditdata.csv')
testdf8.to_csv(file_path, index=False)

In [16]:
testdf9 = testdf8.drop(['Unnamed: 0'],axis='columns')
testdf9.head()



Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,dti,delinq_2yrs,inq_last_6mths,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,40000.0,0.0819,814.7,2,140000.0,0,1,19.75,0.0,1.0,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,0,0
1,6000.0,0.1524,208.7,1,55000.0,0,1,11.52,2.0,0.0,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,0,0
2,3600.0,0.1695,128.27,1,42000.0,0,1,6.74,0.0,0.0,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,0,0
3,20000.0,0.1524,478.33,1,100000.0,0,1,12.13,0.0,2.0,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,0,0
4,3600.0,0.124,120.27,1,50000.0,0,1,16.08,0.0,3.0,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,0,0


In [17]:
# add missing dummy variables the dataset for testing


testdf10 = pd.get_dummies(testdf9)

X_test = testdf10.drop('loan_status', axis=1)
y_test = testdf10['loan_status'].values



In [18]:
# Train the Logistic Regression model on the unscaled data and print the model score

reg = LogisticRegression().fit(X_train, y_train)
reg.score(X_test, y_test)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.5168013611229264

In [19]:
# Train a Random Forest Classifier model and print the model score

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train, y_train)
print(f'Training Score: {clf.score(X_train, y_train)}')
print(f'Testing Score: {clf.score(X_test, y_test)}')

Training Score: 1.0
Testing Score: 0.6399404508719694


In [20]:
# Scale the data

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [21]:
# Train a Random Forest Classifier model on the scaled data and print the model score
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

Testing Score: 0.5


# Below is the result from the model:

LogisticRegressin scaled score:  0.7201190982560612
RandomForestClassifier scaled score:  0.6150574223734581

The Logistic Regression is a great tool for two common applications: binary classification, and attributing cause-effect relationships where the response is a categorical variable. Above we see that we had a score of 72%. Was expecting a higher score based on the dataset.

Before I stared the analysis, my assumption was that the Random Forect Classifier would do better. The reason is because logisitic regression tend to do better when the number of noise variables is than or equal to the number of explanatory vairables which is not really the case here.



I guess with machine learning, the best is to run it to all the model and see which one performed better