In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [2]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [3]:
#Find all non-numeric columns

list = train_df.dtypes
list2 = train_df.columns

for i in range(len(list)):
    if (list[i] == 'object'):
        print(list2[i])

home_ownership
verification_status
loan_status
pymnt_plan
initial_list_status
application_type
hardship_flag
debt_settlement_flag


In [4]:
# Determine Unique values for each categorical column

print(train_df['home_ownership'].unique())
print(train_df['verification_status'].unique())
print(train_df['loan_status'].unique())
print(train_df['pymnt_plan'].unique())
print(train_df['initial_list_status'].unique())
print(train_df['application_type'].unique())
print(train_df['hardship_flag'].unique())
print(train_df['debt_settlement_flag'].unique())


['MORTGAGE' 'RENT' 'OWN' 'ANY']
['Not Verified' 'Source Verified' 'Verified']
['low_risk' 'high_risk']
['n']
['w' 'f']
['Individual' 'Joint App']
['N' 'Y']
['N' 'Y']


In [5]:
# Create dictionaries to replace all categorical data with appropriate numeric data

home_ownership_dict = {'MORTGAGE': 0, 'RENT': 1, 'OWN': 2, 'ANY': 3}
verification_status_dict = {'Not Verified': 0, 'Source Verified': 1, 'Verified': 2}
loan_status_dict = {'low_risk': 0, 'high_risk': 1}
pymnt_plan_dict = {'n': 0, 'y': 1}
initial_list_status_dict = {'w': 0, 'f': 1}
application_type_dict = {'Individual': 0, 'Joint App': 1}
hardship_flag_dict = {'N': 0, 'Y': 1}
debt_settlement_flag_dict = {'N': 0, 'Y': 1}


In [6]:
# Convert categorical data to numeric and separate target feature for training data

train_df2 = train_df.replace({'home_ownership': home_ownership_dict})
train_df2 = train_df2.replace({'verification_status': verification_status_dict})
train_df2 = train_df2.replace({'loan_status': loan_status_dict})
train_df2 = train_df2.replace({'pymnt_plan': pymnt_plan_dict})
train_df2 = train_df2.replace({'initial_list_status': initial_list_status_dict})
train_df2 = train_df2.replace({'application_type': application_type_dict})
train_df2 = train_df2.replace({'hardship_flag': hardship_flag_dict})
train_df2 = train_df2.replace({'debt_settlement_flag': debt_settlement_flag_dict})
train_df2

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,0,223000.0,0,0,0,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,0,0
1,141451,141451,21000.0,0.1308,478.68,0,123000.0,1,0,0,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,0,0
2,321143,321143,20000.0,0.1240,448.95,0,197000.0,1,0,0,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,0,0
3,11778,11778,3000.0,0.1240,100.22,1,45000.0,0,0,0,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,0,0
4,169382,169382,30000.0,0.1612,1056.49,0,133000.0,1,0,0,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12175,354912,354912,19975.0,0.2565,801.09,1,28000.0,0,1,0,...,100.0,16.7,0.0,0.0,50055.0,28192.0,18700.0,19055.0,0,0
12176,354944,354944,15000.0,0.1774,540.34,1,50000.0,2,1,0,...,90.5,11.1,0.0,0.0,70324.0,57025.0,13300.0,54824.0,0,0
12177,354973,354973,3600.0,0.1862,131.28,1,60000.0,0,1,0,...,100.0,0.0,0.0,0.0,83765.0,55156.0,14800.0,53065.0,0,0
12178,355002,355002,15000.0,0.0881,475.68,0,62000.0,1,1,0,...,100.0,0.0,0.0,0.0,189930.0,23748.0,7000.0,32930.0,0,0


In [7]:
#Find all non-numeric columns

list = train_df2.dtypes
list2 = train_df2.columns

for i in range(len(list)):
    if (list[i] == 'object'):
        print(list2[i])

In [8]:
# Check for nulls
list = train_df2.isnull().sum()
list2 = train_df2.columns

for i in range(len(list)):
    if (list[i] > 0):
        print(list2[i])

In [9]:
# Convert categorical data to numeric and separate target feature for testing data

test_df2 = test_df.replace({'home_ownership': home_ownership_dict})
test_df2 = test_df2.replace({'verification_status': verification_status_dict})
test_df2 = test_df2.replace({'loan_status': loan_status_dict})
test_df2 = test_df2.replace({'pymnt_plan': pymnt_plan_dict})
test_df2 = test_df2.replace({'initial_list_status': initial_list_status_dict})
test_df2 = test_df2.replace({'application_type': application_type_dict})
test_df2 = test_df2.replace({'hardship_flag': hardship_flag_dict})
test_df2 = test_df2.replace({'debt_settlement_flag': debt_settlement_flag_dict})
test_df2

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,67991,40000.0,0.0819,814.70,0,140000.0,0,0,0,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,0,0
1,25429,25429,6000.0,0.1524,208.70,1,55000.0,0,0,0,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,0,0
2,38496,38496,3600.0,0.1695,128.27,1,42000.0,0,0,0,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,0,0
3,19667,19667,20000.0,0.1524,478.33,1,100000.0,0,0,0,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,0,0
4,37505,37505,3600.0,0.1240,120.27,1,50000.0,0,0,0,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4697,77282,77282,30000.0,0.1240,673.42,1,140480.0,1,1,0,...,100.0,28.6,0.0,0.0,159688.0,110873.0,48400.0,107388.0,0,0
4698,77291,77291,24000.0,0.0756,747.22,1,50000.0,0,1,0,...,100.0,0.0,0.0,0.0,62375.0,18928.0,13300.0,30775.0,0,0
4699,77292,77292,10000.0,0.2305,387.36,1,33000.0,2,1,0,...,100.0,0.0,0.0,0.0,43250.0,33022.0,8500.0,29550.0,0,0
4700,77297,77297,8000.0,0.1862,205.86,1,38000.0,1,1,0,...,95.0,0.0,1.0,0.0,31357.0,19595.0,1500.0,9657.0,0,0


In [10]:
#Find all non-numeric columns

list = train_df2.dtypes
list2 = train_df2.columns

for i in range(len(list)):
    if (list[i] == 'object'):
        print(list2[i])

In [11]:
# Check for nulls
list = train_df2.isnull().sum()
list2 = train_df2.columns

for i in range(len(list)):
    if (list[i] > 0):
        print(list2[i])

In [12]:
# Create Test and train data
y_train = train_df2['loan_status']
y_test = test_df2['loan_status']

train_df2 = train_df2.drop('loan_status', 1)
train_df2 = train_df2.drop('Unnamed: 0', 1)
X_train = train_df2

test_df2 = test_df2.drop('loan_status', 1)
test_df2 = test_df2.drop('Unnamed: 0', 1)
X_test = test_df2


Data is cleansed and all categorical data has been converted to numeric in the cells above. 
Given that we are comparing a logistic regression to a random forest classifer, I would predict
that the random forest classifier will perform better. I believe this is due to the nature of the
algorithm used and that by sampling at random it will help to eliminate some of the noise created by 
an excess number of features.


In [13]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier
classifier.fit(X_train, y_train)
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.6569786535303777
Testing Data Score: 0.5199914929817099


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
# Train a Random Forest Classifier model and print the model score

clf = RandomForestClassifier(random_state=1, n_estimators=35).fit(X_train, y_train)
print(f'Training Score: {clf.score(X_train, y_train)}')
print(f'Testing Score: {clf.score(X_test, y_test)}')

Training Score: 1.0
Testing Score: 0.6818375159506593


As predicted, the random forest classifier performed better, but only slightly. The results from both are bit surprising.


After scaling below, I would expect both models to improve.

In [15]:
# Scale the data

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [16]:
# Train the Logistic Regression model on the scaled data and print the model score

classifier = LogisticRegression()
classifier
classifier.fit(X_train, y_train)
print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")

Training Data Score: 0.6352216748768473
Testing Data Score: 0.5053168864313058


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
# Train a Random Forest Classifier model on the scaled data and print the model score

clf = RandomForestClassifier(random_state=1, n_estimators=35).fit(X_train_scaled, y_train)
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

Training Score: 1.0
Testing Score: 0.6816248404934071


The results after scaling are a bit surprising. I did not see the expected improvement on either model. 

I can think of two possible explanations for the relatively poor fit. 
The first being that the model is based on too many features as stated above. 

The second explanation is that maybe 2019 loan behavior is NOT a good predictor of loan behavior in 2020. 
We know that we are all living through a pandemic that started in 2020. Many lost their jobs, and subsequently, 
their ability pay their mortgages. This could easily be tested by splitting the 2019 data alone into train and test data. 