In [13]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [4]:
# Read the lending CSV from the local Resources folder into a DataFrame
path = './Resources/lending_data.csv'
lending_df = pd.read_csv(filepath_or_buffer=path)
# Leave the frame as the cell's final expression so the notebook renders it
lending_df

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.430740,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0
...,...,...,...,...,...,...,...,...
77531,19100.0,11.261,86600,0.653580,12,2,56600,1
77532,17700.0,10.662,80900,0.629172,11,2,50900,1
77533,17600.0,10.595,80300,0.626401,11,2,50300,1
77534,16300.0,10.068,75300,0.601594,10,2,45300,1


# Prediction

Looking at the data and the target we want to predict, I expect the Logistic Regression to perform better than the Random Forest. We are trying to predict the risk level of each loan, so `loan_status` is the target value. It is a binary value and, at least from a cursory glance, the loans labelled 1 tend to have more derogatory marks, more accounts, and a higher debt-to-income ratio. Because Logistic Regression is well suited to predicting binary outcomes, I believe it will perform best.

# Analysis

In [5]:
# Ensure no null values: drop every row that contains at least one null.
# The result is rebound to the same name instead of using inplace=True, so the
# cell does not mutate the object that the loading cell created and displayed.
lending_df = lending_df.dropna()
lending_df

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.430740,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0
...,...,...,...,...,...,...,...,...
77531,19100.0,11.261,86600,0.653580,12,2,56600,1
77532,17700.0,10.662,80900,0.629172,11,2,50900,1
77533,17600.0,10.595,80300,0.626401,11,2,50300,1
77534,16300.0,10.068,75300,0.601594,10,2,45300,1


In [8]:
# Features: every column except the target. Target: the loan_status column.
X = lending_df.drop(columns='loan_status')
Y = lending_df['loan_status']

# Split into training and testing sets; random_state=1 seeds the shuffle so
# the same split is produced on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=1,
)
X_train.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
29175,8600.0,6.792,44500,0.325843,3,0,14500
23020,7800.0,6.419,41000,0.268293,2,0,11000
31269,10000.0,7.386,50100,0.401198,4,1,20100
35479,9300.0,7.093,47300,0.365751,3,0,17300
13470,9200.0,7.045,46900,0.360341,3,0,16900


In [12]:
# Scale the data.
# StandardScaler is imported in the notebook's import cell rather than here.
# The scaler is fitted on the training split only and then applied to both
# splits, so the test set does not influence the fitted scaling parameters.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train_scaled

array([[-0.57708952, -0.56367666, -0.5652314 , ..., -0.43489843,
        -0.67289855, -0.5652314 ],
       [-0.95927354, -0.98302549, -0.98332378, ..., -0.96014741,
        -0.67289855, -0.98332378],
       [ 0.09173251,  0.10413354,  0.10371642, ...,  0.09035056,
         1.04334691,  0.10371642],
       ...,
       [ 0.18727852,  0.19070153,  0.18733489, ...,  0.09035056,
         1.04334691,  0.18733489],
       [ 0.61723554,  0.61229888,  0.61737277, ...,  0.61559954,
         1.04334691,  0.61737277],
       [ 0.37837052,  0.3807014 ,  0.37846284, ...,  0.61559954,
         1.04334691,  0.37846284]])

In [15]:
# Train a Logistic Regression model on the scaled training data and print the
# model score (mean accuracy) for the training and testing splits.
clf = LogisticRegression()
clf.fit(X_train_scaled, Y_train)
print(f'Training Score: {clf.score(X_train_scaled, Y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, Y_test)}')

Training Score: 0.9942908240473243
Testing Score: 0.9936545604622369


In [14]:
# Train a Random Forest Classifier (500 trees, seeded with random_state=1) on
# the scaled training data and print the model score for both splits.
# Note: this rebinds `clf`, replacing the model assigned to that name earlier.
clf = RandomForestClassifier(n_estimators=500, random_state=1)
clf.fit(X_train_scaled, Y_train)
print(f'Training Score: {clf.score(X_train_scaled, Y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, Y_test)}')

Training Score: 0.9975409272252029
Testing Score: 0.9917457697069748


# Thoughts

Both of these models scored highly on the data, though I believe the Logistic Regression fits the data a little better. Although the Random Forest has a higher Training Score, the Logistic Regression's Training and Testing Scores are much closer together, which suggests it is overfitting less. Additionally, the Testing Score for the Logistic Regression is higher than the Random Forest's, which makes me believe it is the stronger model if we are using this one data set as the only evidence.