In [52]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [53]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [54]:
train_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,target
0,7000.0,0.1894,256.38,MORTGAGE,75000.0,Not Verified,n,28.62,0.0,2.0,...,87.5,0.0,0.0,352260.0,62666.0,35000.0,10000.0,N,N,low_risk
1,40000.0,0.1614,975.71,MORTGAGE,102000.0,Source Verified,n,11.72,2.0,0.0,...,0.0,0.0,0.0,294664.0,109911.0,9000.0,71044.0,N,N,low_risk
2,11000.0,0.2055,294.81,RENT,45000.0,Verified,n,37.25,1.0,3.0,...,7.7,0.0,0.0,92228.0,36007.0,33000.0,46328.0,N,N,low_risk
3,4000.0,0.1612,140.87,MORTGAGE,38000.0,Not Verified,n,42.89,1.0,0.0,...,100.0,0.0,0.0,284273.0,52236.0,13500.0,52017.0,N,N,low_risk
4,14000.0,0.1797,505.93,MORTGAGE,43000.0,Source Verified,n,22.16,1.0,0.0,...,25.0,0.0,0.0,120280.0,88147.0,33300.0,78680.0,N,N,low_risk


In [55]:
test_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,target
0,40000.0,0.1033,856.4,RENT,128700.0,Source Verified,n,12.47,0.0,1.0,...,57.1,0.0,0.0,63915.0,49510.0,49400.0,14515.0,Y,N,low_risk
1,24450.0,0.143,572.72,MORTGAGE,44574.0,Not Verified,n,15.05,0.0,1.0,...,0.0,0.0,0.0,136425.0,19439.0,15500.0,18925.0,N,N,low_risk
2,13500.0,0.143,316.23,OWN,60000.0,Not Verified,n,28.72,0.0,0.0,...,0.0,0.0,0.0,82124.0,65000.0,5400.0,61724.0,Y,N,low_risk
3,10625.0,0.1774,268.31,RENT,60000.0,Verified,n,15.7,0.0,4.0,...,20.0,0.0,0.0,54855.0,50335.0,23200.0,26255.0,N,N,low_risk
4,6375.0,0.1862,232.46,RENT,60000.0,Source Verified,n,35.5,0.0,0.0,...,75.0,0.0,0.0,90445.0,56541.0,15300.0,72345.0,N,N,low_risk


In [56]:
# Convert categorical data to numeric and separate target feature for training data
x_train_df= pd.get_dummies(train_df.drop(['target'], axis=1))
y_train_df= train_df['target']

In [57]:
# Convert categorical data to numeric and separate target feature for testing data
x_test_df= pd.get_dummies(test_df.drop(['target'], axis=1))
y_test_df= test_df['target']

In [58]:
# add missing dummy variables to testing set
for col in x_train_df.columns:
    if col not in x_test_df.columns:
       x_test_df[col] = 0

In [59]:
x_test_df.shape

(4702, 92)

In [60]:
x_train_df.shape

(12180, 92)

# Model Prediction

I think that the RandomForestClassifer model will be more effective with this particular dataset rather than Logstitc Regression due to the amount of categories that are in the dataset. I think that RandomForestClassifer will be better suited to handle any outliers in the data as well as the categorical nature of the data. 

# Logistic Regression

In [61]:
# Train the Logistic Regression model on the unscaled data and print the model score
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(max_iter=15000)
classifier

LogisticRegression(max_iter=15000)

In [62]:
classifier.fit(x_train_df, y_train_df)

LogisticRegression(max_iter=15000)

In [63]:
print(f"Training Data Score: {classifier.score(x_train_df, y_train_df)}")
print(f"Testing Data Score: {classifier.score(x_test_df, y_test_df)}")

Training Data Score: 0.7052545155993432
Testing Data Score: 0.5627392598894088


# RandomForestClassifier

In [64]:
# Train a Random Forest Classifier model and print the model score
clf = RandomForestClassifier(random_state=1, n_estimators=250).fit(x_train_df, y_train_df)
print(f'Training Score: {clf.score(x_train_df, y_train_df)}')
print(f'Testing Score: {clf.score(x_test_df, y_test_df)}')

Training Score: 1.0
Testing Score: 0.6448319863887707


# Analysis of Unscaled Data

When comparing the test and training scores from both models, the RandomForrestClassifer outperformed the Linear Regression which proves my earlier prediction. Although, RandomForestClassifer only received a 0.64 when in comparison with its 1.0 on the training data which leaves alot of room for improvement in the model. 

# Standard Scaler

In [48]:
# Scale the data
scaler = StandardScaler().fit(x_train_df)
x_train_scaled = scaler.transform(x_train_df)
x_test_scaled = scaler.transform(x_test_df)

In [51]:
# Train the Logistic Regression model on the scaled data and print the model score
print(f"Logistic Regression - Train Scaled Data: {classifier.score(x_train_scaled, y_train_df)}")
print(f"Logistic Regression - Test Scaled Data: {classifier.score(x_test_scaled, y_test_df)}")

Logistic Regression - Train Scaled Data: 0.5623152709359606
Logistic Regression - Test Scaled Data: 0.5572096980008507


In [50]:
# Train a Random Forest Classifier model on the scaled data and print the model score
print(f'Random Forrest - Train Scaled Score: {clf.score(x_train_scaled, y_train_df)}')
print(f'Random Forrest - Test Scaled Score: {clf.score(x_test_scaled, y_test_df)}')

Random Forrest - Train Scaled Score: 0.5
Random Forrest - Test Scaled Score: 0.5


# Analysis of Scaled Data

After scaling the data and putting it through both models, they both received almost identical scores with the testing and training data. At this point, it is inconclusive to say that one model is better than the other. 