# Data Retrieval

In [4]:
import numpy as np
import pandas as pd
from pathlib import Path

In [5]:
# Load the 2019 loans (training data) and the 2020 Q1 loans (testing data), in that order
train_df, test_df = (
    pd.read_csv(Path(csv_path))
    for csv_path in ('Resources/2019loans.csv', 'Resources/2020Q1loans.csv')
)

# Preprocessing: Convert categorical data to numeric

In [6]:
# One-hot encode the categorical columns of the training data.
# The encoded frame still contains the loan_status dummy columns at this point.
dummies_2019 = train_df.pipe(pd.get_dummies)
dummies_2019.shape

(12180, 96)

In [7]:
# One-hot encode the categorical columns of the testing data.
# The encoded frame still contains the loan_status dummy columns at this point.
dummies_2020 = test_df.pipe(pd.get_dummies)
dummies_2020.shape

(4702, 95)

In [8]:
# Report every dummy column that exists in the training set but is absent from the testing set
# (this cell only lists them; it does not modify either frame)
for column in dummies_2019.columns:
    if column in dummies_2020.columns:
        continue
    print(column)

debt_settlement_flag_Y


In [9]:
# Show the training rows where debt_settlement_flag_Y is set, with both flag dummies side by side
dummies_2019.loc[
    dummies_2019['debt_settlement_flag_Y'].eq(1),
    ['debt_settlement_flag_N', 'debt_settlement_flag_Y'],
]

Unnamed: 0,debt_settlement_flag_N,debt_settlement_flag_Y
6896,0,1
6930,0,1
7243,0,1
7730,0,1
9018,0,1


In [10]:
def cleaning(row):
    """Return the complement of a row's 'debt_settlement_flag_N' value.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the key 'debt_settlement_flag_N'
        (e.g. a DataFrame row passed by ``apply(axis=1)``).

    Returns
    -------
    0 when the flag equals 1, 1 when the flag equals 0,
    and None for any other value.
    """
    flag_n = row['debt_settlement_flag_N']
    if flag_n == 1:
        return 0
    if flag_n == 0:
        return 1
    # Neither 0 nor 1: no value is produced
    return None
# Add the 'debt_settlement_flag_Y' dummy that the testing data lacks, as the complement of
# 'debt_settlement_flag_N' (N == 0 -> 1, N == 1 -> 0).
# This is a single vectorized comparison instead of a Python-level call per row via apply(axis=1);
# for 0/1 (or boolean) flags the values match the row-wise version. Any other flag value maps
# to 0 here rather than to None, so the column always stays integer-typed.
dummies_2020['debt_settlement_flag_Y'] = dummies_2020['debt_settlement_flag_N'].eq(0).astype(int)
dummies_2020.shape

(4702, 96)

In [11]:
# Split the encoded frames into features (X) and target (y).
# Both loan_status dummy columns are removed from the features in a single drop();
# 'loan_status_low_risk' is the label the models are trained to predict.
target_columns = ['loan_status_high_risk', 'loan_status_low_risk']

X_train = dummies_2019.drop(columns=target_columns)
y_train = dummies_2019['loan_status_low_risk']

X_test = dummies_2020.drop(columns=target_columns)
# The models are fitted on X_train and scored on X_test, so both must expose the same
# features in the same order. Fail loudly if the sets differ, then align the order explicitly.
assert set(X_test.columns) == set(X_train.columns), 'train/test feature columns differ'
X_test = X_test[X_train.columns]
y_test = dummies_2020['loan_status_low_risk']

# Consider the models: Prediction and comparison
Prediction: The Random Forest Classifier should perform better, because the dataset contains more categorical features than numeric ones, which is typically not well suited to Logistic Regression.

In [18]:
# Fit a Logistic Regression model on the unscaled features and print its training and test scores
from sklearn.linear_model import LogisticRegression

# NOTE(review): the recorded run of this cell reports that the solver reached its iteration limit
# on the unscaled data, so the scores below come from a fit that stopped at that limit.
LR = LogisticRegression()
LR.fit(X_train, y_train)

print(f'Training Score: {LR.score(X_train , y_train)}')
print(f'Test Score: {LR.score(X_test , y_test)}')

Training Score: 0.6485221674876848
Test Score: 0.5253083794130158


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [19]:
# Train a Random Forest Classifier model on the unscaled data and print the model score
from sklearn.ensemble import RandomForestClassifier

# A random forest is built with randomness (bootstrap samples, feature sub-sampling), so without a
# fixed random_state the scores change on every run. The seed value itself is arbitrary.
RF = RandomForestClassifier(random_state=42).fit(X_train , y_train)

print(f'Training Score: {RF.score(X_train , y_train)}')
print(f'Test Score: {RF.score(X_test , y_test)}')

Training Score: 1.0
Test Score: 0.6256911952360698


# Interim Result
The results on the "unscaled" data were as follows:
* Logistic Regression: Training Score: 0.64 / Test Score: 0.52
* Random Forest Classifier: Training Score: 1.0 / Test Score: 0.62 <p>

The Random Forest Classifier appears to perform better than Logistic Regression on the unscaled data, but its training score of 1.0 may indicate overfitting.

# Revisit the Preprocessing: Scale the data
Prediction: Scaling should improve the accuracy of gradient-based algorithms such as Logistic Regression, whereas tree-based algorithms such as the Random Forest Classifier should not benefit from it.

In [15]:
# Standardize the features: the scaler learns its statistics from the training data only,
# and the same fitted scaler is then applied to the testing data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [20]:
# Fitting the models to the scaled data

# Logistic Regression
# NOTE(review): the recorded run of this cell still reports that the solver reached its iteration
# limit, even on scaled data. The default settings are kept so this fit stays comparable with the
# unscaled one; pass a larger max_iter to both LogisticRegression fits if a converged comparison
# is wanted.
LRS = LogisticRegression().fit(X_train_scaled , y_train)
print(f'Logistic Regression Scaled Score: {LRS.score(X_test_scaled , y_test)}')

# Random Forest Classifier
# A random forest is built with randomness (bootstrap samples, feature sub-sampling), so without a
# fixed random_state the score changes on every run. The seed value itself is arbitrary.
RFS = RandomForestClassifier(random_state=42).fit(X_train_scaled , y_train)
print(f'Random Forest Classifier Score: {RFS.score(X_test_scaled , y_test)}')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression Scaled Score: 0.7201190982560612
Random Forest Classifier Score: 0.6397277754147171


# Conclusion
The results on the "scaled" data were as follows:
* Logistic Regression Score: Scaled: 0.72 (Unscaled: 0.52)
* Random Forest Classifier Score: Scaled: 0.63 (Unscaled: 0.62)<p>

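Once the data was scaled, Logistic Regression (despite being a simpler and less compute-intensive algorithm) ended up outperforming the Random Forest Classifier. This is contrary to the initial prediction, but in line with the prediction that scaling would benefit Logistic Regression and not the tree-based model. Scaling appears to be the dominant factor for Logistic Regression: it improved the test score over the unscaled data by roughly 38% in relative terms (0.52 → 0.72).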