In [1]:
import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.datasets import make_regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

# Prediction
Because there are 94 categorical features to model for, I think the random forest model will be better than the logistical regression in both unscaled and scaled data.<br>
This dataset is not linear, and logistical regression is better for modeling linear data.

## Actual Results
Overall the scores for both Random Forest and Logistic Regression were low. Logistic Regression had a better score overall.<br>
Unscaled Data: Random Forest Model did better with a score of 0.6405784772437261.<br>
Scaled Data: Logistic Regression Model did better with a score of 0.6703530412590387.

In [2]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [3]:
# Convert categorical data to numeric and separate target feature for training data
X_train = train_df.drop('loan_status', axis=1)
X_dummies_train = pd.get_dummies(X_train)
y_label_train = LabelEncoder().fit_transform(train_df["loan_status"])

In [4]:
# Convert categorical data to numeric and separate target feature for testing data
X_test = test_df.drop('loan_status', axis=1)
X_dummies_test = pd.get_dummies(X_test)
y_label_test = LabelEncoder().fit_transform(test_df["loan_status"])

In [None]:
# add missing dummy variables to testing set

In [None]:
X_dummies_test.shape

In [None]:
X_dummies_train.shape

In [None]:
# find the different columns between both dfs
X_dummies_train.columns.difference(X_dummies_test.columns)

In [None]:
# check for unique values in 2019 df
X_dummies_train["debt_settlement_flag_Y"].unique()

In [5]:
# match columns from training df to testing df
X_dummies_test = X_dummies_test.reindex(columns=X_dummies_train.columns, fill_value=0)

In [None]:
X_dummies_test

In [6]:
# Train the Logistic Regression model on the unscaled data and print the model score
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
model.fit(X_dummies_train, y_label_train)
print(f"Training Logistic Regression Score: {model.score(X_dummies_train,y_label_train)}")
print(f"Testing Logistic Regression Score: {model.score(X_dummies_test,y_label_test)}")

Training Logistic Regression Score: 0.648440065681445
Testing Logistic Regression Score: 0.5253083794130158


In [7]:
# Train a Random Forest Classifier model and print the model score
clf = RandomForestClassifier(max_features="sqrt",random_state=1).fit(X_dummies_train,y_label_train)
print(f"Training RFC Score: {clf.score(X_dummies_train,y_label_train)}")
print(f"Testing RFC Score: {clf.score(X_dummies_test,y_label_test)}")

Training RFC Score: 1.0
Testing RFC Score: 0.6405784772437261


In [8]:
# Scale the data
# Training data
X_dummies_train_scaler = StandardScaler().fit(X_dummies_train)
X_dummies_train_scaled = X_dummies_train_scaler.transform(X_dummies_train)

# Testing data
X_dummies_test_scaler = StandardScaler().fit(X_dummies_test)
X_dummies_test_scaled = X_dummies_test_scaler.transform(X_dummies_test)


In [13]:
# Train the Logistic Regression model on the scaled data and print the model score
model.fit(X_dummies_train_scaled, y_label_train)
print(f"Scaled - Training Logistic Regression Score: {model.score(X_dummies_train_scaled,y_label_train)}")
print(f"Scaled - Testing Logistic Regression Score: {model.score(X_dummies_test_scaled,y_label_test)}")

Scaled - Training Logistic Regression Score: 0.713136288998358
Scaled - Testing Logistic Regression Score: 0.6703530412590387


In [14]:
# Train a Random Forest Classifier model on the scaled data and print the model score
clf = RandomForestClassifier(max_features="sqrt",random_state=1).fit(X_dummies_train_scaled,y_label_train)
print(f"Scaled - Training RFC Score: {clf.score(X_dummies_train_scaled,y_label_train)}")
print(f"Scaled - Testing RFC Score: {clf.score(X_dummies_test_scaled,y_label_test)}")

Scaled - Training RFC Score: 1.0
Scaled - Testing RFC Score: 0.5710336027222459
