In [115]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [116]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [117]:
# Convert categorical data to numeric and separate target feature for training data
#drop the unnamed 0 column, the index column, and the loan_status column to make X, and use the loan_status column to make y
X_train = train_df.drop(columns=['Unnamed: 0','index','loan_status'])
y_train = train_df['loan_status']
#Convert the y labels to numbers
label_encoder= LabelEncoder()
label_encoder.fit(y_train)
y_train_label= label_encoder.transform(y_train)
X_train_dummies = pd.get_dummies(X_train,drop_first=True)


In [118]:
# Convert categorical data to numeric and separate target feature for testing data
X_test = test_df.drop(columns=['Unnamed: 0','index','loan_status'])
y_test = test_df['loan_status']
y_test_label= label_encoder.transform(y_test)
X_test_dummies=pd.get_dummies(X_test,drop_first=True)

In [119]:
# add missing dummy variables to testing set
missing_columns = set(X_train_dummies)-set(X_test_dummies)
for missing_column in missing_columns:
    X_test_dummies[missing_column]=0
X_train_dummies, X_test_dummies = X_train_dummies.align(X_test_dummies, axis=1)

Prediction of better performance:
As per an article from https://www.linkedin.com/pulse/logistic-regression-vs-random-forest-classifier-chintan-chitroda random forests performs better vs linear regression with more categorical data as opposed to numeric data. In this set, it appears only 8 of the columns are caegorical, and the remaining 78 columns are numeric. This would lead me to believe that linear regression will perform better

In [120]:
# Train the Logistic Regression model on the unscaled data and print the model score
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression()
lr_model.fit(X_train_dummies,y_train_label)
train_score = lr_model.score(X_train_dummies,y_train_label)
test_score = lr_model.score(X_test_dummies,y_test_label)
print(f'The training score is {train_score} and the testing score is {test_score}')

The training score is 0.6498357963875205 and the testing score is 0.5157379838366652


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [121]:
# Train a Random Forest Classifier model and print the model score
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier()
rf_model.fit(X_train_dummies,y_train_label)
train_score = rf_model.score(X_train_dummies,y_train_label)
test_score = rf_model.score(X_test_dummies,y_test_label)
print(f'The training score is {train_score} and the testing score is {test_score}')

The training score is 1.0 and the testing score is 0.6416418545299872


In [122]:
# Scale the data
scaler = StandardScaler()
scaler.fit(X_train_dummies)
X_train_scaled = scaler.transform(X_train_dummies)
X_test_scaled = scaler.transform(X_test_dummies)

In [123]:
# Train the Logistic Regression model on the scaled data and print the model score
lr_model.fit(X_train_scaled,y_train_label)
train_score = lr_model.score(X_train_scaled,y_train_label)
test_score = lr_model.score(X_test_scaled,y_test_label)
print(f'The training score is {train_score} and the testing score is {test_score}')

The training score is 0.7083743842364532 and the testing score is 0.7681837515950659


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [124]:
# Train a Random Forest Classifier model on the scaled data and print the model score
rf_model.fit(X_train_scaled,y_train_label)
train_score = rf_model.score(X_train_scaled,y_train_label)
test_score = rf_model.score(X_test_scaled,y_test_label)
print(f'The training score is {train_score} and the testing score is {test_score}')

The training score is 1.0 and the testing score is 0.6369629944704381
