In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

In [2]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [15]:
train_df['debt_settlement_flag'].value_counts()

N    12175
Y        5
Name: debt_settlement_flag, dtype: int64

In [14]:
test_df['debt_settlement_flag'].value_counts()

N    4702
Name: debt_settlement_flag, dtype: int64

In [4]:
test_df_cols=list(test_df.columns)

In [6]:
set(train_df_cols) == set(test_df_cols)

True

In [7]:
X = pd.get_dummies(train_df)
y_train = X['loan_status_high_risk']
X_train = X.drop(["loan_status_low_risk","loan_status_high_risk"],axis=1)

In [16]:
X_train['debt_settlement_flag_Y']

0        0
1        0
2        0
3        0
4        0
        ..
12175    0
12176    0
12177    0
12178    0
12179    0
Name: debt_settlement_flag_Y, Length: 12180, dtype: uint8

In [24]:
y_train.value_counts()

0    6090
1    6090
Name: loan_status_high_risk, dtype: int64

In [25]:
y_test.value_counts()

0    2351
1    2351
Name: loan_status_high_risk, dtype: int64

# Processing test data

In [17]:
# add missing dummy variables to testing set
X_test = pd.get_dummies(test_df)
y_test = X_test['loan_status_high_risk']
X_test = X_test.drop(["loan_status_low_risk","loan_status_high_risk"],axis=1)
X_test['debt_settlement_flag_Y'] = 0
X_test

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,67991,67991,40000.0,0.0819,814.70,140000.0,19.75,0.0,1.0,18.0,...,0,1,0,1,1,0,1,0,1,0
1,25429,25429,6000.0,0.1524,208.70,55000.0,11.52,2.0,0.0,8.0,...,0,1,0,1,1,0,1,0,1,0
2,38496,38496,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,...,0,1,0,1,1,0,1,0,1,0
3,19667,19667,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,...,0,1,0,1,1,0,1,0,1,0
4,37505,37505,3600.0,0.1240,120.27,50000.0,16.08,0.0,3.0,6.0,...,0,1,0,1,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4697,77282,77282,30000.0,0.1240,673.42,140480.0,15.74,0.0,0.0,20.0,...,0,1,1,0,1,0,1,0,1,0
4698,77291,77291,24000.0,0.0756,747.22,50000.0,26.81,0.0,0.0,9.0,...,0,1,0,1,1,0,1,0,1,0
4699,77292,77292,10000.0,0.2305,387.36,33000.0,38.51,0.0,2.0,7.0,...,1,1,1,0,1,0,1,0,1,0
4700,77297,77297,8000.0,0.1862,205.86,38000.0,16.36,0.0,1.0,8.0,...,0,1,0,1,1,0,1,0,1,0


In [18]:
set(X_train.columns) - set(X_test.columns)

set()

# Consider the models

I think the random Forest would perform better than logistic regression in this case. Because test dataset includes numerical value,such as installment, annual_inc,etc. "Since Logistic Regression depends on a calculation based on ‘weights’, numerical encoding of categorical variables can lead the algorithm to treat certain categories are of higher importance compared to others, depending on the number assigned." -- https://medium.com/@bemali_61284/random-forest-vs-logistic-regression-16c0c8e2484c. On the other hand, "by randomly selecting subsets of features, some trees of the forest can isolate more important features while increasing the overall accuracy of the result".-- https://medium.com/@bemali_61284/random-forest-vs-logistic-regression-16c0c8e2484c.


# Fit a Logistic Regression

In [26]:
from sklearn.linear_model import LogisticRegression
classifier_lib = LogisticRegression(solver='liblinear', max_iter=10000)
classifier_lib

LogisticRegression(max_iter=10000, solver='liblinear')

In [29]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier_lib.fit(X_train, y_train)

LogisticRegression(max_iter=10000, solver='liblinear')

In [28]:
print(f"Training Data Score: {classifier_lib.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier_lib.score(X_test, y_test)}")

Training Data Score: 0.7076354679802955
Testing Data Score: 0.5676307954062101


In [23]:
# Train a Random Forest Classifier model and print the model score
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train, y_train)
print(f'Training Score: {clf.score(X_train, y_train)}')
print(f'Training Score: {clf.score(X_test, y_test)}')

Training Score: 1.0
Training Score: 0.6154827732879625


In [30]:
# Scale the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [32]:
# Train the Logistic Regression model on the scaled data and print the model score
classifier = LogisticRegression()
classifier.fit(X_train_scaled, y_train)
print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")

Training Data Score: 0.7132183908045977
Testing Data Score: 0.7201190982560612


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [31]:
# Train a Random Forest Classifier model on the scaled data and print the model score
clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train_scaled, y_train)
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

Training Score: 1.0
Testing Score: 0.6165461505742237
