In [30]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier


In [31]:
train_df = pd.read_csv(Path('2019loans.csv'))
test_df = pd.read_csv(Path('2020Q1loans.csv'))

In [43]:
train_df

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,target
0,25200.0,0.1102,548.17,RENT,65000.0,Not Verified,n,42.67,0.0,1.0,...,10.0,0.0,0.0,282008.0,93765.0,57300.0,116320.0,N,N,low_risk
1,14000.0,0.2055,375.22,MORTGAGE,80000.0,Source Verified,n,15.47,0.0,0.0,...,75.0,0.0,0.0,434976.0,137629.0,17800.0,95032.0,N,N,low_risk
2,30000.0,0.1171,992.28,MORTGAGE,200000.0,Not Verified,n,14.14,0.0,0.0,...,100.0,0.0,0.0,99849.0,68769.0,13500.0,86349.0,N,N,low_risk
3,12000.0,0.1033,256.92,MORTGAGE,50000.0,Not Verified,n,21.41,0.0,0.0,...,33.3,0.0,0.0,209700.0,44654.0,13000.0,39700.0,N,N,low_risk
4,10625.0,0.1612,259.06,OWN,29000.0,Not Verified,n,25.87,0.0,0.0,...,0.0,0.0,0.0,35300.0,11893.0,18800.0,8000.0,N,N,low_risk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12785,30000.0,0.1430,702.73,MORTGAGE,54000.0,Not Verified,n,25.98,0.0,0.0,...,50.0,1.0,0.0,344934.0,75029.0,8800.0,83584.0,N,N,high_risk
12786,21275.0,0.2055,570.19,RENT,52000.0,Verified,n,20.10,0.0,0.0,...,40.0,0.0,0.0,67400.0,35654.0,28400.0,20000.0,N,N,high_risk
12787,10000.0,0.2565,401.05,RENT,120000.0,Not Verified,n,14.38,1.0,0.0,...,60.0,0.0,0.0,72253.0,44671.0,15700.0,44553.0,N,N,high_risk
12788,14400.0,0.1430,494.26,MORTGAGE,75000.0,Not Verified,n,22.50,0.0,0.0,...,50.0,0.0,0.0,305090.0,36244.0,15100.0,53974.0,N,N,high_risk


In [33]:
# Convert categorical data to numeric and separate target feature for training data

train_df1 = train_df.drop("target", axis=1)
X_train = pd.get_dummies(train_df1, dtype=float)

# Create our target
y_train = train_df['target']

# X_train.describe()
y_train.value_counts()



high_risk    6395
low_risk     6395
Name: target, dtype: int64

In [34]:
# Converting output labels to 0 and 1
y_train = LabelEncoder().fit_transform(train_df['target'])
y_train


array([1, 1, 1, ..., 0, 0, 0])

In [39]:
# Convert categorical data to numeric and separate target feature for testing data

test_df1 = test_df.drop("target", axis=1)
X_test = pd.get_dummies(test_df1, dtype=float)

# Create our target
y_test = test_df['target']

# print(X_test.describe())
# y_test.value_counts()

# Converting output labels to 0 and 1
y_test = LabelEncoder().fit_transform(test_df['target'])
y_test


array([1, 1, 1, ..., 0, 0, 0])

In [40]:
# Get missing columns in the training test
missing_cols = set( X_train.columns ) - set(X_test.columns )
missing_cols


set()

In [41]:
# Add a missing column in test set with default value equal to 0
for c in missing_cols:
    X_test[c] = 0


In [42]:
# Ensure the order of column in the test set is in the same order as in train set
X_test = X_test[X_train.columns]
X_test

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,12000.0,0.2055,449.34,70000.0,28.56,0.0,1.0,9.0,1.0,18051.0,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
1,36000.0,0.0819,733.23,200000.0,11.38,0.0,0.0,13.0,0.0,35928.0,...,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0
2,37225.0,0.1308,848.51,122700.0,16.83,0.0,0.0,11.0,0.0,32279.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0
3,12000.0,0.1102,392.98,64500.0,36.63,0.0,2.0,15.0,0.0,24069.0,...,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0
4,25000.0,0.1774,631.31,50000.0,33.82,0.0,0.0,12.0,0.0,22815.0,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8413,10000.0,0.2305,387.36,33000.0,38.51,0.0,2.0,7.0,0.0,6342.0,...,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
8414,8000.0,0.1862,205.86,38000.0,16.36,0.0,1.0,8.0,1.0,11636.0,...,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0
8415,30000.0,0.2055,1123.34,180000.0,12.06,0.0,0.0,8.0,0.0,4771.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
8416,25000.0,0.2565,1002.62,60000.0,22.44,0.0,0.0,12.0,0.0,10979.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0


In [12]:
# add missing dummy variables to testing set

# There are no dummy missing variable in the testing set

In [44]:
# Train the Logistic Regression model on the unscaled data and print the model score

classifier = LogisticRegression()
classifier

LogisticRegression()

In [45]:
print("Shape: ", X_train.shape, y_train.shape)
print("Shape: ", X_test.shape, y_test.shape)

Shape:  (12790, 91) (12790,)
Shape:  (8418, 91) (8418,)


In [46]:
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [47]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.6153244722439406
Testing Data Score: 0.4895462105013067


In [48]:
# Prediction Vs  actual
predictions = classifier.predict(X_test)
# pd.DataFrame({"Prediction": predictions, "Actual": y_test})
predictions

array([1, 0, 0, ..., 1, 1, 0])

In [49]:
# Model Accuracy

y_true = y_test
y_pred = classifier.predict(X_test)
# confusion_matrix(y_true, y_pred)
print(confusion_matrix(y_true, y_pred))

# Score
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
accuracy = (tp + tn) / (tp + fp + tn + fn) 
print(f"Accuracy: {accuracy}")


[[1007 3202]
 [1095 3114]]
Accuracy: 0.4895462105013067


In [52]:
# Calculate the classification report

print(classification_report(y_test, predictions, target_names= ["Low Risk", "High Risk"]))


              precision    recall  f1-score   support

    Low Risk       0.48      0.24      0.32      4209
   High Risk       0.49      0.74      0.59      4209

    accuracy                           0.49      8418
   macro avg       0.49      0.49      0.46      8418
weighted avg       0.49      0.49      0.46      8418



 Train a Random Forest Classifier model and print the model score


In [53]:
# Fit a model, and then print a classification report
clf = RandomForestClassifier(random_state=1).fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred, target_names=["Low Risk", "High Risk"]))
print(f'Training Score: {clf.score(X_train, y_train)}')
print(f'Testing Score: {clf.score(X_test, y_test)}')


              precision    recall  f1-score   support

    Low Risk       0.65      0.68      0.66      4209
   High Risk       0.66      0.63      0.65      4209

    accuracy                           0.66      8418
   macro avg       0.66      0.66      0.66      8418
weighted avg       0.66      0.66      0.66      8418

Training Score: 1.0
Testing Score: 0.6563316702304586


In [54]:
from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(random_state=1).fit(X_train, y_train)
print(classification_report(y_test, y_pred, target_names=["Low Risk", "High Risk"]))
print(f'Training Score: {clf.score(X_train, y_train)}')
print(f'Testing Score: {clf.score(X_test, y_test)}')


              precision    recall  f1-score   support

    Low Risk       0.65      0.68      0.66      4209
   High Risk       0.66      0.63      0.65      4209

    accuracy                           0.66      8418
   macro avg       0.66      0.66      0.66      8418
weighted avg       0.66      0.66      0.66      8418

Training Score: 0.7205629397967162
Testing Score: 0.4965550011879306


In [67]:
# Scale the data
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [68]:
# Train the Logistic Regression model on the scaled data and print the model score

classifier = LogisticRegression()
classifier.fit(X_train_scaled, y_train)

print(f"Scaled Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Scaled Testing Data Score: {classifier.score(X_test_scaled, y_test)}")


Scaled Training Data Score: 0.7007036747458952
Scaled Testing Data Score: 0.7198859586600143


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [69]:
# Prediction Vs  actual
predictions = classifier.predict(X_test_scaled)
# pd.DataFrame({"Prediction": predictions, "Actual": y_test})


In [70]:
print(classification_report(y_test, predictions,target_names=["Low Risk", "High Risk"]))

              precision    recall  f1-score   support

    Low Risk       0.65      0.94      0.77      4209
   High Risk       0.90      0.49      0.64      4209

    accuracy                           0.72      8418
   macro avg       0.78      0.72      0.70      8418
weighted avg       0.78      0.72      0.70      8418



In [71]:
# Train a Random Forest Classifier model on the scaled data and print the model score

# Fit a model, and then print a classification report
clf = RandomForestClassifier(random_state=1).fit(X_train_scaled, y_train)
y_pred = clf.predict(X_test_scaled)


print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

Training Score: 1.0
Testing Score: 0.6571632216678546


In [72]:
print(classification_report(y_test, y_pred, target_names=["Low Risk", "High Risk"]))

              precision    recall  f1-score   support

    Low Risk       0.65      0.68      0.67      4209
   High Risk       0.67      0.63      0.65      4209

    accuracy                           0.66      8418
   macro avg       0.66      0.66      0.66      8418
weighted avg       0.66      0.66      0.66      8418



In [73]:
from sklearn.ensemble import AdaBoostClassifier

clf_a = AdaBoostClassifier(random_state=1).fit(X_train_scaled, y_train)
print(classification_report(y_test, y_pred, target_names=["Low Risk", "High Risk"]))
print(f'Training Score: {clf_a.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf_a.score(X_test_scaled, y_test)}')




              precision    recall  f1-score   support

    Low Risk       0.65      0.68      0.67      4209
   High Risk       0.67      0.63      0.65      4209

    accuracy                           0.66      8418
   macro avg       0.66      0.66      0.66      8418
weighted avg       0.66      0.66      0.66      8418

Training Score: 0.7205629397967162
Testing Score: 0.4965550011879306


In [80]:
# Extra Tree Classifier
from sklearn.ensemble import ExtraTreesClassifier

clf_et = ExtraTreesClassifier(random_state=1).fit(X_train_scaled, y_train)
y_pred = clf_et.predict(X_test_scaled)

print(classification_report(y_test, y_pred, target_names=["Low Risk", "High Risk"]))
print(f'Training Score: {clf_et.score(X_train_scaled, y_train)}')
print(f'Testing Score : {clf_et.score(X_test_scaled, y_test)}')

              precision    recall  f1-score   support

    Low Risk       0.60      0.54      0.57      4209
   High Risk       0.58      0.64      0.61      4209

    accuracy                           0.59      8418
   macro avg       0.59      0.59      0.59      8418
weighted avg       0.59      0.59      0.59      8418

Training Score: 1.0
Testing Score : 0.5886196246139226
