In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier


In [2]:
train_df = pd.read_csv(Path('2019loans.csv'))
test_df = pd.read_csv(Path('2020Q1loans.csv'))

In [3]:
train_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,target
0,25200.0,0.1102,548.17,RENT,65000.0,Not Verified,n,42.67,0.0,1.0,...,10.0,0.0,0.0,282008.0,93765.0,57300.0,116320.0,N,N,low_risk
1,14000.0,0.2055,375.22,MORTGAGE,80000.0,Source Verified,n,15.47,0.0,0.0,...,75.0,0.0,0.0,434976.0,137629.0,17800.0,95032.0,N,N,low_risk
2,30000.0,0.1171,992.28,MORTGAGE,200000.0,Not Verified,n,14.14,0.0,0.0,...,100.0,0.0,0.0,99849.0,68769.0,13500.0,86349.0,N,N,low_risk
3,12000.0,0.1033,256.92,MORTGAGE,50000.0,Not Verified,n,21.41,0.0,0.0,...,33.3,0.0,0.0,209700.0,44654.0,13000.0,39700.0,N,N,low_risk
4,10625.0,0.1612,259.06,OWN,29000.0,Not Verified,n,25.87,0.0,0.0,...,0.0,0.0,0.0,35300.0,11893.0,18800.0,8000.0,N,N,low_risk


In [4]:
# Convert categorical data to numeric and separate target feature for training data

train_df1 = train_df.drop("target", axis=1)
X_train = pd.get_dummies(train_df1, dtype=float)

# Create our target
y_train = train_df['target']

# X_train.describe()
y_train.value_counts()



low_risk     6395
high_risk    6395
Name: target, dtype: int64

In [5]:
# Converting output labels to 0 and 1
y_train = LabelEncoder().fit_transform(train_df['target'])
y_train


array([1, 1, 1, ..., 0, 0, 0])

In [6]:
# Convert categorical data to numeric and separate target feature for testing data

test_df1 = test_df.drop("target", axis=1)
X_test = pd.get_dummies(test_df1, dtype=float)

# Create our target
y_test = test_df['target']

# print(X_test.describe())
# y_test.value_counts()

# Converting output labels to 0 and 1
y_test = LabelEncoder().fit_transform(test_df['target'])
y_test
X_train.columns

Index(['loan_amnt', 'int_rate', 'installment', 'annual_inc', 'dti',
       'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal',
       'total_acc', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_amnt', 'collections_12_mths_ex_med', 'policy_code',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
       'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
       'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct',
       'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
       'mort_acc', 'mths_since_recent_bc', 'mths_since_recent_in

In [7]:
# add missing dummy variables to testing set

In [8]:
# Train the Logistic Regression model on the unscaled data and print the model score

classifier = LogisticRegression()
classifier

LogisticRegression()

In [9]:
print("Shape: ", X_train.shape, y_train.shape)
print("Shape: ", X_test.shape, y_test.shape)

Shape:  (12790, 91) (12790,)
Shape:  (8418, 91) (8418,)


In [10]:
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [11]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.6153244722439406
Testing Data Score: 0.4895462105013067


In [12]:
# Prediction Vs  actual
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
0,1,1
1,0,1
2,0,1
3,1,1
4,0,1
...,...,...
8413,1,0
8414,1,0
8415,1,0
8416,1,0


In [13]:
# Model Accuracy

y_true = y_test
y_pred = classifier.predict(X_test)
confusion_matrix(y_true, y_pred)

# Score
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
accuracy = (tp + tn) / (tp + fp + tn + fn) 
print(f"Accuracy: {accuracy}")


Accuracy: 0.4895462105013067


In [14]:
# Calculate the classification report

print(classification_report(y_test, predictions, target_names= ["Low Risk", "High Risk"]))


              precision    recall  f1-score   support

    Low Risk       0.48      0.24      0.32      4209
   High Risk       0.49      0.74      0.59      4209

    accuracy                           0.49      8418
   macro avg       0.49      0.49      0.46      8418
weighted avg       0.49      0.49      0.46      8418



 Train a Random Forest Classifier model and print the model score


In [15]:
# Fit a model, and then print a classification report
clf = RandomForestClassifier(random_state=1).fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred, target_names=["Low Risk", "High Risk"]))
print(f'Training Score: {clf.score(X_train, y_train)}')
print(f'Testing Score: {clf.score(X_test, y_test)}')


              precision    recall  f1-score   support

    Low Risk       0.65      0.68      0.66      4209
   High Risk       0.66      0.63      0.65      4209

    accuracy                           0.66      8418
   macro avg       0.66      0.66      0.66      8418
weighted avg       0.66      0.66      0.66      8418

Training Score: 1.0
Testing Score: 0.6563316702304586


In [16]:
# Scale the data
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [17]:
# Train the Logistic Regression model on the scaled data and print the model score

classifier = LogisticRegression()
classifier.fit(X_train_scaled, y_train)

print(f"Scaled Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Scaled Testing Data Score: {classifier.score(X_test_scaled, y_test)}")


Scaled Training Data Score: 0.7007036747458952
Scaled Testing Data Score: 0.7198859586600143


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
# Prediction Vs  actual
predictions = classifier.predict(X_test_scaled)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})


Unnamed: 0,Prediction,Actual
0,0,1
1,0,1
2,0,1
3,1,1
4,0,1
...,...,...
8413,0,0
8414,0,0
8415,0,0
8416,0,0


In [19]:
print(classification_report(y_test, predictions,target_names=["Low Risk", "High Risk"]))

              precision    recall  f1-score   support

    Low Risk       0.65      0.94      0.77      4209
   High Risk       0.90      0.49      0.64      4209

    accuracy                           0.72      8418
   macro avg       0.78      0.72      0.70      8418
weighted avg       0.78      0.72      0.70      8418



In [21]:
# Train a Random Forest Classifier model on the scaled data and print the model score

# Fit a model, and then print a classification report
clf = RandomForestClassifier(random_state=1).fit(X_train_scaled, y_train)
y_pred = clf.predict(X_test_scaled)


print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')


Training Score: 1.0
Testing Score: 0.6571632216678546


In [22]:
print(classification_report(y_test, y_pred, target_names=["Low Risk", "High Risk"]))

              precision    recall  f1-score   support

    Low Risk       0.65      0.68      0.67      4209
   High Risk       0.67      0.63      0.65      4209

    accuracy                           0.66      8418
   macro avg       0.66      0.66      0.66      8418
weighted avg       0.66      0.66      0.66      8418



In [34]:
predict = clf.predict(X_test_scaled)
pd.DataFrame({"Prediction": predict, "Actual": y_test})


Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,0,1
3,1,1
4,0,1
...,...,...
8413,0,0
8414,1,0
8415,0,0
8416,0,0
