In [74]:
import warnings
warnings.filterwarnings('ignore')

In [75]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter


In [76]:
file_path = Path('lending_data.csv')
df = pd.read_csv(file_path, error_bad_lines=False)
df.head()

Unnamed: 0,loan_size,interest_rate,homeowner,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,own,52800,0.431818,5,1,22800,low_risk
1,8400.0,6.692,own,43600,0.311927,3,0,13600,low_risk
2,9000.0,6.963,rent,46100,0.349241,3,0,16100,low_risk
3,10700.0,7.664,own,52700,0.43074,5,1,22700,low_risk
4,10800.0,7.698,mortgage,53000,0.433962,5,1,23000,low_risk


In [82]:
X = pd.get_dummies(df.drop(columns=["loan_status"]))
target = ["loan_status"]

y = df.loc[:, target].copy()



In [83]:
x.describe()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,homeowner_mortgage,homeowner_own,homeowner_rent
count,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0
mean,9805.562577,7.292333,49221.949804,0.377318,3.82661,0.392308,19221.949804,0.497472,0.398911,0.103616
std,2093.223153,0.889495,8371.635077,0.081519,1.904426,0.582086,8371.635077,0.499997,0.489678,0.304764
min,5000.0,5.25,30000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,8700.0,6.825,44800.0,0.330357,3.0,0.0,14800.0,0.0,0.0,0.0
50%,9500.0,7.172,48100.0,0.376299,4.0,0.0,18100.0,0.0,0.0,0.0
75%,10400.0,7.528,51400.0,0.416342,4.0,1.0,21400.0,1.0,1.0,0.0
max,23800.0,13.235,105200.0,0.714829,16.0,3.0,75200.0,1.0,1.0,1.0


In [84]:
y['loan_status'].value_counts()

low_risk     75036
high_risk     2500
Name: loan_status, dtype: int64

In [85]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test= train_test_split(X, y,random_state=1)

In [86]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [87]:
X_scaler = scaler.fit(X_train)

In [88]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [89]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_train, y_train)

LogisticRegression(random_state=1)

In [90]:
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.9520479254722232

In [91]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[  563,    56],
       [  102, 18663]])

In [92]:
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.85      0.91      0.99      0.88      0.95      0.90       619
   low_risk       1.00      0.99      0.91      1.00      0.95      0.91     18765

avg / total       0.99      0.99      0.91      0.99      0.95      0.91     19384



In [93]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train_scaled, y_train)
Counter(y_resampled)

Counter({'loan_status': 1})

In [94]:
from sklearn.linear_model import LogisticRegression
over_model = LogisticRegression(solver='lbfgs', random_state=1)
over_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [95]:
from sklearn.metrics import balanced_accuracy_score

# First, define "predictions" variable specifying Oversampling model
predictions = over_model.predict(X_test_scaled)

balanced_accuracy_score(y_test, predictions)

0.9934649587814939

In [96]:
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, predictions))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.83      0.99      0.99      0.91      0.99      0.99       619
   low_risk       1.00      0.99      0.99      1.00      0.99      0.99     18765

avg / total       0.99      0.99      0.99      0.99      0.99      0.99     19384



In [97]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=1, sampling_strategy=1.0)
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)
Counter(y_resampled)

smote_model = LogisticRegression(solver='lbfgs', random_state=1)
smote_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [98]:
predictions = smote_model.predict(X_test_scaled)

balanced_accuracy_score(y_test, predictions)

0.9934649587814939

In [99]:
print(classification_report_imbalanced(y_test, predictions))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.83      0.99      0.99      0.91      0.99      0.99       619
   low_risk       1.00      0.99      0.99      1.00      0.99      0.99     18765

avg / total       0.99      0.99      0.99      0.99      0.99      0.99     19384



In [100]:
from imblearn.under_sampling import ClusterCentroids

cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train_scaled, y_train)
Counter(y_resampled)

Counter({'loan_status': 1})

In [102]:
cluster_model = LogisticRegression(solver='lbfgs', random_state=1)
cluster_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [103]:
predictions = cluster_model.predict(X_test_scaled)

balanced_accuracy_score(y_test, predictions)

0.9929503031930944

In [104]:
print(classification_report_imbalanced(y_test, predictions))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.84      0.99      0.99      0.91      0.99      0.99       619
   low_risk       1.00      0.99      0.99      1.00      0.99      0.99     18765

avg / total       0.99      0.99      0.99      0.99      0.99      0.99     19384



In [105]:
from imblearn.combine import SMOTEENN

sm = SMOTEENN(random_state=1)
X_resampled, y_resampled = sm.fit_resample(X_train_scaled, y_train)
Counter(y_resampled)

Counter({'loan_status': 1})

In [106]:
smoteenn_model = LogisticRegression(solver='lbfgs', random_state=1)
smoteenn_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [107]:
predictions = smoteenn_model.predict(X_test_scaled)

balanced_accuracy_score(y_test, predictions)

0.9934649587814939

In [108]:
confusion_matrix(y_test, predictions)

array([[  615,     4],
       [  124, 18641]])

In [109]:
print(classification_report_imbalanced(y_test, predictions))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.83      0.99      0.99      0.91      0.99      0.99       619
   low_risk       1.00      0.99      0.99      1.00      0.99      0.99     18765

avg / total       0.99      0.99      0.99      0.99      0.99      0.99     19384



In [None]:
answer # 1 SMOTE Oversampling

answer # 2 over and under sampling

answer # 3 oversampling