In [1]:
!pip3 install imblearn
import warnings
warnings.filterwarnings('ignore')

Collecting imblearn
  Using cached imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Using cached imbalanced_learn-0.8.0-py3-none-any.whl (206 kB)
Collecting scikit-learn>=0.24
  Downloading scikit_learn-0.24.1-cp38-cp38-win_amd64.whl (6.9 MB)
Installing collected packages: scikit-learn, imbalanced-learn, imblearn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.23.1
    Uninstalling scikit-learn-0.23.1:
      Successfully uninstalled scikit-learn-0.23.1
Successfully installed imbalanced-learn-0.8.0 imblearn-0.0 scikit-learn-0.24.1


In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Read the CSV and Perform Basic Data Cleaning

In [15]:
columns = [
    "loan_size", "interest_rate", "homeowner", "borrower_income","debt_to_income", "num_of_accounts", "derogatory_marks","total_debt", "loan_status"
]
target = ["loan_status"]

In [20]:
# Load the data
file_path = Path('lending_data.csv')
df = pd.read_csv(file_path)
df = df.loc[:, columns].copy()

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

# Remove the `Issued` loan status
issued_mask = df['loan_status'] != 'Issued'
df = df.loc[issued_mask]

# convert interest rate to numerical
# df['interest_rate'] = df['interest_rate'].str.replace('%', '')
df['interest_rate'] = df['interest_rate'].astype('float') / 100


# Convert the target column values to low_risk and high_risk based on their values
x = {'Current': 'low_risk'}   
df = df.replace(x)

x = dict.fromkeys(['Late (31-120 days)', 'Late (16-30 days)', 'Default', 'In Grace Period'], 'high_risk')    
df = df.replace(x)

df.reset_index(inplace=True, drop=True)

df.head()

Unnamed: 0,loan_size,interest_rate,homeowner,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,0.07672,own,52800,0.431818,5,1,22800,low_risk
1,8400.0,0.06692,own,43600,0.311927,3,0,13600,low_risk
2,9000.0,0.06963,rent,46100,0.349241,3,0,16100,low_risk
3,10700.0,0.07664,own,52700,0.43074,5,1,22700,low_risk
4,10800.0,0.07698,mortgage,53000,0.433962,5,1,23000,low_risk


# Split the Data into Training and Testing

In [31]:
# Create our features
X_df = df.drop(columns=['loan_status'])
X = pd.get_dummies(X_df, columns=["loan_size", "homeowner","debt_to_income", "derogatory_marks","total_debt"], drop_first = True)

# Create our target
y = df['loan_status'].to_frame()

In [32]:
X.describe()

Unnamed: 0,interest_rate,borrower_income,num_of_accounts,loan_size_5100.0,loan_size_5200.0,loan_size_5300.0,loan_size_5400.0,loan_size_5500.0,loan_size_5600.0,loan_size_5700.0,...,total_debt_70000,total_debt_70100,total_debt_70200,total_debt_71100,total_debt_71600,total_debt_72300,total_debt_72400,total_debt_73500,total_debt_74100,total_debt_75200
count,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,...,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0
mean,0.072923,49221.949804,3.82661,1.3e-05,5.2e-05,2.6e-05,6.4e-05,9e-05,0.000116,0.000155,...,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05
std,0.008895,8371.635077,1.904426,0.003591,0.007182,0.005079,0.00803,0.009501,0.010773,0.01244,...,0.003591,0.003591,0.003591,0.003591,0.003591,0.003591,0.003591,0.003591,0.003591,0.003591
min,0.0525,30000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.06825,44800.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.07172,48100.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.07528,51400.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.13235,105200.0,16.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [33]:
# Check the balance of our target values
y['loan_status'].value_counts()

low_risk     75036
high_risk     2500
Name: loan_status, dtype: int64

In [34]:
# Split the X and y into X_train, X_test, y_train, y_test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

# Ensemble Learners

In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. You will train a Balanced Random Forest Classifier and an Easy Ensemble AdaBoost classifier . For each algorithm, be sure to complete the folliowing steps:

1. Train the model using the training data. 
2. Calculate the balanced accuracy score from sklearn.metrics.
3. Print the confusion matrix from sklearn.metrics.
4. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.
5. For the Balanced Random Forest Classifier onely, print the feature importance sorted in descending order (most important feature to least important) along with the feature score

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Balanced Random Forest Classifier

In [35]:
# Scale data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Resample the training data with the RandomOversampler
from imblearn.ensemble import BalancedRandomForestClassifier

brfc = BalancedRandomForestClassifier(n_estimators =50, random_state=1)
model = brfc.fit(X_train_scaled, y_train)
BalancedRandomForestClassifier()

BalancedRandomForestClassifier()

In [36]:
# Calculate the balanced accuracy score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

predictions = model.predict(X_test_scaled)
accuracy_score(y_test, predictions)

0.9917457697069748

In [37]:
# Display the confusion matrix
confusion_matrix(y_test, predictions)

array([[  622,     3],
       [  157, 18602]], dtype=int64)

In [38]:
# Print the imbalanced classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

   high_risk       0.80      1.00      0.89       625
    low_risk       1.00      0.99      1.00     18759

    accuracy                           0.99     19384
   macro avg       0.90      0.99      0.94     19384
weighted avg       0.99      0.99      0.99     19384



In [39]:
# List the features sorted in descending order by feature importance
importances = model.feature_importances_

sorted(zip(model.feature_importances_, X.columns), reverse=True)[:20]

[(0.2355069160782815, 'interest_rate'),
 (0.22656126743798924, 'num_of_accounts'),
 (0.1748285866037994, 'borrower_income'),
 (0.16196617305220914, 'derogatory_marks_2'),
 (0.04198542061938665, 'derogatory_marks_1'),
 (0.020968150177063216, 'derogatory_marks_3'),
 (0.004634106548323383, 'loan_size_9600.0'),
 (0.004354748239117493, 'loan_size_9200.0'),
 (0.0036192012732902267, 'loan_size_9900.0'),
 (0.0034353575259293014, 'loan_size_9700.0'),
 (0.003428173228164544, 'loan_size_9800.0'),
 (0.0031288748510577683, 'loan_size_17900.0'),
 (0.0030928239284912407, 'loan_size_8600.0'),
 (0.0030918624892682855, 'loan_size_18500.0'),
 (0.0030295300854624365, 'loan_size_8500.0'),
 (0.002877948414084972, 'loan_size_9400.0'),
 (0.0027657157255228425, 'loan_size_8700.0'),
 (0.0027583915516413552, 'loan_size_9500.0'),
 (0.002606810240479173, 'loan_size_8900.0'),
 (0.0025840086110052907, 'loan_size_9300.0')]

### Easy Ensemble AdaBoost Classifier

In [None]:
# Train the Classifier
from imblearn.ensemble import EasyEnsembleClassifier

model = EasyEnsembleClassifier(base_estimator=None, n_estimators=10, n_jobs=1, random_state=1, 
                                   replacement=False, sampling_strategy='auto', verbose=0, 
                                   warm_start=False)

model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

EasyEnsembleClassifier()

In [None]:
# Calculated the balanced accuracy score
predictions = model.predict(X_test_scaled)
accuracy_score(y_test, predictions)

In [None]:
# Display the confusion matrix
confusion_matrix(y_test, predictions)

In [None]:
# Print the imbalanced classification report
print(classification_report(y_test, predictions))