# Credit Risk Ensemble Techniques

In [19]:
import warnings
# Suppress every warning category for the remainder of the kernel session.
# NOTE(review): this blanket filter also hides library warnings raised by later
# cells (e.g. while fitting models) — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [20]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [21]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Read the CSV and Perform Basic Data Cleaning

In [22]:
# Load the lending data from CSV (the path is relative to the working directory)
file_path = Path('Resources/lending_data.csv')
df = pd.read_csv(file_path)

# Preview the first five rows
df.head()

Unnamed: 0,loan_size,interest_rate,homeowner,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,own,52800,0.431818,5,1,22800,low_risk
1,8400.0,6.692,own,43600,0.311927,3,0,13600,low_risk
2,9000.0,6.963,rent,46100,0.349241,3,0,16100,low_risk
3,10700.0,7.664,own,52700,0.43074,5,1,22700,low_risk
4,10800.0,7.698,mortgage,53000,0.433962,5,1,23000,low_risk


In [23]:
# One-hot encode the categorical "homeowner" column: one indicator column per category.
# dtype=int pins the indicators to 0/1 integers. pandas >= 2.0 defaults to bool, which
# numeric-only summaries such as DataFrame.describe() leave out.
df_binary_encoded = pd.get_dummies(df, columns=["homeowner"], dtype=int)
df_binary_encoded.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status,homeowner_mortgage,homeowner_own,homeowner_rent
0,10700.0,7.672,52800,0.431818,5,1,22800,low_risk,0,1,0
1,8400.0,6.692,43600,0.311927,3,0,13600,low_risk,0,1,0
2,9000.0,6.963,46100,0.349241,3,0,16100,low_risk,0,0,1
3,10700.0,7.664,52700,0.43074,5,1,22700,low_risk,0,1,0
4,10800.0,7.698,53000,0.433962,5,1,23000,low_risk,1,0,0


# Split the Data into Training and Testing

In [24]:
# Feature matrix: every encoded column except the label
X = df_binary_encoded.drop(columns=["loan_status"])

# Target: the loan_status label, kept as a single-column DataFrame
y = df.loc[:, ["loan_status"]]

In [25]:
# Summary statistics of the feature matrix
X.describe()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,homeowner_mortgage,homeowner_own,homeowner_rent
count,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0
mean,9805.562577,7.292333,49221.949804,0.377318,3.82661,0.392308,19221.949804,0.497472,0.398911,0.103616
std,2093.223153,0.889495,8371.635077,0.081519,1.904426,0.582086,8371.635077,0.499997,0.489678,0.304764
min,5000.0,5.25,30000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,8700.0,6.825,44800.0,0.330357,3.0,0.0,14800.0,0.0,0.0,0.0
50%,9500.0,7.172,48100.0,0.376299,4.0,0.0,18100.0,0.0,0.0,0.0
75%,10400.0,7.528,51400.0,0.416342,4.0,1.0,21400.0,1.0,1.0,0.0
max,23800.0,13.235,105200.0,0.714829,16.0,3.0,75200.0,1.0,1.0,1.0


In [26]:
# Count the rows in each loan_status class to see how imbalanced the target is
y['loan_status'].value_counts()

low_risk     75036
high_risk     2500
Name: loan_status, dtype: int64

In [27]:
# Split X and y into training and testing sets.
# random_state=1 fixes the shuffle so the split is reproducible across runs.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
# Class distribution of the training target produced by the default (random) split.
# The label column is indexed explicitly: y_train is a DataFrame, and iterating a
# DataFrame yields its column labels, so Counter(y_train) would only report
# Counter({'loan_status': 1}) instead of the per-class counts.
Counter(y_train['loan_status'])

# Ensemble Learners

In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. You will train a Balanced Random Forest Classifier and an Easy Ensemble AdaBoost classifier. For each algorithm, be sure to complete the following steps:

1. Train the model using the training data. 
2. Calculate the balanced accuracy score from sklearn.metrics.
3. Print the confusion matrix from sklearn.metrics.
4. Generate a classification report using `classification_report_imbalanced` from imbalanced-learn.
5. For the Balanced Random Forest Classifier only, print the feature importances sorted in descending order (most important feature to least important) along with each feature's score.

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Balanced Random Forest Classifier

In [28]:
# Train the BalancedRandomForestClassifier on the training data
from imblearn.ensemble import BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
# Flatten the target to 1-D before fitting: the estimator expects y of shape
# (n_samples,), and passing a single-column frame raises a DataConversionWarning.
brf_model = brf.fit(X_train, y_train.values.ravel())

In [29]:
# Predict on the test set and calculate the balanced accuracy score
# (the printed label says "Accuracy Score", but the metric is balanced accuracy)
y_pred = brf_model.predict(X_test)
acc_score = balanced_accuracy_score(y_test, y_pred)
print(f'Accuracy Score (Balanced Random Forest Classifier): {acc_score}')

Accuracy Score (Balanced Random Forest Classifier): 0.9927988250218349


In [30]:
# Display the confusion matrix for the Balanced Random Forest predictions
cm = confusion_matrix(y_test, y_pred)
cm

array([[  615,     4],
       [  149, 18616]], dtype=int64)

In [31]:
# Print the imbalanced classification report for the Balanced Random Forest predictions
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.80      0.99      0.99      0.89      0.99      0.99       619
   low_risk       1.00      0.99      0.99      1.00      0.99      0.99     18765

avg / total       0.99      0.99      0.99      0.99      0.99      0.99     19384



In [32]:
# Pair each importance score with its feature name, then rank from most to least important
feature_importances = brf_model.feature_importances_
sort_feature_importances = sorted(
    zip(feature_importances, X.columns),
    reverse=True,
)
sort_feature_importances

[(0.21788480603139448, 'borrower_income'),
 (0.19595880072166166, 'interest_rate'),
 (0.1770964986362893, 'debt_to_income'),
 (0.13872579131432708, 'loan_size'),
 (0.12149751059258558, 'total_debt'),
 (0.11796397199368643, 'num_of_accounts'),
 (0.028820980671490287, 'derogatory_marks'),
 (0.0008975938107589897, 'homeowner_own'),
 (0.0008326989735687767, 'homeowner_mortgage'),
 (0.00032134725423733495, 'homeowner_rent')]

### Easy Ensemble Classifier

In [33]:
# Train the EasyEnsembleClassifier on the training data
from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(random_state=1, n_estimators=100)
# Flatten the target to 1-D before fitting: the estimator expects y of shape
# (n_samples,), and passing a single-column frame raises a DataConversionWarning.
eec_model = eec.fit(X_train, y_train.values.ravel())

In [34]:
# Predict on the test set and calculate the balanced accuracy score
# (the printed label says "Accuracy Score", but the metric is balanced accuracy)
eec_y_pred = eec_model.predict(X_test)
acc_score_eec = balanced_accuracy_score(y_test, eec_y_pred)
print(f'Accuracy Score (Easy Ensemble Classifier): {acc_score_eec}')

Accuracy Score (Easy Ensemble Classifier): 0.9931452145768576


In [35]:
# Display the confusion matrix for the Easy Ensemble predictions
# (reuses the name `cm`, replacing the matrix computed for the previous model)
cm = confusion_matrix(y_test, eec_y_pred)
cm

array([[  615,     4],
       [  136, 18629]], dtype=int64)

In [36]:
# Print the imbalanced classification report for the Easy Ensemble predictions
print(classification_report_imbalanced(y_test, eec_y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.82      0.99      0.99      0.90      0.99      0.99       619
   low_risk       1.00      0.99      0.99      1.00      0.99      0.99     18765

avg / total       0.99      0.99      0.99      0.99      0.99      0.99     19384

