## Risky Business 
#### The following code will use the imbalanced learn library to build and evaluate logistic regression classifiers using the resampled data to assess credit risk.

In [1]:
#Initial imports
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

### Read the CSV into dataframe 

In [2]:
#Load the data
df = pd.read_csv('Homework_11-Machine-Learning_Instructions_Starter_Code_Resources_lending_data.csv')
df.head()

Unnamed: 0,loan_size,interest_rate,homeowner,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700,7.672,own,52800,0.431818,5,1,22800,low_risk
1,8400,6.692,own,43600,0.311927,3,0,13600,low_risk
2,9000,6.963,rent,46100,0.349241,3,0,16100,low_risk
3,10700,7.664,own,52700,0.43074,5,1,22700,low_risk
4,10800,7.698,mortgage,53000,0.433962,5,1,23000,low_risk


In [3]:
# Create the LabelEncoder instance
le = LabelEncoder()

In [4]:
# Fitting and encoding the columns with the LabelEncoder

# homeowner column
le.fit(df["homeowner"])
df["homeowner"] = le.transform(df["homeowner"])

# Encoding loan_status column
le.fit(df["loan_status"])
df["loan_status"] = le.transform(df["loan_status"])

df.head()

Unnamed: 0,loan_size,interest_rate,homeowner,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700,7.672,1,52800,0.431818,5,1,22800,1
1,8400,6.692,1,43600,0.311927,3,0,13600,1
2,9000,6.963,2,46100,0.349241,3,0,16100,1
3,10700,7.664,1,52700,0.43074,5,1,22700,1
4,10800,7.698,0,53000,0.433962,5,1,23000,1


### Split the Data into Training and Testing


In [5]:
# Create our features
X = df.copy()
X.drop(["loan_status"], axis=1, inplace=True)
X.head()

Unnamed: 0,loan_size,interest_rate,homeowner,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700,7.672,1,52800,0.431818,5,1,22800
1,8400,6.692,1,43600,0.311927,3,0,13600
2,9000,6.963,2,46100,0.349241,3,0,16100
3,10700,7.664,1,52700,0.43074,5,1,22700
4,10800,7.698,0,53000,0.433962,5,1,23000


In [6]:
df["loan_status"].values

array([1, 1, 1, ..., 0, 0, 0])

In [7]:
# Define target vector
y = df["loan_status"].values.reshape(-1, 1)
y[:5]

array([[1],
       [1],
       [1],
       [1],
       [1]])

In [8]:
X.describe()

Unnamed: 0,loan_size,interest_rate,homeowner,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
count,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0
mean,9805.562577,7.292333,0.606144,49221.949804,0.377318,3.82661,0.392308,19221.949804
std,2093.223153,0.889495,0.667811,8371.635077,0.081519,1.904426,0.582086,8371.635077
min,5000.0,5.25,0.0,30000.0,0.0,0.0,0.0,0.0
25%,8700.0,6.825,0.0,44800.0,0.330357,3.0,0.0,14800.0
50%,9500.0,7.172,1.0,48100.0,0.376299,4.0,0.0,18100.0
75%,10400.0,7.528,1.0,51400.0,0.416342,4.0,1.0,21400.0
max,23800.0,13.235,2.0,105200.0,0.714829,16.0,3.0,75200.0


In [9]:
#Check the balance of our target values 
df["loan_status"].value_counts()

1    75036
0     2500
Name: loan_status, dtype: int64

In [10]:
# Create X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

#### Data Pre-Processing
##### Scale the training and testing data using the StandardScaler from sklearn. Remember that when scaling the data, you only scale the features data (X_train and X_testing).

In [11]:
# Create the StandardScaler instance
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [12]:
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

In [13]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Simple Logistic Regression


In [14]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_train, y_train)

LogisticRegression(random_state=1)

In [15]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.9481182566723452

In [16]:
 # Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[  541,    59],
       [  102, 18682]])

In [17]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.84      0.90      0.99      0.87      0.95      0.89       600
          1       1.00      0.99      0.90      1.00      0.95      0.91     18784

avg / total       0.99      0.99      0.90      0.99      0.95      0.90     19384



### Oversampling Algorithms 
#### Now we will compare two oversampling algorithms to determine which algorithm results in the best performance. We will oversample the data using the naive random oversampling algorithm and the SMOTE algorithm.
View the count of the target classes using Counter from the collections library. Use the resampled data to train a logistic regression model.Calculate the balanced accuracy score from sklearn.metrics.Print the confusion matrix from sklearn.metrics.Generate a classication report using the imbalanced_classification_report from imbalanced-learn.
Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests

#### 1. Naive Random Oversampling


In [18]:
#Imports
from collections import Counter
from imblearn.over_sampling import RandomOverSampler

In [19]:
Counter(y_train.reshape(-1))

Counter({1: 56252, 0: 1900})

In [20]:
# implement random oversampling
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({1: 56252, 0: 56252})

In [21]:
# Train the logistic regression model using the resampled data 
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [22]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=1, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

LogisticRegression(multi_class='warn', random_state=1)

In [23]:
# Calculate the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

0.9481182566723452

In [24]:
 # Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[  597,     3],
       [  111, 18673]])

In [25]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.84      0.99      0.99      0.91      0.99      0.99       600
          1       1.00      0.99      0.99      1.00      0.99      0.99     18784

avg / total       0.99      0.99      0.99      0.99      0.99      0.99     19384



#### 2. SMOTE Oversampling


In [26]:
#Resample the training data using SMOTE and view target classes with the Counter 
from imblearn.over_sampling import SMOTE

X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy=1.0).fit_resample(
    X_train, y_train
)
from collections import Counter

Counter(y_resampled)

Counter({1: 56252, 0: 56252})

In [27]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [28]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=1, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

LogisticRegression(multi_class='warn', random_state=1)

In [29]:
# Calculate the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.9945453577512777

In [30]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[  597,     3],
       [  111, 18673]])

In [31]:
 # Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.84      0.99      0.99      0.91      0.99      0.99       600
          1       1.00      0.99      0.99      1.00      0.99      0.99     18784

avg / total       0.99      0.99      0.99      0.99      0.99      0.99     19384



### Undersampling
#### Now we will test an undersampling algorithm to determine which algorithm results in the best performance compared to the oversampling algorithms above. We will undersample the data using the Cluster Centroids algorithm.

In [32]:
# Resample the data using the ClusterCentroids resampler

from imblearn.under_sampling import ClusterCentroids

cc = ClusterCentroids (random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# View the count of target classes with Counter
Counter(y_resampled)

Counter({1: 56252, 0: 56252})

In [33]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [34]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=1, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

LogisticRegression(multi_class='warn', random_state=1)

In [35]:
#Calculate the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)


0.9945453577512777

In [36]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[  597,     3],
       [  111, 18673]])

In [37]:
 # Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.84      0.99      0.99      0.91      0.99      0.99       600
          1       1.00      0.99      0.99      1.00      0.99      0.99     18784

avg / total       0.99      0.99      0.99      0.99      0.99      0.99     19384



### Combination (Over and Under) Sampling
#### Now we will test a combination over- and under-sampling algorithm to determine if the algorithm results in the best performance compared to the other sampling algorithms above. We will resample the data using the SMOTEENN algorithm.

In [38]:
 # Resample the training data with SMOTEENN
from imblearn.combine import SMOTEENN

sm = SMOTEENN(random_state=1)
X_resampled, y_resampled = sm.fit_resample(X_train, y_train)

# View the count of target classes with Counter
Counter(y_resampled)

Counter({0: 55547, 1: 55861})

In [39]:
 # Logistic regression using random combination sampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [40]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=1, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

LogisticRegression(multi_class='warn', random_state=1)

In [41]:
#Calculate the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)


0.9945453577512777

In [42]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[  598,     2],
       [  124, 18660]])

In [43]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.83      1.00      0.99      0.90      1.00      0.99       600
          1       1.00      0.99      1.00      1.00      1.00      0.99     18784

avg / total       0.99      0.99      1.00      0.99      1.00      0.99     19384



### Final Questions

#### Which model had the best balanced accuracy score?

The combination over- and under-sampling algorithm resulted in the best balanced accuracy score = 99.5%.

#### Which model had the best recall score? 

The combination over- and under-sampling algorithm resulted in the best recall score = 100%.

#### Which model had the best geometric mean score? 

The combination over- and under-sampling algorithm resulted in the best geometric mean score = 100%.

### Part 2 - Ensemble Learning 

#### The following code will train and compare two different ensemble classifiers (Balanced Random Forest Classifier and Easy Ensemble Classifier) to evaluate which model is better for predicting loan risk. 

In [44]:
#Initial imports
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter  
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
import warnings
warnings.filterwarnings('ignore')


#### Read CSV and Perform Basic Data Cleaning 

In [45]:
df = pd.read_csv('Homework_11-Machine-Learning_Instructions_Starter_Code_Resources_LoanStats_2019Q1.csv')
df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,dti,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,10500,0.1719,375.35,RENT,66000.0,Source Verified,Mar-19,low_risk,n,27.24,...,85.7,100.0,0,0,65687,38199,2000,61987,N,N
1,25000,0.2,929.09,MORTGAGE,105000.0,Verified,Mar-19,low_risk,n,20.23,...,91.2,50.0,1,0,271427,60641,41200,49197,N,N
2,20000,0.2,529.88,MORTGAGE,56000.0,Verified,Mar-19,low_risk,n,24.26,...,66.7,50.0,0,0,60644,45684,7500,43144,N,N
3,10000,0.164,353.55,RENT,92000.0,Verified,Mar-19,low_risk,n,31.44,...,100.0,50.0,1,0,99506,68784,19700,76506,N,N
4,22000,0.1474,520.39,MORTGAGE,52000.0,Not Verified,Mar-19,low_risk,n,18.76,...,100.0,0.0,0,0,219750,25919,27600,20000,N,N


In [46]:
# Encoding loan_status column
le.fit(df["loan_status"])
df["loan_status"] = le.transform(df["loan_status"])

df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,dti,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,10500,0.1719,375.35,RENT,66000.0,Source Verified,Mar-19,1,n,27.24,...,85.7,100.0,0,0,65687,38199,2000,61987,N,N
1,25000,0.2,929.09,MORTGAGE,105000.0,Verified,Mar-19,1,n,20.23,...,91.2,50.0,1,0,271427,60641,41200,49197,N,N
2,20000,0.2,529.88,MORTGAGE,56000.0,Verified,Mar-19,1,n,24.26,...,66.7,50.0,0,0,60644,45684,7500,43144,N,N
3,10000,0.164,353.55,RENT,92000.0,Verified,Mar-19,1,n,31.44,...,100.0,50.0,1,0,99506,68784,19700,76506,N,N
4,22000,0.1474,520.39,MORTGAGE,52000.0,Not Verified,Mar-19,1,n,18.76,...,100.0,0.0,0,0,219750,25919,27600,20000,N,N


#### Split the Data into Training and Testing 

In [47]:
# Create our features
X = df.copy()
X.drop(["loan_status"], axis=1, inplace=True)
X.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,issue_d,pymnt_plan,dti,delinq_2yrs,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,10500,0.1719,375.35,RENT,66000.0,Source Verified,Mar-19,n,27.24,0,...,85.7,100.0,0,0,65687,38199,2000,61987,N,N
1,25000,0.2,929.09,MORTGAGE,105000.0,Verified,Mar-19,n,20.23,0,...,91.2,50.0,1,0,271427,60641,41200,49197,N,N
2,20000,0.2,529.88,MORTGAGE,56000.0,Verified,Mar-19,n,24.26,0,...,66.7,50.0,0,0,60644,45684,7500,43144,N,N
3,10000,0.164,353.55,RENT,92000.0,Verified,Mar-19,n,31.44,0,...,100.0,50.0,1,0,99506,68784,19700,76506,N,N
4,22000,0.1474,520.39,MORTGAGE,52000.0,Not Verified,Mar-19,n,18.76,0,...,100.0,0.0,0,0,219750,25919,27600,20000,N,N


In [48]:
# Create our target
y = df["loan_status"].values.reshape(-1, 1)
y[:5]

array([[1],
       [1],
       [1],
       [1],
       [1]])

In [71]:
# Encode our data from strings to integers 
df = pd.get_dummies(df)
df

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,loan_status,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,...,issue_d_Mar-19,pymnt_plan_n,initial_list_status_f,initial_list_status_w,next_pymnt_d_Apr-19,next_pymnt_d_May-19,application_type_Individual,application_type_Joint App,hardship_flag_N,debt_settlement_flag_N
0,10500,0.1719,375.35,66000.0,1,27.24,0,0,8,0,...,1,1,0,1,0,1,1,0,1,1
1,25000,0.2000,929.09,105000.0,1,20.23,0,0,17,1,...,1,1,0,1,0,1,1,0,1,1
2,20000,0.2000,529.88,56000.0,1,24.26,0,0,8,0,...,1,1,0,1,0,1,1,0,1,1
3,10000,0.1640,353.55,92000.0,1,31.44,0,1,10,1,...,1,1,0,1,0,1,1,0,1,1
4,22000,0.1474,520.39,52000.0,1,18.76,0,1,14,0,...,1,1,0,1,0,1,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68812,10000,0.1502,346.76,26000.0,1,9.60,0,0,9,0,...,0,1,0,1,0,1,1,0,1,1
68813,12000,0.2727,368.37,63000.0,1,29.07,0,0,8,0,...,0,1,0,1,0,1,1,0,1,1
68814,5000,0.1992,185.62,52000.0,1,14.86,0,0,5,1,...,0,1,0,1,0,1,1,0,1,1
68815,40000,0.0646,1225.24,520000.0,1,9.96,0,1,21,0,...,0,1,1,0,0,1,1,0,1,1


In [50]:
X.describe()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
count,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,...,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0
mean,16677.594562,0.127718,480.652863,88213.71,21.778153,0.217766,0.497697,12.58734,0.12603,17604.142828,...,0.052138,2.219423,95.057627,30.626217,0.125972,0.0,210033.2,61338.43,29734.128558,55722.4
std,10277.34859,0.04813,288.062432,115580.0,20.199244,0.718367,0.758122,6.022869,0.336797,21835.8804,...,0.390633,1.897432,8.326426,33.631463,0.336732,0.0,192808.8,57387.98,26795.394232,50958.45
min,1000.0,0.06,30.89,40.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,20.0,0.0,0.0,0.0,3600.0,235.0,100.0,127.0
25%,9000.0,0.0881,265.73,50000.0,13.89,0.0,0.0,8.0,0.0,6293.0,...,0.0,1.0,93.0,0.0,0.0,0.0,66977.0,26503.0,11600.0,22880.0
50%,15000.0,0.118,404.56,73000.0,19.76,0.0,0.0,11.0,0.0,12068.0,...,0.0,2.0,100.0,20.0,0.0,0.0,146710.0,45357.0,22100.0,42000.0
75%,24000.0,0.1557,648.1,104000.0,26.66,0.0,1.0,16.0,0.0,21735.0,...,0.0,3.0,100.0,50.0,0.0,0.0,303640.0,76570.0,39300.0,72499.0
max,40000.0,0.3084,1676.23,8797500.0,999.0,18.0,5.0,72.0,4.0,587191.0,...,18.0,19.0,100.0,100.0,4.0,0.0,3292782.0,1295455.0,509400.0,1426964.0


In [51]:
#Check the balance of our target values 
df["loan_status"].value_counts()

1    68470
0      347
Name: loan_status, dtype: int64

In [52]:
#Split the X and y into X_train, X_test, y_train, y_test
# Create X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

#### Data Pre-Processing

Now we will scale the training and testing data using the StandardScaler from sklearn. We will only scale the features data (X_train and X_testing).

In [53]:
# Create the StandardScaler instance
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [72]:
#Encode our data using Dummies function
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

In [73]:
# Fitting Standard Scaler
X_scaler = scaler.fit(X_train)

In [74]:
 # Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

#### 1. Balanced Random Forest Classifier

In [57]:
#Import for Random Forest Classifer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [58]:
# Create a random forest classifier
rf_model = RandomForestClassifier(random_state=1)


In [59]:
 # Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [60]:
 # Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

#### Balanced Random Forest Classifier Model Evaluation

In [61]:
# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [62]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

In [63]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,29,63
Actual 1,19,17094


Accuracy Score : 0.9952339436210403
Classification Report
              precision    recall  f1-score   support

           0       0.60      0.32      0.41        92
           1       1.00      1.00      1.00     17113

    accuracy                           1.00     17205
   macro avg       0.80      0.66      0.71     17205
weighted avg       0.99      1.00      0.99     17205



### Feature Importance

In [64]:
 # Random Forests in sklearn will automatically calculate feature importance
importances = rf_model.feature_importances_

In [65]:
 # Sort the features by their importance in descending order
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.07779205667236354, 'initial_list_status'),
 (0.07514184844817616, 'total_acc'),
 (0.07470889191208643, 'revol_bal'),
 (0.06807620135291256, 'total_rec_prncp'),
 (0.06190606051942391, 'out_prncp'),
 (0.02095996701768496, 'open_acc'),
 (0.01983993003131137, 'pub_rec'),
 (0.019512439094364037, 'installment'),
 (0.017982517791614073, 'acc_open_past_24mths'),
 (0.016519993659629584, 'annual_inc'),
 (0.016336475803778783, 'out_prncp_inv'),
 (0.01581513151954527, 'inq_last_12m'),
 (0.014998056277100208, 'delinq_2yrs'),
 (0.014721047753039066, 'open_il_12m'),
 (0.014581849378945374, 'all_util'),
 (0.014169816183931263, 'home_ownership'),
 (0.014089903193929429, 'num_tl_30dpd'),
 (0.013564791243942838, 'last_pymnt_amnt'),
 (0.013446557717804345, 'tot_coll_amt'),
 (0.0132945630792556, 'num_tl_90g_dpd_24m'),
 (0.013197876154669065, 'loan_amnt'),
 (0.013149454090485485, 'max_bal_bc'),
 (0.012890509779087457, 'tot_cur_bal'),
 (0.012830948712967237, 'total_rev_hi_lim'),
 (0.012370012474721906, '

#### 2. Easy Ensemble Classifier

In [66]:
# Import easy ensemble classifier 
from imblearn.ensemble import EasyEnsembleClassifier

In [67]:
# Create and train the classifier
classifier = EasyEnsembleClassifier(random_state=1)

# Fit the model
classifier.fit(X_train_scaled, y_train.ravel())

# Make Prediction
predictions = classifier.predict(X_test_scaled)
pd.DataFrame({"Prediction": predictions, "Actual": y_test.ravel()}).head(5)

Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1


#### Easy Ensemble Classifier Model Evaluation

In [68]:
# Calculate the accuracy score
acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.9297878523684975


In [69]:
# Generate the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"]
)

# Displaying results
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,83,9
Actual 1,1199,15914


In [70]:
# Print classification report
print("Classification Report")
print(classification_report(y_test, predictions))

Classification Report
              precision    recall  f1-score   support

           0       0.06      0.90      0.12        92
           1       1.00      0.93      0.96     17113

    accuracy                           0.93     17205
   macro avg       0.53      0.92      0.54     17205
weighted avg       0.99      0.93      0.96     17205



### Part 2 - Final Questions:

#### Which model had the best balanced accuracy score? 

The Random Forest Classifier Model had the best balanced accuracy score of 99.51%. 

#### Which model had the best recall score? 

The Easy Ensemble Classifier had the best recall score.

#### Which model had the best geometric mean score? 

The Random Forest Classifier Model had the better geometric mean score of 0.43.

#### What are the top three features? 

The top three features are:
1. Last Payment Amount - 8.2%
2. Total Rec Int - 7.6%
3. Total Rec Prncip - 7.3%