# Ensemble Learning

## Initial Imports

In [2]:
import warnings
# Ignore every warning raised from here on, to keep cell outputs clean.
# NOTE(review): this also hides deprecation / future-behaviour warnings from
# the libraries used below; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [3]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [4]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import train_test_split #used below to split the data into training and testing sets

## Read the CSV and Perform Basic Data Cleaning

In [5]:
# Location of the loan statistics CSV, relative to the notebook
file_path = Path('Resources/LoanStats_2019Q1.csv')

# Read the whole file into a DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

# Show the last five rows as a quick sanity check of the load
df.tail(n=5)

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,dti,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
68812,10000.0,0.1502,346.76,RENT,26000.0,Source Verified,Jan-2019,low_risk,n,9.6,...,80.0,0.0,0.0,0.0,20625.0,6798.0,11300.0,5425.0,N,N
68813,12000.0,0.2727,368.37,RENT,63000.0,Not Verified,Jan-2019,low_risk,n,29.07,...,96.2,0.0,0.0,0.0,87939.0,60350.0,13500.0,62939.0,N,N
68814,5000.0,0.1992,185.62,MORTGAGE,52000.0,Source Verified,Jan-2019,low_risk,n,14.86,...,100.0,0.0,1.0,0.0,30592.0,18611.0,3600.0,18492.0,N,N
68815,40000.0,0.0646,1225.24,MORTGAGE,520000.0,Verified,Jan-2019,low_risk,n,9.96,...,98.2,12.5,0.0,0.0,1033574.0,95958.0,100800.0,78634.0,N,N
68816,16000.0,0.1131,350.36,MORTGAGE,72000.0,Verified,Jan-2019,low_risk,n,7.02,...,94.3,0.0,1.0,0.0,251486.0,74835.0,23000.0,63090.0,N,N


## Split the Data into Training and Testing

In [6]:
# Features: every column except loan_status (the inputs used to predict)
X = df.drop("loan_status", axis=1)

# Target: the loan_status column (the value we want to predict)
y = df.loc[:, "loan_status"]

In [7]:
# One-hot encode the categorical (text) columns into dummy/indicator columns
# so that every feature is numeric, which the scaling step further down needs.
X = pd.get_dummies(data=X)

In [8]:
# Size of the encoded feature matrix as (rows, columns).
# A notebook cell only renders its last expression, so the bare `X.describe()`
# that used to precede this line was computed and then thrown away; put
# summary statistics in their own cell if they need to be shown.
X.shape

(68817, 95)

In [9]:
# Class balance of the target, most frequent class first:
# low_risk is the majority class and high_risk is the minority class
y.value_counts(sort=True, ascending=False)

low_risk     68470
high_risk      347
Name: loan_status, dtype: int64

In [10]:
# Split X and y into X_train, X_test, y_train, y_test.
# test_size=0.25 is the value train_test_split falls back to when no size is
# given, so this spells out the existing 75% / 25% split rather than changing it.
# stratify=y keeps the low_risk / high_risk proportions the same in both sets,
# which matters because high_risk is only a tiny fraction of the rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,               # encoded features created above
    y,               # loan_status target created above
    test_size=0.25,
    random_state=1,  # fixed seed so the split is reproducible
    stratify=y,
)

# The models are fit on the training set and evaluated on the held-out testing
# set; if predictions on the held-out data are accurate, the model can be used
# to predict new data.

X_train.shape  # 51612 of the 68817 rows, i.e. about 75% of the data

(51612, 95)

## Data Pre-Processing

Scale the training and testing data using the `StandardScaler` from `sklearn`. Remember that when scaling the data, you only scale the features data (`X_train` and `X_test`).

In [11]:
from sklearn.preprocessing import StandardScaler

In [12]:
# Create the StandardScaler instance.
# Scaling puts all features on a comparable scale so that no single column
# carries too much weight simply because of its units.
# with_mean / with_std are written out here at their default values.
ds = StandardScaler(with_mean=True, with_std=True)

In [13]:
# Learn the scaling statistics from the training split only, so that nothing
# about the testing split leaks into pre-processing.
# fit() hands back the fitted scaler, which is kept as X_scaler.
X_scaler = ds.fit(X=X_train)


In [14]:
# Apply the scaler fitted on the training split to both splits, in order
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

## Ensemble Learners

In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. You will train a Balanced Random Forest Classifier and an Easy Ensemble Classifier. For each algorithm, be sure to complete the following steps:

1. Train the model using the training data. 
2. Calculate the balanced accuracy score from sklearn.metrics.
3. Display the confusion matrix from sklearn.metrics.
4. Generate a classification report using `classification_report_imbalanced` from imbalanced-learn.
5. For the Balanced Random Forest Classifier only, print the feature importance sorted in descending order (most important feature to least important) along with the feature score

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Balanced Random Forest Classifier

In [17]:
# Train a BalancedRandomForestClassifier on the scaled training data
from imblearn.ensemble import BalancedRandomForestClassifier

# The assignment asks for random_state=1 for every algorithm and for
# 100 estimators in both ensemble models.
# NOTE(review): with warnings silenced at the top of the notebook, any notice
# about changed estimator defaults in a newer imbalanced-learn release would be
# hidden; confirm the installed version if the scores below do not reproduce.
balanced_rf = BalancedRandomForestClassifier(
    random_state=1,
    n_estimators=100,
)
balanced_rf.fit(X=X_train_scaled, y=y_train)

BalancedRandomForestClassifier(random_state=1)

In [18]:
# Predict on the scaled testing data, then calculate the balanced accuracy score
y_pred_balanced_rf = balanced_rf.predict(X_test_scaled)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred_balanced_rf)

0.7871246640962729

In [19]:
# Display the confusion matrix.
# Rows are the actual classes and columns the predicted classes, with labels in
# sorted order (high_risk first, then low_risk).
# confusion_matrix is already imported in the imports cell at the top of the
# notebook, so it is not imported again here.
confusion_matrix(y_test, y_pred_balanced_rf)

array([[   58,    29],
       [ 1582, 15536]])

In [20]:
# Build the imbalanced classification report (precision, recall, specificity,
# F1, geometric mean, IBA and support per class), then print it
report_balanced_rf = classification_report_imbalanced(
    y_test,
    y_pred_balanced_rf,
    digits=3,
)
print(report_balanced_rf)

                   pre       rec       spe        f1       geo       iba       sup

  high_risk      0.035     0.667     0.908     0.067     0.778     0.590        87
   low_risk      0.998     0.908     0.667     0.951     0.778     0.620     17118

avg / total      0.993     0.906     0.668     0.946     0.778     0.619     17205



In [29]:
# List the features sorted in descending order by feature importance.
# feature_importances_ is an attribute of the fitted forest holding one score
# per input column, in the same column order as X_train.
importance = balanced_rf.feature_importances_

# Put the scores in a one-column DataFrame indexed by feature name so they are
# easy to read, and sort from most to least important.
importance_df = (
    pd.DataFrame({'importance': importance}, index=X_train.columns)
    .sort_values('importance', ascending=False)
)

# Show the ten most important features
importance_df.head(n=10)

Unnamed: 0,importance
total_rec_prncp,0.073767
total_rec_int,0.063903
total_pymnt_inv,0.060733
total_pymnt,0.058112
last_pymnt_amnt,0.049518
int_rate,0.024581
out_prncp,0.020399
dti,0.018626
max_bal_bc,0.018379
issue_d_Jan-2019,0.01748


### Easy Ensemble Classifier

In [31]:
from imblearn.ensemble import EasyEnsembleClassifier

In [34]:
# Train the EasyEnsembleClassifier on the scaled training data, using the same
# settings as the other model: random_state=1 and 100 estimators
ez = EasyEnsembleClassifier(
    random_state=1,
    n_estimators=100,
)
ez.fit(X=X_train_scaled, y=y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [35]:
# Predict on the scaled testing data, then calculate the balanced accuracy score
y_pred_ez = ez.predict(X_test_scaled)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred_ez)

0.9254565671948463

In [36]:
# Confusion matrix for the Easy Ensemble predictions
# (rows = actual class, columns = predicted class)
confusion_matrix(y_true=y_test, y_pred=y_pred_ez)

array([[   79,     8],
       [  978, 16140]])

In [37]:
# Build the imbalanced classification report for the Easy Ensemble
# predictions, then print it
report_ez = classification_report_imbalanced(
    y_test,
    y_pred_ez,
    digits=3,
)
print(report_ez)

                   pre       rec       spe        f1       geo       iba       sup

  high_risk      0.075     0.908     0.943     0.138     0.925     0.853        87
   low_risk      1.000     0.943     0.908     0.970     0.925     0.859     17118

avg / total      0.995     0.943     0.908     0.966     0.925     0.859     17205



*Balanced accuracy*:
Balanced Random Forest: 0.7871246640962729

Easy Ensemble Classifier: 0.9254565671948463

*Recall* (avg / total, with recall on the `high_risk` class in parentheses):
Balanced Random Forest: 0.906 (0.667)

Easy Ensemble Classifier: 0.943 (0.908)

*Geometric mean*:
Balanced Random Forest: 0.778

Easy Ensemble Classifier: 0.925

### Final Questions

1. Which model had the best balanced accuracy score?

   *Easy Ensemble Classifier* had the best balanced accuracy score, with a score of 0.925.
   
2. Which model had the best recall score?

    *Easy Ensemble Classifier* had the best recall score, with an avg / total recall of 0.943 (and 0.908 versus 0.667 on the `high_risk` class).

3. Which model had the best geometric mean score?

    *Easy Ensemble Classifier* had the best geometric mean score, with a score of 0.925.

4. What are the top three features?

    Top 3 features were (values in dataframe above)
    1. total_rec_prncp
    2. total_rec_int
    3. total_pymnt_inv