## Lab 12. Ensemble Learning

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings("ignore")

## Build ensemble classifiers for Churn Modeling data

1. Import data, select features, apply encoding and/or scaling as appropriate, split into training and test sets 
2. Train and score a logistic regression model with default parameters. Add AUC to your scoring report
3. Train and score a random forest model. Compute AUC
4. List model features by their importance for the RF and LR models
5. Optimize the RF model using grid search. Does optimization improve the AUC?
6. Train and score an adaboost and gradient boost model with default hyperparaeters. Compute AUC
7. Find the winning algorithm
8. Plot and compare ROC curves for the LR, RF, Adaboost and Gradient Boost models.

### Solutions

In [2]:
## import and prepare data

df = pd.read_csv("../data/Churn_Modeling.csv")
y = df['Exited']
df = df.iloc[:, 2:-1]
df = pd.get_dummies(data = df, drop_first = True)
features = df.columns
X = df.to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)
sc = MinMaxScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [3]:
# baseline: logistic regression

lr = LogisticRegression()
lr.fit(X_train, y_train)

LogisticRegression()

In [4]:
def classifier_scoring(model):
    print(classification_report(y_test, model.predict(X_test))[:220])
    print("AUC:   ", "%.2f" % roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
    confusion_matrix(y_test, model.predict(X_test))

In [5]:
classifier_scoring(lr)

              precision    recall  f1-score   support

           0       0.83      0.96      0.89      2416
           1       0.56      0.19      0.29       584

    accuracy                           0.81      3000
  
AUC:    0.77


In [6]:
# random forest

rfc=RandomForestClassifier(random_state=42)
rfc.fit(X_train,y_train)

classifier_scoring(rfc)

              precision    recall  f1-score   support

           0       0.88      0.96      0.92      2416
           1       0.75      0.47      0.58       584

    accuracy                           0.87      3000
  
AUC:    0.85


In [7]:
## feature importance for RF

pd.Series(rfc.feature_importances_, index = features).sort_values(ascending = False)

Age                  0.241742
EstimatedSalary      0.145268
Balance              0.142440
CreditScore          0.141835
NumOfProducts        0.128864
Tenure               0.081797
IsActiveMember       0.040470
Geography_Germany    0.026532
HasCrCard            0.018946
Gender_Male          0.018300
Geography_Spain      0.013807
dtype: float64

In [8]:
## feature importance for LR

pd.Series(abs(lr.coef_[0]), index = features).sort_values(ascending = False)

Age                  5.014840
IsActiveMember       1.023919
Geography_Germany    0.768153
Balance              0.672898
Gender_Male          0.528219
CreditScore          0.380109
NumOfProducts        0.230739
Tenure               0.210481
Geography_Spain      0.086470
HasCrCard            0.065479
EstimatedSalary      0.014800
dtype: float64

In [9]:
## hyperparamater tuning for RFC (uncomment code below)

rfc=RandomForestClassifier(random_state=42)

parameters = {'n_estimators':[10, 100, 500, 1000],
              'max_depth':[1, 2, 3, 4]}


hyper_tune = GridSearchCV(rfc, parameters, cv = 5, scoring = 'f1', return_train_score = True, verbose = 3)
#hyper_tune.fit(X_train, y_train)
#hyper_tune.best_params_

In [10]:
# random forest with optimized hyperparameters

rfc=RandomForestClassifier(random_state=42, n_estimators = 100, max_depth = 7)
rfc.fit(X_train,y_train)


classifier_scoring(rfc)

              precision    recall  f1-score   support

           0       0.87      0.98      0.92      2416
           1       0.84      0.41      0.55       584

    accuracy                           0.87      3000
  
AUC:    0.86


In [11]:
abc = AdaBoostClassifier(random_state = 42)
abc.fit(X_train,y_train)

classifier_scoring(abc)

              precision    recall  f1-score   support

           0       0.88      0.95      0.92      2416
           1       0.72      0.48      0.58       584

    accuracy                           0.86      3000
  
AUC:    0.85


In [12]:
gbc = GradientBoostingClassifier(random_state = 42)
gbc.fit(X_train,y_train)

classifier_scoring(gbc)

              precision    recall  f1-score   support

           0       0.89      0.96      0.92      2416
           1       0.77      0.48      0.59       584

    accuracy                           0.87      3000
  
AUC:    0.87
