# Ensemble Methods Applied

Agenda:
- Review code for Voting Classifier, Bagging Classifier, and Random Forest
- Practice finding optimal hyperparameter for  Random Forest with gridsearch


## Import and Prep Titanic dataset

In [2]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.ensemble import BaggingClassifier


In [3]:
# Read in data and split data to be used in the models
titanic = pd.read_csv('https://raw.githubusercontent.com/learn-co-students/nyc-ds-033020-lectures/master/Mod_3/decision_trees/cleaned_titanic.csv', index_col='PassengerId')



In [4]:
# Create matrix of features
X = titanic.drop('Survived', axis = 1) # grabs everything else but 'Survived'

# Create target variable
y = titanic['Survived'] # y is the column we're trying to predict

# Create a list of the features being used in the 
feature_cols = X.columns

In [5]:
# Use x and y variables to split the training data into train and test set then scale that data

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

scaler = StandardScaler()  
scaler.fit(X_train)

X_train = scaler.transform(X_train)  
X_test = scaler.transform(X_test)

## Fit a KNN model

In [6]:
from sklearn.neighbors import KNeighborsClassifier

In [7]:
knn = KNeighborsClassifier(n_neighbors=9)

In [8]:
knn.fit(X_train, y_train)

knn_preds = knn.predict(X_test)

knn_f1 = metrics.f1_score(y_test, knn_preds)


print(knn_f1)

0.7975460122699386


## Fit a Logistic Regression model 

In [9]:
from sklearn.linear_model import LogisticRegression

In [10]:
lr = LogisticRegression(class_weight='balanced')

In [11]:
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [12]:
lr_preds = lr.predict(X_test)

lr_f1 = metrics.f1_score(y_test, lr_preds)

print(lr_f1)

0.8066298342541436


## Fit a Decision Tree Classifier

In [13]:
from sklearn.tree import DecisionTreeClassifier

In [14]:
dtc = DecisionTreeClassifier(max_depth=5, class_weight='balanced')

dtc.fit(X_train, y_train)

dtc_preds  = dtc.predict(X_test)

dtc_f1 = metrics.f1_score(y_test, dtc_preds)

print(dtc_f1)

0.8047337278106509


## Combine three models using Voting Classifier

In [16]:
from sklearn.ensemble import VotingClassifier


For the estimators, we must provide a list of tuples. The first value in the tuple is is the name given to the model/estimator in the second value. SKlearn requires this because there is additional functionality where you can access information about the specific models, so you need to name the models to access them later.  

In [18]:
voting_clf = VotingClassifier(
                estimators=[('logreg', lr), ('knneighbors', knn), ('decisiontree', dtc)], 
                voting='hard')

voting_clf.fit(X_train, y_train)

vc_preds = voting_clf.predict(X_test)

vc_f1 = metrics.f1_score(y_test, vc_preds)

print(vc_f1)

0.8160919540229884


### Use a voting classifier with multiple Logistic regression models 

In [19]:

C_param_range = [0.001,0.01,0.1,1,10]
titles = ['lr_0_001', 'lr_0_01', 'lr_0_1', 'lr_1', 'lr_10']

params = dict(zip(titles, C_param_range)) 
models = {}

table = pd.DataFrame(columns = ['C_parameter','F1'])
table['C_parameter'] = C_param_range
j = 0

for k , v  in params.items():
    
    # Create model using different value for c  
    lr = LogisticRegression(penalty = 'l2', C = v, random_state = 1, class_weight='balanced')
    
    #save the model to a dictionary to use later in our voting classifiers
    models[k]= lr
    
    #the steps below this point are unnecessary in order to create a voting classifier, 
    #but it is easy to fit the model and see how performance changes for different levels of regularization
    lr.fit(X_train, y_train)
    
    # Predict using model
    y_preds = lr.predict(X_test)

    # Saving accuracy score in table
    table.iloc[j,1] = metrics.f1_score(y_test, y_preds)
    j += 1



In [20]:
#review performance for different levels of C
table


Unnamed: 0,C_parameter,F1
0,0.001,0.735135
1,0.01,0.751381
2,0.1,0.804469
3,1.0,0.80663
4,10.0,0.80663


In [21]:
#invesitgate the models D=dictionary
list(models.items())

[('lr_0_001',
  LogisticRegression(C=0.001, class_weight='balanced', dual=False,
                     fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                     max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                     random_state=1, solver='lbfgs', tol=0.0001, verbose=0,
                     warm_start=False)),
 ('lr_0_01',
  LogisticRegression(C=0.01, class_weight='balanced', dual=False,
                     fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                     max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                     random_state=1, solver='lbfgs', tol=0.0001, verbose=0,
                     warm_start=False)),
 ('lr_0_1',
  LogisticRegression(C=0.1, class_weight='balanced', dual=False,
                     fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                     max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                     random_state=1, solver='lbfgs',

Now that we have programmatically created multiple logistic regression models, let's use them in an ensemble model

In [24]:
lr_voting = VotingClassifier(estimators=list(models.items()), 
                              voting='hard')

lr_voting.fit(X_train, y_train)

lrv_preds = lr_voting.predict(X_test)

lrv_f1 = metrics.f1_score(y_test, lrv_preds)

print(lrv_f1)

0.8044692737430168


## Fit a Bagging Classifier for a Logistic Regression model. 

In [25]:
X_train.shape

(666, 9)

In [35]:
bc_lr = BaggingClassifier(
            base_estimator=LogisticRegression(random_state = 1, class_weight='balanced'), 
            n_estimators= 100,
            max_samples= 0.8,
            max_features= 6,
            oob_score= True
                )

In [36]:
bc_lr.fit(X_train, y_train)



BaggingClassifier(base_estimator=LogisticRegression(C=1.0,
                                                    class_weight='balanced',
                                                    dual=False,
                                                    fit_intercept=True,
                                                    intercept_scaling=1,
                                                    l1_ratio=None, max_iter=100,
                                                    multi_class='auto',
                                                    n_jobs=None, penalty='l2',
                                                    random_state=1,
                                                    solver='lbfgs', tol=0.0001,
                                                    verbose=0,
                                                    warm_start=False),
                  bootstrap=True, bootstrap_features=False, max_features=6,
                  max_samples=0.8, n_estimators=100, n_jobs=None,

In [37]:
# Use the oob_score to get some idea of how the model performs on a validation set

bc_lr.oob_score_

0.7777777777777778

In [38]:
# See how the model performs on the test set

bc_lr_preds = bc_lr.predict(X_test)

bc_lr_f1 = metrics.f1_score(y_test, bc_lr_preds)

print(bc_lr_f1)

0.7894736842105262


***What is the difference in the `VotingClassifier` algorithm and the `BaggingClassifier` algorithm?***

Your answer:

**What is the difference between a BaggingClassifier that uses a decision tree as the base estimator and a Random Forest Classifier?**

A random forest classifier will take a sample of features at each node, where as a bagging classifier will take a sample of features at to use for the whole model. 

# Fitting a Random Forest Classifier

In [47]:
# Instantiate the classifier using 100 trees
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state = 1, n_estimators=100, max_depth=2, max_features=4)

In [48]:
#let's look at all the different default features
rfc

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=2, max_features=4,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [49]:
#fit the model to the training data
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=2, max_features=4,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [50]:
#use the fitted model to predict on the test data
rfc_preds = rfc.predict(X_test)

rfc_f1 = metrics.f1_score(y_test, rfc_preds)

# checking accuracy on the test data
print('Test F1 score: ', rfc_f1)

Test F1 score:  0.7074829931972789


***Increase the number of trees and see how the model performs***

### GridsearchCV with Random Forest

Let's use grid search to identify the best tuning parameters to use for a random forest model. 

In [52]:
from sklearn.model_selection import GridSearchCV

In [None]:
RandomForestClassifier()

In [55]:
#create a dictionary of all the parameters you want to tune
param_grid = { 
    'n_estimators': [100,300,500,700,1000],
    'criterion': ['gini', 'entropy'],
    'max_depth': list(range(2,10)),
    'max_features': list(range(3,7))
}

In [56]:
#create a grid search object and fit it to the data

grid_tree=GridSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring='f1', verbose=1, n_jobs=-1)

In [57]:
grid_tree.fit(X_train, y_train)

Fitting 5 folds for each of 320 candidates, totalling 1600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   34.5s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 1600 out of 1600 | elapsed:  5.3min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [58]:
### Identify the best params 



# Single best score achieved across all params (min_samples_split)
print(grid_tree.best_score_)

# Dictionary containing the parameters (min_samples_split) used to generate that score
print(grid_tree.best_params_)

# Actual model object fit with those best parameters
# Shows default parameters that we did not specify
print(grid_tree.best_estimator_)
#Identify the best score during fitting with cross-validation


0.7549283543204516
{'criterion': 'entropy', 'max_depth': 9, 'max_features': 5, 'n_estimators': 500}
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=9, max_features=5,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)


In [59]:
#Predict the response for test dataset
y_pred = grid_tree.best_estimator_.predict(X_test)

# Model F1, how often is the classifier correct?
print("F1:",metrics.f1_score(y_test, y_pred))

F1: 0.8024691358024691
