**`Ensemble methods`** combine the predictions of several base estimators, built with the same or with different learning algorithms, in order to improve generalization/robustness over a single estimator.

In [1]:
# Load (or reload) the autoreload extension, then switch it to mode 2 so that
# edits to imported modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [141]:
# Making an ensemble model with different classification models
import pandas as pd
import numpy as np

from sklearn.datasets import make_moons

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import (VotingClassifier, RandomForestClassifier,
                              BaggingClassifier)


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
# Synthetic "moons" classification dataset; the fixed seed keeps the generated
# samples identical from run to run.
X, y = make_moons(random_state=7, n_samples=10000)

In [4]:
# Hold out 20% of the samples for testing; the seed fixes which rows land in
# each split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=7,
)

In [124]:
# Three different kinds of base learner for the voting ensemble.
log_clf = LogisticRegression()
svm_clf = SVC()
# A random forest is stochastic: seed it (same seed as the data and the split)
# so that Restart & Run All reproduces the reported scores.
rf_clf = RandomForestClassifier(random_state=7)

# Hard voting: the ensemble predicts the class label chosen by the majority of
# the base learners.
voting_clf = VotingClassifier(
    estimators=[
        ('log_clf', log_clf),
        ('svm_clf', svm_clf),
        ('rf_clf', rf_clf),
    ],
    voting='hard',
    n_jobs=-1,
)

In [125]:
# Fit every base learner and the ensemble, then report accuracy on the same
# data they were trained on (a training score, not a held-out score).
for clf in [log_clf, svm_clf, rf_clf, voting_clf]:
    model_name = type(clf).__name__
    clf.fit(X_train, y_train)
    y_preds = clf.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_preds)
    print(model_name, train_accuracy)

LogisticRegression 0.89325
SVC 1.0
RandomForestClassifier 1.0
VotingClassifier 1.0


####  NB: scaling the inputs of the logistic regression model does little to its overall accuracy score.

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Bundle the scaler with the logistic model in a single estimator. The scaling
# is then learned during fit() and applied automatically on every predict(),
# so the model takes raw features everywhere -- including inside the voting
# ensemble -- and no special-case branch is needed when fitting.
log_clf = Pipeline([
    ('scaler', StandardScaler()),
    ('log_reg', LogisticRegression()),
])
svm_clf = SVC()
# Seeded so the forest (and therefore the ensemble) is reproducible.
rf_clf = RandomForestClassifier(random_state=7)

voting_clf = VotingClassifier(
    estimators=[
        ('log_clf', log_clf),
        ('svm_clf', svm_clf),
        ('rf_clf', rf_clf),
    ],
    voting='hard',
    n_jobs=-1,
)

# Explicit labels keep the printed names stable even though the logistic
# model is now wrapped in a Pipeline.
labelled_models = [
    ('LogisticRegression', log_clf),
    ('SVC', svm_clf),
    ('RandomForestClassifier', rf_clf),
    ('VotingClassifier', voting_clf),
]

# Fit each model on the raw training data and report its training accuracy.
for label, clf in labelled_models:
    clf.fit(X_train, y_train)
    y_preds = clf.predict(X_train)
    print(label, accuracy_score(y_train, y_preds))
        
''' scores

**`LogisticRegression 0.893625`**

**`SVC 1.0`**

**`RandomForestClassifier 1.0`**

**`VotingClassifier 1.0`**
'''

#  

###  Bagging and Pasting 

Both methods train each base estimator on a random sample of the training set: the sample is drawn with replacement in **`bagging`** and without replacement in **`pasting`**.

The **`max_samples`** and **`max_features`** parameters control how many instances and how many features each base estimator is trained on, while the **`bootstrap`** parameter controls which method to use.

`bootstrap=True` → **`Bagging`**; `bootstrap=False` → **`Pasting`**

In [144]:
# Bagging ensemble of decision trees, each trained on a bootstrap sample of
# 100 training instances. The ensemble is seeded so the resampling -- and
# therefore the reported accuracy -- is reproducible across runs.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
    random_state=7,
)
bag_clf.fit(X_train, y_train)

# Accuracy measured on the training split.
y_pred = bag_clf.predict(X_train)
print(accuracy_score(y_train, y_pred))

0.990125


# Using cross validation to measure **`Bagging`** and **`Pasting`**

In [145]:
# Bagging: each tree is trained on 1000 instances drawn WITH replacement.
# The ensemble is seeded so that the comparison against pasting reflects the
# sampling method rather than run-to-run randomness.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    max_samples=1000,
    bootstrap=True,
    n_jobs=-1,
    random_state=7,
)
# Mean accuracy over 100 cross-validation folds of the full dataset.
cross_val_score(bag_clf, X, y, cv=100, n_jobs=-1).mean()

0.9983999999999998

In [146]:
# Pasting: each tree is trained on 1000 instances drawn WITHOUT replacement.
# The ensemble is seeded so that the comparison against bagging reflects the
# sampling method rather than run-to-run randomness.
bag_clf2 = BaggingClassifier(
    DecisionTreeClassifier(),
    max_samples=1000,
    bootstrap=False,
    n_jobs=-1,
    random_state=7,
)
# Mean accuracy over 100 cross-validation folds of the full dataset.
cross_val_score(bag_clf2, X, y, cv=100, n_jobs=-1).mean()

0.9986999999999999

There is little difference between the two methods, even with a **`cv`** value of 100 and **`max_samples`** of 1000.

## Out-Of-Bag Evaluation
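Out-of-bag (OOB) evaluation is another way of estimating how well a model generalizes: with bootstrap sampling, each base estimator can be scored on the training instances that were left out of its sample, so no separate validation set is needed.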

In [167]:
# Bagging ensemble with out-of-bag scoring enabled: 500 trees, each trained on
# a bootstrap sample of 100 instances and on a 0.6 fraction of the features.
# Seeded so the OOB score and the OOB decision function are reproducible.
bag_clf_oob = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    max_features=0.6,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=7,
)
bag_clf_oob.fit(X_train, y_train)

# Accuracy estimated from the out-of-bag predictions on the training set.
bag_clf_oob.oob_score_

1.0

In [168]:
# Accuracy of the OOB-scored bagging ensemble on the held-out test split.
y_preds = bag_clf_oob.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_preds)

1.0

In [169]:
# Out-of-bag class-probability estimates; show only the first ten rows.
oob_probabilities = bag_clf_oob.oob_decision_function_
oob_probabilities[:10]

array([[0.6969697 , 0.3030303 ],
       [0.75806452, 0.24193548],
       [0.76706827, 0.23293173],
       [0.39350913, 0.60649087],
       [0.00604839, 0.99395161],
       [0.2388664 , 0.7611336 ],
       [0.36437247, 0.63562753],
       [1.        , 0.        ],
       [0.72064777, 0.27935223],
       [0.        , 1.        ]])

# Random Forest

In [171]:
# Random forest of 500 trees (n_estimators is the number of trees, not a
# sample size), each limited to at most 16 leaf nodes. Seeded so the reported
# accuracies are reproducible across runs.
rf_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,
    random_state=7,
)
rf_clf.fit(X_train, y_train)

# Accuracy measured on the training split.
y_pred_clf = rf_clf.predict(X_train)
accuracy_score(y_train, y_pred_clf)

1.0

In [172]:
# Accuracy of the fitted random forest on the held-out test split.
y_test_pred = rf_clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.9985