**`Ensemble methods`** combine the predictions of several base estimators built with different learning algorithms in order to improve generalization/robustness over a single estimator.

In [1]:
# Reload the autoreload extension so that edited modules are picked up live.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook.
%matplotlib inline

In [56]:
# Making an ensemble model with different classification models
import pandas as pd
import numpy as np

from sklearn.datasets import make_moons

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (VotingClassifier, RandomForestClassifier,
                              BaggingClassifier)


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
# Generate 10,000 synthetic samples with scikit-learn's make_moons;
# the fixed seed keeps the generated data identical across runs.
X, y = make_moons(
    n_samples=10_000,
    random_state=7,
)

In [4]:
# Hold out 20% of the samples as a test split; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=7,
)

In [124]:
# Three different base learners that will also be combined by majority vote.
log_clf = LogisticRegression()
svm_clf = SVC()
# Random forests are stochastic: seed the forest like the dataset and the
# train/test split (random_state=7) so that reruns give the same model.
rf_clf = RandomForestClassifier(random_state=7)

# voting='hard' predicts the class chosen by the majority of the base learners;
# n_jobs=-1 fits the base learners in parallel on all available cores.
voting_clf = VotingClassifier(
    estimators=[('log_clf', log_clf), ('svm_clf', svm_clf), ('rf_clf', rf_clf)],
    voting='hard',
    n_jobs=-1,
)

In [125]:
# Fit every base model and the voting ensemble, then print each model's accuracy.
# NOTE(review): the score is computed on the training split (X_train / y_train),
# not on the held-out X_test / y_test, so it does not measure generalization.
models = (log_clf, svm_clf, rf_clf, voting_clf)
for clf in models:
    # fit() returns the fitted estimator, so predict() can be chained onto it.
    y_preds = clf.fit(X_train, y_train).predict(X_train)
    train_accuracy = accuracy_score(y_train, y_preds)
    print(type(clf).__name__, train_accuracy)

LogisticRegression 0.89325
SVC 1.0
RandomForestClassifier 1.0
VotingClassifier 1.0


#### NB: scaling the inputs of the Logistic model does little to its accuracy score.

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Standardize the training features. Only the stand-alone logistic regression
# below is trained and scored on this scaled copy.
log_scale = Pipeline([
    ('scaler', StandardScaler()),
])
X_prep = log_scale.fit_transform(X_train)

log_clf = LogisticRegression()
svm_clf = SVC()
# Seeded like the dataset and the train/test split so reruns give the same forest.
rf_clf = RandomForestClassifier(random_state=7)

# voting_clf is fitted on the unscaled X_train in the loop below, so this
# experiment only changes the input of the stand-alone log_clf.
voting_clf = VotingClassifier(
    estimators=[('log_clf', log_clf), ('svm_clf', svm_clf), ('rf_clf', rf_clf)],
    voting='hard',
    n_jobs=-1,
)

for clf in [log_clf, svm_clf, rf_clf, voting_clf]:
    # Identity check: the logistic model gets the standardized features,
    # every other model gets the raw training features.
    features = X_prep if clf is log_clf else X_train
    clf.fit(features, y_train)
    y_preds = clf.predict(features)
    # Accuracy on the training data, matching the unscaled run it is compared with.
    print(clf.__class__.__name__, accuracy_score(y_train, y_preds))
        
''' scores

**`LogisticRegression 0.893625`**

**`SVC 1.0`**

**`RandomForestClassifier 1.0`**

**`VotingClassifier 1.0`**
'''

###  Bagging and Pasting 

Performing ensemble learning by training each estimator on a random sample of the training set (with replacement in **`bagging`**, without replacement in **`pasting`**).

The **`max_samples`** and **`max_features`** parameters control the size of each sample, while the **`bootstrap`** parameter controls which method to use.

`bootstrap=True` → **`Bagging`**; `bootstrap=False` → **`Pasting`**

In [126]:
# Bagging ensemble of decision trees: every tree is trained on 1000 instances
# drawn with replacement (bootstrap=True) from the training set.
# NOTE(review): oob_score=True is requested, but the out-of-bag score is not
# read in this cell; the value printed below is accuracy on the training data.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    max_samples=1000,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=7,  # same seed as the dataset and the split, for reproducible sampling
)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_train)
print(accuracy_score(y_train, y_pred))

0.998
