**`Ensemble method`** combine the predictions of several base estimators built with different learning algorithm in order to improve generalization/robustness over a single estimator.

In [9]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [10]:
# Making an ensemble model with different classification models
import pandas as pd
import numpy as np

from sklearn.datasets import make_moons, load_iris

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import (VotingClassifier, RandomForestClassifier,
                              BaggingClassifier, ExtraTreesClassifier,
                             AdaBoostClassifier, GradientBoostingRegressor)


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [11]:
X, y = make_moons(n_samples=10000, random_state=7)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

In [13]:
log_clf = LogisticRegression()
svm_clf = SVC()
rf_clf = RandomForestClassifier()

voting_clf = VotingClassifier(estimators=[('log_clf', log_clf), ('svm_clf', svm_clf),
                                        ('rf_clf',rf_clf)], voting='hard', n_jobs=-1)

In [14]:
for clf in (log_clf, svm_clf, rf_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_preds = clf.predict(X_train)
    print(clf.__class__.__name__, accuracy_score(y_train, y_preds))

LogisticRegression 0.89325
SVC 1.0
RandomForestClassifier 1.0
VotingClassifier 1.0


####  NB : scaling the Logistic model does little to it's overall accuracy score.

from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler

log_scale = Pipeline([
    ('scaler', StandardScaler()),
])

X_prep = log_scale.fit_transform(X_train)

log_clf = LogisticRegression()

svm_clf = SVC()

rf_clf = RandomForestClassifier()

voting_clf = VotingClassifier(estimators=[('log_clf', log_clf), ('svm_clf', svm_clf),
                                        ('rf_clf',rf_clf)], voting='hard', n_jobs=-1)
                                        
for clf in [log_clf, svm_clf, rf_clf, voting_clf]:

    if clf == log_clf:
        clf.fit(X_prep, y_train)
        y_pred = clf.predict(X_prep)
        print(clf.__class__.__name__, accuracy_score(y_train, y_pred))
    else : 
        clf.fit(X_train, y_train)
        y_preds = clf.predict(X_train)
        print(clf.__class__.__name__, accuracy_score(y_train, y_preds))
        
''' scores

**`LogisticRegression 0.893625`**

**`SVC 1.0`**

**`RandomForestClassifier 1.0`**

**`VotingClassifier 1.0`**
'''

#  

###  Bagging and Pasting 

Performing ensemble learning with a random sample of the training set (with replacement  in **`bagging`**, without replacement in **`pasting`**.

The **`max_samples`** and **`max_features`** control the size of the sample, while the **`bootstrap`** parameter controls which method to use.

bootstrap=True; **`Bagging`**, bootstrap=False; **`Pasting`**

In [15]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), max_samples=100, bootstrap=True, n_jobs=-1)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_train)
print(accuracy_score(y_train, y_pred))

0.989875


# Using cross validation to measure **`Bagging`** and **`Pasting`**

In [16]:
# using bootstrap
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), max_samples=1000, bootstrap=True, n_jobs=-1)
cross_val_score(bag_clf, X, y, cv=100, n_jobs=-1).mean()

0.9986

In [17]:
# using Pasting
bag_clf2 = BaggingClassifier(
    DecisionTreeClassifier(), max_samples=1000, bootstrap=False, n_jobs=-1)
cross_val_score(bag_clf2, X, y, cv=100, n_jobs=-1).mean()

0.9986999999999999

There is little difference between the two methods. Even at **`cv`** value of 100 and **`max_samples`** of 1000

## Out-Of-Bag Evaluation
Out-of_bag evaluatin is another means of determining the efficiency of a model

In [18]:
# checking the oob_score accuracy
bag_clf_oob = BaggingClassifier(
    DecisionTreeClassifier(), oob_score=True, max_samples=100, max_features=0.6,
    n_estimators=500, n_jobs=-1, bootstrap=True
)
bag_clf_oob.fit(X_train, y_train)
bag_clf_oob.oob_score_

1.0

In [19]:
# Evaluating the test data
y_preds = bag_clf_oob.predict(X_test)
accuracy_score(y_test, y_preds)

1.0

In [20]:
# the class probability can also be checked
bag_clf_oob.oob_decision_function_[:10]

array([[0.69230769, 0.30769231],
       [0.81352459, 0.18647541],
       [0.75101215, 0.24898785],
       [0.37826962, 0.62173038],
       [0.        , 1.        ],
       [0.26666667, 0.73333333],
       [0.39148073, 0.60851927],
       [1.        , 0.        ],
       [0.74141414, 0.25858586],
       [0.        , 1.        ]])

# Random Forest

In [21]:
# running a RandomClassifier on 500 instace sample
rf_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16,
                               n_jobs=-1).fit(X_train, y_train)
y_pred_clf = rf_clf.predict(X_train)
accuracy_score(y_train, y_pred_clf)

1.0

In [22]:
# evaluating on the test set
y_test_pred = rf_clf.predict(X_test)
accuracy_score(y_test, y_test_pred)

0.9985

#  Extra Trees

In [23]:
ext_clf = ExtraTreesClassifier().fit(X_train, y_train)
ext_clf.score(X_train, y_train)

1.0

### Feature Importance

In [24]:
iris = load_iris()
X, y = iris['data'], iris['target']

rf_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1).fit(X, y)
for name, score in sorted(zip(iris['feature_names'], rf_clf.feature_importances_)):
    print(name , score)

petal length (cm) 0.4399758361786796
petal width (cm) 0.4342023286570898
sepal length (cm) 0.10340397117794092
sepal width (cm) 0.022417863986289793


# Boosting (AdaBoost and Gradient Boost)

In [25]:
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=500,
    algorithm='SAMME.R', learning_rate=0.5, 
)
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                         class_weight=None,
                                                         criterion='gini',
                                                         max_depth=1,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort='deprecated',
                          

In [26]:
ada_clf.score(X_train, y_train)

1.0

In [27]:
# Gradient Boosting
tree_reg1 = DecisionTreeClassifier(max_depth=2)
tree_reg1.fit(X, y)
y2 = y - tree_reg1.predict(X)

In [28]:
tree_reg2 = DecisionTreeClassifier(max_depth=2)
tree_reg2.fit(X, y2)
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeClassifier(max_depth=2).fit(X, y3)
tree_reg3.score(X, y3)

0.9666666666666667

In [29]:
y_preds = sum(tree.predict(X) for tree in (tree_reg1, tree_reg2, tree_reg3))

In [30]:
# Using the GradientBoosting classifier
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=0.1)
gbrt.fit(X, y)
gbrt.score(X, y)

0.44375375097868175

# Exercise

In [31]:
# Using the mnist dataset
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1)

In [32]:
X, y = mnist['data'], mnist['target']

In [33]:
# splitting into train, validation and test sets
X_train, X_val, X_test = X[:50000], X[50000:60000], X[60000:]
y_train, y_val, y_test = y[:50000], y[50000:60000], y[60000:]

In [34]:
# building Random forest model on the train set
np.random.seed(7)
rf_clf = RandomForestClassifier(n_jobs=-1)
rf_clf.fit(X_train, y_train)
y_preds_rf = rf_clf.predict(X_train)
accuracy_score(y_train, y_preds_rf)

1.0

In [35]:
# Extra-tree classifier
np.random.seed(7)
ext_clf = ExtraTreesClassifier().fit(X_train, y_train)
y_pred_ext = ext_clf.predict(X_train)
accuracy_score(y_train, y_pred_ext)

1.0

In [42]:
# Support vector machine
np.random.seed(7)
des_clf = DecisionTreeClassifier().fit(X_train, y_train)
y_pred_des = des_clf.predict(X_train)
accuracy_score(y_train, y_pred_des)

1.0

In [41]:
# Making an Ensemble model
np.random.seed(7)
vote_clf = VotingClassifier(estimators=[('ext_clf',ext_clf),('des_clf',des_clf), 
                                      ('rf_clf',rf_clf)], voting='hard', n_jobs=-1)
for clf in (ext_clf, rf_clf, des_clf, vote_clf):
    clf.fit(X_val, y_val)
    y_pred = clf.predict(X_val)
    print(clf.__class__.__name__, accuracy_score(y_val, y_pred))


ExtraTreesClassifier 1.0
RandomForestClassifier 1.0
DecisionTreeClassifier 1.0
VotingClassifier 1.0
