# Chapter 7: Ensemble Learning and Random Forests

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [24]:
# Create moon dataset and split into test and train sets

from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

X, y = make_moons(n_samples=10000, noise=0.2)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [25]:
# Set up classifiers

log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

In [26]:
# Set up voting classifier

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard')

In [27]:
# Train voting classifier

voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()), ('svc', SVC())])

In [28]:
# Look at each classifiers accuracy on the test set

from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.8535
RandomForestClassifier 0.9555
SVC 0.9625
VotingClassifier 0.959


#### Soft Voting

In [29]:
# SVC needs probability set to true to use cross-validation to estimate probabilities
# as SVC does not have a predict_proba() method by default

svm_clf = SVC(probability = True)

In [30]:
# Set up soft voting classifier

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft')

In [31]:
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.8535
RandomForestClassifier 0.9575
SVC 0.9625
VotingClassifier 0.9575


#### Bagging and Pasting

In [32]:
# Train an ensemble of 500 decision trees on 100 training instances randomly sampled
# from the training set, with replacement

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    max_samples=100, bootstrap=True, n_jobs=-1)

bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

In [33]:
accuracy_score(y_test, y_pred)

0.9525

#### Out-of-Bag Evaluation

In [34]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    bootstrap=True, n_jobs=-1, oob_score=True)

In [35]:
bag_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=500,
                  n_jobs=-1, oob_score=True)

In [36]:
bag_clf.oob_score_

0.961625

In [37]:
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.9535

#### Random Forests

In [38]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(X_train, y_train)

RandomForestClassifier(max_leaf_nodes=16, n_estimators=500, n_jobs=-1)

In [44]:
y_pred = rnd_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.9555

The following ```BaggingClassifier``` is equivalent to the previous ```RandomForestClassifier```

In [49]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(max_features="auto", max_leaf_nodes=16),
    n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1)

bag_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(max_features='auto',
                                                        max_leaf_nodes=16),
                  n_estimators=500, n_jobs=-1)

In [50]:
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.956

##### Feature Importances

In [51]:
from sklearn.datasets import load_iris

iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)

rnd_clf.fit(iris["data"], iris["target"])

for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.09830050889641946
sepal width (cm) 0.02373163328950335
petal length (cm) 0.46221086300636594
petal width (cm) 0.4157569948077114


#### AdaBoost

In [52]:
# Train an AdaBoost classifier on 200 decision stumps

from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    algorithm="SAMME.R", learning_rate=0.5)

ada_clf.fit(X_train, y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                   learning_rate=0.5, n_estimators=200)

In [53]:
y_pred = ada_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.954

#### Gradient Boosting

In [64]:
# Train a GBRT ensemble with 120 trees, measure the validation error at each stage to find the optimal no. of trees
# Then train another GBRT with the optimial no. of trees

import numpy as np
from sklearn.metrics import mean_squared_error

X_train, X_val, y_train, y_val = train_test_split(X, y)

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train)

errors = [mean_squared_error(y_val, y_pred)
         for y_pred in gbrt.staged_predict(X_val)]

best_n_estimators = np.argmin(errors) + 1

gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=best_n_estimators)
gbrt_best.fit(X_train, y_train)

GradientBoostingRegressor(max_depth=2, n_estimators=120)

early stopping can also be implemented by using ```warm_start=True```

In [65]:
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True)

min_val_error = float("inf")
error_going_up = 0

for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up ==5:
            break # early stopping

#### XGBoost

In [68]:
import xgboost

xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(X_train, y_train)
y_pred = xgb_reg.predict(X_val)

In [71]:
# Can also take care of early stopping

xgb_reg.fit(X_train, y_train,
           eval_set=[(X_val, y_val)], early_stopping_rounds=2)
y_pred = xgb_reg.predict(X_val)

[0]	validation_0-rmse:0.37161
[1]	validation_0-rmse:0.28786
[2]	validation_0-rmse:0.23702
[3]	validation_0-rmse:0.20784
[4]	validation_0-rmse:0.19107
[5]	validation_0-rmse:0.18191
[6]	validation_0-rmse:0.17731
[7]	validation_0-rmse:0.17597
[8]	validation_0-rmse:0.17491
[9]	validation_0-rmse:0.17406
[10]	validation_0-rmse:0.17355
[11]	validation_0-rmse:0.17346
[12]	validation_0-rmse:0.17431


#### Exercises

8. Load the MNIST data and split it into a training set, a validation set, and a test set (e.g., use 50,000 instances for training, 10,000 for validation, and 10,000 for testing). Then train various classifiers, such as a Random Forest classifier, an Extra-Trees classifier, and an SVM. Next, try to combine them into an ensemble that outperforms them all on the validation set, using a soft or hard voting classifier. Once you have found one, try it on the test set. How much better does it perform compared to the individual classifiers?

Training

In [6]:
from sklearn.datasets import fetch_openml

mnist = fetch_openml('mnist_784', version=1, as_frame=False)
mnist.target = mnist.target.astype(np.uint8)

In [11]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier

In [9]:
X_train_val, X_test, y_train_val, y_test = train_test_split(
    mnist.data, mnist.target, test_size=10000)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=10000)

In [12]:
random_forest_clf = RandomForestClassifier(n_estimators=100)
extra_trees_clf = ExtraTreesClassifier(n_estimators=100)
svm_clf = LinearSVC(max_iter=100, tol=20)
mlp_clf = MLPClassifier()

In [13]:
estimators = [random_forest_clf, extra_trees_clf, svm_clf, mlp_clf]

for estimator in estimators:
    estimator.fit(X_train, y_train)

In [17]:
[estimator.score(X_val, y_val) for estimator in estimators]

[0.9676, 0.9682, 0.8614, 0.9636]

Ensemble

In [14]:
named_estimators = [
    ("random_forest_clf", random_forest_clf),
    ("extra_trees_clf", extra_trees_clf),
    ("svm_clf", svm_clf),
    ("mlp_clf", mlp_clf),
]

In [16]:
voting_clf = VotingClassifier(named_estimators)
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('random_forest_clf', RandomForestClassifier()),
                             ('extra_trees_clf', ExtraTreesClassifier()),
                             ('svm_clf', LinearSVC(max_iter=100, tol=20)),
                             ('mlp_clf', MLPClassifier())])

In [18]:
voting_clf.score(X_val, y_val)

0.9674

In [19]:
[estimator.score(X_val, y_val) for estimator in voting_clf.estimators_]

[0.9666, 0.9675, 0.8701, 0.9587]

In [20]:
# train the ensemble again without the SVM

named_estimators = [
    ("random_forest_clf", random_forest_clf),
    ("extra_trees_clf", extra_trees_clf),
    ("mlp_clf", mlp_clf),
]

voting_clf = VotingClassifier(named_estimators)
voting_clf.fit(X_train, y_train)

voting_clf.score(X_val, y_val)

0.9698

In [21]:
[estimator.score(X_val, y_val) for estimator in voting_clf.estimators_]

[0.9652, 0.9681, 0.9628]

In [22]:
voting_clf.voting = "soft"
voting_clf.score(X_val, y_val)

0.9687

In [23]:
voting_clf.voting = "hard"
voting_clf.score(X_test, y_test)

0.9726

9. Run the individual classifiers from the previous exercise to make predictions on the validation set, and create a new training set with the resulting predictions: each training instance is a vector containing the set of predictions from all your classifiers for an image, and the target is the image's class. Train a classifier on this new training set.

In [24]:
X_val_predictions = np.empty((len(X_val), len(estimators)), dtype=np.float32)

for index, estimator in enumerate(estimators):
    X_val_predictions[:, index] = estimator.predict(X_val)

In [25]:
rnd_forest_blender = RandomForestClassifier(n_estimators=200, oob_score=True)
rnd_forest_blender.fit(X_val_predictions, y_val)

RandomForestClassifier(n_estimators=200, oob_score=True)

In [26]:
rnd_forest_blender.oob_score_

0.9685

In [27]:
X_test_predictions = np.empty((len(X_test), len(estimators)), dtype=np.float32)

for index, estimator in enumerate(estimators):
    X_test_predictions[:, index] = estimator.predict(X_test)

In [28]:
y_pred = rnd_forest_blender.predict(X_test_predictions)
accuracy_score(y_test, y_pred)

0.9682