# Chapter 7: Moons, Iris & MNIST

This notebook contains the code for chapter 7 of the Hands-on Machine Learning with Scikit-Learn, Keras & Tensorflow book.

In [1]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.datasets import fetch_openml, load_iris, make_moons
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import (
    AdaBoostClassifier,
    BaggingClassifier,
    ExtraTreesClassifier,
    GradientBoostingRegressor,
    RandomForestClassifier,
    VotingClassifier,
)

import numpy as np

## Global configuration

In [2]:
MNIST_DATA_NAME = "mnist_784"
MNIST_DATA_VERSION = 1

RANDOM_SEED = 42

JOB_COUNT = 3

In [3]:
np.random.seed(RANDOM_SEED)

## Load <ins>moons</ins> data

In [4]:
X, y = make_moons(n_samples=500, noise=0.30, random_state=RANDOM_SEED)

## Split <ins>moons</ins> data

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_SEED)

## Train <ins>hard-voting</ins> model

In [6]:
lr_model = LogisticRegression()
rf_model = RandomForestClassifier()
svc_model = SVC()

In [7]:
vot_model = VotingClassifier(
    estimators=[("lr", lr_model), ("rf", rf_model), ("svc", svc_model)],
    voting="hard",
)

In [8]:
%%time
vot_model.fit(X_train, y_train)

CPU times: user 93.8 ms, sys: 0 ns, total: 93.8 ms
Wall time: 92.9 ms


VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()), ('svc', SVC())])

## Performance <ins>hard-voting</ins> model

In [9]:
%%time
for model in (lr_model, rf_model, svc_model, vot_model):
    model.fit(X_train, y_train)
    
    y_test_predictions = model.predict(X_test)
    
    print(model.__class__.__name__, accuracy_score(y_test, y_test_predictions))

LogisticRegression 0.864
RandomForestClassifier 0.904
SVC 0.896
VotingClassifier 0.904
CPU times: user 188 ms, sys: 7.61 ms, total: 196 ms
Wall time: 193 ms


## Train <ins>soft-voting</ins> model

In [10]:
lr_model = LogisticRegression()
rf_model = RandomForestClassifier()
svc_model = SVC(probability=True)

In [11]:
vot_model = VotingClassifier(
    estimators=[("lr", lr_model), ("rf", rf_model), ("svc", svc_model)],
    voting="soft",
)

In [12]:
%%time
vot_model.fit(X_train, y_train)

CPU times: user 102 ms, sys: 910 µs, total: 102 ms
Wall time: 100 ms


VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()),
                             ('svc', SVC(probability=True))],
                 voting='soft')

## Performance <ins>soft-voting</ins> model

In [13]:
%%time
for model in (lr_model, rf_model, svc_model, vot_model):
    model.fit(X_train, y_train)
    
    y_test_predictions = model.predict(X_test)
    
    print(model.__class__.__name__, accuracy_score(y_test, y_test_predictions))

LogisticRegression 0.864
RandomForestClassifier 0.896
SVC 0.896
VotingClassifier 0.912
CPU times: user 215 ms, sys: 0 ns, total: 215 ms
Wall time: 211 ms


## Train <ins>bagging</ins> model

In [14]:
bag_model = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=JOB_COUNT,
)

In [15]:
%%time
bag_model.fit(X_train, y_train)

CPU times: user 38 ms, sys: 12.7 ms, total: 50.7 ms
Wall time: 654 ms


BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=100,
                  n_estimators=500, n_jobs=3)

## Evaluate <ins>bagging</ins> model

In [16]:
%%time
y_test_predictions = bag_model.predict(X_test)

CPU times: user 79.9 ms, sys: 0 ns, total: 79.9 ms
Wall time: 96.2 ms


## Train <ins>out-of-bag</ins> model

In [17]:
bag_model = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    bootstrap=True,
    n_jobs=JOB_COUNT,
    oob_score=True,
)

In [18]:
%%time
bag_model.fit(X_train, y_train)

CPU times: user 147 ms, sys: 2.43 ms, total: 149 ms
Wall time: 346 ms


BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=500,
                  n_jobs=3, oob_score=True)

## Performance <ins>out-of-bag</ins> model

In [19]:
y_test_predictions = bag_model.predict(X_test)

In [20]:
bag_model.oob_score_, accuracy_score(y_test, y_test_predictions)

(0.8986666666666666, 0.896)

## Train <ins>random forest</ins> model

In [21]:
rf_model = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=JOB_COUNT)

In [22]:
%%time
rf_model.fit(X_train, y_train)

CPU times: user 581 ms, sys: 112 ms, total: 693 ms
Wall time: 512 ms


RandomForestClassifier(max_leaf_nodes=16, n_estimators=500, n_jobs=3)

## Evaluate <ins>random forest</ins> model

In [23]:
%%time
y_test_predictions = rf_model.predict(X_test)

CPU times: user 54 ms, sys: 20.1 ms, total: 74.1 ms
Wall time: 57.4 ms


## Load <ins>iris</ins> data

In [24]:
iris = load_iris()

## Split <ins>iris</ins> data

In [25]:
X, y = iris["data"], iris["target"]

## Train <ins>random forest</ins> model

In [26]:
rf_model = RandomForestClassifier(n_estimators=500, n_jobs=JOB_COUNT)

In [27]:
%%time
rf_model.fit(X, y)

CPU times: user 532 ms, sys: 99.9 ms, total: 631 ms
Wall time: 494 ms


RandomForestClassifier(n_estimators=500, n_jobs=3)

## Evaluate <ins>random forest</ins> model

In [28]:
for name, score in zip(iris["feature_names"], rf_model.feature_importances_):
    print(name, score)

sepal length (cm) 0.11232940454143936
sepal width (cm) 0.026348379794474807
petal length (cm) 0.4571984455932355
petal width (cm) 0.4041237700708503


## Train <ins>ada-boost</ins> model

In [29]:
ada_model = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=200,
    algorithm="SAMME.R",
    learning_rate=0.5,
)

In [30]:
%%time
ada_model.fit(X_train, y_train)

CPU times: user 181 ms, sys: 0 ns, total: 181 ms
Wall time: 180 ms


AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                   learning_rate=0.5, n_estimators=200)

## Generate <ins>quadratic</ins> data

In [31]:
X = np.random.rand(100, 1) - 0.5
y = 3 * X[:, 0]**2 + 0.05 * np.random.randn(100)

## Train <ins>gradient-boost</ins> model (manual)

In [32]:
y1 = y

In [33]:
dt_model1 = DecisionTreeRegressor(max_depth=2)

In [34]:
%%time
dt_model1.fit(X, y1)

CPU times: user 1.86 ms, sys: 0 ns, total: 1.86 ms
Wall time: 1.25 ms


DecisionTreeRegressor(max_depth=2)

In [35]:
y2 = y1 - dt_model1.predict(X)

In [36]:
dt_model2 = DecisionTreeRegressor(max_depth=2)

In [37]:
%%time
dt_model2.fit(X, y2)

CPU times: user 1.76 ms, sys: 0 ns, total: 1.76 ms
Wall time: 1.06 ms


DecisionTreeRegressor(max_depth=2)

In [38]:
y3 = y2 - dt_model2.predict(X)

In [39]:
dt_model3 = DecisionTreeRegressor(max_depth=2)

In [40]:
%%time
dt_model3.fit(X, y3)

CPU times: user 265 µs, sys: 193 µs, total: 458 µs
Wall time: 345 µs


DecisionTreeRegressor(max_depth=2)

## Evaluate <ins>gradient-boost</ins> model (manual)

In [41]:
X_test_new = np.array([[0.8]])

In [42]:
y_test_new_predictions = sum(tree.predict(X_test_new) for tree in (dt_model1, dt_model2, dt_model3))
y_test_new_predictions

array([0.7709191])

## Train <ins>gradient-boost</ins> model (sklearn)

In [43]:
gb_model = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)

In [44]:
%%time
gb_model.fit(X, y)

CPU times: user 678 µs, sys: 494 µs, total: 1.17 ms
Wall time: 950 µs


GradientBoostingRegressor(learning_rate=1.0, max_depth=2, n_estimators=3)

## Split <ins>quadratic</ins> data

In [45]:
X_train, X_validation, y_train, y_validation = train_test_split(X, y)

## Train <ins>gradient-boost</ins> model (early-stopping)

In [46]:
gb_model = GradientBoostingRegressor(max_depth=2, n_estimators=120)

In [47]:
%%time
gb_model.fit(X_train, y_train)

CPU times: user 19.5 ms, sys: 0 ns, total: 19.5 ms
Wall time: 18.7 ms


GradientBoostingRegressor(max_depth=2, n_estimators=120)

## Tune <ins>gradient-boost</ins> model (early-stopping)

In [48]:
errors = [mean_squared_error(y_validation, y_validation_predictions)
          for y_validation_predictions in gb_model.staged_predict(X_validation)]

best_n_estimators = np.argmin(errors)
best_n_estimators

55

In [49]:
gb_model = GradientBoostingRegressor(max_depth=2, n_estimators=best_n_estimators)

In [50]:
%%time
gb_model.fit(X_train, y_train)

CPU times: user 9.42 ms, sys: 0 ns, total: 9.42 ms
Wall time: 8.77 ms


GradientBoostingRegressor(max_depth=2, n_estimators=55)

# Exercises

1. If you have trained five different models on the exact same training data, and they all achieve 95% precision, is there any chance that you can combine these models to get better results? If so, how? If not, why?

**Solution**

Yes, you can by applying an ensemble technique like voting.

2. What is the difference between hard and soft voting classifiers?

**Solution**

Hard voting classifiers pick the class that get the most votes. Soft voting classifiers use the average class probability and then picks the class with the highest probability.

3. Is it possible to speed up training of a bagging ensemble by distributing it across multiple servers? What about pasting ensembles, boosting ensembles, random forests, or stacking ensembles?

**Solution**

* Bagging: yes

* Pasting: yes

* Boosting: no

* Random forests: yes

* Stacking: no

4. What is the benefit of out-of-bag evaluation?

**Solution**

Each predictor is evaluated using instances that it was not trained on. This makes it possible to have a fairly unbiased evaluation.

5. What makes Extra-Trees more random than regular Random Forests? How can this extra randomness help? Are Extra-Trees slower or faster than regular Random Forests?

**Solution**

Extra trees doesn't search for the best threshold for each feature but it uses random threshold, therefore it is faster than regular random forests.

6. If your AdaBoost ensemble underfits the training data, what hyperparameters should you tweak and how?

**Solution**

You can try to reduce the number of estimators or the regularization hyperparameters of the base estimators.

7. If your Gradient Boosting ensemble overfits the training set, should you increase or decrease the learning rate?

**Solution**

You should decrease the learning rate.

8. Load the MNIST data (introduced in Chapter 3), and split it into a training set, a validation set, and a test set (e.g., use 50,000 instances for training, 10,000 for validation, and 10,000 for testing). Then train various classifiers, such as a Random Forest classifier, an Extra-Trees classifier, and an SVM. Next, try to combine them into an ensemble that outperforms them all on the validation set, using a soft or hard voting classifier. Once you have found one, try it on the test set. How much better does it perform compared to the individual classifiers?

**Solution**

### Load

In [51]:
mnist = fetch_openml(MNIST_DATA_NAME, version=MNIST_DATA_VERSION, as_frame=False)

In [52]:
X, y = mnist["data"], mnist["target"]

### Split

In [53]:
X_validation, X_test, y_validation, y_test = train_test_split(
    X,
    y,
    test_size=10000,
    random_state=RANDOM_SEED,
)

In [54]:
X_train, X_validation, y_train, y_validation = train_test_split(
    X_validation,
    y_validation,
    test_size=10000,
    random_state=RANDOM_SEED,
)

### Train (individual)

In [55]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)
et_model = ExtraTreesClassifier(n_estimators=100, random_state=RANDOM_SEED)
svc_model = LinearSVC(max_iter=100, tol=20, random_state=RANDOM_SEED)

In [56]:
%%time
rf_model.fit(X_train, y_train)

CPU times: user 20.4 s, sys: 5.88 ms, total: 20.4 s
Wall time: 20.4 s


RandomForestClassifier(random_state=42)

In [57]:
%%time
et_model.fit(X_train, y_train)

CPU times: user 15.7 s, sys: 71.2 ms, total: 15.8 s
Wall time: 15.8 s


ExtraTreesClassifier(random_state=42)

In [58]:
%%time
svc_model.fit(X_train, y_train)

CPU times: user 499 ms, sys: 32.2 ms, total: 531 ms
Wall time: 530 ms


LinearSVC(max_iter=100, random_state=42, tol=20)

### Evaluate (individual)

In [59]:
rf_model.score(X_validation, y_validation), rf_model.score(X_test, y_test)

(0.9692, 0.9645)

In [60]:
et_model.score(X_validation, y_validation), et_model.score(X_test, y_test)

(0.9715, 0.9691)

In [61]:
svc_model.score(X_validation, y_validation), svc_model.score(X_test, y_test)

(0.859, 0.8566)

### Train (ensemble)

In [62]:
vot_model = VotingClassifier(
    estimators=[("rf", rf_model), ("et", et_model), ("svc", svc_model)],
    voting="hard",
)

In [63]:
%%time
vot_model.fit(X_train, y_train)

CPU times: user 37.7 s, sys: 1.16 s, total: 38.8 s
Wall time: 37.5 s


VotingClassifier(estimators=[('rf', RandomForestClassifier(random_state=42)),
                             ('et', ExtraTreesClassifier(random_state=42)),
                             ('svc',
                              LinearSVC(max_iter=100, random_state=42,
                                        tol=20))])

### Evaluate (ensemble)

In [64]:
vot_model.score(X_validation, y_validation), vot_model.score(X_test, y_test)

(0.9693, 0.965)