# Chapter 5: Iris, Moons & MNIST

This notebook contains the code for chapter 5 of the Hands-on Machine Learning with Scikit-Learn, Keras & Tensorflow book.

In [1]:
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.datasets import fetch_openml, load_iris, make_moons
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import LinearSVC, LinearSVR, SVC, SVR
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

from scipy.stats import reciprocal, uniform
import numpy as np

## Global configuration

In [2]:
MNIST_DATA_NAME = "mnist_784"
MNIST_DATA_VERSION = 1

RANDOM_SEED = 42

JOB_COUNT = 3

In [3]:
np.random.seed(RANDOM_SEED)

## Load <ins>iris</ins> data

In [4]:
iris = load_iris()

## Split <ins>iris</ins> data

In [5]:
X, y = iris.data[:, (2, 3)], (iris.target == 2).astype(np.float64)

## Train <ins>linear svc</ins> model

In [6]:
lsvc_model = Pipeline([
    ("standard_scaler", StandardScaler()),
    ("linear_svc", LinearSVC(C=1, loss="hinge")),
])

In [7]:
%%time
lsvc_model.fit(X, y)

CPU times: user 420 µs, sys: 807 µs, total: 1.23 ms
Wall time: 1.06 ms


Pipeline(steps=[('standard_scaler', StandardScaler()),
                ('linear_svc', LinearSVC(C=1, loss='hinge'))])

## Evaluate <ins>linear svc</ins> model

In [8]:
lsvc_model.predict([[5.5, 1.7]])

array([1.])

## Load <ins>moons</ins> data

In [9]:
X, y = make_moons(n_samples=100, noise=0.15, random_state=RANDOM_SEED)

## Train <ins>polynomial svc</ins> model

In [10]:
psvc = Pipeline([
    ("polynomial_features", PolynomialFeatures(degree=3)),
    ("standard_scaler", StandardScaler()),
    ("linear_svc", LinearSVC(C=10, loss="hinge")),
])

In [11]:
%%time
psvc.fit(X, y)

CPU times: user 855 µs, sys: 1.62 ms, total: 2.48 ms
Wall time: 1.89 ms




Pipeline(steps=[('polynomial_features', PolynomialFeatures(degree=3)),
                ('standard_scaler', StandardScaler()),
                ('linear_svc', LinearSVC(C=10, loss='hinge'))])

## Train <ins>polynomial kernel svc</ins> model

In [12]:
pksvc_model = Pipeline([
    ("standard_scaler", StandardScaler()),
    ("svc", SVC(kernel="poly", degree=3, coef0=1, C=5)),
])

In [13]:
%%time
pksvc_model.fit(X, y)

CPU times: user 617 µs, sys: 770 µs, total: 1.39 ms
Wall time: 976 µs


Pipeline(steps=[('standard_scaler', StandardScaler()),
                ('svc', SVC(C=5, coef0=1, kernel='poly'))])

## Train <ins>rbf kernel svc</ins> model

In [14]:
rksvc_model = Pipeline([
    ("standard_scaler", StandardScaler()),
    ("svc", SVC(kernel="rbf", gamma=5, C=0.001)),
])

In [15]:
%%time
rksvc_model.fit(X, y)

CPU times: user 1.97 ms, sys: 140 µs, total: 2.11 ms
Wall time: 1.28 ms


Pipeline(steps=[('standard_scaler', StandardScaler()),
                ('svc', SVC(C=0.001, gamma=5))])

## Generate <ins>linear</ins> data

In [16]:
m = 50

X = 2 * np.random.rand(m, 1)
y = (4 + 3 * X + np.random.randn(m, 1)).ravel()

## Train <ins>linear svr</ins> model

In [17]:
lsvr_model = LinearSVR(epsilon=1.5)

In [18]:
%%time
lsvr_model.fit(X, y)

CPU times: user 198 µs, sys: 372 µs, total: 570 µs
Wall time: 351 µs


LinearSVR(epsilon=1.5)

## Generate <ins>quadratic</ins> data

In [19]:
m = 100

X = 2 * np.random.rand(m, 1) - 1
y = (0.2 + 0.1 * X + 0.5 * X**2 + np.random.randn(m, 1)/10).ravel()

## Train <ins>polynomial kernel svr</ins> model

In [20]:
psvr_model = SVR(kernel="poly", degree=2, C=100, epsilon=0.1)

In [21]:
%%time
psvr_model.fit(X, y)

CPU times: user 8.25 ms, sys: 246 µs, total: 8.49 ms
Wall time: 8.14 ms


SVR(C=100, degree=2, kernel='poly')

# Exercises

1. What is the fundamental idea behind Support Vector Machines?

**Solution**

The idea behind support vector machines is to have the widest street possible between the separate classes.

2. What is a support vector?

**Solution**

A support vector is an instance located on the street or on the border of the street. Any instance that is no support vector has no influence on the model.

3. Why is it important to scale the inputs when using SVMs?

**Solution**

Because SVM will try to fit the largest street possible, if the inputs are not scaled SVM's will neglect smaller features.

4. Can an SVM classifier output a confidence score when it classifies an instance? What about a probability?

**Solution**

An SVM can compute the distance between an instance and the decision boundary (confidence score), but it cannot use this score as a probability.

5. Should you use the primal or the dual form of the SVM problem to train a model on a training set with millions of instances and hundreds of features?

**Solution**

If the training set contains millionis of instances and hundereds of features you should use the primal form.

6. Say you trained an SVM classifier with an RBF kernel. It seems to underfit the training set: should you increase or decrease γ (gamma)? What about C?

**Solution**

If the model underfits the training set, you should increase $\gamma$ (gamma) or C.

7. Train a LinearSVC on a linearly separable dataset. Then train an SVC and a SGDClassifier on the same dataset. See if you can get them to produce roughly the same model.

**Solution**

### Load

In [22]:
iris = load_iris()

In [23]:
X, y = iris["data"][:, (2, 3)], iris["target"]

In [24]:
X, y = X[(y == 0) | (y == 1)], y[(y == 0) | (y == 1)]

### Prepare

In [25]:
standard_scaler = StandardScaler()

In [26]:
%%time
X_scaled = standard_scaler.fit_transform(X)

CPU times: user 162 µs, sys: 300 µs, total: 462 µs
Wall time: 426 µs


### Train

In [27]:
C = 5
alpha = 1 / (C * len(X))

In [28]:
lsvc_model = LinearSVC(loss="hinge", C=C, random_state=RANDOM_SEED)
svc_model = SVC(kernel="linear", C=C, random_state=RANDOM_SEED)
sgd_model = SGDClassifier(loss="hinge", learning_rate="constant", eta0=0.001, alpha=alpha, random_state=RANDOM_SEED)

In [29]:
%%time
lsvc_model.fit(X_scaled, y)

CPU times: user 230 µs, sys: 419 µs, total: 649 µs
Wall time: 546 µs


LinearSVC(C=5, loss='hinge', random_state=42)

In [30]:
%%time
svc_model.fit(X_scaled, y)

CPU times: user 698 µs, sys: 216 µs, total: 914 µs
Wall time: 647 µs


SVC(C=5, kernel='linear', random_state=42)

In [31]:
%%time
sgd_model.fit(X_scaled, y)

CPU times: user 373 µs, sys: 677 µs, total: 1.05 ms
Wall time: 759 µs


SGDClassifier(alpha=0.002, eta0=0.001, learning_rate='constant',
              random_state=42)

### Evaluate

In [32]:
lsvc_model.intercept_, lsvc_model.coef_

(array([0.28475098]), array([[1.05364854, 1.09903804]]))

In [33]:
svc_model.intercept_, svc_model.coef_

(array([0.31896852]), array([[1.1203284 , 1.02625193]]))

In [34]:
sgd_model.intercept_, sgd_model.coef_

(array([0.117]), array([[0.77714169, 0.72981762]]))

8. Train an SVM classifier on the MNIST dataset. Since SVM classifiers are binary classifiers, you will need to use one-versus-all to classify all 10 digits. You may want to tune the hyperparameters using small validation sets to speed up the process. What accuracy can you reach?

**Solution**

### Load

In [35]:
mnist = fetch_openml(MNIST_DATA_NAME, version=MNIST_DATA_VERSION, as_frame=False)

In [36]:
X, y = mnist["data"], mnist["target"]

### Split

In [37]:
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

### Prepare

In [38]:
standard_scaler = StandardScaler()

In [39]:
%%time
X_train_scaled = standard_scaler.fit_transform(X_train.astype(np.float32))

CPU times: user 466 ms, sys: 64.2 ms, total: 530 ms
Wall time: 530 ms


In [40]:
%%time
X_test_scaled = standard_scaler.fit_transform(X_test.astype(np.float32))

CPU times: user 98.6 ms, sys: 0 ns, total: 98.6 ms
Wall time: 97.4 ms


### Train

In [41]:
parameters = {
    "gamma": reciprocal(0.001, 0.1),
    "C": uniform(1, 10),
}

In [42]:
randomized_search = RandomizedSearchCV(
    SVC(random_state=RANDOM_SEED),
    parameters,
    n_iter=10 ,
    cv=3,
    n_jobs=JOB_COUNT,
)

In [43]:
%%time
randomized_search.fit(X_train_scaled[:1000], y_train[:1000])

CPU times: user 232 ms, sys: 17.9 ms, total: 250 ms
Wall time: 2.8 s


RandomizedSearchCV(cv=3, estimator=SVC(random_state=42), n_jobs=3,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f9472ad6970>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f9472ad68e0>})

In [44]:
%%time
randomized_search.best_estimator_.fit(X_train_scaled, y_train)

CPU times: user 3min 25s, sys: 288 ms, total: 3min 25s
Wall time: 3min 25s


SVC(C=7.830067734163569, gamma=0.0013879611079315789, random_state=42)

## Evaluate

In [45]:
%%time
y_test_predictions = randomized_search.best_estimator_.predict(X_test_scaled)

CPU times: user 1min 14s, sys: 7.52 ms, total: 1min 14s
Wall time: 1min 14s


In [46]:
accuracy_score(y_test, y_test_predictions)

0.972