# Chapter 8: Generated, MNIST & Swiss Roll

This notebook contains the code for Chapter 8 (dimensionality reduction) of the book *Hands-On Machine Learning with Scikit-Learn, Keras & TensorFlow*.

In [1]:
from sklearn.decomposition import IncrementalPCA, KernelPCA, PCA
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.datasets import fetch_openml, make_swiss_roll
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

import numpy as np

## Global configuration

In [2]:
# Name and version of the dataset requested from OpenML via fetch_openml.
MNIST_DATA_NAME = "mnist_784"
MNIST_DATA_VERSION = 1

# Seed for NumPy's global RNG and for the `random_state` of seeded estimators.
RANDOM_SEED = 42

# Value passed as `n_jobs` to the grid search.
JOB_COUNT = 3

In [3]:
# Seed NumPy's global random number generator (used by the np.random calls below).
np.random.seed(RANDOM_SEED)

## Generate <ins>3d</ins> data

In [4]:
# Sample count, noise scale and the two coefficients that tie the third
# coordinate to the first two.
m, noise = 60, 0.1
w1, w2 = 0.1, 0.3

# One random angle per sample.
angles = np.random.rand(m) * 3 * np.pi / 2 - 0.5

# Build the coordinates one at a time. The draw order (x0, x1, x2) fixes how
# the seeded global RNG is consumed, so it must not be rearranged.
x0 = np.cos(angles) + np.sin(angles) / 2 + noise * np.random.randn(m) / 2
x1 = np.sin(angles) * 0.7 + noise * np.random.randn(m) / 2
# Third coordinate: linear combination of the first two plus noise.
x2 = x0 * w1 + x1 * w2 + noise * np.random.randn(m)

# Assemble the (m, 3) data matrix.
X = np.column_stack((x0, x1, x2))

## Reduce dimensionality <ins>pca</ins> (numpy)

In [5]:
# Center the data on its per-feature mean, then decompose it with SVD.
X_centered = X - X.mean(axis=0)
U, s, Vt = np.linalg.svd(X_centered)

# The rows of Vt are the columns of Vt.T; keep the first two as c1 and c2.
c1, c2 = Vt[0], Vt[1]

In [6]:
%%time
# Project the centered data onto the first two rows of Vt (as column vectors).
X_reduced = X_centered @ Vt[:2].T

CPU times: user 17 µs, sys: 40 µs, total: 57 µs
Wall time: 59.6 µs


## Reduce dimensionality <ins>pca</ins> (scikit-learn)

In [7]:
# PCA estimator that keeps two components.
pca = PCA(n_components=2)

In [8]:
%%time
X_reduced = pca.fit_transform(X)

CPU times: user 229 µs, sys: 532 µs, total: 761 µs
Wall time: 584 µs


In [9]:
pca.explained_variance_ratio_

array([0.85406025, 0.13622918])

## Load <ins>MNIST</ins> data

In [10]:
mnist = fetch_openml(MNIST_DATA_NAME, version=MNIST_DATA_VERSION, as_frame=False)

In [11]:
X, y = mnist["data"], mnist["target"]

## Split <ins>MNIST</ins> data

In [12]:
# Pass the seed explicitly so the split is reproducible no matter how much of
# NumPy's global random state earlier cells have consumed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_SEED)

## Reduce dimensionality <ins>pca</ins>

In [13]:
pca = PCA(n_components=0.95)

In [14]:
%%time
X_train_reduced = pca.fit_transform(X_train)

CPU times: user 35.9 s, sys: 9.82 s, total: 45.8 s
Wall time: 3.35 s


## Increase dimensionality <ins>pca</ins>

In [15]:
X_train_recovered = pca.inverse_transform(X_train_reduced)

## Reduce dimensionality <ins>randomized pca</ins>

In [16]:
# NOTE(review): 154 is presumably the number of components the 95%-variance
# PCA above selected — confirm against `pca.n_components_`.
# The randomized SVD solver is stochastic, so seed it for reproducible results.
rnd_pca = PCA(n_components=154, svd_solver="randomized", random_state=RANDOM_SEED)

In [17]:
%%time
X_train_reduced = rnd_pca.fit_transform(X_train)

CPU times: user 31.5 s, sys: 28.3 s, total: 59.8 s
Wall time: 4.05 s


## Reduce dimensionality <ins>incremental pca</ins>

In [18]:
inc_pca = IncrementalPCA(n_components=154)

In [19]:
%%time
# Split the training set into mini-batches and feed them to the
# incremental PCA one at a time.
n_batches = 100
train_batches = np.array_split(X_train, n_batches)

for train_batch in train_batches:
    inc_pca.partial_fit(train_batch)

CPU times: user 2min 2s, sys: 2min 43s, total: 4min 46s
Wall time: 18.6 s


In [20]:
X_train_reduced = inc_pca.transform(X_train)

## Load <ins>swiss roll</ins> data

In [21]:
X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=RANDOM_SEED)

In [22]:
# Binary target: whether t (the second value returned by make_swiss_roll) exceeds 6.9.
y = t > 6.9

## Reduce dimensionality <ins>kernel pca</ins>

In [23]:
# Kernel PCA with an RBF kernel, keeping two components.
rbf_pca = KernelPCA(
    n_components=2,
    kernel="rbf",
    gamma=0.04,
)

In [24]:
%%time
X_reduced = rbf_pca.fit_transform(X)

CPU times: user 295 ms, sys: 1.04 s, total: 1.34 s
Wall time: 87.4 ms


## Train <ins>logistic regression</ins> model

In [25]:
# Two-stage pipeline: kernel PCA down to two components, then logistic
# regression. The step names are the prefixes used in the parameter grid.
pipeline_steps = [
    ("kernel_pca", KernelPCA(n_components=2)),
    ("logistic_regression", LogisticRegression()),
]
lr_model = Pipeline(pipeline_steps)

In [26]:
# Candidate values for the kernel PCA step of the pipeline.
gamma_candidates = np.linspace(0.03, 0.05, 10)
kernel_candidates = ["rbf", "sigmoid"]

# Single grid; keys follow the "<step name>__<parameter>" pattern.
parameters = [
    {
        "kernel_pca__gamma": gamma_candidates,
        "kernel_pca__kernel": kernel_candidates,
    }
]

In [27]:
# 3-fold cross-validated search over the parameter grid, run in parallel.
grid_search = GridSearchCV(
    estimator=lr_model,
    param_grid=parameters,
    cv=3,
    n_jobs=JOB_COUNT,
)

In [28]:
%%time
grid_search.fit(X, y)

CPU times: user 324 ms, sys: 624 ms, total: 947 ms
Wall time: 1.97 s


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('kernel_pca',
                                        KernelPCA(n_components=2)),
                                       ('logistic_regression',
                                        LogisticRegression())]),
             n_jobs=3,
             param_grid=[{'kernel_pca__gamma': array([0.03      , 0.03222222, 0.03444444, 0.03666667, 0.03888889,
       0.04111111, 0.04333333, 0.04555556, 0.04777778, 0.05      ]),
                          'kernel_pca__kernel': ['rbf', 'sigmoid']}])

In [29]:
grid_search.best_params_

{'kernel_pca__gamma': 0.043333333333333335, 'kernel_pca__kernel': 'rbf'}

## Reduce dimensionality <ins>kernel pca</ins>

In [30]:
# Reuse the hyperparameters selected by the grid search rather than a
# hand-copied, rounded gamma that goes stale if the search result changes.
best_params = grid_search.best_params_

# fit_inverse_transform=True is required for the inverse_transform call below.
rbf_pca = KernelPCA(
    n_components=2,
    kernel=best_params["kernel_pca__kernel"],
    gamma=best_params["kernel_pca__gamma"],
    fit_inverse_transform=True,
)

In [31]:
%%time
X_reduced = rbf_pca.fit_transform(X)

CPU times: user 625 ms, sys: 1.65 s, total: 2.28 s
Wall time: 147 ms


In [32]:
X_recovered = rbf_pca.inverse_transform(X_reduced)

## Evaluate dimensionality reduction <ins>kernel pca</ins> (reconstruction error)

In [33]:
mean_squared_error(X, X_recovered)

1.4105508609793482e-26

## Reduce dimensionality <ins>lle</ins>

In [34]:
# LLE down to two dimensions using 10 neighbors per point. Seed the estimator
# so that its ARPACK eigen-solver path gives reproducible embeddings.
lle = LocallyLinearEmbedding(n_components=2, n_neighbors=10, random_state=RANDOM_SEED)

In [35]:
%%time
X_reduced = lle.fit_transform(X)

CPU times: user 464 ms, sys: 1.15 s, total: 1.61 s
Wall time: 132 ms


# Exercises

1. What are the main motivations for reducing a dataset’s dimensionality? What are the main drawbacks?

**Solution**

Pros:
 * Speed up training (and remove noise and redundant features in some cases)
 * Easier for visualizing data
 * Saves space (and memory)

Cons:
 * Some information is lost
 * Can be computationally intensive
 * Adds complexity to pipeline
 * Transformed features are hard to interpret

2. What is the curse of dimensionality?

**Solution**

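The curse of dimensionality refers to the fact that many problems that do not exist in low-dimensional space arise in high-dimensional space. In machine learning, a common manifestation is that high-dimensional training data tends to be very sparse, which makes it harder to find patterns and increases the risk of overfitting.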

3. Once a dataset’s dimensionality has been reduced, is it possible to reverse the operation? If so, how? If not, why?

**Solution**

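It is not possible to exactly reverse the operation because some information gets lost during the dimensionality reduction. Some algorithms (such as PCA, via `inverse_transform` as shown above) can produce an approximate reconstruction of the original data, but others provide no such reverse transformation.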

4. Can PCA be used to reduce the dimensionality of a highly nonlinear dataset?

**Solution**

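Yes, PCA can be used to reduce the dimensionality of nonlinear datasets, since it can at least discard useless dimensions. However, if there are no useless dimensions (as in the Swiss roll dataset), reducing dimensionality with PCA loses too much information.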

5. Suppose you perform PCA on a 1,000-dimensional dataset, setting the explained variance ratio to 95%. How many dimensions will the resulting dataset have?

**Solution**

That depends on the dataset. If all the points are perfectly aligned, PCA can reduce the dataset down to one dimension. If all the points are perfectly random, then PCA would need roughly 950 dimensions.

6. In what cases would you use vanilla PCA, Incremental PCA, Randomized PCA, or Kernel PCA?

**Solution**

Vanilla PCA: if the dataset fits in memory.

Incremental PCA: if the dataset doesn't fit in memory.

Randomized PCA: if the dataset fits in memory and the dimensions need to be considerably reduced.

Kernel PCA: if the dataset is nonlinear.

7. How can you evaluate the performance of a dimensionality reduction algorithm on your dataset?

**Solution**

By using the reconstruction error or, if the dimensionality reduction algorithm is used as a preprocessing step, by measuring the performance of the second algorithm.

8. Does it make any sense to chain two different dimensionality reduction algorithms?

**Solution**

Yes, there are various types of dimensionality reduction algorithms, so it can be useful to apply a rough algorithm first and then a finer algorithm to its output.

9. Load the MNIST dataset (introduced in Chapter 3) and split it into a training set and a test set (take the first 60,000 instances for training, and the remaining 10,000 for testing). Train a Random Forest classifier on the dataset and time how long it takes, then evaluate the resulting model on the test set. Next, use PCA to reduce the dataset’s dimensionality, with an explained variance ratio of 95%. Train a new Random Forest classifier on the reduced dataset and see how long it takes. Was training much faster? Next evaluate the classifier on the test set: how does it compare to the previous classifier?

**Solution**

### Load

In [36]:
# First 60,000 instances form the training set; the rest form the test set.
split_index = 60000
mnist_features, mnist_labels = mnist['data'], mnist['target']

X_train, X_test = mnist_features[:split_index], mnist_features[split_index:]
y_train, y_test = mnist_labels[:split_index], mnist_labels[split_index:]

### Train

In [37]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)

In [38]:
%%time
rf_model.fit(X_train, y_train)

CPU times: user 29.2 s, sys: 1.08 s, total: 30.3 s
Wall time: 28.9 s


RandomForestClassifier(random_state=42)

### Evaluate

In [39]:
%%time
y_test_predictions = rf_model.predict(X_test)

CPU times: user 225 ms, sys: 4.79 ms, total: 230 ms
Wall time: 229 ms


In [40]:
accuracy_score(y_test, y_test_predictions)

0.9705

### Dimensionality

In [41]:
pca = PCA(n_components=0.95)

In [42]:
%%time
pca.fit(X_train)

CPU times: user 43.9 s, sys: 13.3 s, total: 57.2 s
Wall time: 4.08 s


PCA(n_components=0.95)

In [43]:
X_train_reduced = pca.transform(X_train)

In [44]:
X_test_reduced = pca.transform(X_test)

### Train

In [45]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)

In [46]:
%%time
rf_model.fit(X_train_reduced, y_train)

CPU times: user 1min 12s, sys: 1.23 s, total: 1min 13s
Wall time: 1min 12s


RandomForestClassifier(random_state=42)

### Evaluate

In [47]:
%%time
y_test_predictions = rf_model.predict(X_test_reduced)

CPU times: user 200 ms, sys: 0 ns, total: 200 ms
Wall time: 198 ms


In [48]:
accuracy_score(y_test, y_test_predictions)

0.9481