# Exercise 9.

Load the MNIST dataset (introduced in Chapter 3) and split it into a training set and a
test set (take the first 60,000 instances for training, and the remaining 10,000 for testing). Train
a `RandomForestClassifier` on the dataset and time how long it takes, then evaluate the resulting model
on the test set. Next, use PCA to reduce the dataset's dimensionality, with an explained variance ratio
of 95%. Train a new random forest classifier on the reduced dataset and see how long it takes. Was training
much faster? Next, evaluate the classifier on the test set. How does it compare to the previous classifier?
Try again with an `SGDClassifier`. How much does PCA help now?

## Step 1. Load and split the dataset

In [1]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

In [3]:
# Fetch the MNIST dataset from OpenML ("mnist_784", version 1) with
# as_frame=False, then pull out the feature matrix and the label vector.
mnist = fetch_openml("mnist_784", version=1, as_frame=False, parser="auto")
X, y = mnist.data, mnist.target

In [5]:
# Hold out 10,000 instances for testing; everything else is used for training.
# NOTE(review): train_test_split draws a seeded random split, which is not the
# "first 60,000 / remaining 10,000" split described in the exercise text above.
split = train_test_split(X, y, test_size=10_000, random_state=42)
X_train, X_test, y_train, y_test = split

## Step 2. Train and test classifiers without PCA

In [23]:
import time
from functools import wraps

def timeit(func):
    """Decorate ``func`` so that each call also reports its wall-clock duration.

    The decorated function forwards all positional and keyword arguments to
    ``func`` unchanged.

    Returns:
        A wrapper whose return value is the tuple ``(result, execution_time)``,
        where ``result`` is whatever ``func`` returned and ``execution_time``
        is the elapsed time in seconds (float).
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter() is monotonic and high-resolution, so the measured
        # interval cannot be distorted by system clock adjustments.
        start_time = time.perf_counter()
        # Keyword arguments must be forwarded with ** (a single * would pass
        # the dict's keys as extra positional arguments).
        result = func(*args, **kwargs)
        execution_time = time.perf_counter() - start_time
        return result, execution_time
    return wrapper

# from chapter_07/common/model_utils.py
@timeit
def evaluate_single_classifier(X_train, y_train, X_test, y_test, clf):
    """Fit ``clf`` on the training data and score it on the test data.

    The undecorated body returns ``(clf, accuracy)``, where ``accuracy`` is the
    value of ``clf.score(X_test, y_test)``. Because of the ``@timeit``
    decorator, callers actually receive ``((clf, accuracy), execution_time)``;
    the measured time covers both fitting and scoring.
    """
    clf.fit(X_train, y_train)
    return clf, clf.score(X_test, y_test)

def train_and_evaluate_classifiers(
    classifiers, X_train, y_train, X_validation, y_validation
):
    """
    Train each classifier on the full training set and print its accuracy.

    Args:
        classifiers: Iterable of (name, classifier) pairs.
        X_train, y_train: Data each classifier is fitted on.
        X_validation, y_validation: Data each classifier is scored on.

    Returns:
        trained_models: List of tuples (name, trained_classifier)
    """
    trained_models = []

    for name, clf in classifiers:
        # evaluate_single_classifier is wrapped by @timeit, so it returns
        # ((fitted_clf, accuracy), execution_time) rather than a plain pair.
        # The timing is not needed here and is discarded.
        (fitted_clf, accuracy), _ = evaluate_single_classifier(
            X_train, y_train, X_validation, y_validation, clf
        )
        print(f"{name}'s accuracy on the validation set is {accuracy:.4f}")

        trained_models.append((name, fitted_clf))

    return trained_models

In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier

def classifiers_factory():
    """Build a fresh list of (name, unfitted classifier) pairs.

    Every call creates new estimator instances, so the experiments with and
    without PCA never share a fitted model. Both estimators use
    ``random_state=42``.
    """
    random_forest = RandomForestClassifier(random_state=42)
    sgd = SGDClassifier(random_state=42)
    return [("random_forest", random_forest), ("sgd", sgd)]


In [25]:
# Baseline run: fit and score each classifier on the original (non-reduced) features.
base_classifiers = classifiers_factory()
for name, clf in base_classifiers:
    # The @timeit wrapper returns ((clf, accuracy), seconds); the time covers fit + score.
    outcome, timing = evaluate_single_classifier(X_train, y_train, X_test, y_test, clf)
    fitted_clf, accuracy = outcome
    print(f"{name} took {timing:.4f} seconds to execute. It had the accuracy of {accuracy:.4f}")

random_forest took 18.6954 seconds to execute. It had the accuracy of 0.9674
sgd took 60.8442 seconds to execute. It had the accuracy of 0.8691


## Step 3. Use PCA to reduce dimensions by keeping variance at 95%

In [28]:
from sklearn.decomposition import PCA
import numpy as np

In [29]:
# Fit PCA with every component kept, then find the smallest number of leading
# components whose cumulative explained variance ratio reaches 95%.
pca = PCA().fit(X_train)
cumsum = pca.explained_variance_ratio_.cumsum()
# argmax gives the index of the first True; +1 turns that index into a count
d = np.argmax(cumsum >= 0.95) + 1
print(f"optimal number of retained dimensions for the MNIST dataset (to keep variance at 95%) is {d}")

optimal number of retained dimensions for the MNIST dataset (to keep variance at 95%) is 154


In [31]:
# obtain the reduced training set: fit a PCA that keeps d components on the
# training data and project the training data onto those components
pca = PCA(n_components=d)
X_train_reduced = pca.fit_transform(X_train)

# and the reduced test set
# we need to use transform() instead of fit_transform() here; this
# 1. ensures we use the same principal components in training and testing
# 2. makes sure that the model is evaluated on data transformed in the same way as it was trained
# 3. prevents data leakage (the test set never influences the fitted components)
X_test_reduced = pca.transform(X_test)

## Step 4. Train and test classifiers with PCA

In [36]:
# PCA run: same classifiers (fresh instances), fitted and scored on the reduced features.
classifiers_with_pca = classifiers_factory()
for name, clf in classifiers_with_pca:
    # The @timeit wrapper returns ((clf, accuracy), seconds); the time covers fit + score.
    outcome, timing = evaluate_single_classifier(X_train_reduced, y_train, X_test_reduced, y_test, clf)
    fitted_clf, accuracy = outcome
    print(f"{name} with PCA took {timing:.4f} seconds to execute. It had the accuracy of {accuracy:.4f}")

random_forest with PCA took 83.1906 seconds to execute. It had the accuracy of 0.9471
sgd with PCA took 15.8823 seconds to execute. It had the accuracy of 0.8862
