In [1]:
import numpy as np
from sklearn.datasets import fetch_openml
import matplotlib.pyplot as plt

np.random.seed(42)
%matplotlib inline

In [2]:
# Load the 'mnist_784' dataset from OpenML as a (features, labels) pair.
# as_frame=False pins the return type to NumPy arrays; otherwise the container
# type depends on the installed scikit-learn version's default.
X, y = fetch_openml('mnist_784', return_X_y=True, as_frame=False)

# Positional split: the first N_TRAIN rows are used for training and the
# remaining rows for testing.
N_TRAIN = 60000
x_train, x_test = X[:N_TRAIN], X[N_TRAIN:]
y_train, y_test = y[:N_TRAIN], y[N_TRAIN:]

# Fail early if the download was truncated or features/labels are misaligned.
assert len(x_train) == len(y_train) == N_TRAIN
assert len(x_test) == len(y_test) > 0

In [3]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with library-default hyperparameters; the fixed random_state
# makes the fitted trees repeatable from run to run.
rf = RandomForestClassifier(
    random_state=42,
)

In [4]:
%%timeit
rf.fit(x_train, y_train)

28.4 s ± 33.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [5]:
from sklearn.metrics import accuracy_score

# Score the forest trained on the original features against the test split.
y_pred_full = rf.predict(x_test)
accuracy_score(y_test, y_pred_full)

0.9705

In [6]:
# Now applying PCA and then training

from sklearn.decomposition import PCA

# A fractional n_components asks PCA to keep as many components as are needed
# to retain that share (here 95%) of the variance.
pca = PCA(n_components=0.95)

# Fit the projection on the training data only and project it in one call.
x_train_red = pca.fit_transform(X=x_train)

In [8]:
# Shape is (components kept, original features): only 154 of the 784
# original dimensions are retained.
np.shape(pca.components_)

(154, 784)

In [9]:
# Almost thrice more time required, showing that reducing dataset doesn't
# always speed up training and it depends on algorithm + type pf data
%%timeit
rf.fit(x_train_red, y_train)

1min 16s ± 1.77 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
# Project the test set with the PCA fitted on the training data, then score
# the forest trained on the reduced features. Accuracy drops by roughly
# 2 percentage points compared with training on the original features.
x_test_red = pca.transform(x_test)
accuracy_score(y_test, rf.predict(x_test_red))

0.9481