### Dimensionality Reduction

#### MNIST With Principal Component Analysis (PCA)

In [23]:
import numpy as np

In [24]:
from sklearn.datasets import fetch_openml

# Fetch version 1 of the "mnist_784" dataset from OpenML.
mnist = fetch_openml("mnist_784", version=1)

# Re-type the targets as 64-bit integers and store them back on the bunch.
mnist["target"] = mnist["target"].astype(np.int64)

In [25]:
from sklearn.model_selection import train_test_split

X = mnist["data"]
y = mnist["target"]

# Hold out 10,000 samples for testing. The split is seeded (with the same seed
# the classifiers in this notebook use) so that a fresh Restart & Run All
# produces the same partition and the timings/accuracies stay comparable
# from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=10000, random_state=11
)

In [26]:
from sklearn.ensemble import RandomForestClassifier

# Baseline classifier for the unreduced features: 10 trees, fixed seed.
rf_clf = RandomForestClassifier(
    n_estimators=10,
    random_state=11,
)

In [27]:
import time

# Time only the fit call on the original (unreduced) training data.
# perf_counter() is a monotonic, high-resolution clock, so the elapsed value
# cannot be skewed by system clock adjustments the way time.time() can.
original_data_train_start = time.perf_counter()
rf_clf.fit(X_train, y_train)
original_data_train_time = time.perf_counter() - original_data_train_start

In [28]:
# Seconds spent fitting the random forest on the original training data.
original_data_train_time

7.795207977294922

In [29]:
# Score the baseline random forest on the held-out test set.
rf_clf.score(X_test, y_test)

0.948

#### Dimensionality Reduction With Explained Variance Ratio Of 95%

In [30]:
from sklearn.decomposition import PCA

# n_components is given as a fraction (0.95) rather than a count, matching the
# 95% explained-variance target named in this section's heading.
pca = PCA(
    svd_solver="full",
    n_components=0.95,
)

# Fit on the training data only and keep its reduced representation.
X_train_reduced = pca.fit_transform(X_train)

In [31]:
# Per-component explained-variance ratios of the fitted PCA (one entry per kept component).
pca.explained_variance_ratio_

array([0.09747651, 0.07172315, 0.06147336, 0.05394125, 0.04904265,
       0.04304134, 0.03281495, 0.02875437, 0.02759226, 0.02341265,
       0.02112042, 0.02043308, 0.01704051, 0.01693138, 0.0158206 ,
       0.01482767, 0.01318209, 0.01276324, 0.01184139, 0.01154963,
       0.01067252, 0.01011199, 0.00958624, 0.00910295, 0.00885332,
       0.00837205, 0.00812999, 0.00784962, 0.00739628, 0.00686777,
       0.00655862, 0.00645629, 0.00600243, 0.00585679, 0.00565062,
       0.0054326 , 0.00506195, 0.00487308, 0.00478222, 0.0046711 ,
       0.00451755, 0.00443306, 0.00418076, 0.00396021, 0.00384503,
       0.00375952, 0.00359931, 0.00350356, 0.0033791 , 0.00319063,
       0.00316558, 0.00309421, 0.00297408, 0.00287255, 0.00283741,
       0.00270198, 0.00268342, 0.00256729, 0.002527  , 0.0024466 ,
       0.00240874, 0.00239978, 0.00228161, 0.00221164, 0.00213432,
       0.00206371, 0.00202467, 0.00194731, 0.00192003, 0.00189138,
       0.00186765, 0.00180132, 0.00175699, 0.0017406 , 0.00164

In [32]:
# Same hyperparameters and seed as the baseline forest, so only the input
# features differ between the two models.
rf_clf_PCA = RandomForestClassifier(
    n_estimators=10,
    random_state=11,
)

In [33]:
# Time only the fit call on the PCA-reduced training data (the PCA fit itself
# is not included in this measurement).
# perf_counter() is a monotonic, high-resolution clock, so the elapsed value
# cannot be skewed by system clock adjustments the way time.time() can.
pca_data_train_start = time.perf_counter()
rf_clf_PCA.fit(X_train_reduced, y_train)
pca_data_train_time = time.perf_counter() - pca_data_train_start

In [34]:
# Seconds spent fitting the random forest on the PCA-reduced training data.
pca_data_train_time

8.212119817733765

In [41]:
# Project the test set with the PCA that was fitted on the training data
# (transform only, no refit).
X_test_reduced = pca.transform(X_test)

In [35]:
# Score the PCA-trained forest on the already-projected test set; reusing
# X_test_reduced avoids running pca.transform(X_test) a second time.
rf_clf_PCA.score(X_test_reduced, y_test)

0.8952

#### Using Softmax Classifier

In [45]:
from sklearn.linear_model import LogisticRegression

# Softmax (multinomial logistic regression) baseline on the unreduced features.
log_clf = LogisticRegression(random_state=11, solver="lbfgs", multi_class="multinomial")

# NOTE(review): the recorded run of this cell ends with
# "TOTAL NO. of ITERATIONS REACHED LIMIT", i.e. lbfgs stopped at its iteration
# cap. The timing therefore measures a truncated fit; scale the inputs or
# raise max_iter if a converged model is required.
# perf_counter() is used because it is monotonic and high-resolution, which
# time.time() does not guarantee.
t0 = time.perf_counter()
log_clf.fit(X_train, y_train)
t1 = time.perf_counter()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [46]:
# Report the elapsed fit time (t1 - t0) with two decimals.
print(f"logistic regression training time: {t1-t0:.2f} seconds")

logistic regression training time: 14.91 seconds


In [47]:
from sklearn.metrics import accuracy_score

# Predict on the unreduced test set, then compare against the true labels.
y_pred = log_clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9238

In [50]:
# Same softmax configuration as log_clf, trained on the PCA-reduced features.
log_clf2 = LogisticRegression(random_state=11, solver="lbfgs", multi_class="multinomial")

# NOTE(review): the recorded run of this cell also ends with
# "TOTAL NO. of ITERATIONS REACHED LIMIT", so the timing measures a fit that
# stopped at the iteration cap rather than a converged one.
# perf_counter() is used because it is monotonic and high-resolution, which
# time.time() does not guarantee.
t0 = time.perf_counter()
log_clf2.fit(X_train_reduced, y_train)
t1 = time.perf_counter()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [51]:
# Report the elapsed fit time (t1 - t0) for the PCA-reduced model with two decimals.
print(f"logistic regression with PCA training time: {t1-t0:.2f} seconds")

logistic regression with PCA training time: 5.11 seconds


In [53]:
# Predict on the PCA-projected test set, then compare against the true labels.
y_pred = log_clf2.predict(X_test_reduced)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9206

#### MNIST Handwritten Digits Visualization With t-SNE

In [54]:
from sklearn.manifold import TSNE

# 2-D embedding for visualization. t-SNE is stochastic, so the embedder is
# seeded (same seed as the other estimators in this notebook) to make the
# resulting layout reproducible across runs.
tsne_embedder = TSNE(n_components=2, random_state=11)

In [None]:
# Fit t-SNE on the full training set and keep the resulting embedding.
# NOTE(review): this cell has no recorded execution count or output — confirm
# that it has actually been run to completion on the full training set.
X_train_embedded = tsne_embedder.fit_transform(X_train)
# Bare last expression so the notebook displays the embedding.
X_train_embedded