#  Dimensionality Reduction

In [1]:
# Load (or reload) the autoreload extension and set mode 2, so imported modules
# are re-imported automatically before each cell executes.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## Imports

In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import make_moons, fetch_openml
from sklearn.decomposition import PCA, KernelPCA

In [3]:
# Toy 2-D dataset: 1000 points on two interleaving half-circles with Gaussian noise.
# random_state is fixed so the data (and every number derived from it below) is
# reproducible on Restart & Run All; re-run the notebook to refresh recorded outputs.
X, y = make_moons(n_samples=1000, noise=0.4, random_state=42)

## PCA via SVD (manual approach)

In [4]:
# PCA works on zero-mean data, so subtract the per-feature mean first.
X_centered = X - np.mean(X, axis=0)

# SVD of the centered data; the rows of vt are the right singular vectors,
# i.e. the principal axes ordered by decreasing singular value.
U, s, vt = np.linalg.svd(X_centered)

# First and second principal axes.
c1, c2 = vt[0], vt[1]

In [5]:
# Display the two principal axes obtained from the SVD.
c1, c2

(array([ 0.94816706, -0.31777229]), array([-0.31777229, -0.94816706]))

In [6]:
# Projection matrix: the first two principal axes as columns.
w2 = vt[:2].T

# Project the centered samples onto those axes.
X2d = X_centered @ w2
X2d

array([[-1.46624873,  1.06879525],
       [-0.86051497,  0.00390033],
       [-0.99032192,  0.23821095],
       ...,
       [ 0.79832907,  1.52011665],
       [-0.64376907, -0.1631367 ],
       [ 1.10665917, -0.26677589]])

## PCA using sklearn

In [7]:
# n_components=2 keeps two principal components (the target dimensionality).
# fit_transform takes the raw X; no manual centering is done here.
# Note: in the recorded outputs the first component has the opposite sign to the
# manual SVD projection -- a principal axis is only defined up to sign.
pca = PCA(n_components=2)
X2d = pca.fit_transform(X)

In [8]:
# Display the sklearn projection (this overwrote the manual X2d computed earlier).
X2d

array([[ 1.46624873,  1.06879525],
       [ 0.86051497,  0.00390033],
       [ 0.99032192,  0.23821095],
       ...,
       [-0.79832907,  1.52011665],
       [ 0.64376907, -0.1631367 ],
       [-1.10665917, -0.26677589]])

In [9]:
# First principal axis as found by sklearn (components_ stores one axis per row).
# Compare with c1 from the manual SVD: same direction, possibly opposite sign.
pca.components_.T[:, 0]

array([-0.94816706,  0.31777229])

In [10]:
# A float n_components in (0, 1) asks PCA to keep the smallest number of
# components whose cumulative explained variance reaches that fraction (95% here).
# For this 2-D dataset both components end up being kept.
pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X)
X_reduced

array([[ 1.46624873,  1.06879525],
       [ 0.86051497,  0.00390033],
       [ 0.99032192,  0.23821095],
       ...,
       [-0.79832907,  1.52011665],
       [ 0.64376907, -0.1631367 ],
       [-1.10665917, -0.26677589]])

In [11]:
# Fraction of the total variance captured by each kept component.
pca.explained_variance_ratio_

array([0.7267767, 0.2732233])

In [12]:
# Fetch version 1 of the 'mnist_784' dataset from OpenML.
# NOTE(review): this is a network download on first use and can be slow;
# presumably scikit-learn caches it locally afterwards -- verify for your setup.
mnist = fetch_openml('mnist_784', version=1)

In [13]:
# Rebind X / y to the MNIST features and labels.
# From here on they no longer refer to the moons dataset used above.
X = mnist['data']
y = mnist['target']

In [14]:
# (n_samples, n_features) of the MNIST feature matrix.
X.shape

(70000, 784)

In [15]:
# Keep the smallest number of components that preserves 95% of the variance.
# This rebinds `pca`, which previously held the model fitted on the moons data.
pca = PCA(n_components=0.95)
X_mnist_red = pca.fit_transform(X)

In [16]:
# The second dimension is the number of components PCA selected for 95% variance.
X_mnist_red.shape

(70000, 154)

In [17]:
# Randomized SVD solver: an approximate, faster alternative to the full SVD when
# only the leading components are needed.
# 154 is the number of components the 95%-variance PCA selected on MNIST above.
# The solver is stochastic, so random_state is fixed to make the result
# reproducible across runs; re-run the notebook to refresh recorded outputs.
rnd_pca = PCA(n_components=154, svd_solver='randomized', random_state=42)
X_reduced = rnd_pca.fit_transform(X)

In [18]:
# Display the MNIST projection produced by the randomized-solver PCA.
X_reduced

array([[ 1.22255255e+02, -3.16233844e+02, -5.11318309e+01, ...,
        -1.96132444e-01, -4.04308049e+00, -3.64031845e+01],
       [ 1.01049400e+03, -2.89963621e+02,  5.76120745e+02, ...,
        -2.92220858e+01,  2.05942023e+01,  2.28274887e+01],
       [-5.89959472e+01,  3.93697445e+02, -1.61998184e+02, ...,
        -6.43215007e+01, -9.89294709e+01,  3.90537119e+01],
       ...,
       [-2.71507013e+02,  5.90078500e+02,  3.41368869e+02, ...,
        -4.79669213e+00,  5.12463110e+01, -3.55745702e+01],
       [-3.10224823e+02, -1.16727151e+02,  6.35719997e+02, ...,
        -5.89953965e+01,  2.98765684e+01, -7.90140473e+00],
       [ 1.05886213e+03, -8.33925384e+01,  7.31342184e+02, ...,
        -1.18093341e+01,  3.84329280e+01,  2.14064192e+01]])

## Kernel PCA

In [21]:
# KernelPCA materialises a dense (n_samples, n_samples) float64 kernel matrix.
# On all 70,000 MNIST rows that is 70000 x 70000 x 8 bytes ~= 36.5 GiB, which
# raises MemoryError. Fit on a reproducible random subset instead.
n_kpca_samples = 5000  # 5000 x 5000 float64 kernel matrix ~= 200 MB
kpca_rng = np.random.RandomState(42)
n_total = X.shape[0]
subset_idx = kpca_rng.choice(n_total, size=min(n_kpca_samples, n_total), replace=False)

# fetch_openml may return a DataFrame or an ndarray depending on the
# scikit-learn version, so select rows positionally in either case.
X_subset = X.iloc[subset_idx] if hasattr(X, "iloc") else X[subset_idx]

# NOTE(review): the features look unscaled (presumably raw 0-255 pixel values);
# with gamma=0.04 the RBF kernel may be ~0 for almost every pair of samples.
# Consider scaling the features or lowering gamma -- confirm before relying on this.
kernel_pca = KernelPCA(n_components=154, kernel='rbf', gamma=0.04)
X_reduced = kernel_pca.fit_transform(X_subset)
X_reduced