In [1]:
import numpy as np

np.random.seed(42)

In [2]:
'''
There are several ways to deal with an excess of dimensions. The two most notable methods are projection and
manifold learning. Projection deals with dropping extra dimensions and projecting the training instances onto
lower dimensions. Manifold learning preserves the variance of the training instances by determining the subspace
that the instances lie on, and determining a proper hyperplane that can fit the data. Within manifold learning, 
there are different techniques to achieve this hyperplane. The first we will explore is Principal Component Analysis(PCA)
'''

'\nThere are several ways to deal with an excess of dimensions. The two most notable methods are projection and\nmanifold learning. Projection deals with dropping extra dimensions and projecting the training instances onto\nlower dimensions. Manifold learning preserves the variance of the training instances by determining the subspace\nthat the instances lie on, and determining a proper hyperplane that can fit the data. Within manifold learning, \nthere are different techniques to achieve this hyperplane. The first we will explore is Principal Component Analysis(PCA)\n'

In [3]:
'''
We can determine the unit vectors that define our hyperplane using the Singular Value Decomposition(SVD) matrix
factorization technique. This allows us to decompose a training set matrix into the dot prodduct of three matrices,
one of them being the matrix containing all the principal components. 
'''
np.random.seed(4)
m = 60
w1, w2 = 0.1, 0.3
noise = 0.1

angles = np.random.rand(m) * 3 * np.pi / 2 - 0.5
X = np.empty((m, 3))
X[:, 0] = np.cos(angles) + np.sin(angles)/2 + noise * np.random.randn(m) / 2
X[:, 1] = np.sin(angles) * 0.7 + noise * np.random.randn(m) / 2
X[:, 2] = X[:, 0] * w1 + X[:, 1] * w2 + noise * np.random.randn(m)

# PCA assumes the data is centered around the origin. Scikit learn's API automatically takes care of centering
X_centered = X - X.mean(axis=0)
U, s, V = np.linalg.svd(X_centered)
c1 = V.T[:, 0]
c2 = V.T[:, 1]

In [4]:
'''
Once the the first d principal components have been found, we may project the data onto the hyperplane 
spanned by the d principal components. Using this hyperplane ensuyres that we will preserve as much variance
as possible. To project the training data, just compute the dot product of the training set matrix X by the 
matrix W, defined as the first d principal components. 
'''
W2 = V.T[:, :2]
X2D = X_centered.dot(W2)

In [5]:
# Scikit-Learn's PCA class implements PCA usig SVD
from sklearn.decomposition import PCA

pca = PCA(n_components = 2)
X2D = pca.fit_transform(X)

In [6]:
# Explained Variance Ratio displays how much variance is in each of the individual components
# Note, only 1.2% is not shown as this lies on the third axis, which suggests it carries little information
print(pca.explained_variance_ratio_)

[ 0.84248607  0.14631839]


In [7]:
# Let's figure out how many dimensions we actually need to maintain 95% variance
pca = PCA()
pca.fit(X)
cumsum = np.cumsum(pca.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.95) + 1
d

2

In [8]:
# Better alternative
pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X)

In [9]:
from six.moves import urllib
from sklearn.datasets import fetch_mldata
mnist = fetch_mldata('MNIST original')

from sklearn.model_selection import train_test_split

X = mnist["data"]
y = mnist["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [11]:
'''
PCA requires all the data to be available and in memory when running. Luckily, we can use incremental PCA
(IPCA), which splits the training set into mini-batches. 
'''
from sklearn.decomposition import IncrementalPCA

n_batches = 100
inc_pca = IncrementalPCA(n_components=154)
# I did not load in the mnist data set
for X_batch in np.array_split(X, n_batches):
    inc_pca.partial_fit(X_batch)
    
X_mnist_reduced = inc_pca.transform(X)

In [12]:
# Can use kernel PCA
from sklearn.decomposition import KernelPCA

rbf_pca = KernelPCA(n_components = 2, kernel="rbf", gamma=0.04)
X_reduced = rbf_pca.fit_transform(X)

MemoryError: 

In [None]:
# Let's use gridsearch to determine the best hyperparameters for the kernel PCA
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ("kpca", KernelPCA(n_components=2)),
    ("log_reg", LogisticRegression())
])

param_grid = [{
    "kpca__gamma": np.linspace(0.03, 0.05, 10),
    "kpca__kernel": ["rbf", "sigmoid"]
}]

grid_search = GridSearchCV(clf, param_grid, cv=3)
grid_search.fit(X, y)

In [None]:
print(grid_search.best_params_)