## Principal Components

In [20]:
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=10000, noise=0.15)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
import numpy as np

X_centered = X - X.mean(axis=0)
U, s, Vt = np.linalg.svd(X_centered)
c1 = Vt.T[:, 0]
c2 = Vt.T[:, 1]

In [7]:
c2

array([-0.31924247, -0.94767307])

In [8]:
W2 = Vt.T[:, :2]
X2D = X_centered.dot(W2)

In [17]:
#using sklearn

from sklearn.decomposition import PCA

pca= PCA(n_components=2)
X2D = pca.fit_transform(X)

In [18]:
pca.explained_variance_ratio_

array([0.80939285, 0.19060715])

## Choosing the right no of dimensions

In [21]:
pca = PCA()
pca.fit(X_train)
cumsum = np.cumsum(pca.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.95) + 1

## PCA for Compression

In [24]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version = 1)
X, y = mnist['data'], mnist['target']

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
pca = PCA(n_components=154)
X_reduced = pca.fit_transform(X_train)
X_recovered = pca.inverse_transform(X_reduced)

## Randomized PCA

In [27]:
rnd_pca = PCA(n_components = 154, svd_solver='randomized')
X_reduced = rnd_pca.fit_transform(X_train)
print(X_train.shape)
print(X_reduced.shape)

(56000, 784)
(56000, 154)


## Incremental PCA

In [28]:
from sklearn.decomposition import IncrementalPCA

n_batches = 100
inc_pca = IncrementalPCA(n_components = 154)
for X_batch in np.array_split(X_train, n_batches):
    inc_pca.partial_fit(X_batch)
X_reduced = inc_pca.transform(X_train)

## Kernel PCA

In [29]:
from sklearn.decomposition import KernelPCA

rbf_pca = KernelPCA(n_components=2, kernel='rbf', gamma=0.04)
X_reduced = rbf_pca.fit_transform(X)

MemoryError: Unable to allocate 36.5 GiB for an array with shape (70000, 70000) and data type float64

### Selecting a Kernel and Tuning Hyperparameters

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('kpca', KernelPCA(n_components=2)),
    ('log_reg', LogisticRegression())
])

param_grid = [{
    'kpca_gamma': np.linspace(0.03, 0.04, 10),
    'kpca_kernel': ['rbf', 'sigmoid']
}]

grid_search = GridSearchCV(clf, param_grid, cv=3)
grid_search.fit(X, y)

# LLE

In [None]:
from sklearn.manifold import LocallyLinearEmbedding

lle = LocallyLinearEmbedding(n_components=2, n_neighbors=10)
X_reduced = lle.fit_transform(X)