In [25]:
from sklearn.datasets import load_iris
import numpy as np

In [58]:
# Load the iris dataset; X takes the first four feature columns, y the target labels.
data = load_iris()
X = data.data[:, :4]
y = data.target

## Principal Components 

In [59]:
#Singular Value Decomposition (SVD)

In [60]:
# PCA requires every feature (column) to have zero mean, so subtract the
# per-feature mean (axis=0). Averaging over axis=1 would instead subtract each
# sample's own mean across its features, which yields the wrong principal axes.
X_centered = X - X.mean(axis=0)
# Use svd() to perform Singular Value Decomposition; the rows of Vt are the
# principal axes (unit vectors), ordered by decreasing singular value.
U, s, Vt = np.linalg.svd(X_centered)
c1 = Vt.T[:, 0]  # First unit vector (the first principal axis)
c2 = Vt.T[:, 1]  # Second unit vector (the second principal axis)

## Projecting Down to d Dimensions 

You can reduce the dimensionality of the dataset down to d dimensions by projecting it onto the hyperplane defined by the first d principal components.

Projecting the training set down to d dimensions:
$$X_{d\text{-proj}} = X W_{d}$$
<br>where $W_{d}$ is the matrix containing the first d columns of $V$.

In [61]:
# W2 holds the first two principal axes as columns (first two rows of Vt, transposed).
W2 = Vt[:2].T
# Project the centered training set onto the plane defined by the first 2 PCs.
X2D = X_centered @ W2

## Using Scikit-Learn

In [63]:
from sklearn.decomposition import PCA

In [64]:
# Fit a PCA model that keeps 2 components and project X onto them in one step.
# Note: this rebinds X2D, replacing the projection computed manually via SVD above.
pca = PCA(n_components = 2)
X2D = pca.fit_transform(X)

In [53]:
# Display the fraction of the total variance captured by each retained component.
pca.explained_variance_ratio_

array([0.92461872, 0.05306648])

## Compress first, then reconstruct datasets 

In [55]:
pca = PCA(n_components=2)
X_reduced = pca.fit_transform(X) # First reduce the dimensionality
X_recovered = pca.inverse_transform(X_reduced) # Then reconstruct the data from the reduced dataset
# The reconstructed dataset is not identical to the original, but it is very close to it

## Randomized PCA

In [56]:
#Randomized PCA finds an approximation of the first d principal components
#It is much faster than the non-randomized solver, but the result is only an approximation

In [57]:
# The randomized solver is stochastic; fix random_state so that re-running the
# notebook reproduces the same components and the same projection.
rnd_pca = PCA(n_components=2, svd_solver='randomized', random_state=42)
X_reduced = rnd_pca.fit_transform(X)

## Incremental PCA

In [66]:
#Instead of feeding all the data to PCA at once, split it into batches and feed them to IPCA one at a time

In [67]:
from sklearn.decomposition import IncrementalPCA

## Kernel PCA

In [71]:
from sklearn.decomposition import KernelPCA

In [72]:
# Just like with SVC, you can set whichever kernel you want.
rbf_pca = KernelPCA(
    n_components=2,
    kernel='rbf',
    gamma=0.04,
)

In [73]:
# Fit the kernel PCA model on X and keep the resulting projection (rebinds X_reduced).
X_reduced = rbf_pca.fit_transform(X)

## Selecting a Kernel and Tuning Hyperparameters

In [74]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [77]:
# Two-step pipeline: reduce to 2 dimensions with kernel PCA, then classify
# with logistic regression.
clf = Pipeline([
    ('kpca', KernelPCA(n_components=2)),
    ('log_reg', LogisticRegression()),
])
# Candidate settings: 10 evenly spaced gamma values in [0.03, 0.05], tried with
# each of the two kernels.
param_grid = [
    {
        'kpca__gamma': np.linspace(0.03, 0.05, 10),
        'kpca__kernel': ['rbf', 'sigmoid'],
    },
]

In [78]:
# 3-fold cross-validated search over every kernel/gamma combination in param_grid.
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=3)
grid_search.fit(X, y)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('kpca', KernelPCA(n_components=2)),
                                       ('log_reg', LogisticRegression())]),
             param_grid=[{'kpca__gamma': array([0.03      , 0.03222222, 0.03444444, 0.03666667, 0.03888889,
       0.04111111, 0.04333333, 0.04555556, 0.04777778, 0.05      ]),
                          'kpca__kernel': ['rbf', 'sigmoid']}])

In [79]:
# Display the parameter combination selected by the grid search.
# NOTE(review): the recorded result picks gamma=0.03, the lowest value in the
# searched range, so the optimum may lie below it — consider widening the grid.
grid_search.best_params_

{'kpca__gamma': 0.03, 'kpca__kernel': 'rbf'}

## Locally Linear Embedding (LLE)

In [81]:
# This approach is particularly good at unrolling twisted manifolds, especially when there is not too much noise

How does LLE work?
<br>
1. Find the *k* closest neighbors of each training instance x
2. Try to reconstruct x as a linear function of these neighbors, such that the squared distance between x and these 