In [None]:
import numpy as np
from scipy.spatial.transform import Rotation

m = 60  # number of samples
np.random.seed(42)  # fixed seed so the dataset is reproducible

# Sample the angles unevenly: cubing a uniform draw crowds values toward the
# low end before they are offset and scaled by 2*pi.
skewed = np.random.rand(m) ** 3 + 0.5
angles = skewed * 2 * np.pi

# Start from a flat oval lying in the z = 0 plane (half as tall as it is wide).
X = np.column_stack((np.cos(angles), np.sin(angles) * 0.5, np.zeros(m)))

# Jitter every coordinate with Gaussian noise, then tilt and translate the cloud.
X = X + 0.28 * np.random.randn(m, 3)
tilt = Rotation.from_rotvec([np.pi / 29, -np.pi / 20, np.pi / 4])
X = tilt.apply(X)
X = X + np.array([0.2, 0, 0.2])

: 

In [None]:
# Remove the per-feature mean so the data is centered on the origin before the SVD.
X_centered = X - np.mean(X, axis=0)

# Decompose the centered data; the first two rows of Vt are taken as the
# first two principal axes.
U, s, Vt = np.linalg.svd(X_centered)
c1, c2 = Vt[:2]
print(f'c1: {c1}', f'c2: {c2}', sep='\n')

: 

c1 and c2 are the unit vectors of the first two principal components of the 3D dataset.

The dataset can then be projected onto the hyperplane defined by the first d principal components to transform it to a d-dimensional space (here, 2D).


In [None]:
# Use the first two principal axes as the columns of the projection matrix.
W2 = np.transpose(Vt[:2])

# Project the centered data onto the plane spanned by those two axes.
X2D = np.matmul(X_centered, W2)
print(f'old shape: {X_centered.shape}', f'new shape: {X2D.shape}', sep='\n')

: 

In [None]:
#With sklearn
from sklearn.decomposition import PCA

pca = PCA(n_components=2)

# Recompute the manual projection instead of printing X2D directly: X2D is
# overwritten below, so re-running this cell would otherwise print the sklearn
# result under the "without sklearn" label.
X2D_manual = X_centered @ W2
print(f'first 5 rows without sklearn:\n {X2D_manual[:5,:]}')

# Same 3D -> 2D reduction done by scikit-learn.
# NOTE: sklearn may flip the sign of a component relative to the raw SVD, so a
# column can differ from the manual result by a factor of -1.
X2D = pca.fit_transform(X_centered)
print(f'first 5 rows with sklearn:\n {X2D[:5,:]}')

: 

In [None]:
# Principal axes learned by the fitted PCA, one row per retained component.
pca.components_

: 

In [None]:
# Fraction of the dataset's total variance captured by each retained component.
pca.explained_variance_ratio_

: 

In [None]:
from sklearn.datasets import fetch_openml

# Download MNIST as plain arrays rather than a DataFrame.
mnist = fetch_openml('mnist_784', as_frame=False)

# The first 60,000 rows form the training set; the remainder is the test set.
split = 60_000
X_train, y_train = mnist.data[:split], mnist.target[:split]
X_test, y_test = mnist.data[split:], mnist.target[split:]

: 

In [None]:
# Fit PCA without a component limit so the full variance spectrum is available.
pca = PCA().fit(X_train)
cumsum = pca.explained_variance_ratio_.cumsum()

# argmax returns the index of the first True; adding 1 turns that index into
# the number of components needed to reach 95% cumulative explained variance.
d = 1 + np.argmax(cumsum >= 0.95)
d

: 

In [None]:
# A float n_components between 0 and 1 asks PCA to keep the smallest number of
# components whose cumulative explained variance ratio reaches that value.
pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X_train)

: 

In [None]:
# Number of components PCA actually kept to satisfy the 95% variance target.
pca.n_components_

: 

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import RandomizedSearchCV

# Pipeline: PCA for dimensionality reduction, then a random forest classifier.
clf = make_pipeline(PCA(random_state=42), RandomForestClassifier(random_state=42))

# Search space. Keys follow make_pipeline's auto-generated step names
# (the lower-cased class names) joined to the parameter with a double underscore.
params = {
    "pca__n_components": np.arange(10, 80),
    "randomforestclassifier__n_estimators": np.arange(50, 500),
}

# Sample 10 random parameter combinations, each scored with 3-fold cross-validation.
rnd_search = RandomizedSearchCV(clf, params, cv=3, n_iter=10, n_jobs=-1, random_state=42)

# Only the first 1,000 training samples are used here.
rnd_search.fit(X_train[:1000], y_train[:1000])

: 

In [None]:
# Show the pipeline configured with the best hyperparameters found by the search.
print(rnd_search.best_estimator_)

: 

In [None]:
# Use the randomized SVD solver to approximate the first 154 principal components.
rnd_pca = PCA(n_components=154, svd_solver='randomized', random_state=42)

# fit_transform (not fit, which returns the estimator itself) so that X_reduced
# holds the projected training data.
X_reduced = rnd_pca.fit_transform(X_train)

: 

In [None]:
from sklearn.decomposition import IncrementalPCA
n_batches = 100
inc_pca = IncrementalPCA(n_components=154)

# Feed the training set to the model one mini-batch at a time instead of
# fitting on the whole array in a single call.
batches = np.array_split(X_train, n_batches)
for X_batch in batches:
    inc_pca.partial_fit(X_batch)

# Project the full training set with the incrementally fitted model.
X_reduced = inc_pca.transform(X_train)

: 