Dimensionality Reduction

Projection

PCA

In [8]:
import numpy as np
from scipy.spatial.transform import Rotation

# Build a small, reproducible 3D dataset: points scattered around a flat oval
# that is then tilted and shifted away from the origin.
m = 60
np.random.seed(42)

# Cubing the uniform samples bunches the angles together (uneven distribution).
angles = (np.random.rand(m) ** 3 + 0.5) * 2 * np.pi

# Columns: x on the oval, y on the oval (half the height), z initially zero.
X = np.column_stack([np.cos(angles), np.sin(angles) * 0.5, np.zeros(m)])

# Gaussian noise on all three axes.
X = X + 0.28 * np.random.randn(m, 3)

# Tilt the cloud with a fixed rotation vector, then translate it a bit.
tilt = Rotation.from_rotvec([np.pi / 29, -np.pi / 20, np.pi / 4])
X = tilt.apply(X)
X = X + np.array([0.2, 0, 0.2])

In [9]:
from sklearn.decomposition import PCA

# Fit a 2-component PCA on the 3D dataset X and keep the projected
# coordinates of every sample in X2d.
pca = PCA(n_components=2)
X2d = pca.fit_transform(X)

In [10]:
# Show the share of the dataset's variance captured by each of the fitted components.
pca.explained_variance_ratio_

array([0.7578477 , 0.15186921])

### How to choose the number of dimensions

In [11]:
from sklearn.datasets import fetch_openml

# Download MNIST as plain arrays; the first 60,000 samples are used for
# training and the remainder for testing.
mnist = fetch_openml('mnist_784', as_frame=False)
X_train, X_test = mnist.data[:60_000], mnist.data[60_000:]
y_train, y_test = mnist.target[:60_000], mnist.target[60_000:]

# Fit PCA without limiting the number of components, then find the first
# position at which the running total of explained variance reaches 95%.
pca = PCA().fit(X_train)
cumsum = pca.explained_variance_ratio_.cumsum()
dimensions = (cumsum >= 0.95).argmax() + 1  # +1 turns the 0-based index into a count
dimensions

  warn(


154

In [12]:
# Passing a float between 0 and 1 as n_components asks PCA to choose the number
# of components itself so that this fraction of the variance is preserved.
pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X_train)

In [13]:
# Show how many components the fitted PCA actually kept for the 0.95 target.
pca.n_components_

154

### Tuning PCA

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import make_pipeline

# Two-step pipeline: reduce dimensionality with PCA, then classify with a
# random forest. Both steps are seeded for reproducibility.
classifier = make_pipeline(PCA(random_state=42), RandomForestClassifier(random_state=42))

# Search space, keyed by "<pipeline step name>__<parameter name>".
param_distrib = {
    "pca__n_components": np.arange(10, 80),
    "randomforestclassifier__n_estimators": np.arange(50, 500),
}

# Only the first 1,000 training samples are used for the search.
X_search, y_search = X_train[:1_000], y_train[:1_000]

# Sample 10 parameter combinations and score each with 3-fold cross-validation.
randomized_search = RandomizedSearchCV(
    estimator=classifier,
    param_distributions=param_distrib,
    n_iter=10,
    cv=3,
    random_state=42,
).fit(X_search, y_search)
randomized_search.best_params_

{'randomforestclassifier__n_estimators': 465, 'pca__n_components': 23}

### Randomized PCA

In [15]:
# PCA configured to use scikit-learn's randomized SVD solver for the first
# 154 components; random_state makes the run reproducible.
random_pca = PCA(n_components=154, svd_solver="randomized", random_state=42)
# Fit the randomized estimator defined above. Calling `pca.fit_transform` here
# would silently reuse the PCA object left over from an earlier cell and never
# exercise the randomized solver.
X_reduced = random_pca.fit_transform(X_train)

### Incremental PCA

In [16]:
from sklearn.decomposition import IncrementalPCA

# Feed the training set to IncrementalPCA one chunk at a time, so that each
# partial_fit call only sees a single mini-batch.
nbatches = 100
incremental_pca = IncrementalPCA(n_components=154)
for X_batch in np.array_split(X_train, indices_or_sections=nbatches):
    incremental_pca.partial_fit(X_batch)

# Once every chunk has been seen, project the whole training set.
X_reduced = incremental_pca.transform(X_train)

When data does not fit in memory

In [17]:
# Store the training set in a disk-backed float32 array (opened in write mode,
# with the same shape as X_train) so that it can later be read back lazily.
filename = "mnist_mmap"
X_mmap = np.memmap(filename, dtype="float32", mode="write", shape=X_train.shape)
X_mmap[:] = X_train  # copy every row of X_train into the memory-mapped array
X_mmap.flush()  # make sure pending changes are written out to the file

In [18]:
# Re-open the file read-only. No shape is passed to memmap, so the flat array
# is reshaped back into rows of 784 values.
X_mmap = np.memmap(filename, dtype="float32", mode="readonly")
X_mmap = X_mmap.reshape(-1, 784)

# Use a batch size that divides the rows into roughly `nbatches` chunks, and
# let IncrementalPCA.fit do the batching itself.
batch_size = len(X_mmap) // nbatches
incremental_pca = IncrementalPCA(batch_size=batch_size, n_components=154)
incremental_pca.fit(X_mmap)

Random Projection

In [19]:
from sklearn.random_projection import johnson_lindenstrauss_min_dim

# Minimum target dimensionality given by the Johnson-Lindenstrauss bound for
# m samples and a tolerated distortion of e.
m = 5_000
e = 0.1
dimensions = johnson_lindenstrauss_min_dim(n_samples=m, eps=e)
dimensions

7300

In [20]:
# Hand-rolled Gaussian random projection from n features down to `dimensions`.
n = 20_000
np.random.seed(42)

# The projection matrix is drawn first and the fake dataset second; both use
# the same seeded global random stream, so this order matters.
P = np.random.randn(dimensions, n)
P = P / np.sqrt(dimensions)  # scale the entries by 1/sqrt(dimensions)

X = np.random.randn(m, n)  # random dataset with m samples and n features
X_reduced = np.matmul(X, P.T)

In [None]:
from sklearn.random_projection import GaussianRandomProjection

# Same kind of projection using scikit-learn's transformer. No n_components is
# given, so the output dimensionality is left to the estimator (driven by eps).
# NOTE(review): the variable name is misspelled ("proection"); it is kept as-is
# because cells outside this view may reference it.
gaussian_random_proection = GaussianRandomProjection(eps=0.1, random_state=42)
X_reduced = gaussian_random_proection.fit_transform(X)

In [None]:
from sklearn.random_projection import SparseRandomProjection

# Sparse variant of the random projection, with the same eps and seed as the
# Gaussian one; the result overwrites X_reduced.
# NOTE(review): the variable name is misspelled ("proection"); it is kept as-is
# because cells outside this view may reference it.
sparse_random_proection = SparseRandomProjection(eps=0.1, random_state=42)
X_reduced = sparse_random_proection.fit_transform(X)

Locally Linear Embedding

In [None]:
from sklearn.datasets import make_swiss_roll
from sklearn.manifold import LocallyLinearEmbedding

# Generate a noisy 1,000-point swiss roll (X_swiss holds the 3D points, t the
# second value returned by make_swiss_roll), then embed it in 2D with LLE
# using 10 neighbors per point.
X_swiss, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)
lle = LocallyLinearEmbedding(n_components=2, n_neighbors=10, random_state=42)
X_unrolled = lle.fit_transform(X_swiss)