In [1]:
import numpy as np

In [2]:
m = 1000
X = 6*np.random.rand(m, 1) - 3
X_square = X**2 + np.random.randn(m, 1)
X_cube = X**3 + np.random.randn(m, 1)
y = 0.5*X**2 + X + X_square*1.5 + X_cube*2.5 + 2 + np.random.randn(m, 1)

In [3]:
X = np.concatenate((X, X_square, X_cube), axis=1)

In [4]:
X_centered = X - X.mean(axis=0)
U, s, Vt = np.linalg.svd(X_centered)
c1 = Vt.T[:, 0]
c2 = Vt.T[:, 1]

In [5]:
X_centered

array([[ -2.56337789,   3.79185562, -17.98959799],
       [  0.52642251,  -2.98532277,   0.69481962],
       [ -0.7146814 ,  -1.17602235,  -1.60665401],
       ...,
       [ -0.80091053,  -0.75529727,  -1.60082444],
       [ -0.02631053,  -1.97348909,   0.06913821],
       [ -2.89565892,   6.75956505, -24.11006362]])

In [6]:
W2 =Vt.T[:, :2]
X2D = X_centered.dot(W2)

In [7]:
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix

In [8]:
X_csr = csr_matrix(X)
svd = TruncatedSVD(n_components=2, n_iter=7, random_state=42)
svd.fit(X_csr)

In [9]:
from sklearn.decomposition import PCA

In [10]:
pca = PCA(n_components = 2)
X2D = pca.fit_transform(X)

In [11]:
pca.explained_variance_ratio_

array([0.92392826, 0.07170896])

In [12]:
pca = PCA()
pca.fit(X)
cumsum = np.cumsum(pca.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.95) + 1

In [13]:
cumsum

array([0.92392826, 0.99563723, 1.        ])

In [14]:
d

2

In [15]:
pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X)

In [16]:
from keras.datasets import mnist

2023-07-12 10:13:44.588369: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-12 10:13:45.015733: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-12 10:13:45.017381: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [17]:
(train_X, train_y), (test_X, test_y) = mnist.load_data()

In [18]:
train_X.shape

(60000, 28, 28)

In [19]:
nsamples, nx, ny = train_X.shape
train_X = train_X.reshape((nsamples,nx*ny))

In [20]:
pca = PCA(n_components = 154)
X_reduced = pca.fit_transform(train_X)
X_recovered = pca.inverse_transform(X_reduced)

In [21]:
rnd_pca = PCA(n_components=154, svd_solver = "randomized")
X_reduced = rnd_pca.fit_transform(train_X)

In [22]:
from sklearn.decomposition import IncrementalPCA

In [23]:
n_batches = 100
inc_pca = IncrementalPCA(n_components=2)
for X_batch in np.array_split(X, n_batches):
    inc_pca.partial_fit(X_batch)

In [24]:
X_reduced = inc_pca.transform(X)
X_reduced

array([[-18.20784619,   3.60451896],
       [  0.79786626,  -2.97880298],
       [ -1.68506889,  -1.19165859],
       ...,
       [ -1.69683866,  -0.77071772],
       [  0.08451795,  -1.97257805],
       [-24.33702151,   6.50727585]])

In [25]:
X.shape

(1000, 3)

In [26]:
n = X.shape[1]
n

3

In [27]:
m

1000

In [28]:
X.astype('int16').tofile('X_binary')

In [29]:
import os

In [30]:
os.stat('X_binary')

os.stat_result(st_mode=33204, st_ino=1313898, st_dev=2051, st_nlink=1, st_uid=1001, st_gid=1001, st_size=6000, st_atime=1688924062, st_mtime=1689174853, st_ctime=1689174853)

In [31]:
#X_mm = np.memmap('X_binary', dtype="float32", mode="readonly", shape=(m, n))

In [32]:
#batch_size = m // n_ batches
#inc_pca = IncrementalPCA(n_components = 2, batch_size = batch_size)
#inc_pca.fit(X_mm)

In [33]:
from sklearn.decomposition import KernelPCA

In [34]:
rbf_pca = KernelPCA(n_components=2, kernel="rbf", gamma=0.04)
X_reduced = rbf_pca.fit_transform(X)

In [35]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [36]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

In [37]:
iris = load_iris()
X = iris.data[:, 2:]
y = iris.target

In [38]:
from keras.datasets import mnist

In [39]:
(train_X, train_y), (test_X, test_y) = mnist.load_data()

In [40]:
nsamples, nx, ny = train_X.shape
train_X_2d = train_X.reshape((nsamples,nx*ny))

In [41]:
train_X_2d = train_X_2d[:10000, :]
train_y = train_y[:10000]

In [42]:
clf = Pipeline([
    ("kpca", KernelPCA(n_components=2)),
    ("log_reg", LogisticRegression())
])

In [43]:
param_grid = [{
    "kpca__gamma": np.linspace(0.03, 0.05, 10),
    "kpca__kernel": ["rbf", "sigmoid"]
}]

In [44]:
grid_search = GridSearchCV(clf, param_grid, cv=3)
grid_search.fit(train_X_2d, train_y)

In [45]:
grid_search.best_params_

{'kpca__gamma': 0.03, 'kpca__kernel': 'rbf'}

In [46]:
rbf_pca = KernelPCA(n_components=2, kernel="rbf", gamma=0.0433,
                   fit_inverse_transform=True)
X_reduced = rbf_pca.fit_transform(X)
X_preimage = rbf_pca.inverse_transform(X_reduced)

In [47]:
from sklearn.metrics import mean_squared_error

In [48]:
mean_squared_error(X, X_preimage)

0.17139223567262762

In [49]:
from sklearn.manifold import LocallyLinearEmbedding

In [50]:
lle = LocallyLinearEmbedding(n_components=2, n_neighbors=10)
X_reduced = lle.fit_transform(X)