In [1]:
# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
# Font sizes used for axis labels and tick labels in the figures of this notebook.
plt.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

# Where to save the figures: save_fig() writes each image to
# <PROJECT_ROOT_DIR>/images/<CHAPTER_ID>/<fig_id>.png
PROJECT_ROOT_DIR = ".."
CHAPTER_ID = "dim_reduction"

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under the chapter's image directory.

    The file is written to
    ``<PROJECT_ROOT_DIR>/images/<CHAPTER_ID>/<fig_id>.<fig_extension>``.

    Parameters
    ----------
    fig_id : str
        Base name of the image file (without extension).
    tight_layout : bool, default True
        If true, call ``plt.tight_layout()`` before saving.
    fig_extension : str, default "png"
        File extension, also passed to ``plt.savefig`` as ``format``.
    resolution : int, default 300
        Passed to ``plt.savefig`` as ``dpi``.
    """
    target_dir = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
    # savefig does not create missing directories, so make sure the target
    # exists; otherwise the first save on a fresh checkout fails.
    os.makedirs(target_dir, exist_ok=True)
    path = os.path.join(target_dir, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# 投影方法

In [4]:
# Build a small noisy 3D dataset: points along an arc whose third coordinate
# is (up to noise) a linear combination of the first two.
np.random.seed(42)
m = 60
w1, w2 = 0.1, 0.3
noise = 0.1

angles = np.random.rand(m) * 3 * np.pi / 2 - 0.5
sin_a, cos_a = np.sin(angles), np.cos(angles)

# The three randn draws below must stay in this order to reproduce the same X.
X = np.empty((m, 3))
X[:, 0] = cos_a + sin_a / 2 + noise * np.random.randn(m)
X[:, 1] = sin_a * 0.7 + noise * np.random.randn(m) / 2
X[:, 2] = X[:, 0] * w1 + X[:, 1] * w2 + noise * np.random.randn(m)

# 奇异值分解进行PCA

`svd()` 方法返回 `U`、`s` 和 `Vt`，其中 `Vt` 等于 $\mathbf{V}^T$，即矩阵 $\mathbf{V}$ 的转置。

$\mathbf{V} = \begin{pmatrix} \mid & \mid & & \mid \\ \mathbf{c_1} & \mathbf{c_2} & \cdots & \mathbf{c_n} \\ \mid & \mid & & \mid \end{pmatrix}$

In [5]:
# Center the data and factorize it; row i of Vt is the i-th column of V,
# so the first two rows are the first two principal axes.
X_centered = X - np.mean(X, axis=0)
U, s, Vt = np.linalg.svd(X_centered)
c1, c2 = Vt[0], Vt[1]

In [7]:
# Rebuild the rectangular (m x n) matrix that carries the singular values
# on its main diagonal and zeros everywhere else.
m, n = X.shape

S = np.zeros((m, n))
S[np.diag_indices(n)] = s

In [8]:
# Sanity check: multiplying the three factors back together gives X_centered.
np.allclose(X_centered, U @ S @ Vt)

True

In [10]:
# Projection matrix: the first two columns of V (i.e. the first two rows of Vt).
W2 = Vt[:2].T
# Project the centered data onto the plane spanned by those two axes.
X2D = np.dot(X_centered, W2)

In [11]:
# Keep a handle on the SVD-based projection: X2D is reassigned by the
# scikit-learn PCA cell that follows.
X2D_using_svd = X2D

# 使用sklearn进行PCA

In [14]:
from sklearn.decomposition import PCA

# Reduce to 2 components with scikit-learn. Note that the uncentered X is
# passed here, whereas the manual SVD route used X_centered; scikit-learn's
# PCA handles the mean itself (it is exposed as pca.mean_).
# This overwrites X2D; the SVD result is kept in X2D_using_svd.
pca = PCA(n_components=2)
X2D = pca.fit_transform(X)

In [15]:
# First five rows of the scikit-learn projection.
X2D[:5]

array([[-0.62067606, -0.38275655],
       [ 1.42540302,  0.34379254],
       [ 1.02717543, -0.35001421],
       [ 0.22047721, -0.48877927],
       [-0.93532162,  0.26385752]])

In [16]:
# First five rows of the SVD-based projection, for comparison with X2D[:5].
X2D_using_svd[:5]

array([[-0.62067606, -0.38275655],
       [ 1.42540302,  0.34379254],
       [ 1.02717543, -0.35001421],
       [ 0.22047721, -0.48877927],
       [-0.93532162,  0.26385752]])

In [17]:
# Principal axes are only defined up to sign, so the SVD-based projection and
# scikit-learn's may differ by a factor of -1 per component depending on the
# library version. Align the sign of each component before comparing instead
# of assuming that every axis is flipped.
axis_signs = np.sign(np.sum(X2D * X2D_using_svd, axis=0))
np.allclose(X2D, X2D_using_svd * axis_signs)

True

In [18]:
# Recover the compressed data: map the 2D projection back into 3D space.
X3D_inv = pca.inverse_transform(X2D)

In [19]:
# Some information is lost by the projection, so the reconstruction is not
# equal to the original data.
np.allclose(X3D_inv, X)

False

In [21]:
# Reconstruction error: mean over samples of the squared distance between
# each reconstructed point and its original.
np.square(X3D_inv - X).sum(axis=1).mean()

0.009438256298697748

In [22]:
# Alternative: reconstruct from the SVD-based projection by multiplying with
# the first two rows of Vt (no mean is added back here).
X3D_inv_using_svd = np.dot(X2D_using_svd, Vt[:2])

In [24]:
# The two reconstructions are not identical because scikit-learn's PCA class
# automatically takes care of reversing the mean centering; once the mean is
# subtracted, both give the same reconstruction.
np.allclose(X3D_inv_using_svd, X3D_inv - pca.mean_)

True

In [25]:
# The PCA object gives access to the principal components it computed:
pca.components_

array([[-0.95383238, -0.24413536, -0.17493346],
       [ 0.28778085, -0.90958415, -0.2997313 ]])

In [26]:
# First two rows of Vt from the SVD, for comparison with pca.components_.
Vt[:2]

array([[-0.95383238, -0.24413536, -0.17493346],
       [ 0.28778085, -0.90958415, -0.2997313 ]])

In [29]:
# Explained variance ratio of each retained component.
pca.explained_variance_ratio_

array([0.85478042, 0.13561249])

In [30]:
# Share of the variance that is not captured by the 2D projection.
1 - pca.explained_variance_ratio_.sum()

0.00960708908570218

对比上面的结果可以看出，解释方差比也可以直接由 SVD 的奇异值计算：第 $i$ 个主成分的解释方差比为 $s_i^2 / \sum_j s_j^2$。

In [31]:
# Explained variance ratio from the SVD: each squared singular value divided
# by the sum of all squared singular values.
squared_s = np.square(s)
squared_s / squared_s.sum()

array([0.85478042, 0.13561249, 0.00960709])

In [None]:
# Some visualizations follow.