# PCA basic usage

In [76]:
from sklearn.decomposition import PCA
import numpy as np

# Raw, unscaled samples: 4 rows (observations) x 3 columns (features).
x = np.array([
    [10001, 2, 55],
    [16020, 4, 11],
    [12008, 6, 33],
    [13131, 8, 22],
])
# Keep the two leading principal components, then project the samples onto them.
pca = PCA(n_components=2)
pca.fit(x).transform(x)

array([[ -2.78910648e+03,   5.19040274e+00],
       [  3.23005437e+03,   4.41130144e+00],
       [ -7.81999212e+02,  -3.11817922e+00],
       [  3.41051325e+02,  -6.48352497e+00]])

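It is better to standardize the features before applying PCA; otherwise the feature with the largest scale (here the first column) dominates the principal components.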

In [85]:
from sklearn.decomposition import PCA
import numpy as np
from sklearn.preprocessing import StandardScaler

# Same four samples as before, one row per observation.
x = np.array([
    [10001, 2, 55],
    [16020, 4, 11],
    [12008, 6, 33],
    [13131, 8, 22],
])
# Standardize each column first so that no single feature dominates PCA
# purely because of its scale.
X_scaler = StandardScaler()
x = X_scaler.fit_transform(x)
# Fit a 2-component PCA on the standardized data and project it.
pca = PCA(n_components=2)
pca.fit(x).transform(x)



array([[-2.36863319,  0.38298087],
       [ 1.50952734,  1.23481789],
       [-0.14360068, -0.58040917],
       [ 1.00270653, -1.03738959]])

# PCA calculation process

## 1. data normalization

In [124]:
from sklearn.preprocessing import StandardScaler
# Step 1 of the manual PCA: standardize the raw samples column by column.
# After this cell `x` holds the standardized data used by the later steps.
X_scaler = StandardScaler()
x = X_scaler.fit_transform(
    np.array([
        [10001, 2, 55],
        [16020, 4, 11],
        [12008, 6, 33],
        [13131, 8, 22],
    ])
)
x



array([[-1.2817325 , -1.34164079,  1.52127766],
       [ 1.48440157, -0.4472136 , -1.18321596],
       [-0.35938143,  0.4472136 ,  0.16903085],
       [ 0.15671236,  1.34164079, -0.50709255]])

## 2. Covariance matrix

In [78]:
# method 1: covariance matrix computed by hand.
# x^T x / (m - 1) equals the sample covariance only because x has already
# been standardized, i.e. every column has zero mean.
m = x.shape[0]  # number of samples, taken from the data instead of hard-coded
cov_mat = np.dot(x.transpose(), x) / (m - 1)  # covariance matrix
cov_mat

array([[ 1.33333333,  0.36843716, -1.28215095],
       [ 0.36843716,  1.33333333, -0.70553368],
       [-1.28215095, -0.70553368,  1.33333333]])

In [106]:
# method 2: let numpy compute the covariance matrix.
# rowvar=False tells np.cov that each column is a variable (feature) and
# each row is an observation.
cov_mat = np.cov(x, rowvar=False)
cov_mat

array([[ 1.33333333,  0.36843716, -1.28215095],
       [ 0.36843716,  1.33333333, -0.70553368],
       [-1.28215095, -0.70553368,  1.33333333]])

## 3. SVD (singular value decomposition)

In [129]:
# Singular value decomposition of the covariance matrix.
sigma = cov_mat
U, S, V = np.linalg.svd(sigma)
# The first two columns of U are the two leading principal directions.
Ur = U[:, :2]
# Project the standardized samples onto those directions.
z = x.dot(Ur)
z  # dimensionality-reduced result z

array([[ 2.36863319,  0.38298087],
       [-1.50952734,  1.23481789],
       [ 0.14360068, -0.58040917],
       [-1.00270653, -1.03738959]])

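**The reduced result z (Out[129]) matches the result obtained with sklearn's PCA (Out[85]) up to the sign of the first component. The sign of a principal component is arbitrary, so the two projections are equivalent.**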

In [103]:
# Reconstruct the data: map the 2-D projection z back into the 3-D
# standardized feature space, which gives an approximation of x.
z.dot(Ur.T)

array([[-1.28505803, -1.34271509,  1.51751098],
       [ 1.48450422, -0.44718044, -1.18309969],
       [-0.34955552,  0.45038782,  0.18016022],
       [ 0.15010934,  1.3395077 , -0.51457151]])

In [125]:
x # standardized x before compression; the reconstruction (Out[103]) is very close to it but not exactly equal, since one component was dropped

array([[-1.2817325 , -1.34164079,  1.52127766],
       [ 1.48440157, -0.4472136 , -1.18321596],
       [-0.35938143,  0.4472136 ,  0.16903085],
       [ 0.15671236,  1.34164079, -0.50709255]])

In [132]:
# Share of the total variance kept when going from three dimensions to two
# (about 99.997% here).
retained = S[0] + S[1]
retained / (retained + S[2])

0.99996991682077252