### P067 数据降维PCA-标准化输入数据

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
# Seed NumPy's global random number generator so any later np.random draws repeat.
# NOTE(review): no visible cell calls np.random directly - confirm this is still needed.
np.random.seed(42)

In [3]:
# Load the PCA demo dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("./pca.csv")

In [4]:
df.head()

Unnamed: 0,var1,var2,var3,class
0,5.1,1.4,0.2,0.0
1,4.9,1.4,0.2,0.0
2,4.7,1.3,0.2,0.0
3,4.6,1.5,0.2,0.0
4,5.0,1.4,0.2,0.0


In [5]:
# Work on a copy so that removing the label column from X leaves df untouched.
X = df.copy()

In [6]:
# pop() removes the "class" column from X in place and returns it as a Series,
# so after this cell X holds only the feature columns and y holds the labels.
y = X.pop("class")

In [7]:
X.head()

Unnamed: 0,var1,var2,var3
0,5.1,1.4,0.2
1,4.9,1.4,0.2
2,4.7,1.3,0.2
3,4.6,1.5,0.2
4,5.0,1.4,0.2


In [8]:
y.head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: class, dtype: float64

In [9]:
scaler = StandardScaler()

In [10]:
# Learn each feature column's mean and standard deviation from X and rescale it to
# zero mean / unit variance. The result is a NumPy array, not a DataFrame.
X_std = scaler.fit_transform(X)

In [11]:
X_std[:10]

array([[-0.90068117, -1.34022653, -1.3154443 ],
       [-1.14301691, -1.34022653, -1.3154443 ],
       [-1.38535265, -1.39706395, -1.3154443 ],
       [-1.50652052, -1.2833891 , -1.3154443 ],
       [-1.02184904, -1.34022653, -1.3154443 ],
       [-0.53717756, -1.16971425, -1.05217993],
       [-1.50652052, -1.34022653, -1.18381211],
       [-1.02184904, -1.2833891 , -1.3154443 ],
       [-1.74885626, -1.34022653, -1.3154443 ],
       [-1.14301691, -1.2833891 , -1.44707648]])

### P068 数据降维PCA-自己实现PCA算法

In [12]:
X_std[:10]

array([[-0.90068117, -1.34022653, -1.3154443 ],
       [-1.14301691, -1.34022653, -1.3154443 ],
       [-1.38535265, -1.39706395, -1.3154443 ],
       [-1.50652052, -1.2833891 , -1.3154443 ],
       [-1.02184904, -1.34022653, -1.3154443 ],
       [-0.53717756, -1.16971425, -1.05217993],
       [-1.50652052, -1.34022653, -1.18381211],
       [-1.02184904, -1.2833891 , -1.3154443 ],
       [-1.74885626, -1.34022653, -1.3154443 ],
       [-1.14301691, -1.2833891 , -1.44707648]])

In [13]:
# Covariance matrix of the standardized features.
# rowvar=False tells np.cov that each column is a variable and each row an observation.
# The diagonal comes out slightly above 1 because np.cov normalizes by N-1 by default,
# whereas StandardScaler scaled the columns with the population (N) standard deviation.
cov = np.cov(X_std, rowvar=False)
cov

array([[1.00671141, 0.87760447, 0.82343066],
       [0.87760447, 1.00671141, 0.96932762],
       [0.82343066, 0.96932762, 1.00671141]])

In [14]:
# Eigen-decomposition of the covariance matrix.
# Column i of eig_vecs (eig_vecs[:, i]) is the eigenvector belonging to eig_vals[i].
# np.linalg.eig does not promise any ordering of the eigenvalues, so they are sorted
# explicitly in a later cell.
# NOTE(review): cov is symmetric, so np.linalg.eigh may be the better fit - confirm
# before switching, since eigenvalue order and eigenvector signs could change.
eig_vals, eig_vecs = np.linalg.eig(cov)
eig_vals, eig_vecs

(array([2.78833033, 0.20075012, 0.03105378]),
 array([[ 0.55964149,  0.81270446,  0.16221241],
        [ 0.59148855, -0.2546058 , -0.76506024],
        [ 0.58046765, -0.52410624,  0.62319335]]))

In [15]:
# Pair every eigenvalue magnitude with its eigenvector.
# Eigenvectors are the columns of eig_vecs, so iterate over the rows of its transpose.
eig_pairs = list(zip(np.abs(eig_vals), eig_vecs.T))
eig_pairs

[(2.7883303296752926, array([0.55964149, 0.59148855, 0.58046765])),
 (0.20075011806343787, array([ 0.81270446, -0.2546058 , -0.52410624])),
 (0.031053780449189813, array([ 0.16221241, -0.76506024,  0.62319335]))]

In [16]:
# Order the (eigenvalue, eigenvector) pairs from the largest eigenvalue to the smallest.
# Sort on the eigenvalue only: comparing whole tuples would fall back to comparing the
# NumPy eigenvectors whenever two eigenvalues are equal, which raises
# "ValueError: The truth value of an array with more than one element is ambiguous".
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)
eig_pairs

[(2.7883303296752926, array([0.55964149, 0.59148855, 0.58046765])),
 (0.20075011806343787, array([ 0.81270446, -0.2546058 , -0.52410624])),
 (0.031053780449189813, array([ 0.16221241, -0.76506024,  0.62319335]))]

In [17]:
# Build the projection matrix W from the two eigenvectors with the largest eigenvalues.
# Each eigenvector becomes one column; reshape(-1, 1) infers the number of rows from the
# vector itself, so the cell no longer assumes the data has exactly three features.
W = np.hstack(
    (
        eig_pairs[0][1].reshape(-1, 1),
        eig_pairs[1][1].reshape(-1, 1),
    )
)

In [18]:
W

array([[ 0.55964149,  0.81270446],
       [ 0.59148855, -0.2546058 ],
       [ 0.58046765, -0.52410624]])

In [20]:
# Project the standardized samples onto the selected eigenvectors (the columns of W).
# Each row of X_pca is one sample expressed in the reduced two-component space.
X_pca = X_std.dot(W)
X_pca[:10]

array([[-2.06036006,  0.2986744 ],
       [-2.1959812 ,  0.10172707],
       [-2.36522102, -0.08074913],
       [-2.36579421, -0.20816508],
       [-2.12817063,  0.20020073],
       [-1.60325585,  0.4127035 ],
       [-2.32300467, -0.26268319],
       [-2.09455194,  0.1857296 ],
       [-2.53503403, -0.39064128],
       [-2.23877073,  0.15624518]])

### P069 数据降维PCA-组合降维结果和标签数据


In [21]:
X_pca[:10]

array([[-2.06036006,  0.2986744 ],
       [-2.1959812 ,  0.10172707],
       [-2.36522102, -0.08074913],
       [-2.36579421, -0.20816508],
       [-2.12817063,  0.20020073],
       [-1.60325585,  0.4127035 ],
       [-2.32300467, -0.26268319],
       [-2.09455194,  0.1857296 ],
       [-2.53503403, -0.39064128],
       [-2.23877073,  0.15624518]])

In [22]:
# Wrap the projected array in a DataFrame with one named column per principal component.
df_pca = pd.DataFrame(X_pca, columns=["pca_1", "pca_2"])
df_pca.head()

Unnamed: 0,pca_1,pca_2
0,-2.06036,0.298674
1,-2.195981,0.101727
2,-2.365221,-0.080749
3,-2.365794,-0.208165
4,-2.128171,0.200201


In [23]:
# Attach the labels to the reduced data. Use y (the label Series popped from the loaded
# frame) rather than df["class"]: df is rebound to a different DataFrame further down,
# so reading from df makes this cell depend on execution order. This also matches how
# the sklearn-based section attaches its labels.
df_pca["class"] = y

In [24]:
df_pca.head()

Unnamed: 0,pca_1,pca_2,class
0,-2.06036,0.298674,0.0
1,-2.195981,0.101727,0.0
2,-2.365221,-0.080749,0.0
3,-2.365794,-0.208165,0.0
4,-2.128171,0.200201,0.0


### P070 数据降维PCA-使用sklearn的pca算法



In [25]:
X_std[:10]

array([[-0.90068117, -1.34022653, -1.3154443 ],
       [-1.14301691, -1.34022653, -1.3154443 ],
       [-1.38535265, -1.39706395, -1.3154443 ],
       [-1.50652052, -1.2833891 , -1.3154443 ],
       [-1.02184904, -1.34022653, -1.3154443 ],
       [-0.53717756, -1.16971425, -1.05217993],
       [-1.50652052, -1.34022653, -1.18381211],
       [-1.02184904, -1.2833891 , -1.3154443 ],
       [-1.74885626, -1.34022653, -1.3154443 ],
       [-1.14301691, -1.2833891 , -1.44707648]])

In [26]:
from sklearn.decomposition import PCA

In [27]:
pca = PCA(n_components=2)

In [28]:
# Fit scikit-learn's PCA on the standardized data and project it in one step.
# This rebinds X_pca, replacing the array produced by the hand-written projection above.
# In the recorded output the second component has the opposite sign to the manual
# result; the sign of an eigenvector is arbitrary, so both describe the same axis.
X_pca = pca.fit_transform(X_std)

In [29]:
X_pca[:10]

array([[-2.06036006, -0.2986744 ],
       [-2.1959812 , -0.10172707],
       [-2.36522102,  0.08074913],
       [-2.36579421,  0.20816508],
       [-2.12817063, -0.20020073],
       [-1.60325585, -0.4127035 ],
       [-2.32300467,  0.26268319],
       [-2.09455194, -0.1857296 ],
       [-2.53503403,  0.39064128],
       [-2.23877073, -0.15624518]])

In [30]:
# Put the sklearn projection into a DataFrame with one named column per component.
# NOTE(review): this rebinds df, which until now held the contents of pca.csv. Earlier
# cells that read df will see this frame instead if they are re-run after this point.
df = pd.DataFrame(
    data = X_pca,
    columns =["pca_1", "pca_2"]
)
df.head()

Unnamed: 0,pca_1,pca_2
0,-2.06036,-0.298674
1,-2.195981,-0.101727
2,-2.365221,0.080749
3,-2.365794,0.208165
4,-2.128171,-0.200201


In [31]:
# Attach the labels popped from the loaded data earlier. Assignment aligns on the index,
# so this presumably relies on y and df sharing the same default integer index - verify.
df["class"] = y

In [32]:
df.head()

Unnamed: 0,pca_1,pca_2,class
0,-2.06036,-0.298674,0.0
1,-2.195981,-0.101727,0.0
2,-2.365221,0.080749,0.0
3,-2.365794,0.208165,0.0
4,-2.128171,-0.200201,0.0


### P071 数据降维PCA - 计算观察方差分布

In [33]:
import numpy as np
import pandas as pd

In [34]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [35]:
# Load the second dataset for this section; this rebinds df once more.
# The path is relative to the notebook's working directory.
df = pd.read_csv("./p071.csv")

In [36]:
df.head()

Unnamed: 0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32


In [37]:
# Convert the DataFrame to a plain NumPy array for scikit-learn.
# to_numpy() is the accessor the pandas documentation recommends over the older
# .values attribute; for this all-numeric frame both return the same float array.
data = df.to_numpy()
data[:10]

array([[ 14.23,   1.71,   2.43,  15.6 , 127.  ,   2.8 ,   3.06,   0.28,
          2.29,   5.64],
       [ 13.2 ,   1.78,   2.14,  11.2 , 100.  ,   2.65,   2.76,   0.26,
          1.28,   4.38],
       [ 13.16,   2.36,   2.67,  18.6 , 101.  ,   2.8 ,   3.24,   0.3 ,
          2.81,   5.68],
       [ 14.37,   1.95,   2.5 ,  16.8 , 113.  ,   3.85,   3.49,   0.24,
          2.18,   7.8 ],
       [ 13.24,   2.59,   2.87,  21.  , 118.  ,   2.8 ,   2.69,   0.39,
          1.82,   4.32],
       [ 14.2 ,   1.76,   2.45,  15.2 , 112.  ,   3.27,   3.39,   0.34,
          1.97,   6.75],
       [ 14.39,   1.87,   2.45,  14.6 ,  96.  ,   2.5 ,   2.52,   0.3 ,
          1.98,   5.25],
       [ 14.06,   2.15,   2.61,  17.6 , 121.  ,   2.6 ,   2.51,   0.31,
          1.25,   5.05],
       [ 14.83,   1.64,   2.17,  14.  ,  97.  ,   2.8 ,   2.98,   0.29,
          1.98,   5.2 ],
       [ 13.86,   1.35,   2.27,  16.  ,  98.  ,   2.98,   3.15,   0.22,
          1.85,   7.22]])

In [38]:
# Create a fresh scaler for this dataset (rebinding scaler from the earlier section)
# and standardize every column of data to zero mean and unit variance.
scaler = StandardScaler()
data_std = scaler.fit_transform(data)

In [39]:
# Keep the first three principal components of the standardized data.
# This rebinds pca, replacing the two-component model fitted in the earlier section.
pca = PCA(n_components=3)
data_pca = pca.fit_transform(data_std)

In [40]:
data_pca[:10]

array([[ 2.76934698, -1.30493533, -0.58395762],
       [ 1.67070079,  1.02905763, -1.81994227],
       [ 2.10783949, -0.8316145 ,  0.99444906],
       [ 3.44931605, -1.8592707 , -0.38935836],
       [ 0.87603192, -1.50882534,  1.63982782],
       [ 2.67119513, -1.29513525, -0.69468732],
       [ 1.66955982, -0.57093005, -1.18079406],
       [ 1.13000447, -1.39071555, -0.27443572],
       [ 2.39875256, -0.27899794, -1.83738049],
       [ 2.3741835 , -0.34132632, -1.10598146]])

In [41]:
# Tabulate the share of total variance explained by each fitted component, one row each.
results = pd.DataFrame({"variance_ratio": pca.explained_variance_ratio_})

In [42]:
# Running total of explained variance: row k shows how much the first k+1 components
# capture together.
results["cumulative"] = results["variance_ratio"].cumsum()
# 1-based component number derived from the default 0-based row index.
results["component"] = results.index + 1

In [43]:
results

Unnamed: 0,variance_ratio,cumulative,component
0,0.338023,0.338023,1
1,0.199678,0.537701,2
2,0.133415,0.671117,3


### P072 数据降维PCA - 指定方差百分比计算分量数

In [44]:
data_std[:10]

array([[ 1.51861254, -0.5622498 ,  0.23205254, -1.16959318,  1.91390522,
         0.80899739,  1.03481896, -0.65956311,  1.22488398,  0.25171685],
       [ 0.24628963, -0.49941338, -0.82799632, -2.49084714,  0.01814502,
         0.56864766,  0.73362894, -0.82071924, -0.54472099, -0.29332133],
       [ 0.19687903,  0.02123125,  1.10933436, -0.2687382 ,  0.08835836,
         0.80899739,  1.21553297, -0.49840699,  2.13596773,  0.26901965],
       [ 1.69154964, -0.34681064,  0.4879264 , -0.80925118,  0.93091845,
         2.49144552,  1.46652465, -0.98187536,  1.03215473,  1.18606801],
       [ 0.29570023,  0.22769377,  1.84040254,  0.45194578,  1.28198515,
         0.80899739,  0.66335127,  0.22679555,  0.40140444, -0.31927553],
       [ 1.48155459, -0.51736664,  0.30515936, -1.28970717,  0.86070511,
         1.56209322,  1.36612798, -0.17609475,  0.66421706,  0.73186953],
       [ 1.71625494, -0.4186237 ,  0.30515936, -1.46987817, -0.26270834,
         0.32829793,  0.49267693, -0.49840699

In [45]:
# A float n_components between 0 and 1 asks PCA to keep as many components as are needed
# for the cumulative explained variance to exceed that fraction (here 95%). The number
# actually chosen is available afterwards as pca.n_components_.
# This rebinds pca and data_pca, replacing the three-component results from above.
pca = PCA(n_components=0.95)
data_pca = pca.fit_transform(data_std)

In [46]:
pca.n_components_

8

In [47]:
data_pca[:10]

array([[ 2.76934698, -1.30493533, -0.58395762,  0.88718074, -0.05383194,
         0.4175187 , -0.65310465, -0.55835533],
       [ 1.67070079,  1.02905763, -1.81994227,  0.19988232, -0.34183924,
        -0.70990577, -0.82282074,  0.70308247],
       [ 2.10783949, -0.8316145 ,  0.99444906, -0.74968877,  0.52989459,
         0.60452126,  0.11621797, -0.05362278],
       [ 3.44931605, -1.8592707 , -0.38935836, -0.49400668, -0.16692107,
        -0.15436675,  0.06916937,  0.40293675],
       [ 0.87603192, -1.50882534,  1.63982782,  0.41881807, -0.08648593,
        -0.29583018, -0.86454971, -0.03449526],
       [ 2.67119513, -1.29513525, -0.69468732, -0.34175964, -0.58811531,
         0.06150361, -0.65042096,  0.16782641],
       [ 1.66955982, -0.57093005, -1.18079406, -0.5974367 , -0.40172288,
        -0.36573743, -0.31139999, -0.76851304],
       [ 1.13000447, -1.39071555, -0.27443572,  1.01082913, -0.5247762 ,
        -0.92440137, -0.76028542, -0.21809656],
       [ 2.39875256, -0.27899794