# PCA

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

## PCA from scratch

In [2]:
# Toy dataset: seven observations of two features.
x = np.array([3, 4, 2, 3, 4, 4, 3])
y = np.array([2, 3, 5, 5, 3, 2, 3])
# Stack the two feature vectors side by side and label the columns.
X = pd.DataFrame(np.column_stack([x, y]), columns=['x', 'y'])

In [3]:
def my_pca(X, num_components):
    """Project ``X`` onto its leading principal components.

    The columns of ``X`` are standardized with ``StandardScaler``, the
    covariance matrix of the standardized data is eigendecomposed, and the
    standardized data is projected onto the eigenvectors that have the
    largest eigenvalues.

    Parameters
    ----------
    X : array-like
        Input data with one row per sample and one column per feature;
        anything ``StandardScaler.fit_transform`` accepts.
    num_components : int
        Number of leading eigenvectors (principal axes) to keep.

    Returns
    -------
    numpy.ndarray
        The standardized data projected onto the selected eigenvectors,
        one column per retained component.

    Notes
    -----
    Eigenvector signs are used exactly as returned by ``np.linalg.eigh``;
    no sign normalization is applied here.
    """
    # 1. Standardize each feature.
    X_scaled = StandardScaler().fit_transform(X)

    # 2. Covariance matrix of the standardized features (columns are
    #    variables). np.cov collapses the result to a 0-d array when there
    #    is only one feature, which np.linalg.eigh rejects, so force 2-D.
    cov_mat = np.atleast_2d(np.cov(X_scaled, rowvar=False))

    # 3. Eigendecomposition. eigh returns eigenvalues in ascending order,
    #    so reverse the ordering to put the largest-variance axes first.
    eigen_values, eigen_vectors = np.linalg.eigh(cov_mat)
    sorted_index = np.argsort(eigen_values)[::-1]
    sorted_vectors = eigen_vectors[:, sorted_index]
    subset_vectors = sorted_vectors[:, :num_components]

    # 4. Project the standardized data onto the retained axes.
    return np.dot(X_scaled, subset_vectors)

In [4]:
# Reduce the sample data to its first principal component.
my_pca(X, num_components=1)

array([[-0.49467432],
       [-0.89576549],
       [ 2.34350404],
       [ 1.33314107],
       [-0.89576549],
       [-1.50503729],
       [ 0.11459748]])

## PCA from sklearn

In [5]:
# Standardize: learn the scaling parameters from X, then apply them to X.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Reduce the standardized data to one principal component with scikit-learn.
pca = PCA(n_components=1)
pca.fit_transform(X_scaled)

array([[-0.49467432],
       [-0.89576549],
       [ 2.34350404],
       [ 1.33314107],
       [-0.89576549],
       [-1.50503729],
       [ 0.11459748]])

## PCA on the iris dataset

In [6]:
# Load the iris dataset.
df = sns.load_dataset('iris')

# Split into the feature matrix (every column except the target) and the
# target labels.
y_col = 'species'
X = df.drop(columns=[y_col])
# to_numpy() always yields a NumPy ndarray; .values may instead return a
# pandas extension array depending on the column dtype.
y = df[y_col].to_numpy()

# Standardize the features.
X_scaled = StandardScaler().fit_transform(X)

# Keep the first two principal components.
X_pc = PCA(n_components=2).fit_transform(X_scaled)

In [None]:
# Combine the principal components with the target labels.
# The frame is built from the numeric components first and the labels are
# attached as a separate column: concatenating both into a single NumPy
# array forces one common dtype, so the components would no longer be
# stored as float columns.
df_pc = pd.DataFrame(
    X_pc, columns=['first_component', 'second_component']
).assign(species=y)

# Scatter plot of the two components, colored by species.
sns.scatterplot(x='first_component', y='second_component', hue='species', data=df_pc)

<matplotlib.axes._subplots.AxesSubplot at 0x7effc302f550>