# PCAで分類器学習

このノートブックでは、Irisデータセットを標準化したうえでPCAにより1〜4次元へ次元削減し、各次元数でロジスティック回帰を学習してテスト精度を比較します。


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Global matplotlib defaults applied to the figures created later on.
plt.rcParams.update({
    'font.family': 'DejaVu Sans',
    'figure.figsize': (12, 8),
})

# Load the Iris dataset: X is the feature matrix, y the class labels.
iris = load_iris()
X = iris.data
y = iris.target

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features. The scaler is fitted on the training split only
# and then applied unchanged to both splits, so no test-set statistics leak
# into preprocessing.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Sweep over the number of retained principal components and record the test
# accuracy of a logistic-regression classifier for each setting.
n_components_list = [1, 2, 3, 4]
accuracies = []

for n_components in n_components_list:
    # Fit PCA on the training data and project both splits onto it.
    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)

    # Train the classifier on the reduced training data.
    clf = LogisticRegression(random_state=42, max_iter=1000).fit(
        X_train_pca, y_train
    )

    # Predict on the held-out data and store the resulting accuracy.
    y_pred = clf.predict(X_test_pca)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

    # Report the cumulative explained-variance ratio alongside the accuracy.
    cumulative_ratio = pca.explained_variance_ratio_.sum()
    print(
        f"=== {n_components}成分でのPCA ===",
        f"累積寄与率: {cumulative_ratio:.4f}",
        f"精度: {accuracy:.4f}",
        sep="\n",
    )

# Visualization: a single figure with three side-by-side panels.
plt.figure(figsize=(15, 5))

# Panel 1: test accuracy as a function of the number of PCA components.
plt.subplot(1, 3, 1)
plt.plot(n_components_list, accuracies, 'o-')
# Component counts are integers, so place ticks exactly on the tested values.
plt.xticks(n_components_list)
plt.xlabel('Number of Components')
plt.ylabel('Accuracy')
plt.title('Accuracy vs Number of Components')
plt.grid(True, alpha=0.3)

# Panel 2: test samples projected onto the first two principal components.
pca_2d = PCA(n_components=2)
X_train_2d = pca_2d.fit_transform(X_train_scaled)
X_test_2d = pca_2d.transform(X_test_scaled)

# A classifier is fitted on the 2D projection. Its predictions are computed
# but not used by the plot below; the points are colored by the true labels.
clf_2d = LogisticRegression(random_state=42, max_iter=1000)
clf_2d.fit(X_train_2d, y_train)
y_pred_2d = clf_2d.predict(X_test_2d)

plt.subplot(1, 3, 2)
plt.scatter(X_test_2d[:, 0], X_test_2d[:, 1], c=y_test, cmap='viridis', alpha=0.7)
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.title('2D PCA Classification')

# Panel 3: test samples projected onto the first three principal components.
pca_3d = PCA(n_components=3)
X_train_3d = pca_3d.fit_transform(X_train_scaled)
X_test_3d = pca_3d.transform(X_test_scaled)

# As in the 2D case, the predictions are computed but not drawn.
clf_3d = LogisticRegression(random_state=42, max_iter=1000)
clf_3d.fit(X_train_3d, y_train)
y_pred_3d = clf_3d.predict(X_test_3d)

# Request the 3D projection when creating the third subplot. Calling
# plt.axes(projection='3d') after plt.subplot(1, 3, 3) would instead add a new
# axes spanning the whole figure, drawn on top of the other two panels.
ax = plt.subplot(1, 3, 3, projection='3d')
ax.scatter(X_test_3d[:, 0], X_test_3d[:, 1], X_test_3d[:, 2], c=y_test, cmap='viridis')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')
ax.set_title('3D PCA Classification')

plt.tight_layout()
plt.show()

# Final summary of the recorded test accuracies, from the highest
# dimensionality down to a single component. `accuracies` is ordered by
# component count 1..4, so index 3 is the 4-component PCA run.
print("\n=== 分類結果の比較 ===")
summary_rows = (
    ("元データ（4次元）", accuracies[3]),
    ("3次元", accuracies[2]),
    ("2次元", accuracies[1]),
    ("1次元", accuracies[0]),
)
for label, acc in summary_rows:
    print(f"{label}: 精度={acc:.4f}")
