# scikit-learn 基礎：前処理→学習→評価（分類）
**到達目標**
- 学習/テストの分割（`train_test_split`）と再現性（乱数シード）を理解する
- `Pipeline` で前処理と学習を一括管理する（列ごとに前処理を変える場合は `ColumnTransformer` を併用。本ノートでは全列が数値のため `Pipeline` のみ使用）
- 交差検証・混同行列・学習曲線でモデルを評価する

> Colab では最初に下の `pip` セルを実行してください。


In [None]:
# Colab 用セットアップ
!pip -q install -U scikit-learn matplotlib numpy pandas tqdm

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Seed NumPy's global RNG for reproducibility; the scikit-learn calls in this
# notebook additionally pass random_state=42 explicitly.
np.random.seed(42)

In [None]:
# Data preparation: load Iris with as_frame=True so the features and the
# target come back as pandas objects, then preview the first rows.
iris = load_iris(as_frame=True)
X, y = iris.data.copy(), iris.target  # copy so edits to X leave iris.data untouched
X.head()

In [None]:
# Train/test split: hold out 20% of the rows as a test set.
# stratify=y keeps the class proportions the same in both splits, and
# random_state=42 fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train.shape, X_test.shape

In [None]:
# LogisticRegression pipeline: median imputation -> standardization -> classifier.
# The preprocessing lives inside the pipeline, so it is fitted together with
# the model on whatever data is passed to fit().
num_proc = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="median")),
        ("scale", StandardScaler()),
    ]
)
clf_lr = Pipeline(
    steps=[
        ("pre", num_proc),
        ("model", LogisticRegression(max_iter=1000, random_state=42)),
    ]
)
clf_lr.fit(X_train, y_train)

# Evaluate on the held-out test set: per-class report, then confusion matrix.
y_pred = clf_lr.predict(X_test)
print(
    classification_report(y_test, y_pred, target_names=iris.target_names)
)
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=iris.target_names,
)
disp.plot(values_format="d")  # "d" renders the cell counts as integers
plt.show()

In [None]:
# 5-fold cross-validation of the LogisticRegression pipeline on the training split.
scores = cross_val_score(clf_lr, X_train, y_train, cv=5)
cv_mean, cv_std = scores.mean(), scores.std()
print("CV mean:", cv_mean, "±", cv_std)

In [None]:
# Learning curve: 5-fold CV scores at 5 training-set sizes, evenly spaced
# from 10% to 100% of the samples available for training.
train_sizes, train_scores, valid_scores = learning_curve(
    clf_lr,
    X_train,
    y_train,
    cv=5,
    train_sizes=np.linspace(0.1, 1.0, 5),
    shuffle=True,
    random_state=42,
)

# Average across axis 1 (the CV folds) to get one score per training size.
train_mean = train_scores.mean(axis=1)
valid_mean = valid_scores.mean(axis=1)

plt.plot(train_sizes, train_mean, "o-", label="train")
plt.plot(train_sizes, valid_mean, "s-", label="cv")
plt.xlabel("Training examples")
plt.ylabel("Score")
plt.title("Learning Curve (LogisticRegression)")
plt.legend()
plt.show()

In [None]:
# Compare with a RandomForest that uses the same preprocessing recipe.
# clone() gives this pipeline its own unfitted copy of num_proc. Passing the
# same instance would make clf_rf.fit() refit the preprocessing object that
# clf_lr also holds, because Pipeline.fit fits its steps in place.
clf_rf = Pipeline(steps=[
    ("pre", clone(num_proc)),
    ("model", RandomForestClassifier(n_estimators=200, random_state=42))
])
clf_rf.fit(X_train, y_train)

# Evaluate on the held-out test set: per-class report, then confusion matrix.
y_pred_rf = clf_rf.predict(X_test)
print(classification_report(y_test, y_pred_rf, target_names=iris.target_names))
cm = confusion_matrix(y_test, y_pred_rf)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=iris.target_names).plot(values_format="d")
plt.show()

In [None]:
# Exercises
# 1) Try several values of max_depth for RandomForest and compare the CV scores
# 2) Visualize feature_importances_ (bar chart)
# 3) Check how the learning curve changes with cv=3, 5, 10
