<a href="https://colab.research.google.com/github/ymuto0302/RW2025/blob/main/holdout_vs_CV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ホールドアウト法と交差検証法の比較実験
以下では Wine Dataset の場合のコードのみを示している。

### ライブラリのインポート

In [1]:
# 必要なライブラリのインポート
import numpy as np
from sklearn.datasets import load_wine, load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score
import time

### データセットの読み込み

In [2]:
# Load the Wine dataset, then pull out its data array (X) and its
# target array (y) for use by the experiments below.
wine = load_wine()
X = wine.data
y = wine.target

### ホールドアウト法：訓練・テストの分割比率が与える影響

In [None]:
# Hold-out experiment: vary the fraction of data held out for testing and
# report how the accuracy estimate (mean and spread) changes with it.
for test_size in (0.2, 0.3, 0.4, 0.5):
    train_ratio = 1 - test_size
    print("-----------------")
    print(f"訓練:テスト = {train_ratio:.1f}:{test_size:.1f}")

    # Repeat the split with 10 different seeds so that the standard
    # deviation of the accuracy across splits can be reported.
    accuracies = []  # one accuracy per split seed
    for random_state in range(10):
        split = train_test_split(
            X,
            y,
            test_size=test_size,
            random_state=random_state,
            stratify=y,
        )
        X_train, X_test, y_train, y_test = split

        # The classifier's own seed is fixed at 42, so the only thing that
        # changes between repetitions is the train/test split.
        model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)

    mean_accuracy = np.mean(accuracies)
    std_accuracy = np.std(accuracies)
    print(f"正解率(平均±標準偏差): {mean_accuracy:.3f} ± {std_accuracy:.3f}")

### ホールドアウト法：訓練・テストへの分割の際の乱数シードが与える影響

In [None]:
# Hold-out experiment with a fixed test fraction of 0.3: only the seed used
# for the train/test split changes, showing how much a single split can
# move the measured accuracy.
accuracies = []  # one accuracy per split seed

for random_state in range(20):
    split = train_test_split(
        X,
        y,
        test_size=0.3,
        random_state=random_state,
        stratify=y,
    )
    X_train, X_test, y_train, y_test = split

    # The classifier's own seed is fixed at 42, so differences between
    # iterations come from the split alone.
    model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

# Summarise the 20 runs: mean, standard deviation, and the extremes.
accuracy_array = np.asarray(accuracies)
print(f"正解率(平均±標準偏差): {accuracy_array.mean():.3f} ± {accuracy_array.std():.3f}")
print(f"正解率の最小値: {accuracy_array.min():.3f}")
print(f"正解率の最大値: {accuracy_array.max():.3f}")


### 交差検証法：fold数が与える影響

In [None]:
# Cross-validation experiment: compare how the number of folds affects the
# accuracy estimate and the time needed to compute it.
model = DecisionTreeClassifier(random_state=42)
folds = [3, 5, 10, 20]

for k in folds:
    # Shuffle with a fixed seed so the fold assignment is reproducible.
    cv = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
    cv_name = f"{k}-Fold"

    # Use perf_counter() rather than time(): it is a monotonic,
    # high-resolution clock meant for measuring short intervals, whereas
    # time() follows the system clock and can jump if that is adjusted.
    start_time = time.perf_counter()  # start of the timed section
    scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    end_time = time.perf_counter()  # end of the timed section

    print("------------------------------")
    print(f"{cv_name} CV:")
    print(f"正解率(平均±標準偏差): {scores.mean():.3f} ± {scores.std():.3f}")
    print(f"実行時間: {end_time - start_time:.3f} sec")