# 決定木の剪定（Pruning）

このノートブックでは、決定木の剪定について学習します。


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score

# Global plot defaults: font family and default figure size.
plt.rcParams.update({
    'font.family': 'DejaVu Sans',
    'figure.figsize': (12, 8),
})

# Data preparation: load iris and hold out 20% of the samples as a test set.
# The fixed random_state keeps the split reproducible across runs.
iris = load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Decision-tree performance at different depths.
# For every max_depth in 1..10 record three accuracies:
#   - on the training split,
#   - on the held-out test split,
#   - the mean of 5-fold cross-validation run on the training split.
depths = range(1, 11)
train_scores, test_scores, cv_scores = [], [], []

for depth in depths:
    tree = DecisionTreeClassifier(max_depth=depth, random_state=42).fit(X_train, y_train)

    # Accuracy on the data the tree was fitted on.
    train_scores.append(tree.score(X_train, y_train))

    # Accuracy on the held-out test data.
    test_scores.append(tree.score(X_test, y_test))

    # Cross-validation: average the five per-fold scores.
    cv_scores.append(cross_val_score(tree, X_train, y_train, cv=5).mean())

# Visualization: three side-by-side panels summarizing the depth sweep.
fig, (ax_perf, ax_gap, ax_cv) = plt.subplots(1, 3, figsize=(15, 5))

# Panel 1 - performance comparison: training, test and CV accuracy per depth.
for scores, curve_label in (
    (train_scores, 'Training'),
    (test_scores, 'Test'),
    (cv_scores, 'CV'),
):
    ax_perf.plot(depths, scores, 'o-', label=curve_label)
ax_perf.set_xlabel('Tree Depth')
ax_perf.set_ylabel('Accuracy')
ax_perf.set_title('Performance vs Tree Depth')
ax_perf.legend()
ax_perf.grid(True, alpha=0.3)

# Panel 2 - overfitting: gap between training and test accuracy per depth.
# `overfitting` is kept as a module-level array because later reporting
# code indexes into it.
overfitting = np.array(train_scores) - np.array(test_scores)
ax_gap.plot(depths, overfitting, 'o-', color='red')
ax_gap.set_xlabel('Tree Depth')
ax_gap.set_ylabel('Overfitting (Train - Test)')
ax_gap.set_title('Overfitting vs Tree Depth')
ax_gap.grid(True, alpha=0.3)

# Panel 3 - optimal depth: CV curve with the best depth marked.
# The depth is looked up in `depths` by index instead of assuming that
# index + 1 equals the depth, so the marker stays correct if the sweep
# range changes. np.argmax returns the first maximum, so ties resolve to
# the earliest entry in `depths`.
best_depth = depths[int(np.argmax(cv_scores))]
ax_cv.plot(depths, cv_scores, 'o-', color='green')
ax_cv.axvline(x=best_depth, color='red', linestyle='--', label=f'Optimal Depth: {best_depth}')
ax_cv.set_xlabel('Tree Depth')
ax_cv.set_ylabel('CV Accuracy')
ax_cv.set_title('Optimal Depth Selection')
ax_cv.legend()
ax_cv.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

# Decision tree at the optimal depth.
# Select the sweep entry with the highest mean CV accuracy. The position in
# the score lists (best_idx) and the depth value (optimal_depth) are kept
# separate so the lookups below do not rely on `depths` starting at 1.
best_idx = int(np.argmax(cv_scores))
optimal_depth = depths[best_idx]
optimal_tree = DecisionTreeClassifier(max_depth=optimal_depth, random_state=42)
optimal_tree.fit(X_train, y_train)

# Report the scores recorded for that depth during the sweep.
print("=== 決定木の剪定結果 ===")
print(f"最適な深さ: {optimal_depth}")
print(f"訓練精度: {train_scores[best_idx]:.4f}")
print(f"テスト精度: {test_scores[best_idx]:.4f}")
print(f"CV精度: {cv_scores[best_idx]:.4f}")
print(f"過学習度: {overfitting[best_idx]:.4f}")

# Comparison before and after pruning.
# Baseline: a tree with no depth limit, fitted on the same training split.
unpruned_tree = DecisionTreeClassifier(random_state=42)
unpruned_tree.fit(X_train, y_train)

print("\n=== 剪定前後の比較 ===")

# Print the same three-line report for both models. Each split is scored
# once per model and the values are reused for the train-minus-test gap.
for heading, model in (
    ("剪定前（深さ制限なし）:", unpruned_tree),
    (f"剪定後（深さ{optimal_depth}）:", optimal_tree),
):
    train_acc = model.score(X_train, y_train)
    test_acc = model.score(X_test, y_test)
    print(heading)
    print(f"  訓練精度: {train_acc:.4f}")
    print(f"  テスト精度: {test_acc:.4f}")
    print(f"  過学習度: {train_acc - test_acc:.4f}")
