# 決定木の実装

このノートブックでは、決定木の手動実装とscikit-learnの比較を学習します。


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Global matplotlib defaults: font family and default figure size (inches).
plt.rcParams.update({
    'font.family': 'DejaVu Sans',
    'figure.figsize': (12, 8),
})

# 決定木の手動実装
class DecisionTree:
    """Minimal CART-style decision tree classifier based on Gini impurity.

    The fitted tree is stored in ``self.tree`` as nested dicts. A leaf is
    ``{'leaf': True, 'class': label}``; an internal node is
    ``{'leaf': False, 'feature': index, 'threshold': value, 'left': node,
    'right': node}``, where samples with ``x[feature] <= threshold`` go to
    the left child and all others go to the right child.

    Labels may be any values ``np.unique`` can sort (integers, including
    negative ones, strings, ...).

    Args:
        max_depth: Maximum depth of the tree; ``None`` means no limit.
        min_samples_split: A node holding fewer samples than this becomes
            a leaf.
    """

    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None  # set by fit(); None means "not fitted yet"

    @staticmethod
    def _majority_class(y):
        """Return the most frequent label in ``y``.

        ``np.unique`` returns labels sorted and ``argmax`` picks the first
        maximum, so ties go to the smallest label.

        Raises:
            ValueError: If ``y`` is empty.
        """
        labels, counts = np.unique(y, return_counts=True)
        return labels[counts.argmax()]

    def gini_impurity(self, y):
        """Compute the Gini impurity ``1 - sum(p_k ** 2)`` of the labels ``y``.

        Returns 0 for an empty label set.
        """
        n_samples = len(y)
        if n_samples == 0:
            return 0
        # np.unique (unlike np.bincount) is not limited to non-negative ints.
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / n_samples
        return 1 - np.sum(probabilities ** 2)

    def find_best_split(self, X, y):
        """Search every feature/threshold pair for the lowest weighted Gini.

        Candidate thresholds are the distinct values of each feature column;
        a candidate sends rows with ``value <= threshold`` to the left.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of labels, one per row of ``X``.

        Returns:
            Tuple ``(feature, threshold, gini)``. ``feature`` and
            ``threshold`` are ``None`` (and ``gini`` is ``inf``) when no
            candidate separates the samples into two non-empty groups.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = len(y)

        best_gini = float('inf')
        best_feature = None
        best_threshold = None

        for feature in range(X.shape[1]):
            column = X[:, feature]
            for threshold in np.unique(column):
                left_mask = column <= threshold
                n_left = np.count_nonzero(left_mask)
                n_right = n_samples - n_left

                # A split that leaves one side empty separates nothing.
                if n_left == 0 or n_right == 0:
                    continue

                # Child impurities weighted by the share of samples they hold.
                left_gini = self.gini_impurity(y[left_mask])
                right_gini = self.gini_impurity(y[~left_mask])
                weighted_gini = (n_left * left_gini + n_right * right_gini) / n_samples

                # Strict comparison: the first best candidate wins ties.
                if weighted_gini < best_gini:
                    best_gini = weighted_gini
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold, best_gini

    def build_tree(self, X, y, depth=0):
        """Recursively build the nested-dict tree for the samples ``(X, y)``.

        A leaf predicting the majority class is produced when the depth
        limit is reached, the node is pure, the node has fewer than
        ``min_samples_split`` samples, or no valid split exists.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of labels.
            depth: Depth of the node being built (root is 0).

        Returns:
            The node dict described in the class docstring.
        """
        X = np.asarray(X)
        y = np.asarray(y)

        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if depth_exhausted or len(np.unique(y)) == 1 or len(y) < self.min_samples_split:
            return {'leaf': True, 'class': self._majority_class(y)}

        feature, threshold, _ = self.find_best_split(X, y)
        if feature is None:
            # E.g. all rows are identical: nothing can be separated.
            return {'leaf': True, 'class': self._majority_class(y)}

        left_mask = X[:, feature] <= threshold
        right_mask = ~left_mask

        return {
            'leaf': False,
            'feature': feature,
            'threshold': threshold,
            'left': self.build_tree(X[left_mask], y[left_mask], depth + 1),
            'right': self.build_tree(X[right_mask], y[right_mask], depth + 1)
        }

    def fit(self, X, y):
        """Train the tree on features ``X`` and labels ``y``.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of labels, one per row of ``X``.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            ValueError: If the dataset is empty or ``X`` and ``y`` have
                different numbers of samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if len(y) == 0:
            raise ValueError("Cannot fit a decision tree on an empty dataset")
        if len(X) != len(y):
            raise ValueError(
                f"X and y have inconsistent lengths: {len(X)} != {len(y)}"
            )
        self.tree = self.build_tree(X, y)
        return self

    def predict_single(self, x, tree):
        """Predict the label of one sample by walking ``tree`` down to a leaf.

        Args:
            x: Indexable feature vector of a single sample.
            tree: Node dict to start from (normally ``self.tree``).
        """
        node = tree
        # Iterative descent: no recursion-depth limit for very deep trees.
        while not node['leaf']:
            if x[node['feature']] <= node['threshold']:
                node = node['left']
            else:
                node = node['right']
        return node['class']

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Returns:
            NumPy array with one predicted label per row.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.tree is None:
            raise RuntimeError("DecisionTree must be fitted before calling predict()")
        return np.array([self.predict_single(x, self.tree) for x in X])

# Data preparation: load Iris and hold out 20% as a test set (fixed seed).
iris = load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train the hand-written tree and scikit-learn's tree with the same depth limit.
tree_manual = DecisionTree(max_depth=3).fit(X_train, y_train)
tree_sklearn = DecisionTreeClassifier(max_depth=3, random_state=42).fit(X_train, y_train)

# Predict on the held-out samples with both models.
y_pred_manual = tree_manual.predict(X_test)
y_pred_sklearn = tree_sklearn.predict(X_test)

# Compare test-set accuracy.
accuracy_manual = accuracy_score(y_test, y_pred_manual)
accuracy_sklearn = accuracy_score(y_test, y_pred_sklearn)

print("=== 決定木の実装結果 ===")
print(f"手動実装の精度: {accuracy_manual:.4f}")
print(f"scikit-learnの精度: {accuracy_sklearn:.4f}")
print(f"精度の差: {abs(accuracy_manual - accuracy_sklearn):.4f}")

# Visualization: three side-by-side scatter plots over the first two feature
# columns of the test set, coloured by each labelling in turn.
plt.figure(figsize=(15, 5))

panels = (
    ('Manual Decision Tree', y_pred_manual),         # hand-written tree
    ('scikit-learn Decision Tree', y_pred_sklearn),  # library tree
    ('True Labels', y_test),                         # ground truth
)
for position, (panel_title, point_colors) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    plt.scatter(X_test[:, 0], X_test[:, 1], c=point_colors, cmap='viridis', alpha=0.7)
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title(panel_title)

plt.tight_layout()
plt.show()

# Classification report for each model, printed under its own heading.
print(f"\n=== 分類レポート ===")
for heading, predictions in (("手動実装:", y_pred_manual), ("scikit-learn:", y_pred_sklearn)):
    print(heading)
    print(classification_report(y_test, predictions))
