In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
from IPython.display import Image
warnings.filterwarnings('ignore')

In [2]:
from collections import Counter
class ScratchDecesionTreeClassifierDepth1():
    """
    Scratch implementation of a depth-1 decision tree classifier (a single split).

    Parameters
    ----------
    verbose : bool
      If True, print the learning progress during ``fit``.

    Attributes
    ----------
    column_index : int
      Index of the feature used by the split (set by ``fit``).
    threshold : scalar
      Split value; samples with ``feature < threshold`` go to the left leaf (set by ``fit``).
    left_leaf_label, right_leaf_label
      Labels predicted by the left and right leaves (set by ``fit``).
    """

    def __init__(self, verbose=False):
        # Record hyperparameters as attributes
        self.verbose = verbose
        self.name = "ScratchDecesionTreeClassifierDepth1"

    def fit(self, X, y):
        """
        Train the classifier by searching every (feature, threshold) pair for the
        split with the largest information gain.

        Candidate thresholds are the distinct values observed in each column. The
        gain of a candidate is computed by the module-level ``IG`` function, which
        is looked up when ``fit`` runs. NaN feature values are not supported.

        If ``X`` has no rows or no columns, no candidate is evaluated and the
        split attributes are left unset.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Features of the training data
        y : ndarray, shape (n_samples, )
            Target labels of the training data
        """
        best_gain = 0
        best_split = None
        for column_index in range(X.shape[1]):
            for threshold in set(X[:, column_index]):
                gain = IG(X, y, column_index, threshold)
                # ``<=`` rather than ``<``: on ties the most recently visited split wins.
                if best_gain <= gain:
                    best_gain = gain
                    best_split = (column_index, threshold)
                    if self.verbose:
                        # With verbose=True, report every split accepted as the new best
                        print("カラム {}, 閾値 {}の時: {}".format(column_index,  threshold, best_gain))

        if best_split is None:
            return

        # The leaf labels depend only on the winning split, so derive them once here
        # instead of on every improvement inside the search loop.
        column_index, threshold = best_split
        feature = X[:, column_index]
        left_leaf_label, right_leaf_label = self._leaf_labels(
            y[feature < threshold], y[feature >= threshold]
        )

        self.left_leaf_label = left_leaf_label
        self.right_leaf_label = right_leaf_label
        self.column_index = column_index
        self.threshold = threshold

    @staticmethod
    def _leaf_labels(under_threshold_y, over_threshold_y):
        """Return the (left, right) leaf labels for the two sides of a split.

        A populated side predicts its most common label. An empty side predicts
        the second most common label of the other side, or the most common one
        when only a single class is present (indexing ``most_common()[1]``
        unconditionally would raise IndexError in that case).
        """
        if len(under_threshold_y) == 0:
            # No sample falls below the threshold
            ranked = Counter(over_threshold_y).most_common()
            right_leaf_label = ranked[0][0]
            left_leaf_label = ranked[1][0] if len(ranked) > 1 else right_leaf_label
        elif len(over_threshold_y) == 0:
            # No sample is at or above the threshold
            ranked = Counter(under_threshold_y).most_common()
            left_leaf_label = ranked[0][0]
            right_leaf_label = ranked[1][0] if len(ranked) > 1 else left_leaf_label
        else:
            left_leaf_label = Counter(under_threshold_y).most_common()[0][0]
            right_leaf_label = Counter(over_threshold_y).most_common()[0][0]
        return left_leaf_label, right_leaf_label

    def predict(self, X):
        """
        Estimate labels with the fitted split.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Samples to classify

        Returns
        -------
        ndarray, shape (n_samples, )
            ``left_leaf_label`` where the split feature is below ``threshold``,
            ``right_leaf_label`` otherwise.
        """
        return np.where(X[:, self.column_index] < self.threshold, self.left_leaf_label,  self.right_leaf_label)

# Problem 1: Gini impurity of a node
def GINI(y):
    """
    Compute the Gini impurity of a set of labels.

    Parameters
    ----------
    y : array-like, shape (n_samples, )
        Class labels of the samples in the node

    Returns
    -------
    float
        ``1 - sum(p_k ** 2)`` over the class proportions ``p_k``.
        An empty ``y`` yields 0.0, since a node without samples has no impurity.
    """
    _, counts = np.unique(y, return_counts=True)
    n_samples = counts.sum()
    if n_samples == 0:
        # Without this guard the sum over zero classes is 0 and the result would be 1.0.
        return 0.0
    proportions = counts / n_samples
    return 1 - (proportions ** 2).sum()

# Problem 2: information gain of a split
def IG(X, y, column_index, threshold):
    """
    Compute the information gain obtained by splitting one feature at a threshold.

    Samples whose feature value is below ``threshold`` form the left child, and
    samples whose value is at or above it form the right child. The gain is the
    parent's Gini impurity minus the size-weighted impurities of both children.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Feature matrix
    y : ndarray, shape (n_samples, )
        Labels aligned with the rows of ``X``
    column_index : int
        Column of ``X`` to split on
    threshold : scalar
        Split value

    Returns
    -------
    float
        Information gain of the split
    """
    feature = X[:, column_index]
    left_y = y[feature < threshold]
    right_y = y[feature >= threshold]

    parent_impurity = GINI(y)
    left_impurity = GINI(left_y)
    right_impurity = GINI(right_y)

    # Each child's impurity is weighted by its share of the parent's samples.
    total = len(y)
    left_weight = len(left_y) / total
    right_weight = len(right_y) / total
    return parent_impurity - left_weight * left_impurity - right_weight * right_impurity

In [3]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
# Load iris and keep only the first 100 samples for a two-class problem
# (presumably classes 0 and 1 only -- verify against iris.target).
iris = load_iris()
X = iris.data[:100,:]
y = iris.target[:100]
# Hold out 20% for testing. The seed is fixed so that re-running the notebook
# from a fresh kernel reproduces the same split and the same results.
(X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size=0.2, random_state=0)

In [4]:
# Fit the depth-1 tree on the training split. With verbose=True, fit prints the
# column, threshold and information gain of each split it accepts as the new best.
dtc = ScratchDecesionTreeClassifierDepth1(verbose=True)
dtc.fit(X_train, y_train)

カラム 0, 閾値 4.8の時: 0.06790178571428573
カラム 0, 閾値 5.8の時: 0.15384407788539148
カラム 0, 閾値 5.1の時: 0.18643200549450556
カラム 0, 閾値 5.5の時: 0.36169252828409804
カラム 2, 閾値 3.9の時: 0.3688364361702129
カラム 2, 閾値 1.7の時: 0.3888920454545456
カラム 2, 閾値 3.7の時: 0.408576388888889
カラム 2, 閾値 3.5の時: 0.4520130813953489
カラム 2, 閾値 3.3の時: 0.49968750000000006
カラム 3, 閾値 1.0の時: 0.49968750000000006


In [5]:
# Predict labels for the held-out test features with the fitted tree.
pred_y = dtc.predict(X_test)

In [6]:
from matplotlib.colors import ListedColormap
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

def evaluate_model_and_display_plot(X, y_test, model):
    """
    Plot the decision regions of a fitted binary classifier over the first two
    features of ``X`` and return its accuracy, precision and recall.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Features of the evaluation samples. Only columns 0 and 1 are plotted.
        NOTE(review): the mesh passed to ``model.predict`` has exactly two
        columns, so the model presumably has to be trained on those two
        features only -- confirm at the call site.
    y_test : ndarray, shape (n_samples, )
        True labels of the samples in ``X`` (0 or 1)
    model : object
        Fitted classifier exposing ``predict(X)`` and a ``name`` attribute

    Returns
    -------
    pandas.DataFrame
        One column named ``model.name`` with rows Accuracy, Precision and Recall.
    """
    # Grid with step 0.01 spanning the observed range of the two plotted features
    mesh1, mesh2 = np.meshgrid(
        np.arange(np.min(X[:,0]), np.max(X[:,0]), 0.01),
        np.arange(np.min(X[:,1]), np.max(X[:,1]), 0.01)
    )
    mesh = np.c_[np.ravel(mesh1), np.ravel(mesh2)]
    # Use the model passed in as an argument, not a global leaked from another cell
    mesh_pred = model.predict(mesh).reshape(mesh1.shape)
    plt.xlabel('f0')
    plt.ylabel('f1')
    # contourf: filled decision regions
    plt.contourf(mesh1, mesh2, mesh_pred, cmap=ListedColormap(['pink', 'skyblue']))
    # contour: decision boundary line
    plt.contour(mesh1, mesh2, mesh_pred, colors='red')
    # NOTE(review): the legend names are hard-coded; confirm they match what
    # labels 0 and 1 mean in the data passed in.
    plt.scatter(X[y_test==0][:, 0], X[y_test==0][:, 1], label='versicolor')
    plt.scatter(X[y_test==1][:, 0], X[y_test==1][:, 1], label='virginica')
    plt.legend()
    plt.show()

    # Metrics need one prediction per evaluation sample, not the mesh grid
    y_pred = model.predict(X)
    result = pd.DataFrame(
        [accuracy_score(y_test, y_pred), precision_score(y_test, y_pred), recall_score(y_test, y_pred)],
        index=['Accuracy', 'Precision', 'Recall'],
        columns=[model.name]
     )
    return result