In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from sklearn import datasets

In [4]:
import numpy as np

In [5]:
iris = datasets.load_iris()

In [6]:
X = iris.data
y = iris.target

In [7]:
from tree_modules.tree_base import gini

## フィット過程を記録するためにNodeクラスを定義する
- 記録が必要な情報
    - 中間ノードの場合
        - ノード番号
        - どの特徴量で分割するか
        - 分割の閾値
        - 子ノードの番号
    - 葉ノードの場合
        - ノード番号
        - 分類後のクラス番号
    
- 中間ノードと終端ノード（葉ノード）を別々に実装
    - 基本ノードクラスを作成し，継承させる。
        - ノード番号と，念の為木の深さdepthも入れておく。

In [8]:
class node_basis():
    
    def __init__(self, i_node, depth):
        
        self.i_node = i_node
        self.depth = depth

In [9]:
class node_internal(node_basis):
    
    def __init__(self, i_node, depth, i_feature, threshold):
        
        super().__init__(i_node, depth)
        self.i_feature = i_feature
        self.threshold = threshold
        
        self.node_child = {0:None, 1:None}
        

    def set_node_child(self, lr, i):
        self.node_child[lr] = i


In [10]:
class node_leaf(node_basis):
    
    def __init__(self, i_node, depth, k):
        
        super().__init__(i_node, depth)
        self.k_decided = k

## go_on_dividing関数に付け加える

In [11]:
def find_optimal_division(x, y):
    list_gini = []
    x_unique = np.unique(x)

    for threshold in x_unique:

        mask_divide = x > threshold
        y_upper = y[mask_divide]
        y_lower = y[~mask_divide]

        gini_divide = (gini(y_upper) * len(y_upper) + gini(y_lower) * len(y_lower)) / len(y)

        list_gini.append(gini_divide)
        
    array_gini = np.array(list_gini)
    i_div_opt = np.argmin(array_gini)
    
    return x_unique[i_div_opt], array_gini[i_div_opt]

In [17]:
def divide_tree(X, y):

    results = np.apply_along_axis(find_optimal_division, 0, X, y)

    arg_div = np.argmin(results[1])
    x_div = results[0, arg_div]

    return arg_div, x_div

In [21]:
def go_on_dividing(X, y, depth=0,
                   threshold_gini=0.05, min_node_size=5, max_depth=3):
    
    global i_node, dict_nodes
    depth += 1

    arg_div, x_div = divide_tree(X, y)
    node_current = node_internal(i_node, depth, arg_div, x_div)
    dict_nodes[i_node] = node_current

    print("=== node {} (depth {}): arg_div -> {}, x_div -> {} ===".format(i_node, depth, arg_div, x_div))

    mask = X[:, arg_div] > x_div
    X_right, X_left = X[mask], X[~mask]
    y_right, y_left = y[mask], y[~mask]

    gini_left = gini(y_left)
    gini_right = gini(y_right)

    list_divided = [(X_left, y_left, gini_left), (X_right, y_right, gini_right)]

    for lr, divided in enumerate(list_divided):
        i_node +=1

        X_i, y_i, gini_i = divided
        if gini_i > threshold_gini and len(y_i)>min_node_size and depth+1 <= max_depth:

            node_current.set_node_child(lr, i_node)
            go_on_dividing(X_i, y_i, depth=depth)
        else:
            node_current.set_node_child(lr, i_node)
            feature_majority = np.bincount(np.array(y_i)).argmax()

            node_terminal = node_leaf(i_node, depth, feature_majority)
            dict_nodes[i_node] = node_terminal

In [22]:
i_node=0
dict_nodes = {}
go_on_dividing(X, y)

=== node 0 (depth 1): arg_div -> 2, x_div -> 1.9 ===
=== node 2 (depth 2): arg_div -> 3, x_div -> 1.7 ===
=== node 3 (depth 3): arg_div -> 2, x_div -> 4.9 ===


In [26]:
dict_nodes

{0: <__main__.node_internal at 0x1222dcfd0>,
 1: <__main__.node_leaf at 0x1222dccf8>,
 2: <__main__.node_internal at 0x1222dc9b0>,
 3: <__main__.node_internal at 0x1222dc908>,
 4: <__main__.node_leaf at 0x1222dcc88>,
 5: <__main__.node_leaf at 0x1222dc828>,
 6: <__main__.node_leaf at 0x1222dceb8>}

In [31]:
dict_nodes[2].i_feature, dict_nodes[2].threshold

(3, 1.7)

In [30]:
dict_nodes[2].node_child

{0: 3, 1: 6}

In [50]:
dict_nodes[1].k_decided

0

## フィットで作成したnode情報から予測を行う

#### まずは適当な特徴ベクトルをサンプルに

In [32]:
x_sample = X[0]

In [33]:
x_sample

array([5.1, 3.5, 1.4, 0.2])

In [35]:
dict_nodes[0].i_feature

2

In [36]:
dict_nodes[0].threshold

1.9

In [37]:
dict_nodes[0].node_child

{0: 1, 1: 2}

In [40]:
node_current = dict_nodes[0]

lr = int(x_sample[node_current.i_feature] > node_current.threshold)
node_next = dict_nodes[node_current.node_child[lr]]

In [41]:
lr

0

In [42]:
if node_next.__class__.__name__ == 'node_leaf':
    print(node_next.k_decided)

0


#### 関数にまとめる

In [43]:
def pred_at_node(x, node):
    lr = int(x[node.i_feature] > node.threshold)
    i_node_next = node.node_child[lr]
    
    return i_node_next

In [44]:
def pred_each_vector(x, dict_nodes):
    node_current = dict_nodes[0]
    while True:
        node_next = dict_nodes[pred_at_node(x, node_current)]
        if node_next.__class__.__name__ == 'node_leaf':
            return node_next.k_decided
        else:
            node_current = node_next

In [45]:
pred_each_vector(x_sample, dict_nodes)

0

#### 全データに適用してみる

In [51]:
y_pred = np.apply_along_axis(func1d=pred_each_vector, axis=1, arr=X, dict_nodes=dict_nodes)

In [52]:
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [48]:
# 精度。フィットに用いたのと同じデータで計算しているから，高くて当たり前。
(y_pred == y).sum() / len(y)

0.9733333333333334

## MyTreeクラスとして整える

In [156]:
class MyTree():
    
    
    def __init__(self, threshold_gini=0.05, min_node_size=5, max_depth=3):
        
        self.threshold_gini, self.min_node_size, self.max_depth = threshold_gini, min_node_size, max_depth
        self.i_node = None
        self.dict_nodes = None
    
    
    def _find_optimal_division(self, x, y):
        list_gini = []
        x_unique = np.unique(x)

        for threshold in x_unique:

            mask_divide = x > threshold
            y_right = y[mask_divide]
            y_left = y[~mask_divide]

            gini_divide = (gini(y_right) * len(y_right) + gini(y_left) * len(y_left)) / len(y)

            list_gini.append(gini_divide)

        array_gini = np.array(list_gini)
        i_div_opt = np.argmin(array_gini)

        return x_unique[i_div_opt], array_gini[i_div_opt]


    def _divide(self, X, y):

        results = np.apply_along_axis(self._find_optimal_division, 0, X, y)

        arg_div = np.argmin(results[1])
        x_div = results[0, arg_div]

        return arg_div, x_div


    def _go_on_dividing(self, X, y, depth=0):

        depth += 1

        arg_div, x_div = self._divide(X, y)
        node_current = node_internal(self.i_node, depth, arg_div, x_div)
        self.dict_nodes[self.i_node] = node_current

        print("=== node {} (depth {}): arg_div -> {}, x_div -> {} ===".format(self.i_node, depth, arg_div, x_div))

        mask = X[:, arg_div] > x_div
        X_right, X_left = X[mask], X[~mask]
        y_right, y_left = y[mask], y[~mask]

        gini_left = gini(y_left)
        gini_right = gini(y_right)

        list_divided = [(X_left, y_left, gini_left), (X_right, y_right, gini_right)]

        for lr, divided in enumerate(list_divided):
            self.i_node +=1

            X_i, y_i, gini_i = divided
            if gini_i > self.threshold_gini and len(y_i)>self.min_node_size and depth+1 <= self.max_depth:
                
                node_current.set_node_child(lr, self.i_node)
                self._go_on_dividing(X_i, y_i, depth=depth)
            else:
                node_current.set_node_child(lr, self.i_node)
                feature_majority = np.bincount(np.array(y_i)).argmax()
                
                node_terminal = node_leaf(self.i_node, depth, feature_majority)
                self.dict_nodes[self.i_node] = node_terminal
                

    def fit(self, X, y):
        
        self.i_node = 0
        self.dict_nodes = {}
        
        self._go_on_dividing(X, y)


    def _pred_each_vector(self, x):
        
        node_current = self.dict_nodes[0]
        while True:
            lr = int(x[node_current.i_feature] > node_current.threshold)
            node_next = self.dict_nodes[node_current.node_child[lr]]
            
            if node_next.__class__.__name__ == 'node_leaf':
                return node_next.k_decided
            else:
                node_current = node_next
    
    
    def predict(self, X):
        
        return np.apply_along_axis(self._pred_each_vector, 1, X)

#### 実際にそれっぽく使ってみる

In [157]:
from sklearn.model_selection import train_test_split

In [158]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [159]:
X_train.shape, y_train.shape

((112, 4), (112,))

In [160]:
X_test.shape, y_test.shape

((38, 4), (38,))

In [161]:
tree = MyTree()

In [162]:
tree.fit(X_train, y_train)

=== node 0 (depth 1): arg_div -> 2, x_div -> 1.9 ===
=== node 2 (depth 2): arg_div -> 2, x_div -> 4.8 ===
=== node 3 (depth 3): arg_div -> 3, x_div -> 1.6 ===
=== node 6 (depth 3): arg_div -> 3, x_div -> 1.7 ===


In [163]:
y_pred = tree.predict(X_test)

In [164]:
y_pred

array([2, 0, 0, 0, 0, 2, 0, 1, 2, 0, 1, 0, 0, 0, 1, 0, 1, 2, 0, 2, 1, 2,
       2, 1, 1, 0, 0, 2, 0, 2, 1, 2, 1, 1, 1, 1, 2, 2])

In [165]:
y_test

array([1, 0, 0, 0, 0, 2, 0, 1, 2, 0, 1, 0, 0, 0, 1, 0, 1, 2, 0, 2, 1, 2,
       2, 1, 1, 0, 0, 1, 0, 2, 1, 2, 1, 1, 1, 1, 2, 2])

In [166]:
(y_pred == y_test).sum() / len(y_test)

0.9473684210526315