In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from sklearn import datasets

In [3]:
import numpy as np

In [4]:
from tree_modules.tree_base import gini, node_internal, node_leaf

In [5]:
iris = datasets.load_iris()

In [6]:
X = iris.data
y = iris.target

## 森にする
- 使うデータをresampling with replacement
- 各ノードで使う特徴量の数をランダムで選べるようにする
- 複数の木を作って，多数決をとる（クラス確率は各木の投票の平均）
    - 使うデータは，木ごとに変える。

### データをresampling with replacement(復元抽出)
- Bootstrap

In [7]:
X.shape

(150, 4)

In [8]:
len(y)

150

In [9]:
index_resample = np.random.choice(len(y), len(y), replace=True)

In [10]:
X_resample, y_resample = X[index_resample], y[index_resample]

### 特徴量をランダムに選出

In [11]:
num_features = X.shape[1]
max_features = int(np.sqrt(num_features))

In [12]:
index_feat_chosen = np.random.choice(num_features, max_features, replace=False)

In [13]:
X_chosen = X_resample[:, index_feat_chosen]

### ランダムフォレストに対応させるためにMyTreeクラスを改変
- _go_on_dividingメソッドに特徴量ランダム選出を追加
- パラメータを２つ追加
    - splitter : string, 'best' or 'random' (default = 'best')
    - max_features : int, float, or None (default = None)
        - int : the number of features to use (requires 0 < max_features <= n_features)
        - float : the fraction of features to use (requires 0 < max_features <= 1.0)
        - None : the number of features is set to int(sqrt(n_features)) (CAUTION: this differs from sklearn, where None means "use all features")

In [14]:
class MyTree():
    """Classification tree grown by minimising weighted Gini impurity.

    Optionally evaluates only a random subset of features at every node so
    that the tree can be used as a member of a random forest.

    Parameters
    ----------
    threshold_gini : float
        A child node is split further only while its Gini impurity is
        strictly greater than this value.
    min_node_size : int
        A child node is split further only while it holds strictly more
        samples than this value.
    max_depth : int
        A child node is split further only if its depth (root = 1) would not
        exceed this value. The root itself is always split when a valid
        split exists.
    splitter : {'best', 'random'}
        'best' evaluates every feature at every node; 'random' evaluates a
        subset drawn without replacement at every node.
    max_features : int, float or None
        Only used with ``splitter='random'``.
        int   -> that many features (0 < max_features <= n_features);
        float -> that fraction of the features (0 < max_features <= 1.0),
                 rounded down but never below one feature;
        None  -> int(sqrt(n_features)).
    verbose : bool
        Print one line per internal node while fitting.

    Notes
    -----
    Labels are counted with ``np.bincount`` and must therefore be
    non-negative integers. Random feature subsets are drawn from NumPy's
    global random state. ``gini``, ``node_internal`` and ``node_leaf`` come
    from ``tree_modules.tree_base``; this class relies on the node objects
    exposing ``i_feature``, ``threshold``, ``node_child`` / ``set_node_child``
    and ``k_decided``.
    """

    def __init__(self, threshold_gini=0.05, min_node_size=5, max_depth=3,
                 splitter='best', max_features=None,
                 verbose=False):

        self.threshold_gini = threshold_gini
        self.min_node_size = min_node_size
        self.max_depth = max_depth

        # Populated by fit(): running node counter and {node index: node}.
        self.i_node = None
        self.dict_nodes = None

        self.splitter = splitter  # for RF
        self.max_features = max_features  # for RF

        self.verbose = verbose

    def _find_optimal_division(self, x, y):
        """Find the best threshold for a single feature column.

        Samples with ``x > threshold`` go right, the others go left.

        Returns
        -------
        (threshold, weighted_gini)
            ``weighted_gini`` is ``np.inf`` when the column is constant,
            i.e. when no threshold separates the samples.
        """
        x_unique = np.unique(x)

        # The largest value is not a split: `x > max(x)` sends nothing right.
        candidates = x_unique[:-1]
        if candidates.size == 0:
            return x_unique[0], np.inf

        list_gini = []
        for threshold in candidates:
            mask_divide = x > threshold
            y_right = y[mask_divide]
            y_left = y[~mask_divide]

            # Impurity of both sides, weighted by their share of the samples.
            gini_divide = (gini(y_right) * len(y_right)
                           + gini(y_left) * len(y_left)) / len(y)
            list_gini.append(gini_divide)

        array_gini = np.array(list_gini)
        i_div_opt = np.argmin(array_gini)

        return candidates[i_div_opt], array_gini[i_div_opt]

    def _divide(self, X, y):
        """Return ``(column index, threshold)`` of the best split over all columns of X."""
        # Row 0 holds the best threshold per column, row 1 the matching impurity.
        results = np.apply_along_axis(self._find_optimal_division, 0, X, y)

        arg_div = np.argmin(results[1])
        x_div = results[0, arg_div]

        return arg_div, x_div

    def _resolve_num_features(self, n_features):
        """Translate ``max_features`` into a concrete number of features.

        Raises
        ------
        ValueError
            If ``max_features`` is neither None, an int in (0, n_features],
            nor a float in (0, 1.0].
        """
        max_features = self.max_features

        if max_features is None:
            return int(np.sqrt(n_features))
        if isinstance(max_features, (int, np.integer)) and 0 < max_features <= n_features:
            return int(max_features)
        if isinstance(max_features, (float, np.floating)) and 0 < max_features <= 1.0:
            # Rounding down may give 0, and a split needs at least one feature.
            return max(1, int(n_features * max_features))

        raise ValueError(
            "max_features must be None, an int in (0, n_features] or a float "
            "in (0, 1.0]; got {!r} with n_features={}".format(max_features, n_features))

    def _select_features(self, n_features):
        """Return the indices of the features to evaluate at the current node.

        Raises
        ------
        ValueError
            If ``splitter`` is neither 'best' nor 'random'.
        """
        if self.splitter == 'best':
            return np.arange(n_features)
        if self.splitter == 'random':
            num_feat_chosen = self._resolve_num_features(n_features)
            return np.random.choice(n_features, num_feat_chosen, replace=False)

        raise ValueError(
            "splitter must be 'best' or 'random'; got {!r}".format(self.splitter))

    def _go_on_dividing(self, X, y, depth=0):
        """Recursively grow the subtree for samples ``(X, y)``.

        The node is stored in ``self.dict_nodes`` under the current value of
        ``self.i_node``; ``self.i_node`` is advanced once per child created.
        """
        depth += 1

        index_feat_chosen = self._select_features(X.shape[1])
        X_chosen = X if self.splitter == 'best' else X[:, index_feat_chosen]

        arg_div_tmp, x_div = self._divide(X_chosen, y)
        # Map the position inside the subset back to a column of the full X.
        arg_div = index_feat_chosen[arg_div_tmp]

        mask = X[:, arg_div] > x_div
        if not mask.any():
            # None of the evaluated features separates the samples, so this
            # node becomes a leaf rather than a split with an empty child.
            # Leaves created in the loop below record the depth of the node
            # that produced them; `depth - 1` follows the same convention.
            feature_majority = np.bincount(y).argmax()
            self.dict_nodes[self.i_node] = node_leaf(self.i_node, depth - 1, feature_majority)
            return

        node_current = node_internal(self.i_node, depth, arg_div, x_div)
        self.dict_nodes[self.i_node] = node_current

        if self.verbose:
            print("=== node {} (depth {}): arg_div -> {}, x_div -> {} ===".format(self.i_node, depth, arg_div, x_div))

        X_right, X_left = X[mask], X[~mask]
        y_right, y_left = y[mask], y[~mask]

        # Order matters: position 0 is the left child, position 1 the right
        # one, matching `int(x > threshold)` used at prediction time.
        list_divided = [(X_left, y_left, gini(y_left)),
                        (X_right, y_right, gini(y_right))]

        for lr, (X_i, y_i, gini_i) in enumerate(list_divided):
            self.i_node += 1
            node_current.set_node_child(lr, self.i_node)

            if gini_i > self.threshold_gini and len(y_i) > self.min_node_size and depth + 1 <= self.max_depth:
                self._go_on_dividing(X_i, y_i, depth=depth)
            else:
                feature_majority = np.bincount(y_i).argmax()

                node_terminal = node_leaf(self.i_node, depth, feature_majority)
                self.dict_nodes[self.i_node] = node_terminal

    def fit(self, X, y):
        """Grow the tree on samples ``X`` (n_samples, n_features) and labels ``y``.

        Raises
        ------
        ValueError
            If ``y`` is empty, or if ``splitter`` / ``max_features`` are invalid.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if y.size == 0:
            raise ValueError("cannot fit a tree on an empty data set")

        self.i_node = 0
        self.dict_nodes = {}

        self._go_on_dividing(X, y)

    def _pred_each_vector(self, x):
        """Walk from the root to a leaf for one sample and return the leaf's class."""
        node_current = self.dict_nodes[0]

        # Leaves are recognised by class name, so the check does not depend
        # on the identity of the imported class object.
        while node_current.__class__.__name__ != 'node_leaf':
            lr = int(x[node_current.i_feature] > node_current.threshold)
            node_current = self.dict_nodes[node_current.node_child[lr]]

        return node_current.k_decided

    def predict(self, X):
        """Return the predicted class for every row of ``X``."""
        return np.apply_along_axis(self._pred_each_vector, 1, X)

##### 動作確認

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [17]:
X_train.shape, y_train.shape

((112, 4), (112,))

In [18]:
X_test.shape, y_test.shape

((38, 4), (38,))

In [19]:
tree = MyTree(splitter='random', max_features=None, verbose=True)

In [20]:
tree.fit(X_train, y_train)

=== node 0 (depth 1): arg_div -> 2, x_div -> 1.7 ===
=== node 2 (depth 2): arg_div -> 3, x_div -> 1.6 ===
=== node 3 (depth 3): arg_div -> 2, x_div -> 4.9 ===
=== node 6 (depth 3): arg_div -> 2, x_div -> 4.8 ===


In [21]:
y_pred = tree.predict(X_test)

In [22]:
y_pred

array([1, 1, 2, 0, 2, 1, 2, 1, 2, 1, 0, 1, 1, 2, 2, 2, 1, 1, 0, 1, 2, 2,
       0, 2, 0, 1, 0, 0, 1, 1, 1, 0, 1, 2, 0, 2, 1, 0])

In [23]:
y_test

array([1, 1, 2, 0, 2, 1, 2, 1, 2, 1, 0, 1, 1, 1, 2, 2, 0, 1, 0, 1, 2, 2,
       0, 2, 0, 1, 0, 0, 0, 1, 1, 0, 1, 2, 0, 2, 1, 0])

In [24]:
(y_pred == y_test).sum() / len(y_test)

0.9210526315789473

## 木を集めて森を作る
- Bootstrappingも行う

#### 学習

In [25]:
n_estimators=10

In [26]:
# Grow n_estimators trees, each one on its own bootstrap sample of (X, y).
length_data = len(y)
list_trees = []
for i in range(n_estimators):

    # Bootstrap: draw as many row indices as there are samples, with replacement.
    index_resample = np.random.choice(length_data, size=length_data, replace=True)
    X_resample = X[index_resample]
    y_resample = y[index_resample]

    # max_features=None lets the tree look at int(sqrt(n_features)) features per node.
    a_tree = MyTree(splitter='random', max_features=None)
    a_tree.fit(X_resample, y_resample)
    list_trees += [a_tree]

In [27]:
list_trees

[<__main__.MyTree at 0x1222f64e0>,
 <__main__.MyTree at 0x1222e2dd8>,
 <__main__.MyTree at 0x122301748>,
 <__main__.MyTree at 0x122301b38>,
 <__main__.MyTree at 0x122301f28>,
 <__main__.MyTree at 0x122301f60>,
 <__main__.MyTree at 0x1223018d0>,
 <__main__.MyTree at 0x1222f6080>,
 <__main__.MyTree at 0x122369438>,
 <__main__.MyTree at 0x122369748>]

#### 予測

In [28]:
# One row of predictions per tree; stacked later into (n_trees, n_samples).
list_pred = [a_tree.predict(X) for a_tree in list_trees]
    

In [29]:
array_pred = np.array(list_pred)

In [30]:
array_pred

array([[0, 0, 0, ..., 2, 2, 2],
       [0, 0, 0, ..., 2, 2, 2],
       [0, 0, 0, ..., 2, 2, 2],
       ...,
       [0, 0, 0, ..., 2, 2, 2],
       [0, 0, 0, ..., 2, 2, 2],
       [0, 0, 0, ..., 2, 2, 2]])

In [31]:
array_pred[:,0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [32]:
np.bincount(array_pred[:,0], minlength=3)

array([10,  0,  0])

In [33]:
pole_result = np.apply_along_axis(func1d=np.bincount, axis=0, arr=array_pred, minlength=3)

In [34]:
pole_result

array([[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
        10, 10, 10, 10, 10, 10, 10,  9, 10, 10, 10, 10, 10, 10, 10, 10,
        10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  8, 10, 10, 10, 10,
        10, 10,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  0,  0,  0,  0,
         0,  0, 10, 10,  8, 10, 10, 10, 10,  9, 10, 10, 10, 10, 10, 10,
        10, 10, 10, 10, 10, 10,

In [35]:
pole_result.argmax(axis=0)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [36]:
(pole_result / n_estimators).T

array([[1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [0.9, 0.1, 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. 

## クラスにしてまとめる

In [37]:
class MyForest():
    """Random forest classifier built from bootstrapped ``MyTree`` instances.

    Every tree is fitted on a bootstrap sample (rows drawn with replacement)
    and uses ``splitter='random'``; predictions are combined by majority vote.

    Parameters
    ----------
    n_estimators : int
        Number of trees to grow.
    max_features : int, float or None
        Passed to every ``MyTree``.
    threshold_gini, min_node_size, max_depth
        Stopping criteria, passed to every ``MyTree``.
    verbose : bool
        Print a header per tree and pass the flag on to the trees.

    Attributes
    ----------
    n_classes : int or None
        Number of distinct labels seen in ``fit``.
    list_trees : list or None
        The fitted trees.

    Notes
    -----
    Votes are counted with ``np.bincount``, so labels must be non-negative
    integers. Column ``k`` of ``predict_proba`` belongs to label ``k``.
    Bootstrap indices are drawn from NumPy's global random state.
    """

    def __init__(self, n_estimators=10, max_features=None,
                 threshold_gini=0.05, min_node_size=5, max_depth=3,
                 verbose=False):

        self.n_estimators = n_estimators

        self.threshold_gini = threshold_gini
        self.min_node_size = min_node_size
        self.max_depth = max_depth
        self.verbose = verbose

        self.max_features = max_features  # for RF

        # Set by fit().
        self.n_classes = None
        self.list_trees = None
        # Width of the vote table: largest label + 1. This differs from
        # n_classes when labels are not contiguous (e.g. {0, 2}).
        self._n_vote_bins = None

    def fit(self, X, y):
        """Grow ``n_estimators`` trees, each on its own bootstrap sample of ``(X, y)``.

        Raises
        ------
        ValueError
            If ``y`` is empty.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if y.size == 0:
            raise ValueError("cannot fit a forest on an empty data set")

        self.n_classes = len(np.unique(y))
        self._n_vote_bins = int(y.max()) + 1
        self.list_trees = []
        length_data = len(y)

        for i in range(self.n_estimators):
            if self.verbose:
                print('=== {}th tree ==='.format(i))

            # Bootstrap: as many rows as the original data, with replacement.
            index_resample = np.random.choice(length_data, length_data, replace=True)
            X_resample, y_resample = X[index_resample], y[index_resample]

            a_tree = MyTree(splitter='random', max_features=self.max_features,
                            threshold_gini=self.threshold_gini,
                            min_node_size=self.min_node_size,
                            max_depth=self.max_depth,
                            verbose=self.verbose)

            a_tree.fit(X_resample, y_resample)
            self.list_trees.append(a_tree)

    def _pole(self, X, n_classes, list_trees):
        """Count the votes of ``list_trees`` for every row of ``X``.

        Returns an integer array of shape ``(n_classes, n_samples)`` whose
        entry ``[k, j]`` is the number of trees predicting label ``k`` for
        sample ``j``.

        Raises
        ------
        ValueError
            If there are no trees to ask (the forest has not been fitted).
        """
        if not list_trees:
            raise ValueError("this MyForest has no fitted trees; call fit() first")

        # Shape (n_trees, n_samples): one row of predictions per tree.
        array_pred = np.array([a_tree.predict(X) for a_tree in list_trees])

        # minlength keeps every column the same height even when some label
        # receives no vote for a given sample.
        return np.apply_along_axis(func1d=np.bincount, axis=0,
                                   arr=array_pred, minlength=n_classes)

    def predict(self, X):
        """Return the majority-vote label for every row of ``X``."""
        pole_result = self._pole(X, self._n_vote_bins, self.list_trees)

        return pole_result.argmax(axis=0)

    def predict_proba(self, X):
        """Return the share of trees voting for each label, shape ``(n_samples, n_labels)``."""
        pole_result = self._pole(X, self._n_vote_bins, self.list_trees)

        # Normalise by the trees actually fitted, so that changing
        # n_estimators after fit() cannot skew the result.
        return (pole_result / len(self.list_trees)).T

#### 動作確認

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [39]:
myrf = MyForest(n_estimators=100)

In [40]:
myrf.fit(X_train, y_train)

In [41]:
y_pred = myrf.predict(X_test)

In [42]:
y_pred

array([1, 1, 2, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 2, 1, 1, 1, 1, 1, 0, 1,
       2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 1])

In [43]:
y_test

array([1, 1, 2, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 2, 1, 1, 1, 1, 1, 0, 1,
       2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 1])

In [44]:
(y_pred == y_test).sum() / len(y_test)

0.9473684210526315

In [45]:
myrf.predict_proba(X_test)

array([[0.  , 1.  , 0.  ],
       [0.  , 1.  , 0.  ],
       [0.  , 0.01, 0.99],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [0.  , 0.  , 1.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [0.99, 0.01, 0.  ],
       [1.  , 0.  , 0.  ],
       [0.  , 0.62, 0.38],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [0.  , 0.01, 0.99],
       [0.  , 1.  , 0.  ],
       [0.  , 1.  , 0.  ],
       [0.02, 0.98, 0.  ],
       [0.  , 0.93, 0.07],
       [0.  , 0.99, 0.01],
       [1.  , 0.  , 0.  ],
       [0.  , 0.97, 0.03],
       [0.  , 0.08, 0.92],
       [0.  , 0.01, 0.99],
       [0.  , 0.  , 1.  ],
       [0.02, 0.81, 0.17],
       [0.  , 0.86, 0.14],
       [0.  , 0.02, 0.98],
       [0.  , 0.02, 0.98],
       [0.  , 0.02, 0.98],
       [0.  , 0.01, 0.99],
       [0.  , 0.01, 0.99],
       [0.  , 0.02, 0.98],
       [0.94, 0.06, 0.  ],
       [0.  , 0.02, 0.98],
       [0.  , 0.01, 0.99],
       [0.  , 0.  , 1.  ],
 

In [46]:
myrf.n_classes

3