本notebook用于说明如何在树算法中实现一些额外的参数。

In [1]:
import os
# Append the parent of the current working directory to the module search
# path (os.sys is the sys module as exposed through os). Presumably this lets
# the notebook import the project's own packages -- confirm against the
# repository layout.
os.sys.path.append(os.path.dirname(os.path.abspath('.')))

## max_depth
```max_depth```参数是控制树生成的最重要的参数之一。在递归生成树时，暂时没有想到可以直接获取当前树深度的好办法，不过可以通过增加一个叶子节点数的全局变量来间接获取当前树的深度，原理在于深度为$d$的树，它的最小叶子节点数与最大叶子节点数都可以通过公式算出来。以下代码全部引自之前的notebook，不再赘述。

In [1]:
# import numpy as np
# from datasets.dataset import load_breast_cancer
# data=load_breast_cancer()
# X,Y=data.data,data.target
# del data

# from model_selection.train_test_split import train_test_split
# X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2)

# training_data=np.c_[X_train,Y_train]
# testing_data=np.c_[X_test,Y_test]

# def Gini(data, y_idx=-1):
#     K = np.unique(data[:, y_idx])
#     n_sample = len(data)
#     gini_idx = 1 - \
#         np.sum([np.square(len(data[data[:, y_idx] == k])/n_sample) for k in K])

#     return gini_idx

# def BinSplitData(data,f_idx,f_val):
#     data_left=data[data[:,f_idx]<=f_val]
#     data_right=data[data[:,f_idx]>f_val]
#     return data_left,data_right

# from scipy import stats

# def Test(data, criteria='gini', min_samples_split=5, min_samples_leaf=5, min_impurity_decrease=0.0):
#     n_sample, n_feature = data.shape

#     if n_sample < min_samples_split or len(np.unique(data[:,-1]))==1:
#         return None, stats.mode(data[:, -1])[0][0]

#     Gini_before = Gini(data)
#     best_gain = 0
#     best_f_idx = None
#     best_f_val = stats.mode(data[:, -1])[0][0]

#     for f_idx in range(n_feature-1):
#         for f_val in np.unique(data[:, f_idx]):
#             data_left, data_right = BinSplitData(data, f_idx, f_val)

#             if len(data_left) < min_samples_leaf or len(data_right) < min_samples_leaf:
#                 continue

#             Gini_after = len(data_left)/n_sample*Gini(data_left) + \
#                 len(data_right)/n_sample*Gini(data_right)
#             gain = Gini_before-Gini_after 

#             if gain < min_impurity_decrease or gain < best_gain:
#                 continue
#             else:
#                 best_gain = gain
#                 best_f_idx, best_f_val = f_idx, f_val

#     return best_f_idx, best_f_val

在递归生成树的函数之前增加一个全局变量：```nodes```，用于实时监控CART树的节点数（每调用一次```CART```就加1，因此统计的是包括内部节点与叶节点在内的全部节点）。

In [19]:
# nodes=0
# max_depth=1

# def CART(data,criteria='gini',min_samples_split=5,min_samples_leaf=5,min_impurity_decrease=0.0):
#     best_f_idx,best_f_val=Test(data,criteria,min_samples_split,min_samples_leaf,min_impurity_decrease)
    
#     tree={}
#     tree['cut_f']=best_f_idx
#     tree['cut_val']=best_f_val
    
#     nonlocal nodes
#     nodes+=1
    
#     if best_f_idx==None:
#         return best_f_val
    
#     # 节点数超过最大深度的限制，也要返回叶节点，叶节点的值为当前数据中的目标值众数
#     if nodes>=2**max_depth:
#         return stats.mode(data[:, -1])[0][0]
    
#     data_left,data_right=BinSplitData(data,best_f_idx,best_f_val)
#     tree['left']=CART(data_left,criteria,min_samples_split,min_samples_leaf,min_impurity_decrease)
#     tree['right']=CART(data_right,criteria,min_samples_split,min_samples_leaf,min_impurity_decrease)
    
#     return tree

# tree=CART(training_data)

# # CART树存储为字典形式，将其字符串化后，每一个'left'代表左叉树枝，每一个'right'代表右叉树枝
# # 节点数之和等于树枝数+1
# tree_str=str(tree)
# edge=tree_str.count('left')+tree_str.count('right')
# assert edge+1==nodes

# print(tree,nodes)

{'cut_f': 23, 'cut_val': 880.8, 'left': 1.0, 'right': 0.0} 3


## sample_weight
该参数用于控制样本在树的分裂时所占的权重，实质就是不同样本对于纯净度的贡献。因为sklearn中该参数是位于```fit```方法中，所以实现该参数需要结合```.py```工程文件来分析。

```DecisionTreeClassifier.py```中的```fit```方法只有两步：
1. 将```X_train```与```Y_train```拼接起来便于共同操作
2. 递归调用```CART```方法

而```CART```方法中又调用了```Test```方法与```BinSplitData```方法，所以需要修改的部分就是这四个方法。

思路：
- 在```fit```方法中接受一个```sample_weight```参数，同样与```X_train```以及```Y_train```拼到一起
- Gini指数的计算方式需要改变，因为样本权重改变了$p_{k}$值，同时还要更改做test时加权Gini指数的计算方式
- 做test时注意不要扫描```weight```列与```Y```列

下面依次展示各方法中更改的代码部分：

In [20]:
def fit(self, X_train, Y_train, sample_weight=None):
    """Assemble the training matrix, including a per-sample weight column.

    Only the part of ``fit`` that changes for ``sample_weight`` is shown; the
    ``# ...`` markers stand for code omitted from this excerpt.

    Args:
        X_train: Training features; ``len(X_train)`` is used as the number
            of samples.
        Y_train: Training targets, stacked as the last column.
        sample_weight: Optional per-sample weights. When ``None``, every
            sample receives the uniform weight ``1 / len(X_train)``.
    """
    # ...
    # Compare against None explicitly: the truth value of a multi-element
    # NumPy array is ambiguous and raises ValueError.
    if sample_weight is None:
        sample_weight = np.full(len(X_train), 1 / len(X_train))
    # Weights become the second-to-last column, targets the last column.
    data = np.c_[X_train, sample_weight, Y_train]
    # ...

In [None]:
def __Gini(self, data, y_idx=-1):
    """Excerpt of the Gini computation adapted to weighted samples.

    Each class share is the summed weight (second-to-last column of ``data``)
    of the rows whose label in column ``y_idx`` equals that class, divided by
    the summed weight of all rows. ``K`` (the class labels) is defined in the
    code omitted by the ``# ...`` markers.
    """
    # ...
    total_weight = np.sum(data[:, -2])
    class_shares = [
        np.sum(data[data[:, y_idx] == k][:, -2]) / total_weight
        for k in K
    ]
    # Gini index = 1 - sum of squared weighted class shares.
    gini_idx = 1 - np.sum([np.square(share) for share in class_shares])
    # ...

In [21]:
def __Test(self, data):
    """Excerpt of the split search showing the ``sample_weight`` changes.

    The ``# ...`` markers stand for code omitted from this excerpt;
    ``data_left`` and ``data_right`` are produced by that omitted code.
    """
    # ...
    n_sample, n_feature = data.shape
    # The last two columns hold the sample weights and the targets, so they
    # must not be scanned as candidate split features.
    # NOTE(review): assumes the omitted feature loop iterates
    # range(n_feature) -- confirm it does not subtract 1 again.
    n_feature -= 2
    # ...
    # Weight each child's Gini index by its share of the total sample weight
    # instead of its share of the row count.
    total_weight = np.sum(data[:, -2])
    Gini_after = np.sum(data_left[:, -2]) / total_weight * self.__Gini(data_left) + \
                 np.sum(data_right[:, -2]) / total_weight * self.__Gini(data_right)
    # ...

array([0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1])