In [23]:
##   注意dataframe 类型数据iloc很慢，可以用apply函数尝试，或者替换成ndarray格式
class DecisionTreeClassifier():
    def __init__(self,max_depth: int = None,min_samples_split:int = 5,
         min_samples_leaf: int = 5,min_impurity_decrease: float =0.0):
        '''
        min_samples_split:  内部节点再划分所需最小样本数
        min_samples_leaf:   叶子节点最少样本数 这个值限制了叶子节点最少的样本数，如果某叶子节点数目小于样本数，则会和兄弟节点一起被剪枝
        分裂需要满足的最小增益
        max_depth: 最大深度
        min_impurity_decrease:分裂需要满足的最小增益
        '''
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_impurity_decrease = min_impurity_decrease
        self.nodes = 0  
        self.tree = None
        self.type_feature = None
        self.index = None
    def __Gini(self,X,y):
        '''
        :param data: 
        :param X: 特征数据
        :param y: 目标数据
        :return: Gini: 返回该数据每个特征的Gini系数
        '''
        ##  根据第一个公式
        K = np.unique(y)
        Gini = 1 - np.sum([(len(X[y == k]) / len(X))**2 for k in K])
        
        return Gini
    
    def __typeFeature(self,X):
        # 表示特征是否为连续还是离散
        n_sample,n_feature = X.shape
        self.type_feature = []
        ####   特征属性小于10个，认为是离散型数据用0表示，连续性数据用1 表示
        for f_idx in range(n_feature):
            if len(np.unique(X[:, f_idx]))< 10:
                self.type_feature.append(0)
            else:
                self.type_feature.append(1)
        return self.type_feature
                

    def __binSplitData(self,X,y,index,f_idx,f_val):
        ### att 数有数据在第f_idx的特征的所有属性,将不等于 f_val 分为一类，其余分为另一类
        ####################    0: 离散类型特征二分方法 1:连续数据   ############################
        att=X[:, f_idx]
        
        if self.type_feature[f_idx]== 0:
            X_left = X[att == f_val]
            X_right = X[att != f_val]
            y_left = y[att == f_val]
            y_right = y[att != f_val]
            index_left = index[att == f_val]
            index_right = index[att != f_val]
        else:
            X_left = X[att <= f_val]
            X_right = X[att >f_val]
            y_left = y[att <= f_val]
            y_right = y[att > f_val]
            index_left = index[att <= f_val]
            index_right = index[att > f_val]
           ## 切分点和样本点的索引
        return X_left, X_right, y_left, y_right,index_left,index_right
    
    
    def __bestSplit(self,X,y,index):
        '''
           
        找到最佳分割特征与特征值
        :param X
        :return: best_f_idx  最佳分割特征 ， best_f_val 特征值
         
        '''
        best_gain = 0
        n_sample,n_feature = X.shape
        best_f_idx = None
        best_f_val = None
        ## 第一个终止条件： 当叶子节点中的样本数小于最小分割值，不再分割
        Gini_before= self.__Gini(X,y)
        if n_sample < self.min_samples_split:
            return best_f_idx,best_f_val       
        ##-------------------------通过不断二分的过程 寻找对于某个特征，的最佳分割点---------------------------
        for f_idx in range(n_feature):
        ##-------------------------如果该特征中的属性个数小于10，则认为是离散数据 type_feature = 0，否则else---------------------------

            if self.type_feature[f_idx] == 0:
                for f_val in np.unique(X[:, f_idx]):
                    ## 当某个特征只有两个类别时，仅仅做一次左右子树的划分，不用重复操作
                    if len(np.unique(X[:, f_idx]))== 2 and f_val == np.unique(X[:, f_idx])[0]:
                        continue

                    else:
                        X_left, X_right, y_left, y_right,index_left,index_right = self.__binSplitData(X,y,index,f_idx,f_val)
                        

                    ## 第二个终止条件： 分割后样本数据小于节点的最低样本数，则放弃分割   
                        if len(index_left)<self.min_samples_leaf or len(index_right)<self.min_samples_leaf:
                            continue
                        Gini_after = len(X_left)/len(X) * self.__Gini(X_left,y_left) + len(X_right)/len(X) * self.__Gini(X_right,y_right)
                    ## 第三个终止条件，当分裂后的增益小于阈值后者大于目前最大增益
                        gain = Gini_before - Gini_after

                    ## 第三个终止条件，当分裂后的增益小于阈值后者大于目前最大增益
                        if gain < self.min_impurity_decrease or gain < best_gain: 
                            continue
                        else:
                            ## 更新最大增益和最佳分裂位置
                            best_gain = gain
                            best_f_idx,best_f_val = f_idx,f_val
        ##-------------------------     连续特征属性的二分 case = 1   ---------------------------
            else:
                for f_val in np.linspace(X[:, f_idx].min()+1,X[:, f_idx].max()-1,num=50):
                        X_left, X_right, y_left, y_right,index_left,index_right = self.__binSplitData(X,y,index,f_idx,f_val)

                    ## 第二个终止条件： 分割后样本数据小于节点的最低样本数，则放弃分割   
                        if len(index_left)<self.min_samples_leaf or len(index_right)<self.min_samples_leaf:
                            continue
                        Gini_after = len(X_left)/len(X) * self.__Gini(X_left,y_left) + len(X_right)/len(X) * self.__Gini(X_right,y_right)
                    ## 第三个终止条件，当分裂后的增益小于阈值后者大于目前最大增益
                        gain = Gini_before - Gini_after

                    ## 第三个终止条件，当分裂后的增益小于阈值后者大于目前最大增益
                        if gain < self.min_impurity_decrease or gain < best_gain: 
                            continue
                        else:
                            ## 更新最大增益和最佳分裂位置
                            best_gain = gain
                            best_f_idx,best_f_val = f_idx,f_val
        return best_f_idx,best_f_val

    def __CART(self,X,y,index,probability):
        '''
        生成CART树
        :param X： 特征数据
        :param y: 目标数据
        :return; CART 树
        '''
        best_f_idx, best_f_val = self.__bestSplit(X,y,index)
        self.nodes += 1
        
       
        # best_f_idx 为空表示不能接续划分，则该点为叶子结点  best_f_val
        if best_f_idx is None:
            return index
        # 节点数超过最大深度的限制，也要返回叶节点，叶节点的值为当前数据中的目标值众数
        if self.max_depth:
            if self.nodes >= 2**self.max_depth:
                return index
        tree = dict()
        tree['cut_f'] = best_f_idx
        tree['cut_val'] = best_f_val
        X_left, X_right, y_left, y_right,index_left,index_right = self.__binSplitData(X,y,index,best_f_idx,best_f_val)
        tree['left_value'] = np.sum(y_left)/ np.sum(probability[index_left] * (1- probability[index_left]))
        tree['right_value'] = np.sum(y_right)/ np.sum(probability[index_right] * (1- probability[index_right]))
        tree['left'] = self.__CART(X_left,y_left,index_left,probability)
        tree['right'] = self.__CART(X_right,y_right,index_right,probability)
        return tree       
   
    
    def fit(self,X,y,probability):
        '''
        拟合模型，数据应该是 ndarray or series类型，dataframe通过 df.values转变成ndarray，不会报错
        :param X: 特征数据
        :param: y: 目标数据
        :param: sample_weight
        :return: None
        '''
        # 标记每个特征是离散还是连续，从而采用不同的二分方法
        self.index = np.array(range(len(X)))
        self.type_feature = self.__typeFeature(X) 
        self.tree = self.__CART(X,y,self.index,probability)
        
        return self.tree
    def predict(self,X_test):
        '''
        数据类别预测
        :param X_test:预测数据
        :return: y_: 类别预测结果
        '''

        return np.array([self.__predict_one(x_test, self.tree) for x_test in X_test])
    
    def __predict_one(self,x_test,tree,label = None):
        if isinstance(tree, dict):  # 非叶节点才做左右判断
           
            cut_f_idx, cut_val = tree['cut_f'], tree['cut_val']
            if self.type_feature[cut_f_idx] == 0:
                sub_tree = tree['left'] if x_test[cut_f_idx] == cut_val else tree['right']
                label = tree['left_value'] if x_test[cut_f_idx] == cut_val else tree['right_value']
            else:
                sub_tree = tree['left'] if x_test[cut_f_idx] <= cut_val else tree['right']
                label = tree['left_value'] if x_test[cut_f_idx] <=  cut_val else tree['right_value']
            return self.__predict_one(x_test, sub_tree,label)
        else:
            return label

In [24]:
class GBDTClassifier():
    def __init__(self,estimators: int = 10, classifier = DecisionTreeClassifier,step: float = 0.1):
        self.estimators = estimators
        self.weakLearner = classifier
        self.step = step
        self.trees = []
        self.F_init = None
        
    def pseudoResiduals(self,y,probability):
        rm = y - probability
        return rm
    
    def TerminalRegions(self,tree):
        ###  找到每一个叶子节点内的数据，或者说找到叶子节点包含的区域
        global Rm
        for key, val in tree.items():
            if key == 'left' or key =='right':
                if isinstance(tree[key],dict):
                    self.TerminalRegions(tree[key])
                else:
                    Rm.append(val)
        return Rm
    def findRegions(self,x,Rm):
        for i in rangr(len(Rm)):
            (x == Rm[s[1]]).sum()

    def fit(self,X,y):
        yes = (y == 1).sum()
        no = (y == 0).sum()
        self.F_init = np.log(yes/no)
        ## step1 通过寻找损失函数最小是对应的来设置初值
        F_before = np.array([np.log(yes/no)] * len(X)) 
        for m in range(self.estimators):
            ##(a) 计算损失函数的负梯度值（也叫伪残差）
            probability = np.exp(F_before)/(1 + np.exp(F_before))
            rm = self.pseudoResiduals(y,probability)
            ## (b) 建立学习器拟合以上残差，
            tree_clf = self.weakLearner(max_depth = 4)
            tree = tree_clf.fit(X, rm,probability)
            self.trees.append(tree_clf)
            ## 并建立每个叶子节点最终区域
            global Rm
            Rm = []
            Rm = self.TerminalRegions(tree)
            Jm = len(Rm)
            gamma_m = np.zeros(len(X)) 
            ## （c）通过最小化每个节点中数据的损失函数和
            for j in range(Jm):
                gamma_m[Rm[j]] += np.sum(rm[Rm[j]])/ np.sum((probability[Rm[j]] * (1-probability[Rm[j]])))
            ##  (d)更新
            Fm = F_before + self.step *  gamma_m
            F_before = Fm
            
    def predict(self,x_test):
        M = self.estimators
        y_ = np.array([self.F_init] * len(x_test)) 
        for m in range(M):
            a=  self.trees[m].predict(X_test)
           # print('a.shape -----------------------------',a.shape,y_.shape)
            y_ += self.step * self.trees[m].predict(X_test)
        probability = np.exp(y_)/(1+np.exp(y_))
        return np.int64(probability > 0.5)

In [30]:
if __name__ == '__main__':
    from sklearn import datasets
    import pandas as pd
    import  numpy as np    
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    data = datasets.load_breast_cancer()
    data.target[data.target > 0] = 1
    data.target[data.target == 0] = 0
    scaler = StandardScaler()
    X = scaler.fit_transform(data.data)
    Y = data.target
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
    tree_clf = GBDTClassifier(estimators = 10)
    tree_clf.fit(X_train, Y_train)
    Y_pred = tree_clf.predict(X_test)
    print('acc:{}'.format(np.sum(Y_pred == Y_test) / len(Y_test)))
    del tree_clf
    from sklearn.ensemble import GradientBoostingClassifier

    tree_clf = GradientBoostingClassifier()

    tree_clf.fit(X_train, Y_train)

    Y_pred = tree_clf.predict(X_test)
    print('sklearn acc:{}'.format(np.sum(Y_pred == Y_test) / len(Y_test)))


acc:0.9122807017543859
sklearn acc:0.956140350877193
