# 分享最近 基于梯度提升决策树的实现

## 导入数据

In [13]:
import abc
import pandas as pd
import math
import numpy as np
import sys

from sklearn.datasets import load_diabetes

diabetes = load_diabetes()
x = pd.DataFrame(diabetes.data, columns=['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6'])
y = pd.DataFrame(diabetes.target,
                 columns=["target"])
datas = pd.concat([x, y], axis=1)
datas = datas.dropna()

数据格式 target为label 其余为fetaure

In [14]:
datas

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068330,-0.092204,75.0
2,0.085299,0.050680,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.025930,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135.0
...,...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207,178.0
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018118,0.044485,104.0
439,0.041708,0.050680,-0.015906,0.017282,-0.037344,-0.013840,-0.024993,-0.011080,-0.046879,0.015491,132.0
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044528,-0.025930,220.0


In [15]:
# ============================================================================================================================================================
# from sklearn.datasets import load_iris
#
# iris = load_iris()
# x = pd.DataFrame(iris.data, columns=iris.feature_names)
# y = pd.DataFrame(iris.target, columns=["target"])
# datas = pd.concat([x, y], axis=1)
# datas = datas.iloc[:100, :]
# datas = datas.dropna()


# ============================================================================================================================================================


# data=data.set_index('id')

回归损失函数的定义 此处用MSE

In [16]:
class SqureLoss:
    def __init__(self):
        pass

    #     均值
    def f0(self, data, lable):
        data['f_0'] = data[lable].mean()

    def calculate_residual(self, data, m, lable):
        f_pre = 'f_' + str(m - 1)
        r_ = 'r_' + str(m)
        data[r_] = data[lable] - data[f_pre]

    def update_f_m(self, data, m, tree, learning_rate):
        f_pre = 'f_' + str(m - 1)
        f_ = 'f_' + str(m)
        r_ = 'r_' + str(m)
        data[f_] = data[f_pre]
        for leaf in tree.leafNodes:
            data.loc[leaf.data.index, f_] = data.loc[leaf.data.index, r_] * learning_rate + data.loc[
                leaf.data.index, f_pre]

用于分类的LOSS

In [17]:
class BinomialDevianceLoss:
    def __init__(self):
        pass

    #     log(p) 两个类所占的比例
    def f0(self, data, lable):
        pos = data[lable].sum()
        neg = data.shape[0] - pos
        f_0 = math.log(pos / neg)
        data['f_0'] = f_0
        return f_0

    def calculate_residual(self, data, m, lable):
        data['r_' + str(m)] = data[lable] - 1 / (1 + data['f_' + str(m - 1)].apply(lambda x: math.exp(-x)))

    def update_f_m(self, data, m, tree, learning_rate):
        f_pre = 'f_' + str(m - 1)
        f_ = 'f_' + str(m)
        r_ = 'r_' + str(m)
        data[f_] = data[f_pre]
        for leaf in tree.leafNodes:
            data.loc[leaf.data.index, f_] = data.loc[leaf.data.index][r_] * learning_rate + data.loc[leaf.data.index][
                f_pre]

树和节点的定义

In [18]:
class Node:
    def __init__(self):
        self.left_child = None
        self.right_child = None
        self.isleaf = False
        self.data = None
        self.leaf_value = None
        self.split_feature = None
        self.split_value = None

    def get_predict_value(self, x, data, iter):
        if self.isleaf:
            r_ = 'r_' + str(iter)
            return data.loc[self.data.index, r_].mean()
        if x[self.split_feature] >= self.split_value:
            return self.right_child.get_predict_value(x, data, iter)
        else:
            return self.left_child.get_predict_value(x, data, iter)


class Tree:
    def __init__(self, data, m, feature, lable, max_depth, min_samples_split):
        self.feature = feature
        self.lable = lable
        self.leafNodes = []
        self.max_depth = max_depth
        self.fm_1 = m
        self.min_samples_split = min_samples_split
        self.root_node = self.build_tree(data, 1)
#         print(self.root_node)

    def build_tree(self, data, depth):

        if len(data) <= self.min_samples_split or depth >= self.max_depth:
            node = Node()
            node.isleaf = True
            node.data = data
            # node.leaf_value = data[self.lable] -
            self.leafNodes.append(node)
            return node
        split_feature = ''
        split_value = 0
        min_se = sys.maxsize
        se = []
        for feature_t in self.feature:
            feature_value = data[feature_t].unique()

            for value in feature_value:

                left = data[self.lable][data[feature_t] < value].values
                right = data[self.lable][data[feature_t] >= value].values

                se_left = self.calculate_se(left)
                se_right = self.calculate_se(right)
                se_sum = se_left + se_right
                se.append(se_sum)
                if se_sum < min_se:
                    min_se = se_sum
                    split_feature = feature_t
                    split_value = value

        left_leaf = data[data[split_feature] < split_value]
        right_leaf = data[data[split_feature] >= split_value]
        node = Node()
        node.left_child = self.build_tree(data=left_leaf, depth=depth + 1)
        node.split_feature = split_feature
        node.split_value = split_value
        node.right_child = self.build_tree(data=right_leaf, depth=depth + 1)

        return node

    def calculate_se(self, label):
        if len(label) == 0:
            return 0
        mean = label.mean()
        return sum(np.square((label - mean)))

决策树的定义与实现

In [19]:
class AbstractGBDT(metaclass=abc.ABCMeta):
    def __init__(self):
        pass

    def fit(self, data):
        pass

    def predict(self, data):
        pass


class GBDTBASE(AbstractGBDT):

    def __init__(self, loss, learning_rate, tree_num, min_samples_split, max_depth):
        super().__init__()
        self.tree_num = tree_num
        self.loss = loss
        self.min_samples_split = min_samples_split
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.trees = {}
        self.c_f = pd.DataFrame()
        self.data = None
        self.feature = None
        self.lable = None

    def fit(self, X, Y):
        self.feature = X.columns
        self.lable = 'target'
        self.data = pd.concat([X, Y], axis=1)
        self.loss.f0(data=self.data, lable=self.lable)
        for m in range(1, self.tree_num + 1):
            print('building NO:' + str(m) + ' tree')
            tree = Tree(data=self.data, m=m, feature=self.feature, lable=self.lable, max_depth=self.max_depth,
                        min_samples_split=self.min_samples_split)
            self.loss.calculate_residual(data=self.data, m=m, lable=self.lable)

            self.loss.update_f_m(self.data, m, tree, self.learning_rate)
            self.trees[m] = tree

    def predict(AbstractGBDT):
        pass

In [20]:
class GBDTRegressor(GBDTBASE):
    def __init__(self, learning_rate=0.01, tree_num=10, min_samples_split=2, max_depth=4):
        super().__init__(SqureLoss(), learning_rate, tree_num, min_samples_split, max_depth)

    def predict(self, pre_data):
        pre_data['f_0'] = self.data['f_0'].mean()
        for iter in range(1, len(self.trees) + 1):
            f_prev = 'f_' + str(iter - 1)
            f_m = 'f_' + str(iter)

            pre_data[f_m] = pre_data[f_prev] + self.learning_rate * pre_data.apply(
                lambda x: self.trees[iter].root_node.get_predict_value(x, self.data, iter), axis=1)
        return pre_data[f_m]


class GBDTBinary(GBDTBASE):
    def __init__(self, learning_rate, tree_num, min_samples_split, max_depth, ):
        super().__init__(BinomialDevianceLoss(), learning_rate, tree_num, min_samples_split, max_depth)

    def predict(self, pre_data):
        pre_data['f_0'] = self.data['f_0'].mean()
        for iter in range(1, len(self.trees) + 1):
            f_prev = 'f_' + str(iter - 1)
            f_m = 'f_' + str(iter)
            pre_data[f_m] = pre_data[f_prev] + self.learning_rate * pre_data.apply(
                lambda x: self.trees[iter].root_node.get_predict_value(x, self.data, iter), axis=1)
        pre_data['predict_proba'] = pre_data[f_m].apply(lambda x: 1 / (1 + math.exp(-x)))
        pre_data['predict_label'] = pre_data['predict_proba'].apply(lambda x: 1 if x >= 0.5 else 0)
        return pre_data['predict_label']

训练集与测试集划分

In [21]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(datas[diabetes.feature_names], datas.loc[:, 'target'], test_size=0.4,
                                                    random_state=42)

In [22]:
def create_model(trial):
    learning_rate = trial.suggest_uniform("num_leaves", 0.11, 1)
    tree_num = trial.suggest_int("tree_num", 2, 20)
    max_depth = trial.suggest_int('max_depth', 3, 8)
    min_samples_split = trial.suggest_int('min_samples_split', 1, 50)

    model = GBDTRegressor(learning_rate=learning_rate, tree_num=tree_num, min_samples_split=min_samples_split,
                          max_depth=max_depth)
    return model


def objective(trial):
    model = create_model(trial)
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    score = mean_squared_error(y_pre, y_test)
    print("==" * 40)
    print(f"MSE: = {score}")
    return score

Auto ML 参数调优

In [23]:
import optuna

# 'learning_rate': 0.18787147831167836, 'tree_num': 21, 'max_depth': 5, 'min_samples_split': 13

最小化LOSS为目标

In [24]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=3)

params = study.best_params
print(params)

[32m[I 2020-11-22 15:51:00,789][0m A new study created in memory with name: no-name-bf66aecf-fe9c-41e1-9ba4-4dd331a1ee0a[0m


building NO:1 tree
building NO:2 tree
building NO:3 tree
building NO:4 tree


[32m[I 2020-11-22 15:51:22,583][0m Trial 0 finished with value: 3916.4885370144607 and parameters: {'num_leaves': 0.7497454803616358, 'tree_num': 4, 'max_depth': 6, 'min_samples_split': 7}. Best is trial 0 with value: 3916.4885370144607.[0m


MSE: = 3916.4885370144607
building NO:1 tree
building NO:2 tree
building NO:3 tree
building NO:4 tree
building NO:5 tree
building NO:6 tree
building NO:7 tree
building NO:8 tree
building NO:9 tree
building NO:10 tree
building NO:11 tree
building NO:12 tree
building NO:13 tree
building NO:14 tree
building NO:15 tree
building NO:16 tree
building NO:17 tree


[32m[I 2020-11-22 15:52:25,150][0m Trial 1 finished with value: 3638.50675462979 and parameters: {'num_leaves': 0.4745242147698795, 'tree_num': 17, 'max_depth': 5, 'min_samples_split': 45}. Best is trial 1 with value: 3638.50675462979.[0m


MSE: = 3638.50675462979
building NO:1 tree
building NO:2 tree
building NO:3 tree
building NO:4 tree
building NO:5 tree
building NO:6 tree
building NO:7 tree
building NO:8 tree
building NO:9 tree


[32m[I 2020-11-22 15:52:46,172][0m Trial 2 finished with value: 3611.9057637911073 and parameters: {'num_leaves': 0.923453438702633, 'tree_num': 9, 'max_depth': 3, 'min_samples_split': 13}. Best is trial 2 with value: 3611.9057637911073.[0m


MSE: = 3611.9057637911073
{'num_leaves': 0.923453438702633, 'tree_num': 9, 'max_depth': 3, 'min_samples_split': 13}
