In [7]:
import numpy as np
from DecisionTree import CARTRegression
import copy

class GBDT:
    """Gradient-boosted regression ensemble for squared-error loss.

    Every boosting round fits a fresh deep copy of the base model to the
    current residuals and then subtracts ``learning_rate`` times that model's
    training predictions from the residuals. The trailing part of the data
    passed to ``fit`` is held out as a validation set, and boosting stops
    early once the validation error has failed to improve for
    ``n_iter_no_change`` consecutive rounds.

    The ensemble prediction is the sum of ``learning_rate * predict(X)`` over
    all fitted estimators (there is no separate initial constant term).
    """

    def __init__(self, model=None, n_estimators=3, learning_rate=1,
                 n_iter_no_change=5, val_size=0.25):
        """Configure the ensemble.

        Args:
            model: Prototype base learner exposing ``fit(X, y)`` and
                ``predict(X)``; it is deep-copied once per boosting round.
                When ``None``, a ``CARTRegression()`` is created.
            n_estimators: Maximum number of boosting rounds.
            learning_rate: Factor applied to every estimator's prediction.
            n_iter_no_change: Number of consecutive rounds without validation
                improvement after which training stops.
            val_size: Fraction of the rows given to ``fit`` that is held out
                for validation.
        """
        self._n_estimators = n_estimators
        # Build the default lazily: a default written as ``CARTRegression()``
        # in the signature would be instantiated once, at definition time.
        self._base_model = CARTRegression() if model is None else model
        self._estimators = []
        self._learning_rate = learning_rate
        self._n_iter_no_change = n_iter_no_change
        self._val_size = val_size

    def _train_val_split(self, X, y, val_size=0.25):
        """Split the training data sequentially into train and validation parts.

        The first ``int(n_sample * (1 - val_size))`` rows become the training
        part and the remaining rows the validation part; rows are not
        shuffled.

        Args:
            X: 2-D feature array of shape ``(n_sample, n_features)``.
            y: Targets, one per row of ``X``.
            val_size: Fraction of rows reserved for validation.

        Returns:
            Tuple ``(X_train, X_val, y_train, y_val)``.
        """
        # np.c_ already treats a 1-D ``y`` as one extra column, so no explicit
        # reshape is needed. The stacked array is new memory, so the slices
        # returned below never alias the caller's X or y.
        data = np.c_[X, y]

        n_sample, n_features = np.shape(X)
        n_train = int(n_sample * (1 - val_size))

        train_sample = data[0:n_train]
        val_sample = data[n_train:n_sample]

        X_train = train_sample[:, 0:n_features]
        y_train = train_sample[:, n_features]
        X_val = val_sample[:, 0:n_features]
        y_val = val_sample[:, n_features]

        return X_train, X_val, y_train, y_val

    def fit(self, X, y):
        """Fit the ensemble on ``(X, y)`` with validation-based early stopping.

        Any estimators left over from a previous call are discarded first.
        After training, the square root of the best validation error seen is
        printed.

        Args:
            X: 2-D feature array of shape ``(n_sample, n_features)``.
            y: Targets, one per row of ``X``.
        """
        # Hold out the tail of the training data as a validation set.
        X_train, X_val, y_train, y_val = self._train_val_split(
            X, y, val_size=self._val_size)

        # Start from an empty ensemble so that refitting does not stack new
        # estimators on top of the ones from an earlier fit() call.
        self._estimators = []

        # Residuals are kept in a float array of their own: integer targets
        # cannot absorb fractional updates.
        residual = np.array(y_train, dtype=float)

        min_val_error = float("inf")
        error_going_up = 0
        has_validation = len(y_val) > 0

        for _ in range(self._n_estimators):
            model = copy.deepcopy(self._base_model)
            model.fit(X_train, residual)
            self._estimators.append(model)
            # Rebind instead of updating in place, so a model that kept a
            # reference to its training target is not modified afterwards.
            residual = residual - self._learning_rate * np.asarray(
                model.predict(X_train), dtype=float)

            if not has_validation:
                # Nothing to validate against: run all requested rounds.
                continue

            # Mean squared error of the current ensemble on the validation set.
            y_pred = self.predict(X_val)
            val_error = np.average((y_val - y_pred) ** 2, axis=0)
            if val_error < min_val_error:
                min_val_error = val_error
                error_going_up = 0
            else:
                error_going_up += 1
                # Stop once the validation error has not improved for the
                # configured number of consecutive rounds.
                if error_going_up >= self._n_iter_no_change:
                    break
        print("验证集：", np.sqrt(min_val_error), end=",")

    def predict(self, X_test):
        """Return the ensemble prediction for every row of ``X_test``.

        The result is the sum over all fitted estimators of
        ``learning_rate * estimator.predict(X_test)``; it is all zeros when no
        estimator has been fitted.
        """
        y_predict = np.zeros(np.shape(X_test)[0])
        for m in self._estimators:
            y_predict += self._learning_rate * np.asarray(
                m.predict(X_test), dtype=float)
        return y_predict


In [10]:
import numpy as np
from DecisionTree import  CARTRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.datasets import  load_boston
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import  DecisionTreeRegressor

# Load the dataset returned by sklearn.datasets.load_boston and pull out the
# feature matrix and the target vector.
# NOTE(review): load_boston is, to my knowledge, no longer available in recent
# scikit-learn releases — confirm the version this notebook is pinned to.
samples = load_boston()
X = samples.data
y = samples.target
# Repeat the comparison on 10 train/test splits (25% of the rows held out for
# testing). No random_state is passed, so the splits are not seeded here.
for _ in range(10):
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25)

    # Baseline: a single CARTRegression built with min_sample=1, max_depth=10.
    myCARTR = CARTRegression(min_sample=1,max_depth=10)
    myCARTR.fit(X_train,y_train)

    # prune() is called on the fitted tree before predicting.
    myCARTR.prune()
    y_new = myCARTR.predict(X_test)
    # Printed label means "single tree"; the value is the test RMSE.
    print("一棵树：",np.sqrt(mean_squared_error(y_new,y_test)),end=",")


    # Hand-written GBDT: up to 100 CARTRegression(min_sample=2, max_depth=2)
    # base learners with learning rate 0.2. GBDT.fit itself prints the
    # validation RMSE on the same output line.
    gbrt = GBDT(model=CARTRegression(min_sample=2,max_depth=2),n_estimators=100,learning_rate=0.2)
    gbrt.fit(X_train,y_train.ravel())
    y_new = gbrt.predict(X_test)
    # Printed labels mean "test set" (test RMSE) and "number of trees"
    # (how many estimators the ensemble holds after fitting).
    print("测试集：",np.sqrt(mean_squared_error(y_new,y_test)),end=",")
    print("树的数量：",len(gbrt._estimators))

    # Reference: sklearn's GradientBoostingRegressor with the same max_depth
    # and learning rate, and as many estimators as the GBDT above ended with.
    gb = GradientBoostingRegressor(max_depth=2,n_estimators=len(gbrt._estimators),learning_rate=0.2)
    gb.fit(X_train,y_train)
    print("sklearn：",np.sqrt( mean_squared_error(gb.predict(X_test),y_test)))


一棵树： 3.980448765337629,验证集： 3.1388723013148656,测试集： 2.55277495628571,树的数量： 29
sklearn： 3.462934332778807
一棵树： 4.6836775521157605,验证集： 3.991849404197001,测试集： 4.468125776560781,树的数量： 40
sklearn： 3.4606363068393793
一棵树： 3.7740942286148544,验证集： 3.3079551013875292,测试集： 3.2157076990546827,树的数量： 100
sklearn： 3.3692885019958543
一棵树： 5.129851882669831,验证集： 2.9357681488444816,测试集： 3.787032568415106,树的数量： 33
sklearn： 3.8949693096754863
一棵树： 4.543721893400337,验证集： 3.1506368569296064,测试集： 4.049982900228398,树的数量： 29
sklearn： 3.571508278796438
一棵树： 3.259780412582844,验证集： 2.5916458265206916,测试集： 2.9931545684127694,树的数量： 35
sklearn： 3.3651412341359466
一棵树： 4.111176284392709,验证集： 3.3418518264944588,测试集： 3.158580592557628,树的数量： 58
sklearn： 3.70981667591992
一棵树： 4.633169203959174,验证集： 3.005335653652421,测试集： 3.760267318164416,树的数量： 45
sklearn： 4.663020268312013
一棵树： 3.546742119954205,验证集： 3.633117733151072,测试集： 2.6991409165327904,树的数量： 48
sklearn： 3.2988834754681773
一棵树： 4.694320707356863,验证集： 3.4321681339

从上面的输出可以看到：自己实现的 GBDT 在测试集上的 RMSE 比单独一棵 CART 回归树要低，并且与 sklearn 自带的 GradientBoostingRegressor 的预测结果比较接近。