## 回归方法

与 AdaBoost 方法的不同之处：
1. 提升树的每个基学习器拟合的是此前所有基学习器累加预测后留下的残差
2. 提升树的组合方式是直接累加各基学习器的输出，不带权重

In [92]:
%matplotlib inline

import numpy as np

In [110]:
def split_dataset(dataset, res, split):
    """Fit a single-threshold, piecewise-constant predictor to ``res``.

    Samples with ``dataset < split`` are predicted by the mean of their
    residuals (``c1``); samples with ``dataset >= split`` by the mean of
    theirs (``c2``).

    Parameters:
        dataset: 1-D array of values of one feature.
        res: 1-D array of residuals to fit, aligned with ``dataset``.
        split: threshold separating the two sides.

    Returns:
        ``(out, c1, c2)`` where ``out`` is the float prediction for every
        sample and ``c1`` / ``c2`` are the constants for the low / high side.
        When one side is empty it takes the other side's mean, so the result
        is a constant predictor; when both are empty the constants are 0.0.
    """
    dataset = np.asarray(dataset)
    res = np.asarray(res)

    below = dataset < split
    above = dataset >= split
    n_below = np.count_nonzero(below)
    n_above = np.count_nonzero(above)

    if n_below and n_above:
        c1 = np.mean(res[below])
        c2 = np.mean(res[above])
    elif n_below or n_above:
        # np.mean of an empty slice is NaN (with a RuntimeWarning); fall back
        # to the populated side so no NaN leaks into the model.
        c1 = c2 = np.mean(res[below | above])
    else:
        c1 = c2 = 0.0

    # Force a float buffer: zeros_like on integer residuals would truncate
    # the means when they are written back.
    out = np.zeros_like(res, dtype=float)
    out[below] = c1
    out[above] = c2

    return out, c1, c2

def generate_stump(dataset, labels, max_step=10):
    """Search every feature and candidate threshold for the best stump.

    For each feature, ``max_step`` evenly spaced thresholds between the
    feature's minimum and maximum are tried, and the one with the smallest
    sum of squared errors against ``labels`` is kept.

    Parameters:
        dataset: 2-D array, one row per sample and one column per feature.
        labels: 1-D array of regression targets (or residuals) to fit.
        max_step: number of candidate thresholds per feature.

    Returns:
        ``(best_index, best_split, best_cs)``: the chosen feature column, the
        threshold, and ``[c1, c2]``, the constants predicted below / at-or-above
        the threshold. If no threshold separates the samples into two
        non-empty groups, a constant predictor (the mean of ``labels``) is
        returned instead.
    """
    _, n_features = dataset.shape

    best_index = 0
    best_split = 0
    best_cs = [0, 0]
    best_res = np.inf
    found = False
    for i in range(n_features):
        vec = dataset[:, i]
        min_v, max_v = vec.min(), vec.max()
        stride = (max_v - min_v) / max_step
        # Thresholds at or below the minimum (j <= 0) always leave the low
        # side empty, so the scan starts at j = 1.
        for j in range(1, max_step + 1):
            split_point = min_v + j * stride
            below = vec < split_point
            # Skip one-sided splits (constant feature, or rounding past the
            # maximum): their empty side has no defined mean.
            if not below.any() or below.all():
                continue
            pred, c1, c2 = split_dataset(vec, labels, split_point)
            res = labels - pred
            norm_res = np.sum(res * res)
            if norm_res < best_res:
                found = True
                best_res = norm_res
                best_index = i
                best_split = split_point
                best_cs = [c1, c2]

    if not found and np.size(labels):
        # No usable two-sided split: fit the mean on both sides so the stump
        # still reduces the residual and never carries NaN.
        mean = np.mean(labels)
        best_cs = [mean, mean]
    return best_index, best_split, best_cs

## 预测函数

1. 给定模型
2. 累加每一个子分类器回归的值得到最终的值

In [111]:
def predict(dataset, models):
    """Evaluate the additive stump ensemble on every row of ``dataset``.

    Parameters:
        dataset: 2-D array, one row per sample and one column per feature.
        models: iterable of ``[feature_index, threshold, [c1, c2]]`` stumps.

    Returns:
        1-D float array holding, per sample, the sum over all stumps of
        ``c1`` (feature value below the threshold) or ``c2`` (at or above it).
    """
    sample_count, _ = dataset.shape
    total = np.zeros(sample_count)

    for feature, threshold, (low, high) in models:
        total[dataset[:, feature] < threshold] += low
        total[dataset[:, feature] >= threshold] += high

    return total

def train(dataset, labels, cnt, toler):
    """Fit a boosted ensemble of regression stumps.

    Each round fits one stump to the current residual, adds it to the
    ensemble and recomputes the residual. Progress is printed every round.

    Parameters:
        dataset: 2-D array, one row per sample and one column per feature.
        labels: 1-D array of regression targets.
        cnt: maximum number of stumps to fit.
        toler: stop early once the sum of squared residuals drops below this.

    Returns:
        List of ``[feature_index, threshold, [c1, c2]]`` stumps, in the order
        they were fitted.
    """
    n_samples, _ = dataset.shape

    res = labels.copy()
    # Running ensemble prediction: adding only the newest stump each round
    # avoids re-evaluating every earlier stump (quadratic in cnt) and sums the
    # contributions in the same order as a full re-prediction would.
    pred = np.zeros(n_samples)
    models = []
    for _ in range(cnt):
        index, split, cs = generate_stump(dataset, res)
        print("index {} value {} cs ({}, {})".format(index, split, cs[0], cs[1]))
        stump = [index, split, cs]
        models.append(stump)
        pred += predict(dataset, [stump])
        res = labels - pred
        res_v = np.sum(res * res)
        print("new res: ", res_v)
        if res_v < toler:
            break
    return models

## 准备数据

In [112]:
def load_dataset():
    """Return the toy regression set used by this notebook.

    Returns:
        ``(x, y)``: ``x`` is a (10, 1) column holding the integers 1..10 and
        ``y`` is the matching 1-D array of ten float targets.
    """
    features = np.arange(1, 11)[:, np.newaxis]
    targets = np.array([
        5.56, 5.7, 5.91, 6.40, 6.80,
        7.05, 8.9, 8.7, 9.0, 9.05,
    ])
    return features, targets

In [113]:
# Load the toy data: a (10, 1) feature column and its 10 regression targets.
dataset, labels = load_dataset()

In [114]:
# Fit at most 10 stumps, stopping early once the sum of squared residuals
# falls below 0.05.
models = train(dataset, labels, 10, 0.05)

index 0 value 6.4 cs (6.236666666666667, 8.912500000000001)
new res:  1.9300083333333335
index 0 value 3.7 cs (-0.513333333333334, 0.219999999999999)
new res:  0.8006750000000016
index 0 value 6.4 cs (0.1466666666666668, -0.2200000000000002)
new res:  0.4780083333333344
index 0 value 4.6 cs (-0.16083333333333316, 0.1072222222222227)
new res:  0.30555925925925986
index 0 value 6.4 cs (0.0714814814814817, -0.10722222222222255)
new res:  0.22891522633744874
index 0 value 2.8 cs (-0.1506481481481483, 0.0376620370370373)
new res:  0.17217806498628246
index 0 value 8.2 cs (-0.01870949074074102, 0.07483796296296319)
new res:  0.1581762632351673
index 0 value 6.4 cs (0.04381751543209855, -0.06572627314814783)
new res:  0.1293766433555344
index 0 value 5.5 cs (-0.041038580246913446, 0.04103858024691309)
new res:  0.11253499266871056
index 0 value 7.3 cs (0.022484209656084633, -0.052463155864198065)
new res:  0.10073906671200575


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
