## To explore the GBDT.compute_loss

Binomial loss

In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one;
# later cells rely on this to show several results at once.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
from random import sample
from math import exp, log

from gbdt.data import DataSet
from gbdt.model import GBDT
from gbdt.model import BinomialDeviance, RegressionLossFunction
from gbdt.tree import construct_decision_tree, Tree
from gbdt.tree import MSE

In [3]:
# Load the credit data set from a CSV file given by a relative path.
dataset = DataSet('data/credit.data.csv')
# Shown as cell output: the label count reported by the data set (2 in the recorded run).
dataset.get_label_size()

2

#### The initial properties

In [4]:
# Number of boosting rounds run by the training loop (one tree is stored per round).
max_iter = 20
# Fraction of the training ids drawn at random for each round; values outside (0, 1) disable sampling.
sample_rate = 0.8
# Factor applied to every tree's prediction when scores are accumulated.
learn_rate = 0.5
# Passed to construct_decision_tree as the depth limit.
max_depth = 7
# Kept for reference only: the walk-through below instantiates BinomialDeviance directly.
loss_type = 'binary-classification'
# Passed through to construct_decision_tree.
# NOTE(review): presumably 0 means "try every observed value as a split point" — confirm in gbdt.tree.
split_points = 0
# Fitted trees keyed by boosting round (1..max_iter); filled by the training loop.
trees = dict()

#### Loss function: Binary classification
`GBDT.model.fit`: 

when `loss_type == 'binary-classification'`


In [5]:
# Build the binomial deviance loss from the data set's label count.
loss = BinomialDeviance(n_classes=dataset.get_label_size())
# Shown as cell output: loss.K (1 in the recorded run). The helpers below branch on K == 1
# to pick the single-score (binary) code path.
loss.K

1

In [6]:
# f maps instance id -> current model score; it is passed to the loss object and updated in place.
# NOTE(review): presumably initialize() seeds f with a starting score for every instance — confirm in gbdt.model.
f = dict()
loss.initialize(f, dataset)

In [7]:
# Use every instance id of the data set as training data.
train_data = dataset.get_instances_idset()
# train_data is a set, which contains IDs from 1 to 653

In [8]:
# define compute_loss                
def compute_loss(dataset, subset, f):
    """Return the mean loss of the current scores ``f`` over the whole data set.

    Relies on the notebook-level ``loss`` object to choose the formula:

    * ``loss.K == 1`` (binary, labels in {-1, +1}): the negative log-likelihood
      ``-((1+y)/2 * log(p) + (1-y)/2 * log(1-p))`` with ``p = 1/(1+exp(-2f))``.
    * otherwise (multi-class): the softmax cross-entropy ``-log(prob[label])``.

    Both are evaluated in a numerically stable form, so saturated scores
    contribute their true (large) loss instead of raising or being dropped.

    Args:
        dataset: object exposing ``get_instances_idset()``, ``get_instance(id)``
            (a mapping with a ``'label'`` entry) and ``size()``.
        subset: unused; kept for signature compatibility. The loss is always
            averaged over every instance of ``dataset``.
        f: mapping of instance id to its score (a number when ``loss.K == 1``,
            otherwise a mapping of label to score).

    Returns:
        The total loss divided by ``dataset.size()``.
    """

    def _softplus(z):
        # log(1 + e**z) without overflow: the exponent passed to exp() is never positive.
        return max(z, 0.0) + log(1.0 + exp(-abs(z)))

    total_loss = 0.0
    if loss.K == 1:
        for instance_id in dataset.get_instances_idset():
            y_i = dataset.get_instance(instance_id)['label']
            f_value = f[instance_id]
            # With p = 1/(1+exp(-2f)):  -log(p) = softplus(-2f)  and  -log(1-p) = softplus(2f).
            total_loss += ((1 + y_i) * _softplus(-2 * f_value) / 2
                           + (1 - y_i) * _softplus(2 * f_value) / 2)
    else:
        for instance_id in dataset.get_instances_idset():
            instance = dataset.get_instance(instance_id)
            f_values = f[instance_id]
            # -log(softmax[label]) = logsumexp(scores) - score[label]; shifting by the
            # largest score keeps every exp() argument <= 0.
            peak = max(f_values.values())
            log_norm = peak + log(sum(exp(score - peak) for score in f_values.values()))
            # The more accurate the prediction, the closer this term is to 0.
            total_loss += log_norm - f_values[instance["label"]]
    return total_loss / dataset.size()


In [9]:
def compute_instance_f_value(instance):
    """Accumulate the boosted score of ``instance`` over all fitted trees.

    Uses the notebook-level ``loss``, ``trees`` and ``learn_rate``. Every
    tree's prediction is scaled by ``learn_rate`` before being added.

    Returns:
        A single float when ``loss.K == 1``; otherwise a dict mapping each
        label in ``loss.labelset`` to its accumulated score.
    """
    if loss.K != 1:
        # Multi-class: each boosting round holds one regression tree per label.
        scores = {label: 0.0 for label in loss.labelset}
        for trees_by_label in trees.values():
            for label in loss.labelset:
                label_tree = trees_by_label[label]
                scores[label] += learn_rate*label_tree.get_predict_value(instance)
        return scores

    # Single-score case: one tree per boosting round.
    score = 0.0
    for fitted_tree in trees.values():
        score += learn_rate * fitted_tree.get_predict_value(instance)
    return score

def predict(instance):
    """Return the raw model score for ``instance``.

    For regression and binary classification this is a single f value;
    for multi-class classification it is the f value of every class.
    """
    f_value = compute_instance_f_value(instance)
    return f_value

def predict_prob(instance):
    """Return the probability of every class for ``instance``.

    Gives binary and multi-class models the same output shape: a dict of
    class -> probability. Uses the notebook-level ``loss`` and
    ``compute_instance_f_value``.

    * ``loss.K == 1``: keys are ``'+1'`` and ``'-1'``, with
      ``P(+1) = 1/(1+exp(-2f))`` and ``P(-1) = 1 - P(+1)``.
    * otherwise: the softmax of the per-label scores.

    Both are evaluated so that ``exp`` never receives a positive argument,
    which avoids ``OverflowError`` for extreme scores.

    Raises:
        RuntimeError: if ``loss`` is a regression loss.
    """
    if isinstance(loss, RegressionLossFunction):
        raise RuntimeError('regression problem can not predict prob ')

    f_value = compute_instance_f_value(instance)
    if loss.K == 1:
        if f_value >= 0:
            positive = 1/(1+exp(-2*f_value))
        else:
            # Same sigmoid rewritten as e/(1+e) with e = exp(2f) <= 1.
            shrunk = exp(2*f_value)
            positive = shrunk/(1+shrunk)
        return {'+1': positive, '-1': 1 - positive}

    # Softmax is invariant to shifting all scores; subtracting the largest keeps exp() bounded.
    peak = max(f_value.values(), default=0.0)
    exp_values = {label: exp(score - peak) for label, score in f_value.items()}
    exp_sum = sum(exp_values.values())
    # Normalise to obtain the class probabilities.
    return {label: value/exp_sum for label, value in exp_values.items()}

def predict_label(instance):
    """Predict the class label of ``instance``.

    Uses the notebook-level ``loss`` and ``predict_prob``.

    Returns:
        ``1`` or ``-1`` for a binomial-deviance model (``1`` wins ties);
        otherwise the label with the highest probability (the first one
        encountered on ties), or ``None`` if no probabilities are available.
    """
    probs = predict_prob(instance)
    if isinstance(loss, BinomialDeviance):
        return 1 if probs['+1'] >= probs['-1'] else -1
    # Pick the most probable of the K classes. max() keeps the first maximum and,
    # unlike a truthiness check, handles falsy labels such as 0 correctly.
    return max(probs, key=probs.get, default=None)

`GBDT.compute_loss`：

如果`loss.K==1`,对应的是log loss，此时loss的计算公式为$loss = loss - likelihood$
$$likelihood = \bigg(\frac{1+y_i}{2}\log p + \frac{1-y_i}{2}\log(1-p)\bigg)$$ 
其中，$f = f[id], p = \frac{1}{1+\exp(-2f)}$.

回忆：在`gbdt.model.BinomialDeviance.compute_residual`中计算的是残差，即损失函数的负梯度；它对应的loss是Friedman文章中的
$$L(y_i, f) = \log\big(1+\exp{(-2y_if)}\big),$$
其负梯度（残差）为 $\tilde{y}_i = \frac{2y_i}{1+\exp{(2y_if)}}$。

#### 问题：这两个loss等价吗？
我们从第一个表达式推导到第二个：

第一个loss是来自于logistic regression的log loss，label的取值是0,1；第二个loss是来自于Friedman的文章，label的取值是-1,1。所以需要将0,1取值变换为-1,1取值：

令$y = 2Y-1$（即$Y=\frac{1+y}{2}$）：当$Y\in\{0,1\}$时，$y\in\{-1,1\}$。

令$p = P(y=1|x)=\frac{1}{1+\exp(-2f)}$， 
$$\begin{array}{rl}
likelihood(Y,p) & = Y\log p + (1-Y)\log(1-p)\\
     & = \frac{1+y}{2}\log p + \frac{1-y}{2}\log(1-p) \\
     & = \frac{1+y}{2}\log\frac{1}{1+\exp(-2f)} + \frac{1-y}{2}\log \frac{\exp(-2f)}{1+ \exp{(-2f)}}\\
     & = \frac{1+y}{2}\log\frac{1}{1+\exp(-2f)} + \frac{1-y}{2}\log \frac{1}{1+ \exp{(2f)}}\\
     & = -\frac{1+y}{2}\log\big(1+e^{-2f}\big) - \frac{1-y}{2}\log\big({1+ e^{2f}}\big)\\
     & = -\log(1+e^{-2yf}), ~~~(y=-1~or ~1)
\end{array}$$

目标是极大化似然，等价于极小化负的似然，即极小化损失函数：$\min ~L(y, f(x)) = \min ~\log(1+e^{-2yf})$

此时，$$f^*(x) = \arg\min\limits_{f(x)} E_{y|x}\bigg(\log(1+e^{-2yf(x)})\bigg) = \frac{1}{2}\log\frac{P(y=1|x)}{P(y=-1|x)}$$
为理论上$f(x)$的最优的映射。

证明：因为$E(\log(1+e^{-2yf(x)})) = P(y=1|x)\log(1+e^{-2f}) + P(y=-1|x)\log(1+e^{2f})$

所以
$$
\begin{array}{rl}
\frac{\partial E(\log(1+e^{-2yf}))}{\partial f} & = P(y=1|x)\frac{-2e^{-2f}}{1+e^{-2f}} + P(y=-1|x)\frac{2e^{2f}}{1+e^{2f}} \\
                                                & \propto P(y=1|x)\frac{-1}{1+e^{2f}} + P(y=-1|x)\frac{e^{2f}}{1+e^{2f}} \\
                                                & \propto -P(y=1|x) + e^{2f}P(y=-1|x) \\
                                                & = 0
\end{array}
$$
所以$f^* = \frac{1}{2}\log\frac{P(y=1|x)}{P(y=-1|x)}$

参考资料：
- http://statweb.stanford.edu/~tibs/book/chap14.pdf Page10
- http://docs.salford-systems.com/GreedyFuncApproxSS.pdf Page8

In [10]:
# random.sample() needs a sequence: passing a set is deprecated since Python 3.9
# and raises TypeError from 3.11 on, so materialise the ids once up front.
id_pool = list(train_data)
for iter_m in range(1, max_iter+1):  # m-th boosting round: fit one more decision tree
    subset = train_data
    if 0 < sample_rate < 1:
        # Only a random fraction (sample_rate) of the ids is used to build this round's tree.
        # Per the original author's note, the remaining ids later get their values from the
        # leaf nodes of that tree.
        subset = sample(id_pool, int(len(id_pool)*sample_rate))
    # The negative gradient of the loss serves as the residual the regression tree is fitted to.
    residual = loss.compute_residual(dataset, subset, f)
    leaf_nodes = []
    targets = residual
    tree = construct_decision_tree(dataset, subset, targets, 0, leaf_nodes, max_depth, loss, split_points)
    trees[iter_m] = tree
    # Update the score of every instance id with the newly fitted tree.
    loss.update_f_value(f, tree, leaf_nodes, subset, dataset, learn_rate)
    # TODO: evaluate the fit for regression losses; only classification reports a loss for now.
    if not isinstance(loss, RegressionLossFunction):
        train_loss = compute_loss(dataset, train_data, f)
        print(iter_m, train_loss)

1 0.3850905347256806
2 0.2479308474226447
3 0.1688799662267075
4 0.11971562470711063
5 0.09054012247466625
6 0.06904249771756119
7 0.05364686501770368
8 0.043670593958652235
9 0.034551810011379226
10 0.0268865680632009
11 0.023609457248059188
12 0.019432939290180737
13 0.017353362121343054
14 0.014058836244390267
15 0.012388793007838022
16 0.010032983389178995
17 0.008900641635052671
18 0.007912802950569075
19 0.006688882609641617
20 0.0055106738485271875


In [11]:
# Inspect the final scores of instances 1, 2 and 3 after the training loop.
f[1]
f[2]
f[3]

2.325045467075996

3.579335791032858

3.0914734636045216

In [12]:
ins = dataset.get_instance(1)

In [13]:
ins

{'A1': 'b',
 'A10': 't',
 'A11': 1.0,
 'A12': 'f',
 'A13': 'g',
 'A14': 202.0,
 'A15': 0.0,
 'A2': 30.83,
 'A3': 0.0,
 'A4': 'u',
 'A5': 'g',
 'A6': 'w',
 'A7': 'v',
 'A8': 1.25,
 'A9': 't',
 'label': 1.0}

In [14]:
compute_instance_f_value(ins)

2.325045467075996

In [15]:
predict(ins)

2.325045467075996

In [16]:
predict_prob(ins)

{'+1': 0.9905298094644651, '-1': 0.009470190535534884}

In [17]:
predict_label(ins)

1

### Real use case

In [18]:
# Reuses the `dataset` loaded at the top of the notebook. To load it afresh instead:
#     dataset = DataSet('./data/credit.data.csv')
# Train the library's GBDT with the same settings as the manual walk-through, on every instance id.
gbdt = GBDT(max_iter=20, sample_rate=0.8, learn_rate=0.5, max_depth=7, loss_type='binary-classification')
gbdt.fit(dataset, dataset.get_instances_idset())

iter1 : train loss=0.397886
iter2 : train loss=0.254912
iter3 : train loss=0.177351
iter4 : train loss=0.133279
iter5 : train loss=0.100853
iter6 : train loss=0.072687
iter7 : train loss=0.057461
iter8 : train loss=0.045802
iter9 : train loss=0.037130
iter10 : train loss=0.030485
iter11 : train loss=0.025559
iter12 : train loss=0.019611
iter13 : train loss=0.015378
iter14 : train loss=0.014205
iter15 : train loss=0.012190
iter16 : train loss=0.009855
iter17 : train loss=0.008412
iter18 : train loss=0.007138
iter19 : train loss=0.005737
iter20 : train loss=0.004679


In [19]:
# Score the same instance (id 1) with the library model.
# NOTE(review): the value differs slightly from the manual walk-through, presumably because
# each run draws different unseeded random subsamples — confirm by fixing the random seed.
ins = dataset.get_instance(1)
gbdt.compute_instance_f_value(ins)

2.4720614878340497

In [20]:
gbdt.predict_prob(ins)
gbdt.predict_label(ins)

{'+1': 0.9929252478828322, '-1': 0.007074752117167793}

1