#### AdaBoost算法：  
**输入**：训练数据集$T=\{(x_1,y_1),(x_2,y_2),...,(x_N,y_N) \}$，其中$x_i\in X\subseteq R^n$，$y_i\in Y=\{-1,+1 \}$；弱学习算法；  
**输出**：最总分类器$G(x)$。  
（1）初始化训练数据的权值分布  
&emsp;&emsp;&emsp;$D_1=(w_{11},...,w_{1i},...,w_{1N})$,&emsp;$w_{1i}=\frac{1}{N}$,&emsp;$i=1,2,...,N $  
（2）对$m=1,2,...,M$  
&emsp;（a）使用具有权值分布$D_m$的训练数据集学习，得到基本分类器  
&emsp;&emsp;&emsp;$G_m(x):X\to \{-1,+1 \}$  
&emsp;（b）计算$G_m(x)$在训练数据集上的分类误差率  
&emsp;&emsp;&emsp;$e_m=\sum_{i=1}^NP(G_m(x_i)\ne y_i)=\sum_{i=1}^Nw_{mi}I(G_m(x_i)\ne y_i)$  
&emsp;（c）计算$G_m(x)$的系数  
&emsp;&emsp;&emsp;$\alpha_m=\frac{1}{2}log\frac{1-e_m}{e_m}$  
&emsp;这里的对数是自然对数。  
&emsp;（d）更新训练数据集的权值分布  
&emsp;&emsp;&emsp;$D_{m+1}=(w_{m+1,1},...,w_{m+1,i},...,w_{m+1,N})$  
&emsp;&emsp;&emsp;$w_{m+1,i}=\frac{w_{mi}}{Z_m}exp(-\alpha_my_iG_m(x_i)),i=1,2,....N $  
&emsp;这里，$Z_m$是规范因子  
&emsp;&emsp;&emsp;$Z_m=\sum_{i=1}^Nw_{mi}exp(-\alpha_my_iG_m(x_i))$  
&emsp;它使$D_{m+1}$成为一个概率分布。  
（3）构建基本分类器的线性组合  
&emsp;&emsp;&emsp;$f(x)=\sum_{m=1}^M\alpha_mG_m(x)$  
&emsp;得到最终分类器  
&emsp;&emsp;&emsp;$G(x)=sign(f(x))$  
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;$=sign(\sum_{m=1}^M\alpha_mG_m(x))$

In [1]:
import numpy as np

#### 加载文件  
$param fileName$:要加载的文件路径  
*return*: 数据集和标签集

In [2]:
def loadData(fileName):
    dataArr = []
    labelArr = []
    fr = open(fileName)
    for line in fr.readlines():
        curLine = line.strip().split(',')
        dataArr.append([int(int(num) > 128) for num in curLine[1:]])
        if int(curLine[0]) == 0:
            labelArr.append(1)
        else:
            labelArr.append(-1)
            
    #返回数据集和标记
    return dataArr, labelArr

#### 创建单层提升树  
$param trainDataArr$:训练数据集数组  
$param trainLabelArr$: 训练标签集数组  
$param D$: 算法8.1中的D  
*return*: 创建的单层提升树

In [3]:
def createSigleBoostingTree(trainDataArr, trainLabelArr, D):
    m, n = np.shape(trainDataArr)
    # 单层树的字典
    sigleBoostTree = {}
    # 初始化分类误差率
    sigleBoostTree['e'] = 1
    # 对每一个特征进行遍历，寻找用于划分的最合适的特征
    for i in range(n):
        for div in [-0.5, 0.5, 1.5]:
            for rule in ['LisOne', 'HisOne']:
                Gx, e = calc_e_Gx(trainDataArr, trainLabelArr, i, div, rule, D)
                if e < sigleBoostTree['e']:
                    sigleBoostTree['e'] = e
                    # 存储最优划分点、划分规则、预测结果、特征索引
                    sigleBoostTree['div'] = div
                    sigleBoostTree['rule'] = rule
                    sigleBoostTree['Gx'] = Gx
                    sigleBoostTree['feature'] = i
                    
    # 返回单层的提升树
    return sigleBoostTree

#### 创建提升树   
$param trainDataList$:训练数据集  
$param trainLabelList$: 训练测试集  
$param treeNum$: 树的层数  
*return*: 提升树

In [4]:
def createBosstingTree(trainDataList, trainLabelList, treeNum = 50):
    trainDataArr = np.array(trainDataList)
    trainLabelArr = np.array(trainLabelList)
    finallpredict = [0] * len(trainLabelArr)
    m, n = np.shape(trainDataArr)

    D = [1 / m] * m
    tree = []
    #循环创建提升树
    for i in range(treeNum):
        curTree = createSigleBoostingTree(trainDataArr, trainLabelArr, D)
        alpha = 1/2 * np.log((1 - curTree['e']) / curTree['e'])
        Gx = curTree['Gx']
        D = np.multiply(D, np.exp(-1 * alpha * np.multiply(trainLabelArr, Gx))) / sum(D)
        curTree['alpha'] = alpha
        tree.append(curTree)

        finallpredict += alpha * Gx
        error = sum([1 for i in range(len(trainDataList)) if np.sign(finallpredict[i]) != trainLabelArr[i]])
        #计算当前最终误差率
        finallError = error / len(trainDataList)
        if finallError == 0:
            return tree
        print('iter:%d:%d, sigle error:%.4f, finall error:%.4f'%(i, treeNum, curTree['e'], finallError ))
        
    #返回整个提升树
    return tree

#### 计算分类错误率  
$param trainDataArr$:训练数据集数字  
$param trainLabelArr$: 训练标签集数组  
$param n$: 要操作的特征  
$param div$:划分点  
$param rule$:正反例标签  
$param D$:权值分布D  
*return*:预测结果，分类误差率

In [5]:
def calc_e_Gx(trainDataArr, trainLabelArr, n, div, rule, D):
    e = 0
    x = trainDataArr[:, n]
    y = trainLabelArr
    predict = []

    if rule == 'LisOne':
        L = 1; H = -1
    else:
        L = -1; H = 1

    #遍历所有样本的特征m
    for i in range(trainDataArr.shape[0]):
        if x[i] < div:
            #如果小于划分点，则预测为L
            predict.append(L)
            if y[i] != L:
                e += D[i]
        elif x[i] >= div:
            predict.append(H)
            if y[i] != H:
                e += D[i]
    #返回预测结果和分类错误率e
    return np.array(predict), e

#### 输出单独层预测结果  
$param x$: 预测样本  
$param div$: 划分点  
$param rule$: 划分规则  
$param feature$: 进行操作的特征

In [6]:
def predict(x, div, rule, feature):
    if rule == 'LisOne':
        L = 1; H = -1
    else:
        L = -1; H = 1

    #判断预测结果
    if x[feature] < div:
        return L
    else:
        return H

#### 测试  
$param testDataList$:测试数据集  
$param testLabelList$: 测试标签集  
$param tree$: 提升树  
*return*: 准确率

In [7]:
def model_test(testDataList, testLabelList, tree):
    errorCnt = 0
    #遍历每一个测试样本
    for i in range(len(testDataList)):
        result = 0
        #遍历每层的树
        for curTree in tree:
            div = curTree['div']
            rule = curTree['rule']
            feature = curTree['feature']
            alpha = curTree['alpha']
            result += alpha * predict(testDataList[i], div, rule, feature)
        #预测结果取sign值
        if np.sign(result) != testLabelList[i]:
            errorCnt += 1
            
    #返回准确率
    return 1 - errorCnt / len(testDataList)

#### 开始实验

In [9]:
# 获取训练集
trainDataList, trainLabelList = loadData('Mnist/mnist_train.csv')

# 获取测试集
testDataList, testLabelList = loadData('Mnist/mnist_test.csv')

#创建提升树
print('start init train')
tree = createBosstingTree(trainDataList[:10000], trainLabelList[:10000], 5)

#测试
print('start to test')
accuracy = model_test(testDataList[:1000], testLabelList[:1000], tree)
print('the accuracy is:',accuracy)

start init train
iter:0:5, sigle error:0.0804, finall error:0.0804
iter:1:5, sigle error:0.1448, finall error:0.0804
iter:2:5, sigle error:0.1362, finall error:0.0585
iter:3:5, sigle error:0.1864, finall error:0.0667
iter:4:5, sigle error:0.2249, finall error:0.0474
start to test
the accuracy is: 0.944
