### 1.构建单层决策树

In [1]:
#构造简单数据集
import numpy as np
def loadSimData():
    """Return the five-point, two-feature toy dataset and its labels.

    Returns
    -------
    (numpy.ndarray, list)
        A 5x2 array of feature values and a list of the five
        corresponding class labels (+1 or -1).
    """
    # Each entry pairs a sample's two feature values with its class label.
    labelled = (
        ([1, 2.1], 1),
        ([2, 1.1], 1),
        ([1.3, 1], -1),
        ([1, 1], -1),
        ([2, 1], 1),
    )
    features = np.array([point for point, _ in labelled])
    labels = [label for _, label in labelled]
    return features, labels
#画出正反例
import matplotlib.pyplot as plt
dataArr, classLabels = loadSimData()
# Split the 5x2 sample array into its two feature columns once.
featX, featY = dataArr.T
# Rows 2-3 carry label -1 in classLabels: draw them as red squares.
plt.scatter(featX[2:4], featY[2:4], s=50, c='red', marker='s')
# Rows 0-1 and row 4 carry label +1: draw them as green circles.
plt.scatter(featX[0:2], featY[0:2], s=50, c='green', marker='o')
plt.scatter(featX[4], featY[4], s=50, c='green', marker='o')

<matplotlib.collections.PathCollection at 0x12780470>

In [2]:
#dataArr为数据集，dimen为特征维数，threshVal为阈值，threshIneq可以在大于、小于之间切换
def stumpClassify(dataArr,dimen,threshVal,threshIneq):
    """Classify every sample with a single-feature threshold test (a stump).

    Parameters
    ----------
    dataArr : array-like
        Sample data, one row per sample; converted with ``np.asmatrix``.
    dimen : int
        Column index of the feature to test.
    threshVal : float
        Threshold the feature value is compared against.
    threshIneq : str
        ``'forward'`` labels samples with value <= threshVal as -1;
        any other value labels samples with value > threshVal as -1.

    Returns
    -------
    numpy.matrix
        Column matrix with one row per sample, holding +1.0 or -1.0.
    """
    # np.asmatrix replaces np.mat, which is no longer available in NumPy 2.x.
    dataMat = np.asmatrix(dataArr)
    column = dataMat[:, dimen]
    if threshIneq == 'forward':
        negMask = column <= threshVal
    else:
        negMask = column > threshVal
    # Start with every sample labelled +1, then flip the masked ones to -1.
    retMat = np.asmatrix(np.ones([dataMat.shape[0], 1]))
    retMat[negMask] = -1.0
    return retMat

In [11]:
# dataArr is the dataset, classLabels the sample labels, D the sample-weight vector
def buildStump(dataArr,classLabels,D):
    """Find the decision stump with the lowest weighted error under D.

    For every feature, thresholds ``rangeMin + j*stepSize`` are tried for
    j = -1 .. numSteps (so one threshold lies just below the feature's
    minimum), each in both the 'forward' and 'reverse' direction.

    Parameters
    ----------
    dataArr : array-like
        Sample data, one row per sample.
    classLabels : sequence
        One label per sample; compared for equality with the stump's
        +1/-1 predictions.
    D : numpy.matrix
        Column matrix of sample weights, one row per sample.

    Returns
    -------
    bestStump : dict
        Keys 'dim', 'thresh' and 'inequal' describing the chosen stump
        (empty if no candidate beat the initial infinite error).
    minErrRate : numpy.matrix or float
        Weighted error of the chosen stump, i.e. the result of
        ``D.T * errArr`` (a 1x1 matrix when D is a column matrix);
        ``np.inf`` if no candidate was accepted.
    bestClassEstimate : numpy.matrix
        Column matrix of the chosen stump's predictions.
    """
    # np.asmatrix replaces np.mat, which is no longer available in NumPy 2.x.
    dataMat = np.asmatrix(dataArr)
    labelMat = np.asmatrix(classLabels).T
    m, n = dataMat.shape
    numSteps = 10.0
    bestStump = {}
    bestClassEstimate = np.asmatrix(np.zeros([m, 1]))
    # Start at infinity so the first evaluated stump always becomes the best.
    minErrRate = np.inf
    for i in range(n):  # every feature
        rangeMin = dataMat[:, i].min()
        rangeMax = dataMat[:, i].max()
        stepSize = (rangeMax - rangeMin) / numSteps
        for j in range(-1, int(numSteps + 1)):  # every candidate threshold
            # The threshold does not depend on the direction, so compute it once.
            threshVal = rangeMin + float(j) * stepSize
            for inequal in ['forward', 'reverse']:
                predictClass = stumpClassify(dataMat, i, threshVal, inequal)
                # 1.0 where the prediction is wrong, 0.0 where it is right.
                errArr = (predictClass != labelMat).astype(float)
                # Weighted error: sum of the weights of misclassified samples.
                weightError = D.T * errArr
                if weightError < minErrRate:
                    minErrRate = weightError
                    bestClassEstimate = predictClass.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['inequal'] = inequal
    return bestStump, minErrRate, bestClassEstimate

In [12]:
# Uniform weights over the five toy samples (np.asmatrix replaces np.mat,
# which is no longer available in NumPy 2.x).
D = np.asmatrix(np.ones([5,1])/5)
buildStump(dataArr,classLabels,D)

({'dim': 0, 'thresh': 1.3, 'inequal': 'forward'},
 matrix([[0.2]]),
 matrix([[-1.],
         [ 1.],
         [-1.],
         [-1.],
         [ 1.]]))

### 2.基于单层决策树的Adaboost训练过程

In [13]:
def adaBoostTrain(dataArr,classLabels,Iter):
    """Train an AdaBoost ensemble of decision stumps.

    Parameters
    ----------
    dataArr : array-like
        Training samples, one row per sample.
    classLabels : sequence
        One label per sample; compared against ``np.sign`` of the
        aggregated estimate, so +1/-1 labels are expected.
    Iter : int
        Maximum number of boosting rounds. Training stops earlier once
        the training error reaches 0.

    Returns
    -------
    weakClassify : list of dict
        One dict per round with keys 'dim', 'thresh', 'inequal', 'alpha'.
    aggClassEsti : numpy.matrix
        Column matrix holding, per training sample, the alpha-weighted
        sum of all stump predictions.
    """
    weakClassify = []
    # np.asmatrix replaces np.mat, which is no longer available in NumPy 2.x.
    labelMat = np.asmatrix(classLabels).T  # loop-invariant, built once
    m = np.asmatrix(dataArr).shape[0]  # number of samples
    D = np.asmatrix(np.ones([m, 1]) / m)  # uniform initial sample weights
    aggClassEsti = np.asmatrix(np.zeros([m, 1]))
    for _ in range(Iter):
        bestStump, minErrRate, bestClassEstimate = buildStump(dataArr, classLabels, D)

        # buildStump returns the error as a 1x1 matrix; extract the scalar
        # explicitly because float() on a non-0-d array is deprecated.
        errValue = np.asarray(minErrRate).item()
        # Flooring the denominator at exp(-16) avoids division by zero when
        # the stump is perfect.
        # NOTE(review): the floor is exp(-16) (~1.1e-7), not 1e-16 - confirm intent.
        alpha = float(0.5 * np.log((1 - errValue) / max(errValue, np.exp(-16))))
        bestStump['alpha'] = alpha
        weakClassify.append(bestStump)

        # Re-weight samples for the next round: label*prediction is +1 for
        # correctly classified samples (weight shrinks) and -1 for
        # misclassified ones (weight grows); then renormalise to sum to 1.
        expon = np.multiply(-alpha * labelMat, bestClassEstimate)
        D = np.multiply(D, np.exp(expon))
        D = D / D.sum()

        aggClassEsti += alpha * bestClassEstimate

        # Training error of the ensemble so far, using sign() as the
        # hard-limit function.
        misclassified = np.sign(aggClassEsti) != labelMat
        errRate = misclassified.sum() / m
        print("total errRate:",errRate)
        if errRate == 0:
            break
    return weakClassify, aggClassEsti

In [14]:
# Boost on the toy dataset for at most 40 rounds.
weakClassify, aggClassEsti = adaBoostTrain(dataArr, classLabels, Iter=40)

total errRate: 0.2
total errRate: 0.2
total errRate: 0.0


In [153]:
# Display the trained ensemble: one dict per stump with its split and alpha weight.
weakClassify

[{'dim': 0, 'thresh': 1.3, 'inequal': 'forward', 'alpha': 0.6931471805599453},
 {'dim': 1, 'thresh': 1.0, 'inequal': 'forward', 'alpha': 0.9729550745276565},
 {'dim': 0, 'thresh': 0.9, 'inequal': 'forward', 'alpha': 0.8958797346140273}]

### 3.测试算法：基于AdaBoost的分类

In [154]:
def adaboostClassify(data,weakClassify):
    """Classify samples with a trained list of weighted decision stumps.

    Parameters
    ----------
    data : array-like
        Samples to classify, one row per sample.
    weakClassify : list of dict
        Stumps with keys 'dim', 'thresh', 'inequal' and 'alpha'.

    Returns
    -------
    numpy.matrix
        Column matrix with the sign of the alpha-weighted vote for each
        sample (0 where the vote sums to exactly zero).
    """
    # np.asmatrix replaces np.mat, which is no longer available in NumPy 2.x;
    # converting here also lets plain nested lists be passed as data.
    dataMat = np.asmatrix(data)
    aggClassEst = np.asmatrix(np.zeros([dataMat.shape[0], 1]))
    for stump in weakClassify:
        classEst = stumpClassify(dataMat, stump['dim'], stump['thresh'], stump['inequal'])
        # Each stump votes +1/-1, scaled by its weight alpha.
        aggClassEst += stump['alpha'] * classEst
    return np.sign(aggClassEst)

In [155]:
# Classify a single sample at the origin with the trained ensemble.
adaboostClassify(np.array([[0,0]]),weakClassify)

matrix([[-1.]])

In [156]:
# Classify the toy training samples themselves with the trained ensemble.
adaboostClassify(dataArr,weakClassify)

matrix([[ 1.],
        [ 1.],
        [-1.],
        [-1.],
        [ 1.]])

### 4.在一个难数据集上应用Adaboost

In [157]:
#这里将在马疝病数据集上应用AdaBoost分类器，前面曾经用过逻辑回归预测患有疝病的马是否能够存活，
#这里想知道利用多个单层决策树和Adaboost能不能预测得更准
#1.自适应加载数据
def loadDataSet(filename):
    """Load a tab-separated numeric dataset whose last column is the label.

    The number of columns is detected from the first non-blank line.
    Blank lines are skipped.

    Parameters
    ----------
    filename : str
        Path of the tab-separated text file.

    Returns
    -------
    dataList : list of list of float
        Feature values, one inner list per row (all columns but the last
        detected one).
    labelList : list of float
        The last field of each row.

    Raises
    ------
    ValueError
        If a field cannot be parsed as a float.
    IndexError
        If a row has fewer fields than the first row's feature count.
    """
    dataList = []
    labelList = []
    numFeat = None
    # A single context-managed handle replaces two opens that were never closed.
    with open(filename) as fr:
        for line in fr:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            curLine = line.split('\t')
            if numFeat is None:
                # Column count, label included, taken from the first data row.
                numFeat = len(curLine)
            dataList.append([float(curLine[i]) for i in range(numFeat - 1)])
            labelList.append(float(curLine[-1]))
    return dataList, labelList

In [187]:
# Train on the horse-colic training file for at most 10 boosting rounds.
trainData,trainLabel = loadDataSet('horseColicTraining2.txt')
dataArr2 = np.array(trainData)
# Pass the labels that were just loaded: the earlier `labelList` is only a
# local variable inside loadDataSet and is not defined at this scope.
weakClassify,aggClassEsti = adaBoostTrain(dataArr2,trainLabel,Iter = 10)

total errRate: 0.2842809364548495
total errRate: 0.2842809364548495
total errRate: 0.24749163879598662
total errRate: 0.24749163879598662
total errRate: 0.25418060200668896
total errRate: 0.2408026755852843
total errRate: 0.2408026755852843
total errRate: 0.22073578595317725
total errRate: 0.24749163879598662
total errRate: 0.23076923076923078


In [188]:
# Evaluate the trained ensemble on the horse-colic test file.
testData,testLabel = loadDataSet('horseColicTest2.txt')
testArr = np.array(testData)
predictLabel = adaboostClassify(testArr,weakClassify)
m = testArr.shape[0]
# 1 where the prediction differs from the true label, 0 elsewhere
# (np.asmatrix replaces np.mat, which is no longer available in NumPy 2.x).
errNum = np.asmatrix(np.zeros([m,1]))
errNum[predictLabel!=np.asmatrix(testLabel).T] = 1
print('the rate of error is %.2f'%(errNum.sum()/m))

the rate of error is 0.24
