In [1]:
import numpy as np
def stumpClassify(dataMatrix,dimen,threshVal,threshIneq):
    """Label every sample with a one-feature threshold test (a decision stump).

    Args:
        dataMatrix: 2-D samples-by-features container that supports
            ``[:, dimen]`` indexing (NumPy matrix or ndarray).
        dimen: column index of the feature being tested.
        threshVal: threshold the selected feature is compared against.
        threshIneq: ``'lt'`` assigns -1.0 to samples whose feature is
            ``<= threshVal``; any other value assigns -1.0 to samples whose
            feature is ``> threshVal``.

    Returns:
        An (m, 1) float ndarray of +1.0 / -1.0, one row per sample.
    """
    feature_column = dataMatrix[:, dimen]
    if threshIneq == 'lt':
        negative_mask = feature_column <= threshVal
    else:
        negative_mask = feature_column > threshVal

    sample_count = np.shape(dataMatrix)[0]
    retArray = np.ones((sample_count, 1))  # every sample starts in the +1 class
    retArray[negative_mask] = -1.0
    return retArray

# Smoke test: split three samples on feature 1 at threshold 2 using the 'lt' rule.
# np.asmatrix is used because the np.mat alias was removed in NumPy 2.0.
dataMatrix=np.asmatrix([[1,2,3,4],[2,1,0,3],[2,3,1,3]])
stumpClassify(dataMatrix,1,2,'lt')

array([[-1.],
       [-1.],
       [ 1.]])

In [2]:
def buildStump(dataArr,classLabels,D):
    """Find the decision stump with the lowest weighted error under weights D.

    Every feature is scanned over ``numSteps + 2`` evenly spaced thresholds
    (starting one step below the feature's minimum and ending at its maximum)
    and both inequality directions; the first candidate reaching the smallest
    weighted error is kept.

    Args:
        dataArr: samples-by-features data (anything ``np.asmatrix`` accepts).
        classLabels: sequence of m labels, compared for equality against the
            +1.0 / -1.0 predictions of ``stumpClassify``.
        D: (m, 1) matrix of sample weights.

    Returns:
        Tuple ``(bestStump, minError, bestClasEst)``:
        ``bestStump`` is a dict with keys ``'dim'``, ``'thresh'`` and ``'ineq'``;
        ``minError`` is the 1x1 matrix ``D.T * errArr`` of the best candidate
        (``np.inf`` if no candidate was evaluated);
        ``bestClasEst`` holds that candidate's (m, 1) predictions.
    """
    # np.asmatrix is what the np.mat alias pointed to; the alias itself was
    # removed in NumPy 2.0.
    dataMatrix = np.asmatrix(dataArr)
    labelMat = np.asmatrix(classLabels).T
    m, n = np.shape(dataMatrix)
    numSteps = 10.0
    bestStump = {}
    bestClasEst = np.asmatrix(np.zeros((m, 1)))
    minError = np.inf
    for i in range(n):  # scan every feature
        rangeMin = dataMatrix[:, i].min()
        rangeMax = dataMatrix[:, i].max()
        stepSize = (rangeMax - rangeMin) / numSteps
        # Thresholds are sampled on a fixed grid instead of at every observed value.
        for j in range(-1, int(numSteps) + 1):
            threshVal = rangeMin + float(j) * stepSize  # same for both directions
            for inequal in ['lt', 'gt']:
                predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)
                errArr = np.asmatrix(np.ones((m, 1)))
                errArr[predictedVals == labelMat] = 0  # 1 marks a misclassified sample
                weightedError = D.T * errArr  # (1, m) * (m, 1) -> 1x1 matrix
                if weightedError < minError:
                    minError = weightedError
                    bestClasEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClasEst

In [17]:
def adaBoostTrainDS(dataArr,classLabels,numIt=40,debug=False):
    """Train an AdaBoost ensemble of decision stumps.

    Args:
        dataArr: samples-by-features training data, passed through to ``buildStump``.
        classLabels: sequence of m labels; the weight update and the error
            check compare them with the sign of the ensemble output, so they
            are expected to be +1 / -1.
        numIt: maximum number of boosting rounds.
        debug: when true, print the weights, stump predictions, aggregate
            estimate and error after every round.

    Returns:
        Tuple ``(weakClassArr, errorRate)``: the list of stump dicts (each
        extended with its ``'alpha'`` vote weight) and the training error of
        the final ensemble. With ``numIt <= 0`` no stump is trained and the
        error is reported as 1.0, because an all-zero aggregate estimate
        matches no +1 / -1 label.
    """
    weakClassArr = []
    m = np.shape(dataArr)[0]
    # np.asmatrix replaces the np.mat alias removed in NumPy 2.0; the label
    # column is built once instead of twice per round.
    labelMat = np.asmatrix(classLabels).T
    D = np.asmatrix(np.ones((m,1))/m)   # start with uniform sample weights
    aggClassEst = np.asmatrix(np.zeros((m,1)))
    errorRate = 1.0   # defined even when the loop body never runs
    for _ in range(numIt):
        bestStump,error,classEst = buildStump(dataArr,classLabels,D)
        if debug: print("D:",D.T)
        # buildStump reports the error as a 1x1 matrix; pull out the scalar
        # explicitly because float() on a non-0-d array is deprecated.
        error = float(np.asarray(error).reshape(-1)[0])
        # max(..., 1e-16) guards against division by zero for a perfect stump.
        alpha = float(0.5*np.log((1.0-error)/max(error,1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)   # keep this round's weak classifier

        # Re-weight the samples: the exponent is -alpha where the stump was
        # right and +alpha where it was wrong.
        if debug: print("classEst: ",classEst.T)
        expon = np.multiply(-1*alpha*labelMat,classEst)
        D = np.multiply(D,np.exp(expon))
        D = D/D.sum()

        # Training error of the ensemble so far; stop early once it hits zero.
        aggClassEst += alpha*classEst
        if debug: print("aggClassEst: ",aggClassEst.T)
        aggErrors = np.multiply(np.sign(aggClassEst) != labelMat,np.ones((m,1)))
        if debug: print("aggErrors:",aggErrors)
        errorRate = aggErrors.sum()/m
        if debug: print("total error: ",errorRate)
        if errorRate == 0.0: break
    return weakClassArr,errorRate

In [19]:
def adaClassify(datToClass,classifierArr,debug=False):
    """Classify samples with a trained ensemble of decision stumps.

    Args:
        datToClass: samples-by-features data (anything ``np.asmatrix`` accepts).
        classifierArr: iterable of stump dicts with keys ``'dim'``,
            ``'thresh'``, ``'ineq'`` and ``'alpha'``.
        debug: when true, print the running aggregate estimate after each stump.

    Returns:
        An (m, 1) matrix holding the sign of the alpha-weighted sum of the
        stump predictions (0 where the weighted votes cancel exactly).
    """
    # np.asmatrix replaces the np.mat alias removed in NumPy 2.0.
    dataMatrix = np.asmatrix(datToClass)
    m = np.shape(dataMatrix)[0]
    aggClassEst = np.asmatrix(np.zeros((m,1)))
    for stump in classifierArr:
        # Each weak classifier votes +1 / -1 per sample, scaled by its alpha.
        classEst = stumpClassify(dataMatrix,stump['dim'],
                                 stump['thresh'],
                                 stump['ineq'])
        aggClassEst += stump['alpha']*classEst
        if debug: print("aggClassEst:",aggClassEst)
    return np.sign(aggClassEst)

In [5]:
def loadSimpData():
    """Return a five-sample, two-feature toy dataset with +1.0 / -1.0 labels.

    Returns:
        Tuple ``(datMat, classLabels)``: a 5x2 NumPy matrix of features and
        the matching list of five labels.
    """
    # np.asmatrix replaces the np.mat alias removed in NumPy 2.0.
    datMat = np.asmatrix([[1,2.1],[2,1.1],[1.3,1],[1,1],[2,1]])
    classLabels = [1.0,1.0,-1.0,-1.0,1.0]
    return datMat,classLabels

In [6]:
# Train on the toy dataset (at most 30 boosting rounds), report the training
# error, then classify two unseen points with the resulting ensemble.
datArr, labelArr = loadSimpData()

print("adaBoostTrainDS:=====")
classifierArr, errorRate = adaBoostTrainDS(datArr, labelArr, numIt=30)
print("errorRate:", errorRate)

print("adaClassify:=====")
adaClassify([[0, 0], [9, 0]], classifierArr)

adaBoostTrainDS:=====
D: [[0.2 0.2 0.2 0.2 0.2]]
classEst:  [[-1.  1. -1. -1.  1.]]
aggClassEst:  [[-0.69314718  0.69314718 -0.69314718 -0.69314718  0.69314718]]
aggErrors: [[1.]
 [0.]
 [0.]
 [0.]
 [0.]]
total error:  0.2
D: [[0.5   0.125 0.125 0.125 0.125]]
classEst:  [[ 1.  1. -1. -1. -1.]]
aggClassEst:  [[ 0.27980789  1.66610226 -1.66610226 -1.66610226 -0.27980789]]
aggErrors: [[0.]
 [0.]
 [0.]
 [0.]
 [1.]]
total error:  0.2
D: [[0.28571429 0.07142857 0.07142857 0.07142857 0.5       ]]
classEst:  [[1. 1. 1. 1. 1.]]
aggClassEst:  [[ 1.17568763  2.56198199 -0.77022252 -0.77022252  0.61607184]]
aggErrors: [[0.]
 [0.]
 [0.]
 [0.]
 [0.]]
total error:  0.0
errorRate: 0.0
adaClassify:=====
aggClassEst: [[-0.69314718]
 [ 0.69314718]]
aggClassEst: [[-1.66610226]
 [-0.27980789]]
aggClassEst: [[-2.56198199]
 [ 0.61607184]]


matrix([[-1.],
        [ 1.]])

In [9]:
# Sanity check: list every unordered pair drawn from three labels.
import itertools

pair_iter = itertools.combinations([1, 2, 3], 2)
for i in pair_iter:
    print(i)

(1, 2)
(1, 3)
(2, 3)


In [54]:
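# Extend AdaBoost to multi-class problems: train one binary classifier for every pair of labels, then combine all pairwise predictions and return the most frequently predicted label.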
import itertools
# 训练
def adaBoostTrainMutil(dataArr,classLabels,numIt=40,verbose=True):
    """Train one binary AdaBoost model for every unordered pair of class labels.

    For each pair ``(i, j)`` of distinct labels (in sorted order) the samples
    of class ``i`` are relabelled +1 and those of class ``j`` -1, and a
    two-class ensemble is trained on just those samples.

    Args:
        dataArr: samples-by-features training data; a NumPy array/matrix or a
            plain nested list.
        classLabels: sequence with one class label per sample.
        numIt: maximum boosting rounds handed to ``adaBoostTrainDS`` per pair.
        verbose: when true (the default, which keeps the output earlier
            versions always produced) print the label set and each pair's
            training subset.

    Returns:
        Dict mapping each ``(i, j)`` label pair to
        ``{'weakClassArr': ..., 'errorRate': ...}`` as returned by
        ``adaBoostTrainDS``.
    """
    # Converting up front lets nested lists be row-selected with a boolean mask.
    features = np.asarray(dataArr)
    label_vec = np.asarray(classLabels)
    label_dist = sorted(set(classLabels))
    if verbose: print("label_dist:",label_dist)
    adaBoost_dict = {}

    # Train an independent two-class model for each pair of labels.
    for i,j in itertools.combinations(label_dist, 2):
        data_1 = features[label_vec == i]
        data_2 = features[label_vec == j]
        data = np.vstack((data_1,data_2))   # class-i rows first, then class-j rows
        labels = [1]*data_1.shape[0] + [-1]*data_2.shape[0]
        if verbose:
            print("data:",data)
            print("lables:",labels)

        weakClassArr,errorRate = adaBoostTrainDS(data,labels,numIt)
        adaBoost_dict[(i,j)] = {'weakClassArr': weakClassArr, 'errorRate': errorRate}
    return adaBoost_dict

  
       
# Fit the pairwise models on the toy dataset and show the learned stumps.
adaBoost_dict = adaBoostTrainMutil(
    datArr,
    labelArr,
    numIt=40,
)
print(adaBoost_dict)

from collections import Counter
# 预测
def adaClassifyMutil(datToClass,adaBoost_dict,verbose=True):
    """Predict multi-class labels by majority vote over pairwise AdaBoost models.

    Each entry of ``adaBoost_dict`` is keyed by a label pair
    ``(value_1, value_2)``; a positive ensemble output votes for ``value_1``,
    anything else (including an exact tie at 0) votes for ``value_2``. Votes
    are mapped straight from the sign, so no label value can collide with an
    intermediate placeholder.

    Args:
        datToClass: samples-by-features data to classify.
        adaBoost_dict: mapping ``(value_1, value_2) -> {'weakClassArr': [...]}``
            in the layout built by ``adaBoostTrainMutil``.
        verbose: when true (the default, which keeps the output earlier
            versions always produced) print each pair, its votes and the
            combined vote table.

    Returns:
        List with the most frequently voted label for every sample; when
        several labels tie, the one voted first wins.

    Raises:
        ValueError: if ``adaBoost_dict`` contains no pairwise model.
    """
    if not adaBoost_dict:
        raise ValueError("adaBoost_dict is empty: at least one pairwise model is required")

    vote_columns = []
    for name,model in adaBoost_dict.items():
        value_1,value_2 = name
        if verbose: print("name",value_1,value_2)
        signs = np.asarray(adaClassify(datToClass,model['weakClassArr']))
        # +1 came from the first label of the pair during training, -1 from the second.
        votes = np.where(signs > 0,value_1,value_2).reshape(-1,1)
        if verbose: print("Classify",votes.T)
        vote_columns.append(votes)

    pred_labels_array = np.hstack(vote_columns)   # one column of votes per label pair
    if verbose: print(pred_labels_array)
    # Majority vote per sample (row).
    return [Counter(row.tolist()).most_common(1)[0][0] for row in pred_labels_array]
        
# Predict the toy training samples with the pairwise models and show the labels.
y_pred = adaClassifyMutil(
    datArr,
    adaBoost_dict,
)
print(y_pred)

label_dist: [-1.0, 1.0]
data: [[1.3 1. ]
 [1.  1. ]
 [1.  2.1]
 [2.  1.1]
 [2.  1. ]]
lables: [1, 1, -1, -1, -1]
{(-1.0, 1.0): {'weakClassArr': [{'dim': 0, 'thresh': 1.3, 'ineq': 'gt', 'alpha': 0.6931471805599453}, {'dim': 1, 'thresh': 1.0, 'ineq': 'gt', 'alpha': 0.9729550745276566}, {'dim': 0, 'thresh': 0.9, 'ineq': 'gt', 'alpha': 0.8958797346140276}], 'errorRate': 0.0}}
name -1.0 1.0
Classify [[ 1.  1. -1. -1.  1.]]
[[ 1.]
 [ 1.]
 [-1.]
 [-1.]
 [ 1.]]
[1.0, 1.0, -1.0, -1.0, 1.0]


In [51]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
# Load the iris dataset and hold out 25% of it for testing (fixed seed, so the
# split is reproducible).
iris_dataset = datasets.load_iris()
data_x = iris_dataset.data
data_y = iris_dataset.target
x_train, x_test, y_train, y_test = train_test_split(
    data_x,
    data_y,
    random_state=0,
    test_size=0.25,
)

In [53]:
# Train the pairwise AdaBoost models on iris and report held-out accuracy.
adaBoost_dict = adaBoostTrainMutil(x_train, y_train, numIt=50)
y_pred = adaClassifyMutil(x_test, adaBoost_dict)
print(f'Test set score:{np.mean(y_pred==y_test):.5f}')

label_dist: [0, 1, 2]
data: [[4.7 3.2 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [5.4 3.7 1.5 0.2]
 [4.8 3.1 1.6 0.2]
 [5.3 3.7 1.5 0.2]
 [4.3 3.  1.1 0.1]
 [5.4 3.4 1.7 0.2]
 [5.7 4.4 1.5 0.4]
 [4.6 3.1 1.5 0.2]
 [4.6 3.4 1.4 0.3]
 [4.8 3.  1.4 0.1]
 [5.1 3.8 1.6 0.2]
 [4.8 3.4 1.6 0.2]
 [4.5 2.3 1.3 0.3]
 [4.9 3.  1.4 0.2]
 [4.4 3.2 1.3 0.2]
 [5.  3.6 1.4 0.2]
 [5.1 3.5 1.4 0.3]
 [4.4 3.  1.3 0.2]
 [5.4 3.9 1.7 0.4]
 [5.1 3.5 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.2 3.4 1.4 0.2]
 [5.  3.2 1.2 0.2]
 [5.1 3.3 1.7 0.5]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.8 4.  1.2 0.2]
 [5.1 3.8 1.5 0.3]
 [4.7 3.2 1.6 0.2]
 [5.  3.3 1.4 0.2]
 [5.  3.  1.6 0.2]
 [5.1 3.4 1.5 0.2]
 [5.5 3.5 1.3 0.2]
 [5.1 3.7 1.5 0.4]
 [4.9 3.1 1.5 0.1]
 [4.6 3.2 1.4 0.2]
 [5.9 3.  4.2 1.5]
 [5.8 2.6 4.  1.2]
 [5.  2.  3.5 1. ]
 [5.6 2.5 3.9 1.1]
 [6.3 3.3 4.7 1.6]
 [5.5 2.4 3.8 1.1]
 [7.  3.2 4.7 1.4]
 [6.  3.4 4.5 1.6]
 [5.6 2.7 4.2 1.3]
 [5.6 2.9 3.6 1.3]
 [5.5 2.5 4.  1.3]
 [6.1 3.  4.6 1.4]
 [5.7 3.  4.2 1.2]
 [6.9 3.1 4.9 1.5]
 [5

In [62]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
# Baseline for comparison: scikit-learn's AdaBoost over depth-1 trees (stumps).
bdt = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=40,
    learning_rate=1,
)
bdt.fit(x_train, y_train)
y_pred = bdt.predict(x_test)
print(f'Test set score:{np.mean(y_pred==y_test):.5f}')

Test set score:0.97368
