In [18]:

# -*- coding: utf-8 -*-   
from pylab import *  
mpl.rcParams['font.sans-serif'] = ['SimHei'] #指定默认字体  
  
mpl.rcParams['axes.unicode_minus'] = False #解决保存图像是负号'-'显示为方块的问题


from numpy import *
import numpy as np
def loadSimpData():
    datMat = matrix([[1.,2.1],[2.,1.1],[1.3,1.],[1.,1.],[2.,1.]])
    classLabels = [1.0,1.0,-1.0,-1.0,1.0]
    return datMat,classLabels
#一般性的读取指定文件，呢能够自动检测出特征的数目
def loadDataSet(filename):
    numPeat = len(open(filename).readline().split('\t'))
    dataMat = [];labelMat = []
    fr = open(filename)
    for line in fr.readlines():
        lineArr = []
        curLine = line.strip().split('\t')
        for i in range(numPeat-1):
            lineArr.append(float(curLine[i]))
        dataMat.append(lineArr)
        labelMat.append(float(curLine[-1]))
    return dataMat,labelMat
            
#通过阈值比较对数据进行分类
def stumpClassify(dataMatrix,dimen,threshVal,threshIneq):
    retArray = ones((shape(dataMatrix)[0],1))
    if threshIneq == 'lt':
        retArray[dataMatrix[:,dimen] <= threshVal] = -1.0
    else:
        retArray[dataMatrix[:,dimen] > threshVal] = 1.0
    return retArray

#弱学习器
def buildStump(dataArr,classLabels,D):#D是权重向量
    dataMatrix = mat(dataArr);labelMat = mat(classLabels).T
    m,n = shape(dataMatrix)
    numSteps = 10.0;bestStump = {};bestClasEst = mat(zeros((m,1)))#构建bestStump空字典
    minError = inf#初始化为正无穷大
    for i in range(n):#对每一个特征
        rangeMin = dataMatrix[:,i].min();rangeMax = dataMatrix[:,i].max();#通过每一个特征中的最大最小值得到步长
        stepSize = (rangeMax - rangeMin)/numSteps
        for j in range(-1,int(numSteps)+1):#对每一个步长
            for inequal in ['lt','gt']:#在大于和小于之间切换不等式
                threshVal = (rangeMin + float(j) * stepSize)
                predictedVals = stumpClassify(dataMatrix,i,threshVal,inequal)#遍历函数所有的可能输入
                errArr = mat(ones((m,1)))
                errArr[predictedVals == labelMat] = 0
                weightedError = D.T*errArr #calc total error multiplied by D
                print("split: dim %d, thresh %.2f, thresh ineqal: %s, the weighted error is %.3f" % (i, threshVal, inequal, weightedError))
                if weightedError < minError:
                    minError = weightedError
                    bestClasEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump,minError,bestClasEst

def adaBoostTrainDS(dataArr,classLabels,numIt=40):
    weakClassArr = []
    m = shape(dataArr)[0]
    D = mat(ones((m,1))/m)
    aggClassEst = mat(zeros((m,1)))
    for i in range(numIt):
        bestStump,error,classEst = buildStump(dataArr,classLabels,D)
        print("权重向量D:",D.T)
        alpha = float(0.5*log((1.0-error)/max(error,1e-16)))#避免由于error太小导致这个数太大
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        print("第 %d 分类值classEst:" % int(i+1),classEst.T)
        expon = multiply(-1*alpha*mat(classLabels).T,classEst)#规范化因子
        D = multiply(D,exp(expon))
        D = D/D.sum()
        aggClassEst += alpha*classEst#基本线性分类组合
        print("线性分类组合",aggClassEst.T)
        aggErrors = multiply(sign(aggClassEst) != mat(classLabels).T,ones((m,1)))
        errorRate = aggErrors.sum()/m
        print("总的错误率：",errorRate,"\n")
        if errorRate == 0.0:break
    return weakClassArr,aggClassEst

def adaClassify(datToClass,classifierArr):
    dataMatrix = mat(datToClass)
    m = shape(dataMatrix)[0]
    aggClassEst = mat(zeros((m,1)))
    for i in range(len(classifierArr)):
        classEst = stumpClassify(dataMatrix,int(classifierArr[i]['dim']),classifierArr[i]['thresh'],classifierArr[i]['ineq'])
        aggClassEst += classifierArr[i]['alpha']*classEst
        print(aggClassEst)
    return sign(aggClassEst)

import matplotlib.pyplot as plt
def plotROC(predStrengths,classLabels):
    cur = (1.0,1.0)#设定右上方的点及真阳率和假阳率都很大
    ySum = 0.0
    numPosClas = sum(array(classLabels) == 1.0)
    yStep = 1/float(numPosClas)#y改变的步数
    xStep = 1/float(len(classLabels)-numPosClas)#x改变的步数
    
    sortedIndicies = predStrengths.argsort()#返回的是数组从小往大的索引值
    fig = plt.figure()
    fig.clf()#清理图的函数
    ax = plt.subplot(111)
    for index in sortedIndicies.tolist()[0]:#tolist()如果是array([1,2,3])就会转化成[1,2,3],如果是mat([1,2,3]),就转换成[[1,2,3]]
        if classLabels[index] == 1.0:#从
            delX = 0;delY = yStep;
        else:
            delX = xStep;delY = 0;
            ySum += cur[1]#加起来得到 y
        ax.plot([cur[0],cur[0]-delX],[cur[1],cur[1]-delY],c='b')
        cur = (cur[0]-delX,cur[1]-delY)#每迭代一次减少
    ax.plot([0,1],[0,1],'b--')#确定横坐标和纵坐标
    plt.xlabel('假阳率');plt.ylabel('真阳率')
    plt.title('马的ROC曲线')
    ax.axis([0,1,0,1])
    plt.show()
    print("曲线下的面积是: ",ySum*xStep)#ysum是所有竖条的高度

In [2]:
datMat,classLabels = loadSimpData()
datMat

matrix([[ 1. ,  2.1],
        [ 2. ,  1.1],
        [ 1.3,  1. ],
        [ 1. ,  1. ],
        [ 2. ,  1. ]])

In [3]:
classLabels

[1.0, 1.0, -1.0, -1.0, 1.0]

In [4]:
D = np.mat(ones((5,1))/5)
buildStump(datMat,classLabels,D)

split: dim 0, thresh 0.90, thresh ineqal: lt, the weighted error is 0.400
split: dim 0, thresh 0.90, thresh ineqal: gt, the weighted error is 0.400
split: dim 0, thresh 1.00, thresh ineqal: lt, the weighted error is 0.400
split: dim 0, thresh 1.00, thresh ineqal: gt, the weighted error is 0.400
split: dim 0, thresh 1.10, thresh ineqal: lt, the weighted error is 0.400
split: dim 0, thresh 1.10, thresh ineqal: gt, the weighted error is 0.400
split: dim 0, thresh 1.20, thresh ineqal: lt, the weighted error is 0.400
split: dim 0, thresh 1.20, thresh ineqal: gt, the weighted error is 0.400
split: dim 0, thresh 1.30, thresh ineqal: lt, the weighted error is 0.200
split: dim 0, thresh 1.30, thresh ineqal: gt, the weighted error is 0.400
split: dim 0, thresh 1.40, thresh ineqal: lt, the weighted error is 0.200
split: dim 0, thresh 1.40, thresh ineqal: gt, the weighted error is 0.400
split: dim 0, thresh 1.50, thresh ineqal: lt, the weighted error is 0.200
split: dim 0, thresh 1.50, thresh ineq

({'dim': 0, 'ineq': 'lt', 'thresh': 1.3}, matrix([[ 0.2]]), array([[-1.],
        [ 1.],
        [-1.],
        [-1.],
        [ 1.]]))

In [20]:
adaboost,aggClassEst1 = adaBoostTrainDS(datMat,classLabels,40)
adaboost

split: dim 0, thresh 0.90, thresh ineqal: lt, the weighted error is 0.400
split: dim 0, thresh 0.90, thresh ineqal: gt, the weighted error is 0.400
split: dim 0, thresh 1.00, thresh ineqal: lt, the weighted error is 0.400
split: dim 0, thresh 1.00, thresh ineqal: gt, the weighted error is 0.400
split: dim 0, thresh 1.10, thresh ineqal: lt, the weighted error is 0.400
split: dim 0, thresh 1.10, thresh ineqal: gt, the weighted error is 0.400
split: dim 0, thresh 1.20, thresh ineqal: lt, the weighted error is 0.400
split: dim 0, thresh 1.20, thresh ineqal: gt, the weighted error is 0.400
split: dim 0, thresh 1.30, thresh ineqal: lt, the weighted error is 0.200
split: dim 0, thresh 1.30, thresh ineqal: gt, the weighted error is 0.400
split: dim 0, thresh 1.40, thresh ineqal: lt, the weighted error is 0.200
split: dim 0, thresh 1.40, thresh ineqal: gt, the weighted error is 0.400
split: dim 0, thresh 1.50, thresh ineqal: lt, the weighted error is 0.200
split: dim 0, thresh 1.50, thresh ineq

[{'alpha': 0.6931471805599453, 'dim': 0, 'ineq': 'lt', 'thresh': 1.3},
 {'alpha': 0.9729550745276565, 'dim': 1, 'ineq': 'lt', 'thresh': 1.0},
 {'alpha': 0.8958797346140273,
  'dim': 0,
  'ineq': 'lt',
  'thresh': 0.90000000000000002}]

In [21]:
adaClassify([0,0],adaboost)

[[-0.69314718]]
[[-1.66610226]]
[[-2.56198199]]


matrix([[-1.]])

In [22]:
adaClassify([[5,5],[0,0]],adaboost)

[[ 0.69314718]
 [-0.69314718]]
[[ 1.66610226]
 [-1.66610226]]
[[ 2.56198199]
 [-2.56198199]]


matrix([[ 1.],
        [-1.]])

In [23]:
dataArr,labelArr = loadDataSet('E:/谢远东/机器学习/机器学习实践/机器学习实战源代码/machinelearninginaction/Ch07/horseColicTraining2.txt')

In [28]:
classiferArray,aggClassEst2 = adaBoostTrainDS(dataArr,labelArr,10)

split: dim 0, thresh 0.90, thresh ineqal: lt, the weighted error is 0.405
split: dim 0, thresh 0.90, thresh ineqal: gt, the weighted error is 0.405
split: dim 0, thresh 1.00, thresh ineqal: lt, the weighted error is 0.438
split: dim 0, thresh 1.00, thresh ineqal: gt, the weighted error is 0.405
split: dim 0, thresh 1.10, thresh ineqal: lt, the weighted error is 0.438
split: dim 0, thresh 1.10, thresh ineqal: gt, the weighted error is 0.405
split: dim 0, thresh 1.20, thresh ineqal: lt, the weighted error is 0.438
split: dim 0, thresh 1.20, thresh ineqal: gt, the weighted error is 0.405
split: dim 0, thresh 1.30, thresh ineqal: lt, the weighted error is 0.438
split: dim 0, thresh 1.30, thresh ineqal: gt, the weighted error is 0.405
split: dim 0, thresh 1.40, thresh ineqal: lt, the weighted error is 0.438
split: dim 0, thresh 1.40, thresh ineqal: gt, the weighted error is 0.405
split: dim 0, thresh 1.50, thresh ineqal: lt, the weighted error is 0.438
split: dim 0, thresh 1.50, thresh ineq

In [25]:
testArr,testLabelArr = loadDataSet('E:/谢远东/机器学习/机器学习实践/机器学习实战源代码/machinelearninginaction/Ch07/horseColicTest2.txt')

In [29]:
prediction10 = adaClassify(testArr,classiferArray)


[[ 0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [-0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [-0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [-0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [-0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [-0.27063332]
 [ 0.27063332]
 [-0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [-0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [-0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [-0.27063332]
 [-0.27063332]
 [ 0.27063332]
 [-0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [ 0.27063332]
 [ 0.27063

In [30]:
errArr = mat(ones((67,1)))
errNum = errArr[prediction10 != mat(testLabelArr).T].sum()
errNum

22.0

In [31]:
errorRate = errNum/67
errorRate

0.32835820895522388

In [32]:
dataArr1,labelArr1 = loadDataSet('E:/谢远东/机器学习/机器学习实践/机器学习实战源代码/machinelearninginaction/Ch07/horseColicTraining2.txt')
classiferArray,aggClassEst = adaBoostTrainDS(dataArr1,labelArr1,10)
plotROC(aggClassEst.T,labelArr)

split: dim 0, thresh 0.90, thresh ineqal: lt, the weighted error is 0.405
split: dim 0, thresh 0.90, thresh ineqal: gt, the weighted error is 0.405
split: dim 0, thresh 1.00, thresh ineqal: lt, the weighted error is 0.438
split: dim 0, thresh 1.00, thresh ineqal: gt, the weighted error is 0.405
split: dim 0, thresh 1.10, thresh ineqal: lt, the weighted error is 0.438
split: dim 0, thresh 1.10, thresh ineqal: gt, the weighted error is 0.405
split: dim 0, thresh 1.20, thresh ineqal: lt, the weighted error is 0.438
split: dim 0, thresh 1.20, thresh ineqal: gt, the weighted error is 0.405
split: dim 0, thresh 1.30, thresh ineqal: lt, the weighted error is 0.438
split: dim 0, thresh 1.30, thresh ineqal: gt, the weighted error is 0.405
split: dim 0, thresh 1.40, thresh ineqal: lt, the weighted error is 0.438
split: dim 0, thresh 1.40, thresh ineqal: gt, the weighted error is 0.405
split: dim 0, thresh 1.50, thresh ineqal: lt, the weighted error is 0.438
split: dim 0, thresh 1.50, thresh ineq