In [294]:
from numpy import *

def loadData(name):
    data= []
    y = []
    f = open(name)
    for x in f.readlines():
        x = x.strip().split('\t')
        data.append(map(float, x[:-1]))
        y.append(float(x[-1]))
    return array(data), array(y)

def sc(x, th, op):
    result = ones(x.shape[0])
    if op == 'lt':
        result[x <= th] = -1.0
    else:
        result[x > th] = -1.0
    return result

def bs(X, y, D):
    m, n = X.shape
    numThresh = 10
    bestStump = {}
    best_y = zeros(X.shape[0])
    minErr = inf
    for k, x in enumerate(X.T):
        x_min, x_max = x.min() - 1 , x.max() + 1
        thresh = linspace(x_min, x_max, numThresh)
        for th in thresh:
            for op in ['lt', 'gt']:
                y_bar = sc(x, th, op)
                yErr = ones(m)
                yErr[ y_bar == y ] = 0
                DErr = D.dot(yErr)
                if DErr < minErr:
                    bestStump['dim'] = k
                    bestStump['thresh'] = th
                    bestStump['operator'] = op
                    best_y = y_bar
                    minErr = DErr               
    return bestStump, best_y, minErr

def train(X, y, loops):
    weakClassArr = []
    m = X.shape[0]
    D = ones(m) / m
    agg = zeros(m)
    for k in xrange(loops):
        bestStump, y_bar, error = bs(X, y, D)
        alpha = float(0.5 * log((1 - error) / max(error, 1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        ea = -1.0 * alpha * multiply(y, y_bar)
        D = multiply(D, exp(ea)) / sum(D)
        agg += alpha * y_bar
        aggErr = 1.0 * sum( sign(agg) != y ) / y.size
        print aggErr
        if aggErr == 0.0 :
            break
    return weakClassArr

def predict(X, classArr):
    m = X.shape[0]
    aggClassEst = zeros(m)
    for c in classArr:
        y_bar = sc(X, c['dim'], c['thresh'], c['operator'])
        aggClassEst += c['alpha'] * y_bar
    print aggClassEst
    return sign(aggClassEst)

In [295]:
def plotROC(predStrengths, classLabels):
    import matplotlib.pyplot as plt
    cur = (1.0,1.0) #cursor
    ySum = 0.0 #variable to calculate AUC
    numPosClas = sum(array(classLabels)==1.0)
    yStep = 1/float(numPosClas); xStep = 1/float(len(classLabels)-numPosClas)
    sortedIndicies = predStrengths.argsort()#get sorted index, it's reverse
    fig = plt.figure()
    fig.clf()
    ax = plt.subplot(111)
    #loop through all the values, drawing a line segment at each point
    for index in sortedIndicies.tolist()[0]:
        if classLabels[index] == 1.0:
            delX = 0; delY = yStep;
        else:
            delX = xStep; delY = 0;
            ySum += cur[1]
        #draw line from cur to (cur[0]-delX,cur[1]-delY)
        ax.plot([cur[0],cur[0]-delX],[cur[1],cur[1]-delY], c='b')
        cur = (cur[0]-delX,cur[1]-delY)
    ax.plot([0,1],[0,1],'b--')
    plt.xlabel('False positive rate'); plt.ylabel('True positive rate')
    plt.title('ROC curve for AdaBoost horse colic detection system')
    ax.axis([0,1,0,1])
    plt.show()
    print "the Area Under the Curve is: ",ySum*xStep

In [296]:
X, y = loadData('horseColicTraining2.txt')
train(X, y, 10)

0.284280936455
0.284280936455
0.224080267559
0.224080267559
0.234113712375
0.234113712375
0.227424749164
0.23745819398
0.234113712375
0.220735785953


[{'alpha': 0.46166237926576786,
  'dim': 17,
  'operator': 'gt',
  'thresh': 50.333333333333329},
 {'alpha': 0.41434684510520126,
  'dim': 9,
  'operator': 'gt',
  'thresh': 3.666666666666667},
 {'alpha': 0.3075279464574483,
  'dim': 18,
  'operator': 'lt',
  'thresh': 49.555555555555557},
 {'alpha': 0.2292189670356418, 'dim': 3, 'operator': 'gt', 'thresh': 61.0},
 {'alpha': 0.18116903387556668,
  'dim': 11,
  'operator': 'lt',
  'thresh': 0.33333333333333326},
 {'alpha': 0.1699585184196405,
  'dim': 4,
  'operator': 'lt',
  'thresh': 53.444444444444443},
 {'alpha': 0.1697802921220716,
  'dim': 5,
  'operator': 'gt',
  'thresh': 2.333333333333333},
 {'alpha': 0.17128699870707895,
  'dim': 5,
  'operator': 'lt',
  'thresh': 0.33333333333333326},
 {'alpha': 0.13228770578244511,
  'dim': 9,
  'operator': 'lt',
  'thresh': 4.4444444444444446},
 {'alpha': 0.16845030993198298,
  'dim': 11,
  'operator': 'gt',
  'thresh': 2.333333333333333}]