# AdaBoost

In [1]:
import numpy as np
import pandas as pd

## Load data

In [2]:
# Toy training set: two features (x1, x2) and a +1/-1 class label per row.
df = pd.DataFrame({
    'x1': [1., 2., 1.3, 1., 2.],
    'x2': [2.1, 1.1, 1., 1., 1.],
    'Labels': [1.0, 1.0, -1.0, -1.0, 1.0],
})

In [7]:
# Feature matrix as a plain 2-D numpy array (columns: x1, x2).
datMat = df[['x1', 'x2']].values

In [93]:
# Display the toy dataset.
df

Unnamed: 0,Labels,x1,x2
0,1.0,1.0,2.1
1,1.0,2.0,1.1
2,-1.0,1.3,1.0
3,-1.0,1.0,1.0
4,1.0,2.0,1.0


In [12]:
# Class labels as a 1-D numpy array.
labels = df['Labels'].values

In [105]:
def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    """
    Classify every row of dataMatrix with a single-feature threshold test.

    Parameters
    ----------
    dataMatrix : 2-D numpy array, one row per sample.
    dimen : index of the column to compare against the threshold.
    threshVal : threshold value.
    threshIneq : 'lt' marks rows with value <= threshVal as -1.0;
        any other value marks rows with value > threshVal as -1.0.

    Returns
    -------
    1-D float array with one entry per row: -1.0 for marked rows, 1.0 otherwise.
    """
    feature = dataMatrix[:, dimen]
    # Rows selected by this mask get the negative class.
    negative = (feature <= threshVal) if threshIneq == 'lt' else (feature > threshVal)

    votes = np.ones((np.shape(dataMatrix)[0], ))
    votes[negative] = -1.0
    return votes

In [160]:
def buildStump(dataMatrix, labels, D, numSteps=10.0):
    '''
    Find the decision stump with the lowest weighted error.

    Each feature column is scanned with thresholds that start one step below
    the column minimum and end at the column maximum; both inequality
    directions ('lt' and 'gt') are tried at every threshold.  On ties the
    first stump found is kept.

    Parameters
    ----------
    dataMatrix : 2-D numpy array, one row per sample, one column per feature.
    labels : 1-D array of targets, compared for equality with the
        +1.0 / -1.0 predictions returned by stumpClassify.
    D : 1-D numpy array of per-sample weights (same length as labels).
    numSteps : number of threshold steps across each feature's range
        (defaults to 10.0, the previously hard-coded value).

    Returns
    -------
    bestStump : dict with keys 'dim', 'thresh' and 'ineq'
        (empty if dataMatrix has no columns).
    minError : weighted error of bestStump (np.inf if no stump was tried).
    bestClassEst : predictions of bestStump on dataMatrix
        (all zeros if no stump was tried).

    Raises
    ------
    ValueError : if numSteps is not positive.
    '''
    if numSteps <= 0:
        raise ValueError("numSteps must be positive")

    m, n = np.shape(dataMatrix)
    bestStump = {}
    bestClassEst = np.zeros((m, ))
    minError = np.inf

    for i in range(n):
        rangeMin = dataMatrix[:, i].min()
        rangeMax = dataMatrix[:, i].max()
        # float() keeps this a true division even for integer inputs.
        stepSize = (rangeMax - rangeMin) / float(numSteps)

        # j == -1 places the threshold below the column minimum.
        for j in range(-1, int(numSteps) + 1):
            # The threshold does not depend on the inequality direction.
            threshVal = rangeMin + float(j) * stepSize

            for inequal in ['lt', 'gt']:
                predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)
                # 1.0 where the stump is wrong, 0.0 where it is right.
                errArr = (predictedVals != labels).astype(float)
                weightedError = D.dot(errArr)  # scalar

                if weightedError < minError:
                    minError = weightedError
                    bestClassEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClassEst

In [107]:
# Uniform initial sample weights, one per row of datMat (sums to 1).
D = np.ones((datMat.shape[0], )) / datMat.shape[0]

In [108]:
# Best single stump on the toy data under the weights D.
buildStump(dataMatrix=datMat, labels=labels, D=D)

split: dim 0, thresh 0.90, thresh ineqal: lt,                the weighted error is 0.400
split: dim 0, thresh 0.90, thresh ineqal: gt,                the weighted error is 0.600
split: dim 0, thresh 1.00, thresh ineqal: lt,                the weighted error is 0.400
split: dim 0, thresh 1.00, thresh ineqal: gt,                the weighted error is 0.600
split: dim 0, thresh 1.10, thresh ineqal: lt,                the weighted error is 0.400
split: dim 0, thresh 1.10, thresh ineqal: gt,                the weighted error is 0.600
split: dim 0, thresh 1.20, thresh ineqal: lt,                the weighted error is 0.400
split: dim 0, thresh 1.20, thresh ineqal: gt,                the weighted error is 0.600
split: dim 0, thresh 1.30, thresh ineqal: lt,                the weighted error is 0.200
split: dim 0, thresh 1.30, thresh ineqal: gt,                the weighted error is 0.800
split: dim 0, thresh 1.40, thresh ineqal: lt,                the weighted error is 0.200
split: dim 0, thresh 

({'dim': 0, 'ineq': 'lt', 'thresh': 1.3},
 0.20000000000000001,
 array([-1.,  1., -1., -1.,  1.]))

In [162]:
def adaBoostingTrainDS(datMat, labels, numIt=40):
    """
    Train an AdaBoost ensemble of decision stumps.

    Parameters
    ----------
    datMat : 2-D numpy array of training samples (passed to buildStump).
    labels : 1-D numpy array of training labels; the weight update below
        multiplies labels by the stump predictions, so +1/-1 labels are expected.
    numIt : maximum number of boosting rounds.

    Returns
    -------
    List of stump dicts as returned by buildStump, each extended with an
    'alpha' entry holding that stump's vote weight.

    Notes
    -----
    Prints the training error of the aggregated classifier after every round
    and stops early once that error reaches 0.
    """
    weakClassArr = []
    m = np.shape(datMat)[0]
    D = np.ones((m, )) / m          # start from uniform sample weights
    aggClassEst = np.zeros((m, ))   # running weighted vote per sample
    for _ in range(numIt):
        bestStump, error, classEst = buildStump(datMat, labels, D)
        # max(..., 1e-16) avoids a division by zero when the stump is perfect.
        alpha = float(0.5 * np.log((1.0 - error) / max(error, 1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)

        # labels * classEst is +1 where the stump is right and -1 where it is
        # wrong (for +1/-1 labels), so wrong samples gain weight.
        expon = -1 * alpha * labels * classEst
        D = D * np.exp(expon)
        D = D / D.sum()

        aggClassEst += alpha * classEst
        errorRate = np.mean(np.sign(aggClassEst) != labels)
        # Single pre-formatted string: identical output on Python 2 and 3.
        print("total error: %s \n" % errorRate)
        if errorRate == 0.0:
            break
    return weakClassArr

In [110]:
# Boost on the toy data for at most 40 rounds.
classifierArray = adaBoostingTrainDS(datMat, labels, numIt=40)

split: dim 0, thresh 0.90, thresh ineqal: lt,                the weighted error is 0.400
split: dim 0, thresh 0.90, thresh ineqal: gt,                the weighted error is 0.600
split: dim 0, thresh 1.00, thresh ineqal: lt,                the weighted error is 0.400
split: dim 0, thresh 1.00, thresh ineqal: gt,                the weighted error is 0.600
split: dim 0, thresh 1.10, thresh ineqal: lt,                the weighted error is 0.400
split: dim 0, thresh 1.10, thresh ineqal: gt,                the weighted error is 0.600
split: dim 0, thresh 1.20, thresh ineqal: lt,                the weighted error is 0.400
split: dim 0, thresh 1.20, thresh ineqal: gt,                the weighted error is 0.600
split: dim 0, thresh 1.30, thresh ineqal: lt,                the weighted error is 0.200
split: dim 0, thresh 1.30, thresh ineqal: gt,                the weighted error is 0.800
split: dim 0, thresh 1.40, thresh ineqal: lt,                the weighted error is 0.200
split: dim 0, thresh 

In [111]:
# Inspect the trained weak classifiers (one dict per boosting round).
classifierArray

[{'alpha': 0.6931471805599453, 'dim': 0, 'ineq': 'lt', 'thresh': 1.3},
 {'alpha': 0.9729550745276565, 'dim': 1, 'ineq': 'lt', 'thresh': 1.0},
 {'alpha': 0.8958797346140273,
  'dim': 0,
  'ineq': 'lt',
  'thresh': 0.90000000000000002}]

## Test

In [126]:
def adaClassify(dataMatrix, classifierArr):
    """
    Classify samples with a trained list of weighted decision stumps.

    Parameters
    ----------
    dataMatrix : 2-D numpy array, one row per sample to classify.
    classifierArr : sequence of stump dicts with keys 'dim', 'thresh',
        'ineq' and 'alpha' (the structure built by adaBoostingTrainDS).

    Returns
    -------
    1-D array holding the sign of the alpha-weighted vote for each sample.

    Notes
    -----
    Prints the running aggregate vote after each stump is applied.
    """
    m = np.shape(dataMatrix)[0]
    aggClassEst = np.zeros((m, ))
    # Iterate over the classifiers that were passed in, not a notebook-level global.
    for classifier in classifierArr:
        classEst = stumpClassify(dataMatrix, classifier['dim'],
                                 classifier['thresh'], classifier['ineq'])
        aggClassEst += classifier['alpha'] * classEst
        print(aggClassEst)
    return np.sign(aggClassEst)

In [131]:
# A single test sample at the origin, shaped (1, 2) like one row of the training matrix.
test_datMat = np.zeros((1, 2))

In [132]:
# Classify the test sample with the boosted stumps.
adaClassify(dataMatrix=test_datMat, classifierArr=classifierArray)

[-0.69314718]
[-1.66610226]
[-2.56198199]


array([-1.])

# Horse colic dataset using AdaBoost

In [184]:
# Horse colic training set: tab-separated file, columns labelled 0..21.
df_horse = pd.read_csv(
    'Horse_colic_dataset/horseColicTraining2.txt',
    sep='\t',
    names=np.arange(22),
)

In [185]:
# Features are every column except the last one, which holds the label.
datMat_horse = df_horse.values[:, :-1]

In [186]:
# The last column of the training frame is used as the class label.
labels_horse = df_horse.values[:, -1]

In [188]:
# Boost on the horse colic training data for at most 10 rounds.
# NOTE: this rebinds classifierArray, replacing the toy-data classifiers.
classifierArray = adaBoostingTrainDS(datMat_horse, labels_horse, numIt=10)

total error: 0.284280936455 

total error: 0.284280936455 

total error: 0.247491638796 

total error: 0.247491638796 

total error: 0.254180602007 

total error: 0.240802675585 

total error: 0.240802675585 

total error: 0.220735785953 

total error: 0.247491638796 

total error: 0.230769230769 



In [189]:
# Horse colic test set: tab-separated file, columns labelled 0..21.
df_test = pd.read_csv(
    'Horse_colic_dataset/horseColicTest2.txt',
    sep='\t',
    names=np.arange(22),
)

In [191]:
# Features are every column except the last one, which holds the label.
datMat_test = df_test.values[:, :-1]

In [192]:
# The last column of the test frame is used as the class label.
labels_test = df_test.values[:, -1]

In [193]:
# Predict the test-set labels with the boosted stumps.
prediction = adaClassify(dataMatrix=datMat_test, classifierArr=classifierArray)

[ 0.46166238  0.46166238 -0.46166238 -0.46166238  0.46166238  0.46166238
  0.46166238  0.46166238  0.46166238  0.46166238  0.46166238 -0.46166238
 -0.46166238  0.46166238  0.46166238  0.46166238  0.46166238 -0.46166238
 -0.46166238 -0.46166238 -0.46166238  0.46166238 -0.46166238 -0.46166238
  0.46166238  0.46166238  0.46166238  0.46166238  0.46166238  0.46166238
  0.46166238  0.46166238 -0.46166238  0.46166238  0.46166238  0.46166238
  0.46166238  0.46166238  0.46166238  0.46166238  0.46166238  0.46166238
  0.46166238  0.46166238  0.46166238  0.46166238 -0.46166238  0.46166238
 -0.46166238  0.46166238  0.46166238  0.46166238  0.46166238  0.46166238
  0.46166238  0.46166238  0.46166238 -0.46166238  0.46166238 -0.46166238
  0.46166238  0.46166238 -0.46166238  0.46166238  0.46166238  0.46166238
  0.46166238]
[ 0.77414483  0.77414483 -0.14917993 -0.14917993  0.77414483  0.77414483
  0.14917993  0.77414483  0.77414483  0.14917993  0.14917993 -0.14917993
 -0.14917993  0.77414483  0.77414483 

In [194]:
# One indicator slot per test sample; sized from the data instead of a literal 67.
errArr = np.ones((len(labels_test), ))

In [196]:
# Test error rate: number of misclassified samples divided by the number of samples.
errArr[prediction != labels_test].sum() / len(errArr)

0.23880597014925373