In [26]:
import numpy as np

def loadSimpleData():
	"""Return the small toy dataset used throughout this notebook.

	Returns:
		(dataMat, labels): ``dataMat`` is a 5x2 ``numpy.matrix`` holding five
		two-feature samples; ``labels`` is a plain list with the +1.0 / -1.0
		class label of each sample, in row order.
	"""
	# np.asmatrix is used instead of the np.mat alias, which was removed
	# in NumPy 2.0; both build the same numpy.matrix object.
	dataMat = np.asmatrix([
		[1.0, 2.1],
		[2.0, 1.1],
		[1.3, 1.0],
		[1.0, 1.0],
		[2.0, 1.0],
	])
	labels = [1.0, 1.0, -1.0, -1.0, 1.0]
	return dataMat, labels

# Decision stump: a single-level decision tree that classifies by thresholding one feature
def stumpClassify(dataMat, dimen, threshVal, threshIneq):
	"""Classify every sample by thresholding a single feature.

	Args:
		dataMat: 2-D data (numpy matrix, ndarray or nested list), one sample
			per row.
		dimen: column index of the feature to threshold.
		threshVal: threshold value.
		threshIneq: ``'lt'`` labels samples with ``feature <= threshVal`` as
			-1.0; any other value (conventionally ``'gt'``) labels samples
			with ``feature > threshVal`` as -1.0.

	Returns:
		An (m, 1) ndarray of +1.0 / -1.0 predictions.
	"""
	# Work on a plain 1-D column so the masks below behave the same for
	# matrix and ndarray inputs.
	column = np.asarray(dataMat)[:, dimen]
	result = np.ones((column.shape[0], 1))
	if threshIneq == 'lt':
		result[column <= threshVal, 0] = -1.0
	else:
		# The original assigned 1.0 here, which is a no-op on an array of
		# ones, so every 'gt' stump predicted +1 for all samples.
		result[column > threshVal, 0] = -1.0
	return result

# Arguments: the data array, the class labels, and the sample-weight vector D
def buildStump(dataArr, labels, D):
	"""Find the decision stump with the lowest weighted error.

	Every feature is scanned with ``numSteps`` evenly spaced thresholds
	(plus one step below the feature's minimum) and both inequality
	directions; the first candidate reaching the minimum weighted error wins.

	Args:
		dataArr: 2-D data, one sample per row.
		labels: sequence of +1.0 / -1.0 class labels, one per sample.
		D: (m, 1) column of sample weights; the weighted error is
			``D.T * errArr``.

	Returns:
		(bestStump, minError, bestClassEst): ``bestStump`` is a dict with
		keys ``'dim'``, ``'thresh'`` and ``'ineq'``; ``minError`` is the
		weighted error of that stump (whatever ``D.T * errArr`` yields, a
		1x1 matrix for matrix ``D``); ``bestClassEst`` holds the stump's
		(m, 1) predictions. If the data has no feature columns the initial
		values ``({}, np.inf, zeros)`` are returned.
	"""
	# np.asmatrix replaces the np.mat alias removed in NumPy 2.0.
	dataMat = np.asmatrix(dataArr)
	labelMat = np.asmatrix(labels).T
	m, n = np.shape(dataMat)

	numSteps = 10
	bestStump = {}
	bestClassEst = np.asmatrix(np.zeros((m, 1)))
	minError = np.inf

	# iterate over features
	for i in range(n):
		rangeMin = dataMat[:, i].min()
		rangeMax = dataMat[:, i].max()
		stepSize = (rangeMax - rangeMin) / numSteps

		# start at -1 so that one threshold lies below the feature's minimum
		for j in range(-1, int(numSteps) + 1):
			# the threshold does not depend on the inequality direction
			threshVal = rangeMin + float(j) * stepSize
			for inequal in ['lt', 'gt']:
				predictedVals = \
					stumpClassify(dataMat, i, threshVal, inequal)

				# 1 where the prediction is wrong, 0 where it is right
				errArr = np.asmatrix(np.ones((m, 1)))
				errArr[predictedVals == labelMat] = 0
				weightedErr = D.T * errArr

				# strict '<' keeps the earliest candidate on ties
				if weightedErr < minError:
					minError = weightedErr
					bestClassEst = predictedVals.copy()
					bestStump['dim'] = i
					bestStump['thresh'] = threshVal
					bestStump['ineq'] = inequal

	return bestStump, minError, bestClassEst

# Demo: find the best single stump on the toy data with uniform sample weights.
dataMat, labels = loadSimpleData()
# One equal weight per sample; np.asmatrix replaces the np.mat alias that
# was removed in NumPy 2.0.
D = np.asmatrix(np.ones((len(labels), 1)) / len(labels))
buildStump(dataMat, labels, D)

split: dim 0, thresh 0.90, thresh inequal: lt, the weighted error is 0.400
[[ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]]
split: dim 0, thresh 0.90, thresh inequal: gt, the weighted error is 0.400
[[ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]]
split: dim 0, thresh 1.00, thresh inequal: lt, the weighted error is 0.400
[[-1.]
 [ 1.]
 [ 1.]
 [-1.]
 [ 1.]]
split: dim 0, thresh 1.00, thresh inequal: gt, the weighted error is 0.400
[[ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]]
split: dim 0, thresh 1.10, thresh inequal: lt, the weighted error is 0.400
[[-1.]
 [ 1.]
 [ 1.]
 [-1.]
 [ 1.]]
split: dim 0, thresh 1.10, thresh inequal: gt, the weighted error is 0.400
[[ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]]
split: dim 0, thresh 1.20, thresh inequal: lt, the weighted error is 0.400
[[-1.]
 [ 1.]
 [ 1.]
 [-1.]
 [ 1.]]
split: dim 0, thresh 1.20, thresh inequal: gt, the weighted error is 0.400
[[ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]]
split: dim 0, thresh 1.30, thresh inequal: lt, the weighted error is 0.200
[[-1.]
 [ 1.]
 [-1.]
 [-1.]
 [ 1.]]
s

({'dim': 0, 'ineq': 'lt', 'thresh': 1.3}, matrix([[ 0.2]]), array([[-1.],
        [ 1.],
        [-1.],
        [-1.],
        [ 1.]]))

In [24]:
def adaBoostTrainDS(dataArr, labels, numIt = 40, verbose = True):
	"""Train an AdaBoost ensemble of decision stumps.

	Args:
		dataArr: 2-D data, one sample per row.
		labels: sequence of +1.0 / -1.0 class labels, one per sample.
		numIt: maximum number of boosting rounds.
		verbose: when true (the default), print the weights, the stump
			predictions, the aggregated estimate and the training error of
			every round.

	Returns:
		A list of stump dicts as produced by ``buildStump``, each extended
		with an ``'alpha'`` entry (the stump's vote weight). Training stops
		early once the ensemble's training error reaches 0.
	"""
	weakClassifiers = []

	m = np.shape(dataArr)[0]
	# Build the label column once; np.asmatrix replaces the np.mat alias
	# that was removed in NumPy 2.0.
	labelMat = np.asmatrix(labels).T
	D = np.asmatrix(np.ones((m, 1)) / m)

	# aggregated (alpha-weighted) class estimate for each point
	aggClassEst = np.asmatrix(np.zeros((m, 1)))

	for i in range(numIt):
		bestStump, error, classEst = buildStump(dataArr, labels, D)
		if verbose:
			print('D:  %s' % (D.T,))
			print('classEst:  %s' % (classEst.T,))

		# buildStump may hand back a 1x1 matrix; reduce it to a plain float
		err = float(np.asarray(error).item())

		# alpha is the weight of the current weak classifier
		# eg. error = 0.2, alpha = 1/2 * ln4
		# max(..., 1e-16) guards against division by zero for a perfect stump
		alpha = float(0.5 * np.log((1.0 - err) / max(err, 1e-16)))
		bestStump['alpha'] = alpha
		weakClassifiers.append(bestStump)

		# label and classEst are both +1 or -1, so their product is +1 for
		# a correct prediction and -1 for a wrong one:
		# correct -> exponent -alpha, wrong -> exponent +alpha
		expon = np.multiply(-1 * alpha * labelMat, classEst)

		# eg. old D1 = 0.2, right class, exp(-1/2 * ln4) = 0.5, new D1 = 0.1
		# eg. old D1 = 0.2, wrong class, exp(1/2 * ln4) = 2, new D1 = 0.4
		D = np.multiply(D, np.exp(expon))

		# renormalise so the weights sum to 1
		# eg. 0.4 / (0.4 + 0.1 + 0.1 + 0.1 + 0.1) = 0.5
		D = D / D.sum()

		aggClassEst += alpha * classEst
		if verbose:
			print('aggClassEst:  %s' % (aggClassEst.T,))

		# np.sign returns -1 if x < 0 and 1 if x > 0; count the mismatches
		numWrong = (np.sign(aggClassEst) != labelMat).sum()
		errorRate = float(numWrong) / m
		if verbose:
			print('total error:  %s' % (errorRate,))
		if errorRate == 0.0:
			break
	return weakClassifiers

# Demo: train the boosted ensemble on the toy data and show the learned stumps.
classifiers = adaBoostTrainDS(dataMat, labels)
# Single-argument print(...) is valid on both Python 2 and Python 3.
print(classifiers)

D:  [[ 0.2  0.2  0.2  0.2  0.2]]
classEst:  [[-1.  1. -1. -1.  1.]]
aggClassEst:  [[-0.69314718  0.69314718 -0.69314718 -0.69314718  0.69314718]]
total error:  0.2
D:  [[ 0.5    0.125  0.125  0.125  0.125]]
classEst:  [[ 1.  1. -1. -1. -1.]]
aggClassEst:  [[ 0.27980789  1.66610226 -1.66610226 -1.66610226 -0.27980789]]
total error:  0.2
D:  [[ 0.28571429  0.07142857  0.07142857  0.07142857  0.5       ]]
classEst:  [[ 1.  1.  1.  1.  1.]]
aggClassEst:  [[ 1.17568763  2.56198199 -0.77022252 -0.77022252  0.61607184]]
total error:  0.0
[{'dim': 0, 'ineq': 'lt', 'thresh': 1.3, 'alpha': 0.6931471805599453}, {'dim': 1, 'ineq': 'lt', 'thresh': 1.0, 'alpha': 0.9729550745276565}, {'dim': 0, 'ineq': 'lt', 'thresh': 0.90000000000000002, 'alpha': 0.8958797346140273}]


In [25]:
def adaClassify(dataToClass, classifiers, verbose = True):
	"""Classify samples with a trained ensemble of decision stumps.

	Args:
		dataToClass: 2-D data, one sample per row.
		classifiers: list of stump dicts with keys ``'dim'``, ``'thresh'``,
			``'ineq'`` and ``'alpha'``.
		verbose: when true (the default), print the running aggregated
			estimate after each stump is applied.

	Returns:
		An (m, 1) matrix holding the sign of the alpha-weighted sum of the
		stump predictions (0 where the sum is exactly 0, e.g. for an empty
		classifier list).
	"""
	# np.asmatrix replaces the np.mat alias removed in NumPy 2.0.
	dataMat = np.asmatrix(dataToClass)
	m = np.shape(dataMat)[0]
	aggClassEst = np.asmatrix(np.zeros((m, 1)))
	for stump in classifiers:
		classEst = stumpClassify(dataMat, stump['dim'], \
			stump['thresh'], stump['ineq'])
		aggClassEst += stump['alpha'] * classEst
		if verbose:
			print(aggClassEst)
	return np.sign(aggClassEst)

# Demo: run the trained ensemble on the toy data; the cell's value is the
# matrix of predicted signs returned by adaClassify.
adaClassify(dataMat, classifiers)

[[-0.69314718]
 [ 0.69314718]
 [-0.69314718]
 [-0.69314718]
 [ 0.69314718]]
[[ 0.27980789]
 [ 1.66610226]
 [-1.66610226]
 [-1.66610226]
 [-0.27980789]]
[[ 1.17568763]
 [ 2.56198199]
 [-0.77022252]
 [-0.77022252]
 [ 0.61607184]]


matrix([[ 1.],
        [ 1.],
        [-1.],
        [-1.],
        [ 1.]])