# AdaBoost

Реализация AdaBoost на основе признаков Хаара. На каждой итерации строится случайная маска; если ошибка классификатора с этой маской больше 0.5, маска умножается на −1. Таким образом, все слабые классификаторы работают лучше случайного угадывания (ошибка < 0.5) — условие сходимости алгоритма выполнено.
Код — в файле `adaboost.py`.

Посмотрим, как доля правильных ответов на тестовой выборке (Score) зависит от количества используемых классификаторов:

### 50 классификаторов

In [2]:
from idx2numpy import convert_from_file
from adaboost import AdaBoostMulticlass
from numpy import reshape, multiply

# Load the train and test sets from the IDX files. Every image array is cast
# to float64 and laid out as one row of 784 values per sample.
train_images = convert_from_file("train-images.idx3-ubyte").astype('float64').reshape((60000, 784))
train_labels = convert_from_file("train-labels.idx1-ubyte")
test_images = convert_from_file("t10k-images.idx3-ubyte").astype('float64').reshape((10000, 784))
test_labels = convert_from_file("t10k-labels.idx1-ubyte")

# Scale all pixel values by the same factor of 1/255.
pixel_scale = 1 / 255
train_images = multiply(train_images, pixel_scale)
test_images = multiply(test_images, pixel_scale)

# Fit the ensemble; 50 is the number of classifiers per the section heading.
ada = AdaBoostMulticlass(50)
ada.fit(train_images, train_labels)

# Count the test samples whose predicted label equals the true label.
# test_images has exactly 10000 rows after the reshape above.
score = sum(
    1
    for idx, sample in enumerate(test_images)
    if ada.predict(sample) == test_labels[idx]
)

# Report the share of correct answers over the 10000 test samples.
print("Score is %s" % (score / 10000))

Score is 0.2639


### 500 классификаторов

In [3]:
from idx2numpy import convert_from_file
from adaboost import AdaBoostMulticlass
from numpy import reshape, multiply

# Read the image arrays, cast them to float64 and flatten each sample into a
# row of 784 values; the label arrays are used as loaded.
train_pixels = convert_from_file("train-images.idx3-ubyte").astype('float64')
train_images = reshape(train_pixels, (60000, 784))
train_labels = convert_from_file("train-labels.idx1-ubyte")
test_pixels = convert_from_file("t10k-images.idx3-ubyte").astype('float64')
test_images = reshape(test_pixels, (10000, 784))
test_labels = convert_from_file("t10k-labels.idx1-ubyte")

# Multiply every pixel value by 1/255.
train_images = multiply(train_images, 1 / 255)
test_images = multiply(test_images, 1 / 255)

# Fit the ensemble; 500 is the number of classifiers per the section heading.
ada = AdaBoostMulticlass(500)
ada.fit(train_images, train_labels)

# Collect the indices of correctly classified test samples, one predict()
# call per sample, and use their count as the score.
correct_indices = [
    idx
    for idx in range(10000)
    if ada.predict(test_images[idx]) == test_labels[idx]
]
score = len(correct_indices)

# Report the share of correct answers over the 10000 test samples.
print("Score is %s" % (score / 10000))

Score is 0.3968


### 10000 классификаторов

In [4]:
from idx2numpy import convert_from_file
from adaboost import AdaBoostMulticlass
from numpy import reshape, multiply
from time import time

# Load the train and test sets from the IDX files. Every image array is cast
# to float64 and laid out as one row of 784 values per sample.
train_images = convert_from_file("train-images.idx3-ubyte").astype('float64').reshape((60000, 784))
train_labels = convert_from_file("train-labels.idx1-ubyte")
test_images = convert_from_file("t10k-images.idx3-ubyte").astype('float64').reshape((10000, 784))
test_labels = convert_from_file("t10k-labels.idx1-ubyte")

# Scale all pixel values by the same factor of 1/255.
pixel_scale = 1 / 255
train_images = multiply(train_images, pixel_scale)
test_images = multiply(test_images, pixel_scale)

# The timer covers fitting, evaluation and the score printout; data loading
# above is deliberately outside the measured interval.
start = time()

# Fit the ensemble; 10000 is the number of classifiers per the section heading.
ada = AdaBoostMulticlass(10000)
ada.fit(train_images, train_labels)

# Count the test samples whose predicted label equals the true label.
score = sum(
    1
    for idx in range(10000)
    if ada.predict(test_images[idx]) == test_labels[idx]
)

print("Score is %s" % (score / 10000))

# Break the elapsed seconds into hours / minutes / seconds. The values stay
# floats, so the hours and minutes print with a trailing ".0".
finish = time() - start
h, within_hour = divmod(finish, 3600)
m = within_hour // 60
s = finish - 3600 * h - 60 * m
print("Hardcore version with 10k classifiers took %s h. %s m. %s s." % (h, m, s))

Score is 0.4867
Hardcore version with 10k classifiers took 10.0 h. 50.0 m. 12.542388677597046 s.
