In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
import numpy as np
from PIL import Image
import glob
import os
from sklearn.model_selection import train_test_split
from math import log, exp
from time import time

In [2]:
def predict(alphas, predictors, X, K):
    alphas = alphas.reshape(alphas.shape[0], 1)
    predictions = np.transpose(np.array([p.predict(X) for p in predictors]))
    results = []
    for p in predictions:
        results.append([np.sum(alphas * np.where(p == i, 1, 0)) for i in range(1,K+1)])
    results = [np.argmax(r) + 1 for r in results]
#     for p in predictions:
#         optimal = 0
#         classifier = 0
#         for i in range(1,K+1):
#             guess = np.sum(alphas * np.where(p == i, 1, 0))
#             if guess > optimal:
#                 optimal = guess
#                 classifier = i
#         results.append(classifier)
    return np.array(results)

In [3]:
def accuracy(y_hat, y_true):
    ones = np.where(y_hat == y_true, 1, 0)
    solid = np.ones((1,len(y_hat)))
    return ((ones @ ones.T) / (solid @ solid.T))[0][0]

In [4]:
def Boost(X, Y, wl, M, verbose=False):
    start = time()
    n = X.shape[0]
    K = max(Y) + 1
    w = np.array([1/n for _ in range(n)])
    errs = []
    Ts = []
    alphas = []
    X = np.array(X)
    Y = np.array(Y)
    indices = [i for i in range(n)]
    for m in range(M):
        T = None
        if wl == "NN":
            T = MLPClassifier()
        elif wl == "TREE":
            T = DecisionTreeClassifier(max_depth=1, criterion="gini")
        T.fit(X, Y, sample_weight=w)
        Ts.append(T)
        y_hat = T.predict(X)
        misclassified = np.where(Y != y_hat, 1, 0)
        validation = np.where(Y == y_hat, 1, 0)
        err = np.sum(w * misclassified)/np.sum(w)
        if err == 0:
            Ts.pop()
            break
        errs.append(err)
        alpha = np.log((1-err)/err) + np.log(K-1)
        alphas.append(alpha)
        if verbose and m % verbose == 0:
            print(f"After {time()-start:.2f} seconds, iteration {m} has error {err:%}\nalpha {alpha}\naccuracy {accuracy(predict(np.array(alphas), Ts, X, K), Y):%}")
        w *= np.exp(alpha * misclassified)
        
    return np.array(alphas)/sum(alphas), Ts

In [5]:
IMAGE_DIR = "./data/data/data"

def load():
    file_list = glob.glob(IMAGE_DIR + "/*.jpg")
    X = []
    Y = []

    for fname in file_list:
        with Image.open(fname) as img:
            np_img = np.array(img).flatten()
        label = int(os.path.split(fname)[-1].split('.')[0].split('_')[3])-1

        X.append(np_img)
        tempy = np.zeros(15)
        tempy[label] = 1
        Y.append(tempy)
    X, Y = np.array(X), np.array(Y)
    return(X, Y)

In [6]:
# create test, train split
X, Y = load()
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, train_size=0.8, random_state=2021)
scaler = StandardScaler().fit(Xtrain)
Xtrain, Xtest = scaler.transform(Xtrain), scaler.transform(Xtest)
Ytrain = [np.argmax(y) + 1 for y in Ytrain]
Ytest = [np.argmax(y) + 1 for y in Ytest]

In [7]:
start = time()
M = 500
alphas, model = Boost(Xtrain, Ytrain, "TREE", M, verbose=10)
print(f"{len(model)} trees required {time()-start} seconds to train")

After 1.95 seconds, iteration 0 has error 88.683333%
alpha 0.6492547940645355
accuracy 11.316667%
After 22.37 seconds, iteration 10 has error 91.007936%
alpha 0.39344592731017736
accuracy 11.516667%
After 44.04 seconds, iteration 20 has error 90.309898%
alpha 0.4759081138464847
accuracy 13.908333%
After 68.77 seconds, iteration 30 has error 91.014725%
alpha 0.39261595968336627
accuracy 13.316667%
After 94.90 seconds, iteration 40 has error 90.096243%
alpha 0.5000859180947073
accuracy 13.458333%
After 124.43 seconds, iteration 50 has error 90.735325%
alpha 0.4263121885596117
accuracy 14.516667%
After 155.54 seconds, iteration 60 has error 90.843528%
alpha 0.4133725660718288
accuracy 15.600000%
After 189.61 seconds, iteration 70 has error 90.821310%
alpha 0.4160406927920688
accuracy 17.475000%
After 225.86 seconds, iteration 80 has error 90.605706%
alpha 0.44163546452630387
accuracy 17.408333%
After 269.57 seconds, iteration 90 has error 91.064605%
alpha 0.3865013383594351
accuracy 18.26

In [None]:
K = max(Ytrain) + 1
y_pred = predict(alphas, model, Xtrain, K)

In [None]:
print(f"{accuracy(y_pred,Ytrain):.2%}")