In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
import numpy as np
from PIL import Image
import glob
import os
from sklearn.model_selection import train_test_split
from math import log, exp
from time import time

In [18]:
def predict(alphas, predictors, X, K):
    """Combine the weak learners' votes into one class prediction per sample.

    Every learner votes for the class it predicts, and that vote counts with
    the learner's weight. The class with the largest weighted total wins;
    ties go to the lowest class index (``np.argmax`` semantics).

    Parameters
    ----------
    alphas : array-like of float, length M
        Weight of each learner, in the same order as ``predictors``.
        Lists, tuples and arrays of any shape with M elements are accepted.
    predictors : sequence of length M
        Fitted models exposing ``predict(X)`` that returns one class label
        per row of ``X``.
    X : array-like
        Samples to classify; handed to each learner's ``predict`` unchanged.
    K : int
        Number of classes; votes are tallied for labels ``0 .. K-1``.

    Returns
    -------
    list
        Predicted class index for every sample, or an empty list when
        ``predictors`` is empty.
    """
    weights = np.asarray(alphas, dtype=float).ravel()
    if len(predictors) == 0:
        return []

    # Row m holds learner m's predicted label for every sample.
    votes = np.array([p.predict(X) for p in predictors])

    # scores[k, j] is the total weight of the learners that voted class k for
    # sample j; one vector-matrix product per class replaces the per-sample
    # Python loop.
    scores = np.stack(
        [weights @ (votes == k).astype(float) for k in range(K)]
    )
    return list(np.argmax(scores, axis=0))

In [4]:
def accuracy(y_hat, y_true):
    """Return the fraction of predictions that equal the true labels.

    Both inputs are flattened before comparison, so a column vector of labels
    is compared element by element instead of being broadcast against the
    predictions.

    Parameters
    ----------
    y_hat : array-like
        Predicted labels.
    y_true : array-like
        Ground-truth labels; must contain as many elements as ``y_hat``.

    Returns
    -------
    float
        Share of matching positions in ``[0, 1]``; ``nan`` when both inputs
        are empty (the accuracy of zero samples is undefined).

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    predicted = np.asarray(y_hat).ravel()
    actual = np.asarray(y_true).ravel()
    if predicted.size != actual.size:
        raise ValueError(
            f"y_hat has {predicted.size} elements but y_true has {actual.size}"
        )
    if predicted.size == 0:
        return float("nan")
    return np.mean(predicted == actual)

In [9]:
def Boost(X, Y, wl, M, verbose=False, random_state=None):
    """Train a multi-class boosted ensemble of weak learners.

    Each round fits a fresh weak learner on a sample of the training set,
    measures its weighted error on the *full* training set, and gives it the
    weight ``alpha = log((1 - err) / err) + log(K - 1)`` (the SAMME learner
    weight). The weights of misclassified samples are multiplied by
    ``exp(alpha)`` and renormalized, and the next round's training sample is
    drawn with replacement according to those weights. The first round
    trains on the full data set.

    Parameters
    ----------
    X : array-like, shape (n, ...)
        Training samples, one per row.
    Y : array-like of int, length n
        Class labels; the number of classes is taken to be ``max(Y) + 1``.
    wl : {"NN", "TREE"}
        Weak learner type: ``MLPClassifier()`` or
        ``DecisionTreeClassifier(max_depth=2)``.
    M : int
        Maximum number of boosting rounds.
    verbose : bool or int, default False
        When truthy, print the round's error and the running training
        accuracy every ``verbose`` rounds (``True`` prints every round).
    random_state : None, int or numpy.random.RandomState, default None
        Seed or generator for the resampling and the weak learners. ``None``
        keeps using NumPy's global random state.

    Returns
    -------
    alphas : numpy.ndarray
        Learner weights normalized to sum to 1 (empty if nothing was kept).
    Ts : list
        The fitted weak learners, aligned with ``alphas``.

    Raises
    ------
    ValueError
        If ``wl`` is not a supported learner name, if ``X`` is empty or its
        length differs from ``Y``, or if the sample weights stop being
        finite.
    """
    # Validate before any work so a typo fails with a clear message instead
    # of an AttributeError on ``None.fit``.
    if wl not in ("NN", "TREE"):
        raise ValueError(f'wl must be "NN" or "TREE", got {wl!r}')

    # Convert first so plain lists are accepted.
    X = np.asarray(X)
    Y = np.asarray(Y)
    n = X.shape[0]
    if n == 0 or Y.shape[0] != n:
        raise ValueError(
            f"X and Y must be non-empty and equally long (got {n} and {Y.shape[0]})"
        )
    K = int(Y.max()) + 1

    if random_state is None:
        rng = np.random          # module-level functions share the global state
        learner_seed = None
    elif isinstance(random_state, np.random.RandomState):
        rng = learner_seed = random_state
    else:
        rng = learner_seed = np.random.RandomState(random_state)

    w = np.full(n, 1.0 / n)
    Ts = []
    alphas = []
    indices = np.arange(n)
    # A learner is only useful when alpha > 0, i.e. err < 1 - 1/K.
    chance_err = 1.0 - 1.0 / K

    for m in range(M):
        if wl == "NN":
            T = MLPClassifier(random_state=learner_seed)
        else:
            T = DecisionTreeClassifier(max_depth=2, random_state=learner_seed)
        T.fit(X[indices], Y[indices])

        miss = np.asarray(T.predict(X)) != Y
        err = float(w[miss].sum() / w.sum())

        if err == 0:
            # alpha would be infinite. Keep a perfect *first* learner so the
            # returned model is not empty; otherwise stop with what we have.
            if not Ts:
                Ts.append(T)
                alphas.append(1.0)
            break

        if err >= chance_err:
            # alpha <= 0 (and log(0) when err == 1): drop this learner and
            # try again on a fresh sample drawn with the current weights.
            indices = rng.choice(n, n, p=w)
            continue

        alpha = log((1 - err) / err) + log(K - 1)
        Ts.append(T)
        alphas.append(alpha)

        if verbose and m % verbose == 0:
            print(f"Iteration {m} has error {err:%}\nand accuracy {accuracy(predict(np.array(alphas), Ts, X, K), Y):%}")

        # Up-weight the samples this learner got wrong, then renormalize.
        w = np.where(miss, w * exp(alpha), w)
        w = w / w.sum()
        if not np.all(np.isfinite(w)):
            raise ValueError(f"Sample weights became non-finite on iteration {m}")

        indices = rng.choice(n, n, p=w)

    if not alphas:
        return np.array([]), Ts
    return np.array(alphas) / sum(alphas), Ts

In [6]:
IMAGE_DIR = "./data/data/data"

def load(image_dir=None, n_classes=15):
    """Read every ``*.jpg`` in a directory into flat pixel vectors and one-hot labels.

    The label comes from the file name: the text before the first ``.`` is
    split on ``_`` and its fourth field is read as a 1-based class number.

    Files are processed in sorted name order so the result, and any seeded
    split made from it, does not depend on the order in which the file
    system lists them.

    Parameters
    ----------
    image_dir : str, optional
        Directory to scan. Defaults to the module-level ``IMAGE_DIR``, looked
        up at call time.
    n_classes : int, default 15
        Length of each one-hot label row.

    Returns
    -------
    X : numpy.ndarray
        One flattened image per row.
    Y : numpy.ndarray
        One one-hot row per image.
        Both arrays are empty when no file matches.

    Raises
    ------
    ValueError
        If a file's class number is outside ``1 .. n_classes``. Without this
        check a class number of 0 would silently be stored as the last class.
    """
    if image_dir is None:
        image_dir = IMAGE_DIR

    X = []
    Y = []
    for fname in sorted(glob.glob(os.path.join(image_dir, "*.jpg"))):
        stem = os.path.basename(fname).split('.')[0]
        label = int(stem.split('_')[3]) - 1
        if not 0 <= label < n_classes:
            raise ValueError(
                f"{fname}: class number {label + 1} is outside 1..{n_classes}"
            )

        with Image.open(fname) as img:
            X.append(np.array(img).flatten())

        one_hot = np.zeros(n_classes)
        one_hot[label] = 1
        Y.append(one_hot)

    return np.array(X), np.array(Y)

In [7]:
# Load the images, hold out 20% of them for testing (fixed seed), then turn
# each label row back into the index of its largest entry.
X, Y = load()
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X,
    Y,
    train_size=0.8,
    random_state=2021,
)
Ytrain, Ytest = (list(np.argmax(rows, axis=1)) for rows in (Ytrain, Ytest))

In [14]:
# Seed NumPy's global generator so this run can be repeated: Boost resamples
# with np.random.choice, and scikit-learn estimators created without a
# random_state draw from the same global generator. The value matches the
# seed used for the train/test split.
np.random.seed(2021)

start = time()
M = 1000
alphas, model = Boost(Xtrain, Ytrain, "TREE", M)
print(f"{len(model)} trees required {time()-start} seconds to train")

1000 trees required 2561.9330780506134 seconds to train


In [None]:
# Labels are zero-based, so the class count is one more than the largest label.
K = 1 + max(Ytrain)
# Ensemble predictions for the samples the model was trained on.
y_pred = predict(alphas=alphas, predictors=model, X=Xtrain, K=K)

In [None]:
# Training accuracy alone says nothing about generalization, so also score
# the held-out split that was set aside for exactly this purpose.
y_pred_test = predict(alphas, model, Xtest, K)
print(f"Train accuracy: {accuracy(y_pred,Ytrain):.2%}")
print(f"Test accuracy:  {accuracy(y_pred_test,Ytest):.2%}")