In [7]:
from functions import *
import glob
from PIL import Image
import os
from sklearn.model_selection import train_test_split
import numpy as np

In [8]:
IMAGE_DIR = "./data/data/data"

def load():
    file_list = glob.glob(IMAGE_DIR + "/*.jpg")
    X = []
    Y = []

    for fname in file_list:
        with Image.open(fname) as img:
            np_img = np.array(img).flatten()
        label = int(os.path.split(fname)[-1].split('.')[0].split('_')[3])-1

        X.append(np_img)
        tempy = np.zeros(15)
        tempy[label] = 1
        Y.append(tempy)
    X, Y = np.array(X), np.array(Y)
    return(X, Y)

In [9]:
# create data
X, Y = load()

In [10]:
def update_weights(weights, best):
    def f(x):
        return 1/2*(1/(1-best.error_rate)) * x if x in best.miss_data else 1/2*(1/best.error_rate) * x
    return np.array(map(f,weights)).tolist()

def run(Weak_Learners, data, eval_set, H, weights):
    for wl in Weak_Learners:
        wl.miss_classify(data, eval_set)
        wl.calc_error_rate(weights)
    
    best = Weak_Learners[0]
    for wl in Weak_Learners:
        if wl.error_rate < best.error_rate:
            best = wl
    
    best.calc_voting_power()
    H.append(best)
    
    weights = update_weights(weights, best)
    
    accuracy = H_accuracy(H, data, eval_set)
    
    return accuracy

In [11]:
"""
Here we test the performace of our model with different amounts of training data 
"""
from time import time

trainSizes = [x / 10 for x in range(5, 10, 1)] # Create a list of [0.5, ..., 0.9] If x < 0.5 we get an out of bounds error on the weights

hard_start = time()

for trainRatio in trainSizes:
    start = time()
    
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, train_size=trainRatio, random_state=2021)
    Ytrain_classes = []
    Ytest_classes = []
    for i in range(Ytrain.shape[1]):
        Ytrain_i = classify(Ytrain, i)
        Ytrain_classes.append(Ytrain_i)
        Ytest_i = classify(Ytest, i)
        Ytest_classes.append(Ytest_i)
    
    train_accuracies = []
    test_accuracies = []
    for i in range(Ytrain.shape[1]):
        H_train = []
        H_test = []
        Weak_Learners = []
        for _ in range(10):
            model = ShallowTree()
            model.fit(Xtrain, Ytrain_classes[0])
            Weak_Learners.append(WeakLearner(model, 0))

        weights = np.array([1/len(Xtrain) for _ in range(len(Xtrain))])
        train_local_accuracies = []
        test_local_accuracies = []
        for r in range(10):
            train_local_accuracies.append(run(Weak_Learners, Xtrain, Ytrain_classes[i], H_train, weights))
            test_local_accuracies.append(run(Weak_Learners, Xtest, Ytest_classes[i], H_test, weights))

        train_accuracies.append(sum(train_local_accuracies) / len(train_local_accuracies))
        test_accuracies.append(sum(test_local_accuracies)/len(test_local_accuracies))

    print(f"Ratio: {trainRatio:.2%}\n\t Train Accuracy: {sum(train_accuracies)/len(train_accuracies):.2%}")
    print(f"Ratio: {trainRatio:.2%}\n\t Test Accuracy: {sum(test_accuracies)/len(test_accuracies):.2%}")
    print(f"{time()-start:.2f} seconds elapsed")
    
print(f"{time()-hard_start:.2f} seconds to run")

Ratio: 50.00%
	 Train Accuracy: 80.98%
Ratio: 50.00%
	 Test Accuracy: 81.74%
529.81 seconds elapsed
Ratio: 60.00%
	 Train Accuracy: 81.00%
Ratio: 60.00%
	 Test Accuracy: 81.67%
617.50 seconds elapsed
Ratio: 70.00%
	 Train Accuracy: 80.36%
Ratio: 70.00%
	 Test Accuracy: 81.00%
619.04 seconds elapsed
Ratio: 80.00%
	 Train Accuracy: 81.37%
Ratio: 80.00%
	 Test Accuracy: 81.35%
666.12 seconds elapsed
Ratio: 90.00%
	 Train Accuracy: 81.92%
Ratio: 90.00%
	 Test Accuracy: 83.08%
800.24 seconds elapsed
3232.72 seconds to run


In [12]:
print(Ytrain.shape)
print(Ytest.shape)

(13500, 15)
(1500, 15)


In [13]:
""" 
Now we vary the size of the trees
We use 80% train, 20% test (The standard)
"""
hard_start = time()

Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, train_size=0.8, random_state=2021)
Ytrain_classes = []
Ytest_classes = []


for treeSize in range(2, 10):
    start = time()
    for i in range(Ytrain.shape[1]):
        Ytrain_i = classify(Ytrain, i)
        Ytrain_classes.append(Ytrain_i)
        Ytest_i = classify(Ytest, i)
        Ytest_classes.append(Ytest_i)

    train_accuracies = []
    test_accuracies = []
    for i in range(Ytrain.shape[1]):
        H_train = []
        H_test =[]
        Weak_Learners = []
        for _ in range(10):
            model = ShallowTree(treeSize)
            model.fit(Xtrain, Ytrain_classes[0])
            Weak_Learners.append(WeakLearner(model, 0))

        weights = np.array([1/len(Xtrain) for _ in range(len(Xtrain))])
        local_accuracies = []
        
        for r in range(10):
            train_local_accuracies.append(run(Weak_Learners, Xtrain, Ytrain_classes[i], H_train, weights))
            test_local_accuracies.append(run(Weak_Learners, Xtest, Ytest_classes[i], H_test, weights))

        train_accuracies.append(sum(train_local_accuracies) / len(train_local_accuracies))
        test_accuracies.append(sum(test_local_accuracies)/len(test_local_accuracies))

    print(f"Percent correct with training set and tree size {treeSize}: {sum(train_accuracies)/len(train_accuracies):.2%}")
    print(f"Percent correct with test set and tree size {treeSize}: {sum(test_accuracies)/len(test_accuracies):.2%}")
    print(f"{time()-start:.2f} seconds elapsed")
print(f"{time()-hard_start:.2f} seconds to run")

Percent correct with training set and tree size 2: 82.10%
Percent correct with test set and tree size 2: 82.49%
685.31 seconds elapsed
Percent correct with training set and tree size 3: 81.15%
Percent correct with test set and tree size 3: 81.34%
770.76 seconds elapsed
Percent correct with training set and tree size 4: 80.59%
Percent correct with test set and tree size 4: 80.99%
848.00 seconds elapsed
Percent correct with training set and tree size 5: 80.21%
Percent correct with test set and tree size 5: 80.79%
872.08 seconds elapsed
Percent correct with training set and tree size 6: 79.90%
Percent correct with test set and tree size 6: 80.59%
962.80 seconds elapsed
Percent correct with training set and tree size 7: 79.61%
Percent correct with test set and tree size 7: 80.38%
1051.88 seconds elapsed
Percent correct with training set and tree size 8: 79.32%
Percent correct with test set and tree size 8: 80.17%
1139.44 seconds elapsed
Percent correct with training set and tree size 9: 79