In [2]:
from functions import *
import glob
from PIL import Image
import os
from sklearn.model_selection import train_test_split

In [4]:
IMAGE_DIR = "./data/data/data"

def load():
    file_list = glob.glob(IMAGE_DIR + "/*.jpg")
    X = []
    Y = []

    for fname in file_list:
        with Image.open(fname) as img:
            np_img = np.array(img).flatten()
        label = int(os.path.split(fname)[-1].split('.')[0].split('_')[3])-1

        X.append(np_img)
        tempy = np.zeros(15)
        tempy[label] = 1
        Y.append(tempy)
    X, Y = np.array(X), np.array(Y)
    return(X, Y)

In [5]:
# create test, train split
X, Y = load()
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, train_size=0.8, random_state=2021)
Ytrain_classes = []
for i in range(Ytrain.shape[1]):
    Ytrain_0 = classify(Ytrain, i)
    Ytrain_classes.append(Ytrain_0)
    print(f"{Ytrain_0.count(1)} datapoints have classification {i}")

794 datapoints have classification 0
807 datapoints have classification 1
790 datapoints have classification 2
821 datapoints have classification 3
770 datapoints have classification 4
811 datapoints have classification 5
784 datapoints have classification 6
824 datapoints have classification 7
799 datapoints have classification 8
790 datapoints have classification 9
788 datapoints have classification 10
802 datapoints have classification 11
815 datapoints have classification 12
796 datapoints have classification 13
809 datapoints have classification 14


In [6]:
def update_weights(weights, best):
    for i in range(len(weights)):
        if i not in best.miss_data:
            weights[i] = 1/2*(1/(1 - best.error_rate))*weights[i]
        else:
            weights[i] = 1/2*(1/best.error_rate)*weights[i]

def run(Weak_Learners, data, eval_set, H, weights):
    for wl in Weak_Learners:
        wl.miss_classify(data, eval_set)
        wl.calc_error_rate(weights)
    
    best = Weak_Learners[0]
    for wl in Weak_Learners:
        if wl.error_rate < best.error_rate:
            best = wl
    
    best.calc_voting_power()
    H.append(best)
    
    update_weights(weights, best)
    
    accuracy = H_accuracy(H, data, eval_set)
    
    return accuracy

In [36]:
"""
Here we test the performace of our model with different amounts of training data 
TODO: METHOD OF CALCULATING ACCURACY MAY BE INCORRECT
"""
trainSizes = [x / 10 for x in range(1, 10, 1)] # Create a list of [0.1, 0.2, 0.3, ..., 0.9]

for trainRatio in trainSizes:
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, train_size=trainRatio, random_state=2021)
    Ytrain_classes = []
    Ytest_classes = []
    for i in range(Ytrain.shape[1]):
        Ytrain_i = classify(Ytrain, i)
        Ytrain_classes.append(Ytrain_i)
        Ytest_i = classify(Ytest, i)
        Ytest_classes.append(Ytest_i)
    
    accuracies = []
    for i in range(Ytrain.shape[1]):
        H = []
        Weak_Learners = []
        for _ in range(10):
            model = ShallowTree()
            model.fit(Xtrain, Ytrain_classes[0])
            Weak_Learners.append(WeakLearner(model, 0))

        weights = np.array([1/len(Xtrain) for _ in range(len(Xtrain))])
        local_accuracies = []
        for r in range(10):
            local_accuracies.append(run(Weak_Learners, Xtrain, Ytrain_classes[i], H, weights))
        accuracies.append(sum(local_accuracies)/len(local_accuracies))
        #training_accuracy = H_accuracy(H, Xtrain, Ytrain_classes[i])
        #print(f"Training accuracy for class {i}: {training_accuracy:.2%}")
    #     WL.miss_classify(Xtrain, Ytrain_classes[0])
    #     WL.calc_error_rate(np.array([1 for d in range(Xtrain.shape[0])]))
    print(f"Ratio: {trainRatio:.2%}\n\tAccuracy: {sum(accuracies)/len(accuracies):.2%}")


Ratio: 10.00%
	Accuracy: 80.66%
Ratio: 20.00%
	Accuracy: 83.26%
Ratio: 30.00%
	Accuracy: 82.12%
Ratio: 40.00%
	Accuracy: 80.77%
Ratio: 50.00%
	Accuracy: 80.98%
Ratio: 60.00%
	Accuracy: 81.00%
Ratio: 70.00%
	Accuracy: 80.36%
Ratio: 80.00%
	Accuracy: 81.37%
Ratio: 90.00%
	Accuracy: 81.92%


In [14]:
""" 
Now we vary the size of the trees
"""
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, train_size=0.8, random_state=2021)
Ytrain_classes = []
Ytest_classes = []
for treeSize in range(2, 10):
    for i in range(Ytrain.shape[1]):
        Ytrain_i = classify(Ytrain, i)
        Ytrain_classes.append(Ytrain_i)
        Ytest_i = classify(Ytest, i)
        Ytest_classes.append(Ytest_i)

    accuracies = []
    for i in range(Ytrain.shape[1]):
        H = []
        Weak_Learners = []
        for _ in range(10):
            model = ShallowTree(treeSize)
            model.fit(Xtrain, Ytrain_classes[0])
            Weak_Learners.append(WeakLearner(model, 0))

        weights = np.array([1/len(Xtrain) for _ in range(len(Xtrain))])
        local_accuracies = []
        for r in range(10):
            local_accuracies.append(run(Weak_Learners, Xtest, Ytrain_classes[i], H, weights))
        accuracies.append(sum(local_accuracies)/len(local_accuracies))

    print(f"Percent correct with tree size {treeSize}: {sum(accuracies)/len(accuracies):.2%}")

Percent correct with tree size 2: 81.35%
Percent correct with tree size 3: 80.31%
Percent correct with tree size 4: 80.59%
Percent correct with tree size 5: 80.00%
Percent correct with tree size 6: 79.61%
Percent correct with tree size 7: 79.04%
Percent correct with tree size 8: 78.41%
Percent correct with tree size 9: 78.25%
