In [23]:
from functions import *
import glob
from PIL import Image
import os
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from tensorflow.keras.datasets import fashion_mnist
from time import time

In [None]:
# Default directory that load() scans for the Chinese MNIST .jpg images.
IMAGE_DIR = "./data/Chinese_MNIST/data/data"

def load(image_dir=None):
    """Read every ``*.jpg`` image in a directory into flat feature/label arrays.

    The label of each image is taken from its file name: the part before the
    first ``.`` is split on ``_`` and the fourth field is converted to ``int``
    and shifted down by one (so a field value of 1 becomes label 0).

    Parameters
    ----------
    image_dir : str, optional
        Directory to scan. Defaults to the module-level ``IMAGE_DIR`` (looked
        up at call time, so reassigning ``IMAGE_DIR`` is still honoured).

    Returns
    -------
    (X, Y) : tuple of numpy.ndarray
        ``X`` holds one flattened image per file and ``Y`` the matching integer
        labels. Both are empty arrays when no ``.jpg`` file is found.
        # NOTE(review): X is only a regular 2-D array if all images have the
        # same number of pixels - confirm for the dataset in use.

    Raises
    ------
    IndexError, ValueError
        If a file name does not have a numeric fourth ``_``-separated field.
    """
    if image_dir is None:
        image_dir = IMAGE_DIR

    # glob returns files in a filesystem-dependent order; sort so that sample
    # order (and therefore any seeded train/test split) is reproducible.
    file_list = sorted(glob.glob(os.path.join(image_dir, "*.jpg")))

    X = []
    Y = []

    for fname in file_list:
        # Materialise the pixels before the context manager closes the file.
        with Image.open(fname) as img:
            np_img = np.array(img).flatten()

        stem = os.path.basename(fname).split('.')[0]
        label = int(stem.split('_')[3]) - 1

        X.append(np_img)
        Y.append(label)

    return (np.array(X), np.array(Y))

def get_dataset(i):
    """Load one of the benchmark datasets as a single ``(X, Y)`` pair.

    For the datasets that ship with separate train and test parts, the two
    parts are concatenated (train rows first) so callers can choose their own
    split.

    Parameters
    ----------
    i : int
        Dataset selector:
        ``0`` - Sign Language MNIST, read from the two CSV files under
        ``./data/Sign_Language_MNIST/`` (the ``label`` column is the target,
        all remaining columns are features);
        ``1`` - Chinese MNIST, read through ``load()``;
        ``2`` - Fashion MNIST, fetched through ``fashion_mnist.load_data()``
        with every image flattened to one row.

    Returns
    -------
    (X, Y) : tuple of numpy.ndarray
        Feature rows and their labels.

    Raises
    ------
    ValueError
        If ``i`` is not 0, 1 or 2.
    """
    if i == 0:  # Sign Language MNIST
        train_df = pd.read_csv("./data/Sign_Language_MNIST/sign_mnist_train.csv")
        test_df = pd.read_csv("./data/Sign_Language_MNIST/sign_mnist_test.csv")

        # pop() returns the label column and removes it from the frame, so
        # what is left in each frame is the feature matrix.
        Ytrain = train_df.pop('label')
        Ytest = test_df.pop('label')

        X = np.concatenate((train_df.values, test_df.values), axis=0)
        Y = np.concatenate((Ytrain.values, Ytest.values), axis=0)

    elif i == 1:  # Chinese MNIST
        X, Y = load()

    elif i == 2:  # Fashion MNIST
        (Xtrain, Ytrain), (Xtest, Ytest) = fashion_mnist.load_data()
        # Collapse every image to a single feature row.
        Xtrain = Xtrain.reshape((len(Xtrain), np.prod(Xtrain.shape[1:])))
        Xtest = Xtest.reshape((len(Xtest), np.prod(Xtest.shape[1:])))
        X = np.concatenate((Xtrain, Xtest), axis=0)
        Y = np.concatenate((Ytrain, Ytest), axis=0)

    else:
        # Fail loudly instead of handing (None, None) to the caller.
        raise ValueError(f"Unknown dataset index {i!r}; expected 0, 1 or 2")

    return (X, Y)

In [None]:
# Here we test the performance of our model with different amounts of
# training data (Stochastic Boosting).
#
# Results are nested as
#     all_*_accuracy_ratio[batch_div - 1][dataset][ratio_index]
# where the mini-batch size passed to update_fit is len(Xtrain) // batch_div.
# `time` is already imported in the notebook's import cell.

# Accuracy is only measured on the first EVAL_SAMPLES rows of each split.
EVAL_SAMPLES = 1000

# Training fractions 0.1, 0.2, ..., 0.9.
# NOTE(review): an earlier comment here claimed that ratios below 0.5 cause an
# out-of-bounds error on the weights, yet the range starts at 0.1 - confirm
# whether the small ratios are actually safe.
trainSizes = [x / 10 for x in range(1, 10, 1)]

# Load each dataset once up front rather than once per batch_div iteration;
# the data does not depend on the batch size.
datasets = [get_dataset(dataset) for dataset in range(3)]

all_train_accuracy_ratio = list()
all_testing_accuracy_ratio = list()

for batch_div in range(1, 10):
    batch_train_accuracy = list()
    batch_test_accuracy = list()

    for dataset in range(3):
        hard_start = time()

        X, Y = datasets[dataset]

        train_accuracy = []
        test_accuracy = []

        for trainRatio in trainSizes:
            start = time()
            print(f"Dataset: {dataset}; Batch Size: len(X)/{batch_div}; Ratio: {trainRatio:.2%}")

            # Fixed random_state keeps the split identical across batch sizes.
            Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, train_size=trainRatio, random_state=2021)

            model = Boost(n_estimators=500, base_learner=ShallowTree()).update_fit(Xtrain,Ytrain, verbose=100, batch_size=len(Xtrain)//batch_div)

            train_acc = model.accuracy(model.predict(Xtrain[:EVAL_SAMPLES,]),Ytrain[:EVAL_SAMPLES,])
            train_accuracy.append(train_acc)
            print(f"Ratio: {trainRatio:.2%}\n\t Train Accuracy: {train_acc:.2%}")
            test_acc = model.accuracy(model.predict(Xtest[:EVAL_SAMPLES,]),Ytest[:EVAL_SAMPLES,])
            test_accuracy.append(test_acc)
            print(f"Ratio: {trainRatio:.2%}\n\t Test Accuracy: {test_acc:.2%}")
            print(f"{time()-start:.2f} seconds elapsed")

        batch_train_accuracy.append(train_accuracy)
        batch_test_accuracy.append(test_accuracy)
        print(f"{time()-hard_start:.2f} seconds to run")

    all_train_accuracy_ratio.append(batch_train_accuracy)
    all_testing_accuracy_ratio.append(batch_test_accuracy)

In [None]:
# Now we vary the size of the trees.
# We use 80% train, 20% test (the standard).
#
# Results are nested as
#     all_*_accuracy_tree[batch_div - 1][dataset][treeSize - 1]
# where the mini-batch size passed to update_fit is len(Xtrain) // batch_div.

# The 80/20 split depends only on the dataset (fixed random_state), so load
# and split each dataset once instead of once per batch_div iteration.
splits = [
    train_test_split(*get_dataset(dataset), train_size=0.8, random_state=2021)
    for dataset in range(3)
]

all_train_accuracy_tree = list()
all_testing_accuracy_tree = list()

for batch_div in range(1, 10):
    batch_train_accuracy = list()
    batch_test_accuracy = list()

    for dataset in range(3):
        hard_start = time()

        # Use the dataset selected by the loop (previously index 2 was
        # hard-coded, so all three iterations trained on the same data).
        Xtrain, Xtest, Ytrain, Ytest = splits[dataset]

        train_accuracies = []
        test_accuracies = []

        for treeSize in range(1, 10):
            start = time()
            print(f"Dataset: {dataset}; Batch Size: len(X)/{batch_div}; Treesize: {treeSize}")
            # NOTE(review): treeSize is passed positionally to ShallowTree;
            # its exact meaning (depth, leaves, ...) is defined in `functions`.
            model = Boost(n_estimators=250, base_learner=ShallowTree(treeSize)).update_fit(Xtrain, Ytrain, verbose=50, batch_size=len(Xtrain)//batch_div)

            train_acc = model.accuracy(model.predict(Xtrain),Ytrain)
            train_accuracies.append(train_acc)
            test_acc = model.accuracy(model.predict(Xtest),Ytest)
            test_accuracies.append(test_acc)

            print(f"Percent correct with training set and tree size {treeSize}: {train_acc:.2%}")
            # Report the accuracy itself; it must not be divided by the number
            # of tree sizes evaluated so far.
            print(f"Percent correct with test set and tree size {treeSize}: {test_acc:.2%}")
            print(f"{time()-start:.2f} seconds elapsed")

        # Store this experiment's own lists (not the train_accuracy /
        # test_accuracy variables left over from the training-ratio cell).
        batch_train_accuracy.append(train_accuracies)
        batch_test_accuracy.append(test_accuracies)
        print(f"{time()-hard_start:.2f} seconds to run")

    all_train_accuracy_tree.append(batch_train_accuracy)
    all_testing_accuracy_tree.append(batch_test_accuracy)