In [1]:
%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
import cv2
import numpy as np
import sys
from keras.applications.vgg16 import preprocess_input as preprocess_input_16
from keras.applications.vgg19 import preprocess_input as preprocess_input_19
from sklearn.metrics import confusion_matrix
from keras.applications.vgg19 import VGG19
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
from keras.models import Model
from keras.preprocessing import image
import pickle
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier

sys.path.append(os.path.abspath("../places_keras/Keras-VGG16-places365/"))
from vgg16_places_365 import VGG16_Places365
from vgg16_hybrid_places_1365 import VGG16_Hubrid_1365

# Constants, paths, etc
NUM_AUGMENTATION = 14
TRAIN_PATH = '../input/train/'
VAL_PATH = '../input/val/'
TEST_PATH = '../input/test/'
AUG_TRAIN_PATH = '../input/aug/train/'
AUG_VAL_PATH = '../input/aug/val/'
AUG_TEST_PATH = '../input/aug/test/'
PATH_FEATURES = '../features/'
PICKLE_PATH = '../pickle/'
TEST_ANSWERS = '../input/test_answers.csv'


# Separate 20% of the files for validation
def separate_for_validation(city):
    """Print ``mv`` commands for a random 20% of one city's training files.

    Nothing is moved by this function: it lists ``../input/train/<city>``,
    draws 20% of the entries without replacement and prints one shell
    command per drawn file, targeting ``../../val/<city>/``.

    Args:
        city: name of the sub-directory of the training folder to sample from.
    """
    city_dir = '../input/train/{}'.format(city)
    candidates = [entry for entry in tqdm(os.listdir(city_dir))]
    holdout_size = int(0.2 * len(candidates))

    chosen = np.random.choice(candidates, size=holdout_size, replace=False)
    for picked in chosen:
        print('mv {} ../../val/{}/'.format(picked, city))

# Read pre-computed descriptors and return them
def read_vgg_features(dataset="train", input_name=''):
    """Load a saved feature matrix and its label vector from PATH_FEATURES.

    The two files read are ``x_<dataset>_<input_name>.txt`` and
    ``y_<dataset>_<input_name>.txt``, both parsed with ``np.loadtxt``.

    Args:
        dataset: split tag embedded in the file names.
        input_name: suffix embedded in the file names.

    Returns:
        Tuple ``(x, y)`` with the loaded arrays.
    """
    name_template = '{}{}_{}_{}.txt'
    features_file = name_template.format(PATH_FEATURES, "x", dataset, input_name)
    labels_file = name_template.format(PATH_FEATURES, "y", dataset, input_name)
    return np.loadtxt(features_file), np.loadtxt(labels_file)

# Compute normalized accuracy
def normalized_accuracy(y_true, y_pred):
    """Return the mean per-class recall (normalized / balanced accuracy).

    Each row of the confusion matrix holds the samples of one true class, so
    ``diagonal / row sum`` is the recall of that class. Classes that only
    appear in ``y_pred`` have an empty row; they are left out of the average
    instead of turning the whole score into NaN through a 0/0 division.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels, same length as ``y_true``.

    Returns:
        Average recall over the classes that have at least one true sample.

    Raises:
        ValueError: if no class has any true sample (e.g. empty input).
    """
    conf = confusion_matrix(y_true, y_pred)
    support = conf.sum(axis=1)
    has_samples = support > 0
    if not has_samples.any():
        raise ValueError("y_true contains no samples")

    per_class_recall = np.diag(conf)[has_samples] / support[has_samples]
    return per_class_recall.mean()

# Pass an image through the network and return its feature vector
def get_vgg_feature_for_image(model, img_path, weights):
    """Run one image file through ``model`` and return the prediction.

    The image is loaded at 224x224, converted to an array and given a leading
    axis before preprocessing.

    Args:
        model: object exposing ``predict``.
        img_path: path of the image file to load.
        weights: 'places' or 'places_2' selects the VGG16 preprocessing,
            'imagenet' the VGG19 one; any other value skips preprocessing.

    Returns:
        Whatever ``model.predict`` returns for the single-image batch.
    """
    loaded = image.load_img(img_path, target_size=(224, 224))
    batch = np.expand_dims(image.img_to_array(loaded), axis=0)

    if weights in ('places', 'places_2'):
        batch = preprocess_input_16(batch)
    elif weights == 'imagenet':
        batch = preprocess_input_19(batch)
    return model.predict(batch)

# Perform data augmentation in a single image
def augment(image, filename, path, save_output=False):
    """Build the augmented variants of a single image.

    The returned list always has 14 entries, in this order: the original,
    a flip (``cv2.flip`` with flip code 1), four rotations (-20, -10, 10,
    20 degrees about the image centre), four brightness shifts (-25, -12,
    12, 25) and four contrast scalings (0.8, 0.9, 1.1, 1.2).

    Args:
        image: array with three axes (rows, cols, channels).
            Presumably 8-bit as returned by ``cv2.imread`` -- confirm.
        filename: source file name; every '.jpg' in it is stripped to build
            the output names.
        path: output directory prefix (joined by plain concatenation).
        save_output: when True, each variant is written to
            ``<path><name>_<index:02d>.jpg``.

    Returns:
        List with the augmented images.
    """
    rows, cols, _ = image.shape
    aug_images = [image, cv2.flip(image, 1)]

    # Rotate around the image centre, keeping the original canvas size
    center = (cols / 2, rows / 2)
    for angle in (-20, -10, 10, 20):
        rotation = cv2.getRotationMatrix2D(center, angle, 1)
        aug_images.append(cv2.warpAffine(image, rotation, (cols, rows)))

    # Work in a *signed* wider type: adding a negative offset to an unsigned
    # array raises OverflowError under NumPy 2 promotion rules, and the
    # intermediate result must be able to go below zero before clipping.
    widened = image.astype('int32')

    # Brightness
    for brightness in (-25, -12, 12, 25):
        shifted = (widened + brightness).clip(0, 255).astype('uint8')
        aug_images.append(shifted)

    # Contrast
    for contrast in (0.8, 0.9, 1.1, 1.2):
        scaled = (widened * contrast).clip(0, 255).astype('uint8')
        aug_images.append(scaled)

    if save_output:
        stem = filename.replace('.jpg', '')
        for i, aug_image in enumerate(aug_images):
            cv2.imwrite("{}{}_{:02d}.jpg".format(path, stem, i), aug_image)
    return aug_images

# Perform augmentation in the given dataset
def augment_and_save(dataset="train"):
    """Augment every '.jpg' file of one split and write the results to disk.

    Args:
        dataset: "train", "val" or "test"; selects the input folder and the
            matching augmented-output folder.

    Raises:
        ValueError: if ``dataset`` is not one of the three split names.
    """
    locations = {
        "train": (TRAIN_PATH, AUG_TRAIN_PATH),
        "val": (VAL_PATH, AUG_VAL_PATH),
        "test": (TEST_PATH, AUG_TEST_PATH),
    }
    if dataset not in locations:
        raise ValueError("dataset must be train, val or test")
    input_path, output_path = locations[dataset]

    for filename in tqdm(os.listdir(input_path)):
        # Skip anything that is not a JPEG (temporary Mac files, etc.)
        if not filename.endswith('.jpg'):
            continue
        source = cv2.imread(input_path + filename)
        augment(source, filename, output_path, save_output=True)
            
# Get desired path for the augmented images
def get_augmented_path(filename, aug_path):
    """Return the NUM_AUGMENTATION file paths of the augmented copies of ``filename``.

    Each path has the form ``<aug_path><name>_<index:02d>.jpg`` where
    ``<name>`` is ``filename`` with every '.jpg' removed.

    Args:
        filename: name of the original image file.
        aug_path: directory prefix of the augmented images.

    Returns:
        List of path strings, ordered by augmentation index.
    """
    stem = filename.replace('.jpg', '')
    return ["{}{}_{:02d}.jpg".format(aug_path, stem, idx)
            for idx in range(NUM_AUGMENTATION)]

# Read test answers from CSV file and return them as a dictionary
def get_test_answers():
    """Read the test answers file and map image names to numeric labels.

    The file at TEST_ANSWERS is '|'-separated with one header line. For each
    row, the second field is the image name, the last field is a flag and the
    second-to-last field is the event name. Only rows whose flag is "Y" are
    kept.

    Returns:
        Dict mapping image name to its class index (0-3).

    Raises:
        KeyError: if a kept row names an event that has no class index.
        IndexError: if a row has fewer than two fields.
    """
    str_to_num = {
        "boston_marathon": 0,
        "austin_marathon": 1,
        "occupy_baltimore": 2,
        "occupy_portland": 3,
    }

    answers = {}
    # The context manager closes the file even if a row fails to parse
    with open(TEST_ANSWERS, 'r') as answers_file:
        # Skip the header; the default avoids StopIteration on an empty file
        next(answers_file, None)

        for line in answers_file:
            val = line.split('|')
            name = val[1]
            flag = val[-1].replace('\n', '')

            if flag == "Y":
                answer = val[-2].replace('\n', '')
                answers[name] = str_to_num[answer]
    return answers


# Describe images in train/test/val dataset using the method provided. Write results to 'features' folder
def get_vgg_features(dataset, output_name='', use_augmentation=False, weights='places'):
    """Describe every '.jpg' image of a split with a VGG network and save the result.

    The descriptor is the output of the network's 'fc2' layer. Features and
    labels are written to ``x_<dataset>_<output_name>.txt`` and
    ``y_<dataset>_<output_name>.txt`` under PATH_FEATURES.

    Args:
        dataset: "train", "val" or "test". For train/val the label is the
            integer formed by the first two characters of the file name; for
            test it is looked up in the answers returned by
            ``get_test_answers``.
        output_name: suffix used in the output file names.
        use_augmentation: when True, the pre-generated augmented copies of
            each image are described instead of the image itself (they all
            share the label of the original).
        weights: 'places' (hybrid 1365 model), 'places_2' (Places365 model)
            or 'imagenet' (VGG19).

    Returns:
        Tuple ``(x, y, filenames)``: stacked descriptors, labels as floats,
        and the list of original file names that were processed.

    Raises:
        ValueError: if ``dataset`` or ``weights`` is not a supported value.
        KeyError: for the test split, if an image has no entry in the answers.
    """
    if dataset == "train":
        path, aug_path = TRAIN_PATH, AUG_TRAIN_PATH
    elif dataset == "val":
        path, aug_path = VAL_PATH, AUG_VAL_PATH
    elif dataset == "test":
        path, aug_path = TEST_PATH, AUG_TEST_PATH
        answers = get_test_answers()
    else:
        raise ValueError("dataset must be train, val or test")

    if weights == 'places':
        base_model = VGG16_Hubrid_1365(weights='places')
    elif weights == 'imagenet':
        base_model = VGG19(weights='imagenet')
    elif weights == 'places_2':
        base_model = VGG16_Places365(weights='places')
    else:
        # Fail explicitly instead of hitting an unbound base_model below
        raise ValueError("weights must be places, imagenet or places_2")

    model = Model(inputs=base_model.input, outputs=base_model.get_layer('fc2').output)

    filenames = []
    descriptors = []
    labels = []
    for filename in tqdm(os.listdir(path)):
        # Avoid temporary MAC files, etc
        if not filename.endswith('.jpg'):
            continue
        filenames.append(filename)

        if use_augmentation:
            input_images = get_augmented_path(filename, aug_path)
        else:
            input_images = [path + filename]

        for img_path in input_images:
            if dataset == "train" or dataset == "val":
                labels.append(int(filename[:2]))
            else:
                labels.append(answers[filename.replace('.jpg', '')])
            descriptors.append(get_vgg_feature_for_image(model, img_path, weights))

    # Stack once at the end; growing the arrays inside the loop copies them
    # on every iteration.
    x = np.vstack(descriptors) if descriptors else np.empty(0)
    y = np.array(labels, dtype=float)

    np.savetxt('{}{}_{}_{}.txt'.format(PATH_FEATURES, "x", dataset, output_name), x)
    np.savetxt('{}{}_{}_{}.txt'.format(PATH_FEATURES, "y", dataset, output_name), y)
    return x, y, filenames


# UNCOMMENT TO EXTRACT FEATURES FROM THE NETWORK
#
# x_train, y_train, _ = get_vgg_features(dataset="train", output_name="places_4class")
# x_val, y_val, _ = get_vgg_features(dataset="val", output_name="places_4class")

# x_train_img, y_train_img, _ = get_vgg_features(dataset="train", output_name="imagenet_4class", 
#                                                weights='imagenet')
# x_val_img, y_val_img, _ = get_vgg_features(dataset="val", output_name="imagenet_4class", 
#                                            weights='imagenet')
# augment_and_save(dataset="train")
# augment_and_save(dataset="val")
# x_train_aug, y_train_aug, _ = get_vgg_features(dataset="train", use_augmentation=True, 
#                                        output_name="aug_places_4class")
# x_val_aug, y_val_aug, _ = get_vgg_features(dataset="val", use_augmentation=True,
#                                    output_name="aug_places_4class")
# x_train_img_aug, y_train_img_aug, _ = get_vgg_features(dataset="train", use_augmentation=True, 
#                                                output_name="aug_imagenet_4class", 
#                                                weights='imagenet')
# x_val_img_aug, y_val_img_aug, _ = get_vgg_features(dataset="val", use_augmentation=True,
#                                            output_name="aug_imagenet_4class", 
#                                            weights='imagenet')
# x_test, y_test, _ = get_vgg_features(dataset="test", output_name="places_4class")
# x_test_img, y_test_img, _ = get_vgg_features(dataset="test", output_name="imagenet_4class", weights='imagenet')
# x_test365, y_test365, _ = get_vgg_features(dataset="test", output_name="places_365", weights='places_2')



# Read pre-computed feature vectors from the file system.
# Each call loads x_<dataset>_<input_name>.txt and y_<dataset>_<input_name>.txt
# from PATH_FEATURES; the files must already exist (see the commented-out
# extraction calls above).

# Features saved under the "places_4class" name
x_train, y_train = read_vgg_features(dataset="train", input_name="places_4class")
x_val, y_val = read_vgg_features(dataset="val", input_name="places_4class")
x_test, y_test = read_vgg_features(dataset="test", input_name="places_4class")

# Features saved under the "imagenet_4class" name
x_train_img, y_train_img = read_vgg_features(dataset="train", input_name="imagenet_4class")
x_val_img, y_val_img = read_vgg_features(dataset="val", input_name="imagenet_4class")
x_test_img, y_test_img = read_vgg_features(dataset="test", input_name="imagenet_4class")

# Features saved under the "places_365" name
# NOTE(review): the commented-out extraction calls only produce the test file
# for this name -- confirm the train/val files exist on disk.
x_train365, y_train365 = read_vgg_features(dataset="train", input_name="places_365")
x_val365, y_val365 = read_vgg_features(dataset="val", input_name="places_365")
x_test365, y_test365 = read_vgg_features(dataset="test", input_name="places_365")


# Augmented train/val features.
# NOTE(review): the 365 files use an "_aug" suffix ("places_365_aug") while the
# other augmented sets use an "aug_" prefix -- presumably matches the files on
# disk; verify.
x_train_aug, y_train_aug = read_vgg_features(dataset="train", input_name="aug_places_4class")
x_val_aug, y_val_aug = read_vgg_features(dataset="val", input_name="aug_places_4class")
x_train_img_aug, y_train_img_aug = read_vgg_features(dataset="train", input_name="aug_imagenet_4class")
x_val_img_aug, y_val_img_aug = read_vgg_features(dataset="val", input_name="aug_imagenet_4class")
x_train_aug365, y_train_aug365 = read_vgg_features(dataset="train", input_name="places_365_aug")
x_val_aug365, y_val_aug365 = read_vgg_features(dataset="val", input_name="places_365_aug")

Using TensorFlow backend.


In [2]:
#
# This block tests several different classifiers, performing grid-search on the necessary parameters.
# Might take a long time to run.
#
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier

def test_svms(x_train, y_train, x_val, y_val, verbose=False):
    """Grid-search SVC settings and print the best validation scores.

    Linear and RBF kernels are tried with C in 10^-1 .. 10^7; the RBF kernel
    additionally sweeps gamma in 10^-8 .. 10^1, the linear one uses "auto".
    The best normalized accuracy and the best macro F1 are tracked
    independently, so they may come from different settings.

    Args:
        x_train, y_train: training features and labels.
        x_val, y_val: validation features and labels.
        verbose: when True, print one line per evaluated setting.
    """
    row_fmt = "k = {:>8}, c = {:>8}, g = {:>8}, n_acc = {:>8.2f}, f1: {:>8.2f}"
    penalties = [10**e for e in range(-1, 8)]
    gammas_by_kernel = {
        'linear': ["auto"],
        'rbf': [10**e for e in range(-8, 2)],
    }

    top_nacc, top_f1 = -1, -1
    for kernel in ('linear', 'rbf'):
        for penalty in penalties:
            for gamma in gammas_by_kernel[kernel]:
                svm = SVC(C=penalty, kernel=kernel, gamma=gamma)
                svm.fit(x_train, y_train)
                predicted = svm.predict(x_val)

                n_acc = normalized_accuracy(y_val, predicted)
                f1 = f1_score(y_val, predicted, average='macro')

                top_f1 = np.max((top_f1, f1))
                top_nacc = np.max((top_nacc, n_acc))

                if verbose:
                    print(row_fmt.format(kernel, penalty, gamma, n_acc, f1))

    print("Best results: n_acc = {:>8.2f}, f1: {:>8.2f}".format(top_nacc, top_f1))


def test_random_forest(x_train, y_train, x_val, y_val, verbose=False):
    """Grid-search random forests and print the best validation scores.

    Both split criteria ('gini', 'entropy') are tried with 10, 20, ..., 1000
    trees, using a fixed random seed. The best normalized accuracy and the
    best macro F1 are tracked independently.

    Args:
        x_train, y_train: training features and labels.
        x_val, y_val: validation features and labels.
        verbose: when True, print one line per evaluated setting.
    """
    row_fmt = "crit = {:>8}, n = {:>8},  n_acc = {:>8.2f}, f1: {:>8.2f}"

    top_nacc, top_f1 = -1, -1
    for split_criterion in ('gini', 'entropy'):
        for tree_count in range(10, 1001, 10):
            forest = RandomForestClassifier(n_estimators=tree_count, criterion=split_criterion,
                                            n_jobs=-1, random_state=1107)
            forest.fit(x_train, y_train)
            predicted = forest.predict(x_val)

            n_acc = normalized_accuracy(y_val, predicted)
            f1 = f1_score(y_val, predicted, average='macro')

            top_f1 = np.max((top_f1, f1))
            top_nacc = np.max((top_nacc, n_acc))
            if verbose:
                print(row_fmt.format(split_criterion, tree_count, n_acc, f1))

    print("Best results: n_acc = {:>8.4f}, f1: {:>8.4f}".format(top_nacc, top_f1))

def test_log_r(x_train, y_train, x_val, y_val, multi_class='ovr', solver='liblinear', verbose=False):
    """Sweep the logistic-regression C value and print the best validation scores.

    C takes the values 10^-10 .. 10^9. The best normalized accuracy and the
    best macro F1 are tracked independently.

    Args:
        x_train, y_train: training features and labels.
        x_val, y_val: validation features and labels.
        multi_class: forwarded to ``LogisticRegression``.
        solver: forwarded to ``LogisticRegression``.
        verbose: when True, print one line per evaluated C.
    """
    row_fmt = "LOGR: C = {:>8}, n_acc = {:>8.2f}, f1: {:>8.2f}"

    top_nacc, top_f1 = -1, -1
    for exponent in range(-10, 10):
        penalty = 10**exponent
        model = LogisticRegression(C=penalty, multi_class=multi_class, solver=solver)
        model.fit(x_train, y_train)
        predicted = model.predict(x_val)

        n_acc = normalized_accuracy(y_val, predicted)
        f1 = f1_score(y_val, predicted, average='macro')

        top_f1 = np.max((top_f1, f1))
        top_nacc = np.max((top_nacc, n_acc))
        if verbose:
            print(row_fmt.format(penalty, n_acc, f1))

    print("Best results: n_acc = {:>8.2f}, f1: {:>8.2f}".format(top_nacc, top_f1))


def test_bagging(x_train, y_train, x_val, y_val, verbose=False):
    """Sweep the number of bagging estimators and print the best validation scores.

    Ensembles of 10, 20, ..., 200 estimators are tried with the default base
    estimator. The best normalized accuracy and the best macro F1 are tracked
    independently.

    Args:
        x_train, y_train: training features and labels.
        x_val, y_val: validation features and labels.
        verbose: when True, print one line per evaluated ensemble size.
    """
    row_fmt = "n_est = {:>8},  n_acc = {:>8.4f}, f1: {:>8.4f}"

    top_nacc, top_f1 = -1, -1
    for ensemble_size in range(10, 201, 10):
        ensemble = BaggingClassifier(n_estimators=ensemble_size)
        ensemble.fit(x_train, y_train)
        predicted = ensemble.predict(x_val)

        n_acc = normalized_accuracy(y_val, predicted)
        f1 = f1_score(y_val, predicted, average='macro')

        top_f1 = np.max((top_f1, f1))
        top_nacc = np.max((top_nacc, n_acc))
        if verbose:
            print(row_fmt.format(ensemble_size, n_acc, f1))

    print("Best results: n_acc = {:>8.2f}, f1: {:>8.2f}".format(top_nacc, top_f1))

def test_adaboost(x_train, y_train, x_val, y_val, verbose=False):
    """Sweep the number of AdaBoost estimators and print the best validation scores.

    Ensembles of 50, 100, ..., 1000 estimators are tried with the default base
    estimator. The best normalized accuracy and the best macro F1 are tracked
    independently.

    Args:
        x_train, y_train: training features and labels.
        x_val, y_val: validation features and labels.
        verbose: when True, print one line per evaluated ensemble size.
    """
    row_fmt = "n_est = {:>8},  n_acc = {:>8.4f}, f1: {:>8.4f}"

    top_nacc, top_f1 = -1, -1
    for ensemble_size in range(50, 1001, 50):
        booster = AdaBoostClassifier(n_estimators=ensemble_size)
        booster.fit(x_train, y_train)
        predicted = booster.predict(x_val)

        n_acc = normalized_accuracy(y_val, predicted)
        f1 = f1_score(y_val, predicted, average='macro')

        top_f1 = np.max((top_f1, f1))
        top_nacc = np.max((top_nacc, n_acc))
        if verbose:
            print(row_fmt.format(ensemble_size, n_acc, f1))

    print("Best results: n_acc = {:>8.2f}, f1: {:>8.2f}".format(top_nacc, top_f1))

    
#
# Run every grid search on each non-augmented feature set, scoring on the
# matching validation set. Each call prints one "Best results" line.
# Calls that train on the augmented sets are commented out as they take a
# long time to run.
#
print("LogR")
test_log_r(x_train, y_train, x_val, y_val)
test_log_r(x_train_img, y_train_img, x_val_img, y_val_img)
test_log_r(x_train365, y_train365, x_val365, y_val365)
# test_log_r(x_train_aug, y_train_aug, x_val, y_val)
# test_log_r(x_train_img_aug, y_train_img_aug, x_val_img, y_val_img)
# NOTE(review): the call below validates on the augmented 365 set, whereas the
# SVM and random-forest equivalents use x_val365/y_val365 -- confirm which is intended.
# test_log_r(x_train_aug365, y_train_aug365, x_val_aug365, y_val_aug365)

print("SVMs")
test_svms(x_train, y_train, x_val, y_val)
test_svms(x_train_img, y_train_img, x_val_img, y_val_img)
test_svms(x_train365, y_train365, x_val365, y_val365)
# test_svms(x_train_aug, y_train_aug, x_val, y_val)
# test_svms(x_train_img_aug, y_train_img_aug, x_val_img, y_val_img)
# test_svms(x_train_aug365, y_train_aug365, x_val365, y_val365)

print("Random Forests")
test_random_forest(x_train, y_train, x_val, y_val)
test_random_forest(x_train_img, y_train_img, x_val_img, y_val_img)
test_random_forest(x_train365, y_train365, x_val365, y_val365)
# test_random_forest(x_train_aug, y_train_aug, x_val, y_val)
# test_random_forest(x_train_img_aug, y_train_img_aug, x_val_img, y_val_img)
# test_random_forest(x_train_aug365, y_train_aug365, x_val365, y_val365)

# Bagging and AdaBoost are constructed without a random_state, so these
# results can vary between runs.
print("Bagging and boosting")
test_bagging(x_train, y_train, x_val, y_val)
test_adaboost(x_train, y_train, x_val, y_val)
test_bagging(x_train_img, y_train_img, x_val_img, y_val_img)
test_adaboost(x_train_img, y_train_img, x_val_img, y_val_img)
test_bagging(x_train365, y_train365, x_val365, y_val365)
test_adaboost(x_train365, y_train365, x_val365, y_val365)

LogR
Best results: n_acc =     0.71, f1:     0.71
Best results: n_acc =     0.73, f1:     0.73
Best results: n_acc =     0.71, f1:     0.70
SVMs


  'precision', 'predicted', average, warn_for)


Best results: n_acc =     0.73, f1:     0.73
Best results: n_acc =     0.75, f1:     0.75
Best results: n_acc =     0.71, f1:     0.71
Random Forests
Best results: n_acc =   0.7313, f1:   0.7269
Best results: n_acc =   0.7563, f1:   0.7514
Best results: n_acc =   0.7312, f1:   0.7250
Bagging and boosting
Best results: n_acc =     0.69, f1:     0.69
Best results: n_acc =     0.62, f1:     0.62
Best results: n_acc =     0.74, f1:     0.74
Best results: n_acc =     0.53, f1:     0.53
Best results: n_acc =     0.71, f1:     0.71
Best results: n_acc =     0.56, f1:     0.57


In [3]:
#
# Given the best classifiers obtained above in the validation set, this block provides utilities
# to train, store and read them from disk
#

# Creates a list of dictionaries containing the optimal parameters of each classifier
def get_classif_params():
    """Return the selected hyper-parameters of every classifier, one dict each.

    There are 18 entries: for each of the six feature sets (no augmentation /
    augmentation x places, imagenet, 365) an SVM, a logistic regression and a
    random forest, in that order. Key order inside each dict is significant
    because the dict's string form is used to build the pickle file names.

    Returns:
        List of freshly built dicts with the keys "classif", "aug", "net"
        plus the classifier-specific settings ("k"/"c"/"g", "c" or
        "crit"/"n").
    """
    def _svm(c, g, aug, net):
        return {"classif": "svm", "k": 'rbf', "c": c, "g": g, "aug": aug, "net": net}

    def _logr(c, aug, net):
        return {"classif": "logr", "c": c, "aug": aug, "net": net}

    def _rf(crit, n, aug, net):
        return {"classif": "rf", "crit": crit, "n": n, "aug": aug, "net": net}

    return [
        # no aug - places hybrid
        _svm(1, 0.0001, False, "places"),
        _logr(0.001, False, "places"),
        _rf('gini', 170, False, "places"),
        # no aug - imagenet
        _svm(1, 0.0001, False, "imagenet"),
        _logr(10, False, "imagenet"),
        _rf('gini', 790, False, "imagenet"),
        # no aug - places365
        _svm(1, 0.001, False, "365"),
        _logr(0.01, False, "365"),
        _rf('entropy', 560, False, "365"),
        # aug - places
        _svm(0.1, 0.0001, True, "places"),
        _logr(0.0001, True, "places"),
        _rf('gini', 220, True, "places"),
        # aug - imagenet
        _svm(0.1, 0.0001, True, "imagenet"),
        _logr(0.0001, True, "imagenet"),
        _rf('gini', 600, True, "imagenet"),
        # aug - places365
        _svm(10000, 1e-7, True, "365"),
        _logr(0.001, True, "365"),
        _rf('entropy', 600, True, "365"),
    ]

# Train classifiers and save them to disk
def train_and_save_best_classif(x_train, y_train, x_val, y_val, x_train_img, y_train_img, x_val_img, y_val_img, 
                                x_train_aug, y_train_aug, x_train_img_aug, y_train_img_aug,
                                x_train365, y_train365, x_val365, y_val365, 
                                x_train_aug365, y_train_aug365, x_val_aug365, y_val_aug365, preffix="", verbose=False):
    """Train every classifier listed by ``get_classif_params`` and pickle it.

    For each parameter dict the matching training/validation sets are picked
    with ``get_sets``, the classifier is fitted on the training set and then
    written to ``<PICKLE_PATH><preffix>_<params dict>.pickle``.

    Args:
        x_train ... y_val_aug365: all available feature/label sets; which
            ones are used for a given classifier is decided by ``get_sets``.
        preffix: prefix of the pickle file names.
        verbose: when True, print validation normalized accuracy and macro F1
            for each trained classifier.

    Raises:
        ValueError: if a parameter dict names an unknown classifier type.
    """
    for p in get_classif_params():
        xt, yt, xv, yv = get_sets(x_train, y_train, x_val, y_val, x_train_img, y_train_img, x_val_img, y_val_img,
                                  x_train_aug, y_train_aug, x_train_img_aug, y_train_img_aug,
                                  x_train365, y_train365, x_val365, y_val365,
                                  x_train_aug365, y_train_aug365, x_val_aug365, y_val_aug365, p)

        kind = p["classif"]
        if kind == 'svm':
            classif = SVC(C=p["c"], kernel=p["k"], gamma=p["g"])
        elif kind == 'logr':
            classif = LogisticRegression(C=p["c"], multi_class='ovr', solver='liblinear')
        elif kind == "rf":
            classif = RandomForestClassifier(n_estimators=p["n"], criterion=p["crit"], n_jobs=-1, random_state=1107)
        else:
            # Without this, an unknown type would silently re-save the previous model
            raise ValueError("unknown classifier type: {}".format(kind))
        classif.fit(xt, yt)

        # The file name embeds str(p); load_best_classif rebuilds it the same way
        with open('{}{}_{}.pickle'.format(PICKLE_PATH, preffix, p), 'wb') as out_file:
            pickle.dump(classif, out_file)

        if verbose:
            preds = classif.predict(xv)
            n_acc = normalized_accuracy(yv, preds)
            # Score against the labels of the selected validation set (yv)
            f1 = f1_score(yv, preds, average='macro')
            print("params: {}, n_acc: {:.4f}, f1 macro: {:.4f}".format(p, n_acc, f1))

# Load pre-trained classifiers from disk
def load_best_classif(preffix):
    """Load the pickled classifiers, one per entry of ``get_classif_params``.

    Args:
        preffix: prefix of the pickle file names, as used when saving.

    Returns:
        List of unpickled classifiers, in the order of ``get_classif_params``.
    """
    classifs = []
    for p in get_classif_params():
        pickle_file = '{}{}_{}.pickle'.format(PICKLE_PATH, preffix, p)
        # NOTE: pickle.load can execute arbitrary code; only load files that
        # this project wrote itself.
        with open(pickle_file, 'rb') as in_file:
            classifs.append(pickle.load(in_file))
    return classifs

# Auxiliary method that returns the training and validation sets for the corresponding classifier
def get_sets(x_train, y_train, x_val, y_val, x_train_img, y_train_img, x_val_img, y_val_img, 
                                x_train_aug, y_train_aug, x_train_img_aug, y_train_img_aug,
                                x_train365, y_train365, x_val365, y_val365, 
                                x_train_aug365, y_train_aug365, x_val_aug365, y_val_aug365, p):
    """Pick the training and validation sets that match a classifier's parameters.

    Classifiers trained on augmented data are still validated on the
    non-augmented validation set of the same network, so ``x_val_aug365`` and
    ``y_val_aug365`` are accepted but never returned.

    Args:
        x_train ... y_val_aug365: all available feature/label sets.
        p: parameter dict with the keys "aug" (bool) and "net"
            ("places", "imagenet" or "365").

    Returns:
        Tuple ``(xt, yt, xv, yv)``: training features/labels and validation
        features/labels.

    Raises:
        ValueError: if the ("aug", "net") combination is not supported.
    """
    sets_by_config = {
        (False, "places"): (x_train, y_train, x_val, y_val),
        (False, "imagenet"): (x_train_img, y_train_img, x_val_img, y_val_img),
        (False, "365"): (x_train365, y_train365, x_val365, y_val365),
        (True, "places"): (x_train_aug, y_train_aug, x_val, y_val),
        (True, "imagenet"): (x_train_img_aug, y_train_img_aug, x_val_img, y_val_img),
        (True, "365"): (x_train_aug365, y_train_aug365, x_val365, y_val365),
    }

    config = (p["aug"], p["net"])
    if config not in sets_by_config:
        # Explicit error instead of falling through with unbound locals
        raise ValueError("unsupported aug/net combination: {}".format(config))
    return sets_by_config[config]
 
# Auxiliary method that returns the test set for the corresponding classifier
def get_test_sets(x_test, y_test, x_test_img, y_test_img, x_test365, y_test365, params):
    """Pick the test features and labels that match a classifier's network.

    Args:
        x_test, y_test: test set for the "places" network.
        x_test_img, y_test_img: test set for the "imagenet" network.
        x_test365, y_test365: test set for the "365" network.
        params: parameter dict; only its "net" key is read.

    Returns:
        Tuple ``(xt, yt)`` with the selected features and labels.

    Raises:
        ValueError: if ``params["net"]`` is not a supported network name.
    """
    sets_by_net = {
        "places": (x_test, y_test),
        "imagenet": (x_test_img, y_test_img),
        "365": (x_test365, y_test365),
    }

    net = params["net"]
    if net not in sets_by_net:
        # Explicit error instead of falling through with unbound locals
        raise ValueError("unsupported net: {}".format(net))
    return sets_by_net[net]
    
# Train the selected classifiers and save them to disk for later reuse.
# The "4class" prefix must match the one passed to load_best_classif.
train_and_save_best_classif(x_train, y_train, x_val, y_val, x_train_img, y_train_img, x_val_img, y_val_img,
                                x_train_aug, y_train_aug, x_train_img_aug, y_train_img_aug,
                                x_train365, y_train365, x_val365, y_val365, 
                                x_train_aug365, y_train_aug365, x_val_aug365, y_val_aug365, preffix="4class")

In [5]:
# Now, check the accuracy of our classifiers in the test dataset
def check_test_accuracy(x_test, y_test, x_test_img, y_test_img, x_test365, y_test365):
    """Score every stored "4class" classifier on its test set and print the results.

    Args:
        x_test, y_test: test set for the "places" network.
        x_test_img, y_test_img: test set for the "imagenet" network.
        x_test365, y_test365: test set for the "365" network.

    Returns:
        Tuple ``(preds, confs)``: the predictions of the last classifier
        evaluated (None if no classifier was loaded) and the list of
        confusion matrices, one per classifier in evaluation order.
    """
    classifs = load_best_classif("4class")
    params = get_classif_params()

    preds = None  # stays None when there is nothing to evaluate
    confs = []
    for classif, p in zip(classifs, params):
        xt, yt = get_test_sets(x_test, y_test, x_test_img, y_test_img, x_test365, y_test365, p)

        preds = classif.predict(xt)
        # Previously this list was returned empty; fill it as its name promises
        confs.append(confusion_matrix(yt, preds))

        n_acc = normalized_accuracy(yt, preds)
        f1 = f1_score(yt, preds, average='macro')
        print("{}: n_acc: {:.2f}, f1 macro: {:.2f}".format(classif.__class__.__name__, n_acc, f1))
    return preds, confs

# Print the test-set scores of every stored classifier (the returned tuple is not kept)
check_test_accuracy(x_test, y_test, x_test_img, y_test_img, x_test365, y_test365);

SVC: n_acc: 0.73, f1 macro: 0.73
LogisticRegression: n_acc: 0.70, f1 macro: 0.70
RandomForestClassifier: n_acc: 0.73, f1 macro: 0.73
SVC: n_acc: 0.73, f1 macro: 0.73
LogisticRegression: n_acc: 0.72, f1 macro: 0.72
RandomForestClassifier: n_acc: 0.72, f1 macro: 0.72
SVC: n_acc: 0.64, f1 macro: 0.63
LogisticRegression: n_acc: 0.61, f1 macro: 0.60
RandomForestClassifier: n_acc: 0.62, f1 macro: 0.61
SVC: n_acc: 0.72, f1 macro: 0.72
LogisticRegression: n_acc: 0.71, f1 macro: 0.71
RandomForestClassifier: n_acc: 0.72, f1 macro: 0.72
SVC: n_acc: 0.71, f1 macro: 0.71
LogisticRegression: n_acc: 0.72, f1 macro: 0.72
RandomForestClassifier: n_acc: 0.73, f1 macro: 0.73
SVC: n_acc: 0.59, f1 macro: 0.58
LogisticRegression: n_acc: 0.61, f1 macro: 0.61
RandomForestClassifier: n_acc: 0.63, f1 macro: 0.61


In [None]:
#
# Fusion code - Majority vote and meta-learning
#

# Returns predictions for all classifiers and all datasets. Used for majority voting and meta-learning
def get_all_preds(x_train, y_train, x_val, y_val, x_train_img, y_train_img, x_val_img, y_val_img,
                                x_train_aug, y_train_aug, x_train_img_aug, y_train_img_aug,
                                x_train365, y_train365, x_val365, y_val365, 
                                x_train_aug365, y_train_aug365, x_val_aug365, y_val_aug365,
                                x_test=None, y_test = None, x_test_img = None, y_test_img = None,
                                x_test365=None, y_test365 = None, tests=False):
    """Collect the predictions of every stored "4class" classifier.

    Args:
        x_train ... y_val_aug365: all available train/validation sets.
        x_test ... y_test365: test sets; only needed when ``tests`` is True.
        tests: when True, predict on the test sets; otherwise on the
            validation set that ``get_sets`` selects for each classifier.

    Returns:
        Array with one row per classifier and one column per sample. The
        column count is taken from ``x_test`` (tests) or ``x_val``.
    """
    classifs = load_best_classif("4class")
    params = get_classif_params()

    n_samples = x_test.shape[0] if tests else x_val.shape[0]
    preds = np.zeros((len(classifs), n_samples))

    for row, (clf, p) in enumerate(zip(classifs, params)):
        if tests:
            inputs, _ = get_test_sets(x_test, y_test, x_test_img, y_test_img, x_test365, y_test365, p)
        else:
            _, _, inputs, _ = get_sets(x_train, y_train, x_val, y_val,
                                       x_train_img, y_train_img, x_val_img, y_val_img,
                                       x_train_aug, y_train_aug, x_train_img_aug, y_train_img_aug,
                                       x_train365, y_train365, x_val365, y_val365,
                                       x_train_aug365, y_train_aug365, x_val_aug365, y_val_aug365, p)
        preds[row] = clf.predict(inputs)
    return preds

# Same for the train data-set
def get_preds_train(x_train, y_train, x_val, y_val, x_train_img, y_train_img, x_val_img, y_val_img,
                                x_train_aug, y_train_aug, x_train_img_aug, y_train_img_aug,
                                x_train365, y_train365, x_val365, y_val365, 
                                x_train_aug365, y_train_aug365, x_val_aug365, y_val_aug365):
    """Collect training-set predictions of the non-augmented "4class" classifiers.

    Only classifiers whose parameters have ``"aug"`` set to False are used,
    each predicting on the training set that ``get_sets`` selects for it.

    Args:
        x_train ... y_val_aug365: all available train/validation sets.

    Returns:
        Array with one row per non-augmented classifier (in the order of
        ``get_classif_params``) and one column per sample of ``x_train``.
    """
    classifs = load_best_classif("4class")
    params = get_classif_params()

    # Number the output rows independently of the position in the full list,
    # so the result does not depend on the non-augmented entries coming first
    # or being exactly half of all classifiers.
    selected = [(clf, p) for clf, p in zip(classifs, params) if not p["aug"]]
    preds_train = np.zeros((len(selected), x_train.shape[0]))

    for row, (clf, p) in enumerate(selected):
        xt, _, _, _ = get_sets(x_train, y_train, x_val, y_val,
                               x_train_img, y_train_img, x_val_img, y_val_img,
                               x_train_aug, y_train_aug, x_train_img_aug, y_train_img_aug,
                               x_train365, y_train365, x_val365, y_val365,
                               x_train_aug365, y_train_aug365, x_val_aug365, y_val_aug365, p)
        preds_train[row] = clf.predict(xt)
    return preds_train

# Test majority voting
def test_majority_vote(y_val, preds):
    """Fuse classifier predictions by majority vote and print the scores.

    For every sample (column of ``preds``) the most frequent label is kept;
    ``np.argmax`` returns the first maximum, so ties go to the smallest label.

    Args:
        y_val: ground-truth labels of the evaluated samples.
        preds: 2-D array with one row per classifier and one column per sample.
    """
    votes = preds.astype('uint32')
    n_samples = preds.shape[1]
    majority_preds = np.array(
        [np.argmax(np.bincount(votes[:, col])) for col in range(n_samples)],
        dtype=float,
    )

    n_acc = normalized_accuracy(y_val, majority_preds)
    f1 = f1_score(y_val, majority_preds, average='macro')
    print("Majority vote: n_acc: {:.2f}, f1: {:.2f}".format(n_acc, f1))


# Test meta-learning
def test_meta_learning(y_train, y_val, preds_train, preds_val, verbose=False):
    """Grid-search an RBF SVC trained on classifier predictions and print the best scores.

    The transposed prediction arrays are used as feature matrices, so each
    base classifier contributes one feature. C is swept over 10^-5 .. 10^4
    and gamma over 10^-8 .. 10^7; the best normalized accuracy and the best
    macro F1 are tracked independently.

    Args:
        y_train: labels used to fit the meta-classifier.
        y_val: labels used to score it.
        preds_train: base-classifier predictions on the training samples,
            one row per classifier.
        preds_val: base-classifier predictions on the evaluated samples,
            with the same number of rows as ``preds_train``.
        verbose: when True, print one line per evaluated setting.
    """
    # The stored classifiers are not needed here: their predictions are
    # passed in, so nothing is loaded from disk.
    x_meta_train = np.transpose(preds_train)
    x_meta_val = np.transpose(preds_val)
    cs = [10**i for i in range(-5, 5)]
    gammas = [10**i for i in range(-8, 8)]

    best_f1, best_nacc = -1, -1

    for c in cs:
        for g in gammas:
            clf = SVC(C=c, kernel='rbf', gamma=g)
            clf.fit(x_meta_train, y_train)
            preds = clf.predict(x_meta_val)
            n_acc = normalized_accuracy(y_val, preds)
            f1 = f1_score(y_val, preds, average='macro')
            best_f1 = np.max((best_f1, f1))
            best_nacc = np.max((best_nacc, n_acc))
            if verbose:
                print("c: {}, n_acc: {:.2f}, f1: {:.2f}".format(c, n_acc, f1))
    print("Meta-Learning: n_acc: {:.2f}, f1: {:.2f}".format(best_nacc, best_f1))
            
# Get predictions of the non-augmented classifiers on the training set
# (used as training features for the meta-learner)
preds_train = get_preds_train(x_train, y_train, x_val, y_val, x_train_img, y_train_img, x_val_img, y_val_img,
                                x_train_aug, y_train_aug, x_train_img_aug, y_train_img_aug,
                                x_train365, y_train365, x_val365, y_val365, 
                                x_train_aug365, y_train_aug365, x_val_aug365, y_val_aug365)

# Get predictions of all classifiers on the validation set
preds_val = get_all_preds(x_train, y_train, x_val, y_val, x_train_img, y_train_img, x_val_img, y_val_img,
                                x_train_aug, y_train_aug, x_train_img_aug, y_train_img_aug,
                                x_train365, y_train365, x_val365, y_val365, 
                                x_train_aug365, y_train_aug365, x_val_aug365, y_val_aug365)

# And for tests
preds_test = get_all_preds(x_train, y_train, x_val, y_val, x_train_img, y_train_img, x_val_img, y_val_img,
                                x_train_aug, y_train_aug, x_train_img_aug, y_train_img_aug,
                                x_train365, y_train365, x_val365, y_val365, 
                                x_train_aug365, y_train_aug365, x_val_aug365, y_val_aug365,
                                x_test, y_test, x_test_img, y_test_img, x_test365, y_test365, tests=True)

# Test majority voting and meta learning on validation set - First for non-augmented classifiers only,
# then for augmented only, and then for both.
# Rows 0-8 are the non-augmented classifiers and rows 9-17 the augmented ones,
# following the order of get_classif_params. Meta-learning only uses rows 0-8
# because preds_train only holds the non-augmented classifiers.
print("Validation set")
test_majority_vote(y_val, preds_val[:9,:])
test_majority_vote(y_val, preds_val[9:,:])
test_majority_vote(y_val, preds_val)
test_meta_learning(y_train, y_val, preds_train, preds_val[:9])


# Same for test dataset
# NOTE(review): test_meta_learning reports the best score over its whole
# hyper-parameter grid, so calling it with the test labels selects the
# meta-classifier settings on the test set itself -- the printed figure is
# likely optimistic; confirm whether this is intended.
print("\nTest Dataset")
test_majority_vote(y_test, preds_test[:9,:])
test_majority_vote(y_test, preds_test[9:,:])
test_majority_vote(y_test, preds_test)
test_meta_learning(y_train, y_test, preds_train, preds_test[:9])