# Ensemble Version 1

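Ensemble of 8 different CNN models. Their softmax predictions are combined as a weighted sum in which each model's weight is the square of its Kaggle LB score (the weights are not normalized to sum to 1).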

## Imports

Import libraries

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

import pickle, os
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow import set_random_seed
import keras
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping, ModelCheckpoint

from src.sampler import sample_weighter
from src.model_api import (getSimpleModel, getSimpleModelV2, getSimpleModelV3, getSimpleModelV4,
                           getSimpleModelV5, getSimpleModelV6, getSimpleModelV7, getSimpleModelV8)
from src.prediction import array_filter, array_to_string
from src.custom_metric import as_keras_metric

Import data

In [3]:
# Load the pickled training images and their labels; the tuple order below
# matches the unpacking order (images first, labels second).
train_pickle_paths = (
    'data/derived/data_train.pickle',
    'data/derived/labels_train.pickle',
)
train_objects = []
for pickle_path in train_pickle_paths:
    with open(pickle_path, 'rb') as handle:
        train_objects.append(pickle.load(handle))
data_train, labels_train = train_objects

# Sanity check: the number of images should equal the number of labels.
print('Shape of data_train:', data_train.shape)
print('Length of labels_train:', len(labels_train))

Shape of data_train: (25361, 100, 100, 3)
Length of labels_train: 25361


In [4]:
# Load the pickled test images and the file names they came from; the tuple
# order below matches the unpacking order (images first, file names second).
test_pickle_paths = (
    'data/derived/data_test.pickle',
    'data/derived/test_file_names.pickle',
)
test_objects = []
for pickle_path in test_pickle_paths:
    with open(pickle_path, 'rb') as handle:
        test_objects.append(pickle.load(handle))
data_test, filenames_test = test_objects

# Sanity check: the number of images should equal the number of file names.
print('Shape of data_test:', data_test.shape)
print('Length of filename_test:', len(filenames_test))

Shape of data_test: (7960, 100, 100, 3)
Length of filename_test: 7960


Convert labels to integers for model training

In [5]:
# Fit the encoder on the training labels and encode them in one pass.
# label_encoder.classes_ is reused later to map predictions back to label names.
label_encoder = LabelEncoder()
labels_train_encoded = label_encoder.fit_transform(labels_train)

# Show the first five entries of each representation side by side.
label_previews = (
    ('Original labels: {}', labels_train[:5]),
    ('Encoded labels: {}', labels_train_encoded[:5]),
    ('Label encoder classes: {}', label_encoder.classes_[:5]),
)
for template, preview in label_previews:
    print(template.format(preview))

Original labels: ['new_whale', 'new_whale', 'w_75d0e61', 'w_396c12b', 'w_d8de44c']
Encoded labels: [   0    0 2308 1134 4209]
Label encoder classes: ['new_whale' 'w_0003639' 'w_0003c59' 'w_0027efa' 'w_00289b1']


Compute sample size and number of classes

In [7]:
# Number of training samples and number of distinct encoded classes.
# num_classes is passed to every model builder further down.
sample_size = labels_train_encoded.shape[0]
num_classes = np.unique(labels_train_encoded).size
print('Sample size:', sample_size)
print('Number of clases:', num_classes)

Sample size: 25361
Number of clases: 5005


## Build ensemble

Create metrics and load base models

In [8]:
# Metrics handed to every model builder: top-k categorical accuracy and F1.
# Note: top_k_categorical_accuracy function in keras.metrics has default k value of 5
top_5_categorical_accuracy = keras.metrics.top_k_categorical_accuracy
f1_score = as_keras_metric(tf.contrib.metrics.f1_score)
metric_list = [top_5_categorical_accuracy, f1_score]

# One (builder, weights file) pair per base model, in ensemble order.
model_specs = (
    (getSimpleModel, 'models/weights-model1-final.hdf5'),
    (getSimpleModelV2, 'models/weights-model2-final.hdf5'),
    (getSimpleModelV3, 'models/weights-model3-final.hdf5'),
    (getSimpleModelV4, 'models/weights-model4-final.hdf5'),
    (getSimpleModelV5, 'models/weights-model5-final.hdf5'),
    (getSimpleModelV6, 'models/weights-model6-final.hdf5'),
    (getSimpleModelV7, 'models/weights-model7-final.hdf5'),
    (getSimpleModelV8, 'models/weights-model8-final.hdf5'),
)

# Build each architecture with identical arguments, then restore its saved weights.
base_models = []
for build_model, weights_path in model_specs:
    base_model = build_model(num_classes=num_classes, resize_width=100, metric_list=metric_list)
    base_model.load_weights(weights_path)
    base_models.append(base_model)

# Keep the individual names that the prediction cell refers to.
model1, model2, model3, model4, model5, model6, model7, model8 = base_models

Obtain predictions from base models

In [9]:
# Scale the test images by 1/255 and store them as float32.
# The cast happens *before* the division: dividing an integer image array by
# 255 first would materialise a full-size float64 temporary (twice the memory
# of the float32 result) only to down-cast it immediately afterwards.
# NOTE(review): assumes data_test holds integer pixel values (e.g. uint8 in
# 0..255), for which the float32 cast is exact — confirm against the pickle.
data_test_norm = data_test.astype(np.float32) / 255

# Run every base model on the same normalized test set, in ensemble order.
# The individual pred1..pred8 names are kept because the averaging cell uses them.
pred1, pred2, pred3, pred4, pred5, pred6, pred7, pred8 = [
    base_model.predict(data_test_norm)
    for base_model in (model1, model2, model3, model4, model5, model6, model7, model8)
]

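Compute the weighted sum of the base-model predictions, weighting each model by the square of its Kaggle LB score (the weights are not normalized)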

In [10]:
# Per-model score for model1..model8, in order (the Kaggle LB scores per the notebook summary).
lb_scores = (0.207, 0.307, 0.302, 0.315, 0.285, 0.334, 0.301, 0.329)
base_preds = (pred1, pred2, pred3, pred4, pred5, pred6, pred7, pred8)

# Each model's predictions are weighted by its squared score and accumulated
# left to right. The weights are not normalized, so this is a weighted sum
# rather than a true average.
average_preds = lb_scores[0] ** 2 * base_preds[0]
for lb_score, base_pred in zip(lb_scores[1:], base_preds[1:]):
    average_preds = average_preds + lb_score ** 2 * base_pred

For each row, obtain the labels with the five highest combined softmax values and join them into a single space-separated string

In [11]:
# Both helpers are applied to 1-D slices taken along axis 1, i.e. once per row.
row_axis = 1

# Call array_filter on each row of combined predictions with n_top=5 and the
# encoder's class names.
# NOTE(review): despite the name, top5_indices presumably holds label strings,
# since class names are passed in and the submission shows labels — confirm
# against src.prediction.array_filter.
top5_indices = np.apply_along_axis(
    array_filter,
    row_axis,
    average_preds,
    n_top=5,
    labels=label_encoder.classes_,
)

# Collapse each row of results into one value per test image via array_to_string.
predictions_array = np.apply_along_axis(array_to_string, row_axis, top5_indices)

Create submission DataFrame and export as CSV file

In [12]:
# Pair every test file name with its prediction string; dict order sets the
# column order (Image first, then Id).
submission_path = 'data/derived/submission_ensemble_1.csv'
submission_columns = {'Image': filenames_test, 'Id': predictions_array}
submission_df = pd.DataFrame(submission_columns)

# Write without the index so the CSV contains only the Image and Id columns.
submission_df.to_csv(submission_path, index=False)

# Preview the first rows as a quick visual check.
submission_preview = submission_df.head()
print(submission_preview)

           Image                                                 Id
0  660352b03.jpg  new_whale w_af367c3 w_8dc6c05 w_9b565fa w_b9c99cc
1  bec66f23c.jpg  new_whale w_bbfce38 w_c0d11da w_9c506f6 w_e906edd
2  fb8c2c146.jpg  new_whale w_a4ac5dd w_bf960fa w_dba1c08 w_16def42
3  0ff9cd790.jpg  new_whale w_34120de w_71b9a85 w_3815890 w_584e1dc
4  861e6c332.jpg  new_whale w_8c25681 w_6822dbc w_4f9c015 w_564a34b


Kaggle score: 0.346