# Ensemble Version 2

Ensemble that averages the predictions of several simple CNN models.

## Imports

Import libraries

In [1]:
# Enable IPython's autoreload extension in mode 2 (reload all modules before running
# each cell), so edits to the project's `src` helpers are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd

import pickle
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow import set_random_seed
from keras.preprocessing.image import ImageDataGenerator
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping, ModelCheckpoint

from src.sampler import up_down_sampler
from src.model_api import getSimpleModel
from src.prediction import array_filter, array_to_string
from src.custom_metric import as_keras_metric

Using TensorFlow backend.


Import data

In [3]:
def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE: pickle can execute arbitrary code on load; only read files produced by this project.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Training images and labels, test images, and the test file names (same order as data_test).
data_train = _read_pickle('data/derived/data_train_v3.pickle')
labels_train = _read_pickle('data/derived/labels_train_v3.pickle')
data_test = _read_pickle('data/derived/data_test_v2.pickle')
filenames_test = _read_pickle('data/derived/test_file_names.pickle')

# Sanity check: the train arrays and the test arrays should each agree on the number of samples.
print('Shape of data_train:', data_train.shape)
print('Length of labels_train:', len(labels_train))
print('Shape of data_test:', data_test.shape)
print('Length of filename_test:', len(filenames_test))

Shape of data_train: (15697, 128, 128, 3)
Length of labels_train: 15697
Shape of data_test: (7960, 128, 128, 3)
Length of filename_test: 7960


Convert labels to integers for model training

In [4]:
# Learn the label -> integer mapping from the training labels, then apply it.
label_encoder = LabelEncoder().fit(labels_train)
labels_train_encoded = label_encoder.transform(labels_train)

# Preview the first five raw labels, their integer codes, and the first five known classes.
for caption, preview in (('Original labels: {}', labels_train[:5]),
                         ('Encoded labels: {}', labels_train_encoded[:5]),
                         ('Label encoder classes: {}', label_encoder.classes_[:5])):
    print(caption.format(preview))

Original labels: ['w_f48451c', 'w_c3d896a', 'w_20df2c5', 'w_dd88965', 'w_64404ac']
Encoded labels: [4785 3807  661 4314 1928]
Label encoder classes: ['w_0003639' 'w_0003c59' 'w_0027efa' 'w_00289b1' 'w_002c810']


Compute sample size and number of classes

In [5]:
# Number of training samples and number of distinct encoded labels.
sample_size = labels_train_encoded.shape[0]
num_classes = np.unique(labels_train_encoded).size
print('Sample size:', sample_size)
print('Number of clases:', num_classes)

Sample size: 15697
Number of clases: 5004


## Create Base Models

In [None]:
# Scale the training pixels by 1/255 and store the result as float32.
data_train_norm = np.true_divide(data_train, 255).astype(np.float32)

In [7]:
# --- Callbacks ---
# Checkpoint file names embed the epoch number and the validation loss.
filepath = 'models/weights-{epoch:02d}-{val_loss:.3f}.hdf5'
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
)
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# --- Metrics ---
# Wrap the TensorFlow precision/recall metrics so they can be passed to the model as metrics.
precision, recall = (as_keras_metric(tf_metric)
                     for tf_metric in (tf.metrics.precision, tf.metrics.recall))
metric_list = ['accuracy', precision, recall]

# --- Network ---
model = getSimpleModel(num_classes=num_classes, resize_width=128, metric_list=metric_list)

# --- Augmentation ---
batch_size = 32
augmentation_options = dict(
    rotation_range=20,
    width_shift_range=.1,
    height_shift_range=.1,
    shear_range=0.5,
    zoom_range=(0.9, 1.1),
    fill_mode='constant',
    horizontal_flip=True,
)
image_gen = ImageDataGenerator(**augmentation_options)

Train n base models (random seeds 1 to n)

In [9]:
# Train `model_no_total` base models. The model number doubles as the random seed for the
# sampler, the augmentation stream, NumPy and TensorFlow.
model_no_total = 3
for model_no in range(1, model_no_total + 1):
    # Seed before anything random happens for this member, including weight initialisation.
    np.random.seed(model_no)
    set_random_seed(model_no)

    # Build a fresh network for every member. Reusing a single instance across iterations
    # would make member k continue training from member k-1's weights, so the saved
    # "base models" would not be independently trained.
    model = getSimpleModel(num_classes=num_classes, resize_width=128, metric_list=metric_list)

    # Draw this member's resampled training indices and its validation indices.
    sample_index, validation_index = up_down_sampler(labels_array=labels_train_encoded,
                                                     validation_size=1000,
                                                     size_per_class=20,
                                                     random_state=model_no)
    X_train = data_train_norm[sample_index]
    X_test = data_train_norm[validation_index]
    y_train = labels_train_encoded[sample_index]
    y_test = labels_train_encoded[validation_index]

    # One-hot encode the integer labels for training.
    y_train = to_categorical(y_train, num_classes=num_classes)
    y_test = to_categorical(y_test, num_classes=num_classes)

    image_generator_samples = image_gen.flow(X_train, y_train, batch_size=batch_size, seed=model_no)

    train_size, epochs = X_train.shape[0], 10
    # NOTE(review): steps_per_epoch counts batches, yet this expression does not involve
    # batch_size (train_size // batch_size would be one pass over the data) - confirm intent.
    # NOTE(review): `checkpoint` is shared by all members and may carry its best val_loss
    # over from one member to the next - confirm, or create one callback per member.
    model.fit_generator(image_generator_samples, steps_per_epoch=5*train_size//epochs,
                        validation_data=(X_test, y_test), epochs=epochs, shuffle=True,
                        callbacks=[checkpoint, early_stop])

    model.save_weights('models/weights_simple_{}.hdf5'.format(model_no))

MemoryError: 

Load model weights and compute ensemble average

In [6]:
# Average the predictions of the saved base models over the test set.
# The network is built once; each member's trained weights are loaded into it in turn.
model = getSimpleModel(num_classes=num_classes, resize_width=128, metric_list=metric_list)
# Same /255 scaling and float32 cast that is applied to the training images.
data_test_norm = (data_test / 255).astype(np.float32)
model_no_total = 3
# Accumulator with one row per test image and one column per class.
# Fixed: the original called len(filenames_test, num_classes), which raises TypeError
# because len() accepts exactly one argument.
average_preds = np.zeros((len(filenames_test), num_classes))
for model_no in range(1, model_no_total + 1):
    model.load_weights('models/weights_simple_{}.hdf5'.format(model_no))
    pred = model.predict(data_test_norm)
    # Each member contributes an equal 1/model_no_total share to the average.
    average_preds += pred / model_no_total

For each test image, take the labels with the five highest averaged softmax values and join them into a single space-separated string

In [16]:
# Row by row: pick the top labels from the averaged predictions, then join each row's
# labels into one string.
filter_options = dict(n_top=5, labels=label_encoder.classes_, threshold=0.02)
top5_indices = np.apply_along_axis(array_filter, 1, average_preds, **filter_options)
predictions_array = np.apply_along_axis(array_to_string, 1, top5_indices)

Create submission DataFrame and export as CSV file

In [19]:
# Pair every test file name with its predicted label string and write the submission file.
submission_columns = {'Image': filenames_test, 'Id': predictions_array}
submission_df = pd.DataFrame(submission_columns)
submission_df.to_csv('data/derived/submission_v6.csv', index=False)
print(submission_df.head())

           Image                                                 Id
0  c303faac6.jpg  new_whale w_17b0d3a w_789c969 w_67a9841 w_a9304b9
1  96c2b7290.jpg  new_whale w_af367c3 w_8c25681 w_6822dbc w_f765256
2  69f6cd44f.jpg  new_whale w_23a388d w_d405854 w_5773c71 w_03670aa
3  a965dea33.jpg  new_whale w_1f0cf0a w_3de579a w_985d205 w_cd4cb49
4  9a225e056.jpg  new_whale w_5a2634c w_700ebb4 w_0a155b9 w_23a388d


Kaggle score: 0.286