# Ensemble Version 1

Ensemble averages consisting of simple CNN models

## Imports

Import libraries

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd

import pickle
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow import set_random_seed
from keras.preprocessing.image import ImageDataGenerator
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping, ModelCheckpoint

from src.sampler import undersampler
from src.model_api import getSimpleModel
from src.prediction import array_filter, array_to_string
from src.custom_metric import as_keras_metric

Using TensorFlow backend.


Import data

In [3]:
with open('data/derived/data_train_v3.pickle', 'rb') as file:
    data_train = pickle.load(file)
with open('data/derived/labels_train_v3.pickle', 'rb') as file:
    labels_train = pickle.load(file)
with open('data/derived/data_test_v2.pickle', 'rb') as file:
    data_test = pickle.load(file)
with open('data/derived/test_file_names.pickle', 'rb') as file:
    filenames_test = pickle.load(file)
print('Shape of data_train:', data_train.shape)
print('Length of labels_train:', len(labels_train))
print('Shape of data_test:', data_test.shape)
print('Length of filename_test:', len(filenames_test))

Shape of data_train: (15697, 128, 128, 3)
Length of labels_train: 15697
Shape of data_test: (7960, 128, 128, 3)
Length of filename_test: 7960


Convert labels to integers for model training

In [4]:
label_encoder = LabelEncoder()
labels_train_encoded = label_encoder.fit_transform(labels_train)
print('Original labels: {}'.format(labels_train[:5]))
print('Encoded labels: {}'.format(labels_train_encoded[:5]))
print('Label encoder classes: {}'.format(label_encoder.classes_[:5]))

Original labels: ['w_f48451c', 'w_c3d896a', 'w_20df2c5', 'w_dd88965', 'w_64404ac']
Encoded labels: [4785 3807  661 4314 1928]
Label encoder classes: ['w_0003639' 'w_0003c59' 'w_0027efa' 'w_00289b1' 'w_002c810']


Compute sample size and number of classes

In [5]:
sample_size = len(labels_train_encoded)
num_classes = len(set(labels_train_encoded))
print('Sample size:', sample_size)
print('Number of clases:', num_classes)

Sample size: 15697
Number of clases: 5004


## Create Base Models

In [6]:
# Normalize data_train
data_train_norm = data_train / 255

In [7]:
filepath = 'models/weights-{epoch:02d}-{val_loss:.3f}.hdf5'
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', save_best_only=True, save_weights_only=True,
                             mode='min')
early_stop = EarlyStopping(patience=2, monitor='val_loss', restore_best_weights=True)

# Create precision and recall metrics
precision = as_keras_metric(tf.metrics.precision)
recall = as_keras_metric(tf.metrics.recall)
metric_list = ['accuracy', precision, recall]

model = getSimpleModel(num_classes=num_classes, resize_width=128, metric_list=metric_list)

batch_size = 32
image_gen = ImageDataGenerator(rotation_range=20,
                               width_shift_range=.1,
                               height_shift_range=.1,
                               shear_range=0.5,
                               zoom_range=(0.9, 1.1),
                               fill_mode='constant',
                               horizontal_flip=True)

Train n base models (random seeds 1 to n)

In [None]:
model_no_total = 3
for model_no in range(1, model_no_total + 1):
    sample_index, validation_index = undersampler(labels_array=labels_train_encoded, validation_size=5000,
                                                  random_state=model_no)
    X_train = data_train_norm[sample_index]
    X_test = data_train_norm[validation_index]
    y_train = labels_train_encoded[sample_index]
    y_test = labels_train_encoded[validation_index]
    
    y_train = to_categorical(y_train, num_classes=num_classes)
    y_test = to_categorical(y_test, num_classes=num_classes)

    image_generator_samples = image_gen.flow(X_train, y_train, batch_size=batch_size, seed=model_no)

    np.random.seed(model_no)
    set_random_seed(model_no)
    train_size, epochs = X_train.shape[0], 10
    model.fit_generator(image_generator_samples, steps_per_epoch=5*train_size//epochs, 
                        validation_data=(X_test, y_test), epochs=epochs, shuffle=True,
                        callbacks=[checkpoint, early_stop])

    model.save_weights('models/weights_simple_{}.hdf5'.format(model_no))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
 395/2502 [===>..........................] - ETA: 2:43 - loss: 8.5237 - acc: 0.0000e+00 - precision: 1.9982e-04 - recall: 1.0000

Load model weights

In [6]:
resnet50_model_1.load_weights('models/weights_resnet50_1.hdf5')
resnet50_model_2.load_weights('models/weights_resnet50_2.hdf5')
resnet50_model_3.load_weights('models/weights_resnet50_3.hdf5')
resnet50_model_4.load_weights('models/weights_resnet50_4.hdf5')
densenet201_model_1.load_weights('models/weights_densenet201_1.hdf5')
densenet201_model_2.load_weights('models/weights_densenet201_2.hdf5')
densenet201_model_3.load_weights('models/weights_densenet201_3.hdf5')
densenet201_model_4.load_weights('models/weights_densenet201_4.hdf5')

## Ensemble v1.1

All model results having same weightage

Add model predictions

In [7]:
resnet50_preds_1 = resnet50_model_1.predict(resnet50_preprocess(data_test))
resnet50_preds_2 = resnet50_model_2.predict(resnet50_preprocess(data_test))
resnet50_preds_3 = resnet50_model_3.predict(resnet50_preprocess(data_test))
resnet50_preds_4 = resnet50_model_4.predict(resnet50_preprocess(data_test))
densenet201_preds_1 = densenet201_model_1.predict(densenet201_preprocess(data_test))
densenet201_preds_2 = densenet201_model_2.predict(densenet201_preprocess(data_test))
densenet201_preds_3 = densenet201_model_3.predict(densenet201_preprocess(data_test))
densenet201_preds_4 = densenet201_model_4.predict(densenet201_preprocess(data_test))
overall_preds = resnet50_preds_1 + resnet50_preds_2 + resnet50_preds_3 + resnet50_preds_4 +\
densenet201_preds_1 + densenet201_preds_2 + densenet201_preds_3 + densenet201_preds_4

Obtain labels with top 5 softmax values for each array row and concatenate labels

In [16]:
top5_indices = np.apply_along_axis(array_filter, arr=overall_preds, axis=1, n_top=5, labels=label_encoder.classes_)
predictions_array = np.apply_along_axis(array_to_string, arr=top5_indices, axis=1)

Create submission DataFrame and export as CSV file

In [19]:
submission_df = pd.DataFrame({'Image': filenames_test, 'Id': predictions_array})
submission_df.to_csv('submission_v1.csv', index=False)
print(submission_df.head())

           Image                                                 Id
0  c303faac6.jpg  new_whale w_17b0d3a w_789c969 w_67a9841 w_a9304b9
1  96c2b7290.jpg  new_whale w_af367c3 w_8c25681 w_6822dbc w_f765256
2  69f6cd44f.jpg  new_whale w_23a388d w_d405854 w_5773c71 w_03670aa
3  a965dea33.jpg  new_whale w_1f0cf0a w_3de579a w_985d205 w_cd4cb49
4  9a225e056.jpg  new_whale w_5a2634c w_700ebb4 w_0a155b9 w_23a388d


Kaggle score: 0.286

## Ensemble v1.2

Models weighted by accuracy score during training and validation phase

Add model predictions

In [20]:
resnet50_preds_1 = resnet50_model_1.predict(resnet50_preprocess(data_test))
resnet50_preds_2 = resnet50_model_2.predict(resnet50_preprocess(data_test))
resnet50_preds_3 = resnet50_model_3.predict(resnet50_preprocess(data_test))
resnet50_preds_4 = resnet50_model_4.predict(resnet50_preprocess(data_test))
densenet201_preds_1 = densenet201_model_1.predict(densenet201_preprocess(data_test))
densenet201_preds_2 = densenet201_model_2.predict(densenet201_preprocess(data_test))
densenet201_preds_3 = densenet201_model_3.predict(densenet201_preprocess(data_test))
densenet201_preds_4 = densenet201_model_4.predict(densenet201_preprocess(data_test))
overall_preds = 0.38*resnet50_preds_1 + 0.38*resnet50_preds_2 + 0.58*resnet50_preds_3 + 0.57*resnet50_preds_4 +\
0.38*densenet201_preds_1 + 0.38*densenet201_preds_2 + 0.58*densenet201_preds_3 + 0.57*densenet201_preds_4

Obtain labels with top 5 softmax values for each array row and concatenate labels

In [21]:
top5_indices = np.apply_along_axis(array_filter, arr=overall_preds, axis=1, n_top=5, labels=label_encoder.classes_)
predictions_array = np.apply_along_axis(array_to_string, arr=top5_indices, axis=1)

Create submission DataFrame and export as CSV file

In [22]:
submission_df = pd.DataFrame({'Image': filenames_test, 'Id': predictions_array})
submission_df.to_csv('submission_v2.csv', index=False)
print(submission_df.head())

           Image                                                 Id
0  c303faac6.jpg  new_whale w_17b0d3a w_67a9841 w_a9304b9 w_789c969
1  96c2b7290.jpg  new_whale w_af367c3 w_8c25681 w_6822dbc w_f765256
2  69f6cd44f.jpg  new_whale w_d405854 w_23a388d w_5773c71 w_fd3e556
3  a965dea33.jpg  new_whale w_1f0cf0a w_cd4cb49 w_3de579a w_343f088
4  9a225e056.jpg  new_whale w_5a2634c w_700ebb4 w_0a155b9 w_17b0d3a


Kaggle score: 0.286