# Simple CNN Model v3 (Single)

Single model trained on subsampled training set

## Imports

Import libraries

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd

import pickle
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow import set_random_seed
from keras.preprocessing.image import ImageDataGenerator
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping, ModelCheckpoint

from src.sampler import train_test_bootstrapper_v2
from src.model_api import getSimpleModelV2
from src.prediction import array_filter, array_to_string
from src.custom_metric import as_keras_metric

Using TensorFlow backend.


Import data

In [2]:
with open('data/derived/data_train_v2.pickle', 'rb') as file:
    data_train = pickle.load(file)
with open('data/derived/labels_train_v2.pickle', 'rb') as file:
    labels_train = pickle.load(file)
print('Shape of data_train:', data_train.shape)
print('Length of labels_train:', len(labels_train))

Shape of data_train: (15697, 224, 224, 3)
Length of labels_train: 15697


In [3]:
with open('data/derived/data_test.pickle', 'rb') as file:
    data_test = pickle.load(file)
with open('data/derived/test_file_names.pickle', 'rb') as file:
    filenames_test = pickle.load(file)
print('Shape of data_test:', data_test.shape)
print('Length of filename_test:', len(filenames_test))

Shape of data_test: (7960, 224, 224, 3)
Length of filename_test: 7960


Convert labels to integers for model training

In [4]:
label_encoder = LabelEncoder()
labels_train_encoded = label_encoder.fit_transform(labels_train)
print('Original labels: {}'.format(labels_train[:5]))
print('Encoded labels: {}'.format(labels_train_encoded[:5]))
print('Label encoder classes: {}'.format(label_encoder.classes_[:5]))

Original labels: ['w_f48451c', 'w_c3d896a', 'w_20df2c5', 'w_dd88965', 'w_64404ac']
Encoded labels: [4785 3807  661 4314 1928]
Label encoder classes: ['w_0003639' 'w_0003c59' 'w_0027efa' 'w_00289b1' 'w_002c810']


Compute sample size and number of classes

In [5]:
sample_size = len(labels_train_encoded)
num_classes = len(set(labels_train_encoded))
print('Sample size:', sample_size)
print('Number of clases:', num_classes)

Sample size: 15697
Number of clases: 5004


## Train Single CNN Model

Simple CNN model (Random Seed = 2019, assuming class imbalance)

In [6]:
# Normalize data_train
data_train = data_train / 255
data_train = data_train.astype(dtype=np.float32)

In [7]:
filepath = 'models/weights-{epoch:02d}-{val_loss:.3f}.hdf5'
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', save_best_only=True, save_weights_only=True,
                             mode='min')
early_stop = EarlyStopping(patience=2, monitor='val_loss', restore_best_weights=True)

# Create precision and recall metrics
precision = as_keras_metric(tf.metrics.precision)
recall = as_keras_metric(tf.metrics.recall)
metric_list = ['accuracy', precision, recall]

model = getSimpleModelV2(num_classes=num_classes, resize_width=224, metric_list=metric_list)

batch_size = 32
image_gen = ImageDataGenerator(rotation_range=20,
                               width_shift_range=.1,
                               height_shift_range=.1,
                               shear_range=0.5,
                               zoom_range=(0.9, 1.1),
                               fill_mode='constant',
                               horizontal_flip=True)

In [9]:
X_train, X_test, y_train, y_test = train_test_bootstrapper_v2(data_train, labels_train_encoded,
                                                              bootstrapper_size=sample_size,
                                                              class_imbalance=True, random_state=2019)

y_train = to_categorical(y_train, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)

image_generator_samples = image_gen.flow(X_train, y_train, batch_size=batch_size, seed=2019)

np.random.seed(2019)
set_random_seed(2019)
train_size, epochs = X_train.shape[0], 10
model.fit_generator(image_generator_samples, steps_per_epoch=2*train_size//epochs, 
                    validation_data=(X_test, y_test), epochs=epochs, shuffle=True,
                    callbacks=[checkpoint, early_stop])

model.save_weights('models/weights_CNN_single_v3.hdf5')

Epoch 1/10
Epoch 2/10
Epoch 3/10


Load model weights

In [10]:
model = getSimpleModelV2(num_classes=num_classes, resize_width=224, metric_list=metric_list)
model.load_weights('models/weights_CNN_single_v3.hdf5')

Model prediction

In [11]:
data_test = data_test / 255
data_test = data_test.astype(dtype=np.float32)
preds = model.predict(data_test)

Obtain labels with top 5 softmax values for each array row and concatenate labels

In [16]:
top5_indices = np.apply_along_axis(array_filter, arr=preds, axis=1, n_top=5, labels=label_encoder.classes_, threshold=0.001)
predictions_array = np.apply_along_axis(array_to_string, arr=top5_indices, axis=1)

Create submission DataFrame and export as CSV file

In [17]:
submission_df = pd.DataFrame({'Image': filenames_test, 'Id': predictions_array})
submission_df.to_csv('submission_v4.csv', index=False)
print(submission_df.head())

           Image                                                 Id
0  c303faac6.jpg  w_9d698cf w_d829778 w_c7e2a65 w_3511bc7 w_1a9e018
1  96c2b7290.jpg  w_d055e5f w_e29ed08 w_dd18ea8 w_707ce0f new_whale
2  69f6cd44f.jpg  w_931b78f w_902d8ef w_cb4d3ed w_d055e5f w_3d88ad4
3  a965dea33.jpg  w_707ce0f w_d055e5f w_e29ed08 w_e04d084 w_408f9ea
4  9a225e056.jpg  w_270f505 w_3511bc7 w_1a9e018 w_ba3c7af w_2c3768d


Kaggle score: 0.013