# Simple CNN Model v4 (Single)

Single model (simpler one) trained on subsampled training set

## Imports

Import libraries

In [12]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd

import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import set_random_seed
from keras.preprocessing.image import ImageDataGenerator
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping, ModelCheckpoint

from src.sampler import train_test_bootstrapper, train_test_bootstrapper_v2, undersampler
from src.model_api import getSimpleModel
from src.prediction import array_filter, array_to_string
from src.custom_metric import as_keras_metric

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Import data

In [2]:
with open('data/derived/data_train_v3.pickle', 'rb') as file:
    data_train = pickle.load(file)
with open('data/derived/labels_train_v3.pickle', 'rb') as file:
    labels_train = pickle.load(file)
print('Shape of data_train:', data_train.shape)
print('Length of labels_train:', len(labels_train))

Shape of data_train: (15697, 128, 128, 3)
Length of labels_train: 15697


In [3]:
with open('data/derived/data_test_v2.pickle', 'rb') as file:
    data_test = pickle.load(file)
with open('data/derived/test_file_names.pickle', 'rb') as file:
    filenames_test = pickle.load(file)
print('Shape of data_test:', data_test.shape)
print('Length of filename_test:', len(filenames_test))

Shape of data_test: (7960, 128, 128, 3)
Length of filename_test: 7960


Convert labels to integers for model training

In [4]:
label_encoder = LabelEncoder()
labels_train_encoded = label_encoder.fit_transform(labels_train)
print('Original labels: {}'.format(labels_train[:5]))
print('Encoded labels: {}'.format(labels_train_encoded[:5]))
print('Label encoder classes: {}'.format(label_encoder.classes_[:5]))

Original labels: ['w_f48451c', 'w_c3d896a', 'w_20df2c5', 'w_dd88965', 'w_64404ac']
Encoded labels: [4785 3807  661 4314 1928]
Label encoder classes: ['w_0003639' 'w_0003c59' 'w_0027efa' 'w_00289b1' 'w_002c810']


Compute sample size and number of classes

In [5]:
sample_size = len(labels_train_encoded)
num_classes = len(set(labels_train_encoded))
print('Sample size:', sample_size)
print('Number of clases:', num_classes)

Sample size: 15697
Number of clases: 5004


## Train Single CNN Model

Simple CNN model (Random Seed = 2019, assuming class imbalance)

In [20]:
#sample_index, validation_index = undersampler(labels_array=labels_train_encoded, validation_size=5000, random_state=2019)

#X_train = data_train[sample_index] / 255
#X_validation = data_train[validation_index] / 255
#y_train = labels_train_encoded[sample_index]
#y_validation = labels_train_encoded[validation_index]

data_train_norm = data_train / 255
X_train, X_validation, y_train, y_validation = train_test_bootstrapper(data_train_norm, labels_train_encoded,
                                                                          bootstrapper_size=sample_size,
                                                                          class_imbalance=True, random_state=2019)

#X_train, X_validation, y_train, y_validation = train_test_split(data_train_norm, labels_train_encoded, test_size=0.2,
#                                                                random_state=2019)

y_train = to_categorical(y_train, num_classes=num_classes)
y_validation = to_categorical(y_validation, num_classes=num_classes)
print('Shape of X_train:', X_train.shape)
print('Shape of X_validation:', X_validation.shape)
print('Shape of y_train:', y_train.shape)
print('Shape of y_validation:', y_validation.shape)

Shape of X_train: (15697, 128, 128, 3)
Shape of X_validation: (5913, 128, 128, 3)
Shape of y_train: (15697, 5004)
Shape of y_validation: (5913, 5004)


In [21]:
filepath = 'models/weights-{epoch:02d}-{val_loss:.3f}.hdf5'
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', save_best_only=True, save_weights_only=True,
                             mode='min')
early_stop = EarlyStopping(patience=2, monitor='val_loss', restore_best_weights=True)

# Create precision and recall metrics
precision = as_keras_metric(tf.metrics.precision)
recall = as_keras_metric(tf.metrics.recall)
metric_list = ['accuracy', precision, recall]

model = getSimpleModel(num_classes=num_classes, resize_width=128, metric_list=metric_list)

batch_size = 32
image_gen = ImageDataGenerator(rotation_range=20,
                               width_shift_range=.1,
                               height_shift_range=.1,
                               shear_range=0.5,
                               zoom_range=(0.9, 1.1),
                               fill_mode='constant',
                               horizontal_flip=True)

In [22]:
image_generator_samples = image_gen.flow(X_train, y_train, batch_size=batch_size, seed=2019)

np.random.seed(2019)
set_random_seed(2019)
train_size, epochs = X_train.shape[0], 10
model.fit_generator(image_generator_samples, steps_per_epoch=2*train_size//epochs, 
                    validation_data=(X_validation, y_validation), epochs=epochs, shuffle=True,
                    callbacks=[checkpoint, early_stop])

model.save_weights('models/weights_CNN_single_v4.hdf5')

Epoch 1/10
Epoch 2/10
Epoch 3/10


Load model weights

In [23]:
model = getSimpleModel(num_classes=num_classes, resize_width=128, metric_list=metric_list)
model.load_weights('models/weights_CNN_single_v4.hdf5')

Model prediction

In [24]:
data_test_norm = data_test / 255
preds = model.predict(data_test_norm)

Obtain labels with top 5 softmax values for each array row and concatenate labels

In [27]:
top5_indices = np.apply_along_axis(array_filter, arr=preds, axis=1, n_top=5, labels=label_encoder.classes_, threshold=0.01)
predictions_array = np.apply_along_axis(array_to_string, arr=top5_indices, axis=1)

Create submission DataFrame and export as CSV file

In [29]:
submission_df = pd.DataFrame({'Image': filenames_test, 'Id': predictions_array})
submission_df.to_csv('submission_v5.csv', index=False)
print(submission_df.head())

           Image                                                 Id
0  c303faac6.jpg  w_23a388d w_6e209a8 w_88e4537 w_9b5109b w_5e8e218
1  96c2b7290.jpg  w_88e4537 w_23a388d w_5e8e218 w_60ce6fc w_6cda039
2  69f6cd44f.jpg  w_5773c71 w_7c27fbd w_aabdf8c w_e2a09d4 new_whale
3  a965dea33.jpg  w_a9304b9 w_d72771c w_1f0cf0a w_8da30ad new_whale
4  9a225e056.jpg  w_789c969 w_3de579a w_9b5109b w_a9304b9 new_whale


Kaggle score: 0.0053