# Download the Kaggle dataset

In [1]:
import numpy as np 
import pandas as pd 
import os
import gc
import matplotlib.pyplot as plt
import matplotlib.image as mplimg
from matplotlib.pyplot import imshow

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split


from keras import layers
from keras.utils import np_utils
from keras.preprocessing import image
from keras.applications.imagenet_utils import preprocess_input
from keras.layers import Input, Dense, Activation, BatchNormalization, Flatten, Conv2D
from keras.layers import AveragePooling2D, MaxPooling2D, Dropout, GlobalAveragePooling2D
from keras.models import Model
from keras.callbacks import ModelCheckpoint, EarlyStopping

import keras.backend as K
from keras.models import Sequential

import warnings
warnings.simplefilter("ignore", category=DeprecationWarning)

Using TensorFlow backend.


In [2]:
!pip freeze

absl-py==0.7.1
appnope==0.1.0
astor==0.8.0
attrs==19.1.0
backcall==0.1.0
bleach==3.1.0
Bottleneck==1.2.1
certifi==2018.8.24
cffi==1.12.3
Click==7.0
colorama==0.4.1
cycler==0.10.0
decorator==4.4.0
defusedxml==0.6.0
Django==2.2.3
entrypoints==0.3
enum34==1.1.6
gast==0.2.2
glob2==0.7
google-pasta==0.1.7
grpcio==1.22.0
h5py==2.8.0
image==1.5.27
imageio==2.5.0
ipykernel==4.6.1
ipython==7.6.1
ipython-genutils==0.2.0
ipywidgets==7.5.0
jedi==0.14.1
Jinja2==2.10.1
joblib==0.13.2
jsonschema==3.0.1
jupyter==1.0.0
jupyter-client==5.3.1
jupyter-console==6.0.0
jupyter-core==4.5.0
jupyter-http-over-ws==0.0.6
Keras==2.2.4
Keras-Applications==1.0.8
Keras-Preprocessing==1.1.0
kiwisolver==1.1.0
Markdown==3.1.1
MarkupSafe==1.1.1
matplotlib==3.0.3
mistune==0.8.4
mkl-fft==1.0.6
mkl-random==1.0.1
nbconvert==5.5.0
nbformat==4.4.0
networkx==2.3
notebook==5.2.2
numpy==1.16.4
olefile==0.46
opencv-python==3.4.5.20
pandas==0.24.2
pandocfilters==1.4.2
parso==0

# Explore the training data.
Let us explore the training data.


In [3]:
# Read the image-name / whale-Id table and preview its first rows.
train_df = pd.read_csv("train.csv")
train_df.head(n=5)

Unnamed: 0,Image,Id
0,0000e88ab.jpg,w_f48451c
1,0001f9222.jpg,w_c3d896a
2,00029d126.jpg,w_20df2c5
3,00050a15a.jpg,new_whale
4,0005c1ef8.jpg,new_whale


In [4]:
# One row per labelled training image.
print('Number of rows in train.csv', train_df.shape[0])

Number of rows in train.csv 25361


# Identify the data points

## train.csv
There are 25361 rows in train.csv, one for each image in train.zip.
We can see that the train.csv file has two data fields.  
* Image: the whale image file name.
* Id: the whale Id.

Each whale is assigned a unique Id.  Unidentified whales are all assigned the Id new_whale.  


## train.zip
There are 25361 image files in the train.zip file.  They have been extracted to the input/train folder.  Each filename corresponds to an entry in the Image column of train.csv.

# Split the data into training, validation & test datasets





In [5]:
# Target column: the whale Id string attached to every training image.
labels = train_df.Id

# Encode the Id strings as integer class indices with
# sklearn.preprocessing.LabelEncoder, then expand those indices into a
# one-hot matrix with one column per distinct Id.
le = LabelEncoder().fit(labels)
y_transform = np_utils.to_categorical(
    le.transform(labels),
    num_classes=len(le.classes_),
)

# First split: keep 80% for training and hold out the remaining 20%.
X_train, X_tmp, Y_train, Y_tmp = train_test_split(
    train_df, y_transform, test_size=0.2, random_state=5
)

# Second split: divide the held-out rows evenly into validation and test.
X_val, X_test, Y_val, Y_test = train_test_split(
    X_tmp, Y_tmp, test_size=0.5, random_state=5
)

print('Training, Validation & testing data size', X_train.shape[0], X_val.shape[0], X_test.shape[0])
gc.collect()

Training, Validation & testing data size 20288 2536 2537


0

In [0]:
def prepare_images(data, image_dir="input/train/", target_size=(100, 100), dtype=np.float64):
    """Load the images named in ``data.Image`` into one preprocessed array.

    Args:
        data: Object that supports ``len()`` and has an ``Image`` attribute
            iterating over image file names (e.g. a slice of ``train_df``).
        image_dir: Directory the file names are resolved against.
        target_size: ``(height, width)`` each image is resized to on load.
        dtype: Element type of the returned array. The default keeps the
            float64 array that ``np.zeros`` produces when no dtype is given;
            pass ``np.float32`` to halve the memory footprint.

    Returns:
        Array of shape ``(len(data), height, width, 3)`` holding every image
        after ``preprocess_input``, in the same order as ``data.Image``.
    """
    print("Preparing images")
    height, width = target_size
    images = np.zeros((len(data), height, width, 3), dtype=dtype)

    for count, fig in enumerate(data.Image):
        # load_img takes a (height, width) pair; the channel count comes from
        # the decoded image itself.
        img = image.load_img(os.path.join(image_dir, fig), target_size=(height, width))
        images[count] = preprocess_input(image.img_to_array(img))
        # Report progress on every 500th image.
        if count % 500 == 0:
            print("Processing image: ", count+1, ", ", fig)
    print("Finished!")
    return images

# Create a CNN as a baseline model

In [7]:
# Baseline CNN: three conv + max-pool stages, global average pooling, then a
# dense classifier head.
model = Sequential()

model.add(Conv2D(filters=16, kernel_size=7, padding='same', activation='relu',
                 input_shape=(100, 100, 3)))  # RGB image
model.add(MaxPooling2D(pool_size=3))
model.add(Conv2D(filters=32, kernel_size=7, padding='same', activation='relu'))
model.add(MaxPooling2D(pool_size=3))
model.add(Conv2D(filters=64, kernel_size=7, padding='same', activation='relu'))
model.add(MaxPooling2D(pool_size=3))
model.add(GlobalAveragePooling2D())
model.add(Dense(500, activation='relu'))
model.add(Dropout(0.5))
# One softmax unit per encoded whale Id. Taking the width from the label
# encoder keeps the output layer in step with the one-hot targets instead of
# relying on a hard-coded class count.
model.add(Dense(len(le.classes_), activation='softmax'))

model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 100, 100, 16)      2368      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 33, 33, 16)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 33, 33, 32)        25120     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 11, 11, 32)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 11, 11, 64)        100416    
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 3, 3, 64)          0         
_________________________________________________________________
global_average_pooling2d_1 ( (None, 64)                0         
__________

In [0]:
# Targets are one-hot vectors, so train with categorical cross-entropy and
# report accuracy alongside the loss.
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

In [0]:
from PIL import ImageFile

# Allow PIL to load image files that are truncated rather than failing on them.
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Training split: load, then scale in place by 1/255.
x_train_images = prepare_images(X_train)
x_train_images /= 255
print("Shape X-train: ", x_train_images.shape)

# Validation split, same treatment.
x_val_images = prepare_images(X_val)
x_val_images /= 255
print("Shape X-val: ", x_val_images.shape)

# Test split, same treatment.
x_test_images = prepare_images(X_test)
x_test_images /= 255
print("Shape X-test: ", x_test_images.shape)

Preparing images
Processing image:  1 ,  5e2572252.jpg
Processing image:  501 ,  b728ef1e9.jpg
Processing image:  1001 ,  942ab5de3.jpg
Processing image:  1501 ,  dd4cfa29f.jpg
Processing image:  2001 ,  614f10ee7.jpg
Processing image:  2501 ,  db9667359.jpg
Processing image:  3001 ,  86c9aa515.jpg
Processing image:  3501 ,  7f3aafbd2.jpg
Processing image:  4001 ,  6f0c3deb4.jpg
Processing image:  4501 ,  444b09aca.jpg
Processing image:  5001 ,  f532c9318.jpg
Processing image:  5501 ,  f2d3d0d0f.jpg
Processing image:  6001 ,  6ca37fe7c.jpg
Processing image:  6501 ,  3394e12db.jpg
Processing image:  7001 ,  feddb3aa9.jpg
Processing image:  7501 ,  3a8173905.jpg
Processing image:  8001 ,  16ddf58df.jpg
Processing image:  8501 ,  64b519010.jpg
Processing image:  9001 ,  c2a02f80e.jpg
Processing image:  9501 ,  770cb755e.jpg
Processing image:  10001 ,  803515118.jpg
Processing image:  10501 ,  5e8632b10.jpg
Processing image:  11001 ,  5f37d323c.jpg
Processing image:  11501 ,  204823b38.jpg

In [0]:
# Free unreferenced objects before starting the training run.
gc.collect()

os.makedirs('saved_models', exist_ok=True)

# With save_best_only set, the checkpoint file is only overwritten when the
# monitored quantity improves.
checkpointer = ModelCheckpoint(
    filepath='saved_models/weight.best.from_scratch.hdf5',
    save_best_only=True,
    verbose=1,
)

model.fit(
    x_train_images,
    Y_train,
    batch_size=100,
    epochs=20,
    verbose=1,
    validation_data=(x_val_images, Y_val),
    callbacks=[checkpointer],
)
gc.collect()

In [0]:
# Restore the weights written by the checkpoint callback, then score the
# held-out test images.
model.load_weights(filepath='saved_models/weight.best.from_scratch.hdf5')
pred = model.predict(x=x_test_images, verbose=1)
print(pred.shape)

## MAP@5 for Base CNN Model

In [0]:
import map5_method as map5

# For every test image keep the five most probable classes, best first, and
# map the class indices back to whale Id strings.
predictions = [
    le.inverse_transform(p.argsort()[-5:][::-1]).tolist()
    for p in pred
]

# The module is imported under the alias `map5`, so the scorer has to be
# reached through it; the bare name `map5_per_set` is undefined here.
# NOTE(review): assumes map5_method exposes map5_per_set(labels, predictions) - confirm.
print('MAP@5 score for Base model = {}'.format(map5.map5_per_set(X_test.Id, predictions)))

# Using VGG16 and Transfer learning

To reduce training time without sacrificing accuracy, let's train a CNN using transfer learning.

