# Download the kaggle data set.

In [1]:

# Install the Kaggle command-line client (quietly).
!pip install -q kaggle
# Copy kaggle.json from the working directory into ~/.kaggle and restrict it
# to owner read/write (presumably the API credentials file the CLI reads — verify).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle
!ls ~/.kaggle
!chmod 600 ~/.kaggle/kaggle.json

# Download the competition files, then rebuild input/ from scratch so a re-run
# never mixes old and new extractions. Only train.zip is unpacked here.
!kaggle competitions download -c humpback-whale-identification
!rm -rf input
!mkdir -p input
!unzip -q train.zip -d input/train
# Uncomment the next line to list the extracted training images.
#!ls input/train


kaggle.json
sample_submission.csv: Skipping, found more recently modified local copy (use --force to force download)
train.csv: Skipping, found more recently modified local copy (use --force to force download)
test.zip: Skipping, found more recently modified local copy (use --force to force download)
train.zip: Skipping, found more recently modified local copy (use --force to force download)


In [2]:
import numpy as np 
import pandas as pd 
import os
import gc
import matplotlib.pyplot as plt
import matplotlib.image as mplimg
from matplotlib.pyplot import imshow

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split


from keras import layers
from keras.utils import np_utils
from keras.preprocessing import image
from keras.applications.imagenet_utils import preprocess_input
from keras.layers import Input, Dense, Activation, BatchNormalization, Flatten, Conv2D
from keras.layers import AveragePooling2D, MaxPooling2D, Dropout, GlobalAveragePooling2D
from keras.models import Model
from keras.callbacks import ModelCheckpoint, EarlyStopping

import keras.backend as K
from keras.models import Sequential

import warnings
warnings.simplefilter("ignore", category=DeprecationWarning)

Using TensorFlow backend.


# Explore the training data.
Let us explore the training data.


In [3]:
# Load the training labels from train.csv in the working directory.
train_df = pd.read_csv("train.csv")
# Preview the first rows; the bare expression is rendered as the cell output.
train_df.head()

Unnamed: 0,Image,Id
0,0000e88ab.jpg,w_f48451c
1,0001f9222.jpg,w_c3d896a
2,00029d126.jpg,w_20df2c5
3,00050a15a.jpg,new_whale
4,0005c1ef8.jpg,new_whale


In [4]:
# Report how many labelled rows train.csv contains.
print('Number of rows in train.csv', len(train_df))

Number of rows in train.csv 25361


# Identify the data points

## train.csv
There are 25361 rows in train.csv, one for each image in train.zip.
The train.csv file has two data fields:
* Image: the whale image file name.
* Id: the whale Id.

Each whale is assigned a unique Id. Unidentified whales are assigned the Id new_whale.


## train.zip
There are 25361 image files in train.zip. They have been extracted to the input/train folder. Each file name corresponds to an entry in the Image column of train.csv.

# Split the data into training, validation & test datasets





In [21]:
# Integer-encode the whale Ids with sklearn.preprocessing.LabelEncoder, then
# expand the integer codes into one-hot rows (one column per known class).
labels = train_df.Id
le = LabelEncoder().fit(labels)
label_codes = le.transform(labels)
y_transform = np_utils.to_categorical(label_codes, num_classes=len(le.classes_))

# Hold out 20% of the rows, then halve that hold-out into validation and test
# sets. The fixed random_state keeps both splits repeatable.
X_train, X_tmp, Y_train, Y_tmp = train_test_split(
    train_df, y_transform, test_size=0.2, random_state=5
)
X_val, X_test, Y_val, Y_test = train_test_split(
    X_tmp, Y_tmp, test_size=0.5, random_state=5
)

print('Training, Validation & testing data size', len(X_train), len(X_val), len(X_test))
gc.collect()

Training, Validation & testing data size 20288 2536 2537


184

# Create a CNN as a baseline model

In [6]:
# Baseline CNN: three Conv2D + MaxPooling2D stages, global average pooling,
# one hidden Dense layer with dropout, and a softmax over the whale Ids.
INPUT_SHAPE = (100, 100, 3)     # RGB images, sized as produced by prepare_images
# The output width must equal the number of one-hot label columns, so derive it
# from the fitted LabelEncoder instead of hard-coding the class count.
NUM_CLASSES = len(le.classes_)

model = Sequential()

model.add(Conv2D(filters=16, kernel_size=7, padding='same', activation='relu',
                 input_shape=INPUT_SHAPE))
model.add(MaxPooling2D(pool_size=3))
model.add(Conv2D(filters=32, kernel_size=7, padding='same', activation='relu'))
model.add(MaxPooling2D(pool_size=3))
model.add(Conv2D(filters=64, kernel_size=7, padding='same', activation='relu'))
model.add(MaxPooling2D(pool_size=3))
model.add(GlobalAveragePooling2D())
model.add(Dense(500, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(NUM_CLASSES, activation='softmax'))

print(model.output_shape)

model.summary()


W0718 06:12:34.409164 140297081849728 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0718 06:12:34.429065 140297081849728 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0718 06:12:34.432000 140297081849728 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0718 06:12:34.451770 140297081849728 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3976: The name tf.nn.max_pool is deprecated. Please use tf.nn.max_pool2d instead.

W0718 06:12:34.506905 140297081849728 deprecation_wrapp

(None, 5005)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 100, 100, 16)      2368      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 33, 33, 16)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 33, 33, 32)        25120     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 11, 11, 32)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 11, 11, 64)        100416    
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 3, 3, 64)          0         
_________________________________________________________________
global_average_pooling2d_1 ( (None, 64)                0       

In [0]:
def prepare_images(data, image_dir="input/train", target_size=(100, 100)):
    """Load the images listed in ``data.Image`` into a single array.

    Each file is read from ``image_dir``, resized on load, converted to an
    array and passed through ``preprocess_input`` before being stored.

    Parameters
    ----------
    data : object with an ``Image`` attribute (e.g. a pandas DataFrame)
           ``len(data)`` gives the number of images and ``data.Image``
           yields the file names, relative to ``image_dir``.
    image_dir : str, optional
           Directory that contains the image files.
    target_size : tuple of int, optional
           ``(height, width)`` each image is resized to. Only the first two
           entries are used, so a legacy ``(h, w, channels)`` tuple still works.

    Returns
    -------
    images : numpy.ndarray
           Array of shape ``(len(data), height, width, 3)`` with NumPy's
           default float dtype.
    """
    print("Preparing images")

    # load_img expects a (height, width) pair; the channel count is not part of it.
    height, width = target_size[:2]
    images = np.zeros((len(data), height, width, 3))

    for count, fig in enumerate(data.Image):
        img = image.load_img(os.path.join(image_dir, fig), target_size=(height, width))
        x = image.img_to_array(img)
        x = preprocess_input(x)
        images[count] = x
        # Progress line every 500 images (1-based in the message).
        if count % 500 == 0:
            print("Processing image: ", count+1, ", ", fig)

    print("Finished!")

    return images

In [8]:
# One-hot multi-class setup: categorical cross-entropy loss, RMSprop optimiser,
# and accuracy reported during training. Set optimizer="adam" to try Adam instead.
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

W0718 06:12:34.592480 140297081849728 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0718 06:12:34.624079 140297081849728 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3295: The name tf.log is deprecated. Please use tf.math.log instead.



In [9]:
from PIL import ImageFile

# Let PIL load image files that are truncated instead of raising on them.
ImageFile.LOAD_TRUNCATED_IMAGES = True


def _load_scaled_images(frame):
    """Load the images listed in ``frame`` and divide the array by 255 in place."""
    pixels = prepare_images(frame)
    # In-place division avoids allocating a second full-size array.
    # NOTE(review): prepare_images already applies preprocess_input; confirm the
    # additional /255 scaling is intended.
    pixels /= 255
    return pixels


x_train_images = _load_scaled_images(X_train)
print("Shape X-train: ", x_train_images.shape)

x_val_images = _load_scaled_images(X_val)
print("Shape X-val: ", x_val_images.shape)

x_test_images = _load_scaled_images(X_test)
print("Shape X-test: ", x_test_images.shape)

Preparing images
Processing image:  1 ,  5e2572252.jpg
Processing image:  501 ,  b728ef1e9.jpg
Processing image:  1001 ,  942ab5de3.jpg
Processing image:  1501 ,  dd4cfa29f.jpg
Processing image:  2001 ,  614f10ee7.jpg
Processing image:  2501 ,  db9667359.jpg
Processing image:  3001 ,  86c9aa515.jpg
Processing image:  3501 ,  7f3aafbd2.jpg
Processing image:  4001 ,  6f0c3deb4.jpg
Processing image:  4501 ,  444b09aca.jpg
Processing image:  5001 ,  f532c9318.jpg
Processing image:  5501 ,  f2d3d0d0f.jpg
Processing image:  6001 ,  6ca37fe7c.jpg
Processing image:  6501 ,  3394e12db.jpg
Processing image:  7001 ,  feddb3aa9.jpg
Processing image:  7501 ,  3a8173905.jpg
Processing image:  8001 ,  16ddf58df.jpg
Processing image:  8501 ,  64b519010.jpg
Processing image:  9001 ,  c2a02f80e.jpg
Processing image:  9501 ,  770cb755e.jpg
Processing image:  10001 ,  803515118.jpg
Processing image:  10501 ,  5e8632b10.jpg
Processing image:  11001 ,  5f37d323c.jpg
Processing image:  11501 ,  204823b38.jpg

In [0]:
def map_per_image(label, predictions):
    """Score one image by the rank of its true label among the top five guesses.

    Parameters
    ----------
    label : string
            The true label of the image
    predictions : list
            Predicted labels, best guess first; only the first five are considered

    Returns
    -------
    score : double
            ``1 / rank`` (rank counted from 1) when ``label`` is among the first
            five predictions, otherwise ``0.0``
    """
    try:
        position = predictions[:5].index(label)
    except ValueError:
        # The true label is not among the first five predictions.
        score = 0.0
    else:
        score = 1 / (position + 1)
    return score

def map_per_set(labels, predictions):
    """Average the per-image scores over a set of images.

    Parameters
    ----------
    labels : list
             The true labels, one per image
    predictions : list of list
             For each image, its predicted labels, best guess first
             (only the first five count)

    Returns
    -------
    score : double
            Mean of ``map_per_image`` over the paired labels and predictions
    """
    per_image_scores = []
    for true_label, ranked_guesses in zip(labels, predictions):
        per_image_scores.append(map_per_image(true_label, ranked_guesses))
    return np.mean(per_image_scores)

In [11]:
gc.collect()

# Write a checkpoint only when the monitored value improves, so the file on
# disk always holds the best weights seen so far.
os.makedirs('saved_models', exist_ok=True)
checkpointer = ModelCheckpoint(
    filepath='saved_models/weight.best.from_scratch.hdf5',
    save_best_only=True,
    verbose=1,
)

# Train on the training split and evaluate on the validation split each epoch.
model.fit(
    x_train_images,
    Y_train,
    validation_data=(x_val_images, Y_val),
    batch_size=100,
    epochs=25,
    callbacks=[checkpointer],
    verbose=1,
)
gc.collect()

W0718 06:17:11.679155 140297081849728 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 20288 samples, validate on 2536 samples
Epoch 1/25

Epoch 00001: val_loss improved from inf to 5.91957, saving model to saved_models/weight.best.from_scratch.hdf5
Epoch 2/25

Epoch 00002: val_loss improved from 5.91957 to 5.83468, saving model to saved_models/weight.best.from_scratch.hdf5
Epoch 3/25

Epoch 00003: val_loss improved from 5.83468 to 5.79233, saving model to saved_models/weight.best.from_scratch.hdf5
Epoch 4/25

Epoch 00004: val_loss improved from 5.79233 to 5.78796, saving model to saved_models/weight.best.from_scratch.hdf5
Epoch 5/25

Epoch 00005: val_loss improved from 5.78796 to 5.70395, saving model to saved_models/weight.best.from_scratch.hdf5
Epoch 6/25

Epoch 00006: val_loss improved from 5.70395 to 5.65088, saving model to saved_models/weight.best.from_scratch.hdf5
Epoch 7/25

Epoch 00007: val_loss did not improve from 5.65088
Epoch 8/25

Epoch 00008: val_loss improved from 5.65088 to 5.60623, saving model to saved_models/weight.best.from_scratch.hdf5
Epo

10

In [12]:
# Restore the weights from the checkpoint file written during training.
model.load_weights('saved_models/weight.best.from_scratch.hdf5')
# Run the model on the held-out test images; the softmax output gives one row
# per image and one column per class.
pred = model.predict(x_test_images, verbose=1)
print(pred.shape)

(2537, 5005)


In [22]:
# For every test image take the five highest-scoring class indices, best first,
# and map them back to whale Ids with the fitted LabelEncoder.
predictions = [
    le.inverse_transform(scores.argsort()[:-6:-1]).tolist()
    for scores in pred
]
# Mean of the per-image top-5 scores over the test split.
print(map_per_set(X_test.Id, predictions))

0.38368808303770857
