In [1]:
# Import required packages
from keras.preprocessing.image import ImageDataGenerator
from keras.models import load_model, Model
from keras.layers import Input, Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D, GlobalAveragePooling2D
from keras.layers.noise import GaussianNoise
from keras.layers.normalization import BatchNormalization
from keras.applications import ResNet50, VGG16, InceptionV3
from keras.applications.vgg16 import preprocess_input, decode_predictions

import os
import sys
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import time
from keras.utils import to_categorical
from tqdm import tqdm
%matplotlib inline

Using TensorFlow backend.


## Method 1: Apply pre-trained model on training data but with new classifier

### Data Preparation

#### Steps
- Read input: cv2.imread
- Resize each image: cv2.resize
- Define the labels: cat = 0, dog = 1
- Shuffle the images together with their labels: shuffle
- Split the training set into training and validation sets: train_test_split

In [4]:
shape = 224  # VGG16 input size = 224*224
n_per_class = 12500  # number of cat images and number of dog images read below
label = np.array([0] * n_per_class + [1] * n_per_class)  # cat = 0, dog = 1
data = np.zeros((2 * n_per_class, shape, shape, 3), dtype=np.uint8)

# Cats fill rows [0, 12500) and dogs fill rows [12500, 25000), matching `label`.
for offset, animal in ((0, 'cat'), (n_per_class, 'dog')):
    for i in tqdm(range(n_per_class)):
        path = './train/%s.%d.jpg' % (animal, i)
        img = cv2.imread(path)
        if img is None:
            # cv2.imread reports a missing/unreadable file by returning None
            # instead of raising; fail here with the offending path rather
            # than with a cryptic TypeError on the slicing below.
            raise IOError('Could not read image: %s' % path)
        img = img[:, :, ::-1]  # reverse the channel axis (OpenCV BGR -> RGB)
        data[offset + i] = cv2.resize(img, (shape, shape))

# ndarray.nbytes is the size of the pixel buffer itself.
print('Training Data Size = %.2f GB' % (data.nbytes / 1024**3))

100%|██████████| 12500/12500 [01:48<00:00, 115.60it/s]
100%|██████████| 12500/12500 [01:15<00:00, 165.85it/s]

Training Data Size = 3.50 GB





In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the images (and their labels) for validation; the fixed
# random_state makes the split reproducible.
image_split = train_test_split(data, label, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = image_split

In [5]:
n_test = 12500
test = np.zeros((n_test, shape, shape, 3), dtype=np.uint8)
for i in tqdm(range(n_test)):
    path = './test/%d.jpg' % (i + 1)  # test files are numbered starting at 1
    img = cv2.imread(path)
    if img is None:
        # cv2.imread returns None for a missing/unreadable file; report the
        # path instead of failing later with a TypeError on the slice.
        raise IOError('Could not read image: %s' % path)
    img = img[:, :, ::-1]  # reverse the channel axis (OpenCV BGR -> RGB)
    test[i] = cv2.resize(img, (shape, shape))

# ndarray.nbytes is the size of the pixel buffer itself.
print('Testing Data Size = %.2f GB' % (test.nbytes / 1024**3))

100%|██████████| 12500/12500 [01:10<00:00, 178.33it/s]

Testing Data Size = 1.75 GB





### Load the entire pre-trained model plus weights, replace with new classifier.

#### Steps
- Load pre-trained model and weight, excluding the original classifier
- Lock the layers of the pre-trained model to prevent the change during the training process: layers.trainable = False
- Add the new classifier to the end of the model. Choose sigmoid or softmax based on the number of types that need to classify
- Compile the model. Choose binary_crossentropy or categorical_crossentropy based on the number of classes to predict
- Check the number of trainable weights: create the function get_params_count() to count unlocked (trainable) and locked (non-trainable) parameters
- Train the model
  - Use small batch size so that we can get high precision even with a few epochs
  - No need to use too many epochs; 5 ~ 10 is enough. Because the number of trainable parameters is small (~500), the model can reach its optimum within a few epochs.

**First I use VGG16 (22 layers) model for transfer learning.**

In [7]:
from keras import backend as K

def get_params_count(model):
    """Count the scalar parameters of a model, split by trainability.

    Args:
        model: object exposing `trainable_weights` and `non_trainable_weights`
            (iterables of backend weight variables).

    Returns:
        Tuple `(trainable, non_trainable)` of ints. A weight object that
        appears several times in a list is counted once.
    """
    def _count(weights):
        # Deduplicate by object identity instead of set(): this keeps the
        # "count each shared weight once" behaviour but does not require the
        # weight objects to be hashable.
        unique = {id(w): w for w in weights}
        return int(sum(K.count_params(w) for w in unique.values()))

    return _count(model.trainable_weights), _count(model.non_trainable_weights)

In [8]:
# Pre-trained VGG16 without its original classifier; pooling='avg' makes the
# base end in a global-average-pooled feature vector.
# NOTE(review): the images are stored as raw uint8 and no preprocess_input
# call is visible before training - confirm this is intended.
base_model = VGG16(
    include_top=False,
    weights='imagenet',
    pooling='avg',
    input_shape=(224, 224, 3),
)

# Freeze every pre-trained layer so only the new classifier is trained.
for layer in base_model.layers:
    layer.trainable = False

# New binary classifier: dropout followed by a single sigmoid unit.
y = Dropout(0.5)(base_model.output)
y = Dense(1, activation='sigmoid')(y)

model1 = Model(inputs=base_model.input, outputs=y)
model1.compile(
    optimizer='adadelta',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model1.summary()
print('Model has %d layers.' % len(model1.layers))

Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584   

In [9]:
# Train the new classifier on top of the frozen VGG16 base.
model1.fit(
    X_train,
    y_train,
    batch_size=16,
    epochs=5,
    shuffle=True,
    validation_data=(X_val, y_val),
)

Train on 20000 samples, validate on 5000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f6a84f0ce50>

**Now let's try the ResNet50 model (178 layers).**

In [10]:
# Pre-trained ResNet50 without its original classifier; pooling='avg' makes
# the base end in a global-average-pooled feature vector.
base_model = ResNet50(
    include_top=False,
    weights='imagenet',
    pooling='avg',
    input_shape=(224, 224, 3),
)

# Freeze every pre-trained layer so only the new classifier is trained.
for layer in base_model.layers:
    layer.trainable = False

# New binary classifier: dropout followed by a single sigmoid unit.
y = Dropout(0.25)(base_model.output)
y = Dense(1, activation='sigmoid')(y)

model2 = Model(inputs=base_model.input, outputs=y)
model2.compile(
    optimizer='adadelta',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model2.summary()
print('Model has %d layers.' % len(model2.layers))

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_2 (InputLayer)             (None, 224, 224, 3)   0                                            
____________________________________________________________________________________________________
zero_padding2d_1 (ZeroPadding2D) (None, 230, 230, 3)   0           input_2[0][0]                    
____________________________________________________________________________________________________
conv1 (Conv2D)                   (None, 112, 112, 64)  9472        zero_padding2d_1[0][0]           
____________________________________________________________________________________________________
bn_conv1 (BatchNormalization)    (None, 112, 112, 64)  256         conv1[0][0]                      
___________________________________________________________________________________________

In [11]:
# Train the new classifier on top of the frozen ResNet50 base.
model2.fit(
    X_train,
    y_train,
    batch_size=16,
    epochs=5,
    shuffle=True,
    validation_data=(X_val, y_val),
)

Train on 20000 samples, validate on 5000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f69ca887e50>

## Method 2: Apply pre-trained model on training data with new classifier and fine-tune the last several layers

#### Steps
- Here I use the model2 created above. 
- Unblock the last several layers and train them.
- Since several layers form a combination (e.g. conv + batchNorm + activation), we'd better block/unblock the whole combination together.
- Unblock the layers from high (output) to low (input). Train for 5 epochs after unblocking one part, then check whether the loss and accuracy have improved. If they have, we can continue training. Otherwise, we can unblock more layers until the training result improves.

In [19]:
# Display the last 37 layers of model2 (the next cell unfreezes the last 35).
model2.layers[-37:]

[<keras.layers.core.Activation at 0x7f69cb4f8bd0>,
 <keras.layers.convolutional.Conv2D at 0x7f69cb4f8d90>,
 <keras.layers.normalization.BatchNormalization at 0x7f69cb4c7e90>,
 <keras.layers.core.Activation at 0x7f69cb498fd0>,
 <keras.layers.convolutional.Conv2D at 0x7f69cb498590>,
 <keras.layers.normalization.BatchNormalization at 0x7f69cb43c610>,
 <keras.layers.core.Activation at 0x7f69cb370f50>,
 <keras.layers.convolutional.Conv2D at 0x7f69cb3b9f90>,
 <keras.layers.convolutional.Conv2D at 0x7f69cb313ed0>,
 <keras.layers.normalization.BatchNormalization at 0x7f69cb3dd590>,
 <keras.layers.normalization.BatchNormalization at 0x7f69cb35be50>,
 <keras.layers.merge.Add at 0x7f69cb2e6c90>,
 <keras.layers.core.Activation at 0x7f69cb28df50>,
 <keras.layers.convolutional.Conv2D at 0x7f69cb27d950>,
 <keras.layers.normalization.BatchNormalization at 0x7f69cb20ae50>,
 <keras.layers.core.Activation at 0x7f69cb1d7e50>,
 <keras.layers.convolutional.Conv2D at 0x7f69cb220c10>,
 <keras.layers.normaliza

In [20]:
# Unfreeze the last 35 layers of model2 so they are fine-tuned too.
for layer in model2.layers[-35:]:
    layer.trainable = True

# The set of weights that training updates is determined at compile time, so
# the model has to be compiled again after changing `trainable`; otherwise the
# next fit() would keep updating only the layers that were trainable before.
model2.compile(loss='binary_crossentropy', optimizer='adadelta', metrics=['accuracy'])

print('Trainable = %d, Non-Trainable = %d' % (get_params_count(model2)))

Trainable = 14453249, Non-Trainable = 9136512


In [21]:
# Fine-tune model2 for 10 more epochs on the same train/validation split.
model2.fit(
    X_train,
    y_train,
    batch_size=16,
    epochs=10,
    validation_data=(X_val, y_val),
)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f6a874e7b50>

In [22]:
# Save the fine-tuned model2 to disk.
# NOTE(review): the file name says "epoch5" while the preceding fit call uses
# epochs=10 - confirm which one is intended.
model2.save('ResNet_Finetune_last3_epoch5.h5')

## Method 3: Multiple Model Integration

#### Steps

- Calculate feature vectors: calculate the output value of the entire training set from each model (excluding classifier part).
- Merge feature vectors: Merge the feature vectors from multiple models.
- Train the classifier
- Predict

### Calculate feature vectors

- Since I have already read the training and test datasets into memory, I can use model.predict() to get feature vectors directly.
- On average, computing the training + test feature vectors takes about 2 minutes per model.

In [9]:
import h5py
from keras.layers import Lambda

def export_gap(MODEL, preprocess=None):
    """Compute pooled feature vectors with a pre-trained model and save them.

    Builds `MODEL` with ImageNet weights, no classifier top and average
    pooling, runs it over the module-level `data` and `test` arrays, and
    writes the results to "gap_<MODEL.__name__>.h5" as datasets 'train' and
    'test'.

    Args:
        MODEL: model constructor accepting `input_tensor`, `weights`,
            `include_top` and `pooling` keyword arguments.
        preprocess: optional callable; when given it is wrapped in a Lambda
            layer placed between the input and the model.
    """
    image_input = Input((224, 224, 3))
    if preprocess:
        model_input = Lambda(preprocess)(image_input)
    else:
        model_input = image_input
    model = MODEL(input_tensor=model_input, weights='imagenet', include_top=False, pooling='avg')

    # Same order as before: training features first, then test features.
    features = (
        ('train', model.predict(data, batch_size=128)),
        ('test', model.predict(test, batch_size=128)),
    )

    with h5py.File("gap_%s.h5" % MODEL.__name__, 'w') as f:
        for dataset_name, values in features:
            f.create_dataset(dataset_name, data=values)

In [11]:
# Export VGG16 feature vectors (no preprocessing function is passed).
export_gap(VGG16)

In [12]:
# Export ResNet50 feature vectors (no preprocessing function is passed).
export_gap(ResNet50)

In [10]:
from keras.applications import inception_v3, xception, Xception
# Export InceptionV3 feature vectors; its preprocess_input is passed so that
# export_gap wraps it in a Lambda layer in front of the model.
# NOTE(review): export_gap feeds 224x224 inputs to every model - confirm that
# this size is intended for InceptionV3.
export_gap(InceptionV3, preprocess=inception_v3.preprocess_input)

### Integrate feature vectors from different models

- Use np.concatenate to integrate vectors on specific dimension.
- Feed all feature vectors to the classifier which uses gradient descent to set weights automatically.

In [13]:
# Read the exported feature vectors back, one entry per model.
# NOTE: this rebinds `test`, which earlier held the raw test images that
# export_gap reads.
train, test = [], []
for gapfile in ('gap_VGG16.h5', 'gap_ResNet50.h5', 'gap_InceptionV3.h5'):
    with h5py.File(gapfile, 'r') as f:
        for split_name, collected in (('train', train), ('test', test)):
            collected.append(np.array(f[split_name]))

print('Feature Vector Shape for Model #0:', train[0].shape)

Feature Vector Shape for Model #0: (25000, 512)


In [14]:
# Concatenate the per-model feature vectors along the feature axis.
X_train = np.concatenate(train, axis=1)
X_test = np.concatenate(test, axis=1)
# Report the real number of merged models instead of a hard-coded count.
print('Feature Vector Shape after Merging %d models:' % len(train), X_train.shape)

Feature Vector Shape after Merging 2 models: (25000, 4608)


In [16]:
from sklearn.model_selection import train_test_split

# Split a fresh concatenation of `train` rather than X_train itself: the cell
# rebinds X_train, so feeding X_train back in would make a second run fail
# (20000 feature rows against 25000 labels). The first-run result is unchanged.
X_train, X_val, y_train, y_val = train_test_split(
    np.concatenate(train, axis=1), label, test_size=0.2, random_state=42)

### Create Model

- We just need to define a simple classifier to classify the feature vectors. The output is 0(cat) or 1(dog).
- The input of this model is the merged output of the above 3 feature vectors. So the input size is (None, 512 + 2048 + 2048) = (None, 4608).

In [17]:
# Classifier on the merged feature vectors: dropout, then one sigmoid unit.
n_features = X_train.shape[1]
inputs = Input((n_features,))
x = Dropout(0.25)(inputs)
y = Dense(1, activation='sigmoid')(x)

model_fusion = Model(inputs=inputs, outputs=y, name='Fusion')
model_fusion.compile(
    optimizer='adadelta',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model_fusion.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 4608)              0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 4608)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 4609      
Total params: 4,609
Trainable params: 4,609
Non-trainable params: 0
_________________________________________________________________


### Train the Model

- The total number of parameters of the model is only 4,609 and the structure is very simple, so the training time is very short.

In [18]:
# Train the fusion classifier on the merged feature vectors.
model_fusion.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=10,
    validation_data=(X_val, y_val),
)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f8cad0aecf8>

**Observation**

We can achieve better result using Fine tuning than feature vectors (0.0496 vs 0.0541). The key point is to 

### Predict the test set and upload result to Kaggle

- The judgment standard is LogLoss.
- Kaggle's scoring system clips each predicted probability to the range [10^-15, 1 - 10^-15].
- We clip the output to [0.005, 0.995], so predictions close to 0 or 1 become 0.005 or 0.995.


In [19]:
# Predict on the merged test features and keep every probability inside
# [0.005, 0.995].
raw_pred = model_fusion.predict(X_test)
y_pred = np.clip(raw_pred, 0.005, 0.995)

In [20]:
# Sanity check: display the prediction array's shape and its first row.
y_pred.shape, y_pred[0]

((12500, 1), array([ 0.995], dtype=float32))

In [21]:
# Write the submission file: a header row, then one "id,label" row per
# prediction, with ids starting at 1.
with open('test.csv', 'w') as f:
    # write() is the call for a single string; writelines() expects an
    # iterable of lines and only worked here by iterating over characters.
    f.write('id,label\n')
    # Iterate over the predictions themselves instead of a hard-coded count.
    for image_id, row in enumerate(y_pred, start=1):
        f.write('%d,%s\n' % (image_id, row[0]))