In [1]:
import time
import math
import random
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import datasets, layers, models
%matplotlib inline

In [2]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '4,5'
tf.test.is_gpu_available()

True

# Configs & Hyperparameters

In [3]:
# batch size
batch_size = 32
# validation split
# valid_size = .1
# test split
test_size = 0.2

img_dim = (224, 224)

# paths
cache_path = 'cache'
data_path = 'data-filtered'

# Data Loading

In [4]:
import os

n_birds = 10 # use ten kinds of birds first
bird_file_map = {}

# return array of bird names
birdList = sorted(os.listdir(data_path))
loadedImages = []

n_images = 0
n_birds_loaded = 0
for b in birdList:
    if n_birds_loaded >= n_birds:
        break
    print("Loading images for '" + b + "'")
    curdir = os.path.join(data_path, b)
    if not os.path.isdir(curdir):
        continue
    img_files = os.listdir(curdir)
    
    filenames = [os.path.join(curdir, f) for f in img_files]
    n_f = len(filenames)
    if n_f > 0: # use data only if more than xx images are found
        bird_file_map[b] = filenames
        print(n_f, "images loaded for '" + b + "'")
        n_birds_loaded += 1
        n_images += n_f
    else:
        print("Not enough data for '" + b + "'")

Loading images for 'Aberrant Bush-Warbler'
271 images loaded for 'Aberrant Bush-Warbler'
Loading images for 'Ala Shan Redstart'
401 images loaded for 'Ala Shan Redstart'
Loading images for 'Aleutian Tern'
282 images loaded for 'Aleutian Tern'
Loading images for 'Altai Snowcock'
244 images loaded for 'Altai Snowcock'
Loading images for 'American Wigeon'
486 images loaded for 'American Wigeon'
Loading images for 'Arctic Warbler'
329 images loaded for 'Arctic Warbler'
Loading images for 'Ashy Bulbul'
310 images loaded for 'Ashy Bulbul'
Loading images for 'Ashy Drongo'
412 images loaded for 'Ashy Drongo'
Loading images for 'Ashy Minivet'
443 images loaded for 'Ashy Minivet'
Loading images for 'Ashy Wood Pigeon'
335 images loaded for 'Ashy Wood Pigeon'


In [5]:
import cv2
n_channels = 3
X = np.zeros((n_images, img_dim[0], img_dim[1], n_channels))
labels = []

i = 0
for k, v in bird_file_map.items():
    for file in v:
        try:
            im = cv2.imread(file)
            if im is None or im.shape[0] < img_dim[0] or im.shape[1] < img_dim[1]:
                continue
            shape = im.shape
            assert len(shape) == 3 # width, length and color channels
            assert shape[-1] == 3 # rgb, three channels
            
            # resizing
            im = cv2.resize(src=im, dsize=img_dim, interpolation=cv2.INTER_LINEAR)
            # Gaussian blurring
            # im = cv2.GaussianBlur(im, (5, 5), 0)
            X[i] = np.asarray(im)
            labels.append(k)
            i += 1
        except IOError:
            continue
n_images = len(labels)
X = X[:n_images, ]
del i

In [6]:
labels_unique = list(set(labels))

Y = np.zeros((n_images, 1))
for i in range(n_images):
    Y[i, 0] = labels_unique.index(labels[i])

In [7]:
print(X.shape)
print(Y.shape)
print(np.min(X), np.max(X))

(1906, 224, 224, 3)
(1906, 1)
0.0 255.0


## Normalization

In [8]:
for i in range(X.shape[0]):
    m = np.min(X[i,])
    X[i] = (X[i,] - m) / (np.max(X[i,]) - m)

print(np.min(X), np.max(X))

0.0 1.0


# Reducing memory size

In [9]:
# Function to reduce the numpy array size
def reduce_mem_usage(a, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = a.nbytes / 1024**2
    dtype = a.dtype
    if dtype in numerics:
        c_min = a.min()
        c_max = a.max()
        if str(dtype)[:3] == 'int':
            if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                a = a.astype(np.int8)
            elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                a = a.astype(np.int16)
            elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                a = a.astype(np.int32)
            elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                a = a.astype(np.int64)  
        else:
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                a = a.astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                a = a.astype(np.float32)
            else:
                a = a.astype(np.float64)
    end_mem = a.nbytes / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return a

X = reduce_mem_usage(X)
Y = reduce_mem_usage(Y)

Mem. usage decreased to 547.23 Mb (75.0% reduction)
Mem. usage decreased to  0.00 Mb (75.0% reduction)


# Data Splitting

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size)
# X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=valid_size)

In [11]:
from tensorflow.keras.utils import to_categorical
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)
# y_valid = to_categorical(y_valid)
print(X_train.shape, '|', X_test.shape)
print(y_train.shape, '|', y_test.shape)

(1524, 224, 224, 3) | (382, 224, 224, 3)
(1524, 10) | (382, 10)


# Data Augmentation

In [15]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
img_gen_batch_size = 32
image_gen_train = ImageDataGenerator(
                    rotation_range=45,
                    width_shift_range=0.2,
                    height_shift_range=0.2,
                    horizontal_flip=True,
                    shear_range=0.2,
                    zoom_range=0.1)

image_gen_flow = image_gen_train.flow(X_train, y_train, batch_size=img_gen_batch_size)
X_added = np.zeros((len(image_gen_flow) * img_gen_batch_size, *(X_train.shape[1:])))
Y_added = np.zeros((len(image_gen_flow) * img_gen_batch_size, *(y_train.shape[1:])))
flow_len = len(image_gen_flow)

n_added = 0
for i in range(0, flow_len):
    X_batch = image_gen_flow[i][0]
    img_gen_batch_size = X_batch.shape[0]
    X_added[i * img_gen_batch_size: (i + 1) * img_gen_batch_size,] = X_batch
    Y_added[i * img_gen_batch_size: (i + 1) * img_gen_batch_size,] = image_gen_flow[i][1]
    n_added += img_gen_batch_size
print('data augmentation sucessful, {} new images in total were added'.format(n_added))
del flow_len
del X_batch
del img_gen_batch_size
X_train = np.vstack([X_train, X_added])
y_train = np.vstack([y_train, Y_added])
del X_added
del Y_added

data augmentation sucessful, 1524 new images in total were added


# ResNet v2 as the baseline model

In [16]:
from tensorflow.keras.layers import Dense, Conv2D, BatchNormalization, Activation, AveragePooling2D, Input, Flatten
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler
from tensorflow.keras.callbacks import ReduceLROnPlateau

def lr_schedule(epoch):
    """Learning Rate Schedule

    Learning rate is scheduled to be reduced after 80, 120, 160, 180 epochs.
    Called automatically every epoch as part of callbacks during training.

    # Arguments
        epoch (int): The number of epochs

    # Returns
        lr (float32): learning rate
    """
    lr = 1e-3
    if epoch > 180:
        lr *= 0.5e-3
    elif epoch > 160:
        lr *= 1e-3
    elif epoch > 120:
        lr *= 1e-2
    elif epoch > 80:
        lr *= 1e-1
    print('Learning rate: ', lr)
    return lr

def resnet_layer(inputs,
                 num_filters=16,
                 kernel_size=3,
                 strides=1,
                 activation='relu',
                 batch_normalization=True,
                 conv_first=True):
    """2D Convolution-Batch Normalization-Activation stack builder

    # Arguments
        inputs (tensor): input tensor from input image or previous layer
        num_filters (int): Conv2D number of filters
        kernel_size (int): Conv2D square kernel dimensions
        strides (int): Conv2D square stride dimensions
        activation (string): activation name
        batch_normalization (bool): whether to include batch normalization
        conv_first (bool): conv-bn-activation (True) or bn-activation-conv (False)

    # Returns
        x (tensor): tensor as input to the next layer
    """
    conv = Conv2D(num_filters,
                  kernel_size=kernel_size,
                  strides=strides,
                  padding='same',
                  kernel_initializer='he_normal',
                  kernel_regularizer=l2(1e-4))

    x = inputs
    if conv_first:
        x = conv(x)
        if batch_normalization:
            x = BatchNormalization()(x)
        if activation is not None:
            x = Activation(activation)(x)
    else:
        if batch_normalization:
            x = BatchNormalization()(x)
        if activation is not None:
            x = Activation(activation)(x)
        x = conv(x)
    return x

def resnet_v2(input_shape, depth, num_classes=10):
    """ResNet Version 2 Model builder [b]

    Stacks of (1 x 1)-(3 x 3)-(1 x 1) BN-ReLU-Conv2D or also known as bottleneck layer
    
    First shortcut connection per layer is 1 x 1 Conv2D.
    
    Second and onwards shortcut connection is identity.
    
    At the beginning of each stage, the feature map size is halved (downsampled)
    by a convolutional layer with strides=2, while the number of filter maps is
    doubled. Within each stage, the layers have the same number filters and the
    same filter map sizes.
    
    Features maps sizes:
    conv1  : 32x32,  16
    stage 0: 32x32,  64
    stage 1: 16x16, 128
    stage 2:  8x8,  256

    # Arguments
        input_shape (tensor): shape of input image tensor
        depth (int): number of core convolutional layers
        num_classes (int): number of classes (CIFAR10 has 10)

    # Returns
        model (Model): Keras model instance
    """
    if (depth - 2) % 9 != 0:
        raise ValueError('depth should be 9n+2 (eg 56 or 110 in [b])')
    # Start model definition
    num_filters_in = 16
    num_res_blocks = int((depth - 2) / 9)

    inputs = Input(shape=input_shape)
    # v2 performs Conv2D with BN-ReLU on input before splitting into 2 paths
    x = resnet_layer(inputs=inputs,
                     num_filters=num_filters_in,
                     conv_first=True)

    # Instantiate the stack of residual units
    for stage in range(3):
        for res_block in range(num_res_blocks):
            activation = 'relu'
            batch_normalization = True
            strides = 1
            if stage == 0:
                num_filters_out = num_filters_in * 4
                if res_block == 0:  # first layer and first stage
                    activation = None
                    batch_normalization = False
            else:
                num_filters_out = num_filters_in * 2
                if res_block == 0:  # first layer but not first stage
                    strides = 2    # downsample

            # bottleneck residual unit
            y = resnet_layer(inputs=x,
                             num_filters=num_filters_in,
                             kernel_size=1,
                             strides=strides,
                             activation=activation,
                             batch_normalization=batch_normalization,
                             conv_first=False)
            y = resnet_layer(inputs=y,
                             num_filters=num_filters_in,
                             conv_first=False)
            y = resnet_layer(inputs=y,
                             num_filters=num_filters_out,
                             kernel_size=1,
                             conv_first=False)
            if res_block == 0:
                # linear projection residual shortcut connection to match
                # changed dims
                x = resnet_layer(inputs=x,
                                 num_filters=num_filters_out,
                                 kernel_size=1,
                                 strides=strides,
                                 activation=None,
                                 batch_normalization=False)
            x = tf.keras.layers.add([x, y])

        num_filters_in = num_filters_out

    # Add classifier on top.
    # v2 has BN-ReLU before Pooling
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = AveragePooling2D(pool_size=8)(x)
    y = Flatten()(x)
    outputs = Dense(num_classes,
                    activation='softmax',
                    kernel_initializer='he_normal')(y)

    # Instantiate model.
    model = Model(inputs=inputs, outputs=outputs)
    return model

In [17]:
from tensorflow.keras.utils import multi_gpu_model
depth = 3 * 9 + 2
input_shape = (img_dim[0], img_dim[1], 3)
model = resnet_v2(input_shape, depth)
# model = multi_gpu_model(model, gpus=2)

In [20]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
model.compile(optimizer=Adam(learning_rate=lr_schedule(0)),
              loss='categorical_crossentropy',
              metrics=['accuracy'])
# print(model.summary())

checkpoint = ModelCheckpoint(filepath='resnet_model_checkpoint.h5',
                             monitor='val_accuracy',
                             verbose=1,
                             save_best_only=True)

lr_scheduler = LearningRateScheduler(lr_schedule)

lr_reducer = ReduceLROnPlateau(factor=np.sqrt(0.1),
                               cooldown=0,
                               patience=5,
                               min_lr=0.5e-6)

callbacks = [checkpoint, lr_reducer, lr_scheduler]
model.fit(X_train, y_train, epochs=100, shuffle=True, validation_data=(X_test, y_test), callbacks=callbacks)

Learning rate:  0.001
Train on 3060 samples, validate on 382 samples
Learning rate:  0.001
Epoch 1/100
Epoch 00001: val_accuracy improved from -inf to 0.10471, saving model to resnet_model_checkpoint.h5
Learning rate:  0.001
Epoch 2/100
Epoch 00002: val_accuracy improved from 0.10471 to 0.24346, saving model to resnet_model_checkpoint.h5
Learning rate:  0.001
Epoch 3/100
Epoch 00003: val_accuracy improved from 0.24346 to 0.26963, saving model to resnet_model_checkpoint.h5
Learning rate:  0.001
Epoch 4/100
Epoch 00004: val_accuracy improved from 0.26963 to 0.33246, saving model to resnet_model_checkpoint.h5
Learning rate:  0.001
Epoch 5/100
Epoch 00005: val_accuracy improved from 0.33246 to 0.34031, saving model to resnet_model_checkpoint.h5
Learning rate:  0.001
Epoch 6/100
Epoch 00006: val_accuracy did not improve from 0.34031
Learning rate:  0.001
Epoch 7/100
Epoch 00007: val_accuracy improved from 0.34031 to 0.46073, saving model to resnet_model_checkpoint.h5
Learning rate:  0.001
E

<tensorflow.python.keras.callbacks.History at 0x7fae37fefa50>

In [21]:
test_loss, test_acc = model.evaluate(X_test,  y_test)
print('test accuracy:', test_acc)

test accuracy: 0.4450262
