In [4]:
from keras.models import Model, load_model
from keras.layers import Input, Lambda, Conv2D, MaxPooling2D, BatchNormalization, Dense, GlobalAveragePooling2D
from keras.layers import  Flatten, Reshape, Concatenate, Activation, Dropout
from keras.regularizers import l2
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, TensorBoard, LambdaCallback
from keras import backend as K
from keras.optimizers import Adam
from keras.preprocessing.image import ImageDataGenerator
from keras_ssd_loss import SSDLoss
from keras_layer_AnchorBoxes import AnchorBoxes
from keras_layer_L2Normalization import L2Normalization
from ssd_box_encode_decode_utils import SSDBoxEncoder, decode_y, decode_y2
from ssd_batch_generator import BatchGenerator

from keras.applications.mobilenet import MobileNet, relu6, DepthwiseConv2D

In [6]:
from keras.applications.mobilenet import _depthwise_conv_block, _conv_block

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import time
# from moviepy.editor import VideoFileClip
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import cv2
import glob

In [8]:
basemodel = MobileNet(include_top=False, weights='imagenet', input_shape = (128,128, 3))

In [9]:
basemodel.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 128, 128, 3)       0         
_________________________________________________________________
conv1 (Conv2D)               (None, 64, 64, 32)        864       
_________________________________________________________________
conv1_bn (BatchNormalization (None, 64, 64, 32)        128       
_________________________________________________________________
conv1_relu (Activation)      (None, 64, 64, 32)        0         
_________________________________________________________________
conv_dw_1 (DepthwiseConv2D)  (None, 64, 64, 32)        288       
_________________________________________________________________
conv_dw_1_bn (BatchNormaliza (None, 64, 64, 32)        128       
_________________________________________________________________
conv_dw_1_relu (Activation)  (None, 64, 64, 32)        0         
__________

In [23]:
basemodel.get_layer

<bound method Container.get_layer of <keras.engine.training.Model object at 0x7f6b2d8861d0>>

In [50]:
def ssd_mobilenet(image_shape, n_classes, l2_reg=0.0, scales = [0.08, 0.16, 0.32, 0.64, 0.96],
                  variances = np.array([1.0, 1.0, 1.0, 1.0]),
                aspect_ratios_per_layer=[[1.0, 2.0, 0.5],
                                         [1.0, 2.0, 0.5],
                                         [1.0, 2.0, 0.5],
                                         [1.0, 2.0, 0.5]]):
    n_predictor_layers = 4 # The number of predictor conv layers in the network
    n_classes += 1 # Account for the background class.
    # one box per aspect ratio, so for a predictor layer with 3 ratios then 3 boxes
    n_boxes = []
    scales = scales # for anchor boxes 
    variances = variances
    steps = [None] * n_predictor_layers
    offsets = [None] * n_predictor_layers
    
    for i in aspect_ratios_per_layer:
        n_boxes.append(len(i))
    
    basemodel = MobileNet(include_top=False, weights='imagenet', input_shape = (128,128, 3))
    
    img_height, img_width, img_channels = image_shape[0], image_shape[1], image_shape[2]
    x = Input(shape=image_shape)
    x1 = Lambda(lambda x: K.tf.image.resize_images(x, (128, 128)))(x)
    x1 = Lambda(lambda x: x - np.array(127.5), output_shape=(128, 128, 3),
                   name='input_mean_norm')(x)
    x1 = Lambda(lambda x: x/np.array(127.5), output_shape=(128, 128, 3),
                   name='input_scaler')(x1)
    
    base = basemodel(x1)
#     basemodel_compiled = Model(inputs=x, outputs=base)

    #predictor layers for classes and boxes
    classes7 = Conv2D(n_boxes[3] * n_classes, (3, 3), strides=(1, 1), padding="same", 
                      kernel_initializer='glorot_normal', kernel_regularizer=l2(l2_reg), 
                      name='classes7')(base)
    classes4 = Conv2D(n_boxes[0] * n_classes, (3, 3), strides=(1, 1), 
                      padding="same", kernel_initializer='glorot_normal', 
                      kernel_regularizer=l2(l2_reg), name='classes4')(basemodel.get_layer("conv_pw_10_relu").output)
    classes5 = Conv2D(n_boxes[1] * n_classes, (3, 3), strides=(1, 1), 
                      padding="same", kernel_initializer='glorot_normal', 
                      kernel_regularizer=l2(l2_reg), name='classes5')(basemodel.get_layer("conv_pw_11_relu").output)
    classes6 = Conv2D(n_boxes[2] * n_classes, (3, 3), strides=(1, 1), 
                      padding="same", kernel_initializer='glorot_normal', 
                      kernel_regularizer=l2(l2_reg), name='classes6')(basemodel.get_layer("conv_pw_12_relu").output)
    
    box7 = Conv2D(n_boxes[3] * 4, (3, 3), strides=(1, 1), padding="same", 
                  kernel_initializer='glorot_normal', kernel_regularizer=l2(l2_reg), 
                  name='box7')(base)
    # 4 coords per box -- xmin, xmax, ymun, ymax
    box4 = Conv2D(n_boxes[0] * 4, (3, 3), strides=(1, 1), padding="same", 
                  kernel_initializer='glorot_normal', kernel_regularizer=l2(l2_reg), 
                  name='box4')(basemodel.get_layer("conv_pw_10_relu").output)
    box5 = Conv2D(n_boxes[1] * 4, (3, 3), strides=(1, 1), padding="same", 
                  kernel_initializer='glorot_normal', kernel_regularizer=l2(l2_reg), 
                  name='box5')(basemodel.get_layer("conv_pw_11_relu").output)
    box6 = Conv2D(n_boxes[2] * 4, (3, 3), strides=(1, 1), padding="same", 
                  kernel_initializer='glorot_normal', kernel_regularizer=l2(l2_reg), 
                  name='box6')(basemodel.get_layer("conv_pw_12_relu").output)
    
    
    #use the custom anchorbox layer to generate predictions for boxes
    anchors4 = AnchorBoxes(img_height, img_width, this_scale=scales[0], next_scale=scales[1], aspect_ratios=aspect_ratios_per_layer[0],
                           two_boxes_for_ar1=False, this_steps=steps[0], this_offsets=offsets[0],
                           limit_boxes=False, variances=variances, coords='centroids', normalize_coords=False, name='anchors4')(box4)
    anchors5 = AnchorBoxes(img_height, img_width, this_scale=scales[1], next_scale=scales[2], aspect_ratios=aspect_ratios_per_layer[1],
                           two_boxes_for_ar1=False, this_steps=steps[1], this_offsets=offsets[1],
                           limit_boxes=False, variances=variances, coords='centroids', normalize_coords=False, name='anchors5')(box5)
    anchors6 = AnchorBoxes(img_height, img_width, this_scale=scales[2], next_scale=scales[3], aspect_ratios=aspect_ratios_per_layer[2],
                           two_boxes_for_ar1=False, this_steps=steps[2], this_offsets=offsets[2],
                           limit_boxes=False, variances=variances, coords='centroids', normalize_coords=False, name='anchors6')(box6)
    anchors7 = AnchorBoxes(img_height, img_width, this_scale=scales[3], next_scale=scales[4], aspect_ratios=aspect_ratios_per_layer[3],
                           two_boxes_for_ar1=False, this_steps=steps[3], this_offsets=offsets[3],
                           limit_boxes=False, variances=variances, coords='centroids', normalize_coords=False, name='anchors7')(box7)
    
    
    # Reshape the class predictors tensor into (batch, height * width * n_boxes, n_classes) => the class is now the last variable for predcition
    classes4_reshaped = Reshape((-1, n_classes), name='classes4_reshape')(classes4)
    classes5_reshaped = Reshape((-1, n_classes), name='classes5_reshape')(classes5)
    classes6_reshaped = Reshape((-1, n_classes), name='classes6_reshape')(classes6)
    classes7_reshaped = Reshape((-1, n_classes), name='classes7_reshape')(classes7)
    
    # Reshape the box coords predictions, into tensors of shape (batch, height * width * n_boxes, 4) => box positions last for loss
    # We want the four box coordinates isolated in the last axis to compute the smooth L1 loss
    boxes4_reshaped = Reshape((-1, 4), name='boxes4_reshape')(box4)
    boxes5_reshaped = Reshape((-1, 4), name='boxes5_reshape')(box5)
    boxes6_reshaped = Reshape((-1, 4), name='boxes6_reshape')(box6)
    boxes7_reshaped = Reshape((-1, 4), name='boxes7_reshape')(box7)
    # Reshape the anchor box tensors, into tensors of shape (batch, height * width * n_boxes, 8)
    anchors4_reshaped = Reshape((-1, 8), name='anchors4_reshape')(anchors4)
    anchors5_reshaped = Reshape((-1, 8), name='anchors5_reshape')(anchors5)
    anchors6_reshaped = Reshape((-1, 8), name='anchors6_reshape')(anchors6)
    anchors7_reshaped = Reshape((-1, 8), name='anchors7_reshape')(anchors7)
    #concat along the middle axis, since want to keep the batch and n_classes/8 untouched
    class_concat = Concatenate(axis=1, name='concatenate_classes')([classes4_reshaped, 
                                classes5_reshaped,classes6_reshaped, 
                                classes7_reshaped])
    boxes_concat = Concatenate(axis=1, name='concatenate_boxes')([boxes4_reshaped,
                                                             boxes5_reshaped,
                                                             boxes6_reshaped,
                                                             boxes7_reshaped])

    # Output shape of `anchors_final`: (batch, n_boxes_total, 8)
    anchors_concat = Concatenate(axis=1, name='concatenate_anchors')([anchors4_reshaped,
                                                                 anchors5_reshaped,
                                                                 anchors6_reshaped,
                                                                 anchors7_reshaped])
    
    classification_softmax = Activation('softmax', name='classes_softamx')(class_concat)
    
    prediction = Concatenate(axis=2,name='concatenate_output')([classification_softmax,
                                                              boxes_concat,
                                                              anchors_concat ])
    
    model = Model(inputs=x, outputs=prediction)
    
    return model
    

In [4]:
train_dataset = BatchGenerator(box_output_format=['class_id', 'xmin', 'ymin', 'xmax', 'ymax'])
val_dataset = BatchGenerator(box_output_format=['class_id', 'xmin', 'ymin', 'xmax', 'ymax'])

# Training dataset
train_images_dir      = 'udacity_driving_datasets/'
train_labels_filename = 'udacity_driving_datasets/train_labels.csv'

# Validation dataset
val_images_dir      = 'udacity_driving_datasets/'
val_labels_filename = 'udacity_driving_datasets/val_labels.csv'

train_dataset.parse_csv(images_dir=train_images_dir,
                        labels_filename=train_labels_filename,
                        input_format=['image_name', 'xmin', 'xmax', 'ymin', 'ymax', 'class_id'], # This is the order of the first six columns in the CSV file that contains the labels for your dataset. If your labels are in XML format, maybe the XML parser will be helpful, check the documentation.
                        include_classes='all')

val_dataset.parse_csv(images_dir=val_images_dir,
                      labels_filename=val_labels_filename,
                      input_format=['image_name', 'xmin', 'xmax', 'ymin', 'ymax', 'class_id'],
                      include_classes='all')

In [51]:
K.clear_session() # Clear previous models from memory.

model = ssd_mobilenet((300, 480, 3), 5, l2_reg=0.0, 
                aspect_ratios_per_layer=[[1.0, 2.0, 0.5],
                                         [1.0, 2.0, 0.5],
                                         [1.0, 2.0, 0.5],
                                         [1.0, 2.0, 0.5]])

# 3: Instantiate an Adam optimizer and the SSD loss function and compile the model

adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=5e-04)

# with negative hard mining as per paper
ssd_loss = SSDLoss(neg_pos_ratio=3, n_neg_min=0, alpha=1.0)

model.compile(optimizer=adam, loss=ssd_loss.compute_loss)

RuntimeError: Graph disconnected: cannot obtain value for tensor Tensor("input_1:0", shape=(?, 128, 128, 3), dtype=float32) at layer "input_1". The following previous layers were accessed without issue: []

In [11]:
basemodel.get_layer('conv_pw_13').output

<tf.Tensor 'conv_pw_13_1/convolution:0' shape=(?, 4, 4, 1024) dtype=float32>

In [52]:
K.clear_session()
x = Input(shape=(300, 480,3))
x1 = Lambda(lambda x: K.tf.image.resize_images(x, (128, 128)))(x)
x1 = Lambda(lambda x: x - np.array(127.5), output_shape=(128, 128, 3),
               name='input_mean_norm')(x)
x1 = Lambda(lambda x: x/np.array(127.5), output_shape=(128, 128, 3),
               name='input_scaler')(x1)

In [53]:

basemodel = MobileNet(include_top=False, weights='imagenet', input_shape = (128,128, 3))
base = basemodel(x1)


    #predictor layers for classes and boxes
    


In [54]:
n_boxes = [4,4,4,4]
n_classes = 5
l2_reg=0.0
classes7 = Conv2D(n_boxes[3] * n_classes, (3, 3), strides=(1, 1), padding="same", 
                      kernel_initializer='glorot_normal', kernel_regularizer=l2(l2_reg), 
                      name='classes7')(base)
classes4 = Conv2D(n_boxes[0] * n_classes, (3, 3), strides=(1, 1), 
                  padding="same", kernel_initializer='glorot_normal', 
                  kernel_regularizer=l2(l2_reg), name='classes4')(basemodel.get_layer("conv_pw_10_relu").output)
classes5 = Conv2D(n_boxes[1] * n_classes, (3, 3), strides=(1, 1), 
                  padding="same", kernel_initializer='glorot_normal', 
                  kernel_regularizer=l2(l2_reg), name='classes5')(basemodel.get_layer("conv_pw_11_relu").output)
classes6 = Conv2D(n_boxes[2] * n_classes, (3, 3), strides=(1, 1), 
                  padding="same", kernel_initializer='glorot_normal', 
                  kernel_regularizer=l2(l2_reg), name='classes6')(basemodel.get_layer("conv_pw_12_relu").output)


In [None]:
model = Model(inputs=x, outputs=prediction)
adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=5e-04)

# with negative hard mining as per paper
ssd_loss = SSDLoss(neg_pos_ratio=3, n_neg_min=0, alpha=1.0)

model.compile(optimizer=adam, loss=ssd_loss.compute_loss)

In [38]:
# classes4 = Conv2D(n_boxes[0] * n_classes, (3, 3), strides=(1, 1), 
#                       padding="same", kernel_initializer='glorot_normal', 
#                       kernel_regularizer=l2(l2_reg), name='classes4')(basemodel_compiled.get_layer("conv_pw_10_relu").output)
basemodel_compiled.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 300, 480, 3)       0         
_________________________________________________________________
input_mean_norm (Lambda)     (None, 128, 128, 3)       0         
_________________________________________________________________
input_scaler (Lambda)        (None, 128, 128, 3)       0         
_________________________________________________________________
mobilenet_1.00_128 (Model)   (None, 4, 4, 1024)        3228864   
Total params: 3,228,864
Trainable params: 3,206,976
Non-trainable params: 21,888
_________________________________________________________________


In [13]:
def MobileNet(input_shape=None,
              alpha=1.0,
              depth_multiplier=1,
              dropout=1e-3,
              include_top=True,
              weights='imagenet',
              input_tensor=None,
              pooling=None,
              classes=1000):
    """Instantiates the MobileNet architecture.
    Note that only TensorFlow is supported for now,
    therefore it only works with the data format
    `image_data_format='channels_last'` in your Keras config
    at `~/.keras/keras.json`.
    To load a MobileNet model via `load_model`, import the custom
    objects `relu6` and `DepthwiseConv2D` and pass them to the
    `custom_objects` parameter.
    E.g.
    model = load_model('mobilenet.h5', custom_objects={
                       'relu6': mobilenet.relu6,
                       'DepthwiseConv2D': mobilenet.DepthwiseConv2D})
    # Arguments
        input_shape: optional shape tuple, only to be specified
            if `include_top` is False (otherwise the input shape
            has to be `(224, 224, 3)` (with `channels_last` data format)
            or (3, 224, 224) (with `channels_first` data format).
            It should have exactly 3 inputs channels,
            and width and height should be no smaller than 32.
            E.g. `(200, 200, 3)` would be one valid value.
        alpha: controls the width of the network.
            - If `alpha` < 1.0, proportionally decreases the number
                of filters in each layer.
            - If `alpha` > 1.0, proportionally increases the number
                of filters in each layer.
            - If `alpha` = 1, default number of filters from the paper
                 are used at each layer.
        depth_multiplier: depth multiplier for depthwise convolution
            (also called the resolution multiplier)
        dropout: dropout rate
        include_top: whether to include the fully-connected
            layer at the top of the network.
        weights: one of `None` (random initialization),
              'imagenet' (pre-training on ImageNet),
              or the path to the weights file to be loaded.
        input_tensor: optional Keras tensor (i.e. output of
            `layers.Input()`)
            to use as image input for the model.
        pooling: Optional pooling mode for feature extraction
            when `include_top` is `False`.
            - `None` means that the output of the model
                will be the 4D tensor output of the
                last convolutional layer.
            - `avg` means that global average pooling
                will be applied to the output of the
                last convolutional layer, and thus
                the output of the model will be a
                2D tensor.
            - `max` means that global max pooling will
                be applied.
        classes: optional number of classes to classify images
            into, only to be specified if `include_top` is True, and
            if no `weights` argument is specified.
    # Returns
        A Keras model instance.
    # Raises
        ValueError: in case of invalid argument for `weights`,
            or invalid input shape.
        RuntimeError: If attempting to run this model with a
            backend that does not support separable convolutions.
    """
    if not (weights in {'imagenet', None} or os.path.exists(weights)):
        raise ValueError('The `weights` argument should be either '
                         '`None` (random initialization), `imagenet` '
                         '(pre-training on ImageNet), '
                         'or the path to the weights file to be loaded.')

    if weights == 'imagenet' and include_top and classes != 1000:
        raise ValueError('If using `weights` as ImageNet with `include_top` '
                         'as true, `classes` should be 1000')

    # Determine proper input shape and default size.
    if input_shape is None:
        default_size = 224
    else:
        if K.image_data_format() == 'channels_first':
            rows = input_shape[1]
            cols = input_shape[2]
        else:
            rows = input_shape[0]
            cols = input_shape[1]

        if rows == cols and rows in [128, 160, 192, 224]:
            default_size = rows
        else:
            default_size = 224

    input_shape = _obtain_input_shape(input_shape,
                                      default_size=default_size,
                                      min_size=32,
                                      data_format=K.image_data_format(),
                                      require_flatten=include_top,
                                      weights=weights)

    if K.image_data_format() == 'channels_last':
        row_axis, col_axis = (0, 1)
    else:
        row_axis, col_axis = (1, 2)
    rows = input_shape[row_axis]
    cols = input_shape[col_axis]

    if weights == 'imagenet':
        if depth_multiplier != 1:
            raise ValueError('If imagenet weights are being loaded, '
                             'depth multiplier must be 1')

        if alpha not in [0.25, 0.50, 0.75, 1.0]:
            raise ValueError('If imagenet weights are being loaded, '
                             'alpha can be one of'
                             '`0.25`, `0.50`, `0.75` or `1.0` only.')

        if rows != cols or rows not in [128, 160, 192, 224]:
            raise ValueError('If imagenet weights are being loaded, '
                             'input must have a static square shape (one of '
                             '(128,128), (160,160), (192,192), or (224, 224)).'
                             ' Input shape provided = %s' % (input_shape,))

    if K.image_data_format() != 'channels_last':
        warnings.warn('The MobileNet family of models is only available '
                      'for the input data format "channels_last" '
                      '(width, height, channels). '
                      'However your settings specify the default '
                      'data format "channels_first" (channels, width, height).'
                      ' You should set `image_data_format="channels_last"` '
                      'in your Keras config located at ~/.keras/keras.json. '
                      'The model being returned right now will expect inputs '
                      'to follow the "channels_last" data format.')
        K.set_image_data_format('channels_last')
        old_data_format = 'channels_first'
    else:
        old_data_format = None

    if input_tensor is None:
        img_input = Input(shape=input_shape)
    else:
        if not K.is_keras_tensor(input_tensor):
            img_input = Input(tensor=input_tensor, shape=input_shape)
        else:
            img_input = input_tensor

    x = _conv_block(img_input, 32, alpha, strides=(2, 2))
    x = _depthwise_conv_block(x, 64, alpha, depth_multiplier, block_id=1)

    x = _depthwise_conv_block(x, 128, alpha, depth_multiplier,
                              strides=(2, 2), block_id=2)
    x = _depthwise_conv_block(x, 128, alpha, depth_multiplier, block_id=3)

    x = _depthwise_conv_block(x, 256, alpha, depth_multiplier,
                              strides=(2, 2), block_id=4)
    x = _depthwise_conv_block(x, 256, alpha, depth_multiplier, block_id=5)

    x = _depthwise_conv_block(x, 512, alpha, depth_multiplier,
                              strides=(2, 2), block_id=6)
    x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=7)
    x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=8)
    x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=9)
    x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=10)
    x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=11)

    x = _depthwise_conv_block(x, 1024, alpha, depth_multiplier,
                              strides=(2, 2), block_id=12)
    x = _depthwise_conv_block(x, 1024, alpha, depth_multiplier, block_id=13)

    if include_top:
        if K.image_data_format() == 'channels_first':
            shape = (int(1024 * alpha), 1, 1)
        else:
            shape = (1, 1, int(1024 * alpha))

        x = GlobalAveragePooling2D()(x)
        x = Reshape(shape, name='reshape_1')(x)
        x = Dropout(dropout, name='dropout')(x)
        x = Conv2D(classes, (1, 1),
                   padding='same', name='conv_preds')(x)
        x = Activation('softmax', name='act_softmax')(x)
        x = Reshape((classes,), name='reshape_2')(x)
    else:
        if pooling == 'avg':
            x = GlobalAveragePooling2D()(x)
        elif pooling == 'max':
            x = GlobalMaxPooling2D()(x)

    # Ensure that the model takes into account
    # any potential predecessors of `input_tensor`.
    if input_tensor is not None:
        inputs = get_source_inputs(input_tensor)
    else:
        inputs = img_input

    # Create model.
    model = Model(inputs, x, name='mobilenet_%0.2f_%s' % (alpha, rows))

    # load weights
    if weights == 'imagenet':
        if K.image_data_format() == 'channels_first':
            raise ValueError('Weights for "channels_last" format '
                             'are not available.')
        if alpha == 1.0:
            alpha_text = '1_0'
        elif alpha == 0.75:
            alpha_text = '7_5'
        elif alpha == 0.50:
            alpha_text = '5_0'
        else:
            alpha_text = '2_5'

        if include_top:
            model_name = 'mobilenet_%s_%d_tf.h5' % (alpha_text, rows)
            weigh_path = BASE_WEIGHT_PATH + model_name
            weights_path = get_file(model_name,
                                    weigh_path,
                                    cache_subdir='models')
        else:
            model_name = 'mobilenet_%s_%d_tf_no_top.h5' % (alpha_text, rows)
            weigh_path = BASE_WEIGHT_PATH + model_name
            weights_path = get_file(model_name,
                                    weigh_path,
                                    cache_subdir='models')
        model.load_weights(weights_path)
    elif weights is not None:
        model.load_weights(weights)

    if old_data_format:
        K.set_image_data_format(old_data_format)
    return model


def _conv_block(inputs, filters, alpha, kernel=(3, 3), strides=(1, 1)):
    """Adds an initial convolution layer (with batch normalization and relu6).
    # Arguments
        inputs: Input tensor of shape `(rows, cols, 3)`
            (with `channels_last` data format) or
            (3, rows, cols) (with `channels_first` data format).
            It should have exactly 3 inputs channels,
            and width and height should be no smaller than 32.
            E.g. `(224, 224, 3)` would be one valid value.
        filters: Integer, the dimensionality of the output space
            (i.e. the number of output filters in the convolution).
        alpha: controls the width of the network.
            - If `alpha` < 1.0, proportionally decreases the number
                of filters in each layer.
            - If `alpha` > 1.0, proportionally increases the number
                of filters in each layer.
            - If `alpha` = 1, default number of filters from the paper
                 are used at each layer.
        kernel: An integer or tuple/list of 2 integers, specifying the
            width and height of the 2D convolution window.
            Can be a single integer to specify the same value for
            all spatial dimensions.
        strides: An integer or tuple/list of 2 integers,
            specifying the strides of the convolution along the width and height.
            Can be a single integer to specify the same value for
            all spatial dimensions.
            Specifying any stride value != 1 is incompatible with specifying
            any `dilation_rate` value != 1.
    # Input shape
        4D tensor with shape:
        `(samples, channels, rows, cols)` if data_format='channels_first'
        or 4D tensor with shape:
        `(samples, rows, cols, channels)` if data_format='channels_last'.
    # Output shape
        4D tensor with shape:
        `(samples, filters, new_rows, new_cols)` if data_format='channels_first'
        or 4D tensor with shape:
        `(samples, new_rows, new_cols, filters)` if data_format='channels_last'.
        `rows` and `cols` values might have changed due to stride.
    # Returns
        Output tensor of block.
    """
    channel_axis = 1 if K.image_data_format() == 'channels_first' else -1
    filters = int(filters * alpha)
    x = Conv2D(filters, kernel,
               padding='same',
               use_bias=False,
               strides=strides,
               name='conv1')(inputs)
    x = BatchNormalization(axis=channel_axis, name='conv1_bn')(x)
    return Activation(relu6, name='conv1_relu')(x)


def _depthwise_conv_block(inputs, pointwise_conv_filters, alpha,
                          depth_multiplier=1, strides=(1, 1), block_id=1):
    """Adds a depthwise convolution block.
    A depthwise convolution block consists of a depthwise conv,
    batch normalization, relu6, pointwise convolution,
    batch normalization and relu6 activation.
    # Arguments
        inputs: Input tensor of shape `(rows, cols, channels)`
            (with `channels_last` data format) or
            (channels, rows, cols) (with `channels_first` data format).
        pointwise_conv_filters: Integer, the dimensionality of the output space
            (i.e. the number of output filters in the pointwise convolution).
        alpha: controls the width of the network.
            - If `alpha` < 1.0, proportionally decreases the number
                of filters in each layer.
            - If `alpha` > 1.0, proportionally increases the number
                of filters in each layer.
            - If `alpha` = 1, default number of filters from the paper
                 are used at each layer.
        depth_multiplier: The number of depthwise convolution output channels
            for each input channel.
            The total number of depthwise convolution output
            channels will be equal to `filters_in * depth_multiplier`.
        strides: An integer or tuple/list of 2 integers,
            specifying the strides of the convolution along the width and height.
            Can be a single integer to specify the same value for
            all spatial dimensions.
            Specifying any stride value != 1 is incompatible with specifying
            any `dilation_rate` value != 1.
        block_id: Integer, a unique identification designating the block number.
    # Input shape
        4D tensor with shape:
        `(batch, channels, rows, cols)` if data_format='channels_first'
        or 4D tensor with shape:
        `(batch, rows, cols, channels)` if data_format='channels_last'.
    # Output shape
        4D tensor with shape:
        `(batch, filters, new_rows, new_cols)` if data_format='channels_first'
        or 4D tensor with shape:
        `(batch, new_rows, new_cols, filters)` if data_format='channels_last'.
        `rows` and `cols` values might have changed due to stride.
    # Returns
        Output tensor of block.
    """
    channel_axis = 1 if K.image_data_format() == 'channels_first' else -1
    pointwise_conv_filters = int(pointwise_conv_filters * alpha)

    x = DepthwiseConv2D((3, 3),
                        padding='same',
                        depth_multiplier=depth_multiplier,
                        strides=strides,
                        use_bias=False,
                        name='conv_dw_%d' % block_id)(inputs)
    x = BatchNormalization(axis=channel_axis, name='conv_dw_%d_bn' % block_id)(x)
    x = Activation(relu6, name='conv_dw_%d_relu' % block_id)(x)

    x = Conv2D(pointwise_conv_filters, (1, 1),
               padding='same',
               use_bias=False,
               strides=(1, 1),
               name='conv_pw_%d' % block_id)(x)
    x = BatchNormalization(axis=channel_axis, name='conv_pw_%d_bn' % block_id)(x)
    return Activation(relu6, name='conv_pw_%d_relu' % block_id)(x)

In [18]:
def mobilenet_model(image_shape, n_classes, input_shape=(128,128,3), l2_reg=0.0, 
                    scales = [0.08, 0.16, 0.32, 0.64, 0.96],
                  variances = np.array([1.0, 1.0, 1.0, 1.0]),
                aspect_ratios_per_layer=[[1.0, 2.0, 0.5],
                                         [1.0, 2.0, 0.5],
                                         [1.0, 2.0, 0.5],
                                         [1.0, 2.0, 0.5]]):
    n_predictor_layers = 4 # The number of predictor conv layers in the network
    n_classes += 1 # Account for the background class.
    # one box per aspect ratio, so for a predictor layer with 3 ratios then 3 boxes
    n_boxes = []
    scales = scales # for anchor boxes 
    variances = variances
    steps = [None] * n_predictor_layers
    offsets = [None] * n_predictor_layers
    
    for i in aspect_ratios_per_layer:
        n_boxes.append(len(i))
        
    alpha=1.0
    depth_multiplier=1
    dropout=1e-3
    
    img_height = input_shape[0]
    img_width = input_shape[1]
        
    x = Input(shape=image_shape)
    x1 = Lambda(lambda x: K.tf.image.resize_images(x, (input_shape[0], input_shape[1])))(x)
    x1 = Lambda(lambda x: x - np.array(127.5), output_shape=(128, 128, 3),
                   name='input_mean_norm')(x)
    x1 = Lambda(lambda x: x/np.array(127.5), output_shape=(128, 128, 3),
                   name='input_scaler')(x1)
    
    conv = _conv_block(x1, 32, alpha, strides=(2, 2))
    d1 = _depthwise_conv_block(conv, 64, alpha, depth_multiplier, block_id=1)

    d2 = _depthwise_conv_block(d1, 128, alpha, depth_multiplier,
                              strides=(2, 2), block_id=2)
    d3 = _depthwise_conv_block(d2, 128, alpha, depth_multiplier, block_id=3)

    d4 = _depthwise_conv_block(d3, 256, alpha, depth_multiplier,
                              strides=(2, 2), block_id=4)
    d5 = _depthwise_conv_block(d4, 256, alpha, depth_multiplier, block_id=5)

    d6 = _depthwise_conv_block(d5, 512, alpha, depth_multiplier,
                              strides=(2, 2), block_id=6)
    d7 = _depthwise_conv_block(d6, 512, alpha, depth_multiplier, block_id=7)
    d8 = _depthwise_conv_block(d7, 512, alpha, depth_multiplier, block_id=8)
    d9 = _depthwise_conv_block(d8, 512, alpha, depth_multiplier, block_id=9)
    d10 = _depthwise_conv_block(d9, 512, alpha, depth_multiplier, block_id=10)
    d11 = _depthwise_conv_block(d10, 512, alpha, depth_multiplier, block_id=11)

    d12 = _depthwise_conv_block(d11, 1024, alpha, depth_multiplier,
                              strides=(2, 2), block_id=12)
    d13 = _depthwise_conv_block(d12, 1024, alpha, depth_multiplier, block_id=13)
    
    
    classes4 = Conv2D(n_boxes[0] * n_classes, (3, 3), strides=(1, 1), 
                      padding="same", kernel_initializer='glorot_normal', 
                      kernel_regularizer=l2(l2_reg), name='classes4')(d10)
    classes5 = Conv2D(n_boxes[1] * n_classes, (3, 3), strides=(1, 1), 
                      padding="same", kernel_initializer='glorot_normal', 
                      kernel_regularizer=l2(l2_reg), name='classes5')(d11)
    classes6 = Conv2D(n_boxes[2] * n_classes, (3, 3), strides=(1, 1), 
                      padding="same", kernel_initializer='glorot_normal', 
                      kernel_regularizer=l2(l2_reg), name='classes6')(d12)
    classes7 = Conv2D(n_boxes[3] * n_classes, (3, 3), strides=(1, 1), padding="same", 
                      kernel_initializer='glorot_normal', kernel_regularizer=l2(l2_reg), 
                      name='classes7')(d13)
    
    
    # 4 coords per box -- xmin, xmax, ymun, ymax
    box4 = Conv2D(n_boxes[0] * 4, (3, 3), strides=(1, 1), padding="same", 
                  kernel_initializer='glorot_normal', kernel_regularizer=l2(l2_reg), 
                  name='box4')(d10)
    box5 = Conv2D(n_boxes[1] * 4, (3, 3), strides=(1, 1), padding="same", 
                  kernel_initializer='glorot_normal', kernel_regularizer=l2(l2_reg), 
                  name='box5')(d11)
    box6 = Conv2D(n_boxes[2] * 4, (3, 3), strides=(1, 1), padding="same", 
                  kernel_initializer='glorot_normal', kernel_regularizer=l2(l2_reg), 
                  name='box6')(d12)
    box7 = Conv2D(n_boxes[3] * 4, (3, 3), strides=(1, 1), padding="same", 
                  kernel_initializer='glorot_normal', kernel_regularizer=l2(l2_reg), 
                  name='box7')(d13)
    
    
    #use the custom anchorbox layer to generate predictions for boxes
    anchors4 = AnchorBoxes(img_height, img_width, this_scale=scales[0], next_scale=scales[1], aspect_ratios=aspect_ratios_per_layer[0],
                           two_boxes_for_ar1=False, this_steps=steps[0], this_offsets=offsets[0],
                           limit_boxes=False, variances=variances, coords='centroids', 
                           normalize_coords=False, name='anchors4')(box4)
    anchors5 = AnchorBoxes(img_height, img_width, this_scale=scales[1], next_scale=scales[2], aspect_ratios=aspect_ratios_per_layer[1],
                           two_boxes_for_ar1=False, this_steps=steps[1], this_offsets=offsets[1],
                           limit_boxes=False, variances=variances, coords='centroids', normalize_coords=False, name='anchors5')(box5)
    anchors6 = AnchorBoxes(img_height, img_width, this_scale=scales[2], next_scale=scales[3], aspect_ratios=aspect_ratios_per_layer[2],
                           two_boxes_for_ar1=False, this_steps=steps[2], this_offsets=offsets[2],
                           limit_boxes=False, variances=variances, coords='centroids', normalize_coords=False, name='anchors6')(box6)
    anchors7 = AnchorBoxes(img_height, img_width, this_scale=scales[3], next_scale=scales[4], aspect_ratios=aspect_ratios_per_layer[3],
                           two_boxes_for_ar1=False, this_steps=steps[3], this_offsets=offsets[3],
                           limit_boxes=False, variances=variances, coords='centroids', normalize_coords=False, name='anchors7')(box7)
    
    
    # Reshape the class predictors tensor into (batch, height * width * n_boxes, n_classes) => the class is now the last variable for predcition
    classes4_reshaped = Reshape((-1, n_classes), name='classes4_reshape')(classes4)
    classes5_reshaped = Reshape((-1, n_classes), name='classes5_reshape')(classes5)
    classes6_reshaped = Reshape((-1, n_classes), name='classes6_reshape')(classes6)
    classes7_reshaped = Reshape((-1, n_classes), name='classes7_reshape')(classes7)
    
    # Reshape the box coords predictions, into tensors of shape (batch, height * width * n_boxes, 4) => box positions last for loss
    # We want the four box coordinates isolated in the last axis to compute the smooth L1 loss
    boxes4_reshaped = Reshape((-1, 4), name='boxes4_reshape')(box4)
    boxes5_reshaped = Reshape((-1, 4), name='boxes5_reshape')(box5)
    boxes6_reshaped = Reshape((-1, 4), name='boxes6_reshape')(box6)
    boxes7_reshaped = Reshape((-1, 4), name='boxes7_reshape')(box7)
    # Reshape the anchor box tensors, into tensors of shape (batch, height * width * n_boxes, 8)
    anchors4_reshaped = Reshape((-1, 8), name='anchors4_reshape')(anchors4)
    anchors5_reshaped = Reshape((-1, 8), name='anchors5_reshape')(anchors5)
    anchors6_reshaped = Reshape((-1, 8), name='anchors6_reshape')(anchors6)
    anchors7_reshaped = Reshape((-1, 8), name='anchors7_reshape')(anchors7)
    #concat along the middle axis, since want to keep the batch and n_classes/8 untouched
    class_concat = Concatenate(axis=1, name='concatenate_classes')([classes4_reshaped, 
                                classes5_reshaped,classes6_reshaped, 
                                classes7_reshaped])
    boxes_concat = Concatenate(axis=1, name='concatenate_boxes')([boxes4_reshaped,
                                                             boxes5_reshaped,
                                                             boxes6_reshaped,
                                                             boxes7_reshaped])

    # Output shape of `anchors_final`: (batch, n_boxes_total, 8)
    anchors_concat = Concatenate(axis=1, name='concatenate_anchors')([anchors4_reshaped,
                                                                 anchors5_reshaped,
                                                                 anchors6_reshaped,
                                                                 anchors7_reshaped])
    
    classification_softmax = Activation('softmax', name='classes_softamx')(class_concat)
    
    prediction = Concatenate(axis=2,name='concatenate_output')([classification_softmax,
                                                              boxes_concat,
                                                              anchors_concat ])
    
    model = Model(inputs=x, outputs=prediction)
    
    return model
    
    
    

In [19]:
K.clear_session() # Clear previous models from memory.

model = mobilenet_model((300, 480, 3), 5, l2_reg=0.005, 
                aspect_ratios_per_layer=[[1.0, 2.0, 0.5],
                                         [1.0, 2.0, 0.5],
                                         [1.0, 2.0, 0.5],
                                         [1.0, 2.0, 0.5]])

# 3: Instantiate an Adam optimizer and the SSD loss function and compile the model

adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=5e-04)

# with negative hard mining as per paper
ssd_loss = SSDLoss(neg_pos_ratio=3, n_neg_min=0, alpha=1.0)

model.compile(optimizer=adam, loss=ssd_loss.compute_loss)