# Network Architecture for object detection
This file builds a neural network that processes images to find information about the behavior of the objects in them: not only their classes, but also their locations.  
First, import the packages we use.

In [None]:
import import_ipynb
from keras.layers import Input, Add, LeakyReLU, Activation, concatenate, BatchNormalization, Flatten, Conv2D, AveragePooling2D, MaxPooling2D, GlobalMaxPooling2D
from keras.models import Model, load_model
import xml.etree.ElementTree as ET
from keras.preprocessing.image import load_img, img_to_array, array_to_img
from keras.optimizers import Adam
from keras.initializers import glorot_uniform
#%matplotlib inline
import Util_V1 as U
import os
from keras.utils.np_utils import to_categorical
import keras.backend as K
K.set_image_data_format('channels_last')
K.set_learning_phase(1)
from datetime import datetime
import sys
import numpy as np

## The intuition behind inception architecture
As in the picture below, a single input is passed along four different paths to generate feature maps. This implies that, instead of using only one filter to extract features, a combination of four different feature extractors can uncover the internal information from four different views. This is more stable and reliable than relying on a single filter.  
<img src="figures/Inception_Module_self.png"
     alt="Inception module diagram"
     style="float: left; margin-right: 5px;" />

In [1]:
def inception_block(X, filters, stage, block):
    """
    Build an Inception-style block: four parallel branches over the same input,
    concatenated along the channel axis.

    Branches (every convolution is followed by batch normalization and ReLU):
      a -- 1x1 conv (F1 filters)
      b -- 1x1 conv (F2 filters) -> 3x3 conv (F2 filters)
      c -- 1x1 conv (F3 filters) -> 5x5 conv (F3 filters)
      d -- 3x3 max-pool, stride 1 -> 1x1 conv (F4 filters)

    All strides are 1 and the 3x3 / 5x5 / pooling layers use 'same' padding, so
    the spatial size of the input is preserved.

    Arguments:
    X -- input tensor in channels-last layout, i.e. (m, n_H, n_W, n_C)
    filters -- sequence of four integers (F1, F2, F3, F4), the number of filters
               used by branches a, b, c and d respectively
    stage -- accepted for interface compatibility; it is NOT used in the layer
             names
    block -- string prefix for the layer names ('<block>_branchConv1a', ...).
             Because `stage` is not part of the names, each call inside one
             model needs a distinct `block` to keep layer names unique.

    Returns:
    X -- tensor of shape (m, n_H, n_W, F1 + F2 + F3 + F4)
    """
    name_base = block + '_branch'
    # The file configures Keras for 'channels_last' and the branches are
    # concatenated on the last axis, so batch normalization must normalize that
    # same (feature) axis. The previous axis=1 normalized over image rows.
    # NOTE: this changes the BatchNormalization weight shapes, so checkpoints
    # saved from the axis=1 graph will not load into this one.
    channel_axis = -1
    F1, F2, F3, F4 = filters

    def conv_bn_relu(tensor, n_filters, kernel_size, padding, suffix):
        """Apply Conv2D (stride 1) -> BatchNormalization -> ReLU to `tensor`."""
        tensor = Conv2D(filters=n_filters, kernel_size=kernel_size, padding=padding, strides=(1, 1),
                        name=name_base + 'Conv' + suffix,
                        kernel_initializer=glorot_uniform(seed=0))(tensor)
        tensor = BatchNormalization(axis=channel_axis, name=name_base + 'Batch' + suffix)(tensor)
        return Activation('relu')(tensor)

    # Branch a: plain 1x1 convolution.
    X1 = conv_bn_relu(X, F1, (1, 1), 'valid', '1a')

    # Branch b: 1x1 convolution followed by a 3x3 convolution.
    X2 = conv_bn_relu(X, F2, (1, 1), 'valid', '1b')
    X2 = conv_bn_relu(X2, F2, (3, 3), 'same', '2b')

    # Branch c: 1x1 convolution followed by a 5x5 convolution.
    X3 = conv_bn_relu(X, F3, (1, 1), 'valid', '1c')
    X3 = conv_bn_relu(X3, F3, (5, 5), 'same', '2c')

    # Branch d: size-preserving max-pool followed by a 1x1 convolution.
    X4 = MaxPooling2D(pool_size=(3, 3), strides=(1, 1), padding='same')(X)
    X4 = conv_bn_relu(X4, F4, (1, 1), 'valid', '1d')

    return concatenate([X1, X2, X3, X4], axis=channel_axis)

## The intuition behind ResNet
We put two shortcuts in this architecture. The reason is not only that we hope to avoid the vanishing-gradient problem, but also that, when a shortcut links an earlier layer to a later one, the earlier layer can take a big stride toward the optimal point rather than a small step. That reduces the cost in time and computation.  
 
<img src="figures/Resnet.PNG"
     alt="ResNet shortcut connection diagram"
     style="float: left; margin-right: 5px;" />    
       
For example, in backpropagation, the gradient at layer i passes directly to layer 1. This results in large progress at layer 1 and speeds up its convergence to an optimal state. Therefore, compared with conventional linear (layer-by-layer) backpropagation, we believe that adding shortcuts is a good way to accelerate the training process.

In [None]:
def network_architecture(input_data):
    """
    Build the 'SimpleYOLO' network: a convolutional stem, one inception block
    and two down-sampling stages, each merged with a strided-convolution
    shortcut taken from an earlier layer.

    Arguments:
    input_data -- shape tuple of a single input image (batch dimension
                  excluded), forwarded to keras `Input`. The size comments
                  below assume a 640x480 channels-last image; the strided
                  shortcuts only line up with the main path for input sizes
                  that yield matching feature-map sizes.

    Returns:
    model -- a Keras Model() instance named 'SimpleYOLO' whose output feature
             map has 15 channels (19x14x15 for a 640x480 input)
    """
    # The file configures Keras for 'channels_last', so batch normalization has
    # to normalize the last (feature) axis. The previous axis=1 normalized over
    # image rows. NOTE: this changes the BatchNormalization weight shapes, so
    # checkpoints saved from the axis=1 graph will not load into this one.
    channel_axis = -1

    Input_data = Input(input_data)  # 640x480
    X = Conv2D(filters=32, kernel_size=(8, 8), strides=2, kernel_initializer=glorot_uniform(seed=0), padding='valid')(
        Input_data)  # 317x237
    X_shortcut_1 = X  # first shortcut starts right after the stem convolution
    X = BatchNormalization(axis=channel_axis)(X)
    X = LeakyReLU(alpha=0.3)(X)
    X = MaxPooling2D(pool_size=(3, 3), strides=2, padding='valid')(X)  # 158x118

    X = inception_block(X=X, filters=[64, 32, 32, 64], block='inception', stage=1)  # 158x118
    X_shortcut_2 = X  # second shortcut starts at the inception output
    X = BatchNormalization(axis=channel_axis)(X)
    X = LeakyReLU(alpha=0.3)(X)

    X = Conv2D(filters=128, kernel_size=(2, 2), strides=2, padding='valid', kernel_initializer=glorot_uniform(seed=0))(X)  # 79x59
    X = BatchNormalization(axis=channel_axis)(X)
    X = LeakyReLU(alpha=0.3)(X)

    # Project shortcut 1 from 317x237x32 down to 79x59x128 so it can be added
    # element-wise to the main path.
    X_shortcut_1 = Conv2D(filters=128, kernel_size=(5, 5), strides=4, padding='valid', kernel_initializer=glorot_uniform(seed=0))(X_shortcut_1)
    X_shortcut_1 = BatchNormalization(axis=channel_axis)(X_shortcut_1)
    X_shortcut_1 = LeakyReLU(alpha=0.3)(X_shortcut_1)

    X = Add()([X_shortcut_1, X])
    X = Activation('relu')(X)

    X = Conv2D(filters=256, kernel_size=(3, 3), strides=2, padding='valid', kernel_initializer=glorot_uniform(seed=0))(X)  # 39x29
    X = BatchNormalization(axis=channel_axis)(X)
    X = LeakyReLU(alpha=0.3)(X)

    # Project shortcut 2 from 158x118 down to 39x29x256. Unlike shortcut 1 it
    # is concatenated with the main path (256 + 256 channels), not added.
    X_shortcut_2 = Conv2D(filters=256, kernel_size=(6, 6), strides=4, padding='valid', kernel_initializer=glorot_uniform(seed=0))(X_shortcut_2)
    X_shortcut_2 = BatchNormalization(axis=channel_axis)(X_shortcut_2)
    X_shortcut_2 = LeakyReLU(alpha=0.3)(X_shortcut_2)

    X = concatenate([X_shortcut_2, X], axis=-1)

    X = AveragePooling2D(pool_size=(3, 3), strides=2, padding='valid')(X)  # 19x14
    # 1x1 convolution producing the 15-channel output map (presumably the
    # per-cell detection vector -- confirm against the label/loss encoding).
    X = Conv2D(filters=15, kernel_size=(1, 1), strides=1, padding='valid', kernel_initializer=glorot_uniform(seed=0))(X)  # 19x14
    model = Model(inputs=Input_data, outputs=X, name='SimpleYOLO')
    return model

# The final Architecture
To sum up, we eventually devised a network architecture like this.
<img src="figures/Architecture_YOLOV1.png"
     alt="Final network architecture diagram"
     style="float: left; margin-right: 5px;" />