# Util_V2 package
This package contains tools for computing the loss functions Loss_v2 and Loss_v3.  
The block below imports the packages we use and reads the hyperparameters from the settings script.

In [None]:
import import_ipynb
import keras.backend as K
from keras.activations import softmax
from keras.losses import categorical_crossentropy
import keras
import numpy as np
import tensorflow as tf
from settings import setting
###########################################################################
###########################################################################
###########################################################################
# Module-level hyperparameters; most are read from the project `setting` dict.
# weight_Classification_loss: weighting factor of the classification loss
# weight_Object_loss: weighting factor of the object-detection (confidence) loss
# weight_Localization_loss: weighting factor of the localization loss
# _batch_size: batch size; read by lr_minimum and generating_consequences
# _epoch: epoch counter, incremented by lr_minimum.on_epoch_end
# initial_lr: the initial learning rate, a fixed value of 0.01
# Hdecay: the decay coefficient used by the learning-rate schedules
# lr_minimum_rate: the largest factor by which the learning rate may be
#     reduced relative to initial_lr (floor = initial_lr / lr_minimum_rate)
# _epsilon: the Keras fuzz factor cast to float32; used as a clipping margin
#     so that clipped values never reach exactly 0 or 1
###########################################################################
weight_Classification_loss = setting["weight_Classification_loss"]
weight_Object_loss = setting["weight_Object_loss"]
weight_Localization_loss = setting["weight_Localization_loss"]
_batch_size = setting['batch_size']
_epoch = 0
initial_lr = 0.01
Hdecay = setting["decay"]
lr_minimum_rate = 60.0
_epsilon = K.epsilon()
_epsilon = K.cast(_epsilon, 'float32')

# learning rate update mechanism in Keras
new_lr = lr * 1.0 / (1.0 + decay * iterations)  
Here we try two different decay mechanisms to find out which one suits our models, and to see what kind of influence each decay has.  
At the same time, to keep the learning rate from vanishing after many iterations, we define a lower bound on the learning rate: the initial learning rate divided by `lr_minimum_rate` (the maximum factor by which the learning rate may be reduced compared with its initial value).

In [None]:
# Custom learning-rate decay applied once per epoch.
class DecayByEpoch(keras.callbacks.Callback):
    """Decay the optimizer learning rate at the end of every epoch.

    The candidate rate is ``initial_lr / (1 + Hdecay * epoch)``. It is written
    to the optimizer only while ``initial_lr / new_lr`` does not exceed
    ``lr_minimum_rate``; once that factor is exceeded the optimizer simply
    keeps the last value it was given. The learning rate currently held by
    the optimizer is printed after every epoch.
    """

    def on_epoch_end(self, epoch, log=None):
        """Update the learning rate (while still allowed) and print it.

        Args:
            epoch: Epoch index supplied by Keras.
            log: Metrics dictionary supplied by Keras; unused here. Defaults
                to ``None`` rather than a shared mutable list.
        """
        new_lr = initial_lr * 1.0 / (1.0 + Hdecay * epoch)
        # Stop decaying once the reduction factor exceeds lr_minimum_rate.
        if initial_lr / new_lr <= lr_minimum_rate:
            K.set_value(self.model.optimizer.lr, new_lr)
        print(K.eval(self.model.optimizer.lr))

# Custom learning-rate decay applied after every batch, with a lower bound.
class lr_minimum(keras.callbacks.Callback):
    """Decay the learning rate after every batch, never going below a floor.

    The rate follows ``initial_lr / (1 + Hdecay * iterations)`` and is clamped
    at ``initial_lr / lr_minimum_rate``. The current rate is printed at the
    end of every epoch, and the module-level ``_epoch`` counter is advanced
    there as well.

    Args:
        samples_per_epoch: Value divided by ``_batch_size`` to obtain the
            number of iterations contributed by each finished epoch. The
            default of 1000.0 reproduces the previously hard-coded constant
            (presumably the training-set size -- confirm for your data).
    """

    def __init__(self, samples_per_epoch=1000.0):
        super(lr_minimum, self).__init__()
        self.samples_per_epoch = samples_per_epoch

    def on_batch_end(self, batch, log=None):
        """Recompute the learning rate from the global iteration count.

        Args:
            batch: Batch index within the current epoch, supplied by Keras.
            log: Metrics dictionary supplied by Keras; unused here.
        """
        # Iterations so far = batches in this epoch + batches of past epochs.
        iterations = batch + _epoch * self.samples_per_epoch / _batch_size
        print('iterations:', iterations)

        new_lr = initial_lr * 1.0 / (1.0 + Hdecay * iterations)
        print('The New_lr:', new_lr)
        # Clamp at the floor once the reduction factor exceeds the limit.
        if initial_lr / new_lr > lr_minimum_rate:
            new_lr = initial_lr / lr_minimum_rate
        K.set_value(self.model.optimizer.lr, new_lr)

    def on_epoch_end(self, epoch, log=None):
        """Advance the module-level epoch counter and print the current rate.

        Args:
            epoch: Epoch index supplied by Keras; unused, the module-level
                ``_epoch`` counter is used instead.
            log: Metrics dictionary supplied by Keras; unused here.
        """
        global _epoch
        _epoch += 1
        print('Each epoch, the lr is', K.eval(self.model.optimizer.lr))

##  Comparison between two ways of decay
Updating the learning rate after each epoch keeps it from decaying too fast, although it makes the step size discontinuous. For very large training data sets, we do not recommend decaying per batch, because the learning rate can decay towards 0 after just a few epochs. Comparing decay after every batch (upper panel) with decay after every epoch (lower panel), we choose the latter because it fits the data better, though it might overfit.

<img src="figures/loss_byBatch_VS_byEpoch.png"
     alt="Markdown Monster icon"
     style="float: left; margin-right: 5px;" />

##  Test different values of decay
According to the formula above, a larger decay value makes the learning rate decrease faster. Make sure your learning rate does not decay too fast, so that the optimizer still has the potential to reach the global minimum. Here we compare three different decay rates (0.1, 0.01, 0.001): decay = 0.1 makes the learning rate decrease too fast, while decay = 0.01 or 0.001 enables the network to fit the training data better.

<img src="figures/loss_v2_decay_0.1-0.01-0.001.png"
     alt="Markdown Monster icon"
     style="float: left; margin-right: 5px;" />

# Tool functions on loss computing
The `transform_to_coordinate` function transforms the predicted location information (middle_point_x, middle_point_y, width_of_box, height_of_box) into the corner coordinates (x1, y1, x2, y2), i.e. (left, top, right, bottom).

The `Checking_if_object` function exists because we only know the location of the object, so we have to compute which grid cell the middle point of the object lies in.  

The `return_coordinates` function transforms coordinates relative to a grid cell of an image into coordinates in the whole image. The transformation rule is:  
a coordinate in the whole image = the coordinate within the grid cell * the size of the grid cell + the origin of this grid cell.   
Both the width and the height of a grid cell are 64, which is the area of the input image that influences one point of the last feature map.


One improvement over the plain sliding-window technique is that we do not feed each sliding window into the neural network separately, because that approach performs a lot of overlapping computation. Instead, we implement the sliding-window method convolutionally [1], which reduces the computation in the areas where two sliding windows overlap.  
[1] Sermanet P, Eigen D, Zhang X, et al. Overfeat: Integrated recognition, localization and detection using convolutional networks[J]. arXiv preprint arXiv:1312.6229, 2013.  
<img src="figures/Convoluational implementation.png"
     alt="Markdown Monster icon"
     style="float: left; margin-right: 5px;" width="800"/>  



In [None]:
def transform_to_coordinate(x, y, w, h):
    """Convert centre/size boxes into corner coordinates.

    Args:
        x, y: Centre coordinates of the boxes.
        w, h: Width and height of the boxes.

    Returns:
        The list ``[x1, y1, x2, y2]`` with ``x1 = x - w/2``, ``y1 = y - h/2``,
        ``x2 = x + w/2`` and ``y2 = y + h/2``; the half extents are cast to
        float32 before being applied.
    """
    half_width = K.cast(w / 2, 'float32')
    half_height = K.cast(h / 2, 'float32')
    return [x - half_width, y - half_height, x + half_width, y + half_height]

def Checking_if_object(x1_window, y1_window, x2_window, y2_window, x_max_true, x_min_true, y_max_true, y_min_true):
    """Tell which windows contain the centre of the ground-truth box.

    Args:
        x1_window, y1_window: Lower x / y bounds of the windows.
        x2_window, y2_window: Upper x / y bounds of the windows.
        x_max_true, x_min_true: Ground-truth box extent along x.
        y_max_true, y_min_true: Ground-truth box extent along y.

    Returns:
        A boolean tensor that is True where the box centre lies inside the
        window, both bounds included.
    """
    def _between(lower, value, upper):
        # lower <= value <= upper, expressed with two greater-equal tests.
        return tf.logical_and(K.greater_equal(value, lower), K.greater_equal(upper, value))

    centre_x = (x_max_true + x_min_true) / 2.0
    centre_y = (y_max_true + y_min_true) / 2.0
    inside_x = _between(x1_window, centre_x, x2_window)
    inside_y = _between(y1_window, centre_y, y2_window)
    return tf.logical_and(inside_x, inside_y)

# The predicted middle point (x, y) is expected within the range 0 to 1,
# relative to its grid cell. The predicted w and h are scaled by 64 * 10,
# i.e. by ten grid sizes: the grid is 64 by 64 and the image is 640 by 480.
def return_coordinates(y_pred):
    """Map grid-relative predictions to whole-image pixel coordinates.

    Channels 1 and 2 of ``y_pred`` (centre x / y) are clipped to
    ``[_epsilon, 1 - _epsilon]``, scaled by the grid size 64 and shifted by
    the origin of their grid cell (19 origins along axis 1 and 14 along
    axis 2, spaced 32 apart). Channels 3 and 4 (width / height) are scaled
    by ``64 * 10`` and clipped to ``[50, 640]`` and ``[50, 480]``.

    Args:
        y_pred: Prediction tensor indexed as ``[:, :, :, channel]``.

    Returns:
        The list ``[xpred, ypred, wpred, hpred]``.
    """
    def _to_image_axis(cell_fraction, cell_origins):
        # Keep the fraction strictly inside (0, 1), then convert to pixels.
        clipped = tf.clip_by_value(t=cell_fraction, clip_value_min=0 + _epsilon, clip_value_max=1 - _epsilon)
        return K.cast(clipped, 'float32') * 64 + cell_origins

    centre_x = _to_image_axis(y_pred[:, :, :, 1], np.arange(0, 608, 32).reshape(19, 1))
    centre_y = _to_image_axis(y_pred[:, :, :, 2], np.arange(0, 417, 32).reshape(1, 14))

    box_width = K.clip(x=y_pred[:, :, :, 3] * 64 * 10, max_value=640, min_value=50)
    box_height = K.clip(x=y_pred[:, :, :, 4] * 64 * 10, max_value=480, min_value=50)
    return [centre_x, centre_y, box_width, box_height]

### Transforming the coordinates from a sliding window to the image
<img style="display: block; margin: 0 auto;"
     src="figures/transforming the coordinates from slide window to image.PNG"
     alt="Markdown Monster icon"
     style="float: left; margin-right: 5px;" width="500"/>

## Loss function
The two loss functions, Loss_v2 and Loss_v3, are custom loss functions that compute the total loss over three components: location, object, and class. 
The difference between them is that Loss_v2 applies an accelerator (a focusing weight) to the classification loss and the object loss, whereas Loss_v3 does not. The insight comes from RetinaNet [2].  

By analyzing the results, we found that the classifier easily recognizes the background but has difficulty recognizing the objects we are interested in. The most common remedy is to build a network called a region proposal network that searches for the areas we are probably interested in. However, this adds a lot of computation, so the paper [2] proposes another idea: by putting different importance factors on correctly classified and misclassified results, the attention of the network is forced onto the hard training examples rather than the examples it already classifies correctly. This speeds up convergence.  
Besides, the importance factor on the object loss is higher than the other two, because in the end the score of each prediction depends most on the object-detection confidence.

[2] Lin T Y, Goyal P, Girshick R, et al. Focal loss for dense object detection[J]. IEEE transactions on pattern analysis and machine intelligence, 2018.

In [None]:
def Loss_v2(y_true, y_pred):
    """Detection loss with focusing weights on classification and objectness.

    The loss is the sum of three weighted terms, averaged over the class
    axis (classification only) and over both grid axes:

    * classification: ``(1 - p) * true_class * crossentropy``
    * localization: squared corner errors, only in windows that contain the
      centre of the ground-truth box
    * objectness: ``-(1 - Pc) * log(Pc)`` in those windows and
      ``-Pc * log(1 - Pc)`` everywhere else

    Args:
        y_true: Ground truth indexed as ``[:, :, :, channel]``; channels 0-3
            are read as x_max, x_min, y_max, y_min and channels 7 onwards as
            the class vector.
        y_pred: Predictions; channel 0 is the object confidence, channels
            1-4 the box and channels 5 onwards the class probabilities.

    Returns:
        The loss tensor left after averaging out the class and grid axes.
    """
    # NOTE(review): the classification term is gated by the true class vector
    # rather than by the window mask -- confirm that this is intended.

    # Ground-truth box extents and class vector.
    x_max_true = y_true[:, :, :, 0]
    x_min_true = y_true[:, :, :, 1]
    y_max_true = y_true[:, :, :, 2]
    y_min_true = y_true[:, :, :, 3]
    class_true = K.cast(y_true[:, :, :, 7:], dtype='float32')

    # Predicted confidence, kept away from 0 and 1 so the logs stay finite.
    confidence = K.cast(y_pred[:, :, :, 0], 'float32')
    confidence = tf.clip_by_value(t=confidence, clip_value_min=_epsilon, clip_value_max=1 - _epsilon)

    # Predicted boxes in whole-image corner coordinates.
    centre_x, centre_y, box_w, box_h = return_coordinates(y_pred)
    x1_pred, y1_pred, x2_pred, y2_pred = transform_to_coordinate(centre_x, centre_y, box_w, box_h)

    # Predicted class probabilities, clipped the same way.
    class_prob = tf.clip_by_value(t=y_pred[:, :, :, 5:], clip_value_min=_epsilon, clip_value_max=1 - _epsilon)

    # Origins of the 19 x 14 sliding windows (size 64, stride 32).
    window_x = np.empty((19, 14), dtype='float32')
    window_x[...] = np.arange(0, 608, 32).reshape(19, 1)
    window_y = np.empty((19, 14), dtype='float32')
    window_y[...] = np.arange(0, 417, 32).reshape(1, 14)

    # 1.0 where the window contains the centre of the true box, else 0.0.
    contains_centre = Checking_if_object(window_x, window_y, window_x + 64, window_y + 64,
                                         x_max_true, x_min_true, y_max_true, y_min_true)
    mat = K.cast(contains_centre, 'float32')

    # Classification term with the (1 - p) focusing weight.
    cross_entropy = categorical_crossentropy(y_true=class_true, y_pred=class_prob)
    cross_entropy = K.reshape(x=cross_entropy, shape=(-1, 19, 14, 1))
    class_term = (1 - class_prob) * class_true * cross_entropy * weight_Classification_loss

    # Localization term: squared error of the four box edges.
    corner_error = (K.square(x1_pred - x_min_true) + K.square(x2_pred - x_max_true)
                    + K.square(y1_pred - y_min_true) + K.square(y2_pred - y_max_true))
    location_term = weight_Localization_loss * mat * corner_error

    # Objectness term with focusing weights on both branches.
    with_object = -(1 - confidence) * K.log(confidence) * mat
    without_object = (1 - mat) * K.log(1 - confidence) * confidence
    object_term = (with_object - without_object) * weight_Object_loss

    # Average over classes (classification only) and then the grid axes.
    reduced = (K.mean(K.mean(class_term, axis=-1), axis=-1)
               + K.mean(location_term, axis=-1)
               + K.mean(object_term, axis=-1))
    return K.mean(reduced, axis=-1)


def Loss_v3(y_true, y_pred):
    """Detection loss without focusing weights.

    The loss is the sum of three weighted terms, averaged over both grid
    axes:

    * classification: plain categorical cross-entropy
    * localization: squared corner errors, only in windows that contain the
      centre of the ground-truth box
    * objectness: ``(1 - Pc)^2`` in those windows and ``Pc^2`` elsewhere

    Args:
        y_true: Ground truth indexed as ``[:, :, :, channel]``; channels 0-3
            are read as x_max, x_min, y_max, y_min and channels 7 onwards as
            the class vector.
        y_pred: Predictions; channel 0 is the object confidence, channels
            1-4 the box and channels 5 onwards the class probabilities.

    Returns:
        The loss tensor left after averaging out both grid axes.
    """
    # NOTE(review): the classification term is not multiplied by the window
    # mask -- confirm that this is intended.

    # Ground-truth box extents and class vector.
    x_max_true = y_true[:, :, :, 0]
    x_min_true = y_true[:, :, :, 1]
    y_max_true = y_true[:, :, :, 2]
    y_min_true = y_true[:, :, :, 3]
    class_true = K.cast(y_true[:, :, :, 7:], dtype='float32')

    # Predicted confidence; the squared error needs no clipping.
    confidence = K.cast(y_pred[:, :, :, 0], 'float32')

    # Predicted boxes in whole-image corner coordinates.
    centre_x, centre_y, box_w, box_h = return_coordinates(y_pred)
    x1_pred, y1_pred, x2_pred, y2_pred = transform_to_coordinate(centre_x, centre_y, box_w, box_h)

    # Predicted class probabilities, kept away from 0 and 1.
    class_prob = tf.clip_by_value(t=y_pred[:, :, :, 5:], clip_value_min=_epsilon, clip_value_max=1 - _epsilon)

    # Origins of the 19 x 14 sliding windows (size 64, stride 32).
    window_x = np.empty((19, 14), dtype='float32')
    window_x[...] = np.arange(0, 608, 32).reshape(19, 1)
    window_y = np.empty((19, 14), dtype='float32')
    window_y[...] = np.arange(0, 417, 32).reshape(1, 14)

    # 1.0 where the window contains the centre of the true box, else 0.0.
    contains_centre = Checking_if_object(window_x, window_y, window_x + 64, window_y + 64,
                                         x_max_true, x_min_true, y_max_true, y_min_true)
    mat = K.cast(contains_centre, 'float32')

    # Classification term: weighted cross-entropy.
    class_term = categorical_crossentropy(y_true=class_true, y_pred=class_prob) * weight_Classification_loss

    # Localization term: squared error of the four box edges.
    corner_error = (K.square(x1_pred - x_min_true) + K.square(x2_pred - x_max_true)
                    + K.square(y1_pred - y_min_true) + K.square(y2_pred - y_max_true))
    location_term = weight_Localization_loss * mat * corner_error

    # Objectness term: squared error against the window mask.
    object_term = weight_Object_loss * (mat * K.square(1 - confidence) + (1 - mat) * K.square(confidence))

    # Average over the two grid axes.
    reduced = K.mean(class_term, axis=-1) + K.mean(location_term, axis=-1) + K.mean(object_term, axis=-1)
    return K.mean(reduced, axis=-1)

##  Compare the performance of minimizing loss_v2 and loss_v3
From the figures below, we can easily see that loss v2 converges faster than loss v3.   
This supports our insight: by assigning a small weight to correctly classified results and a large weight to misclassified results, the network is forced to focus on hard-to-classify samples, which speeds up convergence.   
<img src="figures/loss_v2_VS_loss_v3.png"
     alt="Markdown Monster icon"
     style="float: left; margin-right: 5px;" />

## The Utility tool for picking up the best prediction on each image
This function picks the five relatively best predicted boxes among all bounding boxes in each image.  
First, each predicted box gets a score equal to its highest score over all classes.   
Then we directly pick the five predicted boxes with the highest scores as the output.

In [None]:
def generating_consequences(results):
    """Pick the five highest-scoring predicted boxes of every image.

    Each grid position gets the score ``confidence * class probability`` of
    its best class; the five positions with the highest scores are returned
    per image. The module-level ``_batch_size`` is refreshed from
    ``setting['batch_size']`` and used as the leading dimension when
    flattening the grid.

    Args:
        results: Network output indexed as ``[:, :, :, channel]``; channel 0
            is the object confidence, channels 1-4 the box and channels 5
            onwards the class probabilities.

    Returns:
        A tuple ``(boxes, classes, scores)`` for the five selected
        predictions per image. Boxes are the ``(x, y, w, h)`` values
        produced by ``return_coordinates`` (centre and size, not corners).
    """
    global _batch_size
    _batch_size = setting['batch_size']

    # Object confidence with a trailing axis so it broadcasts over classes.
    confidence = K.reshape(x=results[:, :, :, 0], shape=(-1, 19, 14, 1))

    # Whole-image (x, y, w, h) per grid position, flattened per image.
    box_parts = [K.reshape(x=part, shape=(-1, 19, 14, 1))
                 for part in return_coordinates(y_pred=results)]
    all_boxes = K.concatenate(box_parts, axis=-1)

    # Best class and its score per grid position.
    class_scores = confidence * results[:, :, :, 5:]
    best_class = K.argmax(class_scores, axis=-1)
    best_score = K.max(class_scores, axis=-1)
    best_class = K.reshape(x=best_class, shape=(_batch_size, -1))
    best_score = K.reshape(x=best_score, shape=(_batch_size, -1))
    all_boxes = K.reshape(x=all_boxes, shape=(_batch_size, -1, 4))

    # Flat grid indices of the five best scores in every image.
    top_positions = tf.nn.top_k(input=best_score, k=5).indices

    # Pair every selected position with the index of its image so that
    # gather_nd can address (image, position).
    image_index = K.zeros(top_positions.get_shape(), dtype='int32')
    per_image = K.reshape(x=K.arange(0, _batch_size, 1, dtype='int32'), shape=(_batch_size, -1))
    image_index = image_index + per_image
    lookup = tf.stack([image_index, top_positions], axis=2)

    scores = tf.gather_nd(params=best_score, indices=lookup)
    boxes = tf.gather_nd(params=all_boxes, indices=lookup)
    classes = tf.gather_nd(params=best_class, indices=lookup)
    return boxes, classes, scores