In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow.keras.backend as K
import numpy as np
# image augmentation
import imgaug as ia
from imgaug import augmenters as iaa

In [2]:
# Parameters

IMAGE_H, IMAGE_W = 512, 512
GRID_H,  GRID_W = 16, 16 # GRID size = IMAGE size / 32
NUM_CLASSES = 20

NUM_BOXES = 5 # number of anchors
ANCHORS = [0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828] # Anchor size

BATCH_SIZE = 10

EPOCHS = 100

LAMBDA_NOOBJECT = 0.5
LAMBDA_OBJECT = 1
LAMBDA_CLASS = 1
LAMBDA_COORD = 5

max_annot = 0

# Model

![image.png](images/darknet19.png)

In [3]:
class ConvBNLeakyRelu(tf.keras.layers.Layer):
    def __init__(self, num_filters, kernel_size):
        super().__init__()
        self.conv = tf.keras.layers.Conv2D(filters=num_filters, kernel_size=kernel_size, padding="same")
        self.bn = tf.keras.layers.BatchNormalization()
        # LeakyRelu: see YOLOv1 paper
        self.leakyrelu = tf.keras.layers.LeakyReLU(alpha=0.1)
        
    def call(self, inputs):
        x = self.conv(inputs)
        x = self.bn(x)
        output = self.leakyrelu(x)
        
        return output

In [4]:
class DarkNet19Classification(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.convolutional1 = ConvBNLeakyRelu(32, (3, 3))
        self.maxpool1 = tf.keras.layers.MaxPooling2D(pool_size=[2,2])
        self.convolutional2 = ConvBNLeakyRelu(64, (3, 3))
        self.maxpool2 = tf.keras.layers.MaxPooling2D(pool_size=[2,2])
        self.convolutional3 = ConvBNLeakyRelu(128, (3, 3))
        self.convolutional4 = ConvBNLeakyRelu(64, (1, 1))
        self.convolutional5 = ConvBNLeakyRelu(128, (3, 3))
        self.maxpool3 = tf.keras.layers.MaxPooling2D(pool_size=[2,2])
        self.convolutional6 = ConvBNLeakyRelu(256, (3, 3))
        self.convolutional7 = ConvBNLeakyRelu(128, (1, 1))
        self.convolutional8 = ConvBNLeakyRelu(256, (3, 3))
        self.maxpool4 = tf.keras.layers.MaxPooling2D(pool_size=[2,2])
        self.convolutional9 = ConvBNLeakyRelu(512, (3, 3))
        self.convolutional10 = ConvBNLeakyRelu(256, (1, 1))
        self.convolutional11 = ConvBNLeakyRelu(512, (3, 3))
        self.convolutional12 = ConvBNLeakyRelu(256, (1, 1))
        self.convolutional13 = ConvBNLeakyRelu(512, (3, 3))
        self.maxpool5 = tf.keras.layers.MaxPooling2D(pool_size=[2,2])
        self.convolutional14 = ConvBNLeakyRelu(1024, (3, 3))
        self.convolutional15 = ConvBNLeakyRelu(512, (1, 1))
        self.convolutional16 = ConvBNLeakyRelu(1024, (3, 3))
        self.convolutional17 = ConvBNLeakyRelu(512, (1, 1))
        self.convolutional18 = ConvBNLeakyRelu(1024, (3, 3))
        # the following is replaced in detection network
        # 1000: NUM_CLASSES
        self.convolutional19 = ConvBNLeakyRelu(1000, (1, 1))
        self.averagepool = tf.keras.layers.GlobalAveragePooling2D()
        
    def call(self, inputs):
        x = self.convolutional1(inputs)
        x = self.maxpool1(x)
        x = self.convolutional2(x)
        x = self.maxpool2(x)
        x = self.convolutional3(x)
        x = self.convolutional4(x)
        x = self.convolutional5(x)
        x = self.maxpool3(x)
        x = self.convolutional6(x)
        x = self.convolutional7(x)
        x = self.convolutional8(x)
        x = self.maxpool4(x)
        x = self.convolutional9(x)
        x = self.convolutional10(x)
        x = self.convolutional11(x)
        x = self.convolutional12(x)
        x = self.convolutional13(x)
        x = self.maxpool5(x)
        x = self.convolutional14(x)
        x = self.convolutional15(x)
        x = self.convolutional16(x)
        x = self.convolutional17(x)
        x = self.convolutional18(x)
        x = self.convolutional19(x)
        x = self.averagepool(x)
        output = tf.nn.softmax(x)
        
        return output

In [5]:
class DarkNet19Detection(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.convolutional1 = ConvBNLeakyRelu(32, (3, 3))
        self.maxpool1 = tf.keras.layers.MaxPooling2D(pool_size=[2,2])
        self.convolutional2 = ConvBNLeakyRelu(64, (3, 3))
        self.maxpool2 = tf.keras.layers.MaxPooling2D(pool_size=[2,2])
        self.convolutional3 = ConvBNLeakyRelu(128, (3, 3))
        self.convolutional4 = ConvBNLeakyRelu(64, (1, 1))
        self.convolutional5 = ConvBNLeakyRelu(128, (3, 3))
        self.maxpool3 = tf.keras.layers.MaxPooling2D(pool_size=[2,2])
        self.convolutional6 = ConvBNLeakyRelu(256, (3, 3))
        self.convolutional7 = ConvBNLeakyRelu(128, (1, 1))
        self.convolutional8 = ConvBNLeakyRelu(256, (3, 3))
        self.maxpool4 = tf.keras.layers.MaxPooling2D(pool_size=[2,2])
        self.convolutional9 = ConvBNLeakyRelu(512, (3, 3))
        self.convolutional10 = ConvBNLeakyRelu(256, (1, 1))
        self.convolutional11 = ConvBNLeakyRelu(512, (3, 3))
        self.convolutional12 = ConvBNLeakyRelu(256, (1, 1))
        self.convolutional13 = ConvBNLeakyRelu(512, (3, 3))
        self.maxpool5 = tf.keras.layers.MaxPooling2D(pool_size=[2,2])
        self.convolutional14 = ConvBNLeakyRelu(1024, (3, 3))
        self.convolutional15 = ConvBNLeakyRelu(512, (1, 1))
        self.convolutional16 = ConvBNLeakyRelu(1024, (3, 3))
        self.convolutional17 = ConvBNLeakyRelu(512, (1, 1))
        self.convolutional18 = ConvBNLeakyRelu(1024, (3, 3))
        
        # the following is replaced
        # 1000: NUM_CLASSES
        self.convolutional19 = ConvBNLeakyRelu(1024, (3, 3))
        self.final_cov = tf.keras.layers.Conv2D(
            filters=NUM_BOXES*(1+4+NUM_CLASSES),
            kernel_size=(1,1),
            padding="same")
        self.reshape = tf.keras.layers.Reshape([GRID_W, GRID_H, NUM_BOXES, 4 + 1 + NUM_CLASSES])

        self.convolutional_pass_through = ConvBNLeakyRelu(64, (1, 1))
        self.concat = tf.keras.layers.Concatenate(axis=-1)
        
    def call(self, inputs):
        x = self.convolutional1(inputs)
        x = self.maxpool1(x)
        x = self.convolutional2(x)
        x = self.maxpool2(x)
        x = self.convolutional3(x)
        x = self.convolutional4(x)
        x = self.convolutional5(x)
        x = self.maxpool3(x)
        x = self.convolutional6(x)
        x = self.convolutional7(x)
        x = self.convolutional8(x)
        x = self.maxpool4(x)
        x = self.convolutional9(x)
        x = self.convolutional10(x)
        x = self.convolutional11(x)
        x = self.convolutional12(x)
        x = self.convolutional13(x)
        
        pass_through = x
        
        x = self.maxpool5(x)
        x = self.convolutional14(x)
        x = self.convolutional15(x)
        x = self.convolutional16(x)
        x = self.convolutional17(x)
        x = self.convolutional18(x)
        
        # 224 case: 14*14*512 -> 14*14*64 -> 7*7*256
        pass_through = self.convolutional_pass_through(pass_through)
        pass_through = tf.nn.space_to_depth(pass_through, block_size=2)
        x = self.concat([pass_through, x])
        
        x = self.convolutional19(x)
        x = self.final_cov(x)
        output = self.reshape(x)
        
        return output

# Data

In [6]:
dataset = tfds.load("voc", batch_size=BATCH_SIZE)

[1mDownloading and preparing dataset voc/2007/4.0.0 (download: 868.85 MiB, generated: Unknown size, total: 868.85 MiB) to /root/tensorflow_datasets/voc/2007/4.0.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Extraction completed...', max=1.0, styl…











HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/voc/2007/4.0.0.incompleteVLWYTZ/voc-test.tfrecord


HBox(children=(FloatProgress(value=0.0, max=4952.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/voc/2007/4.0.0.incompleteVLWYTZ/voc-train.tfrecord


HBox(children=(FloatProgress(value=0.0, max=2501.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/voc/2007/4.0.0.incompleteVLWYTZ/voc-validation.tfrecord


HBox(children=(FloatProgress(value=0.0, max=2510.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Computing statistics...', max=3.0, style=ProgressStyle(de…

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


[1mDataset voc downloaded and prepared to /root/tensorflow_datasets/voc/2007/4.0.0. Subsequent calls will reuse this data.[0m


In [7]:
train_dataset = dataset["train"]
val_dataset = dataset["validation"]

data loading, preprocessing, and augmentation
Reference https://github.com/jmpap/YOLOV2-Tensorflow-2.0
Modified for tensorflow database

In [8]:
def val_generator(input_dataset):
    '''
    put bbox and label together
    '''
    while True:
        batch = next(iter(input_dataset))
    # for batch in input_dataset:
        # conversion tensor->numpy
        img = tf.image.resize(batch["image"],(512,512)).numpy()/255.
        bbox = batch["objects"]["bbox"].numpy()
        label = batch["objects"]["label"].numpy()
        bbox_and_label = []
        for i in range(img.shape[0]):
            bbox_and_label.append([])
            for j in range(len(label[i])):
                bbox_and_label_item = bbox[i,j].tolist()+[label[i,j]]
                bbox_and_label[i].append(bbox_and_label_item)
                
        # conversion numpy->tensor
        batch = (tf.convert_to_tensor(img), tf.convert_to_tensor(bbox_and_label))
        #batch = (img_aug, boxes)
        yield batch
        
def augmentation_generator(input_dataset):
    '''
    Augmented batch generator from a yolo dataset

    Parameters
    ----------
    - YOLO dataset
    
    Returns
    -------
    - augmented batch : tensor (shape : batch_size, IMAGE_W, IMAGE_H, 3)
        batch : tupple(images, annotations)
        batch[0] : images : tensor (shape : batch_size, IMAGE_W, IMAGE_H, 3)
        batch[1] : annotations : tensor (shape : batch_size, max annot, 5)
    '''
    while True:
        batch = next(iter(input_dataset))
#     for batch in input_dataset:
        # conversion tensor->numpy
        img = tf.image.resize(batch["image"],(512,512)).numpy()/255.
        bbox = batch["objects"]["bbox"].numpy()
        label = batch["objects"]["label"].numpy()
        bbox_and_label = []
        for i in range(img.shape[0]):
            bbox_and_label.append([])
            for j in range(len(label[i])):
                bbox_and_label_item = bbox[i,j].tolist()+[label[i,j]]
                bbox_and_label[i].append(bbox_and_label_item)
        
        # conversion bbox numpy->ia object
        ia_boxes = []
        for i in range(img.shape[0]):
            ia_bbs = [ia.BoundingBox(x1=bb[0],
                                       y1=bb[1],
                                       x2=bb[2],
                                       y2=bb[3]) for bb in bbox[i]
                      if (bb[0] + bb[1] +bb[2] + bb[3] > 0)]
            ia_boxes.append(ia.BoundingBoxesOnImage(ia_bbs, shape=(IMAGE_W, IMAGE_H)))
        # data augmentation
        seq = iaa.Sequential([
            iaa.Fliplr(0.5),
            iaa.Flipud(0.5),
            iaa.Multiply((0.4, 1.6)), # change brightness
            #iaa.ContrastNormalization((0.5, 1.5)),
            #iaa.Affine(translate_px={"x": (-100,100), "y": (-100,100)}, scale=(0.7, 1.30))
            ])
        #seq = iaa.Sequential([])
        seq_det = seq.to_deterministic()
        img_aug = seq_det.augment_images(img)
        img_aug = np.clip(img_aug, 0, 1)
        boxes_aug = seq_det.augment_bounding_boxes(ia_boxes)
        # conversion ia object -> bbox numpy
        for i in range(img.shape[0]):
#             boxes_aug[i] = boxes_aug[i].remove_out_of_image().clip_out_of_image()
            boxes_aug[i] = boxes_aug[i].remove_out_of_image()
            for j, bb in enumerate(boxes_aug[i].bounding_boxes):
                bbox_and_label[i][j][0] = bb.x1
                bbox_and_label[i][j][1] = bb.y1
                bbox_and_label[i][j][2] = bb.x2
                bbox_and_label[i][j][3] = bb.y2
                
        # conversion numpy->tensor
        batch = (tf.convert_to_tensor(img_aug), tf.convert_to_tensor(bbox_and_label))
        #batch = (img_aug, boxes)
        yield batch
        
def process_true_boxes(true_boxes, anchors, image_width, image_height):
    '''
    Build image ground truth in YOLO format from image true_boxes and anchors.
    
    Parameters
    ----------
    - true_boxes : tensor, shape (max_annot, 5), format : x1 y1 x2 y2 c, coords unit : image pixel
    - anchors : list [anchor_1_width, anchor_1_height, anchor_2_width, anchor_2_height...]
        anchors coords unit : grid cell
    - image_width, image_height : int (pixels)
    
    Returns
    -------
    - detector_mask : array, shape (GRID_W, GRID_H, anchors_count, 1)
        1 if bounding box detected by grid cell, else 0
    - matching_true_boxes : array, shape (GRID_W, GRID_H, anchors_count, 5)
        Contains adjusted coords of bounding box in YOLO format
    -true_boxes_grid : array, same shape than true_boxes (max_annot, 5),
        format : x, y, w, h, c, coords unit : grid cell
        
    Note:
    -----
    Bounding box in YOLO Format : x, y, w, h, c
    x, y : center of bounding box, unit : grid cell
    w, h : width and height of bounding box, unit : grid cell
    c : label index
    ''' 
    
    scale = IMAGE_W / GRID_W # scale = 32
    
    anchors_count = len(anchors) // 2
    anchors = np.array(anchors)
    anchors = anchors.reshape(len(anchors) // 2, 2)
    
    detector_mask = np.zeros((GRID_W, GRID_H, anchors_count, 1))
    matching_true_boxes = np.zeros((GRID_W, GRID_H, anchors_count, 5))
    
    # convert true_boxes numpy array -> tensor
    true_boxes = true_boxes.numpy()
    
    true_boxes_grid = np.zeros(true_boxes.shape)
    
    # convert bounding box coords and localize bounding box
    for i, box in enumerate(true_boxes):
        # convert box coords to x, y, w, h and convert to grids coord
        w = (box[2] - box[0]) / scale
        h = (box[3] - box[1]) / scale    
        x = ((box[0] + box[2]) / 2) / scale
        y = ((box[1] + box[3]) / 2) / scale
        true_boxes_grid[i,...] = np.array([x, y, w, h, box[4]])
        if w * h > 0: # box exists
            # calculate iou between box and each anchors and find best anchors
            best_iou = 0
            best_anchor = 0
            for i in range(anchors_count): 
                # iou (anchor and box are shifted to 0,0)
                intersect = np.minimum(w, anchors[i,0]) * np.minimum(h, anchors[i,1])
                union = (anchors[i,0] * anchors[i,1]) + (w * h) - intersect
                iou = intersect / union
                if iou > best_iou:
                    best_iou = iou
                    best_anchor = i
            # localize box in detector_mask and matching true_boxes
            if best_iou > 0:
                x_coord = np.floor(x).astype('int')
                y_coord = np.floor(y).astype('int')
                detector_mask[y_coord, x_coord, best_anchor] = 1
                yolo_box = np.array([x, y, w, h, box[4]])
                matching_true_boxes[y_coord, x_coord, best_anchor] = yolo_box
    return matching_true_boxes, detector_mask, true_boxes_grid

def ground_truth_generator(input_dataset):
    '''
    Ground truth batch generator from a yolo dataset, ready to compare with YOLO prediction in loss function.

    Parameters
    ----------
    - YOLO dataset. Generate batch:
        batch : tupple(images, annotations)
        batch[0] : images : tensor (shape : batch_size, IMAGE_W, IMAGE_H, 3)
        batch[1] : annotations : tensor (shape : batch_size, max annot, 5)
        
    Returns
    -------
    - imgs : images to predict. tensor (shape : batch_size, IMAGE_H, IMAGE_W, 3)
    - detector_mask : tensor, shape (batch_size, GRID_W, GRID_H, anchors_count, 1)
        1 if bounding box detected by grid cell, else 0
    - matching_true_boxes : tensor, shape (batch_size, GRID_W, GRID_H, anchors_count, 5)
        Contains adjusted coords of bounding box in YOLO format
    - class_one_hot : tensor, shape (batch_size, GRID_W, GRID_H, anchors_count, class_count)
        One hot representation of bounding box label
    - true_boxes_grid : annotations : tensor (shape : batch_size, max annot, 5)
        true_boxes format : x, y, w, h, c, coords unit : grid cell
    '''
    while True:
        batch = next(iter(input_dataset))
#     for batch in dataset:
        # imgs
        imgs = batch[0]
        
        # true boxes
        true_boxes = batch[1]
        
        # matching_true_boxes and detector_mask
        batch_matching_true_boxes = []
        batch_detector_mask = []
        batch_true_boxes_grid = []
        
        for i in range(true_boxes.shape[0]):     
            one_matching_true_boxes, one_detector_mask, true_boxes_grid = process_true_boxes(true_boxes[i],
                                                                                           ANCHORS,
                                                                                           IMAGE_W,
                                                                                           IMAGE_H)
            batch_matching_true_boxes.append(one_matching_true_boxes)
            batch_detector_mask.append(one_detector_mask)
            batch_true_boxes_grid.append(true_boxes_grid)
                
        detector_mask = tf.convert_to_tensor(np.array(batch_detector_mask), dtype='float32')
        matching_true_boxes = tf.convert_to_tensor(np.array(batch_matching_true_boxes), dtype='float32')
        true_boxes_grid = tf.convert_to_tensor(np.array(batch_true_boxes_grid), dtype='float32')
        
        # class one_hot
        matching_classes = tf.keras.backend.cast(matching_true_boxes[..., 4], 'int32') 
        class_one_hot = tf.keras.backend.one_hot(matching_classes, NUM_CLASSES + 1)[:,:,:,:,1:]
        class_one_hot = tf.cast(class_one_hot, dtype='float32')
        
        batch = (imgs, detector_mask, matching_true_boxes, class_one_hot, true_boxes_grid)
        yield batch

In [9]:
aug_train_dataset = augmentation_generator(train_dataset)
train_gen = ground_truth_generator(aug_train_dataset)
val_gen = ground_truth_generator(val_generator(val_dataset))

In [10]:
model = DarkNet19Detection()

In [11]:
def iou(x1, y1, w1, h1, x2, y2, w2, h2):  
    xmin1 = x1 - 0.5*w1
    xmax1 = x1 + 0.5*w1
    ymin1 = y1 - 0.5*h1
    ymax1 = y1 + 0.5*h1
    xmin2 = x2 - 0.5*w2
    xmax2 = x2 + 0.5*w2
    ymin2 = y2 - 0.5*h2
    ymax2 = y2 + 0.5*h2
    interx = np.minimum(xmax1, xmax2) - np.maximum(xmin1, xmin2)
    intery = np.minimum(ymax1, ymax2) - np.maximum(ymin1, ymin2)
    inter = interx * intery
    union = w1*h1 + w2*h2 - inter
    iou = inter / (union + 1e-6)
    return iou

![image.png](images/loss.png)
(from YOLOv1 paper)

In [12]:
# loss

def yolov2_loss(detector_mask, matching_true_boxes, class_one_hot, true_boxes_grid, y_pred):
    '''
    Calculate YOLO V2 loss from prediction (y_pred) and ground truth tensors (detector_mask,
    matching_true_boxes, class_one_hot, true_boxes_grid,)

    Parameters
    ----------
    - detector_mask : tensor, shape (batch, size, GRID_W, GRID_H, anchors_count, 1)
        1 if bounding box detected by grid cell, else 0
    - matching_true_boxes : tensor, shape (batch_size, GRID_W, GRID_H, anchors_count, 5)
        Contains adjusted coords of bounding box in YOLO format
    - class_one_hot : tensor, shape (batch_size, GRID_W, GRID_H, anchors_count, class_count)
        One hot representation of bounding box label
    - true_boxes_grid : annotations : tensor (shape : batch_size, max annot, 5)
        true_boxes_grid format : x, y, w, h, c (coords unit : grid cell)
    - y_pred : prediction from model. tensor (shape : batch_size, GRID_W, GRID_H, anchors count, (5 + labels count)
    
    Returns
    -------
    - loss : scalar
    - sub_loss : sub loss list : coords loss, class loss and conf loss : scalar
    '''
    
    # anchors tensor
    anchors = np.array(ANCHORS)
    anchors = anchors.reshape(len(anchors) // 2, 2)
    
    # grid coords tensor
    coord_x = tf.cast(tf.reshape(tf.tile(tf.range(GRID_W), [GRID_H]), (1, GRID_H, GRID_W, 1, 1)), tf.float32)
    coord_y = tf.transpose(coord_x, (0,2,1,3,4))
    coords = tf.tile(tf.concat([coord_x,coord_y], -1), [y_pred.shape[0], 1, 1, 5, 1])
    
    # coordinate loss
    pred_xy = K.sigmoid(y_pred[:,:,:,:,0:2]) # adjust coords between 0 and 1
    pred_xy = (pred_xy + coords) # add cell coord for comparaison with ground truth. New coords in grid cell unit
    pred_wh = K.exp(y_pred[:,:,:,:,2:4]) * anchors # adjust width and height for comparaison with ground truth. New coords in grid cell unit
    #pred_wh = (pred_wh * anchors) # unit : grid cell
    nb_detector_mask = K.sum(tf.cast(detector_mask > 0.0, tf.float32))
    xy_loss = LAMBDA_COORD * K.sum(detector_mask * K.square(matching_true_boxes[...,:2] - pred_xy)) / (nb_detector_mask + 1e-6) # Non /2
    wh_loss = LAMBDA_COORD * K.sum(detector_mask * K.square(K.sqrt(matching_true_boxes[...,2:4]) - 
                                                            K.sqrt(pred_wh))) / (nb_detector_mask + 1e-6)
    coord_loss = xy_loss + wh_loss
    
    # class loss    
    pred_box_class = y_pred[..., 5:]
    true_box_class = tf.argmax(class_one_hot, -1)
    #class_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=true_box_class, logits=pred_box_class)
    class_loss = K.sparse_categorical_crossentropy(target=true_box_class, output=pred_box_class, from_logits=True)
    class_loss = K.expand_dims(class_loss, -1) * detector_mask
    class_loss = LAMBDA_CLASS * K.sum(class_loss) / (nb_detector_mask + 1e-6)
    
    # confidence loss
    pred_conf = K.sigmoid(y_pred[...,4:5])
    # for each detector : iou between prediction and ground truth
    x1 = matching_true_boxes[...,0]
    y1 = matching_true_boxes[...,1]
    w1 = matching_true_boxes[...,2]
    h1 = matching_true_boxes[...,3]
    x2 = pred_xy[...,0]
    y2 = pred_xy[...,1]
    w2 = pred_wh[...,0]
    h2 = pred_wh[...,1]
    ious = iou(x1, y1, w1, h1, x2, y2, w2, h2)
    ious = K.expand_dims(ious, -1)
     
    # for each detector : best ious between prediction and true_boxes (every bounding box of image)
    pred_xy = K.expand_dims(pred_xy, 4) # shape : m, GRID_W, GRID_H, BOX, 1, 2 
    pred_wh = K.expand_dims(pred_wh, 4)
    pred_wh_half = pred_wh / 2.
    pred_mins = pred_xy - pred_wh_half
    pred_maxes = pred_xy + pred_wh_half
    true_boxe_shape = K.int_shape(true_boxes_grid)
    true_boxes_grid = K.reshape(true_boxes_grid, [true_boxe_shape[0], 1, 1, 1, true_boxe_shape[1], true_boxe_shape[2]])
    true_xy = true_boxes_grid[...,0:2]
    true_wh = true_boxes_grid[...,2:4]
    true_wh_half = true_wh * 0.5
    true_mins = true_xy - true_wh_half
    true_maxes = true_xy + true_wh_half
    intersect_mins = K.maximum(pred_mins, true_mins) # shape : m, GRID_W, GRID_H, BOX, max_annot, 2 
    intersect_maxes = K.minimum(pred_maxes, true_maxes) # shape : m, GRID_W, GRID_H, BOX, max_annot, 2
    intersect_wh = K.maximum(intersect_maxes - intersect_mins, 0.) # shape : m, GRID_W, GRID_H, BOX, max_annot, 1
    intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1] # shape : m, GRID_W, GRID_H, BOX, max_annot, 1
    pred_areas = pred_wh[..., 0] * pred_wh[..., 1] # shape : m, GRID_W, GRID_H, BOX, 1, 1
    true_areas = true_wh[..., 0] * true_wh[..., 1] # shape : m, GRID_W, GRID_H, BOX, max_annot, 1
    union_areas = pred_areas + true_areas - intersect_areas
    iou_scores = intersect_areas / union_areas # shape : m, GRID_W, GRID_H, BOX, max_annot, 1
    best_ious = K.max(iou_scores, axis=4)  # Best IOU scores.
    best_ious = K.expand_dims(best_ious) # shape : m, GRID_W, GRID_H, BOX, 1
    
    # no object confidence loss
    no_object_detection = K.cast(best_ious < 0.6, K.dtype(best_ious)) 
    noobj_mask = no_object_detection * (1 - detector_mask)
    nb_noobj_mask  = K.sum(tf.cast(noobj_mask  > 0.0, tf.float32))
    
    noobject_loss =  LAMBDA_NOOBJECT * K.sum(noobj_mask * K.square(-pred_conf)) / (nb_noobj_mask + 1e-6)
    # object confidence loss
    object_loss = LAMBDA_OBJECT * K.sum(detector_mask * K.square(ious - pred_conf)) / (nb_detector_mask + 1e-6)
    # total confidence loss
    conf_loss = noobject_loss + object_loss
    
    # total loss
    loss = conf_loss + class_loss + coord_loss
    sub_loss = [conf_loss, class_loss, coord_loss]  
              
    return loss, sub_loss

In [13]:
# gradients
def grad(model, img, detector_mask, matching_true_boxes, class_one_hot, true_boxes, training=True):
    with tf.GradientTape() as tape:
        y_pred = model(img)
        loss, sub_loss = yolov2_loss(detector_mask, matching_true_boxes, class_one_hot, true_boxes, y_pred)
    return loss, sub_loss, tape.gradient(loss, model.trainable_variables)

In [17]:
# training
def train(epochs, model, train_dataset, val_dataset, steps_per_epoch_train, steps_per_epoch_val, train_name = 'train'):
    '''
    Train YOLO model for n epochs.
    Eval loss on training and validation dataset.
    Log training loss and validation loss for tensorboard.
    Save best weights during training (according to validation loss).

    Parameters
    ----------
    - epochs : integer, number of epochs to train the model.
    - model : YOLO model.
    - train_dataset : YOLO ground truth and image generator from training dataset.
    - val_dataset : YOLO ground truth and image generator from validation dataset.
    - steps_per_epoch_train : integer, number of batch to complete one epoch for train_dataset.
    - steps_per_epoch_val : integer, number of batch to complete one epoch for val_dataset.
    - train_name : string, training name used to log loss and save weights.
    
    Notes :
    - train_dataset and val_dataset generate YOLO ground truth tensors : detector_mask,
      matching_true_boxes, class_one_hot, true_boxes_grid. Shape of these tensors (batch size, tensor shape).
    - steps per epoch = number of images in dataset // batch size of dataset
    
    Returns
    -------
    - loss history : [train_loss_history, val_loss_history] : list of average loss for each epoch.
    '''
    num_epochs = epochs
    steps_per_epoch_train = steps_per_epoch_train
    steps_per_epoch_val = steps_per_epoch_val
    train_loss_history = []
    val_loss_history = []
    best_val_loss = 1e6
    
    # optimizer
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-6, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    
    # training
    for epoch in range(num_epochs):
        epoch_loss = []
        epoch_val_loss = []
        epoch_val_sub_loss = []
        print('Epoch {} :'.format(epoch))
        # train
        for batch_idx in range(steps_per_epoch_train): 
            img, detector_mask, matching_true_boxes, class_one_hot, true_boxes =  next(train_dataset)
            loss, _, grads = grad(model, img, detector_mask, matching_true_boxes, class_one_hot, true_boxes)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))
            epoch_loss.append(loss)
            print('-', end='')
        print(' | ', end='')
        # val
        for batch_idx in range(steps_per_epoch_val): 
            img, detector_mask, matching_true_boxes, class_one_hot, true_boxes =  next(val_dataset)
            loss, sub_loss, grads = grad(model, img, detector_mask, matching_true_boxes, class_one_hot, true_boxes, training=False)
            epoch_val_loss.append(loss)
            epoch_val_sub_loss.append(sub_loss)
            print('-', end='')

        loss_avg = np.mean(np.array(epoch_loss))
        val_loss_avg = np.mean(np.array(epoch_val_loss))
        sub_loss_avg = np.mean(np.array(epoch_val_sub_loss), axis=0)
        train_loss_history.append(loss_avg)
        val_loss_history.append(val_loss_avg)
        
        print(' loss = {:.4f}, val_loss = {:.4f} (conf={:.4f}, class={:.4f}, coords={:.4f})'.format(
            loss_avg, val_loss_avg, sub_loss_avg[0], sub_loss_avg[1], sub_loss_avg[2]))
        
    return [train_loss_history, val_loss_history]

In [18]:
aug_train_dataset = augmentation_generator(train_dataset)
train_gen = ground_truth_generator(aug_train_dataset)
val_gen = ground_truth_generator(val_generator(val_dataset))

In [19]:
train(EPOCHS, model, train_gen, val_gen, 10, 2, 'training_1')

Epoch 0 :
---------- | -- loss = 9.7606, val_loss = 9.9467 (conf=0.3096, class=2.9955, coords=6.6415)
Epoch 1 :
---------- | -- loss = 9.7792, val_loss = 9.9419 (conf=0.3094, class=2.9955, coords=6.6371)
Epoch 2 :
---------- | -- loss = 9.7678, val_loss = 9.9364 (conf=0.3090, class=2.9955, coords=6.6319)
Epoch 3 :
---------- | -- loss = 9.7545, val_loss = 9.9297 (conf=0.3086, class=2.9955, coords=6.6256)
Epoch 4 :
---------- | -- loss = 9.7404, val_loss = 9.9214 (conf=0.3082, class=2.9954, coords=6.6178)
Epoch 5 :
---------- | -- loss = 9.7451, val_loss = 9.9106 (conf=0.3075, class=2.9954, coords=6.6077)
Epoch 6 :
---------- | -- loss = 9.7254, val_loss = 9.8965 (conf=0.3067, class=2.9954, coords=6.5944)
Epoch 7 :
---------- | -- loss = 9.7108, val_loss = 9.8779 (conf=0.3055, class=2.9954, coords=6.5770)
Epoch 8 :
---------- | -- loss = 9.7073, val_loss = 9.8518 (conf=0.3040, class=2.9954, coords=6.5525)
Epoch 9 :
---------- | -- loss = 9.6749, val_loss = 9.8157 (conf=0.3017, class=2.9

KeyboardInterrupt: 

The paper trained DarkNet19Classification first.  
Schedule learning rate

# Reference

[YOLOv2 Paper](https://arxiv.org/pdf/1612.08242.pdf)  
[YOLOv1 Paper](https://arxiv.org/pdf/1506.02640.pdf)  
[jmpap/YOLOV2-Tensorflow-2.0](https://github.com/jmpap/YOLOV2-Tensorflow-2.0)