# 深度學習第二次競賽報告 - 第12組 工具人智慧
組員：
<br/>107024501 高瑀鍹
<br/>107024506 王子誠
<br/>107024511 羅揚
<br/>107024522 戴子翔

## Outlines
這次的競賽中，我們主要的方法條列如下：
1. Balance Data
2. Data Augmentation
3. Models
4. Training
5. Non-max Suppression
6. 水平翻轉預測
7. Ensemble

以下會個別介紹每個方法。

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

# Install TensorFlow
try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
except Exception:
    pass

import tensorflow as tf
# import tensorflow_addons as tfa
import numpy as np

from datetime import datetime

%matplotlib inline
import matplotlib.pyplot as plt
import cv2
import os
import random

from tensorflow import keras
from tensorflow.keras import layers

import imgaug as ia
import imgaug.augmenters as iaa
from imgaug.augmentables.bbs import BoundingBox, BoundingBoxesOnImage

from shutil import copyfile
import os

In [None]:
# The 20 object class names; a class id used elsewhere in this file is the
# index into this list (e.g. 5 -> "bus", 14 -> "person").
classes_name =  ["aeroplane", "bicycle", "bird", "boat", "bottle", 
                 "bus", "car", "cat", "chair", "cow", "diningtable", 
                 "dog", "horse", "motorbike", "person", "pottedplant", 
                 "sheep", "sofa", "train","tvmonitor"]

# Configure GPU visibility and memory behaviour before any GPU is initialised.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Restrict TensorFlow to only use the first listed GPU (gpus[0])
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')

        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

In [None]:
# common params
IMAGE_SIZE = 448  # images are resized to IMAGE_SIZE x IMAGE_SIZE
BATCH_SIZE = 8  # dataset batch size; also the divisor of the summed loss
NUM_CLASSES = 20
MAX_OBJECTS_PER_IMAGE = 20  # box records are padded/truncated to this many per image

# dataset params
PATH_TO_DATA = "../data"
DATA_PATH = PATH_TO_DATA + '/train_1.txt'  # annotation list read by DatasetGenerator
IMAGE_DIR = PATH_TO_DATA + '/VOCdevkit_train/VOCdevkit_train/VOC2007/JPEGImages/'

# model params
CELL_SIZE = 7  # the model predicts on a CELL_SIZE x CELL_SIZE grid
BOXES_PER_CELL = 2
# presumably the weights of the individual loss terms -- their use is not
# visible in this part of the file; TODO confirm
OBJECT_SCALE = 1
NOOBJECT_SCALE = 0.5
CLASS_SCALE = 1
COORD_SCALE = 5

# training params
LEARNING_RATE = 1e-5
EPOCHS = 15

# True: only the last two conv5 blocks of the ResNet backbone stay trainable;
# False: the whole backbone is frozen.
is_fine_tune = False

# presumably checkpoint directory / file prefix -- TODO confirm against the training loop
CKT_Dir = "../ckpts/17_resnet_balance"
checkpoint_name = 'yolo_v1_17_resnet_balance'

is_load_best = False

# presumably the confidence and IoU thresholds used at prediction time
# (non-max suppression) -- TODO confirm
PROB_THRES = 0.01
IOU_THRES = 0.3

## Balance Data

不同label之間的個數相差很多，例如 : 人(label 14)的個數就高達5392筆，相反地公車(label 5)的個數就只有272筆，最高類別的個數與最低類別的個數相差約20倍，這會造成預測結果被混淆而傾向將多數的結果視為人，且公車因為資料少，就很難判定。
<br/>我們的作法是設定篩選條件：在不增加原本個數就很多的類別（例如人）的前提下，重複加入含有個數較少類別的圖片，藉此縮小不同類別個數之間的差距。根據最後的結果，平衡後的資料最高類別的個數 (6979筆; label 8) 與最低類別的個數 (3241筆; label 0) 相差約2倍，可見類別的個數相較之前平衡許多。

In [None]:
 def list_add(a,b):
    c = []
    for i in range(len(a)):
        c.append(a[i]+b[i])
    return c

# Module-level handle on the raw training annotations; it is iterated by
# `imbalance.__init__`.
# NOTE(review): the handle is never closed in the code visible here.
training_data_file = open("../data/pascal_voc_training_data.txt", "r")
class imbalance:
    """Oversample images of under-represented classes to balance the data.

    Every annotation line is an image name followed by groups of five tokens
    whose fifth token is the class label, so the labels sit at token indices
    5, 10, 15, ...  An image is duplicated when it contains the target class
    of a rule and stays below that rule's limits for other classes (most
    rules require that label 14 is absent).

    Attributes:
        new: The stripped input lines followed by the duplicated lines.
        each_pic_count: One 20-element list of per-class counts per image.
        each_class_count: Per-class counts summed over all input images.
        add_pic_index: Image indices of the duplicated lines, in the order
            they were appended to ``new``.
    """

    _NUM_LABELS = 20

    # (target label, number of copies, {other label: exclusive upper bound on
    # that label's count in the image}).  A bound of 1 means "must be absent".
    # The order matters: it defines the order of ``add_pic_index``.
    _OVERSAMPLING_RULES = (
        (5, 26, {14: 1}),
        (10, 60, {14: 2, 8: 3}),
        (0, 10, {14: 1, 5: 1, 6: 1, 8: 1}),
        (18, 15, {14: 1, 5: 1, 6: 1, 8: 1, 10: 1}),
        (19, 20, {14: 1, 8: 1, 10: 1, 0: 1, 18: 1}),
        (9, 15, {14: 1}),
        (1, 40, {14: 1}),
        (2, 6, {14: 1}),
        (3, 10, {2: 1, 14: 1}),
        (7, 10, {14: 1}),
        (11, 10, {14: 1}),
        (12, 40, {14: 1}),
        (13, 40, {14: 1, 6: 1, 10: 1}),
        (16, 10, {14: 1}),
        (17, 10, {14: 1, 15: 1, 8: 1}),
        (4, 6, {14: 1}),
        (15, 3, {14: 1}),
    )

    def __init__(self, data_file=None):
        """Read the annotations and build the oversampled line list.

        Args:
            data_file: Optional iterable of annotation lines (for example an
                open text file).  When omitted, the module-level
                ``training_data_file`` handle is read, as before.
        """
        source = training_data_file if data_file is None else data_file

        self.each_class_count = []
        self.each_pic_count = []
        self.add_pic_index = []
        self.new = []

        for line in source:
            line = line.strip()
            self.new.append(line)
            labels = line.split()[5::5]
            # Count after collecting the labels so that a line without any
            # object yields all zeros instead of reusing the previous counts.
            self.each_pic_count.append(
                [labels.count(str(k)) for k in range(self._NUM_LABELS)])

        # Sum over every image that was read; the image count used to be
        # hard-coded as 4974.
        totals = [0] * self._NUM_LABELS
        for counts in self.each_pic_count:
            totals = [t + c for t, c in zip(totals, counts)]
        self.each_class_count = totals

        for target, copies, limits in self._OVERSAMPLING_RULES:
            matches = [
                i for i, counts in enumerate(self.each_pic_count)
                if counts[target] > 0
                and all(counts[k] < bound for k, bound in limits.items())
            ]
            # np.repeat keeps the copies of one image adjacent: i0, i0, ..., i1, ...
            self.add_pic_index += list(np.repeat(matches, copies))

        for i in self.add_pic_index:
            self.new.append(self.new[i])

In [None]:
# `Counter` is needed for the label tally below and is not imported at the
# top of the file.
from collections import Counter

# Build the balanced line list (original lines plus oversampled duplicates).
a = imbalance()

# Tally every class label of the balanced set.  The labels are the tokens at
# indices 5, 10, 15, ... of each line.
num = []
for line in a.new:
    num.extend(line.split()[5::5])
result = Counter(num)

new_data = a.new

# Persist the balanced annotation list, one image per line.
with open('../data/train_1.txt', 'w') as f:
    for item in new_data:
        f.write("%s\n" % item)

## Data Augmentation

在做Data augmentation時，我們嘗試了兩種方法：
1. 參考Paper中提到的augmentation方法，包含Scaling、Translation、調整圖片的飽和度、曝光度等，此外，我們也對圖片進行水平與垂直翻轉。我們以20%的機率保留原圖，確保模型可以看到部分的原始圖片。
2. 我們也嘗試自己設計了augmentation方法，使用到的方法包含Scaling、Translation、Rotation、Crop、Shear，以及調整對比、模糊、Random noise、亮度、飽和度、曝光度，以及轉為灰階圖、RGB shuffle、水平與垂直翻轉。因為使用的augmentation方法更多，我們選擇保留40%的原圖。

因為tensorflow中並沒有提供我們足夠多的data augmentation方法，而且進行data augmentation時，我們也必須將bounding box進行調整。所以，我們使用imgaug package進行data augmentation，起初我們擔心使用非tensorflow會降低效率，但根據tensorflow的官網，準備dataset的階段是使用CPU進行運算，因此應該不會降低太多效率。

In [None]:
@tf.function
def random_flip(image,xcenter,ycenter):
    """Randomly flip an image vertically and/or horizontally with its box centers.

    Args:
        image: Image tensor of size IMAGE_SIZE x IMAGE_SIZE.
        xcenter: Per-box x centers in pixels of the resized image.
        ycenter: Per-box y centers in pixels of the resized image.

    Returns:
        ``(image, xcenter, ycenter)`` after the flips.  Rows whose center is
        (0, 0) -- the zero padding -- are left at 0.
    """
    # Scalar draws: tf.cond (used by autograph for the `if`s below) documents
    # its predicate as a scalar.
    up_down_outcome = tf.random.uniform([], 0, 1)
    right_left_outcome = tf.random.uniform([], 0, 1)

    # IMAGE_SIZE for real boxes, 0 for padding rows, so that
    # `grand - center` mirrors real boxes and keeps padding at 0.
    has_box = tf.math.logical_or(tf.not_equal(xcenter, 0.),
                                 tf.not_equal(ycenter, 0.))
    grand = tf.cast(tf.where(has_box, IMAGE_SIZE, 0), tf.float32)

    if up_down_outcome < up_down_flip_p:
        image = tf.image.flip_up_down(image)
        ycenter = grand - ycenter

    if right_left_outcome < left_right_flip_p:
        image = tf.image.flip_left_right(image)
        xcenter = grand - xcenter

    return image, xcenter, ycenter

@tf.function
def to_gray(image):
    """With probability ``gray_p``, replace the image by its 3-channel grayscale version."""
    # Scalar draw: tf.cond documents its predicate as a scalar.
    prob = tf.random.uniform([], 0, 1)

    if prob < gray_p:
        # Convert to one channel and back so the channel count stays 3.
        image = tf.image.rgb_to_grayscale(image)
        image = tf.image.grayscale_to_rgb(image)

    return image

@tf.function
def gaussian_noise(image):
    """With probability ``noise_p``, add zero-mean Gaussian noise (stddev 5) to the image."""
    # Scalar draw: tf.cond documents its predicate as a scalar.
    prob = tf.random.uniform([], 0, 1)

    if prob < noise_p:
        # Uses the static shape, so the image shape must be fully known here.
        noise = tf.random.normal(image.shape, stddev=5)
        image = tf.math.add(image, noise)

    return image

@tf.function
def brightness(image):
    """With probability ``brightness_p``, apply a random brightness shift (max delta 5)."""
    # Scalar draw: tf.cond documents its predicate as a scalar.
    prob = tf.random.uniform([], 0, 1)

    if prob < brightness_p:
        image = tf.image.random_brightness(image, 5)

    return image

@tf.function
def hue(image):
    """With probability ``hue_p``, apply a random hue shift (max delta 0.5)."""
    # Scalar draw: tf.cond documents its predicate as a scalar.
    prob = tf.random.uniform([], 0, 1)

    # NOTE(review): the callers in this file pass float images before
    # preprocess_input, i.e. apparently in [0, 255] -- confirm the HSV
    # conversion behaves as intended for that range.
    if prob < hue_p:
        image = tf.image.random_hue(image, 0.5)

    return image

@tf.function
def saturation(image):
    """With probability ``saturation_p``, scale the saturation by a random factor in [0, 1.5]."""
    # Scalar draw: tf.cond documents its predicate as a scalar.
    prob = tf.random.uniform([], 0, 1)

    if prob < saturation_p:
        image = tf.image.random_saturation(image, 0, 1.5)

    return image

### 1. Paper Augmentation
以下為實作Paper的data augmentation的程式碼。

In [None]:
# data augmentation parameters (paper-style augmentation)
# flip probabilities read by random_flip
up_down_flip_p = 0.1
left_right_flip_p = 0.5

# data_aug augments a sample only when a uniform draw exceeds theSame, so
# theSame is the probability of keeping the sample unchanged.
# NOTE(review): the accompanying write-up states 20% for this variant.
theSame = 0.1
crop_p = 0
geom_p = 0.8  # probability of the geometric (translate/scale) group in imgaug_trans
# probabilities read by to_gray / gaussian_noise / hue / saturation / brightness
gray_p = 0
noise_p = 0
hue_p = 0.8
saturation_p = 0.8
brightness_p = 0
# not referenced by the paper-style pipeline visible in this file
contrast_p = 0
blur_p = 0

# sampling ranges; the paper-style imgaug_trans only reads scale_range and
# translate_range
rotate_range=(-45, 45)
scale_range=(0.8, 1.2)
translate_range=(-0.2, 0.2)
shear_range=(-20, 20)
crop_pad_range=(-0.1, 0.1)

In [None]:
def imgaug_trans(image,labels):
    """Randomly translate/scale an image with imgaug and remap its boxes.

    Both arguments are converted with ``.numpy()``, so this must run eagerly
    (it is wrapped in ``tf.py_function`` by ``data_aug``).

    Args:
        image: Image tensor.
        labels: 2-D tensor whose rows are (x_center, y_center, w, h, class).
            The number of rows with a non-zero x_center is taken as the
            number of real boxes, and those are expected to come first.

    Returns:
        ``(image_aug, output)``: the augmented image and an array shaped like
        ``labels`` holding the clipped surviving boxes first, zero-padded.
    """
    num_boxes = tf.math.count_nonzero(labels[:,0]).numpy()
    image = image.numpy()
    labels = labels.numpy()

    # Center/size rows -> corner boxes for imgaug.
    corner_boxes = []
    for cx, cy, box_w, box_h in labels[:num_boxes, :4]:
        half_w = box_w / 2
        half_h = box_h / 2
        corner_boxes.append(BoundingBox(x1=cx - half_w, y1=cy - half_h,
                                        x2=cx + half_w, y2=cy + half_h))
    bbs = BoundingBoxesOnImage(corner_boxes, shape=image.shape)

    # Between one and three of: x translation, y translation, scaling.
    geometric_ops = [
        iaa.Affine(translate_percent={"x":(translate_range[0], translate_range[1])},mode =ia.ALL),
        iaa.Affine(translate_percent={"y":(translate_range[0], translate_range[1])},mode = ia.ALL),
        iaa.Affine(scale=(scale_range[0], scale_range[1]),mode = ia.ALL),
    ]
    geometric = iaa.SomeOf((1, 3), geometric_ops, random_order=True)
    seq = iaa.Sequential([iaa.Sometimes(geom_p, geometric)])

    image_aug, bbs_aug = seq(image=image, bounding_boxes=bbs)

    # Boxes that ended up completely outside the image are dropped.
    inside = bbs_aug.remove_out_of_image()
    survivors = inside.bounding_boxes
    if len(survivors) == num_boxes:
        kept_idx = list(range(num_boxes))
    else:
        survivor_set = set(survivors)
        kept_idx = [i for i in range(num_boxes)
                    if bbs_aug.bounding_boxes[i] in survivor_set]

    kept_classes = labels[kept_idx, 4]
    clipped = inside.clip_out_of_image().bounding_boxes

    # Corner boxes -> (x_center, y_center, w, h, class) rows, zero-padded.
    output = np.zeros_like(labels)
    for row, class_id in enumerate(kept_classes):
        box = clipped[row]
        output[row, 0] = (box.x1 + box.x2) / 2
        output[row, 1] = (box.y1 + box.y2) / 2
        output[row, 2] = box.x2 - box.x1
        output[row, 3] = box.y2 - box.y1
        output[row, 4] = class_id

    return image_aug, output

In [None]:
@tf.function
def data_aug(image,labels):
    """Apply the paper-style augmentation to one sample.

    With probability ``theSame`` the sample is returned unchanged; otherwise
    it goes through ``imgaug_trans`` followed by random hue and saturation.

    Args:
        image: float32 image tensor.
        labels: float32 tensor of (x_center, y_center, w, h, class) rows.

    Returns:
        The (possibly augmented) ``(image, labels)`` pair.
    """
    # Scalar draw: tf.cond (used by autograph for the `if`) documents its
    # predicate as a scalar.
    same_sample = tf.random.uniform([], 0, 1)
    if same_sample > theSame:
        # not the same
        # imgaug works on NumPy arrays, hence the py_function wrapper.
        image, labels = tf.py_function(
            func=imgaug_trans,
            inp=[image,labels],
            Tout=[tf.float32,tf.float32]
            )
        image = hue(image)
        image = saturation(image)

    return image, labels

### 2. Our Augmentation
以下是實作我們自己設計的augmentation的程式碼。

In [None]:
# data augmentation parameters (our own augmentation)
# flip probabilities read by random_flip
up_down_flip_p = 0.05
left_right_flip_p = 0.5

# data_aug augments a sample only when a uniform draw exceeds theSame, so
# theSame is the probability of keeping the sample unchanged.
theSame = 0.4
crop_p = 0.6  # probability of CropAndPad in imgaug_trans
geom_p = 0.8  # probability of the geometric (affine) group in imgaug_trans
brightness_p = 0.3
hue_p = 0.8
saturation_p = 0.8
# NOTE: this cell names the probabilities `contrast` and `blur` (no `_p`
# suffix); those are the names imgaug_trans below reads.
contrast = 0.3
gray_p = 0.1
noise_p = 0.4
blur = 0.2

# sampling ranges for the affine transforms and for CropAndPad
rotate_range=(-45, 45)
scale_range=(0.2, 1.2)
translate_range=(-0.2, 0.2)
shear_range=(-20, 20)
crop_pad_range=(-0.2, 0.2)

In [None]:
def imgaug_trans(image,labels):
    """Apply our imgaug pipeline to an image and remap its boxes.

    The pipeline is: optional crop/pad, optional group of affine transforms
    (translate, scale, rotate, shear), optional contrast change or channel
    shuffle, optional Gaussian blur.

    Both arguments are converted with ``.numpy()``, so this must run eagerly
    (it is wrapped in ``tf.py_function`` by ``data_aug``).

    Args:
        image: Image tensor.
        labels: 2-D tensor whose rows are (x_center, y_center, w, h, class).
            The number of rows with a non-zero x_center is taken as the
            number of real boxes, and those are expected to come first.

    Returns:
        ``(image_aug, output)``: the augmented image and an array shaped like
        ``labels`` holding the clipped surviving boxes first, zero-padded.
    """
    num_boxes = tf.math.count_nonzero(labels[:,0]).numpy()
    image = image.numpy()
    labels = labels.numpy()

    # Center/size rows -> corner boxes for imgaug.
    corner_boxes = []
    for cx, cy, box_w, box_h in labels[:num_boxes, :4]:
        half_w = box_w / 2
        half_h = box_h / 2
        corner_boxes.append(BoundingBox(x1=cx - half_w, y1=cy - half_h,
                                        x2=cx + half_w, y2=cy + half_h))
    bbs = BoundingBoxesOnImage(corner_boxes, shape=image.shape)

    crop_or_pad = iaa.Sometimes(crop_p,iaa.CropAndPad(percent=(crop_pad_range[0], crop_pad_range[1])))

    # Between one and five affine transforms, applied in random order.
    geometric_ops = [
        iaa.Affine(translate_percent={"x":(translate_range[0], translate_range[1])}),
        iaa.Affine(translate_percent={"y":(translate_range[0], translate_range[1])}),
        iaa.Affine(scale=(scale_range[0], scale_range[1])),
        iaa.Affine(rotate=(rotate_range[0], rotate_range[1])),
        iaa.Affine(shear=(shear_range[0],shear_range[1])),
    ]
    geometric = iaa.Sometimes(geom_p,
                              iaa.SomeOf((1, 5), geometric_ops, random_order=True))

    # Exactly one of: stronger/weaker contrast (per channel or global) or a
    # channel shuffle.
    colour_ops = [
        iaa.contrast.LinearContrast(alpha=(1.25, 1.5),per_channel=True),
        iaa.contrast.LinearContrast(alpha=(0.25, 0.5),per_channel=True),
        iaa.contrast.LinearContrast(alpha=(0.25, 0.5)),
        iaa.contrast.LinearContrast(alpha=(1.25, 1.5)),
        iaa.ChannelShuffle(),
    ]
    colour = iaa.Sometimes(contrast, iaa.OneOf(colour_ops))

    smoothing = iaa.Sometimes(blur, iaa.GaussianBlur(sigma=(0.1,3)))

    seq = iaa.Sequential([crop_or_pad, geometric, colour, smoothing])

    image_aug, bbs_aug = seq(image=image, bounding_boxes=bbs)

    # Boxes that ended up completely outside the image are dropped.
    inside = bbs_aug.remove_out_of_image()
    survivors = inside.bounding_boxes
    if len(survivors) == num_boxes:
        kept_idx = list(range(num_boxes))
    else:
        survivor_set = set(survivors)
        kept_idx = [i for i in range(num_boxes)
                    if bbs_aug.bounding_boxes[i] in survivor_set]

    kept_classes = labels[kept_idx, 4]
    clipped = inside.clip_out_of_image().bounding_boxes

    # Corner boxes -> (x_center, y_center, w, h, class) rows, zero-padded.
    output = np.zeros_like(labels)
    for row, class_id in enumerate(kept_classes):
        box = clipped[row]
        output[row, 0] = (box.x1 + box.x2) / 2
        output[row, 1] = (box.y1 + box.y2) / 2
        output[row, 2] = box.x2 - box.x1
        output[row, 3] = box.y2 - box.y1
        output[row, 4] = class_id

    return image_aug, output

In [None]:
@tf.function
def data_aug(image,labels):
    """Apply our own augmentation to one sample.

    With probability ``theSame`` the sample is returned unchanged; otherwise
    it goes through Gaussian noise, ``imgaug_trans``, and random brightness,
    hue, saturation and grayscale conversion, in that order.

    Args:
        image: float32 image tensor.
        labels: float32 tensor of (x_center, y_center, w, h, class) rows.

    Returns:
        The (possibly augmented) ``(image, labels)`` pair.
    """
    # Scalar draw: tf.cond (used by autograph for the `if`) documents its
    # predicate as a scalar.
    same_sample = tf.random.uniform([], 0, 1)
    if same_sample > theSame:
        # not the same
        # Noise is added first, while the static image shape is still known
        # (gaussian_noise reads image.shape).
        image = gaussian_noise(image)
        # imgaug works on NumPy arrays, hence the py_function wrapper.
        image, labels = tf.py_function(
            func=imgaug_trans,
            inp=[image, labels],
            Tout=[tf.float32, tf.float32]
            )
        image = brightness(image)
        image = hue(image)
        image = saturation(image)
        image = to_gray(image)

    return image, labels

## Models

我們嘗試過許多種不同的Pretrained model，包含VGG16、VGG19、Inception V3、NASNet等，也嘗試過在Pretrained model後接上許多不同的layers。
<br/>我們一開始使用VGG16替代Paper的前20層，並在完成augmentation階段後才開始嘗試不同的模型。有些模型的效果一直都很不好，如Inception V3，但有些模型卻是在部分augmentation效果好、其他效果不好，如NASNet。最後嘗試出來表現較好並在不同的augmentation上都能夠有好結果的模型如下：
1. 使用ResNet 152替代Paper的前20層，接下來的四層 Convolution+Leaky ReLU 依照Paper的設定，但不使用Padding，最後再接上4096個neuron的全連接層以及Output layer。
2. 使用Xception，並在之後使用兩層Convolution+Max pooling+Batch normalization，全連接層則使用一層4096個neuron、一層512個neuron並各自加上Dropout，最後才接上Output layer。

### 1. ResNet 152
這個模型是上述提到的第一個模型，因為ResNet是使用Caffe訓練，必須要將資料進行前處理。

In [None]:
class DatasetGenerator:
    """
    Load the annotation list at DATA_PATH and create an input pipeline ready
    to be fed into the ResNet-based model.
    - resizes images to IMAGE_SIZE x IMAGE_SIZE and rescales the boxes
    - applies random flips and `data_aug`
    - applies `tf.keras.applications.resnet.preprocess_input`
    - shuffles the input
    - builds batches
    """

    def __init__(self):
        """Read DATA_PATH and store shuffled names, box records and object counts.

        Each non-blank line is an image name followed by groups of five
        numbers (xmin, ymin, xmax, ymax, class).  Every record is padded with
        zeros or truncated to MAX_OBJECTS_PER_IMAGE boxes.
        """
        self.image_names = []
        self.record_list = []
        self.object_num_list = []
        record_len = MAX_OBJECTS_PER_IMAGE * 5

        # filling the record_list; the file is closed as soon as it is read
        with open(DATA_PATH, 'r') as input_file:
            for line in input_file:
                # split() tolerates repeated spaces and trailing whitespace
                ss = line.split()
                if not ss:
                    continue  # blank line

                record = [float(num) for num in ss[1:]]

                self.image_names.append(ss[0])
                self.object_num_list.append(min(len(record) // 5, MAX_OBJECTS_PER_IMAGE))

                # resize newest data to exactly record_len values
                record = record[:record_len]
                record += [0.] * (record_len - len(record))
                self.record_list.append(record)

        ## shuffle the three lists with one common permutation
        idx = random.sample(range(len(self.image_names)), len(self.image_names))
        self.image_names = [self.image_names[i] for i in idx]
        self.record_list = [self.record_list[i] for i in idx]
        self.object_num_list = [self.object_num_list[i] for i in idx]

    def _data_preprocess(self, image_name, raw_labels, object_num):
        """Load one image and turn its record into augmented training inputs.

        Args:
            image_name: File name relative to IMAGE_DIR.
            raw_labels: Flat record of (xmin, ymin, xmax, ymax, class) groups.
            object_num: Number of real boxes in the record.

        Returns:
            ``(image, labels, object_num)`` where ``labels`` rows are
            (x_center, y_center, w, h, class) in pixels of the resized image.
        """
        image_file = tf.io.read_file(IMAGE_DIR+image_name)
        image = tf.io.decode_jpeg(image_file, channels=3)

        h = tf.shape(image)[0]
        w = tf.shape(image)[1]

        # scale factors from original pixel coordinates to the resized image
        width_rate = IMAGE_SIZE * 1.0 / tf.cast(w, tf.float32)
        height_rate = IMAGE_SIZE * 1.0 / tf.cast(h, tf.float32)

        image = tf.image.resize(image, size=[IMAGE_SIZE, IMAGE_SIZE])

        raw_labels = tf.cast(tf.reshape(raw_labels, [-1, 5]), tf.float32)

        xmin = raw_labels[:, 0]
        ymin = raw_labels[:, 1]
        xmax = raw_labels[:, 2]
        ymax = raw_labels[:, 3]
        class_num = raw_labels[:, 4]

        xcenter = (xmin + xmax) * 1.0 / 2.0 * width_rate
        ycenter = (ymin + ymax) * 1.0 / 2.0 * height_rate

        box_w = (xmax - xmin) * width_rate
        box_h = (ymax - ymin) * height_rate

        image, xcenter, ycenter = random_flip(image, xcenter, ycenter)

        labels = tf.stack([xcenter, ycenter, box_w, box_h, class_num], axis = 1)

        # augmentation runs on the raw resized pixels, before the
        # backbone-specific input transform
        image, labels = data_aug(image, labels)
        image = tf.keras.applications.resnet.preprocess_input(image)

        return image, labels, tf.cast(object_num, tf.int32)

    def generate(self):
        """Return the batched tf.data.Dataset of (image, labels, object_num)."""
        dataset = tf.data.Dataset.from_tensor_slices((self.image_names,
                                                      np.array(self.record_list),
                                                      np.array(self.object_num_list)))
        dataset = dataset.map(self._data_preprocess, num_parallel_calls = tf.data.experimental.AUTOTUNE)
        # NOTE(review): shuffling after map keeps up to 1000 decoded,
        # augmented images in the shuffle buffer.
        dataset = dataset.shuffle(1000)
        dataset = dataset.batch(BATCH_SIZE)

        dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

        return dataset

In [None]:
def conv_leaky_relu(inputs, filters, size, stride):
    """Apply a Conv2D layer followed by LeakyReLU(0.1) and return the result.

    Args:
        inputs: Input tensor.
        filters: Number of convolution filters.
        size: Convolution kernel size.
        stride: Convolution stride.
    """
    convolution = layers.Conv2D(filters, kernel_size=size, strides=stride)
    activation = layers.LeakyReLU(0.1)
    return activation(convolution(inputs))

In [None]:
# ImageNet-pretrained ResNet152 backbone without its classification top.
pre_trained_model = tf.keras.applications.ResNet152(
    include_top=False,
    weights='imagenet',
    input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3),
)

# Frozen entirely unless fine-tuning is requested.
pre_trained_model.trainable = bool(is_fine_tune)

if is_fine_tune:
    # When fine-tuning, only the last two conv5 blocks stay trainable.
    trainable_prefixes = ("conv5_block3_", "conv5_block2_")
    for layer in pre_trained_model.layers:
        if not layer.name.startswith(trainable_prefixes):
            layer.trainable = False

In [None]:
# Detection head on top of the backbone: four Conv+LeakyReLU layers with
# strides 1, 2, 1, 1, then two fully connected layers.
x = pre_trained_model.output
for stride in (1, 2, 1, 1):
    x = conv_leaky_relu(x, 1024, 3, stride)

x = layers.Flatten()(x)
x = layers.Dense(4096, kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.01))(x)
x = layers.LeakyReLU(0.1)(x)
# One flat vector per image: for every grid cell, NUM_CLASSES class scores
# plus 5 values per predicted box.  Sized from NUM_CLASSES (previously a
# hard-coded 20) so it stays consistent with the loss, which slices the
# prediction by NUM_CLASSES.
outputs = layers.Dense(CELL_SIZE*CELL_SIZE*(BOXES_PER_CELL*5+NUM_CLASSES),
                       activation='relu', kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.01))(x)

YOLO = keras.Model(inputs=pre_trained_model.input, outputs=outputs, name="YOLO")

### 2. Xception
這個模型是上述提到的第二個模型，因為Xception是使用tensorflow訓練，必須要將資料進行前處理。

In [None]:
class DatasetGenerator:
    """
    Load the annotation list at DATA_PATH and create an input pipeline ready
    to be fed into the Xception-based model.
    - resizes images to IMAGE_SIZE x IMAGE_SIZE and rescales the boxes
    - applies random flips and `data_aug`
    - applies `tf.keras.applications.xception.preprocess_input`
    - shuffles the input
    - builds batches
    """

    def __init__(self):
        """Read DATA_PATH and store shuffled names, box records and object counts.

        Each non-blank line is an image name followed by groups of five
        numbers (xmin, ymin, xmax, ymax, class).  Every record is padded with
        zeros or truncated to MAX_OBJECTS_PER_IMAGE boxes.
        """
        self.image_names = []
        self.record_list = []
        self.object_num_list = []
        record_len = MAX_OBJECTS_PER_IMAGE * 5

        # filling the record_list; the file is closed as soon as it is read
        with open(DATA_PATH, 'r') as input_file:
            for line in input_file:
                # split() tolerates repeated spaces and trailing whitespace
                ss = line.split()
                if not ss:
                    continue  # blank line

                record = [float(num) for num in ss[1:]]

                self.image_names.append(ss[0])
                self.object_num_list.append(min(len(record) // 5, MAX_OBJECTS_PER_IMAGE))

                # resize newest data to exactly record_len values
                record = record[:record_len]
                record += [0.] * (record_len - len(record))
                self.record_list.append(record)

        ## shuffle the three lists with one common permutation
        idx = random.sample(range(len(self.image_names)), len(self.image_names))
        self.image_names = [self.image_names[i] for i in idx]
        self.record_list = [self.record_list[i] for i in idx]
        self.object_num_list = [self.object_num_list[i] for i in idx]

    def _data_preprocess(self, image_name, raw_labels, object_num):
        """Load one image and turn its record into augmented training inputs.

        Args:
            image_name: File name relative to IMAGE_DIR.
            raw_labels: Flat record of (xmin, ymin, xmax, ymax, class) groups.
            object_num: Number of real boxes in the record.

        Returns:
            ``(image, labels, object_num)`` where ``labels`` rows are
            (x_center, y_center, w, h, class) in pixels of the resized image.
        """
        image_file = tf.io.read_file(IMAGE_DIR+image_name)
        image = tf.io.decode_jpeg(image_file, channels=3)

        h = tf.shape(image)[0]
        w = tf.shape(image)[1]

        # scale factors from original pixel coordinates to the resized image
        width_rate = IMAGE_SIZE * 1.0 / tf.cast(w, tf.float32)
        height_rate = IMAGE_SIZE * 1.0 / tf.cast(h, tf.float32)

        image = tf.image.resize(image, size=[IMAGE_SIZE, IMAGE_SIZE])

        raw_labels = tf.cast(tf.reshape(raw_labels, [-1, 5]), tf.float32)

        xmin = raw_labels[:, 0]
        ymin = raw_labels[:, 1]
        xmax = raw_labels[:, 2]
        ymax = raw_labels[:, 3]
        class_num = raw_labels[:, 4]

        xcenter = (xmin + xmax) * 1.0 / 2.0 * width_rate
        ycenter = (ymin + ymax) * 1.0 / 2.0 * height_rate

        box_w = (xmax - xmin) * width_rate
        box_h = (ymax - ymin) * height_rate

        image, xcenter, ycenter = random_flip(image, xcenter, ycenter)

        labels = tf.stack([xcenter, ycenter, box_w, box_h, class_num], axis = 1)

        # augmentation runs on the raw resized pixels, before the
        # backbone-specific input transform
        image, labels = data_aug(image, labels)
        image = tf.keras.applications.xception.preprocess_input(image)

        return image, labels, tf.cast(object_num, tf.int32)

    def generate(self):
        """Return the batched tf.data.Dataset of (image, labels, object_num)."""
        dataset = tf.data.Dataset.from_tensor_slices((self.image_names,
                                                      np.array(self.record_list),
                                                      np.array(self.object_num_list)))
        dataset = dataset.map(self._data_preprocess, num_parallel_calls = tf.data.experimental.AUTOTUNE)
        # NOTE(review): shuffling after map keeps up to 1000 decoded,
        # augmented images in the shuffle buffer.
        dataset = dataset.shuffle(1000)
        dataset = dataset.batch(BATCH_SIZE)

        dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

        return dataset

In [None]:
def conv_leaky_relu(inputs, filters, size, stride):
    """Apply a Conv2D layer followed by LeakyReLU(0.1) and return the result.

    Args:
        inputs: Input tensor.
        filters: Number of convolution filters.
        size: Convolution kernel size.
        stride: Convolution stride.
    """
    convolution = layers.Conv2D(filters, kernel_size=size, strides=stride)
    activation = layers.LeakyReLU(0.1)
    return activation(convolution(inputs))

In [None]:
# ImageNet-pretrained Xception backbone without its classification top;
# all of its layers are frozen.
pre_trained_model = tf.keras.applications.xception.Xception(
    weights='imagenet',
    include_top=False,
    input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3),
)
pre_trained_model.trainable = False

In [None]:
# Detection head applied to the Xception feature map: two
# Conv -> MaxPool -> BatchNorm stages, then fully connected layers with
# dropout.
# NOTE(review): the 4096- and 512-unit Dense layers have no activation, so
# only Dropout sits between them -- confirm this is intended.
inputs_mdl = keras.models.Sequential([
    layers.Conv2D(1024, (3,3), padding='same', activation='relu'),
    layers.MaxPool2D(pool_size=3,strides=2,padding='same'),
    layers.BatchNormalization(),
    layers.Conv2D(1024, (3,3), padding='same', activation='relu'),
    layers.MaxPool2D(pool_size=3,strides=2,padding='same'),
    layers.BatchNormalization(),

    layers.Flatten(),
    layers.Dense(4096, kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.01)),
    layers.Dropout(0.5),
    layers.Dense(512, kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.01)),
    layers.Dropout(0.5),
    # One flat vector per image: for every grid cell, NUM_CLASSES class
    # scores plus 5 values per predicted box.  Sized from NUM_CLASSES
    # (previously a hard-coded 20) so it stays consistent with the loss,
    # which slices the prediction by NUM_CLASSES.
    layers.Dense(CELL_SIZE*CELL_SIZE*(BOXES_PER_CELL*5+NUM_CLASSES),
                 activation='relu', kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.01)),
])

YOLO = keras.Model(inputs=pre_trained_model.input, outputs=inputs_mdl(pre_trained_model.output), name="YOLO")

## Training

訓練模型的方法主要參考助教所提供的程式碼。
<br/> 我們其他的修改與設定如下：
1. 因為有對圖片進行翻轉，將圖片翻轉後會在計算loss的時候出現Error，因此有對部分計算loss的程式碼進行修改。
2. 為了避免因為learning rate設的太大而發生loss明顯上升的情況，設定若當下的loss超過最低的loss兩倍時則提前停止，但在訓練的過程中沒有發生此情況。
3. 根據預測結果，若使用imbalance的資料大約會在150個Epoch附近的時候結果較好，因此根據資料量推算，使用Balance data時，我們訓練的Epoch數為15個Epoch。
4. 我們一開始有嘗試使用fine-tune，即在前面幾個epoch時，凍結住Pretrained的layers，等經過幾個epoch後，再一起訓練Pretrained的layers。在未使用balance data時，這個方法能夠有不錯的效果，但在使用balance data後，則是全部凍結住Pretrained的layers的效果較好，因此我們後來便都是以凍結住Pretrained的layers的方式進行。

In [None]:
# base boxes (for loss calculation)
base_boxes = np.zeros([CELL_SIZE, CELL_SIZE, 4])

#for each cell
for y in range(CELL_SIZE):
    for x in range(CELL_SIZE):
        base_boxes[y, x, :] = [IMAGE_SIZE / CELL_SIZE * x, IMAGE_SIZE / CELL_SIZE * y, 0, 0]

base_boxes = np.tile(np.resize(base_boxes, [CELL_SIZE, CELL_SIZE, 1, 4]), [1, 1, BOXES_PER_CELL, 1])

In [None]:
def yolo_loss(predicts, labels, objects_num):
    """
    Sum the per-object losses over a batch and divide by BATCH_SIZE.

    Args:
        predicts: 4-D tensor, indexed as [batch, cell_y, cell_x, values]
        labels  : 3-D tensor of [batch_size, max_objects, 5]
                  ===> (x_center, y_center, w, h, class)
        objects_num: 1-D tensor [batch_size], number of real objects per image

    Return:
        the accumulated loss divided by the global BATCH_SIZE (not by the
        actual size of this batch)
    """
    num_samples = predicts.shape[0]
    total_loss = 0.

    for sample_idx in tf.range(num_samples):
        sample_predict = predicts[sample_idx, :, :, :]
        sample_labels = labels[sample_idx, :, :]

        # only the first objects_num[sample_idx] label rows are real objects
        for object_idx in tf.range(objects_num[sample_idx]):
            one_label = sample_labels[object_idx:object_idx+1, :]
            total_loss = total_loss + losses_calculation(sample_predict, one_label)

    return total_loss/BATCH_SIZE

def iou(boxes1, boxes2):
    """Intersection over union between a grid of boxes and one box.

    Args:
      boxes1: 4-D tensor [CELL_SIZE, CELL_SIZE, BOXES_PER_CELL, 4]  ====> (x_center, y_center, w, h)
      boxes2: 1-D tensor [4] ===> (x_center, y_center, w, h)

    Return:
      iou: 3-D tensor [CELL_SIZE, CELL_SIZE, BOXES_PER_CELL]
    """
    # corner form of boxes1, last axis = (xmin, ymin, xmax, ymax)
    half_w1 = boxes1[:, :, :, 2] / 2
    half_h1 = boxes1[:, :, :, 3] / 2
    corners1 = tf.stack([boxes1[:, :, :, 0] - half_w1,
                         boxes1[:, :, :, 1] - half_h1,
                         boxes1[:, :, :, 0] + half_w1,
                         boxes1[:, :, :, 1] + half_h1],
                        axis=-1)

    # corner form of boxes2: (xmin, ymin, xmax, ymax)
    half_w2 = boxes2[2] / 2
    half_h2 = boxes2[3] / 2
    corners2 = tf.stack([boxes2[0] - half_w2, boxes2[1] - half_h2,
                         boxes2[0] + half_w2, boxes2[1] + half_h2])

    # top-left and bottom-right corners of the overlap region
    top_left = tf.maximum(corners1[:, :, :, 0:2], corners2[0:2])
    bottom_right = tf.minimum(corners1[:, :, :, 2:], corners2[2:])

    overlap = bottom_right - top_left
    overlap_w = overlap[:, :, :, 0]
    overlap_h = overlap[:, :, :, 1]

    # the boxes overlap only if both extents are positive; otherwise the
    # intersection area is forced to zero
    overlaps = tf.cast(overlap_w > 0, tf.float32) * tf.cast(overlap_h > 0, tf.float32)
    inter_square = overlaps * (overlap_w * overlap_h)

    # areas of the individual boxes
    area1 = (corners1[:, :, :, 2] - corners1[:, :, :, 0]) * (corners1[:, :, :, 3] - corners1[:, :, :, 1])
    area2 = (corners2[2] - corners2[0]) * (corners2[3] - corners2[1])

    # the small constant avoids a division by zero
    return inter_square/(area1 + area2 - inter_square + 1e-6)

def losses_calculation(predict, label):
    """
    Compute the YOLO loss of one image's prediction against ONE ground-truth object.

    Args:
      predict: 3-D tensor [CELL_SIZE, CELL_SIZE, NUM_CLASSES + 5 * BOXES_PER_CELL];
               along the last axis: NUM_CLASSES class scores, then BOXES_PER_CELL
               confidences, then BOXES_PER_CELL * 4 box values.
      label : [1, 5]  (x_center, y_center, w, h, class)

    Return:
      class_loss + object_loss + noobject_loss + coord_loss
    """
    label = tf.reshape(label, [-1])

    # ---- objects mask [CELL_SIZE, CELL_SIZE]: 1 for every cell the box covers ----
    # Box extent expressed in grid-cell units.
    min_x = (label[0] - label[2] / 2) / (IMAGE_SIZE / CELL_SIZE)
    max_x = (label[0] + label[2] / 2) / (IMAGE_SIZE / CELL_SIZE)

    min_y = (label[1] - label[3] / 2) / (IMAGE_SIZE / CELL_SIZE)
    max_y = (label[1] + label[3] / 2) / (IMAGE_SIZE / CELL_SIZE)

    # Snap outward to whole cells and clamp to the grid so the padding
    # amounts computed below can never become negative.
    min_x = tf.maximum(tf.math.floor(min_x), 0) #
    min_y = tf.maximum(tf.math.floor(min_y), 0) #

    max_x = tf.minimum(tf.math.ceil(max_x), CELL_SIZE)
    max_y = tf.minimum(tf.math.ceil(max_y), CELL_SIZE)

    # A block of ones the size of the covered region ...
    temp = tf.cast(tf.stack([max_y - min_y, max_x - min_x]), dtype=tf.int32)
    objects = tf.ones(temp, tf.float32)

    # ... zero-padded (top, bottom, left, right) up to the full grid.
    temp = tf.cast(tf.stack([min_y, CELL_SIZE - max_y, min_x, CELL_SIZE - max_x]), tf.int32)
    temp = tf.reshape(temp, (2, 2))
    objects = tf.pad(objects, temp, "CONSTANT")

    # ---- response mask [CELL_SIZE, CELL_SIZE]: 1 only in the cell holding the box centre ----
    center_x = label[0] / (IMAGE_SIZE / CELL_SIZE)
    center_x = tf.floor(center_x)

    center_y = label[1] / (IMAGE_SIZE / CELL_SIZE)
    center_y = tf.floor(center_y)

    response = tf.ones([1, 1], tf.float32)

    # NOTE(review): a centre lying exactly on the right/bottom image border gives
    # center == CELL_SIZE and therefore a negative padding amount - confirm the
    # labels can never reach that value.
    temp = tf.cast(tf.stack([center_y, CELL_SIZE - center_y - 1, 
                             center_x, CELL_SIZE - center_x - 1]), 
                   tf.int32)
    temp = tf.reshape(temp, (2, 2))
    response = tf.pad(response, temp, "CONSTANT")

    # ---- predicted boxes and their IoU with the ground truth ----
    # Everything after the class scores and confidences is box data.
    predict_boxes = predict[:, :, NUM_CLASSES + BOXES_PER_CELL:]

    predict_boxes = tf.reshape(predict_boxes, [CELL_SIZE, 
                                               CELL_SIZE, 
                                               BOXES_PER_CELL, 4])

    # Scale the raw outputs: the first two values by the cell size, the last
    # two (w, h) by the image size.
    predict_boxes = predict_boxes * [IMAGE_SIZE / CELL_SIZE, 
                                     IMAGE_SIZE / CELL_SIZE, 
                                     IMAGE_SIZE, IMAGE_SIZE]

    # base_boxes is defined elsewhere in the file; presumably it holds each
    # cell's offset so that x/y become image coordinates - TODO confirm.
    predict_boxes = base_boxes + predict_boxes

    # iou_predict_truth: [CELL_SIZE, CELL_SIZE, BOXES_PER_CELL]
    iou_predict_truth = iou(predict_boxes, label[0:4])

    # C: confidence target = IoU, kept only in the responsible cell.
    C = iou_predict_truth * tf.reshape(response, [CELL_SIZE, CELL_SIZE, 1])

    # I: indicator [CELL_SIZE, CELL_SIZE, BOXES_PER_CELL] of the box with the
    # highest IoU inside the responsible cell.
    I = iou_predict_truth * tf.reshape(response, (CELL_SIZE, CELL_SIZE, 1))

    max_I = tf.reduce_max(I, 2, keepdims=True)

    # Outside the responsible cell the comparison is trivially true (0 >= 0),
    # so the result is masked by response once more.
    I = tf.cast((I >= max_I), tf.float32) * tf.reshape(response, (CELL_SIZE, CELL_SIZE, 1))

    # no_I: complement of I, i.e. every box that is not responsible.
    no_I = tf.ones_like(I, dtype=tf.float32) - I

    # Predicted confidences [CELL_SIZE, CELL_SIZE, BOXES_PER_CELL].
    p_C = predict[:, :, NUM_CLASSES:NUM_CLASSES + BOXES_PER_CELL]

    # ---- ground-truth x, y, sqrt(w), sqrt(h) (scalars) ----
    x = label[0]
    y = label[1]

    sqrt_w = tf.sqrt(tf.abs(label[2]))
    sqrt_h = tf.sqrt(tf.abs(label[3]))

    # ---- predicted x, y, sqrt(w), sqrt(h) [CELL_SIZE, CELL_SIZE, BOXES_PER_CELL] ----
    p_x = predict_boxes[:, :, :, 0]
    p_y = predict_boxes[:, :, :, 1]

    # Width/height are clamped to [0, IMAGE_SIZE] before the square root.
    p_sqrt_w = tf.sqrt(tf.minimum(IMAGE_SIZE * 1.0, tf.maximum(0.0, predict_boxes[:, :, :, 2])))
    p_sqrt_h = tf.sqrt(tf.minimum(IMAGE_SIZE * 1.0, tf.maximum(0.0, predict_boxes[:, :, :, 3])))

    # Ground-truth class as a one-hot vector [NUM_CLASSES].
    P = tf.one_hot(tf.cast(label[4], tf.int32), NUM_CLASSES, dtype=tf.float32)

    # Predicted class scores [CELL_SIZE, CELL_SIZE, NUM_CLASSES].
    p_P = predict[:, :, 0:NUM_CLASSES]

    # class_loss: applied to every cell covered by the box (objects mask),
    # not only to the responsible cell.
    class_loss = tf.nn.l2_loss(tf.reshape(objects, (CELL_SIZE, CELL_SIZE, 1)) * (p_P - P)) * CLASS_SCALE

    # object_loss: the responsible box should predict its IoU as confidence.
    object_loss = tf.nn.l2_loss(I * (p_C - C)) * OBJECT_SCALE

    # noobject_loss: every other box should predict a confidence of 0.
    noobject_loss = tf.nn.l2_loss(no_I * (p_C)) * NOOBJECT_SCALE

    # coord_loss: centre errors are measured in cell units; the sqrt(w)/sqrt(h)
    # terms are divided by IMAGE_SIZE.
    coord_loss = (tf.nn.l2_loss(I * (p_x - x)/(IMAGE_SIZE/CELL_SIZE)) +
                 tf.nn.l2_loss(I * (p_y - y)/(IMAGE_SIZE/CELL_SIZE)) +
                 tf.nn.l2_loss(I * (p_sqrt_w - sqrt_w))/IMAGE_SIZE +
                 tf.nn.l2_loss(I * (p_sqrt_h - sqrt_h))/IMAGE_SIZE) * COORD_SCALE

    return class_loss + object_loss + noobject_loss + coord_loss

In [None]:
# Training input pipeline (DatasetGenerator is defined elsewhere in the file).
dataset = DatasetGenerator().generate()

# Adam optimizer with the configured learning rate, and a running mean that
# accumulates the loss values passed to it.
optimizer = tf.keras.optimizers.Adam(LEARNING_RATE)
train_loss_metric = tf.keras.metrics.Mean(name='loss')

In [None]:
# Resume from the most recent checkpoint recorded in CKT_Dir, if there is one.
last_ckp = tf.train.latest_checkpoint(CKT_Dir)
if last_ckp:
    ckpt = tf.train.Checkpoint(net=YOLO)
    ckpt.restore(last_ckp)
    # The text after the last "-" of the checkpoint path is parsed as the
    # epoch that was saved; training continues with the following epoch.
    init_epoch = int(last_ckp.split("-")[-1])+1
    print(f'Resume training from epoch {init_epoch-1}') 
else:
    # No checkpoint found: start from the first epoch.
    init_epoch=1
    print("Strat from 1")

In [None]:
# Defaults used when no "BEST_" checkpoint from a previous run is found.
best_loss = 1e10
best_epoch = 0

# Best-model files carry "BEST_<epoch>ep_<loss * 100>" in their name; recover
# the epoch and the loss from the first such file. The directory is checked
# first so a fresh run without a checkpoint folder keeps the defaults instead
# of failing in os.listdir.
if os.path.isdir(CKT_Dir):
    for f in os.listdir(CKT_Dir):
        if "BEST_" in f:
            index_dot = f.index(".")
            index_ep = f.index("ep_")
            index_best = f.index("BEST_")

            # The loss was stored as an integer number of hundredths.
            best_loss = int(f[index_ep + 3 : index_dot])/100
            best_epoch = int(f[index_best + 5:index_ep])
            break

print("Previous best epoch is {} wiht loss {}".format(best_epoch, best_loss))

In [None]:
# Checkpoint object tracking an epoch counter together with the model; the
# counter starts at the epoch preceding the first one to be trained.
ckpt = tf.train.Checkpoint(epoch=tf.Variable(init_epoch-1), net=YOLO)

# Manager that writes checkpoints named after checkpoint_name into CKT_Dir,
# configured with max_to_keep=3.
manager = tf.train.CheckpointManager(ckpt, CKT_Dir, max_to_keep=3,
                                     checkpoint_name=checkpoint_name)

In [None]:
@tf.function
def train_step(image, labels, objects_num):
    """Run one optimisation step on a batch and record its loss.

    Args:
      image: batch of input images passed to YOLO.
      labels: ground-truth labels of the batch, forwarded to yolo_loss.
      objects_num: per-image object counts, forwarded to yolo_loss.
    """
    with tf.GradientTape() as tape:
        outputs = YOLO(image)

        # The flat network output is laid out as
        # [class scores | box confidences | box coordinates].
        n1 = CELL_SIZE * CELL_SIZE * NUM_CLASSES
        n2 = n1 + CELL_SIZE * CELL_SIZE * BOXES_PER_CELL

        # Reshape with the same constants that define the offsets, so the
        # shapes cannot drift from n1/n2 if the configuration changes.
        class_probs = tf.reshape(outputs[:, 0:n1],
                                 (-1, CELL_SIZE, CELL_SIZE, NUM_CLASSES))
        scales = tf.reshape(outputs[:, n1:n2],
                            (-1, CELL_SIZE, CELL_SIZE, BOXES_PER_CELL))
        boxes = tf.reshape(outputs[:, n2:],
                           (-1, CELL_SIZE, CELL_SIZE, BOXES_PER_CELL * 4))
        predicts = tf.concat([class_probs, scales, boxes], 3)

        loss = yolo_loss(predicts, labels, objects_num)

    # Bookkeeping only; it does not need to be recorded on the tape.
    train_loss_metric(loss)

    grads = tape.gradient(loss, YOLO.trainable_weights)
    optimizer.apply_gradients(zip(grads, YOLO.trainable_weights))

In [None]:
# Per-epoch history of the training loss and the matching epoch numbers.
loss_list = []
epoch_list = []

def remove_save_best_model(best_epoch, best_loss):
    """Replace the stored "BEST_" checkpoint with a copy of epoch `best_epoch`.

    Every file in CKT_Dir whose name contains "BEST_" is deleted, then each
    file of the checkpoint "<checkpoint_name>-<best_epoch>" is copied to
    "<checkpoint_name>-BEST_<best_epoch>ep_<int(best_loss * 100)>" with the
    same suffix.

    Args:
      best_epoch: epoch number of the checkpoint to keep as the best one.
      best_loss: loss of that epoch; stored in the file name as hundredths.
    """
    # remove old model
    for f1 in os.listdir(CKT_Dir):
        if "BEST_" in f1:
            os.remove(os.path.join(CKT_Dir, f1))

    # save new model
    source_prefix = "{}-{}".format(checkpoint_name, best_epoch)
    best_prefix = "{}-BEST_{}ep_{}".format(checkpoint_name, best_epoch,
                                           int(best_loss*100))
    for f in os.listdir(CKT_Dir):
        # Require the "." that follows the epoch number: a bare prefix test
        # would make epoch 1 also match the files of epochs 10, 11, 100, ...
        if f.startswith(source_prefix + "."):
            new_file_name = best_prefix + f[len(source_prefix):]
            copyfile(os.path.join(CKT_Dir, f),
                     os.path.join(CKT_Dir, new_file_name))

In [None]:
print("{}, start training.".format(datetime.now()))

for i in range(init_epoch,EPOCHS+1):
    print("\nEpoch: %d/%d" % (i,EPOCHS))
    # Start the epoch with an empty loss average and advance the epoch
    # counter that is stored inside the checkpoint.
    train_loss_metric.reset_states()
    ckpt.epoch.assign_add(1)

    for image, labels, objects_num in dataset:
        train_step(image, labels, objects_num)

    tr_loss = train_loss_metric.result()
    print("{}, Epoch {}: loss {:.5f}".format(datetime.now(), i, tr_loss))

    epoch_list.append(i)
    loss_list.append(tr_loss)

    # Stop without saving when the loss is more than twice the best loss
    # seen so far: the training is considered to have diverged.
    if tr_loss > 2*best_loss:
        print("Explosion at step %d" % i)
        break

    # A plain integer is enough for the checkpoint number; no new variable
    # has to be created every epoch.
    save_path = manager.save(checkpoint_number=int(ckpt.epoch))
    print("Saved checkpoint for epoch {}: {}".format(int(ckpt.epoch), save_path))

    # Save best model
    if tr_loss < best_loss:
        best_loss = tr_loss
        best_epoch = i
        remove_save_best_model(best_epoch, best_loss)
        print("New Best Epoch is {} with training loss {}".format(best_epoch, best_loss))

Fin_time = datetime.now()

In [None]:
# Summary of the run: finishing time, best epoch, then the loss of every
# epoch that was recorded in epoch_list / loss_list.
print("Finish at {}".format(Fin_time))
print("The best epoch is {} wiht training loss {}".format(best_epoch, best_loss))

print("\n-------------------------------------------------------")
for idx, loss_of_step in zip(epoch_list,loss_list):
    print("The train loss of the %d-th epoch is %.3f" % (idx, loss_of_step))

## Non-max Suppression

在助教提供的程式碼中，只會預測機率最大的Bounding Box與其類別，但在一張圖中可能會有許多不同的Object，因此必須要設定某個threshold，將confidence大於此threshold的bounding box預測出來。然而，可能會有多個bounding box框住同一個物體，所以必須要使用Non-max suppression的方式，從多個高度重疊的bounding box中選出一個。對於所有bounding box的confidence都未超過threshold的圖片，則只取confidence最高的一個bounding box。
<br/> 我們便是實作了這個過程來預測一張圖片中的多個Object。

必須要注意的是，如果bounding box的面積為0，TensorFlow的Non-max suppression會一直回傳這個面積為0的bounding box。因此，若有x_min等於x_max或y_min等於y_max的情況，我們會將x_max或y_max加上一個很小的值（預測的程式中為0.1，Ensemble的程式中為0.5）來避免此種情況。

In [None]:
def multiple_process_outputs(outputs):
    """
    Process YOLO outputs into bou
    """

    n1 = CELL_SIZE * CELL_SIZE * NUM_CLASSES
    n2 = n1 + CELL_SIZE * CELL_SIZE * BOXES_PER_CELL
    class_probs = np.reshape(outputs[:, 0:n1], (-1, CELL_SIZE, CELL_SIZE, 20))
    scales = np.reshape(outputs[:, n1:n2], (-1, CELL_SIZE, CELL_SIZE, 2))
    boxes = np.reshape(outputs[:, n2:], (-1, CELL_SIZE, CELL_SIZE, 2*4))
    predicts = np.concatenate([class_probs, scales, boxes], 3)

    p_classes = predicts[0, :, :, 0:20]
    C = predicts[0, :, :, 20:22]
    coordinate = predicts[0, :, :, 22:]

    p_classes = np.reshape(p_classes, (CELL_SIZE, CELL_SIZE, 1, 20))
    C = np.reshape(C, (CELL_SIZE, CELL_SIZE, BOXES_PER_CELL, 1))

    P = C * p_classes
    #P's shape [7, 7, 2, 20]
    #print P[5,1, 0, :]

    coordinate = np.reshape(coordinate, 
                            (CELL_SIZE, 
                              CELL_SIZE,
                              BOXES_PER_CELL, 
                              4))

    output = []
    
    counter = np.sum(P>PROB_THRES)
    if counter == 0:
        counter = 1

    while counter>0:
        max_conf = np.max(P)
        index = np.argmax(P)
        index = np.unravel_index(index, P.shape)
        assert P[index] == max_conf, "兩個不合?"

        P[index[0],index[1],index[2],:] = 0.
        class_num = index[3]

        max_coordinate = coordinate[index[0], index[1], index[2], :]

        xcenter = max_coordinate[0]
        ycenter = max_coordinate[1]
        w = max_coordinate[2]
        h = max_coordinate[3]

        xcenter = (index[1] + xcenter) * (IMAGE_SIZE/float(CELL_SIZE))
        ycenter = (index[0] + ycenter) * (IMAGE_SIZE/float(CELL_SIZE))

        w = w * IMAGE_SIZE
        h = h * IMAGE_SIZE

        xmin = xcenter - w/2.0
        ymin = ycenter - h/2.0

        xmax = xmin + w
        ymax = ymin + h

        counter = np.sum(P>PROB_THRES)
        output.append([xmin, ymin, xmax, ymax, class_num, max_conf])

    return output

In [None]:
test_img_dir = PATH_TO_DATA+'/VOCdevkit_test/VOCdevkit_test/VOC2007/JPEGImages/'
test_images = []

# The first space-separated field of each line is the image file name.
# The context manager closes the list file once it has been read.
with open(PATH_TO_DATA+'/pascal_voc_testing_data.txt') as test_img_files:
    for line in test_img_files:
        line = line.strip()
        if not line:
            # A blank line would otherwise add an empty image name.
            continue
        ss = line.split(' ')
        test_images.append(ss[0])

test_dataset = tf.data.Dataset.from_tensor_slices(test_images)

In [None]:
# ResNet 152
def load_img_data(image_name):
    """Load one test image and prepare it for the ResNet-based model.

    Returns the image name, the image resized to IMAGE_SIZE x IMAGE_SIZE and
    passed through the ResNet preprocessing, and the height and width the
    image had before resizing.
    """
    raw_bytes = tf.io.read_file(test_img_dir+image_name)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)

    # Remember the original size before the image is resized.
    original_shape = tf.shape(decoded)
    h, w = original_shape[0], original_shape[1]

    resized = tf.image.resize(decoded, size=[IMAGE_SIZE, IMAGE_SIZE])
    image = tf.keras.applications.resnet.preprocess_input(resized)

    return image_name, image, h, w

In [None]:
# Xception
def load_img_data(image_name):
    """Load one test image and prepare it for the Xception-based model.

    Returns the image name, the image resized to IMAGE_SIZE x IMAGE_SIZE and
    passed through the Xception preprocessing, and the height and width the
    image had before resizing.
    """
    raw_bytes = tf.io.read_file(test_img_dir+image_name)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)

    # Remember the original size before the image is resized.
    original_shape = tf.shape(decoded)
    h, w = original_shape[0], original_shape[1]

    resized = tf.image.resize(decoded, size=[IMAGE_SIZE, IMAGE_SIZE])
    image = tf.keras.applications.xception.preprocess_input(resized)

    return image_name, image, h, w

In [None]:
# Turn every file name into (name, preprocessed image, height, width) with
# whichever load_img_data is currently defined, then group into batches of 32.
test_dataset = test_dataset.map(load_img_data, num_parallel_calls = tf.data.experimental.AUTOTUNE)
test_dataset = test_dataset.batch(32)

In [None]:
if is_load_best:
    # Use the checkpoint marked "BEST_"; its prefix is the file name up to
    # the first ".".
    best_epoch_file = None
    for f in os.listdir(CKT_Dir):
        if "BEST_" in f:
            best_epoch_file = f[:f.index(".")]
            break

    # Fail with a clear message instead of a NameError (or a stale value
    # left over from an earlier run of this cell) when nothing was found.
    if best_epoch_file is None:
        raise FileNotFoundError(
            "No BEST_ checkpoint found in {}".format(CKT_Dir))
else:
    # Otherwise load the checkpoint of the final epoch.
    best_epoch_file = checkpoint_name + "-" + str(EPOCHS)

print("Load checkpoint: ",best_epoch_file)

In [None]:
# Restore the model weights from the checkpoint prefix selected above.
ckpt = tf.train.Checkpoint(net=YOLO)
ckpt.restore(CKT_Dir + "/" + best_epoch_file)

In [None]:
@tf.function
def prediction_step(img):
    """Run the YOLO model on `img` and return its raw output."""
    return YOLO(img)

@tf.function
def tf_non_max(boxes,scores,max_value):
    """Graph-compiled wrapper around tf.image.non_max_suppression.

    Passes `boxes`, `scores` and `max_value` (the maximum output size)
    through unchanged, with IOU_THRES as the IoU threshold, and returns
    whatever tf.image.non_max_suppression returns.
    """
    return tf.image.non_max_suppression(boxes,
                                        scores,
                                        max_value,
                                        iou_threshold=IOU_THRES)

In [None]:
def swapBox(box):
    """Return a copy of `box` with its first four columns swapped pairwise.

    Columns (0, 1, 2, 3) become (1, 0, 3, 2); any further columns are kept.
    The input array is left untouched, so callers that keep using their own
    array are not affected by the swap.

    Args:
      box: 2-D numpy array with at least four columns.

    Returns:
      A new array of the same shape with the columns swapped.
    """
    swapped = box.copy()
    swapped[:, [0, 1, 2, 3]] = box[:, [1, 0, 3, 2]]
    return swapped

def execute_non_max(ori_box, ori_score, ori_class):
    """Apply non-max suppression to the boxes of a single class.

    Args:
      ori_box: boxes of one class, one row of four coordinates per box.
      ori_score: one score per box.
      ori_class: class number attached to every box that is kept.

    Returns:
      (selected_boxes, selected_scores, selected_class): the kept boxes as a
      numpy array, their scores as a list, and a list repeating `ori_class`
      once per kept box. At most 20 boxes are kept.
    """
    # The inputs are only read, so plain tensors are enough; there is no
    # need to allocate new variables on every call.
    boxes = tf.convert_to_tensor(ori_box, dtype=tf.float32)
    scores = tf.convert_to_tensor(ori_score, dtype=tf.float32)

    selected_indices = tf.image.non_max_suppression(boxes, scores,
                                                    20, iou_threshold=IOU_THRES)

    selected_boxes = tf.gather(boxes, selected_indices).numpy()
    selected_scores = tf.gather(scores, selected_indices).numpy().tolist()
    selected_class = [ori_class] * selected_boxes.shape[0]

    return selected_boxes, selected_scores, selected_class

def cal_non_max(original_box, class_number, conf):
    """Class-wise non-max suppression over all boxes of one image.

    The boxes are column-swapped with swapBox, suppressed separately for each
    distinct class with execute_non_max, swapped back, and finally ordered by
    decreasing score.

    Args:
      original_box: 2-D numpy array, one row of four coordinates per box.
      class_number: class of each box.
      conf: score of each box.

    Returns:
      (boxes, scores, classes) as three lists of equal length.
    """
    swapped_boxes = swapBox(original_box)

    # Zero-row seed: keeps the concatenation valid (and float64) even when
    # there is nothing to concatenate.
    kept_chunks = [np.zeros((0, 4))]
    kept_scores = []
    kept_classes = []

    for cls in set(class_number):
        members = [pos for pos, c in enumerate(class_number) if c == cls]

        chunk_boxes, chunk_scores, chunk_classes = execute_non_max(
            swapped_boxes[members, :], [conf[pos] for pos in members], cls)

        kept_chunks.append(chunk_boxes)
        kept_scores.extend(chunk_scores)
        kept_classes.extend(chunk_classes)

    # Back to the column order of the input.
    merged_boxes = swapBox(np.concatenate(kept_chunks, axis=0))

    # Indices of the kept boxes from highest to lowest score.
    ranking = np.array(kept_scores).argsort().tolist()[::-1]

    ranked_boxes = merged_boxes[ranking, :].tolist()
    ranked_scores = [kept_scores[r] for r in ranking]
    ranked_classes = [kept_classes[r] for r in ranking]

    return ranked_boxes, ranked_scores, ranked_classes

## 水平翻轉預測

我們認為模型可能看得懂某一側的臉，卻看不懂另一側，這也是訓練模型時要做Data augmentation的原因；既然如此，我們也可以將同樣的想法應用在預測的階段。
<br/> 因此，我們在進行預測的時候，我們先預測testing data的圖。接著，我們對圖進行水平翻轉，對水平翻轉後的圖再進行一次預測，並將新預測出的bounding box再翻轉回原始的圖上，並使用Non-max suppression將有進行水平翻轉預測出的bounding box與沒有進行翻轉預測出的bounding box合併。
<br/> 如此一來，即便模型真的發生看不懂一邊的臉的情況，此方法也能夠讓模型預測到另一邊的臉，藉此來框出此Object。
<br/>根據預測出的結果，我們也發現有許多原本預測都是0的圖，藉由這個方式而框出了Object。

必須要注意的是，因為這個方法整併了兩種預測結果，因此若有其中一個預測到錯誤的物件會導致最後的預測結果中也會有錯誤的物件，所以我們認為若將confidence的threshold適當的提高可以有更好的結果，因此我們將原本使用的threshold 0.001調整到0.01。

In [None]:
@tf.function
def flip_mirror(image):
    """Return `image` flipped left-to-right (tf.image.flip_left_right)."""
    return tf.image.flip_left_right(image)

def clean_box(box,conf,cla):
    """Drop degenerate or zero-confidence boxes from parallel result lists.

    A box is kept only when, after truncating its coordinates to integers,
    coordinate 0 differs from coordinate 2, coordinate 1 differs from
    coordinate 3, and its confidence is not 0. When no box survives, a single
    all-zero placeholder (box, confidence 0, class 0) is returned. Empty
    input is returned unchanged.

    Args:
      box: list of boxes, four coordinates each.
      conf: confidence of each box.
      cla: class of each box.

    Returns:
      (boxes, confidences, classes) after filtering.
    """
    if len(box)==0:
        return box, conf,cla

    def _is_valid(coords, score):
        return (int(coords[0]) != int(coords[2])
                and int(coords[1]) != int(coords[3])
                and score != 0)

    keep = [_is_valid(coords, conf[pos]) for pos, coords in enumerate(box)]

    if not any(keep):
        # Nothing usable is left: fall back to one placeholder entry.
        return [[0., 0., 0., 0.]], [0], [0]

    output_box = [item for item, ok in zip(box, keep) if ok]
    output_conf = [item for item, ok in zip(conf, keep) if ok]
    output_cla = [item for item, ok in zip(cla, keep) if ok]

    return output_box, output_conf, output_cla

In [None]:
# Predictions of the currently loaded model are written to this file. Change
# the name to match the model in use, e.g.:
#   './test_prediction_Resnet.txt'
#   './test_prediction_Xception_paper.txt'
#   './test_prediction_Xception.txt'
# The context manager closes the file even if prediction fails midway.
with open('./test_prediction_Resnet_paper.txt', 'w') as output_file:
    for img_name, test_img, img_h, img_w in test_dataset:
        batch_num = img_name.shape[0]
        for i in range(batch_num):
            # Predict on the image and on its horizontally flipped version.
            single_image = test_img[i:i+1]
            temp_original = multiple_process_outputs(prediction_step(single_image))
            temp_mirror = multiple_process_outputs(prediction_step(flip_mirror(single_image)))

            # Map the boxes found on the flipped image back onto the
            # original image: x becomes IMAGE_SIZE - x, which also swaps
            # the roles of xmin and xmax.
            for mirrored in temp_mirror:
                mirrored[0], mirrored[2] = IMAGE_SIZE - mirrored[2], IMAGE_SIZE - mirrored[0]

            temp = temp_original + temp_mirror

            # Size of the image before it was resized for the network.
            now_w = img_w[i:i+1].numpy().tolist()[0]
            now_h = img_h[i:i+1].numpy().tolist()[0]
            scale_x = now_w / IMAGE_SIZE
            scale_y = now_h / IMAGE_SIZE

            box_np = np.zeros((len(temp), 4))
            class_num_list = []
            conf_list = []

            for j, pred in enumerate(temp):
                xmin, ymin, xmax, ymax, class_num, conf = pred

                xmin = max(xmin, 0)
                ymin = max(ymin, 0)
                xmax = min(xmax, IMAGE_SIZE)
                ymax = min(ymax, IMAGE_SIZE)

                # Widen zero-width / zero-height boxes slightly so that no
                # box passed to non-max suppression has zero area.
                if xmin == xmax:
                    xmax += 0.1
                if ymin == ymax:
                    ymax += 0.1

                # Rescale from network input size to original image size.
                box_np[j, :] = [xmin * scale_x, ymin * scale_y,
                                xmax * scale_x, ymax * scale_y]
                class_num_list.append(class_num)
                conf_list.append(conf)

            # A single box needs no suppression.
            if len(conf_list)==1:
                output_box, output_conf, output_class = box_np.tolist(), conf_list, class_num_list
            else:
                output_box, output_conf, output_class = cal_non_max(box_np, class_num_list, conf_list)

            assert len(output_box)==len(output_conf)==len(output_class), "長度不同"

            output_box, output_conf, output_class = clean_box(output_box, output_conf, output_class)

            # One line per image: the name followed by six values per box.
            output_file.write(img_name[i:i+1].numpy()[0].decode('ascii'))
            for k in range(len(output_class)):
                now_box = output_box[k]
                output_file.write(" %d %d %d %d %d %f" % (now_box[0], now_box[1], now_box[2],
                                                          now_box[3], output_class[k], output_conf[k]))

            output_file.write("\n")

print("Finish Prediction")

## Ensemble

最後，從水平翻轉預測中使用Non-max suppression合併有無水平翻轉的bounding box的技巧出發，我們想到這個方法照理來說也可以用來合併不同模型所預測出的bounding box。
<br/> 因此，我們便將不同的模型所預測出的Bounding box使用Non-max suppression進行合併，並將合併後的結果做為最終的預測結果。我們合併的模型如下：
1. Paper的Augmentation + ResNet 152
2. 我們的Augmentation + ResNet 152
3. Paper的Augmentation + Xception
4. 我們的Augmentation + Xception

在合併的過程中，我們先將同為使用ResNet 152的第一個模型與第二個模型合併、將同為使用Xception的第三個模型與第四個模型合併，最後再將這兩個合併的結果合併以形成最終預測結果。
<br/> 最終預測結果的Public Score為0.37759、Private Score為0.38902，都是我們所有嘗試的方法當中最好的。

In [None]:
def swapBox(box):
    """Return a copy of `box` with its first four columns swapped pairwise.

    Columns (0, 1, 2, 3) become (1, 0, 3, 2); any further columns are kept.
    The input array is left untouched, so callers that keep using their own
    array are not affected by the swap.

    Args:
      box: 2-D numpy array with at least four columns.

    Returns:
      A new array of the same shape with the columns swapped.
    """
    swapped = box.copy()
    swapped[:, [0, 1, 2, 3]] = box[:, [1, 0, 3, 2]]
    return swapped

def execute_non_max(ori_box, ori_score, ori_class):
    """Apply non-max suppression to the boxes of a single class.

    Args:
      ori_box: boxes of one class, one row of four coordinates per box.
      ori_score: one score per box.
      ori_class: class number attached to every box that is kept.

    Returns:
      (selected_boxes, selected_scores, selected_class): the kept boxes as a
      numpy array, their scores as a list, and a list repeating `ori_class`
      once per kept box. At most 20 boxes are kept.
    """
    # The inputs are only read, so plain tensors are enough; there is no
    # need to allocate new variables on every call.
    boxes = tf.convert_to_tensor(ori_box, dtype=tf.float32)
    scores = tf.convert_to_tensor(ori_score, dtype=tf.float32)

    selected_indices = tf.image.non_max_suppression(boxes, scores,
                                                    20, iou_threshold=IOU_THRES)

    selected_boxes = tf.gather(boxes, selected_indices).numpy()
    selected_scores = tf.gather(scores, selected_indices).numpy().tolist()
    selected_class = [ori_class] * selected_boxes.shape[0]

    return selected_boxes, selected_scores, selected_class

def cal_non_max(original_box, class_number, conf):
    """Class-wise non-max suppression over all boxes of one image.

    The boxes are column-swapped with swapBox, suppressed separately for each
    distinct class with execute_non_max, swapped back, and finally ordered by
    decreasing score.

    Args:
      original_box: 2-D numpy array, one row of four coordinates per box.
      class_number: class of each box.
      conf: score of each box.

    Returns:
      (boxes, scores, classes) as three lists of equal length.
    """
    swapped_boxes = swapBox(original_box)

    # Zero-row seed: keeps the concatenation valid (and float64) even when
    # there is nothing to concatenate.
    kept_chunks = [np.zeros((0, 4))]
    kept_scores = []
    kept_classes = []

    for cls in set(class_number):
        members = [pos for pos, c in enumerate(class_number) if c == cls]

        chunk_boxes, chunk_scores, chunk_classes = execute_non_max(
            swapped_boxes[members, :], [conf[pos] for pos in members], cls)

        kept_chunks.append(chunk_boxes)
        kept_scores.extend(chunk_scores)
        kept_classes.extend(chunk_classes)

    # Back to the column order of the input.
    merged_boxes = swapBox(np.concatenate(kept_chunks, axis=0))

    # Indices of the kept boxes from highest to lowest score.
    ranking = np.array(kept_scores).argsort().tolist()[::-1]

    ranked_boxes = merged_boxes[ranking, :].tolist()
    ranked_scores = [kept_scores[r] for r in ranking]
    ranked_classes = [kept_classes[r] for r in ranking]

    return ranked_boxes, ranked_scores, ranked_classes

In [None]:
class ensemble_obj_detect(object):
    """Merge the detections of several prediction files with class-wise NMS.

    Every prediction file holds one line per image: the image name followed
    by zero or more groups of six numbers
    ``xmin ymin xmax ymax class confidence``.
    """

    def __init__(self, file_list, path_to_file):
        """Read every prediction file.

        Args:
          file_list: names of the prediction files to merge.
          path_to_file: prefix prepended verbatim to each file name, so a
            directory must end with its path separator.
        """
        self.path_to_file = path_to_file
        # file name -> {image name -> list of six-value detections}
        self.mapping_dict = {}

        # Image names in the order of the first file that lists any image;
        # write_ensemble follows this order.
        self.image_order = []

        for f in file_list:
            tmp_dict, tmp_image_order = self.read_file(f)
            self.mapping_dict[f] = tmp_dict

            if len(self.image_order) == 0:
                self.image_order = tmp_image_order

    def read_file(self, file_name):
        """Parse one prediction file.

        Blank lines are skipped. Values that do not fill a complete group of
        six at the end of a line are ignored.

        Returns:
          (output_dict, output_image_order): a dict mapping each image name
          to its list of six-float detections, and the image names in file
          order.
        """
        output_dict = {}
        output_image_order = []

        with open(self.path_to_file + file_name) as f:
            for line in f:
                ss = line.strip().split()
                if not ss:
                    # A blank line (e.g. at the end of the file) carries no
                    # image name.
                    continue

                image_name = ss[0]
                line_data = ss[1:]

                output_dict[image_name] = [
                    [float(j) for j in line_data[6 * i:6 * (i + 1)]]
                    for i in range(len(line_data) // 6)
                ]
                output_image_order.append(image_name)

        return output_dict, output_image_order

    def ensemble(self, weights=None):
        """Pool the detections of all files per image, then merge and clean.

        Args:
          weights: currently unused; accepted so existing calls keep working.
        """
        self.reshape_dict = {}

        for tmp_dict in self.mapping_dict.values():
            for image_name, detections in tmp_dict.items():
                # A fresh list per image: the per-file lists stay unchanged.
                self.reshape_dict.setdefault(image_name, []).extend(detections)

        self.perform_ensemble()
        self.clean()

    def perform_ensemble(self):
        """Run class-wise NMS on the pooled detections of every image.

        Fills ``self.ensemble_dict`` with, per image, the surviving
        detections sorted by decreasing confidence. An image without any
        detection gets a single all-zero placeholder.
        """
        self.ensemble_dict = {}

        for image_name, detections in self.reshape_dict.items():
            if not detections:
                # Nothing to merge; keep the "at least one box" invariant
                # that clean() asserts.
                self.ensemble_dict[image_name] = [[0., 0., 0., 0., 0., 0.]]
                continue

            tmp_array = np.array(detections)
            boxes = tmp_array[:, :4]
            class_list = tmp_array[:, 4].tolist()
            scores_list = tmp_array[:, 5].tolist()

            # Widen zero-width / zero-height boxes so that no box passed to
            # non-max suppression has zero area.
            x_same_bool = boxes[:, 0] == boxes[:, 2]
            boxes[x_same_bool, 2] += 0.5

            y_same_bool = boxes[:, 1] == boxes[:, 3]
            boxes[y_same_bool, 3] += 0.5

            output_box, output_score, output_class = cal_non_max(
                boxes, class_list, scores_list)

            np_box = np.array(output_box)

            np_score = np.array(output_score)
            np_score = np_score.reshape((np_box.shape[0], 1))

            np_class = np.array(output_class)
            np_class = np_class.reshape((np_box.shape[0], 1))

            # Columns: four coordinates, class, confidence.
            output = np.concatenate((np_box, np_class, np_score), axis=1)

            # sort array from max to min
            output = output[output[:, 5].argsort()[::-1], :]
            output = self.remove_low_confidence(output)

            self.ensemble_dict[image_name] = output.tolist()

    def clean(self):
        """Remove boxes whose integer coordinates are degenerate.

        Only images with more than one box are filtered, so an image always
        keeps at least one entry; if every box of an image is degenerate, an
        all-zero placeholder replaces them.
        """
        for image_name, detections in self.ensemble_dict.items():
            if len(detections) > 1:
                new_list = [
                    box for box in detections
                    if int(box[0]) != int(box[2]) and int(box[1]) != int(box[3])
                ]

                if len(new_list) == 0:
                    new_list.append([0., 0., 0., 0., 0., 0.])

                self.ensemble_dict[image_name] = new_list

            assert len(self.ensemble_dict[image_name]) >= 1, "沒有箱子?"

    @staticmethod
    def remove_low_confidence(whole_array):
        """Keep the rows whose confidence (column 5) exceeds PROB_THRES.

        A single-row array is returned unchanged. When no row passes the
        threshold, only the first row is returned; the caller sorts by
        decreasing confidence first, so that is the most confident one.
        """
        if whole_array.shape[0] == 1:
            return whole_array

        high_conf = whole_array[:, 5] > PROB_THRES

        if np.sum(high_conf) == 0:
            # all low => return highest
            return whole_array[[0], :]
        else:
            return whole_array[high_conf, :]

    def write_ensemble(self, file_name):
        """Write the merged detections to `file_name`.

        The output uses the same layout as the input files: one line per
        image, in the order of ``self.image_order``.
        """
        with open(file_name, "w") as f:
            for image_name in self.image_order:
                f.write(image_name)

                for to_write in self.ensemble_dict[image_name]:
                    f.write(" %d %d %d %d %d %.6f" %
                            (to_write[0], to_write[1], to_write[2],
                             to_write[3], to_write[4], to_write[5]))

                f.write("\n")

In [None]:
# IoU threshold passed to non-max suppression.
IOU_THRES=0.3
# Confidence threshold: scores must exceed it for a box to be kept.
PROB_THRES=0.01

In [None]:
# Step 1: merge the two ResNet prediction files into ensemble_Resnet.txt.
enser1 = ensemble_obj_detect(file_list=["test_prediction_Resnet_paper.txt",
                                        "test_prediction_Resnet.txt"],
                             path_to_file="../data/output/txt/")
enser1.ensemble()
enser1.write_ensemble("../data/output/txt/ensemble_Resnet.txt")

In [None]:
# Step 2: merge the two Xception prediction files into ensemble_Xception.txt.
enser2 = ensemble_obj_detect(file_list=["test_prediction_Xception_paper.txt",
                                        "test_prediction_Xception.txt"],
                             path_to_file="../data/output/txt/")
enser2.ensemble()
enser2.write_ensemble("../data/output/txt/ensemble_Xception.txt")

In [None]:
# Step 3: merge the two intermediate ensemble files into the final prediction.
enser3 = ensemble_obj_detect(file_list=["ensemble_Resnet.txt",
                                        "ensemble_Xception.txt"],
                             path_to_file="../data/output/txt/")
enser3.ensemble()
enser3.write_ensemble("../data/output/txt/ensemble_final.txt")

## 結論

1. 這次的模型訓練動輒12小時起跳，因此必須審慎思考要採用的方法，並且小心地coding，不然可能會發生train了很久、最後結果卻有問題的情況。
2. 不論是augmentation的方法或是模型的設計，paper所提供的方法都能夠有不錯的效果，因此，可以先參考paper的設計再從這點開始變化，會比從0開始嘗試來得快速、有效。
3. 使用水平翻轉預測與ensemble可以得到不錯的成效，後來我有在Kaggle的有獎競賽中看到有人也會使用類似的方法，但每個人的設定與使用方式略有不同，或許可以多參考其他人的使用方式。