In [1]:
import tensorflow as tf### models
import numpy as np### math computations
import seaborn as sns### visualizations
import matplotlib.pyplot as plt### plotting bar chart
import datetime
import pathlib
import io
from datetime import datetime
import json
import xml.etree.ElementTree as ET
import os
import shutil
import cv2
import time
import random
from PIL import Image
import albumentations as A
import tensorflow_datasets as tfds
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import (GlobalAveragePooling2D, Activation, MaxPooling2D, Add, Conv2D, MaxPool2D, Dense,
                                     Flatten, InputLayer, BatchNormalization, Input, Embedding, Permute,
                                     Dropout, RandomFlip, RandomRotation, LayerNormalization, MultiHeadAttention,
                                     RandomContrast, Rescaling, Resizing, Reshape, LeakyReLU)
from tensorflow.keras.losses import BinaryCrossentropy,CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.metrics import Accuracy,TopKCategoricalAccuracy, CategoricalAccuracy, SparseCategoricalAccuracy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import (Callback, CSVLogger, EarlyStopping, LearningRateScheduler,
                                        ModelCheckpoint, ReduceLROnPlateau)
from tensorflow.keras.regularizers import L2, L1
from tensorflow.keras.initializers import RandomNormal

In [2]:
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)

In [3]:
def compute_iou(boxes1, boxes2):
    boxes1_t = tf.stack([boxes1[..., 0] - boxes1[..., 2] / 2.0,
                         boxes1[..., 1] - boxes1[..., 3] / 2.0,
                         boxes1[..., 0] + boxes1[..., 2] / 2.0,
                         boxes1[..., 1] + boxes1[..., 3] / 2.0],
                        axis=-1)

    boxes2_t = tf.stack([boxes2[..., 0] - boxes2[..., 2] / 2.0,
                         boxes2[..., 1] - boxes2[..., 3] / 2.0,
                         boxes2[..., 0] + boxes2[..., 2] / 2.0,
                         boxes2[..., 1] + boxes2[..., 3] / 2.0],
                        axis=-1)
    lu = tf.maximum(boxes1_t[..., :2], boxes2_t[..., :2])
    rd = tf.minimum(boxes1_t[..., 2:], boxes2_t[..., 2:])

    intersection = tf.maximum(0.0, rd - lu)
    inter_square = intersection[..., 0] * intersection[..., 1]

    square1 = boxes1[..., 2] * boxes1[..., 3]
    square2 = boxes2[..., 2] * boxes2[..., 3]

    union_square = tf.maximum(square1 + square2 - inter_square, 1e-10)
    return tf.clip_by_value(inter_square / union_square, 0.0, 1.0)

In [4]:
def difference(x,y):
  return tf.reduce_sum(tf.square(y-x))

In [5]:
def yolo_loss(y_true, y_pred):
  target = y_true[...,0]

  ###################### OBject Loss
  y_pred_extract = tf.gather_nd(y_pred, tf.where(target[:]==1))
  y_target_extract = tf.gather_nd(y_true, tf.where(target[:]==1))
  
  rescaler = tf.where(target[:]==1)*SPLIT_SIZE
  upscaler_1 = tf.concat([rescaler[:,1:],tf.zeros([len(rescaler),2], dtype=tf.int64)],axis=-1)
  
  target_upscaler_2 = tf.repeat([[float(SPLIT_SIZE),float(SPLIT_SIZE),H,W]],
                       repeats=[len(rescaler)], axis=0)*tf.cast(y_target_extract[...,1:5], dtype = tf.float32)
  pred_1_upscaler_2 = tf.repeat([[float(SPLIT_SIZE),float(SPLIT_SIZE),H,W]],
                      repeats=[len(rescaler)], axis=0)*tf.cast(y_pred_extract[...,1:5], dtype = tf.float32)
  pred_2_upscaler_2 = tf.repeat([[float(SPLIT_SIZE),float(SPLIT_SIZE),H,W]],
                      repeats=[len(rescaler)], axis=0)*tf.cast(y_pred_extract[...,6:10], dtype = tf.float32)
  
  target_orig = tf.cast(upscaler_1, dtype = tf.float32)+target_upscaler_2
  pred_1_orig = tf.cast(upscaler_1, dtype = tf.float32)+pred_1_upscaler_2
  pred_2_orig = tf.cast(upscaler_1, dtype = tf.float32)+pred_2_upscaler_2
  
  mask =tf.cast(tf.math.greater(compute_iou(target_orig,pred_2_orig),
                                         compute_iou(target_orig,pred_1_orig)),dtype=tf.int32)
  
  y_pred_joined=tf.transpose(tf.concat([tf.expand_dims(y_pred_extract[...,0],axis=0),
                        tf.expand_dims(y_pred_extract[...,5],axis=0)],axis=0))
  
  obj_pred = tf.gather_nd(y_pred_joined,tf.stack([tf.range(len(rescaler)),mask],axis=-1))
  
  object_loss = difference(tf.cast(obj_pred,dtype =tf.float32)
                            ,tf.cast(tf.ones([len(rescaler)]),dtype=tf.float32))

  ####################### For No object
  y_pred_extract = tf.gather_nd(y_pred[...,0:B*5], tf.where(target[:]==0))
  y_target_extract = tf.zeros(len(y_pred_extract))

  no_object_loss_1 = difference(tf.cast(y_pred_extract[...,0],dtype =tf.float32)
                            ,tf.cast(y_target_extract,dtype=tf.float32))
  
  no_object_loss_2 = difference(tf.cast(y_pred_extract[...,5],dtype =tf.float32)
                            ,tf.cast(y_target_extract,dtype=tf.float32))
  
  no_object_loss = no_object_loss_1+no_object_loss_2

  ######################## For OBject class loss
  y_pred_extract = tf.gather_nd(y_pred[...,10:],tf.where(target[:]==1))
  class_extract = tf.gather_nd(y_true[...,5:],tf.where(target[:]==1))

  class_loss = difference(tf.cast(y_pred_extract,dtype =tf.float32)
                                ,tf.cast(class_extract,dtype=tf.float32))

  ######################### For object bounding box loss
  y_pred_extract = tf.gather_nd(y_pred[...,0:B*5], tf.where(target[:]==1))
  centre_joined=tf.stack([y_pred_extract[...,1:3],y_pred_extract[...,6:8]],axis=1)
  centre_pred = tf.gather_nd(centre_joined,tf.stack([tf.range(len(rescaler)),mask],axis=-1))
  centre_target = tf.gather_nd(y_true[...,1:3], tf.where(target[:]==1))
  
  centre_loss = difference(centre_pred,centre_target)
    
  size_joined=tf.stack([y_pred_extract[...,3:5],y_pred_extract[...,8:10]],axis=1)

  size_pred = tf.gather_nd(size_joined,tf.stack([tf.range(len(rescaler)),mask],axis=-1))
  size_target = tf.gather_nd(y_true[...,3:5], tf.where(target[:]==1))
  
  size_loss = difference(tf.math.sqrt(tf.math.abs(size_pred)),tf.math.sqrt(tf.math.abs(size_target)))
  box_loss = centre_loss+size_loss
  
  lambda_coord = 5.0
  lambda_no_obj = 0.5

  loss = object_loss + (lambda_no_obj*no_object_loss)+ tf.cast(lambda_coord*box_loss,dtype=tf.float32)+ tf.cast(class_loss,dtype=tf.float32) 
  return loss

In [6]:
classes = ['aeroplane','bicycle','bird','boat','bottle','bus','car','cat','chair','cow','diningtable',
         'dog','horse','motorbike','person','pottedplant','sheep','sofa','train','tvmonitor']

In [7]:
model = tf.keras.models.load_model("pascal_voc_2012_yolo_efficientnetB1.h5", custom_objects={"yolo_loss": yolo_loss})

In [8]:
test_path = "dataset/VOC2012/ValJPEGImages/"

In [9]:
H, W = 224, 224

In [9]:
!mkdir output

In [12]:
def model_test(filename):
    try:
        test = test_path+filename
        # print(test)

        img=cv2.resize(cv2.imread(test),(H,W))
        
        image = tf.io.decode_jpeg(tf.io.read_file(test))
        image = tf.cast(tf.image.resize(image, [H, W]), dtype=tf.float32)
        output = model.predict(tf.expand_dims(image, axis=0))
        # print(output.shape)
    
        THRESH = .25
    
        # the object posistions is to get the first two (B) 5 values where eg. 0/1, norm_x_center, norm_y_center, norm_w, norm_h
        # based on the defined threshold
        object_positions=tf.concat([tf.where(output[...,0] >= THRESH), tf.where(output[...,5] >= THRESH)], axis=0)
        # print(object_positions)
        selected_output=tf.gather_nd(output, object_positions)
        # print(selected_output)
        final_boxes=[]
        final_scores=[]

        for i, pos in enumerate(object_positions):
            # to loop the two (B) labels
            for j in range(2):
                # to get each of the first (label -- 0 to 1) of the five values
                # selected_output[0][0] & selected_output[0][5]
                if selected_output[i][j*5] > THRESH:
                    # output[pos[0]][pos[1]][pos[2]] -- this is to get the 30 values output
                    # [(j*5)+1:(j*5)+5] -- this is to get two (b) bounding box from the 30 values output
                    # first loop -- [(0*5)+1]:(0*5)+5] -- [1:5]
                    # second loop -- [(1*5)+1:(1*5)+5] -- [6:10]
                    output_box = tf.cast(output[pos[0]][pos[1]][pos[2]][(j*5)+1:(j*5)+5], dtype=tf.float32)
                    # print(output_box)

                    # to get the x_centre, since the grid is 7 * 7 and the image size is 224
                    # need to get the position of the object first which is from pos
                    # pos/7*224 = pos*32 (224/7=32)
                    # pos*32 + value*32 = (pos + value) * 32
                    x_centre=(tf.cast(pos[1], dtype=tf.float32) + output_box[0])*32
                    y_centre=(tf.cast(pos[2], dtype=tf.float32) + output_box[1])*32
                    # print(x_centre)
                    # print(y_centre)

                    x_width, y_height = tf.math.abs(W * output_box[2]), tf.math.abs(H * output_box[3])

                    # this is taking the bounding box's width/height to get the min and max coordinates 
                    # with the centre point of the bounding box
                    x_min, y_min = int(x_centre-(x_width/2)), int(y_centre-(y_height/2))
                    x_max, y_max = int(x_centre+(x_width/2)), int(y_centre+(y_height/2))

                    x_min = 0 if x_min <= 0 else x_min
                    y_min = 0 if y_min <= 0 else y_min
                    x_max = W if x_max >= W else x_max
                    y_max = H if y_max >= H else y_max

                    final_boxes.append([x_min, 
                                        y_min, 
                                        x_max, 
                                        y_max, 
                                        str([classes[tf.argmax(selected_output[...,10:], axis=-1)[i]]])])

                    final_scores.append(selected_output[i][j*5])
        # print(final_scores)
        # print("Final Box: ", final_boxes)
        final_boxes = np.array(final_boxes)

        object_classes = final_boxes[...,4]
        nms_boxes = final_boxes[...,0:4]

        # this is to remove the duplicate bounding boxes and remain the one with highest probability score
        nms_output = tf.image.non_max_suppression(
            nms_boxes, # containing the bounding box, in order to calculate the area
            final_scores, # containing the first and the sixth value of the label, in order to get the highest score
            max_output_size=100, # depending on the specific task, if there is 150 classes, then set to 150
            iou_threshold=0.2, # using the larger bounding box to divide the duplicated box, if greater than 0.2 then discard it
            score_threshold=float('-inf') # this is the threshold that if the score is lower than the defined, directly discard
        )

        # print(nms_output)

        for i in nms_output:
            cv2.rectangle(img, 
                          (int(final_boxes[i][0]), int(final_boxes[i][1])), 
                          (int(final_boxes[i][2]), int(final_boxes[i][3])), 
                          (255,0,0), 
                          1)

            cv2.putText(img,
                        final_boxes[i][-1],
                        (int(final_boxes[i][0]), int(final_boxes[i][1])+15),
                        cv2.FONT_HERSHEY_COMPLEX_SMALL, 
                        1, 
                        (0,0,0), 
                        1)

        cv2.imwrite("output/" + filename[:-4] + "_det.jpg", cv2.resize(img, (384,384)))
            
    except:
        print("No Object Found.")

In [55]:
for filename in os.listdir(test_path):
    model_test(filename)
    break

dataset/val2017/000000000139.jpg
(1, 7, 7, 30)
tf.Tensor(
[[0 3 4]
 [0 4 4]
 [0 0 3]
 [0 3 4]
 [0 4 4]], shape=(5, 3), dtype=int64)
tf.Tensor(
[[3.9771909e-01 6.5087509e-01 4.3916300e-01 2.3826581e-01 3.2411438e-01
  4.6929896e-01 5.6332964e-01 4.5645911e-01 1.1285289e-01 1.9790517e-01
  5.2837916e-03 2.6853201e-03 1.8878145e-03 3.8372127e-03 4.2474237e-03
  1.5914055e-03 2.3122712e-03 4.4010924e-03 2.7509305e-01 7.5809103e-03
  4.8044147e-03 3.5881207e-03 1.5113264e-03 6.3503149e-04 5.6665337e-01
  1.8790988e-02 4.2701536e-03 8.5225161e-03 7.2963891e-04 1.6405111e-02]
 [6.3112724e-01 4.7805014e-01 4.1215324e-01 9.9348441e-02 1.9270043e-01
  7.0542997e-01 4.5462221e-01 4.2997396e-01 2.2061114e-01 3.0477968e-01
  7.7536292e-03 3.1222547e-03 1.9102831e-03 6.0180826e-03 2.2914133e-03
  4.6955612e-03 3.7736509e-03 6.9671529e-03 3.4356844e-01 2.5895773e-03
  9.6747343e-04 3.2366079e-03 1.3682389e-03 1.5522166e-03 4.9770910e-01
  2.5655344e-02 5.3435043e-03 3.5018433e-02 1.9784782e-03 9.3930

In [13]:
for filename in os.listdir(test_path):
    model_test(filename)

No Object Found.
No Object Found.
No Object Found.
No Object Found.
No Object Found.
