# Getting started with YOLOv1 in PyTorch

Materials used for learning YOLO :
1. YOLOv1 from scratch ( by Aladdin Persson --> https://www.youtube.com/watch?v=n9_XyCGr-MI&t=1044s )

## model.py

In [None]:
# List all the packages to be imported

import torch
import torch.nn as nn

In [None]:
# Layer specification for the convolutional backbone, read top to bottom:
#   tuple -> one conv layer as (kernel_size, num_filters, stride, padding)
#   "M"   -> 2x2 max-pool with stride 2
#   list  -> [conv_a, conv_b, n]: the pair (conv_a, conv_b) repeated n times
architecture_config = [
    (7, 64, 2, 3),
    "M",
    (3, 192, 1, 1),
    "M",
    (1, 128, 1, 0),
    (3, 256, 1, 1),
    (1, 256, 1, 0),
    (3, 512, 1, 1),
    "M",
    [(1, 256, 1, 0), (3, 512, 1, 1), 4],
    (1, 512, 1, 0),
    (3, 1024, 1, 1),
    "M",
    [(1, 512, 1, 0), (3, 1024, 1, 1), 2],
    (3, 1024, 1, 1),
    (3, 1024, 2, 1),
    (3, 1024, 1, 1),
    (3, 1024, 1, 1),
]

class CNN(nn.Module):
  """Convolution block: Conv2d -> BatchNorm2d -> LeakyReLU(0.1).

  Parameters:
      in_channels (int): number of input channels
      out_channels (int): number of filters produced by the convolution
      **kwargs: forwarded to ``nn.Conv2d``; callers pass ``kernel_size``,
          ``stride`` and ``padding`` here (``kernel_size`` is required by
          ``nn.Conv2d``)
  """

  def __init__(self, in_channels, out_channels, **kwargs):
    super().__init__()
    # The conv bias is disabled; the BatchNorm layer that follows has its
    # own learnable shift.
    self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
    self.batchnorm = nn.BatchNorm2d(out_channels)
    self.leakyrelu = nn.LeakyReLU(0.1)

  def forward(self, x):
    """Apply convolution, batch normalisation and leaky ReLU to ``x``."""
    return self.leakyrelu(self.batchnorm(self.conv(x)))
  
class Yolov1(nn.Module):
  """YOLOv1 network: a convolutional backbone followed by a fully connected head.

  Parameters:
      in_channels (int): channels of the input image (3 for RGB)
      **kwargs: forwarded to ``_create_fcs``; accepted keys are
          ``split_size`` (S), ``num_boxes`` (B) and ``num_classes`` (C).
          Omitted keys fall back to S=7, B=2, C=20.

  ``forward`` returns a tensor of shape (batch, S * S * (C + B * 5)).
  """

  def __init__(self, in_channels=3, **kwargs):
    super().__init__()
    self.architecture = architecture_config
    self.in_channels = in_channels
    self.darknet = self._create_conv_layers(self.architecture)
    self.fcs = self._create_fcs(**kwargs)

  def forward(self, x):
    """Run the backbone, flatten the feature map and apply the head."""
    x = self.darknet(x)
    return self.fcs(torch.flatten(x, start_dim=1))

  def _create_conv_layers(self, architecture):
    """Build the backbone as an ``nn.Sequential`` from the layer specification.

    Entries are interpreted as: tuple -> one CNN block
    (kernel_size, num_filters, stride, padding); str -> 2x2 max-pool with
    stride 2; list -> [conv_a, conv_b, repeats], the pair repeated.
    """
    layers = []
    in_channels = self.in_channels

    for x in architecture:
      if isinstance(x, tuple):
        layers.append(
            CNN(in_channels, x[1], kernel_size=x[0], stride=x[2], padding=x[3])
        )
        in_channels = x[1]
      elif isinstance(x, str):
        layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
      elif isinstance(x, list):
        conv1, conv2, num_repeats = x[0], x[1], x[2]

        for _ in range(num_repeats):
          layers.append(
              CNN(in_channels, conv1[1], kernel_size=conv1[0], stride=conv1[2], padding=conv1[3])
          )
          layers.append(
              CNN(conv1[1], conv2[1], kernel_size=conv2[0], stride=conv2[2], padding=conv2[3])
          )
          # The next repeat (or next layer) consumes the second conv's output.
          in_channels = conv2[1]
    return nn.Sequential(*layers)

  def _create_fcs(self, split_size=7, num_boxes=2, num_classes=20):
    """Build the fully connected head.

    The first Linear layer expects a 1024 x S x S feature map, so the input
    image size must be chosen so that the backbone produces exactly S x S
    cells.
    """
    S, B, C = split_size, num_boxes, num_classes
    return nn.Sequential(
        # forward() already flattens its input; this layer is kept so the
        # indices of the Linear layers inside the Sequential stay unchanged.
        nn.Flatten(),
        nn.Linear(1024 * S * S, 496),
        nn.Dropout(0.0),
        nn.LeakyReLU(0.1),
        nn.Linear(496, S * S * (C + B * 5)),
    )

In [None]:
def test(S=7, B=2, C=20):
  """Smoke test: push a random batch through Yolov1 and print the output shape.

  The input is 448x448: the backbone halves the resolution six times
  (448 -> 7), which matches the 1024 * S * S input of the head for S=7.
  Other values of S would need a correspondingly different input size.
  """
  model = Yolov1(split_size=S, num_boxes=B, num_classes=C)
  x = torch.randn((2, 3, 448, 448))
  print(model(x).shape)

test()

## utils.py

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from collections import Counter

def intersection_over_union(boxes_preds, boxes_labels, box_format="midpoint"):
    """
    Calculates intersection over union
    Parameters:
        boxes_preds (tensor): Predictions of Bounding Boxes (..., 4)
        boxes_labels (tensor): Correct labels of Bounding Boxes (..., 4)
        box_format (str): "midpoint" if boxes are (x, y, w, h),
            "corners" if boxes are (x1, y1, x2, y2)
    Returns:
        tensor: Intersection over union for all examples, shape (..., 1)
    Raises:
        ValueError: if box_format is neither "midpoint" nor "corners"
    """

    # Slices such as 0:1 (rather than plain indices) keep the last dimension,
    # so the result has shape (..., 1).
    if box_format == "midpoint":
        box1_x1 = boxes_preds[..., 0:1] - boxes_preds[..., 2:3] / 2
        box1_y1 = boxes_preds[..., 1:2] - boxes_preds[..., 3:4] / 2
        box1_x2 = boxes_preds[..., 0:1] + boxes_preds[..., 2:3] / 2
        box1_y2 = boxes_preds[..., 1:2] + boxes_preds[..., 3:4] / 2
        box2_x1 = boxes_labels[..., 0:1] - boxes_labels[..., 2:3] / 2
        box2_y1 = boxes_labels[..., 1:2] - boxes_labels[..., 3:4] / 2
        box2_x2 = boxes_labels[..., 0:1] + boxes_labels[..., 2:3] / 2
        box2_y2 = boxes_labels[..., 1:2] + boxes_labels[..., 3:4] / 2
    elif box_format == "corners":
        box1_x1 = boxes_preds[..., 0:1]
        box1_y1 = boxes_preds[..., 1:2]
        box1_x2 = boxes_preds[..., 2:3]
        box1_y2 = boxes_preds[..., 3:4]
        box2_x1 = boxes_labels[..., 0:1]
        box2_y1 = boxes_labels[..., 1:2]
        box2_x2 = boxes_labels[..., 2:3]
        box2_y2 = boxes_labels[..., 3:4]
    else:
        raise ValueError(
            f"box_format must be 'midpoint' or 'corners', got {box_format!r}"
        )

    # Corners of the overlap rectangle.
    x1 = torch.max(box1_x1, box2_x1)
    y1 = torch.max(box1_y1, box2_y1)
    x2 = torch.min(box1_x2, box2_x2)
    y2 = torch.min(box1_y2, box2_y2)

    # .clamp(0) is for the case when they do not intersect
    intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)

    box1_area = abs((box1_x2 - box1_x1) * (box1_y2 - box1_y1))
    box2_area = abs((box2_x2 - box2_x1) * (box2_y2 - box2_y1))

    # The small constant avoids division by zero for two empty boxes.
    return intersection / (box1_area + box2_area - intersection + 1e-6)


def non_max_suppression(bboxes, iou_threshold, threshold, box_format="corners"):
    """
    Filters a list of boxes with per-class Non Max Suppression.

    Parameters:
        bboxes (list): boxes, each given as
            [class_pred, prob_score, <four box coordinates>]
        iou_threshold (float): a box is dropped when its IoU with an
            already kept box of the same class is not below this value
        threshold (float): boxes whose prob_score is not above this value
            are discarded before suppression
        box_format (str): "midpoint" or "corners", passed on to
            intersection_over_union
    Returns:
        list: the kept boxes, ordered by descending prob_score
    """

    assert type(bboxes) == list

    # Confident boxes only, best first.
    candidates = sorted(
        (entry for entry in bboxes if entry[1] > threshold),
        key=lambda entry: entry[1],
        reverse=True,
    )
    kept = []

    while candidates:
        best, rest = candidates[0], candidates[1:]
        kept.append(best)

        survivors = []
        for other in rest:
            # Boxes of a different class never suppress each other.
            if other[0] != best[0]:
                survivors.append(other)
                continue

            overlap = intersection_over_union(
                torch.tensor(best[2:]),
                torch.tensor(other[2:]),
                box_format=box_format,
            )
            if overlap < iou_threshold:
                survivors.append(other)

        candidates = survivors

    return kept


def mean_average_precision(
    pred_boxes, true_boxes, iou_threshold=0.5, box_format="midpoint", num_classes=20
):
    """
    Calculates mean average precision
    Parameters:
        pred_boxes (list): list of lists containing all bboxes with each bbox
        specified as [train_idx, class_prediction, prob_score, <four box coordinates>]
        true_boxes (list): Similar as pred_boxes except all the correct ones
        iou_threshold (float): IoU above which a prediction counts as matching
        a ground-truth box
        box_format (str): "midpoint" or "corners" used to specify bboxes
        num_classes (int): number of classes
    Returns:
        tensor: scalar mAP across all classes that have at least one
        ground-truth box, for the given IoU threshold; 0 when no class has
        any ground-truth box
    """

    # list storing all AP for respective classes
    average_precisions = []

    # used for numerical stability later on
    epsilon = 1e-6

    for c in range(num_classes):
        # Only the predictions and targets of the current class c.
        detections = [detection for detection in pred_boxes if detection[1] == c]
        ground_truths = [true_box for true_box in true_boxes if true_box[1] == c]

        total_true_bboxes = len(ground_truths)

        # If none exists for this class then we can safely skip
        if total_true_bboxes == 0:
            continue

        # One "already matched" flag per ground-truth box, grouped by image.
        # With 3 boxes in img 0 and 5 in img 1 this gives
        # {0: tensor([0,0,0]), 1: tensor([0,0,0,0,0])}.
        gt_counts = Counter(gt[0] for gt in ground_truths)
        amount_bboxes = {key: torch.zeros(val) for key, val in gt_counts.items()}

        # sort by box probabilities which is index 2
        detections.sort(key=lambda x: x[2], reverse=True)
        TP = torch.zeros((len(detections)))
        FP = torch.zeros((len(detections)))

        for detection_idx, detection in enumerate(detections):
            # Only take out the ground_truths that have the same
            # training idx as detection
            ground_truth_img = [
                bbox for bbox in ground_truths if bbox[0] == detection[0]
            ]

            best_iou = 0
            best_gt_idx = None  # stays None when no ground truth overlaps

            for idx, gt in enumerate(ground_truth_img):
                iou = intersection_over_union(
                    torch.tensor(detection[3:]),
                    torch.tensor(gt[3:]),
                    box_format=box_format,
                )

                if iou > best_iou:
                    best_iou = iou
                    best_gt_idx = idx

            # True positive only for the first (most confident) detection
            # that matches a given ground-truth box; everything else is a
            # false positive.
            if (
                best_gt_idx is not None
                and best_iou > iou_threshold
                and amount_bboxes[detection[0]][best_gt_idx] == 0
            ):
                TP[detection_idx] = 1
                amount_bboxes[detection[0]][best_gt_idx] = 1
            else:
                FP[detection_idx] = 1

        TP_cumsum = torch.cumsum(TP, dim=0)
        FP_cumsum = torch.cumsum(FP, dim=0)
        recalls = TP_cumsum / (total_true_bboxes + epsilon)
        precisions = torch.divide(TP_cumsum, (TP_cumsum + FP_cumsum + epsilon))
        # Prepend the point (recall=0, precision=1) so the curve starts there.
        precisions = torch.cat((torch.tensor([1.0]), precisions))
        recalls = torch.cat((torch.tensor([0.0]), recalls))
        # torch.trapz for numerical integration
        average_precisions.append(torch.trapz(precisions, recalls))

    # No class had any ground-truth box: nothing to average.
    if not average_precisions:
        return torch.tensor(0.0)

    return sum(average_precisions) / len(average_precisions)


def plot_image(image, boxes):
    """Show ``image`` with one red rectangle per entry of ``boxes``.

    The last four values of each entry are read as midpoint-format
    coordinates (x, y, w, h) and are scaled by the image width and height
    before drawing.
    """
    pixels = np.array(image)
    img_height, img_width, _ = pixels.shape

    # One figure with a single axes holding the image.
    _, axes = plt.subplots(1)
    axes.imshow(pixels)

    for entry in boxes:
        # Skip the two leading values; the rest must be x, y, w, h.
        coords = entry[2:]
        assert len(coords) == 4, "Got more values than in x, y, w, h, in a box!"

        # Midpoint -> upper-left corner.
        left = coords[0] - coords[2] / 2
        top = coords[1] - coords[3] / 2

        axes.add_patch(
            patches.Rectangle(
                (left * img_width, top * img_height),
                coords[2] * img_width,
                coords[3] * img_height,
                linewidth=1,
                edgecolor="r",
                facecolor="none",
            )
        )

    plt.show()

def get_bboxes(
    loader,
    model,
    iou_threshold,
    threshold,
    pred_format="cells",
    box_format="midpoint",
    device="cuda",
):
    """Collect NMS-filtered predictions and target boxes for every sample in ``loader``.

    The model is put in eval mode while iterating and switched back to
    train mode before returning. Every returned box is prefixed with a
    running sample index that is unique across the whole loader.
    ``pred_format`` is accepted but not used.

    Returns:
        tuple: (all_pred_boxes, all_true_boxes), two lists of
        [sample_idx, class, score, <four box coordinates>]
    """
    collected_preds, collected_targets = [], []

    # Predictions must come from the model in evaluation mode.
    model.eval()
    sample_idx = 0

    for x, labels in loader:
        x = x.to(device)
        labels = labels.to(device)

        with torch.no_grad():
            predictions = model(x)

        target_lists = cellboxes_to_boxes(labels)
        pred_lists = cellboxes_to_boxes(predictions)

        for idx in range(x.shape[0]):
            kept = non_max_suppression(
                pred_lists[idx],
                iou_threshold=iou_threshold,
                threshold=threshold,
                box_format=box_format,
            )
            collected_preds.extend([sample_idx] + box for box in kept)

            # Target cells without an object have a score of 0 and are
            # filtered out by the threshold.
            collected_targets.extend(
                [sample_idx] + box
                for box in target_lists[idx]
                if box[1] > threshold
            )

            sample_idx += 1

    model.train()
    return collected_preds, collected_targets



def convert_cellboxes(predictions, S=7, C=20):
    """
    Converts bounding boxes output from Yolo with an image split size of S
    from cell-relative coordinates into whole-image ratios.

    Parameters:
        predictions (tensor): anything reshapeable to (batch, S, S, C + 10),
            laid out per cell as C class scores followed by two boxes of
            [confidence, x, y, w, h]
        S (int): split size of the image
        C (int): number of classes
    Returns:
        tensor: shape (batch, S, S, 6) holding, per cell,
        [predicted class, best confidence, x, y, w, h] for the more
        confident of the two boxes
    """

    predictions = predictions.to("cpu")
    batch_size = predictions.shape[0]
    predictions = predictions.reshape(batch_size, S, S, C + 10)
    bboxes1 = predictions[..., C + 1:C + 5]
    bboxes2 = predictions[..., C + 6:C + 10]
    scores = torch.cat(
        (predictions[..., C].unsqueeze(0), predictions[..., C + 5].unsqueeze(0)),
        dim=0,
    )
    # 0 where the first box is more confident, 1 where the second one is;
    # used as a mask to pick one box per cell.
    best_box = scores.argmax(0).unsqueeze(-1)
    best_boxes = bboxes1 * (1 - best_box) + best_box * bboxes2
    # cell_indices[b, i, j, 0] == j (column index); permuting the two grid
    # axes gives the row index i.
    cell_indices = torch.arange(S).repeat(batch_size, S, 1).unsqueeze(-1)
    x = 1 / S * (best_boxes[..., :1] + cell_indices)
    y = 1 / S * (best_boxes[..., 1:2] + cell_indices.permute(0, 2, 1, 3))
    w_h = 1 / S * best_boxes[..., 2:4]
    converted_bboxes = torch.cat((x, y, w_h), dim=-1)
    predicted_class = predictions[..., :C].argmax(-1).unsqueeze(-1)
    best_confidence = torch.max(
        predictions[..., C], predictions[..., C + 5]
    ).unsqueeze(-1)
    converted_preds = torch.cat(
        (predicted_class, best_confidence, converted_bboxes), dim=-1
    )

    return converted_preds


def cellboxes_to_boxes(out, S=7):
    """Turn a batch of cell-wise outputs into plain Python lists of boxes.

    Parameters:
        out (tensor): model output or label tensor for a batch
        S (int): split size of the image; forwarded to convert_cellboxes
    Returns:
        list: one list per example holding S * S boxes, each a list of
        floats [class, confidence, x, y, w, h]
    """
    converted_pred = convert_cellboxes(out, S=S).reshape(out.shape[0], S * S, -1)
    # Truncates the class index to a whole number. NOTE(review): the value
    # is written back into the same tensor, so it is presumably still
    # returned as a float (e.g. 3.0) rather than an int; confirm.
    converted_pred[..., 0] = converted_pred[..., 0].long()

    # Nested lists of Python numbers: [example][cell][value].
    return converted_pred.tolist()

def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    """Print a progress message and write ``state`` to ``filename`` with ``torch.save``.

    Parameters:
        state: object to serialize; load_checkpoint expects a dict with
            the keys "state_dict" and "optimizer"
        filename (str): destination path of the checkpoint file
    """
    print("=> Saving checkpoint")
    torch.save(state, filename)


def load_checkpoint(checkpoint, model, optimizer):
    """Restore ``model`` and ``optimizer`` in place from an already loaded checkpoint.

    Parameters:
        checkpoint (dict): must contain "state_dict" (model weights) and
            "optimizer" (optimizer state)
        model: object whose ``load_state_dict`` receives checkpoint["state_dict"]
        optimizer: object whose ``load_state_dict`` receives checkpoint["optimizer"]
    """
    print("=> Loading checkpoint")
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])


## loss.py

In [None]:
import torch
import torch.nn as nn
#from utils import intersection_over_union


class YoloLoss(nn.Module):
    """
    Calculate the loss for yolo (v1) model

    Per cell, predictions are laid out as C class scores followed by boxes
    of [confidence, x, y, w, h]; targets carry a single box in the same
    layout. Only the first two predicted boxes are used, whatever B is.
    """

    def __init__(self, S=7, B=2, C=20):
        super(YoloLoss, self).__init__()
        # Summed (not averaged) squared error.
        self.mse = nn.MSELoss(reduction="sum")

        # S is split size of image (in paper 7),
        # B is number of boxes (in paper 2),
        # C is number of classes (in paper and VOC dataset is 20),
        self.S = S
        self.B = B
        self.C = C

        # These are from Yolo paper, signifying how much we should
        # pay loss for no object (noobj) and the box coordinates (coord)
        self.lambda_noobj = 0.5
        self.lambda_coord = 5

    def forward(self, predictions, target):
        """Return the scalar YOLOv1 loss.

        Parameters:
            predictions (tensor): model output, reshapeable to
                (batch, S, S, C + B * 5)
            target (tensor): labels of shape (batch, S, S, >= C + 5)
        """
        C = self.C

        # predictions are shaped (BATCH_SIZE, S*S*(C+B*5)) when inputted
        predictions = predictions.reshape(-1, self.S, self.S, C + self.B * 5)

        # Channel layout, derived from C instead of hard-coded for C=20.
        pred_conf1 = predictions[..., C:C + 1]
        pred_box1 = predictions[..., C + 1:C + 5]
        pred_conf2 = predictions[..., C + 5:C + 6]
        pred_box2 = predictions[..., C + 6:C + 10]
        target_conf = target[..., C:C + 1]
        target_box = target[..., C + 1:C + 5]

        # Calculate IoU for the two predicted bounding boxes with target bbox
        iou_b1 = intersection_over_union(pred_box1, target_box)
        iou_b2 = intersection_over_union(pred_box2, target_box)
        ious = torch.cat([iou_b1.unsqueeze(0), iou_b2.unsqueeze(0)], dim=0)

        # Take the box with highest IoU out of the two predictions
        # Note that bestbox will be indices of 0, 1 for which bbox was best
        _, bestbox = torch.max(ious, dim=0)
        exists_box = target_conf  # in paper this is Iobj_i

        # ======================== #
        #   FOR BOX COORDINATES    #
        # ======================== #

        # Set boxes with no object in them to 0. We only take out one of the two
        # predictions, which is the one with highest IoU calculated previously.
        box_predictions = exists_box * (
            bestbox * pred_box2 + (1 - bestbox) * pred_box1
        )
        box_targets = exists_box * target_box

        # Width and height are compared by their square roots. Predictions
        # can be negative, so the root is taken of the absolute value and
        # the sign is restored afterwards; the small constant keeps the
        # argument of sqrt away from exactly 0.
        box_predictions[..., 2:4] = torch.sign(box_predictions[..., 2:4]) * torch.sqrt(
            torch.abs(box_predictions[..., 2:4] + 1e-6)
        )
        box_targets[..., 2:4] = torch.sqrt(box_targets[..., 2:4])

        box_loss = self.mse(
            torch.flatten(box_predictions, end_dim=-2),
            torch.flatten(box_targets, end_dim=-2),
        )

        # ==================== #
        #   FOR OBJECT LOSS    #
        # ==================== #

        # pred_box is the confidence score for the bbox with highest IoU
        pred_box = bestbox * pred_conf2 + (1 - bestbox) * pred_conf1

        object_loss = self.mse(
            torch.flatten(exists_box * pred_box),
            torch.flatten(exists_box * target_conf),
        )

        # ======================= #
        #   FOR NO OBJECT LOSS    #
        # ======================= #

        # Both predicted confidences are penalised in cells without an object.
        no_object_loss = self.mse(
            torch.flatten((1 - exists_box) * pred_conf1, start_dim=1),
            torch.flatten((1 - exists_box) * target_conf, start_dim=1),
        )

        no_object_loss += self.mse(
            torch.flatten((1 - exists_box) * pred_conf2, start_dim=1),
            torch.flatten((1 - exists_box) * target_conf, start_dim=1),
        )

        # ================== #
        #   FOR CLASS LOSS   #
        # ================== #

        class_loss = self.mse(
            torch.flatten(exists_box * predictions[..., :C], end_dim=-2),
            torch.flatten(exists_box * target[..., :C], end_dim=-2),
        )

        loss = (
            self.lambda_coord * box_loss  # first two rows in paper
            + object_loss  # third row in paper
            + self.lambda_noobj * no_object_loss  # fourth row
            + class_loss  # fifth row
        )

        return loss

## train.py

In [None]:
import torch
import torchvision.transforms as transforms
import torch.optim as optim
import torchvision.transforms.functional as FT
from tqdm import tqdm
from torch.utils.data import DataLoader
from model import Yolov1
from dataset import VOCDataset
from utils import (
    non_max_suppression,
    mean_average_precision,
    intersection_over_union,
    cellboxes_to_boxes,
    get_bboxes,
    plot_image,
    save_checkpoint,
    load_checkpoint,
)
from loss import YoloLoss

# Fixed seed so runs are repeatable.
seed = 123
torch.manual_seed(seed)

# Hyperparameters etc.
LEARNING_RATE = 2e-5
# is_available must be *called*: the bare function object is always truthy,
# which would select "cuda" even on machines without a GPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 16 # 64 in original paper but I don't have that much vram, grad accum?
WEIGHT_DECAY = 0
EPOCHS = 1000
NUM_WORKERS = 2
PIN_MEMORY = True
LOAD_MODEL = False
LOAD_MODEL_FILE = "overfit.pth.tar"
IMG_DIR = "data/images"
LABEL_DIR = "data/labels"


class Compose(object):
    """Chain of image transforms that carries the bounding boxes along untouched.

    Each transform is called with the image only; the boxes are returned
    exactly as they were passed in.
    """

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, img, bboxes):
        for step in self.transforms:
            img = step(img)

        return img, bboxes


# Preprocessing applied to every image: resize to 448x448, then convert to a tensor.
transform = Compose([transforms.Resize((448, 448)), transforms.ToTensor(),])


def train_fn(train_loader, model, optimizer, loss_fn):
    """Run one pass over ``train_loader``, updating ``model`` after every batch.

    Shows a tqdm progress bar with the current batch loss and prints the
    mean of all batch losses at the end.
    """
    progress = tqdm(train_loader, leave=True)
    mean_loss = []

    for x, y in progress:
        x, y = x.to(DEVICE), y.to(DEVICE)
        loss = loss_fn(model(x), y)
        mean_loss.append(loss.item())

        # Standard optimisation step: clear old gradients, backpropagate, update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # update progress bar
        progress.set_postfix(loss=loss.item())

    print(f"Mean loss was {sum(mean_loss)/len(mean_loss)}")


def main():
    """Build model, optimizer, loss and data loaders, then train for EPOCHS epochs.

    Before each training pass the mAP on the training loader is computed
    and printed.
    """
    model = Yolov1(split_size=7, num_boxes=2, num_classes=20).to(DEVICE)
    optimizer = optim.Adam(
        model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY
    )
    loss_fn = YoloLoss()

    if LOAD_MODEL:
        checkpoint = torch.load(LOAD_MODEL_FILE)
        load_checkpoint(checkpoint, model, optimizer)

    # Both datasets share everything except the CSV listing their samples.
    dataset_kwargs = dict(transform=transform, img_dir=IMG_DIR, label_dir=LABEL_DIR)
    train_dataset = VOCDataset("data/100examples.csv", **dataset_kwargs)
    test_dataset = VOCDataset("data/test.csv", **dataset_kwargs)

    # Both loaders are configured identically.
    loader_kwargs = dict(
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
        pin_memory=PIN_MEMORY,
        shuffle=True,
        drop_last=True,
    )
    train_loader = DataLoader(dataset=train_dataset, **loader_kwargs)
    test_loader = DataLoader(dataset=test_dataset, **loader_kwargs)

    for epoch in range(EPOCHS):
        # Debug aid (disabled): plot predictions for the first few images.
        # for x, y in train_loader:
        #    x = x.to(DEVICE)
        #    for idx in range(8):
        #        bboxes = cellboxes_to_boxes(model(x))
        #        bboxes = non_max_suppression(bboxes[idx], iou_threshold=0.5, threshold=0.4, box_format="midpoint")
        #        plot_image(x[idx].permute(1,2,0).to("cpu"), bboxes)
        #
        #    import sys
        #    sys.exit()

        pred_boxes, target_boxes = get_bboxes(
            train_loader, model, iou_threshold=0.5, threshold=0.4
        )
        mean_avg_prec = mean_average_precision(
            pred_boxes, target_boxes, iou_threshold=0.5, box_format="midpoint"
        )
        print(f"Train mAP: {mean_avg_prec}")

        # Checkpointing (disabled):
        #if mean_avg_prec > 0.9:
        #    checkpoint = {
        #        "state_dict": model.state_dict(),
        #        "optimizer": optimizer.state_dict(),
        #    }
        #    save_checkpoint(checkpoint, filename=LOAD_MODEL_FILE)
        #    import time
        #    time.sleep(10)

        train_fn(train_loader, model, optimizer, loss_fn)


if __name__ == "__main__":
    main()

## dataset.py

In [None]:
import torch
import os
import pandas as pd
from PIL import Image


class VOCDataset(torch.utils.data.Dataset):
    """Dataset yielding (image, label_matrix) pairs for YOLOv1 training.

    Parameters:
        csv_file (str): CSV whose first column is the image file name and
            whose second column is the label file name
        img_dir (str): directory containing the images
        label_dir (str): directory containing the label files; each line of
            a label file is "class x y width height", whitespace separated.
            Coordinates are presumably normalised to [0, 1] relative to the
            image (they are multiplied by S to find the cell); confirm
            against the data.
        S (int): split size of the image
        B (int): number of boxes per cell
        C (int): number of classes
        transform (callable): optional, called as transform(image, boxes)
            and must return (image, boxes)
    """

    def __init__(
        self, csv_file, img_dir, label_dir, S=7, B=2, C=20, transform=None,
    ):
        self.annotations = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.label_dir = label_dir
        self.transform = transform
        self.S = S
        self.B = B
        self.C = C

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, index):
        """Return the (transformed) image and its (S, S, C + 5 * B) label matrix."""
        label_path = os.path.join(self.label_dir, self.annotations.iloc[index, 1])
        boxes = self._read_boxes(label_path)

        img_path = os.path.join(self.img_dir, self.annotations.iloc[index, 0])
        image = Image.open(img_path)
        boxes = torch.tensor(boxes)

        if self.transform:
            image, boxes = self.transform(image, boxes)

        return image, self._encode_boxes(boxes)

    @staticmethod
    def _read_boxes(label_path):
        """Parse a label file into a list of [class, x, y, width, height]."""
        boxes = []
        with open(label_path) as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                # Parse everything as float first: int("1.0") would raise
                # for whole-number values written with a decimal point.
                class_label, x, y, width, height = (float(v) for v in fields)
                boxes.append([int(class_label), x, y, width, height])
        return boxes

    def _encode_boxes(self, boxes):
        """Convert image-relative boxes into the per-cell label matrix."""
        label_matrix = torch.zeros((self.S, self.S, self.C + 5 * self.B))
        for box in boxes:
            class_label, x, y, width, height = box.tolist()
            class_label = int(class_label)

            # i,j represents the cell row and cell column. A coordinate of
            # exactly 1.0 would give index S, so clamp to the last cell.
            i = min(int(self.S * y), self.S - 1)
            j = min(int(self.S * x), self.S - 1)
            x_cell, y_cell = self.S * x - j, self.S * y - i

            # Width and height relative to one cell: a cell spans 1/S of
            # the image, so the image-relative size is multiplied by S.
            width_cell, height_cell = (
                width * self.S,
                height * self.S,
            )

            # If no object already found for specific cell i,j
            # Note: This means we restrict to ONE object
            # per cell!
            if label_matrix[i, j, self.C] == 0:
                # Set that there exists an object
                label_matrix[i, j, self.C] = 1

                # Box coordinates
                label_matrix[i, j, self.C + 1:self.C + 5] = torch.tensor(
                    [x_cell, y_cell, width_cell, height_cell]
                )

                # Set one hot encoding for class_label
                label_matrix[i, j, class_label] = 1

        return label_matrix

# **Object Detection and Trajectory Forecasting**

    YOLO Metric --> IoU + Non Max suppression + mAP

      [ Unit testing using unittest ]

    YOLO model/architecture --> CNN and Fully Connected Layer

    Loss Function

    Trajectory forecasting (not yet started)


# Module-wise implementation in PyTorch

---



#### Intersection over Union (IoU)


> IoU = Intersection Area/Union Area

    If IoU > 0.5 "decent detection"

    IoU > 0.7 "pretty good"

    IoU > 0.9 "mind-blowing"

    IoU = 1 "naah, that ain't gonna happen ;)"

**Bounding boxes are represented as coordinates of the top left corner and bottom right corner**

Box1 = [x1,y1,x2,y2]

Box2 = [x1,y1,x2,y2]

Intersection coordinates = [

    x1 = max( Box1[0],Box2[0] )

    y1 = max( Box1[1],Box2[1] )

    x2 = min( Box1[2],Box2[2] )
    
    y2 = min( Box1[3],Box2[3] )

]



In [12]:
import torch

def intersection_over_union(boxes_preds, boxes_labels, box_format="midpoint"):
  """
  Calculates intersection over union, element-wise over all boxes.

  boxes_preds are of size (..., 4) --> the last dimension holds the coordinates of each bounding box
  boxes_labels are of the same size as boxes_preds
  box_format is "midpoint" for [ x,y,w,h ] or "corners" for [ x1,y1,x2,y2 ]

  Returns a tensor of size (..., 1) with the IoU of every prediction/label pair.
  Raises ValueError for any other box_format.
  """

  ### Coordinates for the bounding region ###

  # Slices such as 0:1 keep the last dimension, so every box of the input is
  # processed (not only the first one) and the result has size (..., 1).
  if box_format == "midpoint": # --> format is [ x,y,w,h ]
    box1_x1 = boxes_preds[..., 0:1] - boxes_preds[..., 2:3] / 2
    box1_y1 = boxes_preds[..., 1:2] - boxes_preds[..., 3:4] / 2
    box1_x2 = boxes_preds[..., 0:1] + boxes_preds[..., 2:3] / 2
    box1_y2 = boxes_preds[..., 1:2] + boxes_preds[..., 3:4] / 2

    box2_x1 = boxes_labels[..., 0:1] - boxes_labels[..., 2:3] / 2
    box2_y1 = boxes_labels[..., 1:2] - boxes_labels[..., 3:4] / 2
    box2_x2 = boxes_labels[..., 0:1] + boxes_labels[..., 2:3] / 2
    box2_y2 = boxes_labels[..., 1:2] + boxes_labels[..., 3:4] / 2

  elif box_format == "corners": # --> format is [ x1,y1,x2,y2 ]
    box1_x1 = boxes_preds[..., 0:1]
    box1_y1 = boxes_preds[..., 1:2]
    box1_x2 = boxes_preds[..., 2:3]
    box1_y2 = boxes_preds[..., 3:4]

    box2_x1 = boxes_labels[..., 0:1]
    box2_y1 = boxes_labels[..., 1:2]
    box2_x2 = boxes_labels[..., 2:3]
    box2_y2 = boxes_labels[..., 3:4]

  else:
    raise ValueError(
        f"box_format must be 'midpoint' or 'corners', got {box_format!r}"
    )

  ### Coordinates for the intersection region ###

  # Top-left corner is the max of the two top-left corners,
  # bottom-right corner is the min of the two bottom-right corners.
  x1 = torch.max(box1_x1, box2_x1)
  y1 = torch.max(box1_y1, box2_y1)
  x2 = torch.min(box1_x2, box2_x2)
  y2 = torch.min(box1_y2, box2_y2)

  # .clamp(0) is used in cases when there is no intersection and so the area is 0
  intersection_area = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)

  box1_area = abs((box1_x2 - box1_x1) * (box1_y2 - box1_y1))
  box2_area = abs((box2_x2 - box2_x1) * (box2_y2 - box2_y1))

  # The small constant avoids division by zero for two empty boxes.
  return intersection_area / (box1_area + box2_area - intersection_area + 1e-6)


#### Non max suppression

> *Perform the following steps for every identified object one at a time*

    * Discard all the bounding boxes whose probability is below the probability threshold ( a hyperparameter of the model )
    * Take out the bounding box with the highest probability
    * Remove every other box whose IoU with that highest-probability box is > the IoU threshold, then repeat with the remaining boxes until none are left

[Analytics Vidhya --> Selecting the Right Bounding Box Using Non-Max Suppression](https://www.analyticsvidhya.com/blog/2020/08/selecting-the-right-bounding-box-using-non-max-suppression-with-implementation/)

In [13]:
import torch

def non_max_suppression(
    bboxes,
    iou_threshold,
    prob_threshold,
    box_format="corners"
):
  '''
    Per-class Non Max Suppression.

    bboxes --> [ [ class, prob, <four box coordinates> ], []... ]
    iou_threshold --> a box is dropped when its IoU with an already kept box of the same class is not below this value
    prob_threshold --> boxes whose prob is not above this value are discarded first
    box_format --> "midpoint" or "corners", passed on to intersection_over_union

    Returns the kept boxes in descending order of prob.
  '''

  assert type(bboxes) == list # to check if the predictions is a list

  # Confident boxes only, most probable first
  candidates = sorted(
      ( entry for entry in bboxes if entry[1] > prob_threshold ),
      key=lambda entry : entry[1],
      reverse=True
  )
  kept = []

  while candidates:
    best, rest = candidates[0], candidates[1:]
    kept.append(best)

    survivors = []
    for other in rest:
      # Boxes of a different class never suppress each other
      if other[0] != best[0]:
        survivors.append(other)
        continue

      overlap = intersection_over_union(
          torch.tensor( best[2:] ),
          torch.tensor( other[2:] ),
          box_format=box_format
      )
      if overlap < iou_threshold:
        survivors.append(other)

    candidates = survivors

  return kept

#### Mean Average Precision

> Metric used to validate a model's capability to perform in testing/validation.

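    Confidence --> probability value that the detected object belongs to a specific class.
    TP (True Positive) --> a predicted box that matches a target box (IoU above the threshold)
    FP (False Positive) --> a predicted box that matches no target box (wrong prediction)
    FN (False Negative) --> a target box that was not detected, so TP + FN = total no. of target bounding boxes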
> Precision and Recall

    Precision --> TP/(TP+FP) Out of all the detected bounding boxes, what fraction is correct?

    Recall --> TP/(TP+FN) Out of all the target bounding boxes, what fraction did we correctly detect?


In [16]:
import torch
from collections import Counter

def mean_average_precision(
    pred_boxes,
    true_boxes,
    iou_threshold=0.5,
    box_format="corners",
    num_classes=20
):
  '''
    Mean Average Precision over all classes at a single IoU threshold.

    pred_boxes    --> [ [ train_idx, class_pred, prob_score, x1, y1, x2, y2 ],... ]
    true_boxes    --> same layout as pred_boxes (image index at [0], class at [1], coordinates from [3] on)
    iou_threshold --> a detection is a true positive only if its best IoU is above this value
    box_format    --> passed through unchanged to intersection_over_union
    num_classes   --> classes 0..num_classes-1 are evaluated

    Returns the mean of the per-class average precisions, taken over the classes
    that have at least one ground truth box. Returns a zero tensor when no class
    has any ground truth box.
  '''
  average_precisions = []
  epsilon = 1e-6 # keeps the divisions below numerically stable

  for c in range(num_classes):

    # Taking all the detections and ground truths of a specific class 'c'
    detections = [ detect for detect in pred_boxes if detect[1]==c ]
    ground_truths = [ ground for ground in true_boxes if ground[1]==c ]

    # A class without ground truths has no defined recall, so it is skipped
    total_true_bboxes = len(ground_truths)
    if total_true_bboxes==0:
      continue

    # Group the ground truths by image once. Format --> { 0:[gt,gt,gt], 1:[gt,...],... }
    gts_by_image = {}
    for gt in ground_truths:
      gts_by_image.setdefault(gt[0], []).append(gt)

    # One flag per ground truth, set to 1 once a detection has claimed it.
    # Format --> { 0:torch.tensor([0,0,0]) , 1:torch.tensor([0,0,0,0,0]),... }
    matched = { img : torch.zeros(len(gts)) for img,gts in gts_by_image.items() }

    # Sorting detections in the descending order of their probabilities
    detections.sort( key=lambda x : x[2], reverse=True )

    TP = torch.zeros(len(detections))
    FP = torch.zeros(len(detections))

    for detection_idx,detection in enumerate(detections):
      # Ground truths of the same image (and, by construction, the same class)
      ground_truth_img = gts_by_image.get(detection[0], [])

      best_iou = 0
      best_gt_idx = -1 # stays -1 when no ground truth overlaps this detection

      for idx,gt in enumerate(ground_truth_img):
        iou=intersection_over_union(
            torch.tensor(detection[3:]),
            torch.tensor(gt[3:]),
            box_format=box_format
        )

        if iou > best_iou:
          best_iou=iou
          best_gt_idx=idx

      # True positive only for the first detection that claims a ground truth;
      # later detections of the same ground truth count as false positives.
      is_match = (
          best_gt_idx >= 0
          and best_iou > iou_threshold
          and matched[detection[0]][best_gt_idx]==0
      )
      if is_match:
        matched[detection[0]][best_gt_idx]=1
        TP[detection_idx]=1
      else:
        FP[detection_idx]=1

    TP_cumsum=torch.cumsum(TP,dim=0)
    FP_cumsum=torch.cumsum(FP,dim=0)

    recalls = TP_cumsum / (total_true_bboxes+epsilon)
    precisions = TP_cumsum / (TP_cumsum+FP_cumsum+epsilon)

    # Prepend the point (recall=0, precision=1) so the curve starts on the y-axis
    precisions = torch.cat( (torch.tensor([1.0]),precisions) )
    recalls = torch.cat( (torch.tensor([0.0]),recalls) )

    # Area under the precision-recall curve
    average_precisions.append(torch.trapz(precisions,recalls))

  if not average_precisions:
    return torch.tensor(0.0)

  return sum(average_precisions)/len(average_precisions)



# **YOLO model/architecture**

In [1]:
import torch
import torch.nn as nn

# Layer-by-layer description of the convolutional backbone. Entry shapes:
#   tuple --> ( kernel_size, num_filters, stride, padding ) for one conv block
#   "M"   --> Maxpool of size 2x2 with stride 2
#   list  --> [ conv_tuple, conv_tuple, num_repeats ] : the pair of conv blocks repeated num_repeats times
architecture_config = [
    (7, 64, 2, 3),
    "M",
    (3, 192, 1, 1),
    "M",
    (1, 128, 1, 0),
    (3, 256, 1, 1),
    (1, 256, 1, 0),
    (3, 512, 1, 1),
    "M",
    [(1, 256, 1, 0), (3, 512, 1, 1), 4],
    (1, 512, 1, 0),
    (3, 1024, 1, 1),
    "M",
    [(1, 512, 1, 0), (3, 1024, 1, 1), 2],
    (3, 1024, 1, 1),
    (3, 1024, 2, 1),
    (3, 1024, 1, 1),
    (3, 1024, 1, 1),
]

class CNN(nn.Module):
  '''
    Basic convolution block : Conv2d (without bias) --> BatchNorm2d --> LeakyReLU(0.1)

    in_channels  --> number of channels of the input feature map
    out_channels --> number of filters / channels of the output feature map
    **kwargs     --> forwarded to nn.Conv2d ( kernel_size is required; stride, padding,... are optional )
  '''
  def __init__(self,in_channels,out_channels,**kwargs):
    super(CNN,self).__init__()
    # The conv bias is switched off; the BatchNorm2d right after it has its own learnable shift
    self.conv = nn.Conv2d(in_channels,out_channels, bias=False, **kwargs)
    self.batchnorm = nn.BatchNorm2d(out_channels)
    self.leakyrelu = nn.LeakyReLU(0.1)

  def forward(self,x):
    # conv --> batchnorm --> leaky relu
    return self.leakyrelu(self.batchnorm(self.conv(x)))
  
class Yolov1(nn.Module):
  '''
    YOLOv1 network : a convolutional backbone built from architecture_config,
    followed by a fully connected head.

    in_channels --> number of channels of the input image (default 3)
    **kwargs    --> forwarded to _create_fcs; must provide split_size, num_boxes and num_classes

    The head outputs a flat vector of size S*S*(C+B*5) per image.
  '''
  def __init__(self,in_channels=3,**kwargs):
    super(Yolov1,self).__init__()
    self.architecture = architecture_config
    self.in_channels = in_channels
    self.darknet = self._create_conv_layers(self.architecture)
    self.fcs = self._create_fcs(**kwargs)

  def forward(self,x):
    # The head starts with nn.Flatten, so the backbone output can be fed in directly
    return self.fcs(self.darknet(x))

  def _create_conv_layers(self,architecture):
    '''
      Translate the architecture description into an nn.Sequential.
      tuple --> one CNN block, "M" (any str) --> 2x2 maxpool, list --> repeated pair of CNN blocks
    '''
    layers=[]
    in_channels = self.in_channels # channels feeding the next layer

    for x in architecture:
      if isinstance(x,tuple):
        layers.append(
            CNN( in_channels, x[1], kernel_size=x[0], stride=x[2], padding=x[3] )
        )
        in_channels = x[1]

      elif isinstance(x,str):
        layers.append( nn.MaxPool2d(kernel_size=2,stride=2) )

      elif isinstance(x,list):
        conv1, conv2, num_repeats = x[0], x[1], x[2]

        for _ in range(num_repeats):
          layers.append(
              CNN( in_channels, conv1[1], kernel_size=conv1[0], stride=conv1[2], padding=conv1[3] )
          )
          layers.append(
              CNN( conv1[1], conv2[1], kernel_size=conv2[0], stride=conv2[2], padding=conv2[3] )
          )
          # The next repeat (or layer) consumes the output of the second conv
          in_channels = conv2[1]

    return nn.Sequential(*layers)

  def _create_fcs(self, split_size, num_boxes, num_classes):
    '''
      Build the fully connected head.
      split_size (S) --> grid size, num_boxes (B) --> boxes per cell, num_classes (C) --> number of classes
    '''
    S, B, C = split_size, num_boxes, num_classes
    return nn.Sequential(
        nn.Flatten(),
        # Expects a backbone output of 1024 channels on an S x S grid
        nn.Linear(1024*S*S,496),
        nn.Dropout(0.0),
        nn.LeakyReLU(0.1),
        # Per cell : C class scores + B boxes of 5 values each
        nn.Linear(496,S*S*(C+B*5)),
    )

# **YOLO Loss function**

# **Unit Testing of individual modules**

    IoU
    
    Non-max suppression
    
    mAP

# Rough work

In [None]:
# # Rough work
import torch
# from collections import Counter

# a = [
#      [1,5],
#      [2,1],
#      [5,3],
#      [1,2],
#      [2,4],
#      [5,4]
# ]

# b= Counter([b[0] for b in a])

# for k,v in b.items():
#   b[k]=torch.zeros(v)

# # print(b.items())
# print(b.items())
# print(torch.zeros(5))

# aa = [1,2,3,4,5]

# for i,j in enumerate(aa):
#   print(i,j)

# tpp = torch.zeros(5)
# Toy check : cumulative sum turns per-detection hit flags into a running count of hits
tp = torch.tensor([1, 1, 1, 0, 0])
tp1 = tp.cumsum(dim=0)
print(tp, tp1, sep="\n")

# precision = torch.cumsum()
# prec = torch.cat(torch.tensor([1]),precision)
# print(prec)