In [3]:
import torch
from torch import nn


## YOLOv1 model

In [4]:
# Layer specification consumed by YoloV1._create_conv_layers. Entry kinds:
#   tuple -> one conv block: (kernel_size, out_channels, stride, padding)
#   str   -> a 2x2 max-pool with stride 2 (the string's value is not inspected)
#   list  -> [conv_tuple_1, conv_tuple_2, num_repeats]: the two conv blocks are
#            appended back-to-back, num_repeats times
architecture_config = [
    (7, 64, 2, 3),
    "M",
    (3, 192, 1, 1),
    "M",
    (1, 128, 1, 0),
    (3, 256, 1, 1),
    (1, 256, 1, 0),
    (3, 512, 1, 1),
    "M",
    [(1, 256, 1, 0), (3, 512, 1, 1), 4],
    (1, 512, 1, 0),
    (3, 1024, 1, 1),
    "M",
    [(1, 512, 1, 0), (3, 1024, 1, 1), 2],
    (3, 1024, 1, 1),
    (3, 1024, 2, 1),
    (3, 1024, 1, 1),
    (3, 1024, 1, 1)
]

In [5]:
class CNNBlock(nn.Module):
  """Bias-free Conv2d followed by BatchNorm2d and LeakyReLU(0.1).

  Args:
    in_channels: Number of channels of the incoming feature map.
    out_channels: Number of channels produced by the convolution.
    **kwargs: Passed straight through to ``nn.Conv2d`` (for example
      ``kernel_size``, ``stride`` and ``padding``).
  """

  def __init__(self, in_channels, out_channels, **kwargs):
    super().__init__()
    # The convolution is created without a bias term; batch norm follows it.
    self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
    self.batch_norm = nn.BatchNorm2d(out_channels)
    self.leaky_relu = nn.LeakyReLU(negative_slope=0.1)

  def forward(self, x):
    """Apply convolution, then batch normalisation, then the activation."""
    features = self.conv(x)
    normalized = self.batch_norm(features)
    return self.leaky_relu(normalized)


In [6]:
class YoloV1(nn.Module):
  """YOLOv1-style network: a convolutional backbone plus a two-layer dense head.

  The backbone is built from the module-level ``architecture_config`` list.

  Args:
    in_channels: Number of channels of the input image tensor.
    **kwargs: Forwarded to ``_create_fcs``; must provide ``split_size``,
      ``num_boxes`` and ``num_classes``.
  """

  def __init__(self,
               in_channels=3,
               **kwargs):
    super().__init__()
    self.architecture = architecture_config
    self.in_channels = in_channels
    self.darknet = self._create_conv_layers(self.architecture)
    self.fcs = self._create_fcs(**kwargs)

  def forward(self, x):
    """Run the backbone, flatten per sample, and apply the dense head.

    Returns a tensor with one row per sample and S*S*(C+B*5) columns.
    """
    x = self.darknet(x)
    return self.fcs(torch.flatten(x, start_dim=1))

  def _create_conv_layers(self, architecture):
    """Translate the architecture list into an ``nn.Sequential`` backbone.

    Entry kinds:
      * tuple ``(kernel_size, out_channels, stride, padding)`` -> one CNNBlock
      * str -> ``nn.MaxPool2d(kernel_size=2, stride=2)``
      * list ``[conv_a, conv_b, num_repeats]`` -> the two CNNBlocks appended
        back-to-back ``num_repeats`` times

    Raises:
      ValueError: if an entry is none of the kinds above. Such entries used to
        be skipped silently, which hid typos in the configuration.
    """
    layers = []
    in_channels = self.in_channels

    def append_conv(spec, channels_in):
      # Append one conv block and report its output channel count so the
      # caller can chain it into the next layer.
      layers.append(CNNBlock(in_channels=channels_in,
                             out_channels=spec[1],
                             kernel_size=spec[0],
                             stride=spec[2],
                             padding=spec[3]))
      return spec[1]

    for entry in architecture:
      if isinstance(entry, tuple):
        in_channels = append_conv(entry, in_channels)
      elif isinstance(entry, str):
        layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
      elif isinstance(entry, list):
        conv_a, conv_b, num_repeats = entry[0], entry[1], entry[2]
        for _ in range(num_repeats):
          in_channels = append_conv(conv_a, in_channels)
          in_channels = append_conv(conv_b, in_channels)
      else:
        raise ValueError(
            f"Unsupported architecture entry {entry!r}: expected a tuple, "
            "a str, or a list")

    return nn.Sequential(*layers)

  def _create_fcs(self, split_size, num_boxes, num_classes):
    """Build the dense head mapping backbone features to S*S*(C+B*5) outputs.

    Args:
      split_size: Grid size S; the head expects a 1024 x S x S feature map.
      num_boxes: Boxes predicted per cell (B).
      num_classes: Number of classes (C).
    """
    S, B, C = split_size, num_boxes, num_classes
    return nn.Sequential(
        # forward() already flattens its input, so this layer is a no-op there;
        # removing it would shift the indices of the layers below.
        nn.Flatten(),
        # 1024 is the channel count of the last conv block in architecture_config.
        # NOTE(review): hidden width is 496; the YOLOv1 paper uses 4096 —
        # presumably reduced on purpose, confirm.
        nn.Linear(1024*S*S, 496),
        # p=0.0 makes this dropout layer a no-op.
        nn.Dropout(0.0),
        nn.LeakyReLU(0.1),
        nn.Linear(496, S*S*(C+B*5)),
    )

In [7]:
# Smoke test: push a random batch of two 3x448x448 inputs through the network.
# With S=7, B=2, C=20 the head emits 7*7*(20+2*5) = 1470 values per sample.
test = torch.randn(size=(2,3, 448, 448))
model = YoloV1(split_size=7, num_boxes=2, num_classes=20)
model(test).shape

torch.Size([2, 1470])

## Loss function

In [8]:
!wget https://raw.githubusercontent.com/aladdinpersson/Machine-Learning-Collection/master/ML/Pytorch/object_detection/YOLO/utils.py

--2024-07-09 18:35:57--  https://raw.githubusercontent.com/aladdinpersson/Machine-Learning-Collection/master/ML/Pytorch/object_detection/YOLO/utils.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11922 (12K) [text/plain]
Saving to: 'utils.py'


2024-07-09 18:35:58 (57.2 MB/s) - 'utils.py' saved [11922/11922]



In [9]:
import torch
import torch.nn as nn
from utils import intersection_over_union

In [10]:
class YoloLoss(nn.Module):
  """Sum-of-squared-errors detection loss over an S x S grid.

  Per-cell channel layout (last dimension), with C = number of classes:
    [0:C]        class scores
    [C]          confidence of box 1
    [C+1:C+5]    box 1 coordinates (x, y, w, h)
    [C+5]        confidence of box 2
    [C+6:C+10]   box 2 coordinates (x, y, w, h)
  Targets carry a single box per cell in the box-1 slots.

  Args:
    S: Grid size.
    B: Boxes per cell. Only used to size the reshape; ``forward`` always reads
      exactly two predicted boxes.
    C: Number of classes.
  """

  def __init__(self, S=7, B=2, C=20):
    super().__init__()
    self.mse = nn.MSELoss(reduction="sum")
    self.S = S
    self.B = B
    self.C = C
    self.lambda_noobj = 0.5
    self.lambda_coord = 5

  def forward(self, predictions, target):
    """Compute the weighted sum of box, object, no-object and class losses.

    Args:
      predictions: Tensor reshaped here to (-1, S, S, C + B*5).
      target: Tensor indexed with the same channel layout; expected to be
        4-dimensional (the existence mask is unsqueezed at dim 3).

    Returns:
      Scalar tensor: lambda_coord * box + object + lambda_noobj * no_object
      + class.
    """
    C = self.C
    conf1, box1 = slice(C, C + 1), slice(C + 1, C + 5)
    conf2, box2 = slice(C + 5, C + 6), slice(C + 6, C + 10)

    predictions = predictions.reshape(-1, self.S, self.S, C + self.B*5)

    # intersection_over_union comes from the downloaded utils module;
    # presumably it returns one IoU per cell with a trailing singleton
    # dimension — verify against utils.py.
    iou_b1 = intersection_over_union(predictions[..., box1], target[..., box1])
    iou_b2 = intersection_over_union(predictions[..., box2], target[..., box1])
    ious = torch.cat([iou_b1.unsqueeze(0), iou_b2.unsqueeze(0)], dim=0)
    # best_box is 0 where box 1 has the higher IoU and 1 where box 2 does.
    _, best_box = torch.max(ious, dim=0)
    # 1 in cells that contain an object, 0 elsewhere.
    exists_box = target[..., C].unsqueeze(3)

    ## FOR BOX COORDINATES

    box_predictions = exists_box * (best_box * predictions[..., box2]
                                    + (1 - best_box) * predictions[..., box1])
    # Signed square root of width/height: sign(w) * sqrt(|w + eps|). The sign
    # must multiply the square root; taking sign() of the whole product would
    # discard the magnitude (and fails to broadcast 4 columns against 2).
    box_predictions[..., 2:4] = torch.sign(box_predictions[..., 2:4]) * torch.sqrt(
        torch.abs(box_predictions[..., 2:4] + 1e-6))

    box_targets = exists_box * target[..., box1]
    box_targets[..., 2:4] = torch.sqrt(box_targets[..., 2:4])

    box_loss = self.mse(torch.flatten(box_predictions, end_dim=-2),
                        torch.flatten(box_targets, end_dim=-2))

    ## FOR OBJECT LOSS
    pred_box = (best_box * predictions[..., conf2]
                + (1 - best_box) * predictions[..., conf1])
    object_loss = self.mse(torch.flatten(exists_box * pred_box),
                           torch.flatten(exists_box * target[..., conf1]))

    ## FOR NO OBJECT LOSS
    # Both predicted confidences are penalised in empty cells, so the two
    # terms are accumulated rather than the second replacing the first.
    no_object_loss = self.mse(
        torch.flatten((1 - exists_box) * predictions[..., conf1], start_dim=1),
        torch.flatten((1 - exists_box) * target[..., conf1], start_dim=1))
    no_object_loss = no_object_loss + self.mse(
        torch.flatten((1 - exists_box) * predictions[..., conf2], start_dim=1),
        torch.flatten((1 - exists_box) * target[..., conf1], start_dim=1))

    ## FOR CLASS LOSS
    class_loss = self.mse(torch.flatten(exists_box * predictions[..., :C], end_dim=-2),
                          torch.flatten(exists_box * target[..., :C], end_dim=-2))

    loss = (self.lambda_coord * box_loss
            + object_loss
            + self.lambda_noobj * no_object_loss
            + class_loss)

    return loss

## Loading the dataset

In [20]:
%%writefile dataset.py

import torch
import os
import pandas as pd
from PIL import Image

class VOCDataset(torch.utils.data.Dataset):
    """Dataset yielding ``(image, label_matrix)`` pairs for a YOLO-style grid.

    Args:
        csv_file: CSV whose first column holds image file names and whose
            second column holds label file names (read positionally).
        img_dir: Directory containing the image files.
        label_dir: Directory containing the label text files. Each non-blank
            line holds five whitespace-separated numbers:
            ``class x y width height``.
        S: Grid size.
        B: Boxes per cell (only affects the label matrix depth).
        C: Number of classes.
        transform: Optional callable ``(image, boxes) -> (image, boxes)``.
    """

    def __init__(self,
                 csv_file,
                 img_dir,
                 label_dir,
                 S=7,
                 B=2,
                 C=20,
                 transform=None):
        # NOTE(review): read_csv treats the first row as a header; if the CSV
        # has no header row the first sample is silently dropped — confirm.
        self.annotations = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.label_dir = label_dir
        self.transform = transform
        self.S = S
        self.B = B
        self.C = C

    def __len__(self):
        """Number of rows in the annotations CSV."""
        return len(self.annotations)

    def __getitem__(self, index):
        """Load one sample and encode its boxes into an (S, S, C + 5*B) tensor.

        Per-cell layout: ``[0:C]`` one-hot class, ``[C]`` objectness flag,
        ``[C+1:C+5]`` box as (x_cell, y_cell, width_cell, height_cell). Only
        the first box that falls into a cell is kept.

        Raises:
            ValueError: if a non-blank label line does not have five fields.
        """
        label_path = os.path.join(self.label_dir, self.annotations.iloc[index, 1])
        boxes = []
        with open(label_path) as f:
            for line in f:
                fields = line.split()
                if not fields:
                    continue  # tolerate blank / trailing empty lines
                if len(fields) != 5:
                    raise ValueError(
                        f"Expected 5 values per label line in {label_path}, "
                        f"got {len(fields)}: {line!r}")
                # Parse via float so an integral value written as "1.0" works.
                boxes.append([float(value) for value in fields])

        img_path = os.path.join(self.img_dir, self.annotations.iloc[index, 0])
        image = Image.open(img_path)
        # reshape keeps a well-formed (0, 5) tensor when the label file is empty.
        boxes = torch.tensor(boxes, dtype=torch.float32).reshape(-1, 5)
        if self.transform:
            image, boxes = self.transform(image, boxes)

        label_matrix = torch.zeros((self.S, self.S, self.C + 5 * self.B))
        for box in boxes:
            class_label, x, y, width, height = box.tolist()
            class_label = int(class_label)
            # Row i comes from y and column j from x. Clamp so a coordinate of
            # exactly 1.0 lands in the last cell instead of indexing out of range.
            i = min(int(self.S * y), self.S - 1)
            j = min(int(self.S * x), self.S - 1)
            # Position relative to the cell, and size in units of cells.
            x_cell, y_cell = self.S * x - j, self.S * y - i
            width_cell, height_cell = width * self.S, height * self.S

            if label_matrix[i, j, self.C] == 0:
                label_matrix[i, j, self.C] = 1
                box_coordinates = torch.tensor([x_cell, y_cell, width_cell, height_cell])
                label_matrix[i, j, self.C + 1:self.C + 5] = box_coordinates
                label_matrix[i, j, class_label] = 1

        return image, label_matrix

Writing dataset.py


In [15]:
# Load train.csv for inspection.
# NOTE(review): in the cells shown here `pd` is only imported inside the
# dataset.py cell, not in the notebook kernel — confirm pandas is imported
# before this cell runs on a fresh kernel.
annotations = pd.read_csv("/kaggle/input/pascalvoc-yolo/train.csv")

In [18]:
# Show the first row by position.
# NOTE(review): the index labels in the displayed output look like file names,
# which suggests read_csv consumed the first data row as the header — confirm
# whether header=None is needed.
annotations.iloc[0]

000005.jpg    000007.jpg
000005.txt    000007.txt
Name: 0, dtype: object