In [10]:
# main code will be on kaggle
import numpy as np
import pandas as pd
import matplotlib.pyplot  as plt
from matplotlib import style

import torch 
import torch.nn as nn
from torchsummary import summary

import os 
from PIL import Image

In [2]:
device = 'cpu' # CPU for now; device selection will be revisited later
device

### Important Points Regarding the YOLO algorithm
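* It is a fully convolutional neural network.
* No pooling layers are used in the network; instead, convolutional layers with stride 2 are used for downsampling.
* The output of the YOLO algorithm has shape (batch_size, height, width, B, 5 + C), where B is the number of bounding boxes per cell and C is the number of categories. The extra 5 values per box are the confidence (objectness) score and the four coordinates of the bounding box.
* Note: the network built below differs from the points above. It uses max-pooling layers (`"M"` in the config), ends with fully connected layers, and for the 448×448 input returns a flattened vector of size 7 * 7 * (C + 5 * B) per image.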

In [24]:
# Backbone layout consumed by the model-building code below.
# - A list entry describes a group of convolutions: every element except the last is a
#   (kernel_size, out_channels, stride, padding) tuple, and the final integer is the number
#   of times that sequence of tuples is repeated.
# - The string "M" marks a max-pooling layer.
input_shape = (3, 448, 448)  # (channels, height, width)
num_boxes = 2  # B: bounding boxes predicted per grid cell
num_classes = 20  # C: number of object categories
model_config = [
    [(7, 64, 2, 3), 1],
    "M",
    [(3, 192, 1, 1), 1],
    "M",
    [(1, 128, 1, 0), 1], 
    [(3, 256, 1, 1), 1],
    [(1, 256, 1, 0), 1],
    [(3, 512, 1, 1), 1],
    "M",
    [(1, 256, 1, 0), (3, 512, 1, 1), 4],
    [(1, 512, 1, 0), 1],
    [(3, 1024, 1, 1), 1],
    "M",
    [(1, 512, 1, 0), (3, 1024, 1, 1), 2],
    [(3, 1024, 1, 1), 1],
    [(3, 1024, 2, 1), 1],
    [(3, 1024, 1, 1), 1],
    [(3, 1024, 1, 1), 1]
]

In [25]:
class conv_block(nn.Module):
    """A stack of Conv2d -> BatchNorm2d -> ReLU triples built from one config entry.

    Args:
        in_channels: number of channels of the tensor fed to the first convolution.
        layers: a sequence whose leading elements are
            ``(kernel_size, out_channels, stride, padding)`` tuples and whose last
            element is the number of times that tuple sequence is repeated.
    """

    def __init__(self, in_channels, layers):
        super().__init__()
        repeats = layers[-1]
        conv_specs = layers[:-1]

        modules = []
        current_channels = in_channels
        for _ in range(repeats):
            for kernel_size, out_channels, stride, padding in conv_specs:
                modules.extend([
                    nn.Conv2d(
                        in_channels=current_channels,
                        out_channels=out_channels,
                        kernel_size=kernel_size,
                        stride=stride,
                        padding=padding,
                    ),
                    nn.BatchNorm2d(out_channels),
                    nn.ReLU(),
                ])
                # The next convolution consumes what this one produced.
                current_channels = out_channels
        self.conv = nn.Sequential(*modules)

    def forward(self, x):
        """Run ``x`` through every layer of the block in order."""
        features = self.conv(x)
        return features

In [26]:
# Sanity check: build only the first conv block of the config and print its layer summary.
# torchsummary defaults to device="cuda", so the notebook's device is passed explicitly to
# keep the dummy input on the same device as the model.
model = conv_block(in_channels=input_shape[0], layers=model_config[0]).to(device)
summary(model, input_shape, device=device)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 224, 224]           9,472
       BatchNorm2d-2         [-1, 64, 224, 224]             128
              ReLU-3         [-1, 64, 224, 224]               0
Total params: 9,600
Trainable params: 9,600
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 2.30
Forward/backward pass size (MB): 73.50
Params size (MB): 0.04
Estimated Total Size (MB): 75.83
----------------------------------------------------------------


In [33]:
class YOLO(nn.Module):
    """YOLOv1-style detector: a convolutional backbone followed by a two-layer fully connected head.

    Args:
        input_shape: ``(channels, height, width)`` of a single input image.
        model_config: backbone description. Each entry is either the string ``'M'``
            (a 2x2 max-pool with stride 2) or a list whose leading elements are
            ``(kernel_size, out_channels, stride, padding)`` tuples and whose last
            element is the number of times that tuple sequence is repeated.
        num_boxes: number of bounding boxes predicted per grid cell (B).
        num_classes: number of object categories (C).
        hidden_features: width of the hidden fully connected layer. Defaults to 512,
            kept small so the model stays feasible to train.

    Attributes:
        grid_size: ``(rows, cols)`` of the backbone's final feature map, derived from
            ``input_shape`` and ``model_config``.

    Raises:
        ValueError: if the backbone would reduce the input to an empty feature map.
    """

    @staticmethod
    def _conv_out_size(size, kernel_size, stride, padding):
        """Spatial size produced by a convolution with dilation 1 on an input of ``size``."""
        return (size + 2 * padding - kernel_size) // stride + 1

    def __init__(self, input_shape, model_config, num_boxes, num_classes, hidden_features=512):
        super(YOLO, self).__init__()
        in_channels, height, width = input_shape
        blocks = []
        for layers in model_config:
            if layers == 'M':
                blocks.append(nn.MaxPool2d(kernel_size=2, stride=2))
                # A 2x2 / stride-2 max-pool halves each spatial dimension (rounding down).
                height, width = height // 2, width // 2
            else:
                blocks.append(conv_block(in_channels=in_channels, layers=layers))
                # layers[-2] is the last conv tuple of the group; index 1 is its out_channels.
                in_channels = layers[-2][1]
                # Track the feature-map size through every convolution of the group so the
                # head can be sized from input_shape instead of a hard-coded 7x7 grid.
                for _ in range(layers[-1]):
                    for kernel_size, _out_channels, stride, padding in layers[:-1]:
                        height = self._conv_out_size(height, kernel_size, stride, padding)
                        width = self._conv_out_size(width, kernel_size, stride, padding)

        if height <= 0 or width <= 0:
            raise ValueError(
                f"input_shape {tuple(input_shape)} is too small for model_config: "
                f"the backbone output would be {height}x{width}"
            )
        self.grid_size = (height, width)
        num_cells = height * width

        self.conv = nn.Sequential(*blocks)
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features=num_cells * in_channels, out_features=hidden_features),
            nn.Dropout(),
            nn.ReLU(),

            nn.Linear(in_features=hidden_features,
                      out_features=num_cells * (num_classes + 5 * num_boxes))
        )

    def forward(self, x):
        """Map a batch of images to flattened per-cell predictions.

        Args:
            x: tensor of shape ``(batch_size, channels, height, width)`` matching the
                ``input_shape`` given at construction.

        Returns:
            Tensor of shape ``(batch_size, rows * cols * (num_classes + 5 * num_boxes))``
            where ``(rows, cols)`` is ``self.grid_size``.
        """
        x = self.conv(x)  # (batch_size, backbone_channels, rows, cols)
        x = self.fc(x)  # (batch_size, rows * cols * (num_classes + 5 * num_boxes))
        return x

In [34]:
# Build the full network and print its layer summary.
# torchsummary defaults to device="cuda", so the notebook's device is passed explicitly to
# keep the dummy input on the same device as the model.
yolo_model = YOLO(
    input_shape=input_shape,
    model_config=model_config,
    num_boxes=num_boxes,
    num_classes=num_classes,
).to(device)
summary(yolo_model, input_shape, device=device)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 224, 224]           9,472
       BatchNorm2d-2         [-1, 64, 224, 224]             128
              ReLU-3         [-1, 64, 224, 224]               0
        conv_block-4         [-1, 64, 224, 224]               0
         MaxPool2d-5         [-1, 64, 112, 112]               0
            Conv2d-6        [-1, 192, 112, 112]         110,784
       BatchNorm2d-7        [-1, 192, 112, 112]             384
              ReLU-8        [-1, 192, 112, 112]               0
        conv_block-9        [-1, 192, 112, 112]               0
        MaxPool2d-10          [-1, 192, 56, 56]               0
           Conv2d-11          [-1, 128, 56, 56]          24,704
      BatchNorm2d-12          [-1, 128, 56, 56]             256
             ReLU-13          [-1, 128, 56, 56]               0
       conv_block-14          [-1, 128,

In [35]:
def loss_fn():
    """Placeholder for the training loss; not implemented yet.

    Takes no arguments and returns ``None``.
    """
    return None