In [None]:
# Colab only: mount Google Drive at /content/gdrive (the datasets below are read from there).
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
#!pip install efficientnet_pytorch
#!pip install torchinfo
#!pip install tensorboardX
#!pip install albumentations==0.4.6
#!pip install -U albumentations
import torch
import torchinfo
import torchvision
import torch.nn.functional as F

from torch import nn
from torchvision import transforms
from torch.cuda.amp import autocast,GradScaler
from torchvision.models.resnet import BasicBlock
from tensorboardX import SummaryWriter

In [None]:
!pip install -U albumentations



In [None]:

import os
import cv2
import PIL
import time
import numpy as np
import pandas as pd
import albumentations as A
from albumentations.pytorch import ToTensorV2
import matplotlib.pyplot as plt
from glob import glob
from tqdm import tqdm
from typing import Tuple,List,Dict
from xml.etree import cElementTree as etree

In [None]:
class yolov1_resnet(nn.Module):
    """YOLOv1-style detector built on an ImageNet-pretrained torchvision ResNet.

    The ResNet's convolutional stages act as the feature extractor. They are
    followed by four extra 3x3 conv/BN/ReLU blocks (the second one has stride 2)
    and a 1x1 convolution that emits 30 channels per grid cell.

    Args:
        layer: ResNet depth. 18, 34 and 50 are supported; any other value raises
            NotImplementedError.
    """

    def __init__(self, layer:int):
        super(yolov1_resnet,self).__init__()

        # (depth, constructor) pairs; only the matching constructor is called
        supported = (
            (18, torchvision.models.resnet18),
            (34, torchvision.models.resnet34),
            (50, torchvision.models.resnet50),
        )
        factory = next((make for depth, make in supported if layer == depth), None)
        if factory is None:
            raise NotImplementedError
        self.backbone = factory(pretrained=True)

        # the classification layer is never used (forward() stops after layer4)
        self.backbone.fc = nn.Identity()

        # as the paper suggests: a deeper conv stack on top of the backbone features
        head_in = 2048 if layer == 50 else 512
        head_spec = ((head_in, 1), (1024, 2), (1024, 1), (1024, 1))  # (in_channels, stride)
        head_layers = []
        for in_channels, stride in head_spec:
            head_layers += [
                nn.Conv2d(in_channels,1024,3,stride=stride,padding=1,bias=False),
                nn.BatchNorm2d(1024),
                nn.ReLU(inplace=True),
            ]
        self.conv = nn.Sequential(*head_layers)

        # a 1x1 conv replaces the fully connected head of the paper, which would
        # need far too many parameters
        self.conv_out = nn.Sequential(
            nn.Conv2d(1024,30,(1,1),stride=1)
        )

    def forward(self,x):
        """Run the backbone stages, the extra conv stack and the 1x1 output conv on `x`."""
        bb = self.backbone
        stages = (
            bb.conv1, bb.bn1, bb.relu, bb.maxpool,
            bb.layer1, bb.layer2, bb.layer3, bb.layer4,
        )
        feat = x
        for stage in stages:
            feat = stage(feat)

        return self.conv_out(self.conv(feat))

In [None]:
# Smoke test: every supported backbone must turn a 448x448 image into a 30x7x7 map.
# A plain shape comparison is used here; the previous `torch.testing.assert_equal`
# call ended in an AttributeError (presumably because that function is not available
# in the installed torch).
_expected_shape = torch.Size([1, 30, 7, 7])
for _layer in (18, 34, 50):
    with torch.no_grad():
        _out_shape = yolov1_resnet(_layer)(torch.randn(1,3,448,448)).shape
    assert _out_shape == _expected_shape, f'resnet{_layer}: got {tuple(_out_shape)}'

AttributeError: ignored

In [None]:
def generate_grid_train(grid_size: Tuple[int, int]) -> torch.Tensor:
    """Build the relative top-left coordinate of every grid cell.

    Predicted and target boxes store x/y as offsets inside a cell; adding this grid
    recovers image-relative coordinates.

    Args:
        grid_size: (rows, cols) of the grid, i.e. the two trailing dimensions of the
            feature map.

    Returns:
        Float tensor of shape (1, 2, rows, cols). Channel 0 is the cell's x position
        (column index / cols), channel 1 its y position (row index / rows).
    """
    rows, cols = grid_size
    # default meshgrid indexing: first output varies along dim 0 (rows), second along dim 1
    y_offset, x_offset = torch.meshgrid(torch.arange(rows),torch.arange(cols))

    # normalise each axis by its own extent so non-square grids are handled correctly
    x_rel = x_offset / cols
    y_rel = y_offset / rows

    return torch.stack([x_rel, y_rel], dim=0).unsqueeze(0) # 1, 2, rows, cols

In [None]:
def xywh_to_xyxy(coord: torch.Tensor) -> torch.Tensor:
    """Convert per-cell (x_offset, y_offset, w, h) boxes to corner format.

    Args:
        coord: tensor of shape (B, 4, S_y, S_x) holding x/y offsets inside each cell
            followed by width and height.

    Returns:
        Tensor of shape (B, 4, S_y, S_x) with channels (x1, y1, x2, y2).
    """
    _, _, sx, sy = coord.shape
    # build the grid on whatever device the input lives on (not a hard-coded 'cuda:0')
    grid_for_train = generate_grid_train((sx,sy)).to(coord.device) # 1, 2, 7, 7
    xy = coord[:,0:2,:,:]+grid_for_train # B, 2, 7, 7 (grid broadcasts over the batch)
    wh = coord[:,2:4,:,:] # B, 2, 7, 7

    return torch.cat([xy-wh/2,xy+wh/2],dim=1) # B, 4, 7, 7

In [None]:
def IOU(cbox1: torch.Tensor, cbox2: torch.Tensor) -> torch.Tensor:
    """Intersection over union of two box tensors, cell by cell.

    Both inputs use the (x_offset, y_offset, w, h) layout along dim 1.
    The result has one channel: the IoU for every batch element and grid cell.
    Cells without overlap are left at 0.
    """
    corners_a = xywh_to_xyxy(cbox1) # B, 4, 7, 7
    corners_b = xywh_to_xyxy(cbox2)

    # corners of the overlapping rectangle
    left = torch.max(corners_a[:,0:1,:,:],corners_b[:,0:1,:,:])
    top = torch.max(corners_a[:,1:2,:,:],corners_b[:,1:2,:,:])
    right = torch.min(corners_a[:,2:3,:,:],corners_b[:,2:3,:,:])
    bottom = torch.min(corners_a[:,3:4,:,:],corners_b[:,3:4,:,:])

    overlap_w = (right-left).clamp(min=0)
    overlap_h = (bottom-top).clamp(min=0)
    iou = overlap_w * overlap_h # N, 1, 7, 7 (holds the intersection area for now)

    # abs() because the first box's w/h may be negative
    area_a = abs(cbox1[:,2:3,:,:] * cbox1[:,3:4,:,:])
    area_b = cbox2[:,2:3,:,:]*cbox2[:,3:4,:,:]
    union = area_a + area_b - iou # N, 1, 7, 7

    # divide only where boxes overlap, so empty cells never divide by zero
    overlapping = iou.gt(0)
    iou[overlapping] = iou[overlapping] / union[overlapping]

    return iou

In [None]:
def RMSE(cbox1: torch.Tensor, cbox2: torch.Tensor) -> torch.Tensor:
    """Element-wise squared error between `cbox1` and `cbox2` with square-rooted w/h.

    Channels 2:4 of `cbox2` (width, height) are square-rooted before the comparison;
    `cbox1` is used as given. Neither input is modified: the square root is taken on
    a copy, which is treated as a constant target.

    Returns:
        Tensor with the shape of the inputs (`reduction='none'`); despite the name,
        no mean or root is applied.
    """
    with torch.no_grad():
        target = cbox2.clone()  # never write into the caller's tensor
        target[:,2:4,:,:] = target[:,2:4,:,:].sqrt()
    return F.mse_loss(cbox1,target, reduction='none')

In [None]:
class YoloLoss(nn.Module):
    """Sum-squared-error detection loss in the style of YOLOv1.

    Reads the notebook-level constants IMAGE_SIZE, NUM_BOXES and NUM_CLASSES and
    uses the IOU helper.

    Args:
        lambda_coord: weight of the box terms (xy and square-rooted wh).
        lambda_noobject: weight of the confidence term for boxes that are not
            responsible for an object.
    """

    def __init__(self, lambda_coord = 5, lambda_noobject = 0.5):
        super(YoloLoss,self).__init__()

        # one grid cell per 64 input pixels (448 -> 7)
        self.grid_size = IMAGE_SIZE//64
        self.lambda_coord = lambda_coord
        self.lambda_noobject = lambda_noobject
        self.epsilon = 1e-5  # keeps sqrt() away from 0, where its gradient is infinite

    def positive_box(self, prediction: torch.Tensor, groundtruth: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Pick, per grid cell, the predicted box with the highest IoU against the target.

        Returns:
            (max_iou, best_box), both shaped (N, NUM_BOXES, 1, S, S). `max_iou` repeats
            the best IoU for every box slot; `best_box` is a 0/1 indicator of the
            winning slot.
        """
        with torch.no_grad():
            # each predicted box is [conf, x, y, w, h]; skip the confidence channel
            ious = [IOU(prediction[:,(1+5*b):(5+5*b),:,:], groundtruth[:,1:5,:,:]) for b in range(NUM_BOXES)]
        ious = torch.cat(ious, dim=1) # N, 2, 7, 7

        max_iou, best_box = ious.max(dim=1,keepdim=True) # N,1,7,7
        best_box = torch.stack([best_box.eq(b).int() for b in range(NUM_BOXES)], 1) # N, 2, 1, 7, 7
        max_iou = torch.stack([max_iou for b in range(NUM_BOXES)], 1) # N, 2, 1, 7, 7

        return max_iou, best_box

    def build_target(self, groundtruth: List[tuple]) -> torch.Tensor:
        """Rasterise one image's boxes into a (5+NUM_CLASSES, S, S) target tensor.

        Args:
            groundtruth: iterable of (cx, cy, w, h, class_index), all relative to the
                image size.

        Returns:
            Tensor whose channel 0 is the objectness flag, channels 1:5 are
            (x_offset, y_offset, w, h) and channels 5: are the one-hot class.
            If several boxes fall into the same cell, the later one overwrites the
            box channels.
        """
        label = torch.zeros((5+NUM_CLASSES, self.grid_size, self.grid_size))
        last_cell = self.grid_size - 1

        for g in groundtruth:
            cx, cy, w, h, c = g
            c = int(c)

            # clamp so a centre exactly on the right/bottom border (1.0) lands in the
            # last cell instead of indexing out of range
            x_ind = min(int(cx*self.grid_size), last_cell)
            y_ind = min(int(cy*self.grid_size), last_cell)

            # offsets are taken relative to the chosen cell, so cell index and offset
            # always agree (a float modulo can disagree right at a cell boundary)
            x = max(cx - x_ind/self.grid_size, 0.0) # left offset
            y = max(cy - y_ind/self.grid_size, 0.0) # top offset

            label[0,y_ind,x_ind] = 1
            label[1:5,y_ind,x_ind] = torch.Tensor([x,y,w,h])
            label[5+c,y_ind,x_ind] = 1

        return label

    def forward(self, prediction: torch.Tensor, groundtruth: list) -> torch.Tensor:
        """Compute the total loss, summed over the batch.

        Args:
            prediction: (N, 5*NUM_BOXES + NUM_CLASSES, S, S); each box is laid out
                as [conf, x, y, w, h], followed by the class scores.
            groundtruth: one list of (cx, cy, w, h, class_index) per image.

        Returns:
            Scalar tensor: class + object + lambda_noobject * no-object
            + lambda_coord * (xy + wh) losses.
        """
        b, _, grid_y, grid_x = prediction.shape
        # targets follow the prediction's device rather than a hard-coded global
        groundtruth = torch.stack([self.build_target(g) for g in groundtruth],0).float().to(prediction.device)

        # iou indicator
        ious, best_box = self.positive_box(prediction, groundtruth) # N, 2, 1, 7, 7

        # predictions
        box_pred = prediction[:,:5*NUM_BOXES,:,:].reshape(b, NUM_BOXES, 5, grid_y, grid_x) # N, 2, 5, 7, 7
        xy_pred = box_pred[:,:,1:3,:,:]
        # signed square root: sqrt of the magnitude, then put the sign back
        wh_pred = box_pred[:,:,3:5,:,:].sign() * (box_pred[:,:,3:5,:,:].abs()+self.epsilon).sqrt()

        cls_pred = prediction[:,NUM_BOXES*5:,:,:] # N, 20, 7, 7

        # groundtruth (tile() copies, so the in-place sqrt does not touch `groundtruth`)
        box_truth = groundtruth[:,1:5,:,:].unsqueeze(1).tile(1,NUM_BOXES,1,1,1)
        box_truth[:,:,2:4,:,:] = box_truth[:,:,2:4,:,:].sqrt() # N, 2, 4, 7, 7

        # obj indicator
        obj_here = groundtruth[:,0:1,:,:] # N,1,7,7
        box_here = obj_here.unsqueeze(1) # N, 1, 1, 7, 7
        positive = box_here * best_box # 1 only for the responsible box of an object cell

        # class loss, objectness loss and xywh loss
        # the indicator has to be applied inside the loss function
        cls_loss = F.mse_loss(obj_here * cls_pred, groundtruth[:,5:,:,:],reduction="sum")
        xy_loss = F.mse_loss(positive * xy_pred, positive * box_truth[:,:,0:2,:,:],reduction="sum")
        wh_loss = F.mse_loss(positive * wh_pred, positive * box_truth[:,:,2:4,:,:],reduction="sum")
        obj_loss = F.mse_loss(positive * box_pred[:,:,0:1,:,:], positive * ious,reduction="sum")
        noobj_loss = F.mse_loss((1 - positive) * box_pred[:,:,0:1,:,:], ious*0, reduction="sum")

        total_loss = cls_loss + self.lambda_noobject * noobj_loss + obj_loss + self.lambda_coord * (xy_loss+wh_loss)

        return total_loss

In [None]:
class VOC_dataset(torch.utils.data.Dataset):
    """Pascal VOC detection dataset yielding (image, boxes) pairs.

    Args:
        root: directory that contains the `VOCdevkit` folder.
        year: VOC year, or several joined with '+' (e.g. '2012+2007').
        mode: name of the image-set file under ImageSets/Main (without '.txt').
        transform: optional callable invoked as `transform(image=..., bboxes=...)`
            that returns a dict with 'image' and 'bboxes'.

    Each box is [cx, cy, w, h, class_index] with coordinates relative to the image
    size. The class index refers to the notebook-level `voc_classes` list.
    """

    def __init__(self, root, year='2012', mode='train', transform=None):
        self.root = root
        self.year = year
        self.mode = mode
        self.transform = transform
        self.labels = []  # annotation (.xml) paths, one per sample

        for y in year.split('+'):
            voc_dir = os.path.join(root,f'VOCdevkit/VOC{y}')
            list_path = os.path.join(voc_dir,f'ImageSets/Main/{mode}.txt')
            with open(list_path,'r') as f:
                # skip blank lines so they do not turn into a bogus '.xml' entry
                image_ids = [line.strip() for line in f if line.strip()]
            self.labels += [os.path.join(voc_dir,'Annotations',f'{i}.xml') for i in image_ids]

    def __len__(self):
        return len(self.labels)

    def _image_path(self, idx):
        """Return the JPEG path for sample `idx` (sibling `JPEGImages` folder, same stem)."""
        xml = self.labels[idx]
        # work on path components; a blanket str.replace would also rewrite
        # 'xml' / 'Annotations' occurring elsewhere in the path
        voc_dir = os.path.dirname(os.path.dirname(xml))
        stem = os.path.splitext(os.path.basename(xml))[0]
        return os.path.join(voc_dir, 'JPEGImages', f'{stem}.jpg')

    def get_img(self, idx):
        """Load sample `idx` as an RGB array and return (img, height, width).

        Raises:
            FileNotFoundError: if the image cannot be read.
        """
        jpg = self._image_path(idx)

        img = cv2.imread(jpg)
        if img is None:  # cv2.imread signals failure by returning None
            raise FileNotFoundError(f'cannot read image: {jpg}')
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        h, w, _ = img.shape

        return img,h,w

    def get_label(self,idx,h,w):
        """Parse the annotation of sample `idx`; `h`/`w` are the image size in pixels."""
        xml = self.labels[idx]
        tree = etree.parse(xml)
        label = self.process_label(tree,h,w)

        return label

    def __getitem__(self, idx):
        img,h,w = self.get_img(idx)
        label = self.get_label(idx,h,w)

        if self.transform:
            transformed_data = self.transform(image=img,bboxes=label)
            img = transformed_data['image']
            label = transformed_data['bboxes']

        return img, label

    def process_label(self,tree,img_h,img_w):
        """Extract the non-difficult objects of a parsed annotation.

        Returns:
            List of [cx, cy, w, h, class_index], coordinates divided by the image
            width/height.
        """
        label = []

        for obj in tree.findall('object'):
            # skip objects flagged difficult; a missing tag counts as not difficult
            difficult = obj.find('difficult')
            if difficult is not None and difficult.text == '1':
                continue
            bbox = obj.find('bndbox')

            xmin = float(bbox.find('xmin').text)
            xmax = float(bbox.find('xmax').text)
            ymin = float(bbox.find('ymin').text)
            ymax = float(bbox.find('ymax').text)

            w = (xmax-xmin)/img_w
            h = (ymax-ymin)/img_h

            class_ind = voc_classes.index(obj.find('name').text)

            cx = (xmin+xmax)/2/img_w
            cy = (ymin+ymax)/2/img_h

            label.append([cx,cy,w,h,class_ind])

        return label

In [None]:
def collate_fn(batch):
    """Collate (image, boxes) samples into a stacked image tensor and a list of box lists.

    Boxes stay a plain Python list because every image can have a different number
    of them.
    """
    samples = [(img, boxes) for img, boxes in batch]
    images = [img for img, _ in samples]
    labels = [boxes for _, boxes in samples]

    return torch.stack(images,0), labels

In [None]:
# Turn on cuDNN's benchmark mode (per the PyTorch docs it tries several convolution
# algorithms and keeps the fastest; presumably useful here because every image is
# resized to the same IMAGE_SIZE).
torch.backends.cudnn.benchmark=True


In [None]:
# Pascal VOC object categories; a class index is the position in this list.
voc_classes = [
    "aeroplane",
    "bicycle",
    "bird",
    "boat",
    "bottle",
    "bus",
    "car",
    "cat",
    "chair",
    "cow",
    "diningtable",
    "dog",
    "horse",
    "motorbike",
    "person",
    "pottedplant",
    "sheep",
    "sofa",
    "train",
    "tvmonitor",
]

In [None]:

# ---- Configuration ---------------------------------------------------------
MODEL_BACKBONE = 'resnet34'

# backbone -> (BATCH_SIZE, ACC_ITER); the product (effective batch size) is always 64
_BATCH_CONFIG = {
    'yolov1': (64, 1),
    'resnet18': (32, 2),
    'resnet34': (16, 4),
    'resnet50': (8, 8),
}
if MODEL_BACKBONE not in _BATCH_CONFIG:
    # fail here rather than with a NameError on BATCH_SIZE several cells later
    raise ValueError(f'unsupported MODEL_BACKBONE: {MODEL_BACKBONE!r}')
BATCH_SIZE, ACC_ITER = _BATCH_CONFIG[MODEL_BACKBONE]

IMAGE_SIZE = 448
# fall back to the CPU when no GPU is attached to the runtime
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'
NUM_BOXES = 2
NUM_CLASSES = len(voc_classes)
LR = 1e-4
DECAY = 5e-4

In [None]:
# Training augmentation. Boxes are passed in YOLO format (cx, cy, w, h).
# NOTE(review): Normalize uses mean 0 / std 1, i.e. no ImageNet statistics, although
# the backbone is ImageNet-pretrained — confirm this is intended.
_train_bbox_params = A.BboxParams(format='yolo',min_area=1024,min_visibility=0.3)
_train_transforms = [
    A.Resize(IMAGE_SIZE,IMAGE_SIZE),
    A.ColorJitter(p=0.5),
    A.HorizontalFlip(p=0.5),
    A.ShiftScaleRotate(p=0.5, rotate_limit= 0),  # shift/scale only, no rotation
    A.Normalize(mean=(0,0,0),std=(1,1,1)),
    ToTensorV2(),
]
preprocess_train = A.Compose(_train_transforms,bbox_params=_train_bbox_params)

In [None]:
# Validation preprocessing: resize + normalise only, no augmentation.
# VOC_dataset always calls the pipeline with `bboxes=...`, so the box format must be
# declared here as well; albumentations rejects `bboxes` when bbox_params is missing.
preprocess_val = A.Compose([
    A.Resize(IMAGE_SIZE,IMAGE_SIZE),
    A.Normalize(mean=(0,0,0),std=(1,1,1)),
    ToTensorV2(),
],bbox_params=A.BboxParams(format='yolo'))

In [None]:
# Download the Pascal VOC archives into the current working directory.
# NOTE(review): the datasets below are read from '/content/gdrive/My Drive/yolo/',
# not from the working directory — confirm the extracted VOCdevkit is copied there.
!wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar # VOC2007 train+val set
!wget http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar # VOC2012 train+val set
!wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar # VOC2007 test set

--2021-11-01 15:02:57--  http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar
Resolving host.robots.ox.ac.uk (host.robots.ox.ac.uk)... 129.67.94.152
Connecting to host.robots.ox.ac.uk (host.robots.ox.ac.uk)|129.67.94.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 460032000 (439M) [application/x-tar]
Saving to: ‘VOCtrainval_06-Nov-2007.tar’


2021-11-01 15:03:06 (49.3 MB/s) - ‘VOCtrainval_06-Nov-2007.tar’ saved [460032000/460032000]

--2021-11-01 15:03:06--  http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar
Resolving host.robots.ox.ac.uk (host.robots.ox.ac.uk)... 129.67.94.152
Connecting to host.robots.ox.ac.uk (host.robots.ox.ac.uk)|129.67.94.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1999639040 (1.9G) [application/x-tar]
Saving to: ‘VOCtrainval_11-May-2012.tar’


2021-11-01 15:03:45 (48.8 MB/s) - ‘VOCtrainval_11-May-2012.tar’ saved [1999639040/1999639040]

--2021-11-01 15:03:

In [None]:
# Unpack the three archives; each one extracts into a VOCdevkit/ folder.
# `v` lists every extracted file, which produces a very long cell output.
!tar xvf VOCtest_06-Nov-2007.tar
!tar xvf VOCtrainval_06-Nov-2007.tar
!tar xvf VOCtrainval_11-May-2012.tar

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
VOCdevkit/VOC2012/SegmentationClass/2008_001874.png
VOCdevkit/VOC2012/SegmentationClass/2008_001876.png
VOCdevkit/VOC2012/SegmentationClass/2008_001882.png
VOCdevkit/VOC2012/SegmentationClass/2008_001885.png
VOCdevkit/VOC2012/SegmentationClass/2008_001895.png
VOCdevkit/VOC2012/SegmentationClass/2008_001896.png
VOCdevkit/VOC2012/SegmentationClass/2008_001926.png
VOCdevkit/VOC2012/SegmentationClass/2008_001966.png
VOCdevkit/VOC2012/SegmentationClass/2008_001971.png
VOCdevkit/VOC2012/SegmentationClass/2008_001992.png
VOCdevkit/VOC2012/SegmentationClass/2008_001997.png
VOCdevkit/VOC2012/SegmentationClass/2008_002032.png
VOCdevkit/VOC2012/SegmentationClass/2008_002043.png
VOCdevkit/VOC2012/SegmentationClass/2008_002064.png
VOCdevkit/VOC2012/SegmentationClass/2008_002066.png
VOCdevkit/VOC2012/SegmentationClass/2008_002067.png
VOCdevkit/VOC2012/SegmentationClass/2008_002073.png
VOCdevkit/VOC2012/SegmentationClass/2008_002079.png

In [None]:
# Train on VOC2012 + VOC2007 trainval, evaluate on the VOC2007 test split.
_voc_root = '/content/gdrive/My Drive/yolo/'
trainset = VOC_dataset(
    root=_voc_root,
    year='2012+2007',
    mode='trainval',
    transform=preprocess_train,
)
testset = VOC_dataset(
    root=_voc_root,
    year='2007',
    mode='test',
    transform=preprocess_val,
)

In [None]:
# Both loaders share batch size and collate function; only training data is shuffled.
_loader_kwargs = dict(batch_size=BATCH_SIZE, collate_fn=collate_fn)
trainloader = torch.utils.data.DataLoader(trainset, shuffle=True, **_loader_kwargs)
testloader = torch.utils.data.DataLoader(testset, shuffle=False, **_loader_kwargs)

In [None]:
# Instantiate the network selected by MODEL_BACKBONE and move it to DEVICE.
if MODEL_BACKBONE.startswith('resnet'):
    layer_num = int(MODEL_BACKBONE[len('resnet'):])  # 'resnet34' -> 34
    model = yolov1_resnet(layer_num).to(DEVICE)
elif MODEL_BACKBONE == 'yolov1':
    # NOTE(review): no `yolov1` class is defined in the visible part of this notebook
    model = yolov1().to(DEVICE)
else:
    # without this branch `model` would silently stay undefined
    raise ValueError(f'unsupported MODEL_BACKBONE: {MODEL_BACKBONE!r}')

In [None]:
# Loss and optimiser. No LR scheduler is active; the two alternatives that were
# tried are kept below for reference.
criterion = YoloLoss()
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=LR,
    weight_decay=DECAY,
)
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)
# scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer,lr_lambda=[lambda e: 1e-4 if e < 50 else 5e-5])

In [None]:
# TensorBoard writer used by train()/validate(); created without arguments, so the
# log directory is whatever tensorboardX chooses by default.
writer = SummaryWriter()


In [None]:
def train(e):
    """Train `model` for one epoch with gradient accumulation.

    Gradients of ACC_ITER consecutive batches are accumulated (each loss is divided
    by ACC_ITER) before one optimizer step. A final, possibly shorter, group at the
    end of the epoch is also applied rather than thrown away.

    Args:
        e: zero-based epoch index; used for the TensorBoard step and for saving a
            checkpoint every 10th epoch.

    Raises:
        AssertionError: if the loss becomes NaN.
    """
    model.train()
    optimizer.zero_grad(set_to_none=True)
    num_batches = len(trainloader)

    for i, data in enumerate(tqdm(trainloader)):

        img, label = data

        img = img.to(DEVICE)

        output = model(img)
        loss = criterion(output,label)
        loss = loss/ACC_ITER

        # check before backward/step so a NaN never reaches the weights
        assert not np.isnan(loss.item()), 'loss died'

        loss.backward()

        # step after every ACC_ITER-th batch (counting from 1) and after the last batch
        if (i + 1) % ACC_ITER == 0 or (i + 1) == num_batches:
            optimizer.step()
            optimizer.zero_grad(set_to_none=True)

        writer.add_scalar(f'training loss', loss.item(), i+e*num_batches,
                         summary_description=f'{MODEL_BACKBONE}/{LR}/final')

    if (e+1) % 10 == 0:
        torch.save(model.state_dict(),f'yolov1.{MODEL_BACKBONE}.{e+1}.pth')

#     val_loss = validate(e)
#     scheduler.step(val_loss)

In [None]:
def validate(e):
    """Evaluate `model` on `testloader` and log the mean loss per image.

    Args:
        e: epoch index used as the TensorBoard step.

    Returns:
        Summed loss over the test set divided by the number of test images.

    Raises:
        AssertionError: if the accumulated loss becomes NaN.
    """
    model.eval()
    val_loss = 0
    print('validating......')
    with torch.no_grad():
        for data in tqdm(testloader):

            img, label = data

            img = img.to(DEVICE)
            # `label` is a plain Python list of per-image boxes (see collate_fn); it has
            # no .to(), and the criterion builds its own target tensors from it

            output = model(img)
            loss = criterion(output,label)
            val_loss += loss.item()

            assert not np.isnan(val_loss), 'loss died'

    mean_loss = val_loss/len(testset)
    writer.add_scalar(f'validation loss', mean_loss, e,
                      summary_description=f'{MODEL_BACKBONE}/{LR}/plateau')
    return mean_loss

In [None]:
# Keep the epoch count in its own constant; the loop variable must not overwrite it,
# otherwise re-running the loop would start from the last epoch index.
NUM_EPOCHS = 150
for epoch in range(NUM_EPOCHS):
    train(epoch)

 56%|█████▌    | 575/1035 [2:27:16<2:05:32, 16.38s/it]

In [None]:
# Close the TensorBoard writer once training is finished.
writer.close()

In [None]:
# load carefully, if you have many checkpoints
# Restore the weights saved by train() after epoch 100; checkpoints are written to the
# current working directory every 10th epoch, so this file only exists after a run
# of at least 100 epochs.
# map_location lets a checkpoint saved on one device load onto the current DEVICE.
_checkpoint_path = f'yolov1.{MODEL_BACKBONE}.100.pth'
model.load_state_dict(torch.load(_checkpoint_path, map_location=DEVICE))

FileNotFoundError: ignored

In [None]:
# Show which GPU the runtime has and its current memory usage.
!nvidia-smi

Sat Nov 13 10:20:23 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   50C    P8    31W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces