<a href="https://colab.research.google.com/github/yhb1834/cv_prj2/blob/main/yolov1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download VOC Dataset

!mkdir train
!mkdir test
!wget http://pjreddie.com/media/files/VOCtrainval_06-Nov-2007.tar -P train/
!wget http://pjreddie.com/media/files/VOCtest_06-Nov-2007.tar -P test/
!tar -xf test/VOCtest_06-Nov-2007.tar -C test/
!tar -xf train/VOCtrainval_06-Nov-2007.tar -C train/
!rm -rf test/VOCtest_06-Nov-2007.tar

In [7]:
# Download VOC Dataset

!mkdir train
!mkdir test
!wget http://pjreddie.com/media/files/VOCtrainval_06-Nov-2007.tar -P train/
!wget http://pjreddie.com/media/files/VOCtest_06-Nov-2007.tar -P test/
!tar -xf test/VOCtest_06-Nov-2007.tar -C test/
!tar -xf train/VOCtrainval_06-Nov-2007.tar -C train/
!rm -rf test/VOCtest_06-Nov-2007.tar

--2022-06-07 06:20:07--  http://pjreddie.com/media/files/VOCtrainval_06-Nov-2007.tar
Resolving pjreddie.com (pjreddie.com)... 128.208.4.108
Connecting to pjreddie.com (pjreddie.com)|128.208.4.108|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://pjreddie.com/media/files/VOCtrainval_06-Nov-2007.tar [following]
--2022-06-07 06:20:07--  https://pjreddie.com/media/files/VOCtrainval_06-Nov-2007.tar
Connecting to pjreddie.com (pjreddie.com)|128.208.4.108|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 460032000 (439M) [application/octet-stream]
Saving to: ‘train/VOCtrainval_06-Nov-2007.tar’


2022-06-07 06:20:19 (38.6 MB/s) - ‘train/VOCtrainval_06-Nov-2007.tar’ saved [460032000/460032000]

URL transformed to HTTPS due to an HSTS policy
--2022-06-07 06:20:19--  https://pjreddie.com/media/files/VOCtest_06-Nov-2007.tar
Resolving pjreddie.com (pjreddie.com)... 128.208.4.108
Connecting to pjreddie.com (pjreddie.com)|128.208

# Download Package

!pip install xmltodict
!pip install -U albumentations
!pip install "opencv-python-headless<4.3"

In [2]:
# configs
root_dir = '/content'
annot_f = './{}/VOCdevkit/VOC2007/Annotations'
image_f = './{}/VOCdevkit/VOC2007/JPEGImages/{}'

classes = ['person', # Person
           'bird', 'cat', 'cow', 'dog', 'horse', 'sheep', # Animal
           'aeroplane', 'bicycle', 'boat', 'bus', 'car', 'motorbike', 'train', # Vehicle
           'bottle', 'chair', 'dining table', 'potted plant', 'sofa', 'tv/monitor' # Indoor
           ]

num_classes = len(classes)
feature_size = 7
num_bboxes = 2

In [22]:
import sys
sys.path.append(root_dir)

from torch.autograd import Variable
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.optim.lr_scheduler

from torch.utils.data import Dataset, DataLoader

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## utils
import numpy as np
import random, math, time
# 노트북일 때만 사용
from tqdm.notebook import tqdm

## File Loader
import os, xmltodict
import os.path as pth
from PIL import Image

# Draw Image
import matplotlib.pyplot as plt
import matplotlib.patches as patches

## Transformer
from albumentations.pytorch import transforms
from random import sample
import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2

# Seed
random.seed(53)

In [11]:
# utils
def draw_image(image_info, w=448, h=448, transforms=None):
    im = np.array(Image.open(image_f.format('train', image_info['image_id'])).convert('RGB').resize((w,h)), dtype=np.uint8)

    # Create figure and axes
    fig,ax = plt.subplots(1, figsize=(7,7))

    bb = image_info['bboxs']
    la = image_info['labels']

    if transforms:
        sample = transforms(image=im, bboxes=bb, category_ids=la)
        im = sample['image'].permute(1,2,0).numpy()
        bb = sample['bboxes']
        la = sample['category_ids']

    # Display the image
    ax.imshow(im)


    # Create a Rectangle patch
    for b, l in zip(bb, la):
        # top left (x, y) , (w, h)
        rect = patches.Rectangle((b[0]*w,b[1]*h),(b[2]-b[0])*w,(b[3]-b[1])*h,linewidth=1,edgecolor='r',facecolor='none')
        # Add the patch to the Axes
        ax.add_patch(rect)
        props = dict(boxstyle='round', facecolor='red', alpha=0.9)
        plt.text(b[0]*w, b[1]*h, classes[l], fontsize=10, color='white', bbox=props)
    plt.axis('off')
    plt.show()

In [12]:
# dataset # VOC
def get_infos(annot_f=annot_f, mode='train'):
    annot_dir = annot_f.format(mode)
    result = []
    for ano in [pth.join(annot_dir, ano) for ano in os.listdir(annot_dir)]:
        f = open(ano)
        info = xmltodict.parse(f.read())['annotation']
        image_id = info['filename']
        image_size = np.asarray(tuple(map(int, info['size'].values()))[:2], np.int16)
        w, h = image_size
        box_objects = info['object']
        labels = []
        bboxs = []
        for obj in box_objects:
            try:
                labels.append(classes.index(obj['name'].lower()))
                bboxs.append(tuple(map(int, obj['bndbox'].values())))
            except: pass

        # Resizing Box, Change x1 y1 x2 y2
        # albumentations (normalized box)
        bboxs = np.asarray(bboxs, dtype=np.float64)
        try:
            bboxs[:, [0,2]] /= w
            bboxs[:, [1,3]] /= h
        except: pass
        if bboxs.shape[0] or mode=='test':
            result.append({'image_id':image_id, 'image_size':image_size, 'bboxs':bboxs, 'labels':labels})

    return result
    
trval_list = get_infos()
test_list = get_infos(mode='test')

len(trval_list), len(test_list)

(3067, 4952)

In [13]:
# utils
def get_tv_idx(tl, k = 0.5):
    total_idx = range(tl)
    train_idx = sample(total_idx, int(tl*k))
    valid_idx = set(total_idx) - set(train_idx)
    return train_idx, list(valid_idx)

train_idx, valid_idx = get_tv_idx(len(trval_list))

trval_list = np.asarray(trval_list)
train_list = trval_list[train_idx]
valid_list = trval_list[valid_idx]

In [14]:
# Make Dataset
class VOCDataset(Dataset):
    def __init__(self, data_list, mode='train', transforms=None):
        self.data_list = data_list
        self.mode = mode
        self.transforms = transforms

    def __len__(self):
        return len(self.data_list)
    
    def __getitem__(self, idx):
        record = self.data_list[idx]
        img_id = record['image_id']
        bboxs = record['bboxs']
        labels = record['labels']

        img = Image.open(image_f.format(self.mode, img_id)).convert('RGB') #.resize((800,800))
        img = np.array(img)

        if self.transforms:
            for t in self.transforms:
                sample = self.transforms(image=img, bboxes=bboxs, category_ids=labels)
                image = sample['image']
                bboxs = np.asarray(sample['bboxes'])
                labels = np.asarray(sample['category_ids'])


        if self.mode=='train':
            target = encode(bboxs, labels)
            return image, target
        else:
            return image

In [15]:
# utils
def encode(bboxs, labels):
    # Make YoLo Target

    S = feature_size
    B = num_bboxes
    N = 5 * B + num_classes
    cell_size = 1.0 / float(S)
    # print(bboxs.shape)

    box_cxy = (bboxs[:, 2:] + bboxs[:, :2])/2.0
    box_wh = bboxs[:, 2:] - bboxs[:, :2]
    target = np.zeros((S, S, N))
    for b in range(bboxs.shape[0]):
        cxy, wh, label = box_cxy[b], box_wh[b], labels[b]
        ij = np.ceil(cxy / cell_size) - 1.0
        i, j = map(int, ij)
        top_left = ij*cell_size
        dxy_norm = (cxy-top_left)/cell_size
        for k in range(B):
            target[j, i, 5*k: 5*(k+1)] = np.r_[dxy_norm, wh, 1]
        target[j, i, 5*B+label] = 1.0
    return target

In [24]:
# Transformer 재정의
def get_train_transforms():
    return A.Compose([
        A.Resize(448,448, always_apply=True, p=1),
        # A.Cutout(num_holes=7, max_h_size=16, max_w_size=16, fill_value=0, always_apply=False, p=0.5),
        A.RandomBrightnessContrast(p=0.2),
        A.HorizontalFlip(),
        ToTensorV2(),
    ], bbox_params=A.BboxParams(format='albumentations', label_fields=['category_ids']))

def get_test_transforms():
    return A.Compose([
        A.Resize(448,448, always_apply=True, p=1),
        ToTensorV2(),
    ])

In [25]:
train_ds = VOCDataset(train_list, transforms=get_train_transforms())
valid_ds = VOCDataset(valid_list, transforms=get_test_transforms())
test_ds = VOCDataset(test_list, mode='test', transforms=get_test_transforms())

# torch tensor를 batch size만큼 묶어줌
def collate_fn(batch):
    images, targets = zip(*batch)
    return torch.cat([img.reshape(-1, 3, 448, 448) for img in images], 0), torch.FloatTensor(targets)

def test_collate_fn(batch):
    images = batch
    return torch.cat([img.reshape(-1, 3, 448, 448) for img in images], 0)

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(valid_ds, batch_size=32, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_ds, batch_size=1, shuffle=False, collate_fn=test_collate_fn)