In [1]:
# Cell 1
from pathlib import Path
PROJECT_DIR = Path('/content/project')
DATA_DIR    = PROJECT_DIR / 'data'
MODEL_DIR   = PROJECT_DIR / 'models'
OUTPUT_DIR  = PROJECT_DIR / 'outputs'
NOTEBOOKS   = PROJECT_DIR / 'notebooks'
for d in (PROJECT_DIR, DATA_DIR, MODEL_DIR, OUTPUT_DIR, NOTEBOOKS):
    d.mkdir(parents=True, exist_ok=True)


In [2]:
# Cell 2
!pip install torch torchvision pycocotools matplotlib tqdm
import os, time, random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision import transforms
from torchvision.datasets import VOCDetection, CocoDetection
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt
from tqdm import tqdm


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [3]:
# — Revised Cell 3: Config & Hyperparams —
class Config:
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    img_size = 416
    batch_size = 16
    epochs = 12
    lr = 1e-3
    num_workers = 2    # <- lowered from 4 to 2
    num_classes = 20
    anchors = [
        [(10,13),(16,30),(33,23)],
        [(30,61),(62,45),(59,119)],
        [(116,90),(156,198),(373,326)]
    ]
cfg = Config()


In [4]:
# Cell 4
class VOCDataset(Dataset):
    def __init__(self, root, year, image_set, img_size):
        self.voc = VOCDetection(root=root, year=year, image_set=image_set,
                                download=True,
                                transform=transforms.Compose([
                                    transforms.Resize((img_size, img_size)),
                                    transforms.ToTensor(),
                                ]))
        # The CLASS_NAMES attribute is no longer available in VOCDetection
        # Instead, we manually define the class names for Pascal VOC 2007
        self.classes = [
            "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car",
            "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike",
            "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"
        ]
        self.img_size = img_size
    def __len__(self):
        return len(self.voc)
    def __getitem__(self, idx):
        img, tgt = self.voc[idx]
        objs = tgt['annotation']['object']
        if not isinstance(objs, list): objs = [objs]
        w, h = img.shape[2], img.shape[1]
        boxes, labels = [], []
        for o in objs:
            b = o['bndbox']
            x1 = float(b['xmin'])/w; y1 = float(b['ymin'])/h
            x2 = float(b['xmax'])/w; y2 = float(b['ymax'])/h
            boxes.append([x1,y1,x2,y2])
            labels.append(self.classes.index(o['name']))
        return img, {'boxes':torch.tensor(boxes), 'labels':torch.tensor(labels)}

In [5]:
# Cell 5
train_ds = VOCDataset(str(DATA_DIR/'VOCdevkit'), '2007', 'trainval', cfg.img_size)
val_ds   = VOCDataset(str(DATA_DIR/'VOCdevkit'), '2007', 'test',     cfg.img_size)
collate = lambda batch: tuple(zip(*batch))
train_loader = DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True,
                          collate_fn=collate, num_workers=cfg.num_workers)
val_loader   = DataLoader(val_ds,   batch_size=cfg.batch_size, shuffle=False,
                          collate_fn=collate, num_workers=cfg.num_workers)


100%|██████████| 460M/460M [00:29<00:00, 15.4MB/s]
100%|██████████| 451M/451M [00:30<00:00, 14.9MB/s]


In [6]:
# Cell 6
from torchvision.models import resnet50

class YOLOv3Head(nn.Module):
    def __init__(self, in_chs, na, nc):
        super().__init__()
        self.layers = nn.ModuleList()
        for c in in_chs:
            self.layers.append(nn.Sequential(
                nn.Conv2d(c, c//2, 1, 1, 0),
                nn.BatchNorm2d(c//2),
                nn.LeakyReLU(0.1),
                nn.Conv2d(c//2, c, 3, 1, 1),
                nn.BatchNorm2d(c),
                nn.LeakyReLU(0.1),
                nn.Conv2d(c, na*(5+nc), 1, 1, 0)
            ))
    def forward(self, feats):
        return [l(f) for l,f in zip(self.layers, feats)]

class YOLOv3(nn.Module):
    def __init__(self, nc, anchors):
        super().__init__()
        self.backbone = resnet50(pretrained=True)
        self.head = YOLOv3Head([512,1024,2048], len(anchors[0]), nc)
    def forward(self, x):
        x = self.backbone.conv1(x); x = self.backbone.bn1(x)
        x = self.backbone.relu(x); x = self.backbone.maxpool(x)
        x1 = self.backbone.layer1(x)
        f1 = self.backbone.layer2(x1)
        f2 = self.backbone.layer3(f1)
        f3 = self.backbone.layer4(f2)
        return self.head([f1,f2,f3])

model = YOLOv3(cfg.num_classes, cfg.anchors).to(cfg.device)


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:03<00:00, 29.1MB/s]


In [7]:
# — Revised Cell 7: Loss Function (now actually non-zero) —
import torch.nn.functional as F

class YOLOLoss(nn.Module):
    def __init__(self, anchors, nc, img_size):
        super().__init__()
        self.anchors = torch.tensor(anchors, dtype=torch.float32).to(cfg.device)
        self.nc = nc
        self.img_size = img_size

    def forward(self, preds, targets):
        # preds: list of [B, na*(5+nc), G, G]
        # flatten all predictions to (N, 5+nc)
        p = []
        for out in preds:
            bs, _, G, _ = out.shape
            p.append(out.view(bs, len(self.anchors[0]), 5+self.nc, G, G)
                        .permute(0,3,4,1,2)
                        .reshape(-1, 5+self.nc))
        p = torch.cat(p, dim=0)

        # build a dummy target tensor t of same shape as p
        # here we simply tile each ground-truth once and pad/truncate to match p
        t_list = []
        for tgt in targets:
            boxes = tgt['boxes'].to(cfg.device)      # [M,4]
            labels = tgt['labels'].to(cfg.device)    # [M]
            # objectness = 1 for every GT box, cls one-hot
            obj = torch.ones(boxes.size(0),1, device=cfg.device)
            cls = F.one_hot(labels, num_classes=self.nc).float()
            t_list.append(torch.cat([boxes, obj, cls], dim=1))
        t = torch.cat(t_list, dim=0)
        if t.size(0) < p.size(0):
            pad = torch.zeros(p.size(0)-t.size(0), t.size(1), device=cfg.device)
            t = torch.cat([t, pad], dim=0)
        else:
            t = t[:p.size(0), :]

        # split predictions/targets
        pred_box, pred_obj, pred_cls = p[:, :4], p[:, 4], p[:, 5:]
        true_box, true_obj, true_cls = t[:, :4], t[:, 4], t[:, 5:]

        # losses
        loc_loss = F.mse_loss(pred_box, true_box, reduction='mean')
        obj_loss = F.binary_cross_entropy_with_logits(pred_obj, true_obj, reduction='mean')
        cls_loss = F.binary_cross_entropy_with_logits(pred_cls, true_cls, reduction='mean')

        return loc_loss + obj_loss + cls_loss

criterion = YOLOLoss(cfg.anchors, cfg.num_classes, cfg.img_size)


In [8]:
# Cell 8
optimizer = optim.SGD(model.parameters(), lr=cfg.lr, momentum=0.9, weight_decay=5e-4)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)


In [9]:
# Cell 9
for epoch in range(1, cfg.epochs+1):
    model.train()
    tloss = 0
    for imgs, targets in tqdm(train_loader):
        imgs = torch.stack(imgs).to(cfg.device)
        loss = criterion(model(imgs), targets)
        optimizer.zero_grad(); loss.backward(); optimizer.step()
        tloss += loss.item()
    scheduler.step()
    torch.save(model.state_dict(), MODEL_DIR/f'yolo_epoch{epoch}.pth')
    print(f'Epoch {epoch} loss {tloss/len(train_loader):.4f}')


100%|██████████| 314/314 [03:36<00:00,  1.45it/s]


Epoch 1 loss 0.7166


100%|██████████| 314/314 [03:38<00:00,  1.44it/s]


Epoch 2 loss 0.3695


100%|██████████| 314/314 [03:37<00:00,  1.45it/s]


Epoch 3 loss 0.2566


100%|██████████| 314/314 [03:37<00:00,  1.44it/s]


Epoch 4 loss 0.1961


100%|██████████| 314/314 [03:38<00:00,  1.44it/s]


Epoch 5 loss 0.1585


100%|██████████| 314/314 [03:37<00:00,  1.44it/s]


Epoch 6 loss 0.1329


100%|██████████| 314/314 [03:38<00:00,  1.44it/s]


Epoch 7 loss 0.1143


100%|██████████| 314/314 [03:37<00:00,  1.45it/s]


Epoch 8 loss 0.1003


100%|██████████| 314/314 [03:37<00:00,  1.45it/s]


Epoch 9 loss 0.0893


100%|██████████| 314/314 [03:37<00:00,  1.45it/s]


Epoch 10 loss 0.0805


100%|██████████| 314/314 [03:38<00:00,  1.44it/s]


Epoch 11 loss 0.0733


100%|██████████| 314/314 [03:38<00:00,  1.44it/s]


Epoch 12 loss 0.0673


In [12]:
# Cell 10
!pip install pycocotools
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

def evaluate_coco(model, loader, ann):
    model.eval()
    coco = COCO(ann)
    results = []
    for imgs, _ in tqdm(loader):
        imgs = torch.stack(imgs).to(cfg.device)
        with torch.no_grad():
            preds = model(imgs)
        # decode to COCO format and append to results
    dt = coco.loadRes(results)
    ev = COCOeval(coco, dt, 'bbox')
    ev.evaluate(); ev.accumulate(); ev.summarize()




In [11]:
# Cell 11
import matplotlib.patches as patches

def infer(model, img):
    model.eval()
    with torch.no_grad():
        p = model(img.unsqueeze(0).to(cfg.device))
    # decode p into boxes, labels, scores
    return boxes, labels, scores

def plot(img, boxes, labels, scores=None):
    img = img.permute(1,2,0).cpu().numpy()
    fig,ax = plt.subplots()
    ax.imshow(img)
    for i,b in enumerate(boxes):
        x1,y1,x2,y2 = b
        rect = patches.Rectangle((x1*cfg.img_size,y1*cfg.img_size),
                                 (x2-x1)*cfg.img_size,(y2-y1)*cfg.img_size,
                                 linewidth=2, edgecolor='r', facecolor='none')
        ax.add_patch(rect)
        txt = train_ds.classes[labels[i]] + (f'{scores[i]:.2f}' if scores is not None else '')
        ax.text(x1*cfg.img_size, y1*cfg.img_size-5, txt, color='white',
                bbox=dict(facecolor='r', alpha=0.5))
    plt.show()


In [26]:
# Cell Y0 — clear out previous inference images
import glob, os

for img_path in glob.glob(str(OUTPUT_DIR/'inference'/'*.png')):
    os.remove(img_path)


In [29]:
# Cell Z — full test & evaluation on first 100 VOC test images (with ‘area’ field)
import torch, csv
from pathlib import Path
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt, matplotlib.patches as patches
from tqdm import tqdm
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

# paths
PROJECT_DIR = Path('/content/project')
DATA_DIR    = PROJECT_DIR/'data'
MODELS_DIR  = PROJECT_DIR/'models'
OUTPUTS_DIR = PROJECT_DIR/'outputs'
INFER_DIR   = OUTPUTS_DIR/'inference'
METRICS_CSV = OUTPUTS_DIR/'metrics.csv'

INFER_DIR.mkdir(parents=True, exist_ok=True)

# 1) reload model
model = YOLOv3(cfg.num_classes, cfg.anchors).to(cfg.device)
ckpt = torch.load(MODELS_DIR/f'yolo_epoch{cfg.epochs}.pth', map_location=cfg.device)
model.load_state_dict(ckpt)
model.eval()

# 2) build VOC test loader
test_ds = VOCDataset(str(DATA_DIR/'VOCdevkit'), '2007', 'test', cfg.img_size)
loader  = DataLoader(test_ds, batch_size=1, shuffle=False,
                     collate_fn=lambda b: tuple(zip(*b)),
                     num_workers=cfg.num_workers)

# 3) fake COCO‐style GT (including 'area')
voc_gt = {}
for idx, (_, tgt) in enumerate(test_ds):
    voc_gt[idx] = {
        'boxes': tgt['boxes'].tolist(),
        'labels': tgt['labels'].tolist()
    }

coco_gt = COCO()
coco_gt.dataset = {
    'images':    [{'id': i} for i in voc_gt],
    'annotations': [
        {
            'id': aid,
            'image_id': i,
            'category_id': lab,
            'bbox': [
                b[0]*cfg.img_size,
                b[1]*cfg.img_size,
                (b[2]-b[0])*cfg.img_size,
                (b[3]-b[1])*cfg.img_size
            ],
            'area': (b[2]-b[0])*(b[3]-b[1])*(cfg.img_size**2),
            'iscrowd': 0
        }
        for i, gd in voc_gt.items()
        for aid, (b, lab) in enumerate(zip(gd['boxes'], gd['labels']))
    ],
    'categories': [{'id': i, 'name': n} for i, n in enumerate(test_ds.classes)]
}
coco_gt.createIndex()

coco_dt_results = []

# 4) inference + visuals + collect detections
for idx, (imgs, tgt) in enumerate(tqdm(loader, total=100)):
    if idx >= 100: break
    img = imgs[0]
    with torch.no_grad():
        preds = model(img.unsqueeze(0).to(cfg.device))

    boxes, scores, labels = decode_preds(preds, cfg.anchors, conf_th=0.0, iou_th=1.0)

    # plot image
    fig, ax = plt.subplots(1, figsize=(6,6))
    ax.imshow(img.permute(1,2,0).cpu())
    ax.axis('off')

    # draw GT (green)
    target_dict = tgt[0]
    for b, lab in zip(target_dict['boxes'], target_dict['labels']):
        x1, y1, x2, y2 = [v * cfg.img_size for v in b.tolist()]
        ax.add_patch(patches.Rectangle((x1,y1),(x2-x1),(y2-y1),
                                      linewidth=2, edgecolor='g', facecolor='none'))
        ax.text(x1, y1, f"GT:{test_ds.classes[lab]}", color='g')

    # draw top-5 preds (red)
    if len(scores) > 5:
        topk = scores.argsort(descending=True)[:5]
        boxes, scores, labels = boxes[topk], scores[topk], labels[topk]
    for b, s, lab in zip(boxes, scores, labels):
        x1, y1, x2, y2 = (b * cfg.img_size).tolist()
        ax.add_patch(patches.Rectangle((x1,y1),(x2-x1),(y2-y1),
                                      linewidth=2, edgecolor='r', facecolor='none'))
        ax.text(x1, y1, f"{test_ds.classes[lab]}:{s:.2f}", color='r')

    fig.savefig(INFER_DIR/f"{idx}.png", bbox_inches='tight', pad_inches=0)
    plt.close(fig)

    # collect for COCOeval
    for b, s, lab in zip(boxes, scores, labels):
        x1, y1, x2, y2 = b.tolist()
        coco_dt_results.append({
            'image_id': idx,
            'category_id': int(lab),
            'bbox': [x1*cfg.img_size, y1*cfg.img_size,
                     (x2-x1)*cfg.img_size, (y2-y1)*cfg.img_size],
            'score': float(s)
        })

# 5) run COCOeval
coco_dt = coco_gt.loadRes(coco_dt_results)
coco_eval = COCOeval(coco_gt, coco_dt, 'bbox')
coco_eval.evaluate(); coco_eval.accumulate(); coco_eval.summarize()

# 6) save metrics.csv
stats = coco_eval.stats
headers = [
    'AP@[.5:.95]','AP50','AP75','AP_small','AP_medium','AP_large',
    'AR1','AR10','AR100','AR_small','AR_med','AR_large'
]
with open(METRICS_CSV, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    writer.writerow([round(s,4) for s in stats])

print(f"→ Saved metrics to {METRICS_CSV}")


creating index...
index created!


100%|██████████| 100/100 [00:26<00:00,  3.83it/s]


Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=1.21s).
Accumulating evaluation results...
DONE (t=0.13s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.000
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.000
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.000
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.000
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=10

In [30]:
# Zip the project directory
!zip -r /content/project.zip /content/project

# Download to your local machine
from google.colab import files
files.download('/content/project.zip')


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  adding: content/project/data/VOCdevkit/VOCdevkit/VOC2007/Annotations/008649.xml (deflated 50%)
  adding: content/project/data/VOCdevkit/VOCdevkit/VOC2007/Annotations/009444.xml (deflated 78%)
  adding: content/project/data/VOCdevkit/VOCdevkit/VOC2007/Annotations/000981.xml (deflated 58%)
  adding: content/project/data/VOCdevkit/VOCdevkit/VOC2007/Annotations/008173.xml (deflated 50%)
  adding: content/project/data/VOCdevkit/VOCdevkit/VOC2007/Annotations/001028.xml (deflated 74%)
  adding: content/project/data/VOCdevkit/VOCdevkit/VOC2007/Annotations/009691.xml (deflated 59%)
  adding: content/project/data/VOCdevkit/VOCdevkit/VOC2007/Annotations/007648.xml (deflated 73%)
  adding: content/project/data/VOCdevkit/VOCdevkit/VOC2007/Annotations/007191.xml (deflated 82%)
  adding: content/project/data/VOCdevkit/VOCdevkit/VOC2007/Annotations/005199.xml (deflated 50%)
  adding: content/project/data/VOCdevkit/VOCdevkit/VOC2007/Ann

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>