In [95]:
import os
import torchvision.models as models
from PIL import Image
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import Dataset
from torchvision.datasets.utils import download_and_extract_archive

from torch.utils.data import DataLoader
from torchvision import transforms

# Fine-tuning and bounding boxes
- We take a model (e.g. ResNet) that was pre-trained on an image classification task.
- We then fine-tune its weights on a bounding-box regression task.
- We use the Penn-Fudan pedestrian database (see https://www.cis.upenn.edu/~jshi/ped_html/).
- Finally, we evaluate the model's performance using a standard metric (intersection over union, IoU).
- TODOs: 1) Implement `ResNetBoxes`, 2) Choose a loss function, 3) Implement IoU as a metric and evaluate performance, 4) Extend `ResNetBoxes` with an argument that determines whether the backbone network should be frozen during fine-tuning, 5) Observe how performance changes when different ResNet variants are used

In [60]:
# Download and extract dataset.
# The archive itself is deleted after extraction (remove_finished=True), so
# without a guard every re-run of this cell would fetch the ~54 MB zip again.
# Skip the download when the extracted folder is already present.
url = "https://www.cis.upenn.edu/~jshi/ped_html/PennFudanPed.zip"
if not os.path.isdir("data/PennFudanPed"):
    download_and_extract_archive(url, download_root="data/", extract_root="data/", remove_finished=True)

Downloading https://www.cis.upenn.edu/~jshi/ped_html/PennFudanPed.zip to data/PennFudanPed.zip


100%|██████████| 53.7M/53.7M [00:10<00:00, 5.32MB/s]


Extracting data/PennFudanPed.zip to data/


In [81]:
class PennFudanDataset(Dataset):
    """Penn-Fudan images, each paired with the box of its first annotated object.

    Every item is ``(img, target)`` where ``target`` is a dict with:

    * ``"box"``   -- float32 tensor ``[xmin, ymin, xmax, ymax]``
    * ``"label"`` -- int64 scalar tensor, always 1 (pedestrian)

    Args:
        root: Directory that contains the ``PNGImages`` and ``PedMasks`` folders.
        transforms: Optional callable applied to the PIL image only. The box
            is NOT passed through it, so with a resizing transform the default
            pixel coordinates still refer to the original image.
        normalize_boxes: When True, x coordinates are divided by the mask
            width and y coordinates by the mask height, giving values in
            [0, 1) that do not depend on the image resolution. Defaults to
            False, which keeps the original pixel coordinates.

    Raises:
        ValueError: If the number of image files and mask files differ, since
            images and masks are paired purely by their sorted position.
    """

    def __init__(self, root, transforms=None, normalize_boxes=False):
        self.root = root
        self.transforms = transforms
        self.normalize_boxes = normalize_boxes
        self.img_dir = os.path.join(root, "PNGImages")
        self.mask_dir = os.path.join(root, "PedMasks")
        self.imgs = sorted(os.listdir(self.img_dir))
        self.masks = sorted(os.listdir(self.mask_dir))

        # Items are matched by index in the two sorted listings; a count
        # mismatch would silently pair images with the wrong masks.
        if len(self.imgs) != len(self.masks):
            raise ValueError(
                f"Found {len(self.imgs)} images in {self.img_dir!r} but "
                f"{len(self.masks)} masks in {self.mask_dir!r}"
            )

    @staticmethod
    def _boxes_from_mask(mask):
        """Return an (N, 4) float32 tensor of boxes, one per non-zero id in ``mask``.

        Rows are ordered by ascending object id; each row is
        ``[xmin, ymin, xmax, ymax]`` using the min/max pixel indices.
        """
        # Remove background (assumed to be ID 0)
        obj_ids = np.unique(mask)
        obj_ids = obj_ids[obj_ids != 0]

        boxes = np.zeros((len(obj_ids), 4), dtype=np.float32)
        for row, obj_id in enumerate(obj_ids):
            # One object at a time: avoids building an (N, H, W) boolean stack
            ys, xs = np.nonzero(mask == obj_id)
            boxes[row] = (xs.min(), ys.min(), xs.max(), ys.max())
        return torch.from_numpy(boxes)

    def __getitem__(self, idx):
        """Load image ``idx`` and build its target dict.

        Raises:
            IndexError: If ``idx`` is outside the file listing.
            ValueError: If the mask contains no object (only background).
        """
        # Construct full paths
        img_path = os.path.join(self.img_dir, self.imgs[idx])
        mask_path = os.path.join(self.mask_dir, self.masks[idx])

        # Load image and mask; the context managers release the file handles
        with Image.open(img_path) as raw_img:
            img = raw_img.convert("RGB")
        with Image.open(mask_path) as raw_mask:
            mask = np.array(raw_mask)

        boxes = self._boxes_from_mask(mask)
        if boxes.shape[0] == 0:
            # ValueError rather than IndexError: an IndexError here would be
            # mistaken for "end of dataset" by plain `for item in dataset`.
            raise ValueError(f"Mask {mask_path!r} contains no annotated object")

        # All objects are labeled as class 1 (pedestrian)
        labels = torch.ones((boxes.shape[0],), dtype=torch.int64)

        box = boxes[0]  # First object's box only
        if self.normalize_boxes:
            height, width = mask.shape[:2]
            box = box / torch.tensor([width, height, width, height], dtype=torch.float32)

        # Construct target dictionary
        target = {
            "box": box,
            "label": labels[0],
        }

        if self.transforms:
            img = self.transforms(img)

        return img, target

    def __len__(self):
        """Number of images found in ``PNGImages``."""
        return len(self.imgs)

In [96]:
class ResNetBoxes(nn.Module):
    """ResNet feature extractor followed by a linear head that regresses one box.

    The last two children of ``resnet`` (for torchvision ResNets: the global
    pooling layer and the ``fc`` classifier) are dropped; the remaining layers
    become the backbone. Its feature map is average-pooled to 1x1, flattened,
    and mapped to 4 raw (unbounded) outputs.

    Args:
        resnet: Module whose children end in a pooling layer and a classifier,
            and which exposes ``fc.in_features`` (the backbone channel count).
        freeze_backbone: When True, gradients are disabled for all backbone
            parameters and the backbone is kept in eval mode, so only the box
            head is trained. Note that the backbone reuses the layers of
            ``resnet``, so freezing also affects the object passed in.
    """

    def __init__(self, resnet, freeze_backbone=False):
        super().__init__()
        self.backbone = nn.Sequential(*list(resnet.children())[:-2])

        # Get number of features from original ResNet fc layer
        num_features = resnet.fc.in_features

        self.freeze_backbone = freeze_backbone
        if freeze_backbone:
            self.backbone.requires_grad_(False)

        self.pool = nn.AdaptiveAvgPool2d((1, 1))
        # The pooled tensor is (batch, num_features, 1, 1); flatten it before
        # the linear layer that emits the 4 box coordinates.
        self.box_head = nn.Sequential(
            nn.Flatten(),
            nn.Linear(num_features, 4),
        )

    def train(self, mode=True):
        """Switch train/eval mode, keeping a frozen backbone in eval mode.

        Without this, normalization layers in a frozen backbone would still
        update their running statistics during training.
        """
        super().train(mode)
        if self.freeze_backbone:
            self.backbone.eval()
        return self

    def forward(self, x):
        """Map a batch of images to a ``(batch, 4)`` tensor of box predictions."""
        x = self.backbone(x)
        x = self.pool(x)
        boxes = self.box_head(x)
        return boxes

In [91]:
# Reproducibility: fixes the initial weights of the new box head and the
# shuffle order drawn by the DataLoader.
torch.manual_seed(42)

# Dataset
# NOTE: the transform resizes only the image; the boxes returned by the
# dataset stay in the pixel coordinates of the original (un-resized) image.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])
dataset = PennFudanDataset(root='data/PennFudanPed', transforms=transform)

# Loader
loader = DataLoader(dataset, batch_size=32, shuffle=True)

# Model and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): newer torchvision releases prefer the `weights=` argument over
# `pretrained=True`; kept as-is for compatibility with the installed version.
backbone_model = models.resnet50(pretrained=True)  # download ResNet model
# The batches are moved to `device` in the training loop, so the model must
# live on the same device or the forward pass fails on CUDA machines.
model = ResNetBoxes(backbone_model).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
# Mean squared error between predicted and ground-truth coordinates.
# nn.SmoothL1Loss() is a common, more outlier-robust alternative.
criterion = nn.MSELoss()

In [92]:
# Training loop: 5 passes over `loader`, reporting the mean batch loss per epoch
for epoch in range(5):
    model.train()
    running_loss = 0.0
    for batch_imgs, batch_targets in loader:
        # Move inputs and ground-truth boxes to the compute device
        batch_imgs = batch_imgs.to(device)
        gt_boxes = torch.stack(tuple(batch_targets['box'])).to(device)

        # Clear old gradients, then forward pass and loss
        optimizer.zero_grad()
        loss = criterion(model(batch_imgs), gt_boxes)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {running_loss / len(loader):.4f}")

Epoch 1, Loss: 57905.9334
Epoch 2, Loss: 52831.4314
Epoch 3, Loss: 44664.5384
Epoch 4, Loss: 34712.5667
Epoch 5, Loss: 25658.3496


In [97]:
# IOU = area of overlap / area of union
def compute_iou(box1, box2):
    """Return the intersection over union of two axis-aligned boxes as a float.

    Args:
        box1: Four numbers ``[xmin, ymin, xmax, ymax]`` (list, tuple, or
            1-D tensor/array).
        box2: Same format as ``box1``.

    Returns:
        A float in [0, 1]. Boxes with ``xmax < xmin`` or ``ymax < ymin`` are
        treated as having zero area, and 0.0 is returned when the union has
        zero area.
    """
    # box = [xmin, ymin, xmax, ymax]
    x1_min, y1_min, x1_max, y1_max = (float(v) for v in box1)
    x2_min, y2_min, x2_max, y2_max = (float(v) for v in box2)

    # Overlap rectangle; clamped to 0 when the boxes do not intersect
    inter_w = max(0.0, min(x1_max, x2_max) - max(x1_min, x2_min))
    inter_h = max(0.0, min(y1_max, y2_max) - max(y1_min, y2_min))
    intersection = inter_w * inter_h

    # Clamp sides so an inverted (e.g. badly predicted) box counts as empty
    area1 = max(0.0, x1_max - x1_min) * max(0.0, y1_max - y1_min)
    area2 = max(0.0, x2_max - x2_min) * max(0.0, y2_max - y2_min)

    union = area1 + area2 - intersection
    if union <= 0.0:
        return 0.0
    return intersection / union

# Evaluate after training: collect one IoU per sample, then report the mean.
# NOTE: `loader` is the same DataLoader the training loop iterates over, so
# this measures IoU on the training data.
model.eval()
ious = []
with torch.no_grad():
    for batch_imgs, batch_targets in loader:
        gt_boxes = torch.stack(tuple(batch_targets['box'])).to(device)
        preds = model(batch_imgs.to(device))

        # Pair each predicted box with its ground-truth box
        ious.extend(compute_iou(pred, gt) for pred, gt in zip(preds, gt_boxes))

print(f"Mean IoU: {sum(ious) / len(ious):.4f}")

KeyboardInterrupt: 