In [None]:
import torch
import torch.nn as nn
import torchvision.models as models
from torchvision.ops import nms

class SSD300(nn.Module):
    """SSD-style detector built on a frozen, ImageNet-pretrained ResNet-50.

    Predictions are made from three feature maps: the output of ResNet stage
    ``layer3`` (1024 channels), the output of ``layer4`` (2048 channels) and one
    extra stride-2 block (256 channels). For a 300x300 input these maps are
    19x19, 10x10 and 5x5. Every spatial location predicts
    ``ANCHORS_PER_LOCATION`` boxes.

    Args:
        num_classes: Number of scores predicted per anchor. If label 0 is used
            for background (as the ``gt_conf > 0`` positive mask in this
            notebook's loss suggests), this count must include the background
            class -- TODO confirm against the label encoding.
    """

    ANCHORS_PER_LOCATION = 6
    # Channel widths of the three detection feature maps, in forward order.
    FEATURE_CHANNELS = (1024, 2048, 256)

    def __init__(self, num_classes=20):
        super(SSD300, self).__init__()
        self.num_classes = num_classes
        # `pretrained=True` is kept for compatibility with older torchvision;
        # newer releases prefer the `weights=` argument.
        self.backbone = models.resnet50(pretrained=True)

        # Remove avgpool and fc layers
        self.backbone = nn.Sequential(*list(self.backbone.children())[:-2])

        # Freeze the whole backbone; only the extra layers and heads are trained.
        for param in self.backbone.parameters():
            param.requires_grad = False

        # Remaining children:
        #   0 conv1, 1 bn1, 2 relu, 3 maxpool, 4 layer1, 5 layer2, 6 layer3, 7 layer4
        # The split must fall after layer3 so the two backbone feature maps have
        # the 1024 and 2048 channels the detection heads are built for.
        self.conv4 = self.backbone[:7]   # stem .. layer3  -> 1024 channels
        self.conv5 = self.backbone[7]    # layer4          -> 2048 channels

        # Extra feature layers for SSD (consume the 2048-channel layer4 output)
        self.extra_layers = nn.Sequential(
            nn.Conv2d(2048, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, stride=2, padding=1),
            nn.ReLU()
        )

        # Detection heads (location and confidence), one pair per feature map
        k = self.ANCHORS_PER_LOCATION
        self.loc_heads = nn.ModuleList([
            nn.Conv2d(channels, k * 4, kernel_size=3, padding=1)
            for channels in self.FEATURE_CHANNELS
        ])
        self.conf_heads = nn.ModuleList([
            nn.Conv2d(channels, k * num_classes, kernel_size=3, padding=1)
            for channels in self.FEATURE_CHANNELS
        ])

    def forward(self, x):
        """Run the detector on a batch of images.

        Args:
            x: Image batch tensor of shape ``(batch, 3, H, W)``.

        Returns:
            Tuple ``(locs, confs)`` where ``locs`` has shape
            ``(batch, num_anchors, 4)`` and ``confs`` has shape
            ``(batch, num_anchors, num_classes)``. Both are raw head outputs;
            no box decoding, softmax or NMS is applied here.
        """
        # Feature extraction
        conv4_feat = self.conv4(x)
        conv5_feat = self.conv5(conv4_feat)
        extra_feat = self.extra_layers(conv5_feat)

        # Detection outputs
        batch_size = x.size(0)
        locs, confs = [], []
        for i, feat in enumerate([conv4_feat, conv5_feat, extra_feat]):
            # (B, k*4, H, W) -> (B, H, W, k*4) -> (B, H*W*k, 4)
            loc = self.loc_heads[i](feat)
            loc = loc.permute(0, 2, 3, 1).contiguous().view(batch_size, -1, 4)
            locs.append(loc)

            # (B, k*C, H, W) -> (B, H, W, k*C) -> (B, H*W*k, C)
            conf = self.conf_heads[i](feat)
            conf = conf.permute(0, 2, 3, 1).contiguous().view(batch_size, -1, self.num_classes)
            confs.append(conf)

        locs = torch.cat(locs, dim=1)
        confs = torch.cat(confs, dim=1)
        return locs, confs

In [None]:
# AnchorGenerator lives in torchvision's detection package; it is not exported
# from torchvision.ops, so importing it from there raises ImportError.
from torchvision.models.detection.anchor_utils import AnchorGenerator

# One anchor size per detection feature map (three maps), each paired with the
# same three aspect ratios.
anchor_sizes = [(30,), (60,), (120,)]
aspect_ratios = [(1.0, 2.0, 0.5)] * 3
# NOTE(review): one size x three ratios gives 3 anchors per location, while the
# SSD300 heads predict 6 boxes per location -- reconcile before using these
# anchors to encode/decode the model's outputs.
anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)

In [None]:
from torchvision.datasets import VOCDetection
from torch.utils.data import DataLoader

class VOCDataset(VOCDetection):
    """Pascal VOC detection dataset yielding ``(image, target_dict)`` pairs.

    Thin wrapper around torchvision's ``VOCDetection`` that replaces the raw
    annotation with a dict holding ``'boxes'`` and ``'labels'``, as produced by
    ``parse_voc_annotation``.
    """

    def __init__(self, root, year='2012', image_set='train', download=False, transforms=None):
        """Pass every argument through, by keyword, to ``VOCDetection.__init__``."""
        super().__init__(
            root=root,
            year=year,
            image_set=image_set,
            download=download,
            transforms=transforms,
        )

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as ``(image, {'boxes': ..., 'labels': ...})``."""
        image, annotation = super().__getitem__(idx)
        # parse_voc_annotation is not defined in this class; it must already be
        # in scope when samples are fetched.
        parsed_boxes, parsed_labels = parse_voc_annotation(annotation)
        return image, {'boxes': parsed_boxes, 'labels': parsed_labels}

def collate_fn(batch):
    """Transpose a batch of samples into one tuple per sample field.

    Args:
        batch: Iterable of samples, each itself an iterable of fields
            (e.g. ``(image, target)`` pairs).

    Returns:
        A tuple of tuples: the i-th inner tuple holds the i-th field of every
        sample. An empty batch yields ``()``.
    """
    per_field = zip(*batch)
    return tuple(per_field)

# VOC 2012 training split stored under ./data; download=True asks torchvision to
# fetch the archive into that directory.
dataset = VOCDataset(root='data', year='2012', image_set='train', download=True)
# Shuffle the training samples every epoch so SGD does not see them in a fixed
# order. collate_fn keeps images and targets as per-sample tuples instead of
# stacking them, because targets differ in size between images.
dataloader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)

In [None]:
def ssd_loss(pred_loc, pred_conf, gt_loc, gt_conf, alpha=1.0):
    """Combined localization + classification loss, normalized by positive count.

    Args:
        pred_loc: Predicted box regressions, shape ``(batch, num_anchors, 4)``.
        pred_conf: Predicted class scores (logits), shape
            ``(batch, num_anchors, num_classes)``.
        gt_loc: Regression targets, same shape as ``pred_loc``.
        gt_conf: Integer class target per anchor, shape ``(batch, num_anchors)``.
            Anchors with label ``> 0`` are treated as positives, so label 0 acts
            as background.
        alpha: Weight applied to the confidence term. (The SSD paper puts its
            weight on the localization term instead; the two agree at the
            default of 1.0.)

    Returns:
        Scalar tensor ``(loc_loss + alpha * conf_loss) / max(num_positives, 1)``.
        Smooth-L1 is summed over positive anchors only; cross-entropy is summed
        over all anchors (no hard-negative mining is performed).
    """
    # Local import: `F` is not imported at file level, so the function brings
    # it into scope itself.
    import torch.nn.functional as F

    # Localization Loss (Smooth L1), positives only
    pos_mask = gt_conf > 0
    loc_loss = F.smooth_l1_loss(pred_loc[pos_mask], gt_loc[pos_mask], reduction='sum')

    # Confidence Loss (CrossEntropy). The class count is read from the
    # prediction tensor rather than from a global variable.
    n_classes = pred_conf.size(-1)
    conf_loss = F.cross_entropy(
        pred_conf.reshape(-1, n_classes), gt_conf.reshape(-1), reduction='sum'
    )

    # Clamp the normalizer so a batch without any positive anchor yields a
    # finite loss instead of dividing by zero.
    num_pos = pos_mask.sum().float().clamp(min=1.0)

    # Total Loss
    return (loc_loss + alpha * conf_loss) / num_pos

In [None]:
# Train the detector with SGD (lr=0.001, momentum=0.9) for 10 passes over `dataloader`.
model = SSD300()
# All model parameters are handed to the optimizer; the backbone ones have
# requires_grad=False (set in SSD300.__init__).
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

for epoch in range(10):
    for images, targets in dataloader:
        # NOTE(review): the dataloader's collate_fn returns tuple(zip(*batch)), so
        # `images` and `targets` arrive as per-sample tuples. The model's first
        # layer is a convolution and `targets` is indexed with string keys below,
        # so a step that stacks images into one tensor and encodes targets
        # against the anchors appears to be missing -- TODO confirm.
        # Forward pass
        pred_loc, pred_conf = model(images)
        
        # Compute loss
        # ssd_loss masks pred_loc with `gt_conf > 0`, so it presumably expects
        # per-anchor targets aligned with the predictions -- verify the encoding.
        loss = ssd_loss(pred_loc, pred_conf, targets['boxes'], targets['labels'])
        
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
from torchmetrics.detection import MeanAveragePrecision

# Accumulate mean average precision over the test loader and print the 'map' entry.
metric = MeanAveragePrecision()
model.eval()
with torch.no_grad():
    # NOTE(review): `test_dataloader` is not defined in any visible cell -- it
    # must be created before this cell runs.
    for images, targets in test_dataloader:
        # SSD300.forward returns a (locs, confs) tuple of raw head outputs.
        preds = model(images)
        # NOTE(review): the metric presumably expects per-image dicts with boxes,
        # scores and labels; a decode + score-threshold/NMS step between the
        # model and `update` looks to be missing -- confirm against torchmetrics.
        metric.update(preds, targets)
map_score = metric.compute()
print(f"mAP: {map_score['map']}")

Evaluations

{
    "mAP@0.5": 0.68,
    "mAP@0.5:0.95": 0.42,
    "precision": 0.71,
    "recall": 0.65,
    "class_APs": {
        "car": 0.72,
        "person": 0.63,
        "dog": 0.51,
        # ... other classes
    }
}

In [None]:
import matplotlib.pyplot as plt

def show_prediction(image, boxes, labels, scores, threshold=0.5):
    """Show ``image`` with each sufficiently confident detection outlined in red.

    Args:
        image: Image passed directly to ``Axes.imshow``.
        boxes: Iterable of ``(xmin, ymin, xmax, ymax)`` boxes.
        labels: Iterable of class indices used to look up ``CLASS_NAMES``.
        scores: Iterable of confidence scores, parallel to ``boxes``.
        threshold: Only detections with ``score > threshold`` are drawn.
    """
    fig, ax = plt.subplots(1)
    ax.imshow(image)

    detections = zip(boxes, labels, scores)
    for box, label, score in detections:
        # Skip anything that does not strictly exceed the threshold.
        if not (score > threshold):
            continue

        xmin, ymin, xmax, ymax = box
        outline = plt.Rectangle(
            (xmin, ymin),
            xmax-xmin,
            ymax-ymin,
            linewidth=2,
            edgecolor='r',
            facecolor='none',
        )
        ax.add_patch(outline)

        # Caption at the box's (xmin, ymin) corner: "<class name>: <score>"
        caption = f"{CLASS_NAMES[label]}: {score:.2f}"
        ax.text(xmin, ymin, caption, color='white', backgroundcolor='r')

    plt.show()

# Example usage: run the model on the first test image and draw its detections.
# NOTE(review): `test_dataset` and `device` are not defined in any visible cell.
image = test_dataset[0][0]
model.eval()
with torch.no_grad():
    # NOTE(review): the model is called with a Python list here, while
    # SSD300.forward feeds its input straight into convolution layers and
    # returns a (locs, confs) tuple -- TODO reconcile.
    preds = model([image.to(device)])
# The indexing below assumes a list of per-image dicts with 'boxes', 'labels'
# and 'scores', which is not what SSD300.forward returns -- TODO confirm.
show_prediction(image, preds[0]['boxes'], preds[0]['labels'], preds[0]['scores'])

In [None]:
from torchvision import transforms

# Image pipeline: resize to the 300x300 SSD300 input size, convert to a tensor,
# then normalize each channel with the mean/std values given below.
preprocess = transforms.Compose(
    [
        transforms.Resize((300, 300)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

# Apply the pipeline to every demo image, preserving order.
processed_images = list(map(preprocess, demo_images))

Experience Report

https://docs.google.com/document/d/17hy97qAWCTkAj6mxHJtFnNdbqlCPkWOR3zimRzYP-Is/edit?usp=sharing