In [None]:
pip install numpy opencv-python matplotlib



In [None]:
import torchvision.models as models

# Load a VGG-16 model pre-trained on ImageNet.
# `weights=` replaces the deprecated `pretrained=True` flag; IMAGENET1K_V1 is the
# vgg16-397923af.pth checkpoint that `pretrained=True` used to select.
vgg16 = models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_V1)

Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /root/.cache/torch/hub/checkpoints/vgg16-397923af.pth
100%|██████████| 528M/528M [00:07<00:00, 73.8MB/s]


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class RPN(nn.Module):
    """Region Proposal Network head applied to a backbone feature map.

    A 3x3 convolution followed by ReLU feeds two sibling 1x1 convolutions:
    one emits 2 objectness scores per anchor, the other 4 box-regression
    values per anchor, at every spatial location of the input map.

    Args:
        in_channels: Number of channels of the incoming feature map.
        num_anchors: Anchors predicted per spatial location (default 9).
        mid_channels: Output channels of the shared 3x3 convolution
            (default 512).
    """

    def __init__(self, in_channels, num_anchors=9, mid_channels=512):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, mid_channels, kernel_size=3, stride=1, padding=1)
        # num_anchors * 2 (object / not object)
        self.cls_layer = nn.Conv2d(mid_channels, num_anchors * 2, kernel_size=1, stride=1)
        # num_anchors * 4 (box coordinates)
        self.reg_layer = nn.Conv2d(mid_channels, num_anchors * 4, kernel_size=1, stride=1)

    def forward(self, x):
        """Return ``(cls_output, reg_output)`` for feature map ``x``.

        Both outputs keep the spatial size of ``x`` (3x3 conv with padding 1,
        then 1x1 convs); their channel counts are ``num_anchors * 2`` and
        ``num_anchors * 4`` respectively.
        """
        # torch.relu is used rather than the `F` alias: a later cell rebinds `F`
        # to torchvision.transforms.functional, which has no `relu`.
        x = torch.relu(self.conv(x))
        cls_output = self.cls_layer(x)  # Objectness scores
        reg_output = self.reg_layer(x)  # Bounding box coordinates
        return cls_output, reg_output

The RPN is a small network that predicts region proposals.




Fast R-CNN takes the region proposals from the RPN and classifies them.



In [None]:
class FastRCNN(nn.Module):
    """Fast R-CNN detection head: two fully connected layers and two output layers.

    Args:
        num_classes: Number of classes scored by ``cls_layer``; ``reg_layer``
            emits ``num_classes * 4`` box values.
        in_features: Flattened size of each input sample (default
            ``512 * 7 * 7``).
        hidden_features: Width of both hidden layers (default 4096).
    """

    def __init__(self, num_classes, in_features=512 * 7 * 7, hidden_features=4096):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.fc2 = nn.Linear(hidden_features, hidden_features)
        self.cls_layer = nn.Linear(hidden_features, num_classes)
        self.reg_layer = nn.Linear(hidden_features, num_classes * 4)

    def forward(self, x):
        """Return ``(cls_output, reg_output)`` with shapes
        ``(batch, num_classes)`` and ``(batch, num_classes * 4)``."""
        # flatten() also accepts non-contiguous inputs, on which .view() raises.
        x = torch.flatten(x, start_dim=1)
        # torch.relu is used rather than the `F` alias: a later cell rebinds `F`
        # to torchvision.transforms.functional, which has no `relu`.
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        cls_output = self.cls_layer(x)
        reg_output = self.reg_layer(x)
        return cls_output, reg_output

Now, let’s combine the RPN and Fast R-CNN into a single Faster R-CNN model.



In [None]:
class FasterRCNN(nn.Module):
    """Toy Faster R-CNN wiring: backbone -> RPN -> Fast R-CNN head.

    This is a simplified teaching model, not a full detector: no anchors are
    decoded and no per-proposal RoI pooling is performed. The whole backbone
    feature map is pooled to a fixed grid and handed to the head as a single
    region per image.

    Args:
        backbone: Module mapping an image batch to a feature map. It must
            produce 512 channels, because both the RPN and the head are sized
            for that.
        num_classes: Number of classes scored by the Fast R-CNN head.
    """

    # Spatial size the head expects: FastRCNN's first layer takes 512 * 7 * 7 inputs.
    ROI_OUTPUT_SIZE = 7

    def __init__(self, backbone, num_classes):
        super().__init__()
        self.backbone = backbone
        self.rpn = RPN(in_channels=512)
        self.fast_rcnn = FastRCNN(num_classes=num_classes)

    def forward(self, x):
        """Return the head's ``(cls_output, reg_output)`` for image batch ``x``."""
        features = self.backbone(x)
        # The RPN outputs are computed but not decoded into proposals in this
        # simplified model.
        cls_scores, reg_coords = self.rpn(features)
        rois = self._generate_rois(reg_coords, features)
        cls_output, reg_output = self.fast_rcnn(rois)
        return cls_output, reg_output

    def _generate_rois(self, reg_coords, features=None):
        """Produce the fixed-size input for the Fast R-CNN head.

        Feeding the raw RPN regression map (36 channels, arbitrary H x W) to
        the head cannot work: its first linear layer needs exactly
        512 * 7 * 7 values per sample. When ``features`` is given, the
        backbone map is max-pooled to 7 x 7 instead, i.e. the whole image is
        treated as one region. This is a stand-in for real RoI pooling.

        Args:
            reg_coords: RPN box-regression output (unused when ``features``
                is provided).
            features: Backbone feature map. If omitted, ``reg_coords`` is
                returned unchanged.
        """
        if features is None:
            return reg_coords
        return nn.functional.adaptive_max_pool2d(features, self.ROI_OUTPUT_SIZE)

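Training a Faster R-CNN on PASCAL VOC 2012. Note: from here on the notebook uses torchvision's built-in `FasterRCNN` (imported in the next cell), which shadows the hand-written `FasterRCNN` class defined above.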

In [None]:
from torch.utils.data import DataLoader
import torch
import torchvision
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.transforms import functional as F
from torch.utils.data import DataLoader
from torchvision.datasets import VOCDetection
from torchvision.transforms import Compose, ToTensor, Resize
import os


In [None]:
# Define transforms for the dataset
def get_transform(train):
    """Build the image preprocessing pipeline.

    Args:
        train: When truthy, a fixed 600x600 resize is appended after the
            tensor conversion.

    Returns:
        A ``Compose`` wrapping the selected transforms.
    """
    # Tensor conversion always comes first; the resize is training-only.
    steps = [ToTensor()]
    if train:
        steps += [Resize((600, 600))]
    return Compose(steps)

# Load the PASCAL VOC dataset
def get_voc_dataset(root, year="2012", image_set="train", download=True):
    """Create a ``VOCDetection`` dataset stored under ``<root>/VOC<year>``.

    Args:
        root: Base directory for the data.
        year: VOC release year, passed through to ``VOCDetection``.
        image_set: Split name; the training-only resize from
            ``get_transform`` is enabled only when this equals ``"train"``.
        download: Passed through to ``VOCDetection``.

    Returns:
        The constructed ``VOCDetection`` dataset.
    """
    # Define the path to the dataset
    dataset_path = os.path.join(root, f"VOC{year}")

    # Load the dataset
    dataset = VOCDetection(
        root=dataset_path,
        year=year,
        image_set=image_set,
        download=download,
        # `transform` is the image-only hook. The pipeline from get_transform
        # takes a single image, so passing it as `transforms` (which is called
        # with both image and target) raises a TypeError on the first sample.
        transform=get_transform(train=(image_set == "train")),
    )
    return dataset

# Example usage
root = "./data"  # Directory where the dataset will be stored
train_dataset = get_voc_dataset(root, year="2012", image_set="train", download=True)
# The trainval archive fetched by the call above already contains the "val" split,
# so skip the second download check and re-extraction of the 2 GB tar.
val_dataset = get_voc_dataset(root, year="2012", image_set="val", download=False)

Downloading http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar to ./data/VOC2012/VOCtrainval_11-May-2012.tar


100%|██████████| 2.00G/2.00G [00:51<00:00, 39.0MB/s]


Extracting ./data/VOC2012/VOCtrainval_11-May-2012.tar to ./data/VOC2012
Using downloaded and verified file: ./data/VOC2012/VOCtrainval_11-May-2012.tar
Extracting ./data/VOC2012/VOCtrainval_11-May-2012.tar to ./data/VOC2012


In [None]:
def collate_fn(batch):
    """
    Collate function for object detection datasets.

    Images in a batch may differ in height/width (e.g. when no resize is
    applied), which a plain ``torch.stack`` rejects. Each image is therefore
    zero-padded on the bottom and right up to the largest height and width in
    the batch. Padding only on those sides leaves the top-left origin, and
    hence any box coordinates, unchanged. Same-sized images are stacked as-is.

    Args:
        batch: A list of tuples (image, target) from the dataset, where each
            image is a tensor whose last two dimensions are (H, W).
    Returns:
        images: A tensor of shape [batch_size, C, H, W].
        targets: A list with the per-sample targets, in batch order.
    """
    images = [item[0] for item in batch]  # Extract images
    targets = [item[1] for item in batch]  # Extract targets

    # Largest spatial extent in the batch (0 for an empty batch, which then
    # fails in torch.stack below).
    max_h = max((img.shape[-2] for img in images), default=0)
    max_w = max((img.shape[-1] for img in images), default=0)

    # pad takes (left, right, top, bottom) for the last two dimensions.
    padded = [
        torch.nn.functional.pad(img, (0, max_w - img.shape[-1], 0, max_h - img.shape[-2]))
        for img in images
    ]

    # Stack images into a single tensor
    images = torch.stack(padded, dim=0)

    return images, targets

In [None]:
# Create DataLoader for training and validation.
# Both loaders share every setting except shuffling, which is on for training only.
train_dataloader, val_dataloader = (
    DataLoader(dataset, batch_size=4, shuffle=shuffle, num_workers=4, collate_fn=collate_fn)
    for dataset, shuffle in ((train_dataset, True), (val_dataset, False))
)

In [None]:
import torch
import torchvision
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.transforms import functional as F
from torch.utils.data import DataLoader
from torchvision.datasets import VOCDetection
from torchvision.transforms import Compose, ToTensor, Resize
import os
import xml.etree.ElementTree as ET

# Define transforms
def get_transform(train):
    """
    Build the transform pipeline applied to the image only (not the target).

    ``ToTensor`` is always applied; ``Resize((600, 600))`` follows it only
    when ``train`` is truthy. The steps are returned wrapped in ``Compose``.
    """
    pipeline = [ToTensor()]  # Convert PIL image to tensor
    if train:
        pipeline.append(Resize((600, 600)))  # Fixed-size resize for training
    return Compose(pipeline)

# Parse XML annotations
def parse_voc_xml(xml_file):
    """Read the annotated objects from a PASCAL VOC annotation XML.

    Args:
        xml_file: Passed straight to ``xml.etree.ElementTree.parse``.

    Returns:
        A ``(boxes, labels)`` tuple. ``boxes`` is a list of
        ``[xmin, ymin, xmax, ymax]`` float lists and ``labels`` the matching
        ``<name>`` texts, both in document order.
    """
    corner_tags = ("xmin", "ymin", "xmax", "ymax")
    boxes, labels = [], []

    for obj in ET.parse(xml_file).getroot().findall("object"):
        # Class label first, then the four corner values of its <bndbox>.
        labels.append(obj.find("name").text)
        bndbox = obj.find("bndbox")
        boxes.append([float(bndbox.find(tag).text) for tag in corner_tags])

    return boxes, labels

# Custom dataset class
class VOCDataset(torch.utils.data.Dataset):
    """PASCAL VOC detection dataset yielding ``(image, target)`` pairs.

    ``target`` is a dict with ``"boxes"`` (float32, shape ``[N, 4]`` as
    ``[xmin, ymin, xmax, ymax]``) and ``"labels"`` (int64, shape ``[N]``).
    Labels are 1-based indices into ``self.classes``; index 0 is left free
    for the background class of the detection model.

    Args:
        root: Directory handed to ``VOCDetection``.
        year: VOC release year.
        image_set: Split name (e.g. ``"train"`` or ``"val"``).
        transforms: Optional callable applied to the image only.
    """

    def __init__(self, root, year="2012", image_set="train", transforms=None):
        self.voc = VOCDetection(root, year=year, image_set=image_set, download=True)
        self.transforms = transforms
        self.classes = [
            "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
            "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
            "pottedplant", "sheep", "sofa", "train", "tvmonitor"
        ]

    def __len__(self):
        return len(self.voc)

    @staticmethod
    def _boxes_and_labels(annotation):
        """Collect ``[xmin, ymin, xmax, ymax]`` boxes and class names from a parsed annotation dict."""
        boxes, names = [], []
        for obj in annotation.get("object", []):
            names.append(obj["name"])
            bndbox = obj["bndbox"]
            boxes.append([float(bndbox[key]) for key in ("xmin", "ymin", "xmax", "ymax")])
        return boxes, names

    @staticmethod
    def _width_height(image):
        """Return ``(width, height)`` of a tensor (``[..., H, W]``) or of an object exposing ``.size``."""
        if isinstance(image, torch.Tensor):
            return image.shape[-1], image.shape[-2]
        return tuple(image.size)  # PIL-style (width, height)

    def __getitem__(self, idx):
        # Load image and target
        img, target = self.voc[idx]

        # The wrapped dataset already delivers the annotation as a nested dict,
        # so read it directly; handing it to an XML file parser raises TypeError.
        boxes, names = self._boxes_and_labels(target["annotation"])

        # reshape keeps the [N, 4] layout even when there are no objects.
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        # +1 so that no real class is encoded as 0 (reserved for background).
        labels = torch.as_tensor(
            [self.classes.index(name) + 1 for name in names], dtype=torch.int64
        )

        # Apply transforms to the image only, remembering its original size.
        orig_w, orig_h = self._width_height(img)
        if self.transforms is not None:
            img = self.transforms(img)
        new_w, new_h = self._width_height(img)

        # If the transform resized the image, scale the boxes by the same
        # factors so they still enclose the objects.
        if (new_w, new_h) != (orig_w, orig_h):
            scale_x, scale_y = new_w / orig_w, new_h / orig_h
            boxes = boxes * torch.tensor([scale_x, scale_y, scale_x, scale_y], dtype=boxes.dtype)

        return img, {"boxes": boxes, "labels": labels}

# Custom collate function
def collate_fn(batch):
    """Batch ``(image, target)`` samples for detection.

    Images of different height/width cannot be combined by ``torch.stack``,
    so each one is zero-padded on the bottom and right up to the largest
    height and width in the batch. Padding only those sides keeps box
    coordinates valid. Same-sized images are stacked unchanged.

    Args:
        batch: List of ``(image, target)`` tuples; each image is a tensor
            whose last two dimensions are (H, W).

    Returns:
        ``(images, targets)``: a ``[batch_size, C, H, W]`` tensor and the
        list of per-sample targets in batch order.
    """
    images = [item[0] for item in batch]
    targets = [item[1] for item in batch]

    max_h = max((img.shape[-2] for img in images), default=0)
    max_w = max((img.shape[-1] for img in images), default=0)

    # pad takes (left, right, top, bottom) for the last two dimensions.
    padded = [
        torch.nn.functional.pad(img, (0, max_w - img.shape[-1], 0, max_h - img.shape[-2]))
        for img in images
    ]
    images = torch.stack(padded, dim=0)
    return images, targets

# Load datasets: one VOCDataset per split; only the "train" split gets the
# training variant of the transform pipeline.
root = "./data"
train_dataset, val_dataset = (
    VOCDataset(
        root,
        year="2012",
        image_set=split,
        transforms=get_transform(train=(split == "train")),
    )
    for split in ("train", "val")
)

# Create DataLoaders: identical settings except that only training shuffles.
train_dataloader, val_dataloader = (
    DataLoader(dataset, batch_size=4, shuffle=shuffle, num_workers=4, collate_fn=collate_fn)
    for dataset, shuffle in ((train_dataset, True), (val_dataset, False))
)

# Define Faster R-CNN model
# Only the convolutional part of VGG-16 is used as the backbone. `weights=`
# replaces the deprecated `pretrained=True`; IMAGENET1K_V1 is the same checkpoint.
backbone = torchvision.models.vgg16(
    weights=torchvision.models.VGG16_Weights.IMAGENET1K_V1
).features
# FasterRCNN reads `out_channels` from the backbone to size its heads.
backbone.out_channels = 512
# One tuple of sizes / aspect ratios because the backbone yields a single feature map.
anchor_generator = AnchorGenerator(
    sizes=((32, 64, 128, 256, 512),),
    aspect_ratios=((0.5, 1.0, 2.0),)
)

model = FasterRCNN(
    backbone,
    num_classes=21,  # 20 classes + background
    rpn_anchor_generator=anchor_generator
)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0

    for images, targets in train_dataloader:
        # Put the batch on the same device as the model.
        images = images.to(device)
        targets = [
            {name: tensor.to(device) for name, tensor in target.items()}
            for target in targets
        ]

        # Clear stale gradients, then run the forward pass; the model is
        # called with targets and its returned loss terms are summed.
        optimizer.zero_grad()
        loss_dict = model(images, targets)
        losses = sum(loss_dict.values())

        # Backpropagate and update the weights.
        losses.backward()
        optimizer.step()

        # Running total for the per-epoch average below.
        epoch_loss += losses.item()

    # Print the average loss for the epoch
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss/len(train_dataloader):.4f}")

# Validation loop
# torchvision detection models return the loss dict only in training mode; in
# eval mode the same call returns a list of detections, so `loss_dict.values()`
# would fail. The model is therefore kept in training mode for this pass, while
# torch.no_grad() guarantees that nothing is back-propagated or updated.
# NOTE(review): training mode also enables dropout / batch-norm statistics updates
# in models that contain such layers — confirm this is acceptable for other backbones.
model.train()
val_loss = 0.0

with torch.no_grad():
    for images, targets in val_dataloader:
        # Move images and targets to the device
        images = images.to(device)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        # Forward pass
        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        # Accumulate loss for logging
        val_loss += losses.item()

# Switch to inference mode once the loss pass is done.
model.eval()

# Print the average validation loss
print(f"Validation Loss: {val_loss/len(val_dataloader):.4f}")

Downloading http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar to ./data/VOCtrainval_11-May-2012.tar


100%|██████████| 2.00G/2.00G [03:11<00:00, 10.4MB/s]


Extracting ./data/VOCtrainval_11-May-2012.tar to ./data
Using downloaded and verified file: ./data/VOCtrainval_11-May-2012.tar
Extracting ./data/VOCtrainval_11-May-2012.tar to ./data


TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/_utils/worker.py", line 351, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
           ^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/_utils/fetch.py", line 52, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
            ~~~~~~~~~~~~^^^^^
  File "<ipython-input-26-b634fc37801e>", line 64, in __getitem__
    boxes, labels = parse_voc_xml(target["annotation"])
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<ipython-input-26-b634fc37801e>", line 25, in parse_voc_xml
    tree = ET.parse(xml_file)
           ^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/xml/etree/ElementTree.py", line 1219, in parse
    tree.parse(source, parser)
  File "/usr/lib/python3.11/xml/etree/ElementTree.py", line 570, in parse
    source = open(source, "rb")
             ^^^^^^^^^^^^^^^^^^
TypeError: expected str, bytes or os.PathLike object, not dict


In [None]:


# Define transforms
def get_transform(train):
    """
    Compose the image-only transforms (the target is not touched here).

    Args:
        train: If truthy, ``Resize((600, 600))`` is added after ``ToTensor``.

    Returns:
        The transforms wrapped in ``Compose``.
    """
    to_tensor = ToTensor()  # Convert PIL image to tensor
    if not train:
        return Compose([to_tensor])
    # Training additionally resizes every image to 600x600.
    return Compose([to_tensor, Resize((600, 600))])

# Extract bounding boxes and labels from the target dictionary
def extract_boxes_and_labels(target):
    """
    Pull bounding boxes and class names out of a VOC target dictionary.

    Args:
        target: Dict whose ``["annotation"]["object"]`` entry is an iterable
            of objects, each with a ``"name"`` and a ``"bndbox"`` mapping
            holding ``xmin``/``ymin``/``xmax``/``ymax``.

    Returns:
        ``(boxes, labels)``: a list of ``[xmin, ymin, xmax, ymax]`` float
        lists and the list of names, in the same order as the objects.
    """
    corner_keys = ("xmin", "ymin", "xmax", "ymax")
    boxes, labels = [], []

    for entry in target["annotation"]["object"]:
        # Class name first, then the four corners converted to float.
        labels.append(entry["name"])
        corners = entry["bndbox"]
        boxes.append([float(corners[key]) for key in corner_keys])

    return boxes, labels

# Custom dataset class
class VOCDataset(torch.utils.data.Dataset):
    """PASCAL VOC detection dataset yielding ``(image, target)`` pairs.

    ``target`` is a dict with ``"boxes"`` (float32, shape ``[N, 4]`` as
    ``[xmin, ymin, xmax, ymax]``) and ``"labels"`` (int64, shape ``[N]``).
    Labels are 1-based indices into ``self.classes``; index 0 is left free
    for the background class of the detection model.

    Args:
        root: Directory handed to ``VOCDetection``.
        year: VOC release year.
        image_set: Split name (e.g. ``"train"`` or ``"val"``).
        transforms: Optional callable applied to the image only.
    """

    def __init__(self, root, year="2012", image_set="train", transforms=None):
        self.voc = VOCDetection(root, year=year, image_set=image_set, download=True)
        self.transforms = transforms
        self.classes = [
            "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
            "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
            "pottedplant", "sheep", "sofa", "train", "tvmonitor"
        ]

    def __len__(self):
        return len(self.voc)

    @staticmethod
    def _width_height(image):
        """Return ``(width, height)`` of a tensor (``[..., H, W]``) or of an object exposing ``.size``."""
        if isinstance(image, torch.Tensor):
            return image.shape[-1], image.shape[-2]
        return tuple(image.size)  # PIL-style (width, height)

    def __getitem__(self, idx):
        # Load image and target
        img, target = self.voc[idx]

        # Extract bounding boxes and labels from the target dictionary
        boxes, labels = extract_boxes_and_labels(target)

        # reshape keeps the [N, 4] layout even when there are no objects.
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        # +1 so that no real class is encoded as 0 (reserved for background).
        labels = torch.as_tensor(
            [self.classes.index(label) + 1 for label in labels], dtype=torch.int64
        )

        # Apply transforms to the image only, remembering its original size.
        orig_w, orig_h = self._width_height(img)
        if self.transforms is not None:
            img = self.transforms(img)
        new_w, new_h = self._width_height(img)

        # If the transform resized the image, scale the boxes by the same
        # factors so they still enclose the objects.
        if (new_w, new_h) != (orig_w, orig_h):
            scale_x, scale_y = new_w / orig_w, new_h / orig_h
            boxes = boxes * torch.tensor([scale_x, scale_y, scale_x, scale_y], dtype=boxes.dtype)

        return img, {"boxes": boxes, "labels": labels}

# Custom collate function
def collate_fn(batch):
    """Batch ``(image, target)`` samples for detection.

    Images of different height/width cannot be combined by ``torch.stack``,
    so each one is zero-padded on the bottom and right up to the largest
    height and width in the batch. Padding only those sides keeps box
    coordinates valid. Same-sized images are stacked unchanged.

    Args:
        batch: List of ``(image, target)`` tuples; each image is a tensor
            whose last two dimensions are (H, W).

    Returns:
        ``(images, targets)``: a ``[batch_size, C, H, W]`` tensor and the
        list of per-sample targets in batch order.
    """
    images = [item[0] for item in batch]
    targets = [item[1] for item in batch]

    max_h = max((img.shape[-2] for img in images), default=0)
    max_w = max((img.shape[-1] for img in images), default=0)

    # pad takes (left, right, top, bottom) for the last two dimensions.
    padded = [
        torch.nn.functional.pad(img, (0, max_w - img.shape[-1], 0, max_h - img.shape[-2]))
        for img in images
    ]
    images = torch.stack(padded, dim=0)
    return images, targets

# Load datasets: one VOCDataset per split; only the "train" split gets the
# training variant of the transform pipeline.
root = "./data"
train_dataset, val_dataset = (
    VOCDataset(
        root,
        year="2012",
        image_set=split,
        transforms=get_transform(train=(split == "train")),
    )
    for split in ("train", "val")
)

# Create DataLoaders: identical settings except that only training shuffles.
train_dataloader, val_dataloader = (
    DataLoader(dataset, batch_size=4, shuffle=shuffle, num_workers=4, collate_fn=collate_fn)
    for dataset, shuffle in ((train_dataset, True), (val_dataset, False))
)

# Define Faster R-CNN model
# Only the convolutional part of VGG-16 is used as the backbone. `weights=`
# replaces the deprecated `pretrained=True`; IMAGENET1K_V1 is the same checkpoint.
backbone = torchvision.models.vgg16(
    weights=torchvision.models.VGG16_Weights.IMAGENET1K_V1
).features
# FasterRCNN reads `out_channels` from the backbone to size its heads.
backbone.out_channels = 512
# One tuple of sizes / aspect ratios because the backbone yields a single feature map.
anchor_generator = AnchorGenerator(
    sizes=((32, 64, 128, 256, 512),),
    aspect_ratios=((0.5, 1.0, 2.0),)
)

model = FasterRCNN(
    backbone,
    num_classes=21,  # 20 classes + background
    rpn_anchor_generator=anchor_generator
)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0

    for images, targets in train_dataloader:
        # Put the batch on the same device as the model.
        images = images.to(device)
        targets = [
            {name: tensor.to(device) for name, tensor in target.items()}
            for target in targets
        ]

        # Clear stale gradients, then run the forward pass; the model is
        # called with targets and its returned loss terms are summed.
        optimizer.zero_grad()
        loss_dict = model(images, targets)
        losses = sum(loss_dict.values())

        # Backpropagate and update the weights.
        losses.backward()
        optimizer.step()

        # Running total for the per-epoch average below.
        epoch_loss += losses.item()

    # Print the average loss for the epoch
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss/len(train_dataloader):.4f}")

# Validation loop
# torchvision detection models return the loss dict only in training mode; in
# eval mode the same call returns a list of detections, so `loss_dict.values()`
# would fail. The model is therefore kept in training mode for this pass, while
# torch.no_grad() guarantees that nothing is back-propagated or updated.
# NOTE(review): training mode also enables dropout / batch-norm statistics updates
# in models that contain such layers — confirm this is acceptable for other backbones.
model.train()
val_loss = 0.0

with torch.no_grad():
    for images, targets in val_dataloader:
        # Move images and targets to the device
        images = images.to(device)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        # Forward pass
        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        # Accumulate loss for logging
        val_loss += losses.item()

# Switch to inference mode once the loss pass is done.
model.eval()

# Print the average validation loss
print(f"Validation Loss: {val_loss/len(val_dataloader):.4f}")