In [10]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from PIL import Image
import torch.optim as optim
import torchvision

# Anchor generator

Your team is developing object detection models based on the Faster R-CNN architecture and using pre-trained backbones. Your task is to create anchor boxes to serve as reference bounding boxes for proposing potential object regions.

You will create 9 standard anchors (3 box sizes and 3 aspect ratios).

* Import AnchorGenerator from torchvision.models.detection.rpn.
* Configure anchor sizes with 3 values: ((32, 64, 128),).
* Configure aspect ratio with 3 values `((0.5, 1.0, 2.0),).
* Instantiate AnchorGenerator with anchor_sizes and aspect_ratios.

In [2]:
# Import AnchorGenerator
from torchvision.models.detection.rpn import AnchorGenerator

# Configure anchor size
anchor_sizes = ((32, 64, 128),)

# Configure aspect ratio
aspect_ratios = ((0.5, 1.0, 2.0),)

# Instantiate AnchorGenerator
rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)

In [3]:
print(rpn_anchor_generator)

AnchorGenerator()


The AnchorGenerator can generate anchor boxes for multiple sizes and for different scales in the image.

# Faster R-CNN model

Your next task is to build a Faster R-CNN model that can detect objects of different sizes in an image. For this task, you will be using a handy class MultiScaleRoIAlign() from torchvision.ops.

* Import MultiScaleRoIAlign from torchvision.ops.
* Instantiate the RoI pooler using MultiScaleRoIAlign with featmap_names set to ["0"], output_size to 7, and sampling_ratio to 2.
* Create the Faster R-CNN model passing it the backbone, num_class for a binary classification, anchor_generator, and roi_pooler.

In [13]:
from torchvision.models.detection import FasterRCNN
# Import MultiScaleRoIAlign
from torchvision.ops import MultiScaleRoIAlign

# Instantiate RoI pooler
roi_pooler = MultiScaleRoIAlign(
    featmap_names = ["0"],
    output_size = 7,
    sampling_ratio = 2,
    )

mobilenet = torchvision.models.mobilenet_v2(weights="DEFAULT")
backbone = nn.Sequential(*list(mobilenet.features.children()))
backbone.out_channels = 1280

# Create Faster R-CNN model
model = FasterRCNN(
    backbone=backbone,
    num_classes=2,
    anchor_generator=rpn_anchor_generator,
    box_roi_pool=roi_pooler,
    )

# Define losses for RPN and R-CNN

You are planning to train an object detection model that utilizes both the RPN and R-CNN components. To be able to train it, you will need to define the loss function for each component.

You remember that the RPN component classifies whether a region contains an object and predicts the bounding box coordinates for the proposed regions.The R-CNN component classifies the object into one of multiple classes while also predicting the final bounding box coordinates.

* Define the RPN classification loss function and assign it to rpn_cls_criterion.
* Define the RPN regression loss function and assign it to rpn_reg_criterion.
* Define the R-CNN classification loss function and assign it to rcnn_cls_criterion.
* Define the R-CNN regression loss function using and assign it to rcnn_reg_criterion.



In [14]:
# Implement the RPN classification loss function
rpn_cls_criterion = nn.BCEWithLogitsLoss()

# Implement the RPN regression loss function
rpn_reg_criterion = nn.MSELoss()

# Implement the R-CNN classification Loss function
rcnn_cls_criterion = nn.CrossEntropyLoss()

# Implement the R-CNN regression loss function
rcnn_reg_criterion = nn.MSELoss()

Using the correct loss functions is crucial for training machine learning models effectively and it ensures that the model updates its parameters correctly.

**Prepare A Dataset**

First, create a custom dataset class that can load images and annotations. Assuming you have annotations for each image, you'll need to parse these annotations and return them along with the images.

In [21]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import ToTensor
from PIL import Image

class CustomDataset(Dataset):
    def __init__(self, root_dir, transforms=None):
        self.root_dir = root_dir
        self.transforms = transforms
        self.image_paths = [os.path.join(root_dir, img) for img in os.listdir(root_dir)]
        # Assuming you have a way to get annotations for each image
        self.annotations = self.load_annotations()

    def load_annotations(self):
        # Implement this function to load annotations for your dataset
        # This should return a dictionary where keys are image filenames and values are annotations
        annotations = {}
        # Example: annotations['image1.jpg'] = {'boxes': [[x1, y1, x2, y2], ...], 'labels': [1, ...]}
        return annotations

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        image = Image.open(img_path).convert("RGB")
        if self.transforms:
            image = self.transforms(image)
        annotation = self.annotations[os.path.basename(img_path)]
        boxes = torch.tensor(annotation['boxes'], dtype=torch.float32)
        labels = torch.tensor(annotation['labels'], dtype=torch.int64)
        target = {'boxes': boxes, 'labels': labels}
        return image, target

# Example usage:
transform = ToTensor()
dataset = CustomDataset(root_dir='/kaggle/input/cats-and-dogs-image-classification/train/dogs', transforms=transform)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=lambda x: tuple(zip(*x)))


**Use the DataLoader to load your dataset in batches**

In [22]:
dataloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=lambda x: tuple(zip(*x)))


In [24]:
import json

class CustomDataset(Dataset):
    def __init__(self, root_dir, annotation_file, transforms=None):
        self.root_dir = root_dir
        self.transforms = transforms
        self.image_paths = [os.path.join(root_dir, img) for img in os.listdir(root_dir)]
        self.annotations = self.load_annotations(annotation_file)

    def load_annotations(self, annotation_file):
        with open(annotation_file) as f:
            annotations = json.load(f)
        return annotations

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        image = Image.open(img_path).convert("RGB")
        if self.transforms:
            image = self.transforms(image)
        filename = os.path.basename(img_path)
        if filename in self.annotations:
            annotation = self.annotations[filename]
            boxes = torch.tensor(annotation['boxes'], dtype=torch.float32)
            labels = torch.tensor(annotation['labels'], dtype=torch.int64)
            target = {'boxes': boxes, 'labels': labels}
            return image, target
        else:
            # Handle missing annotations
            print(f"No annotation for image {filename}")
            return None

# Example usage:
annotation_file = '/kaggle/input/cats-and-dogs-image-classification/annotations.json'
dataset = CustomDataset(root_dir='/kaggle/input/cats-and-dogs-image-classification/train/dogs', annotation_file=annotation_file, transforms=ToTensor())


FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/cats-and-dogs-image-classification/annotations.json'

In [23]:
# Define the loss functions
rpn_cls_criterion = nn.BCEWithLogitsLoss()
rpn_reg_criterion = nn.MSELoss()
rcnn_cls_criterion = nn.CrossEntropyLoss()
rcnn_reg_criterion = nn.MSELoss()

# Define the model components as previously done
anchor_sizes = ((32, 64, 128),)
aspect_ratios = ((0.5, 1.0, 2.0),)
rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)

roi_pooler = MultiScaleRoIAlign(featmap_names=["0"], output_size=7, sampling_ratio=2)

mobilenet = torchvision.models.mobilenet_v2(weights="DEFAULT")
backbone = nn.Sequential(*list(mobilenet.features.children()))
backbone.out_channels = 1280

model = FasterRCNN(backbone=backbone, num_classes=2, anchor_generator=rpn_anchor_generator, box_roi_pool=roi_pooler)

# Define optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Move the model to the GPU if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    for images, targets in dataloader:
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

    print(f"Epoch: {epoch}, Loss: {losses.item()}")

KeyError: 'dog_596.jpg'

# **Faster R-CNN in PyTorch**

In [15]:
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights = "DEFAULT")

Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to /root/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth
100%|██████████| 160M/160M [00:01<00:00, 129MB/s]  


In [18]:
# Define number of classes and classife input size

num_classes = 2
in_features = model.roi_heads.box_predictor.cls_score.in_features

Replace the model's classifer with a one with desired number of classes

In [19]:
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

In [20]:
print(model.roi_heads.box_predictor)

FastRCNNPredictor(
  (cls_score): Linear(in_features=1024, out_features=2, bias=True)
  (bbox_pred): Linear(in_features=1024, out_features=8, bias=True)
)
