pip install git+https://github.com/gautamchitnis/cocoapi.git@cocodataset-master#subdirectory=PythonAPI

Set-ExecutionPolicy -Scope Process -ExecutionPolicy Bypass

.\packages\Scripts\Activate.ps1

In [5]:
import torch
import torchvision
from torch import nn
import numpy as np
import matplotlib.pyplot as plt
import os
import requests
from PIL import Image
from torchvision import datasets, transforms, utils
import pickle
import torch.optim as optim
from tqdm import tqdm
from torchinfo import summary
import cv2
from torchvision.io import read_image
from pathlib import Path
from torch.utils.data import DataLoader
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.datasets import CocoDetection
from torchvision.transforms import functional as F
from collections import Counter
from sklearn.model_selection import train_test_split
from torchvision.datasets.utils import download_and_extract_archive
from torchvision.datasets import VisionDataset
import json


### Model

In [2]:
# Faster R-CNN detector with a ResNet-50 + FPN backbone, created with pretrained weights.
# NOTE(review): newer torchvision releases appear to favour `weights=` over
# `pretrained=` — confirm against the installed version before switching.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

# Per-layer overview of the network for a dummy batch.
summary(
    model,
    # input_size is (batch_size, color_channels, height, width)
    input_size=(16, 3, 224, 224),
    # swap for ["input_size"] to get a smaller table
    col_names=["input_size", "output_size", "num_params", "trainable"],
    col_width=20,
    row_settings=["var_names"],
)



Layer (type (var_name))                                 Input Shape          Output Shape         Param #              Trainable
FasterRCNN (FasterRCNN)                                 [16, 3, 224, 224]    [0, 4]               --                   Partial
├─GeneralizedRCNNTransform (transform)                  [16, 3, 224, 224]    [16, 3, 800, 800]    --                   --
├─BackboneWithFPN (backbone)                            [16, 3, 800, 800]    [16, 256, 13, 13]    --                   Partial
│    └─IntermediateLayerGetter (body)                   [16, 3, 800, 800]    [16, 2048, 25, 25]   --                   Partial
│    │    └─Conv2d (conv1)                              [16, 3, 800, 800]    [16, 64, 400, 400]   (9,408)              False
│    │    └─FrozenBatchNorm2d (bn1)                     [16, 64, 400, 400]   [16, 64, 400, 400]   --                   --
│    │    └─ReLU (relu)                                 [16, 64, 400, 400]   [16, 64, 400, 400]   --                   --

### 1. Download and prepare data
You'll need a labeled dataset for object detection, for example in COCO-style or VOC-style format.
The Penn-Fudan dataset contains images of pedestrians together with per-instance segmentation masks; the bounding boxes used for object detection are derived from those masks in the dataset class below.

In [6]:
# Download the dataset (skipped when a previous run already extracted it, so
# Restart & Run All neither re-extracts the archive nor needs network access).
url = "https://www.cis.upenn.edu/~jshi/ped_html/PennFudanPed.zip"
root = "./PennFudanPed"

# The dataset is read from <root>/PennFudanPed/{PNGImages,PedMasks}; when both
# folders are already present the archive is not fetched or unpacked again.
# Delete <root> to force a fresh download.
extracted_dir = os.path.join(root, "PennFudanPed")
already_extracted = all(
    os.path.isdir(os.path.join(extracted_dir, subdir))
    for subdir in ("PNGImages", "PedMasks")
)
if not already_extracted:
    download_and_extract_archive(url, root)

# Define the dataset class
class PennFudanDataset(VisionDataset):
    """Penn-Fudan pedestrian images served as ``(image, target)`` detection samples.

    Images are read from ``<root>/PNGImages`` and instance masks from
    ``<root>/PedMasks``. Both directory listings are sorted and paired by
    position, so the two folders are expected to contain matching files.

    Args:
        root: Directory that contains the ``PNGImages`` and ``PedMasks`` folders.
        transforms: Optional callable applied to the PIL image only. The target
            is returned unchanged, so a geometric transform (flip, resize, crop)
            would leave the boxes and masks out of sync with the image.
    """

    def __init__(self, root, transforms=None):
        super().__init__(root)
        self.root = root
        self.transforms = transforms
        # Sorted so that image i and mask i refer to the same scene.
        self.imgs = sorted(os.listdir(os.path.join(root, "PNGImages")))
        self.masks = sorted(os.listdir(os.path.join(root, "PedMasks")))

    def __getitem__(self, idx):
        """Return the image at ``idx`` and its detection target.

        Returns:
            A tuple ``(img, target)``. ``img`` is the RGB PIL image, or the
            output of ``transforms`` when one was given. ``target`` is a dict with:

            * ``"boxes"``: float32 tensor of shape ``(num_objs, 4)`` holding
              ``[xmin, ymin, xmax, ymax]`` per instance.
            * ``"labels"``: int64 tensor of ones, one entry per instance.
            * ``"masks"``: uint8 tensor with one binary mask per instance.
        """
        img_path = os.path.join(self.root, "PNGImages", self.imgs[idx])
        mask_path = os.path.join(self.root, "PedMasks", self.masks[idx])
        img = Image.open(img_path).convert("RGB")
        mask = np.array(Image.open(mask_path))

        # The mask is instance-encoded: 0 is background and every other distinct
        # value identifies one pedestrian. Drop the background id explicitly
        # rather than by position, so a mask without any background pixel does
        # not lose a real instance.
        obj_ids = np.unique(mask)
        obj_ids = obj_ids[obj_ids != 0]
        num_objs = len(obj_ids)

        # Broadcast into one boolean mask per instance.
        masks = mask == obj_ids[:, None, None]

        # Tight bounding box around the pixels of each instance.
        boxes = []
        for i in range(num_objs):
            pos = np.where(masks[i])
            xmin = np.min(pos[1])
            xmax = np.max(pos[1])
            ymin = np.min(pos[0])
            ymax = np.max(pos[0])
            boxes.append([xmin, ymin, xmax, ymax])

        # reshape keeps the (N, 4) layout even when the image has no objects;
        # without it an empty list would yield a tensor of shape (0,).
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        labels = torch.ones((num_objs,), dtype=torch.int64)  # single class: person
        masks = torch.as_tensor(masks, dtype=torch.uint8)

        target = {"boxes": boxes, "labels": labels, "masks": masks}
        if self.transforms:
            img = self.transforms(img)
        return img, target

    def __len__(self):
        """Return the number of images found in ``PNGImages``."""
        return len(self.imgs)

# Build the dataset from the folder the archive was extracted into (no transforms).
dataset = PennFudanDataset(os.path.join(root, "PennFudanPed"))


Downloading https://www.cis.upenn.edu/~jshi/ped_html/PennFudanPed.zip to ./PennFudanPed\PennFudanPed.zip


100%|██████████| 53.7M/53.7M [00:13<00:00, 3.94MB/s]


Extracting ./PennFudanPed\PennFudanPed.zip to ./PennFudanPed
