In [55]:
import torch
import torchvision
import torchvision.transforms as transforms
from torchvision.transforms import v2 as T


# Define the data transform
transform = transforms.Compose([
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])


train_dataset = torchvision.datasets.WIDERFace(
    root="./",
    transform=transform,
    split='train',
    download=True
)



Files already downloaded and verified


In [56]:
test_dataset = torchvision.datasets.WIDERFace(
    root="./",
    transform=transform,
    split='test',
    download=True
)

Files already downloaded and verified


In [75]:
test_dataset[0]

(tensor([[[-0.8980, -0.8588, -0.8667,  ..., -0.8588, -0.8588, -0.8824],
          [-0.8745, -0.8510, -0.8353,  ..., -0.8745, -0.8588, -0.8588],
          [-0.8510, -0.8667, -0.8588,  ..., -0.8745, -0.8745, -0.8667],
          ...,
          [-0.5137, -0.7569, -0.3569,  ..., -0.7333, -0.5843, -0.2863],
          [ 0.1529, -0.6863, -0.5843,  ..., -0.6706, -0.3490,  0.1843],
          [ 0.6157, -0.1686, -0.6471,  ..., -0.7255,  0.0118,  0.2471]],
 
         [[-0.8431, -0.8275, -0.8588,  ..., -0.8745, -0.8745, -0.8980],
          [-0.8196, -0.8196, -0.8275,  ..., -0.8902, -0.8745, -0.8745],
          [-0.8039, -0.8353, -0.8510,  ..., -0.8902, -0.8902, -0.8824],
          ...,
          [-0.4275, -0.8667, -0.5294,  ..., -0.8745, -0.7255, -0.4431],
          [ 0.3333, -0.7333, -0.7412,  ..., -0.7804, -0.5137, -0.0118],
          [ 0.8667, -0.1451, -0.7882,  ..., -0.8039, -0.1686,  0.0196]],
 
         [[-0.7725, -0.7647, -0.8039,  ..., -0.8510, -0.8510, -0.8745],
          [-0.7412, -0.7569,

In [58]:
train_dataset[1][1]

{'bbox': tensor([[361,  98, 263, 339]]),
 'blur': tensor([0]),
 'expression': tensor([0]),
 'illumination': tensor([0]),
 'occlusion': tensor([0]),
 'pose': tensor([0]),
 'invalid': tensor([0])}

In [59]:
def collate_fn(batch):
    return tuple(zip(*batch))

train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=2, shuffle=True, num_workers=4,
    collate_fn=collate_fn
)

test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=2, shuffle=True, num_workers=4,
    collate_fn=collate_fn
)

In [60]:
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

In [61]:
images,targets = next(iter(train_loader))

In [62]:
model

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [63]:
model.roi_heads.box_predictor.cls_score.in_features

1024

In [64]:
model.roi_heads.box_predictor

FastRCNNPredictor(
  (cls_score): Linear(in_features=1024, out_features=91, bias=True)
  (bbox_pred): Linear(in_features=1024, out_features=364, bias=True)
)

In [65]:
num_classes = 1
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
# get the number of input features for the classifier
in_features = model.roi_heads.box_predictor.cls_score.in_features
# replace the pre-trained head with a new one
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

In [66]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [67]:
device

device(type='cuda')

In [68]:
# construct an optimizer
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005,
                            momentum=0.9, weight_decay=0.0005)

In [69]:
# and a learning rate scheduler which decreases the learning rate by
# 10x every 3 epochs
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                               step_size=3,
                                               gamma=0.1)

In [70]:
# let's train it for 10 epochs
from torch.optim.lr_scheduler import StepLR
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    header = f"Epoch: [{epoch}]"
    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1.0 / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = torch.optim.lr_scheduler.LinearLR(
            optimizer, start_factor=warmup_factor, total_iters=warmup_iters
        )
    
    for images, targets in train_loader:
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        with torch.cuda.amp.autocast(enabled=False):
            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print(f"Loss is {loss_value}, stopping training")
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        if scaler is not None:
            scaler.scale(losses).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            losses.backward()
            optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

KeyError: 'boxes'