In [2]:
import os

import torchvision.transforms as T
from torchvision.datasets import VOCDetection
from torch.utils.data import DataLoader
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
import torchvision
import torch
import torch.optim as optim
from torch.cuda.amp import autocast, GradScaler

# Let cuDNN time its available convolution algorithms and reuse the fastest
# one (per the PyTorch docs this mainly pays off when input shapes repeat).
torch.backends.cudnn.benchmark = True

In [3]:
# Logical CPU count, reused later as the DataLoader worker count.
# os.cpu_count() returns None when the count cannot be determined, so fall
# back to 1 to keep the value usable as an integer.
NUM_CPU = os.cpu_count() or 1
print(NUM_CPU)

# Image preprocessing applied to every VOC sample.
# There is deliberately no T.Normalize step: torchvision's FasterRCNN expects
# inputs in the [0, 1] range and applies ImageNet mean/std normalization
# internally, so normalizing here as well would standardize each image twice.
transform = T.Compose([
    # Fixed (height, width) output; box coordinates must be scaled by the
    # same factors to stay aligned with the resized image.
    T.Resize((224, 224)),
    # PIL image -> float tensor with values in [0, 1].
    T.ToTensor(),
])

# PASCAL VOC category names. The list position is the integer label used for
# training; index 0 is reserved for the background class.
voc_classes = ['background', 'aeroplane', 'bicycle', 'bird', 'boat',
               'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable',
               'dog', 'horse', 'motorbike', 'person', 'pottedplant',
               'sheep', 'sofa', 'train', 'tvmonitor']

# Reverse lookup: class name -> integer label.
voc_class_to_id = {name: i for i, name in enumerate(voc_classes)}


def target_transform(voc_dict, resized_size=(224, 224)):
    """Convert a parsed VOC annotation into a detection target dict.

    The image transform resizes every image to a fixed size, so the boxes
    (stored in original pixel coordinates) are rescaled by the same factors;
    otherwise they would not line up with the resized image.

    Args:
        voc_dict: Parsed annotation with an 'annotation' entry that holds
            'size' ('width', 'height') and a list of 'object' entries, each
            with 'name' and 'bndbox' ('xmin', 'ymin', 'xmax', 'ymax').
        resized_size: (height, width) the image is resized to. Must match the
            T.Resize used in the image transform. Pass None to keep the boxes
            in original pixel coordinates.

    Returns:
        dict with 'boxes' (float32 tensor, shape (N, 4), xmin/ymin/xmax/ymax)
        and 'labels' (int64 tensor, shape (N,)).

    Raises:
        KeyError: if an object name is not present in voc_class_to_id.
    """
    anno = voc_dict['annotation']

    if resized_size is None:
        scale_x = scale_y = 1.0
    else:
        out_h, out_w = resized_size
        scale_x = out_w / float(anno['size']['width'])
        scale_y = out_h / float(anno['size']['height'])

    boxes = []
    labels = []

    for obj in anno.get('object', []):
        bbox = obj['bndbox']
        # float() rather than int(): tolerates coordinates written as decimals.
        boxes.append([
            float(bbox['xmin']) * scale_x,
            float(bbox['ymin']) * scale_y,
            float(bbox['xmax']) * scale_x,
            float(bbox['ymax']) * scale_y,
        ])
        labels.append(voc_class_to_id[obj['name']])

    return {
        # reshape keeps the (N, 4) layout even when there are no objects.
        'boxes': torch.tensor(boxes, dtype=torch.float32).reshape(-1, 4),
        'labels': torch.tensor(labels, dtype=torch.int64),
    }

# PASCAL VOC 2012 detection data, "train" split, stored under data/
# (download=True asks torchvision to fetch it). Each sample is passed through
# `transform` (image) and `target_transform` (annotation).
dataset = VOCDetection(
    'data/',
    image_set='train',
    year='2012',
    transform=transform,
    target_transform=target_transform,
    download=True,
)


8


In [4]:
def collate_detection_batch(batch):
    """Regroup a list of (image, target) samples into (images, targets).

    The samples are transposed once into two tuples rather than stacked into
    a single tensor, so each image keeps its own target dict.
    """
    images, targets = zip(*batch)
    return images, targets


dataloader = DataLoader(
    dataset,
    batch_size=8,
    shuffle=True,
    # Guard against a missing CPU count (None): 0 loads in the main process.
    num_workers=NUM_CPU or 0,
    pin_memory=True,
    collate_fn=collate_detection_batch,
)

In [5]:
# Feature extractor: ImageNet-pretrained MobileNetV2 without its classifier.
# NOTE(review): newer torchvision releases prefer the `weights=` argument
# over `pretrained=True`; kept as-is for compatibility with the installed version.
backbone = torchvision.models.mobilenet_v2(pretrained=True).features
# FasterRCNN needs the backbone to expose its number of output channels.
backbone.out_channels = 1280

# The backbone yields a single feature map, so both `sizes` and
# `aspect_ratios` carry exactly one tuple: 5 sizes x 3 ratios = 15 anchors
# per spatial location.
anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512),),
                                   aspect_ratios=((0.5, 1.0, 2.0),))

# RoI pooling over the single feature map, which torchvision names '0' when
# the backbone returns a plain tensor.
roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=['0'],
                                                output_size=7,
                                                sampling_ratio=2)

model = FasterRCNN(backbone,
                   num_classes=len(voc_classes),  # 20 object classes + background
                   rpn_anchor_generator=anchor_generator,
                   box_roi_pool=roi_pooler)



Налаштуємо оптимізатор, scheduler та проведемо дотренування моделі:

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

num_epochs = 10

# Mixed precision via torch.cuda.amp is only meaningful on a CUDA device;
# with enabled=False both autocast and GradScaler become pass-throughs.
use_amp = device.type == 'cuda'
scaler = GradScaler(enabled=use_amp)

# Optimizer and scheduler are built once, after the model is on its device.
params = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)

# Scheduler that multiplies the learning rate by 0.1 every 3 epochs.
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

for epoch in range(num_epochs):
    print(f'Epoch {epoch+1}/{num_epochs}')
    model.train()
    running_loss = 0.0

    for i, (images, targets) in enumerate(dataloader):
        # non_blocking lets the copy overlap with compute when the loader
        # delivers pinned memory; it is a no-op otherwise.
        images = [img.to(device, non_blocking=True) for img in images]
        targets = [{k: v.to(device, non_blocking=True) for k, v in t.items()}
                   for t in targets]

        optimizer.zero_grad()
        with autocast(enabled=use_amp):
            # In train mode the detection model returns a dict of losses.
            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

        # Scale the loss before backward, then step through the scaler.
        scaler.scale(losses).backward()
        scaler.step(optimizer)
        scaler.update()

        # Read the scalar once per step; each .item() forces a device sync.
        loss_value = losses.item()
        running_loss += loss_value
        if (i + 1) % 10 == 0:
            print(f"Step {i+1}/{len(dataloader)}, Loss: {loss_value:.4f}")

    lr_scheduler.step()
    avg_loss = running_loss / len(dataloader)
    print(f"Epoch {epoch+1} done. Avg Loss: {avg_loss:.4f}\n")

  scaler = GradScaler()


Epoch 1/10
