In [130]:
import torchvision
import torch
import PIL.Image
import matplotlib.pyplot as plt
import pandas
import pytorch_lightning as L
from pathlib import Path
from torchvision.transforms import v2
from torch.utils import data
from matplotlib.patches import Rectangle
from neural_networks.tinyssd import TinySSD
from lightning_datasets import BananaDetection

When the TinySSD model runs a forward pass, the generated anchors are not affected by the input batch size.

In the output, `cls_preds` and `bbox_preds` contain the predictions for every anchor; the position of each prediction matches the index of its anchor.

In [131]:
# Instantiate the detector. The argument is presumably the number of object
# classes (the recorded cls_preds have a last dimension of 2) -- confirm
# against the TinySSD definition.
model = TinySSD(1)

# Push all-zero image batches of size 4, 8, 12 and 16 through the model and
# print the shape of each of its three outputs, so the shapes can be compared
# across batch sizes.
for i in range(4, 17, 4):
    X = torch.zeros(i, 3, 256, 256)
    anchors, cls_preds, bbox_preds = model(X)
    print(
        f"When input {i} image\n"
        f"    anchors_shape: {anchors.shape}\n"
        f"    cls_preds_shape: {cls_preds.shape}\n"
        f"    bbox_preds_shape: {bbox_preds.shape}\n")


When input 4 image
    anchors_shape: torch.Size([1, 5444, 4])
    cls_preds_shape: torch.Size([4, 5444, 2])
    bbox_preds_shape: torch.Size([4, 21776])

When input 8 image
    anchors_shape: torch.Size([1, 5444, 4])
    cls_preds_shape: torch.Size([8, 5444, 2])
    bbox_preds_shape: torch.Size([8, 21776])

When input 12 image
    anchors_shape: torch.Size([1, 5444, 4])
    cls_preds_shape: torch.Size([12, 5444, 2])
    bbox_preds_shape: torch.Size([12, 21776])

When input 16 image
    anchors_shape: torch.Size([1, 5444, 4])
    cls_preds_shape: torch.Size([16, 5444, 2])
    bbox_preds_shape: torch.Size([16, 21776])



In every batch, the raw data has the following shapes.

In [132]:
# Create the banana-detection dataset wrapper and prepare it for the 'fit'
# stage. NOTE(review): prepare_data/setup/train_dataloader look like the
# Lightning data-module interface -- confirm in lightning_datasets.
dataset = BananaDetection(batch_size=32, num_workers=0)
dataset.prepare_data()
dataset.setup('fit')

# Only the first training batch is inspected: the loop breaks after one pass.
# The names X, y, anchors, cls_preds and bbox_preds stay bound afterwards.
for batch in dataset.train_dataloader():
    X, y = batch
    anchors, cls_preds, bbox_preds = model(X)
    print(f"""
Raw data shape in batch idx 0
    X.shape: {X.shape}
    y.shape: {y.shape}
    anchors.shape: {anchors.shape}
    cls_preds.shape: {cls_preds.shape}
    bbox_preds.shape: {bbox_preds.shape}""")
    break


Raw data shape in batch idx 0
    X.shape: torch.Size([32, 3, 256, 256])
    y.shape: torch.Size([32, 1, 5])
    anchors.shape: torch.Size([1, 5444, 4])
    cls_preds.shape: torch.Size([32, 5444, 2])
    bbox_preds.shape: torch.Size([32, 21776])


In [133]:
# Display the label tensor of the first image in the batch loaded above.
y[0]

tensor([[0.0000, 0.7500, 0.0234, 0.9375, 0.2578]], dtype=torch.float64)

Dimension 0 of `anchors` is added in the `TinySSD.multibox_prior` function; it seems it could be removed.

To stay consistent with the d2l book, it is kept in the code.

In [134]:
def box_iou(boxes1, boxes2):
    """Compute the pairwise IoU (Jaccard index) between two sets of boxes.

    Every row of both inputs is one box in corner format
    (x_min, y_min, x_max, y_max).

    Args:
        boxes1: tensor of shape (n1, 4).
        boxes2: tensor of shape (n2, 4).

    Returns:
        Tensor of shape (n1, n2) whose entry [i, j] is the IoU of
        boxes1[i] and boxes2[j].
    """
    def _area(corner_boxes):
        # width * height of every box, shape (number of boxes,)
        widths = corner_boxes[:, 2] - corner_boxes[:, 0]
        heights = corner_boxes[:, 3] - corner_boxes[:, 1]
        return widths * heights

    first_areas = _area(boxes1)    # (n1,)
    second_areas = _area(boxes2)   # (n2,)

    # Broadcasting boxes1 as (n1, 1, 2) against boxes2 as (n2, 2) yields the
    # corners of every pairwise intersection, shape (n1, n2, 2).
    top_left = torch.max(boxes1[:, None, :2], boxes2[:, :2])
    bottom_right = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])
    # Disjoint pairs produce a negative extent; clamp it to zero.
    overlap_wh = (bottom_right - top_left).clamp(min=0)

    # Intersection and union areas, both of shape (n1, n2).
    overlap = overlap_wh[:, :, 0] * overlap_wh[:, :, 1]
    union = first_areas[:, None] + second_areas - overlap
    return overlap / union

In [135]:
def assign_anchor_to_bbox(ground_truth, anchors, iou_threshold=0.5):
    """Assign a ground-truth box index (or -1) to every anchor.

    Two rules are applied, in this order:

    1. Every anchor whose best IoU reaches ``iou_threshold`` is mapped to the
       ground-truth box that gives that best IoU.
    2. Each ground-truth box then claims the still-unclaimed anchor with the
       globally highest IoU, overriding rule 1 for that anchor, so a box gets
       an anchor even when no IoU reaches the threshold.

    Args:
        ground_truth: tensor of shape (num_gt_boxes, 4), corner format.
        anchors: tensor of shape (num_anchors, 4), corner format.
        iou_threshold: minimum IoU for the threshold-based assignment.

    Returns:
        Long tensor of shape (num_anchors,), created on ``anchors.device``.
        Entry i is the index of the ground-truth box assigned to anchor i, or
        -1 when the anchor is left unassigned. If there are no anchors or no
        ground-truth boxes, every entry is -1.
    """
    num_anchors = anchors.shape[0]
    num_gt_boxes = ground_truth.shape[0]
    # jaccard[i, j] is the IoU of anchor i with ground-truth box j.
    jaccard = box_iou(anchors, ground_truth)
    # Create the map on the anchors' device so the mask and index assignments
    # below do not mix devices.
    anchors_bbox_map = torch.full((num_anchors,), -1, dtype=torch.long,
                                  device=anchors.device)
    print(f"box_iou result:\n{jaccard}")
    if num_anchors == 0 or num_gt_boxes == 0:
        # Nothing to match; torch.max / argmax below would fail on an empty
        # IoU matrix.
        return anchors_bbox_map

    # The order here differs slightly from the book: the implementation order
    # is the reverse of the logical order. For convenience every anchor is
    # first labelled through iou_threshold, and afterwards some entries are
    # overwritten with the best-matching anchor of each ground-truth box.
    max_ious, indices = torch.max(jaccard, dim=1)
    above_threshold = max_ious >= iou_threshold
    anchors_bbox_map[above_threshold] = indices[above_threshold]

    # Greedily give each ground-truth box its closest anchor, highest IoU
    # first. At most min(num_anchors, num_gt_boxes) pairs exist; iterating
    # further would run argmax on an all -1 matrix and overwrite anchor 0.
    for _ in range(min(num_anchors, num_gt_boxes)):
        max_idx = torch.argmax(jaccard)  # flat index into (num_anchors, num_gt_boxes)
        box_idx = max_idx % num_gt_boxes
        # Integer floor division keeps the row index exact; true division
        # would go through floating point first.
        anc_idx = torch.div(max_idx, num_gt_boxes, rounding_mode="floor")
        anchors_bbox_map[anc_idx] = box_idx
        # Discard the used row and column so neither is picked again.
        jaccard[anc_idx, :] = -1
        jaccard[:, box_idx] = -1
    return anchors_bbox_map

In [136]:
# Toy example: two ground-truth rows of five values each. The first column is
# dropped in the call below (ground_truth[:, 1:]), so only the last four
# values of each row are passed as box coordinates.
ground_truth = torch.tensor([[0, 0.1, 0.08, 0.52, 0.92],
                             [1, 0.55, 0.2, 0.9, 0.88]])
# Five hand-written anchor boxes with four values each. NOTE: this rebinds the
# name `anchors`, which earlier cells used for the model output.
anchors = torch.tensor([[0, 0.1, 0.2, 0.3], [0.15, 0.2, 0.4, 0.4],
                        [0.63, 0.05, 0.88, 0.98], [0.66, 0.45, 0.8, 0.8],
                        [0.57, 0.3, 0.92, 0.9]])

assign_anchor_to_bbox(ground_truth[:,1:],anchors)

box_iou result:
tensor([[0.0536, 0.0000],
        [0.1417, 0.0000],
        [0.0000, 0.5657],
        [0.0000, 0.2059],
        [0.0000, 0.7459]])


tensor([-1,  0,  1, -1,  1])

In [137]:
def box_corner_to_center(boxes):
    """Convert boxes from (upper-left, lower-right) to (center, width, height).

    Args:
        boxes: tensor of shape (n, 4) holding (x1, y1, x2, y2) per row.

    Returns:
        Tensor of shape (n, 4) holding (cx, cy, w, h) per row.
    """
    left, top, right, bottom = (boxes[:, col] for col in range(4))
    center_x = (left + right) / 2
    center_y = (top + bottom) / 2
    width = right - left
    height = bottom - top
    return torch.stack((center_x, center_y, width, height), dim=-1)


def box_center_to_corner(boxes):
    """Convert boxes from (center, width, height) to (upper-left, lower-right).

    Args:
        boxes: tensor of shape (n, 4) holding (cx, cy, w, h) per row.

    Returns:
        Tensor of shape (n, 4) holding (x1, y1, x2, y2) per row.
    """
    center_x, center_y, width, height = (boxes[:, col] for col in range(4))
    half_width = 0.5 * width
    half_height = 0.5 * height
    corners = (
        center_x - half_width,   # x1
        center_y - half_height,  # y1
        center_x + half_width,   # x2
        center_y + half_height,  # y2
    )
    return torch.stack(corners, dim=-1)


In [138]:
def offset_boxes(anchors, assigned_bb, eps=1e-6):
    """Encode assigned boxes as offsets relative to their anchors.

    Both inputs are (n, 4) tensors in corner format and are converted to
    (cx, cy, w, h) before the offsets are computed.

    Args:
        anchors: anchor boxes, one per row.
        assigned_bb: the box assigned to each anchor, row-aligned with anchors.
        eps: small constant added inside the logarithm so that a zero
            width/height ratio does not produce -inf.

    Returns:
        Tensor of shape (n, 4): the center offsets followed by the log-scaled
        width/height offsets.
    """
    anchor_ctr = box_corner_to_center(anchors)
    target_ctr = box_corner_to_center(assigned_bb)
    anchor_wh = anchor_ctr[:, 2:]

    # Center displacement, scaled by 10 and normalised by the anchor size.
    center_delta = target_ctr[:, :2] - anchor_ctr[:, :2]
    xy_offset = 10 * center_delta / anchor_wh

    # Log of the size ratio, scaled by 5.
    size_ratio = target_ctr[:, 2:] / anchor_wh
    wh_offset = 5 * torch.log(eps + size_ratio)

    return torch.cat([xy_offset, wh_offset], dim=1)

In [139]:
def multibox_target(anchors, labels):
    """Build per-anchor training targets for a batch of labelled images.

    Args:
        anchors: tensor of shape (1, num_anchors, 4) in corner format; the
            leading dimension is squeezed away.
        labels: tensor of shape (batch_size, num_boxes, 5). Column 0 of each
            row is the class index and columns 1: are the box corners. Any
            floating dtype is accepted; box coordinates are cast to float32.

    Returns:
        A tuple ``(bbox_offset, bbox_mask, class_labels)``:
            bbox_offset: (batch_size, num_anchors * 4) offsets of the assigned
                box relative to each anchor, zeroed for background anchors.
            bbox_mask: (batch_size, num_anchors * 4) float mask, 1 for anchors
                assigned to a box and 0 for background.
            class_labels: (batch_size, num_anchors) long tensor, 0 for
                background and ``class index + 1`` otherwise.
    """
    batch_size, anchors = labels.shape[0], anchors.squeeze(0)
    num_anchors = anchors.shape[0]
    # Allocate the per-image buffers next to the anchors instead of always on
    # the CPU.
    device = anchors.device

    batch_offset, batch_mask, batch_class_labels = [], [], []

    for image_idx in range(batch_size):
        label = labels[image_idx]
        anchors_bbox_map = assign_anchor_to_bbox(label[:, 1:], anchors)
        is_object = anchors_bbox_map >= 0
        # bbox_mask zeroes the four offsets of every background anchor and
        # multiplies the offsets of all other anchors by 1.
        # bbox_mask.shape[0] equals num_anchors.
        bbox_mask = is_object.float().unsqueeze(-1).repeat(1, 4)

        class_labels = torch.zeros(num_anchors, dtype=torch.long,
                                   device=device)
        assigned_bb = torch.zeros((num_anchors, 4), dtype=torch.float32,
                                  device=device)

        # Index of the matched ground-truth box for every non-background anchor.
        matched_box_idx = anchors_bbox_map[is_object]
        # Shift the class index by one so that 0 stays reserved for background.
        class_labels[is_object] = label[matched_box_idx, 0].long() + 1
        # Indexed assignment needs matching dtypes; labels may be float64.
        assigned_bb[is_object] = label[matched_box_idx, 1:].to(assigned_bb.dtype)

        offset = offset_boxes(anchors, assigned_bb) * bbox_mask
        batch_offset.append(offset.reshape(-1))
        batch_mask.append(bbox_mask.reshape(-1))
        batch_class_labels.append(class_labels)

    bbox_offset = torch.stack(batch_offset)
    bbox_mask = torch.stack(batch_mask)
    class_labels = torch.stack(batch_class_labels)

    return bbox_offset, bbox_mask, class_labels

The functions above are used to construct the labels used to train the model.

Their output lines up with the model output, which means it is the target we want the neural network to generate.

In [140]:
# unsqueeze(0) adds a leading batch dimension of size 1 to the toy anchors and
# ground truth before they are passed to multibox_target.
model_train_label = multibox_target(anchors.unsqueeze(0), ground_truth.unsqueeze(0))

box_iou result:
tensor([[0.0536, 0.0000],
        [0.1417, 0.0000],
        [0.0000, 0.5657],
        [0.0000, 0.2059],
        [0.0000, 0.7459]])


In [141]:
# model_train_label is the (bbox_offset, bbox_mask, class_labels) tuple
# returned by multibox_target; print its three parts in that order.
print(f"""
Label used to train model:
    bounding box offset(match bounding box predict output): {model_train_label[0]}
    bounding box mask(0 means background, 1 meas object): {model_train_label[1]}
    class labels(match class predict output): {model_train_label[2]}
""")


Label used to train model:
    bounding box offset(match bounding box predict output): tensor([[-0.0000e+00, -0.0000e+00, -0.0000e+00, -0.0000e+00,  1.4000e+00,
          1.0000e+01,  2.5940e+00,  7.1754e+00, -1.2000e+00,  2.6882e-01,
          1.6824e+00, -1.5655e+00, -0.0000e+00, -0.0000e+00, -0.0000e+00,
         -0.0000e+00, -5.7143e-01, -1.0000e+00,  4.1723e-06,  6.2582e-01]])
    bounding box mask(0 means background, 1 meas object): tensor([[0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 1., 1.,
         1., 1.]])
    class labels(match class predict output): tensor([[0, 1, 2, 0, 2]])

