In [1]:
import cv2
import torch
import torchvision
import torch.nn as nn
from torchvision.models.detection.transform import GeneralizedRCNNTransform
from torchvision.models.detection.backbone_utils import resnet_fpn_backbone

In [2]:
# 图像数据
img = cv2.imread('1.jpg')
# label数据，好像有几个类
label = {
    "boxes":[[100, 100, 200, 300]],
    "label": 1,
    "image_id": 2
    }

In [3]:
img = []
img.append(cv2.imread('1.jpg'))
img = torch.LongTensor(img).permute(0, 3, 1, 2).float()
img.shape

torch.Size([1, 3, 333, 500])

In [4]:
backbone = resnet_fpn_backbone('resnet50', False)
output = backbone(img)
print(len(output))

5


In [5]:
class RPNhead(nn.Module):

    def __init__(self, in_channels, num_anchors):
        super(RPNhead, self).__init__()
        self.conv = nn.Conv2d(
            in_channels, in_channels, kernel_size=3, stride=1, padding=1
        )
        self.cls_logits = nn.Conv2d(
            in_channels, num_anchors, kernel_size=1, stride=1
        )
        self.bbox_pred = nn.Conv2d(
            in_channels, num_anchors * 4, kernel_size=1, stride=1
        )

        # init parameters
        for l in self.children():
            torch.nn.init.normal_(l.weight, std=0.01)
            torch.nn.init.constant_(l.bias, 0)

    def forward(self, x):
        # 输入的是feature_map, feature_map有可能是多层(resnet)
        # x: C * [batch_size, out_channel, H_out, W_out]
        features = list(x.values())
        logits = []
        bbox_reg = []
        for feature in features:
            t = nn.functional.relu(self.conv(feature))
            logits.append(self.cls_logits(t))
            bbox_reg.append(self.bbox_pred(t))
        return bbox_reg, logits

In [6]:
input_channel = backbone.out_channels
Anchor_sizes = (16, 64, 128)
aspect_ratios = (0.5, 1, 2)

In [7]:
rpn_head = RPNhead(input_channel, len(Anchor_sizes) * len(aspect_ratios))
bbox_reg, logits = rpn_head(output)

In [8]:
# 生成anchor的各个中心点
# 已知feature_map有5层
[i.shape for i in output.values()]

[torch.Size([1, 256, 84, 125]),
 torch.Size([1, 256, 42, 63]),
 torch.Size([1, 256, 21, 32]),
 torch.Size([1, 256, 11, 16]),
 torch.Size([1, 256, 6, 8])]

In [9]:
# 获取每层feature_map的尺寸
feature_size = [i.shape[-2:] for i in output.values()]

In [12]:
# 获取输入图片的尺寸
img_size = img.shape[-2:]

In [13]:
print(feature_size)
print(img_size)

[torch.Size([84, 125]), torch.Size([42, 63]), torch.Size([21, 32]), torch.Size([11, 16]), torch.Size([6, 8])]
torch.Size([333, 500])


In [14]:
#计算每层的stride
stride = torch.LongTensor([[img_size[0]//f[0], img_size[1]//f[1]] for f in feature_size])

In [15]:
print(stride)

tensor([[ 3,  4],
        [ 7,  7],
        [15, 15],
        [30, 31],
        [55, 62]])


In [16]:
# 这样的话我们就可以生成,每个anchor的中心点
centers = []
for f, s in zip(feature_size, stride):
    y = torch.arange(0, f[0], dtype = torch.float32) * s[0]
    x = torch.arange(0, f[1], dtype = torch.float32) * s[1]
    y, x = torch.meshgrid(y,x)
    y = y.reshape(-1)
    x = x.reshape(-1)
    center = torch.stack((y,x,y,x), dim = 1)
    centers.append((center.view(-1, 1, 4) + base_anchor.view(1, -1, 4)).view(-1,4))

NameError: name 'base_anchor' is not defined

In [183]:
print(len(centers))
[i.shape for i in centers]

5


[torch.Size([94500, 4]),
 torch.Size([23814, 4]),
 torch.Size([6048, 4]),
 torch.Size([1584, 4]),
 torch.Size([432, 4])]

In [148]:
# 生成每个anchor的长宽
# Anchor_sizes = (16, 64, 128)
# aspect_ratios = (0.5, 1, 2)
Anchor_scales = torch.as_tensor(Anchor_sizes, dtype = torch.float32)
aspect_ratios = torch.as_tensor(aspect_ratios, dtype = torch.float32)

In [173]:
h_ratios = torch.sqrt(aspect_ratios)
w_ratios = 1/h_ratios

ws = (h_ratios[:, None] * Anchor_scales[None, :]).view(-1)
hs = (w_ratios[:, None] * Anchor_scales[None, :]).view(-1)

In [174]:
ws.shape

torch.Size([9])

In [179]:
base_anchor = (torch.stack([-ws, -hs, ws, hs], dim = 1)/2).round()

In [176]:
base_anchor.shape

torch.Size([9, 4])

In [177]:
base_anchor.view(1, -1, 4)

tensor([[[ -5.6569, -11.3137,   5.6569,  11.3137],
         [-22.6274, -45.2548,  22.6274,  45.2548],
         [-45.2548, -90.5097,  45.2548,  90.5097],
         [ -8.0000,  -8.0000,   8.0000,   8.0000],
         [-32.0000, -32.0000,  32.0000,  32.0000],
         [-64.0000, -64.0000,  64.0000,  64.0000],
         [-11.3137,  -5.6569,  11.3137,   5.6569],
         [-45.2548, -22.6274,  45.2548,  22.6274],
         [-90.5097, -45.2548,  90.5097,  45.2548]]])

In [164]:
base_anchor.round()

torch.Size([3, 4, 3])

In [18]:
class Anchor_generator(nn.Module):

    def __init__(self, Anchor_sizes = (64, 256, 512), aspect_ratios = (0.5, 1.0, 2.0)):
        super(Anchor_generator, self).__init__()

        self.Anchor_sizes = Anchor_sizes
        self.aspect_ratios = aspect_ratios

    # 生成Anchor的话，需要原输入图片的img跟feature_map
    def forward(self, imgs, feature_maps):
        # imgs [batch_size, 3, H, W]
        # feature_maps C * [batch_size, out_channel, h_out, w_out]

        # 获取图片的尺寸
        img_sizes = imgs.shape[-2:]
        # 获取每一层feature_map的尺寸
        features_size = tuple([feature_map.shape[-2:] for feature_map in feature_maps])
        # 计算一个feature_map的特征所代表图像的像素点个数
        stride = [[img_sizes[0] / f[0], img_sizes[1] / f[1]] for f in features_size]

        # 生成base_anchor
        Anchor_sizes = torch.as_tensor(self.Anchor_sizes, dtype=torch.float32)
        aspect_ratios = torch.as_tensor(self.aspect_ratios, dtype=torch.float32)

        h_ratios = torch.sqrt(aspect_ratios)
        w_ratios = 1 / h_ratios

        # 所有ratio对各个Anchor_size做计算
        hs = (h_ratios[:, None] * Anchor_sizes[None, :]).view(-1)
        ws = (w_ratios[:, None] * Anchor_sizes[None, :]).view(-1)

        # hs, ws做拼接 N * 4, round 取整
        base_anchor = (torch.stack([-hs, -ws, hs, ws], dim=1) / 2).round()

        # 生成每个feature_map层的中心点, 获取每层的尺寸和步长stride
        anchors = []
        for f, s in zip(features_size, stride):
            # 生成h_c, w_c点
            feature_h, feature_w = f
            stride_h, stride_w = s
            h_c = torch.arange(0, feature_h, dtype=torch.float32) * stride_h
            w_c = torch.arange(0, feature_w, dtype=torch.float32) * stride_w
            h_c, w_c = torch.meshgrid(h_c, w_c)
            h_c = h_c.reshape(-1)
            w_c = w_c.reshape(-1)
            anchor = torch.stack([h_c, w_c, h_c, w_c]).view(-1, 1, 4) + \
                     base_anchor.view(1, -1, 4)

            anchors.append(anchor.view(-1, 4))
        return anchors

In [19]:
anchor_gener = Anchor_generator()

In [20]:
feature_maps = [i for i in output.values()]

In [21]:
anchors = anchor_gener(img, output.values())

In [23]:
anchors[0].shape

torch.Size([94500, 4])

In [209]:
[anchor.shape for anchor in anchors]

[torch.Size([94500, 4]),
 torch.Size([23814, 4]),
 torch.Size([6048, 4]),
 torch.Size([1584, 4]),
 torch.Size([432, 4])]

In [21]:
layer = backbone[:-1]

In [22]:
import torch

In [23]:
torch.nn.Sequential(layer)

Sequential(
  (0): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1