In [1]:
import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
import torchvision
from torchvision import transforms as T
import torchvision.models as models
import numpy as np
import matplotlib.pyplot as plt
import cv2
import PIL
import random
from time import time
%matplotlib inline

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
# Seed every random number generator the notebook imports so that a
# "Restart & Run All" reproduces the same sampling and augmentation.
seed = 42
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)

cuda


In [2]:
class PascalVOC2012(torch.utils.data.Dataset):
    """PASCAL VOC 2012 detection dataset wrapper.

    Each item is ``(image, (box_labels, box_coords))``:
      * ``image``: normalized float tensor, resized so that its shorter side
        is ``scale_ratio`` times the original shorter side.
      * ``box_labels``: int64 tensor of shape [num_objects] with indices into
        ``cls_labels``.
      * ``box_coords``: int tensor of shape [num_objects, 4] in xyxy format,
        multiplied by ``scale_ratio``.
    """
    def __init__(self, train=True, scale_ratio=1.0):
        super().__init__()
        self.train = train
        self.scale_ratio = scale_ratio  # image scaling factor
        self.data = torchvision.datasets.VOCDetection(root='../../data',
                                                      year='2012',
                                                      image_set='train' if train else 'val',
                                                      download=False)
        # Training: jitter brightness, contrast, saturation and hue, then normalize.
        self.trans_train = T.Compose([T.ToTensor(),
                                      T.ColorJitter(brightness=0.2,
                                                    contrast=0.2,
                                                    saturation=0.2,
                                                    hue=0.1),
                                      T.Normalize(mean=[0.4570, 0.4382, 0.4062],
                                                   std=[0.2391, 0.2351, 0.2397],)])
        # Validation: normalization only.
        self.trans_valid = T.Compose([T.ToTensor(),
                                      T.Normalize(mean=[0.4570, 0.4382, 0.4062],
                                                   std=[0.2391, 0.2351, 0.2397],)])
        # p=1 because the coin flip is done in RandomHorizontalFlip so that
        # the image and its boxes are always flipped together.
        self.horizontal_flip = T.RandomHorizontalFlip(p=1)
        # 20 classes: person, 6 animals, 7 vehicles and 6 household objects.
        self.cls_labels = ['person',
                           'bird', 'cat', 'cow', 'dog', 'horse', 'sheep',
                           'aeroplane', 'bicycle', 'boat', 'bus', 'car', 'motorbike', 'train',
                           'bottle', 'chair', 'diningtable', 'pottedplant', 'sofa', 'tvmonitor']

    def __len__(self):
        """Return the number of samples in the wrapped dataset."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the transformed image and its (labels, boxes) for ``index``."""
        # Index the underlying dataset only once; the previous code indexed it
        # twice per item, which loads the same sample twice.
        raw_image, annotation = self.data[index]
        # Format conversion + normalization (plus color jitter when training).
        transform = self.trans_train if self.train else self.trans_valid
        image = transform(raw_image)
        box_labels, box_coords = self.get_label_list(annotation)
        # Random horizontal flip (training only).
        image, box_coords = self.RandomHorizontalFlip(image, box_coords)
        # Resize the shorter side by the scaling factor; boxes are scaled to match.
        short_side = min(image.shape[-1], image.shape[-2])
        image = T.Resize(size=int(short_side * self.scale_ratio))(image)
        return image, (box_labels, (box_coords * self.scale_ratio).int())

    def get_label_list(self, label):
        """Extract class indices and xyxy boxes from a VOC annotation dict.

        Args:
            label (dict): annotation with ``label['annotation']['object']``
                holding the objects; each object has ``'name'`` and ``'bndbox'``.
        Returns:
            tuple: ``(box_labels, box_coords)`` as int64 tensors of shape
            [num_objects] and [num_objects, 4].
        """
        obj_list = label['annotation']['object']
        # NOTE(review): presumably some annotation parsers yield a bare dict
        # when an image has a single object; normalize it to a list.
        if isinstance(obj_list, dict):
            obj_list = [obj_list]
        box_labels, box_coords = [], []
        for obj in obj_list:
            name = obj['name']
            # Values may be plain strings or one-element sequences of strings.
            box_labels.append(self.cls_labels.index(name if isinstance(name, str) else name[0]))
            bndbox = obj['bndbox']
            coord = []
            # Ground-truth boxes are stored in xyxy order.
            for k in ('xmin', 'ymin', 'xmax', 'ymax'):
                v = bndbox[k]
                coord.append(int(v if isinstance(v, str) else v[0]))
            box_coords.append(coord)
        # Explicit dtype and reshape keep the [0] / [0, 4] int64 layout even
        # when there are no objects.
        return (torch.tensor(box_labels, dtype=torch.int64),
                torch.tensor(box_coords, dtype=torch.int64).reshape(-1, 4))

    def RandomHorizontalFlip(self, image, box_coords):
        """On the training set, flip image and boxes horizontally with probability 0.5."""
        if self.train and random.random() > 0.5:
            w = image.shape[-1]
            image = self.horizontal_flip(image)
            # Mirror the boxes: x1' = w - x2, x2' = w - x1. Work on a copy so
            # the caller's tensor is not modified in place.
            flipped = box_coords.clone()
            flipped[:, 0] = w - box_coords[:, 2]
            flipped[:, 2] = w - box_coords[:, 0]
            box_coords = flipped
        return image, box_coords

In [3]:
# Instantiate the training and validation splits with the default scale_ratio (1.0).
voc_train = PascalVOC2012(train=True)
voc_val = PascalVOC2012(train=False)

In [4]:
def refine_box(box_cxcywh, shift):
    """Apply offset coefficients to anchors/proposals.

    Args:
        box_cxcywh (Tensor): boxes in cxcywh format, shape [N, 4].
        shift (Tensor): offsets (t_x, t_y, t_w, t_h), shape [N, 4].
    Returns:
        Tensor: refined boxes in cxcywh format, shape [N, 4], on ``shift``'s device.
    """
    # The boxes follow the offsets onto whatever device the network ran on.
    source = box_cxcywh.to(shift.device)
    cx, cy = source[:, 0], source[:, 1]
    w, h = source[:, 2], source[:, 3]
    # Centers move by a fraction of the box size; sizes scale exponentially.
    refined = (
        w * shift[:, 0] + cx,
        h * shift[:, 1] + cy,
        w * torch.exp(shift[:, 2]),
        h * torch.exp(shift[:, 3]),
    )
    return torch.stack(refined, dim=1)

def coord_to_shift(src_cxcywh, tgt_cxcywh):
    """Compute the offset coefficients that map source boxes onto target boxes.

    Both inputs are cxcywh boxes of identical shape [N, 4]; the result has
    shape [N, 4] with columns (t_x, t_y, t_w, t_h). This is the inverse of
    ``refine_box``.
    """
    assert src_cxcywh.shape == tgt_cxcywh.shape
    src_w, src_h = src_cxcywh[:, 2], src_cxcywh[:, 3]
    # Center displacement is measured relative to the source box size.
    delta_x = (tgt_cxcywh[:, 0] - src_cxcywh[:, 0]) / src_w
    delta_y = (tgt_cxcywh[:, 1] - src_cxcywh[:, 1]) / src_h
    # Size change is expressed as a log ratio.
    delta_w = torch.log(tgt_cxcywh[:, 2] / src_w)
    delta_h = torch.log(tgt_cxcywh[:, 3] / src_h)
    return torch.stack((delta_x, delta_y, delta_w, delta_h), dim=1)

# 边界框格式转换
def cxcywh2xyxy(boxes):
    """Convert boxes from (cx, cy, w, h) to (x1, y1, x2, y2), cast to int32."""
    corners = torchvision.ops.box_convert(boxes, in_fmt='cxcywh', out_fmt='xyxy')
    return corners.to(torch.int32)
def xyxy2cxcywh(boxes):
    """Convert boxes from (x1, y1, x2, y2) to (cx, cy, w, h), cast to int32."""
    centered = torchvision.ops.box_convert(boxes, in_fmt='xyxy', out_fmt='cxcywh')
    return centered.to(torch.int32)

# 固定/解除固定模型参数
def freeze(module):
    """Disable gradient computation for every parameter of ``module``."""
    # nn.Module.requires_grad_ applies the flag to all parameters of the module.
    module.requires_grad_(False)
def unfreeze(module):
    """Re-enable gradient computation for every parameter of ``module``."""
    # nn.Module.requires_grad_ applies the flag to all parameters of the module.
    module.requires_grad_(True)

# Per-class non-maximum suppression
def batched_nms(boxes, scores, idxs, iou_threshold):
    """
    Run non-maximum suppression independently for each class.
    Args:
        boxes (Tensor): predicted boxes, shape [N, 4].
        scores (Tensor): confidence of each box for its class, shape [N].
        idxs (Tensor): class of each box, shape [N].
        iou_threshold (float): boxes of the same class whose IoU with a
            higher-scoring box exceeds this value are discarded.
    Returns:
        Tensor: indices of the kept boxes, sorted by descending score.
    """
    survivors = torch.zeros_like(scores, dtype=torch.bool)
    # Boxes of different classes never suppress each other.
    for label in torch.unique(idxs):
        members = (idxs == label).nonzero(as_tuple=True)[0]
        winners = torchvision.ops.nms(boxes[members], scores[members], iou_threshold)
        survivors[members[winners]] = True
    kept = survivors.nonzero(as_tuple=True)[0]
    # Order the kept boxes from most to least confident.
    ranking = torch.argsort(scores[kept], descending=True)
    return kept[ranking]

def init_weight(module):
    """Initialize Conv2d/Linear layers, descending into Sequential/ModuleList containers.

    Conv2d and Linear weights are drawn from N(0, 0.01^2) and their biases
    (when present) are zeroed. Any other module type is left untouched.
    """
    is_layer = isinstance(module, (nn.Conv2d, nn.Linear))
    if not is_layer:
        # Only these two container types are traversed.
        if isinstance(module, (nn.Sequential, nn.ModuleList)):
            for child in module:
                init_weight(child)
        return
    nn.init.normal_(module.weight, std=0.01)
    if module.bias is not None:
        nn.init.zeros_(module.bias)

In [5]:
def assign_pred_to_gt(pred,
                      gt,
                      pos_threshold,
                      neg_threshold,
                      allow_low_quality_matches=True):
    """
    Split anchors/proposals into positive and negative samples by IoU and
    assign every positive sample to one ground-truth box.
    Args:
        pred (Tensor): anchors/proposals to assign, xyxy format, shape [N, 4].
        gt (Tensor): ground-truth boxes, xyxy format, shape [M, 4].
        pos_threshold (float): a box whose best IoU with any ground-truth box
            exceeds this value becomes a positive sample.
        neg_threshold (float, int, tuple, list): negative-sample IoU window.
            A number ``t`` selects boxes whose best IoU lies in (0, t); a pair
            ``(low, high)`` selects boxes whose best IoU lies in (low, high).
            Boxes in the window are marked as negative samples.
        allow_low_quality_matches (bool): additionally match each ground-truth
            box with its highest-IoU remaining box so that a ground-truth box
            gets a positive sample even when nothing exceeds ``pos_threshold``.
    Returns:
        tuple: ``(pos_pred_indices, pos_gt_indices, neg_pred_indices)``; the
        first two are aligned element-wise.
    Raises:
        TypeError: if ``neg_threshold`` is neither a number nor a pair.
    """
    # Resolve the negative window first. Previously an int or list threshold
    # fell through both isinstance checks and left `negative` undefined.
    if isinstance(neg_threshold, (tuple, list)):
        neg_low, neg_high = neg_threshold[0], neg_threshold[1]
    elif isinstance(neg_threshold, (int, float)):
        neg_low, neg_high = 0., neg_threshold
    else:
        raise TypeError('neg_threshold must be a number or a (low, high) pair, '
                        f'got {type(neg_threshold).__name__}')

    iou_table = torchvision.ops.box_iou(pred, gt)
    all_pred_indices = torch.arange(0, pred.shape[0], 1, device=pred.device)
    pos_pred_indices, pos_gt_indices = [], []
    max_values, max_indices = iou_table.max(dim=1)
    # Positive samples: best IoU with some ground-truth box exceeds the threshold.
    positive = max_values > pos_threshold
    pos_pred_indices.append(all_pred_indices[positive])
    pos_gt_indices.append(max_indices[positive])
    iou_table[positive] = -1  # mark as used so the box cannot be picked again

    # Fallback low-quality positives: for each ground-truth box, the remaining
    # box with the highest IoU.
    if allow_low_quality_matches:
        for i in range(gt.shape[0]):
            argmax = iou_table[:, i].argmax().reshape(1)
            # Rows already used hold -1, so a positive IoU means the box is
            # still free and actually overlaps this ground-truth box.
            if iou_table[argmax, i] > 0:
                pos_pred_indices.append(argmax)
                pos_gt_indices.append(torch.tensor([i], device=pred.device))
                iou_table[argmax] = -1

    # Negative samples: best remaining IoU strictly inside the window.
    # NOTE(review): with a scalar threshold the lower bound is 0 (exclusive),
    # so boxes with no overlap at all are never sampled as negatives —
    # confirm this is intended.
    max_values, max_indices = iou_table.max(dim=1)
    negative = (max_values < neg_high) & (max_values > neg_low)
    neg_pred_indices = all_pred_indices[negative]

    pos_pred_indices = torch.concat(pos_pred_indices)
    pos_gt_indices = torch.concat(pos_gt_indices)
    # Every box must be assigned to at most one ground-truth box.
    assert pos_pred_indices.unique().shape == pos_pred_indices.shape,\
        'there is at least one predicted box assigned to multiple ground truth boxes'
    return pos_pred_indices, pos_gt_indices, neg_pred_indices

In [6]:
def random_sampling(pos_pred_indices,
                    pos_gt_indices,
                    neg_pred_indices,
                    pred_xyxy,
                    pred_conf,
                    gt_xyxy,
                    gt_label,
                    neg_cls_target,
                    pos_ratio,
                    batch_size):
    """
    Sample a mini-batch from the positive/negative sets produced by
    ``assign_pred_to_gt`` and build the regression and classification targets.
    Args:
        pos_pred_indices (Tensor): indices of the anchors/proposals chosen as positives.
        pos_gt_indices (Tensor): ground-truth box index matched to each positive.
        neg_pred_indices (Tensor): indices of the anchors/proposals chosen as negatives.
        pred_xyxy (Tensor): all anchors/proposals, xyxy format.
        pred_conf (Tensor): network confidence output for every anchor/proposal.
        gt_xyxy (Tensor): ground-truth boxes, xyxy format.
        gt_label (Tensor): class index of each ground-truth box.
        neg_cls_target: class index used for negatives (the background class).
        pos_ratio (float): upper bound on the share of positives in the batch.
        batch_size (int): mini-batch size.
    Returns:
        tuple: ``(pos_reg_target, (cls_conf, cls_target), pos_pred_indices)``
        where positives come first in ``cls_conf`` / ``cls_target``.
    """
    # Draw positives: shuffle positions, keep at most batch_size * pos_ratio.
    pos_order = list(range(len(pos_pred_indices)))
    random.shuffle(pos_order)
    pos_order = pos_order[:int(batch_size * pos_ratio)]

    # Draw negatives; when positives are scarce, negatives fill up the batch.
    neg_order = list(range(len(neg_pred_indices)))
    random.shuffle(neg_order)
    neg_quota = max(batch_size - len(pos_order), int(batch_size * (1 - pos_ratio)))
    neg_order = neg_order[:neg_quota]

    sampled_pos = pos_pred_indices[pos_order]
    sampled_gt = pos_gt_indices[pos_order]
    sampled_neg = neg_pred_indices[neg_order]

    # Regression target: offsets from each positive box to its ground-truth box.
    src_boxes = xyxy2cxcywh(pred_xyxy)[sampled_pos]
    tgt_boxes = xyxy2cxcywh(gt_xyxy)[sampled_gt]
    pos_reg_target = coord_to_shift(src_boxes, tgt_boxes)

    # Classification target: matched class for positives, background for negatives.
    background = torch.empty(sampled_neg.shape,
                             device=pred_conf.device,
                             dtype=int).fill_(neg_cls_target)
    cls_target = torch.concat([gt_label[sampled_gt], background])

    # Network confidences on the sampled boxes, in the same order as the targets.
    cls_conf = torch.concat([pred_conf[sampled_pos], pred_conf[sampled_neg]])
    return pos_reg_target, (cls_conf, cls_target), sampled_pos

In [7]:
class CBR(nn.Sequential):
    """Conv2d (no bias) -> BatchNorm2d -> ReLU building block."""
    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1):
        # The convolution has no bias term; BatchNorm follows it directly.
        conv = nn.Conv2d(in_channels,
                         out_channels,
                         kernel_size=kernel_size,
                         stride=stride,
                         padding=padding,
                         bias=False)
        norm = nn.BatchNorm2d(out_channels)
        activation = nn.ReLU()
        super().__init__(conv, norm, activation)

In [8]:
class RPN(nn.Module):
    """Region Proposal Network head (feature extractor not included).

    Predicts, for every feature-map position and every anchor shape, one
    objectness logit and four offset coefficients.
    """
    def __init__(self, in_channels, hidden_channels, num_layers, area, ratio, downsample_rate):
        """
        Args:
            in_channels (int): channels of the incoming feature map.
            hidden_channels (int): channels of the RPN-specific conv layers.
            num_layers (int): number of RPN-specific CBR layers.
            area (sequence): anchor areas; combined with every entry of ``ratio``.
            ratio (sequence): anchor aspect ratios (h / w).
            downsample_rate (int): stride of the backbone, used to place anchors.
        """
        super().__init__()
        # Convolution layers that belong to the RPN only.
        self.conv = [CBR(in_channels, hidden_channels), ]
        for i in range(1, num_layers):
            self.conv.append(CBR(hidden_channels, hidden_channels))
        self.conv = nn.Sequential(*self.conv)
        
        self.area = area
        self.ratio = ratio
        self.num_boxes = len(area) * len(ratio)
        # Regression branch: H * W * num_boxes * 4 offset coefficients.
        self.branch_reg = nn.Conv2d(hidden_channels, self.num_boxes * 4, kernel_size=1, stride=1, padding=0)
        # Classification branch: H * W * num_boxes * 1 objectness logits.
        self.branch_cls = nn.Conv2d(hidden_channels, self.num_boxes, kernel_size=1, stride=1, padding=0)
        # Anchor shapes placed at every position, as (area, ratio) pairs.
        self.boxes = [(self.area[i], self.ratio[j]) for i in range(len(self.area)) for j in range(len(self.ratio))]
        # Downsampling rate of the backbone CNN (product of the strides of all layers).
        self.conv_downsample_rate = downsample_rate
        
        # Initialize the network weights.
        init_weight(self.conv)
        init_weight(self.branch_reg)
        init_weight(self.branch_cls)
    
    def forward(self, feature_map):
        """Map a backbone feature map to per-anchor objectness logits and offsets.

        Returns:
            tuple: ``(objectness, shift)``; ``objectness`` is fully flattened
            and ``shift`` is reshaped to [-1, 4].
        """
        hidden = self.conv(feature_map)
        objectness = self.branch_cls(hidden)
        shift = self.branch_reg(hidden)
        # Channels are moved last so the flattened order is (H, W, anchor),
        # matching the order produced by generate_anchor.
        # shape of objectness: [H * W * self.num_boxes]
        # shape of shift: [H * W * self.num_boxes, 4]
        # NOTE(review): these shapes assume a batch of one image, since the
        # batch dimension is flattened together with the rest — confirm.
        return objectness.permute(0, 2, 3, 1).flatten(), shift.permute(0, 2, 3, 1).reshape(-1, 4)
    
    def generate_anchor(self, output_size):
        """Generate H * W * self.num_boxes anchors in cxcywh format.

        Args:
            output_size: shape of the backbone feature map; only the last two
                entries (H, W) are read.
        Returns:
            Tensor: anchors of shape [H * W * self.num_boxes, 4], created on
            the module-level ``device``.
        """
        # Anchor width/height do not depend on position: compute them once and
        # broadcast them to every location.
        wh = torch.zeros(self.num_boxes, 2, device=device)
        for k, (area, ratio) in enumerate(self.boxes):
            # Aspect ratio is ratio = h / w; both sides are truncated to integers.
            w = int((area / ratio) ** 0.5)
            h = int(w * ratio)
            wh[k, 0] = w
            wh[k, 1] = h
        # Compute the anchor center for every position.
        # The original paper centers anchors on the receptive field of each
        # feature-map pixel; here centers start (downsample rate / 2) pixels
        # from the top-left corner and advance by the downsample rate.
        cx = torch.arange(0, output_size[-1], 1, device=device).reshape(1, output_size[-1], 1, 1)\
            * self.conv_downsample_rate + self.conv_downsample_rate // 2
        cy = torch.arange(0, output_size[-2], 1, device=device).reshape(output_size[-2], 1, 1, 1)\
            * self.conv_downsample_rate + self.conv_downsample_rate // 2
        # Concatenate centers and sizes into H * W * self.num_boxes anchors.
        anchors = torch.concat([cx.expand(output_size[-2], -1, self.num_boxes, -1),
                                cy.expand(-1, output_size[-1], self.num_boxes, -1),
                                wh.expand(output_size[-2], output_size[-1], -1, -1)], dim=-1)
        # shape of anchors: [H * W * self.num_boxes, 4]
        return anchors.reshape(-1, 4)
    
    def get_proposal(self, feature_map):
        """Refine the anchors with the predicted offsets to obtain proposals.

        Returns:
            tuple: ``(proposal_xyxy, objectness)`` with integer xyxy proposals
            and sigmoid-activated objectness scores.
        """
        # The forward pass runs without gradient tracking.
        with torch.no_grad():
            objectness, shift = self.forward(feature_map)
        anchor = self.generate_anchor(feature_map.shape)
        proposal_cxcywh = refine_box(anchor, shift)
        proposal_xyxy = cxcywh2xyxy(proposal_cxcywh)
        return proposal_xyxy, torch.sigmoid(objectness)

    def generate_training_data(self,
                               feature_map,
                               objectness,
                               gtboxes_xyxy,
                               pos_threshold,
                               neg_threshold,
                               pos_ratio=0.5,
                               batch_size=256):
        """Build one mini-batch of RPN training targets.

        Args:
            feature_map (Tensor): backbone output; only its shape is used here.
            objectness (Tensor): objectness logits returned by ``forward``.
            gtboxes_xyxy (Tensor): ground-truth boxes in xyxy format.
            pos_threshold, neg_threshold: IoU thresholds forwarded to
                ``assign_pred_to_gt``.
            pos_ratio (float): upper bound on the share of positives.
            batch_size (int): number of sampled anchors.
        Returns:
            tuple: ``(anchor_xyxy, pos_reg_target, (cls_logits, cls_target),
            pos_anchor_indices)``.
        """
        # Generate the anchors.
        anchor_cxcywh = self.generate_anchor(feature_map.shape)
        anchor_xyxy = cxcywh2xyxy(anchor_cxcywh)
        # Match anchors with ground-truth boxes.
        pos_anchor_indices, pos_gtbox_indices, neg_anchor_indices = \
            assign_pred_to_gt(anchor_xyxy,
                              gtboxes_xyxy,
                              pos_threshold,
                              neg_threshold,
                              allow_low_quality_matches=True)
        # Sample and build the training data. Objectness is binary, so every
        # positive gets label 1 (gt_label is all ones) and negatives get 0.
        pos_reg_target, (cls_logits, cls_target), pos_anchor_indices = \
            random_sampling(pos_anchor_indices,
                            pos_gtbox_indices,
                            neg_anchor_indices,
                            anchor_xyxy,
                            objectness,
                            gtboxes_xyxy,
                            gt_label=torch.ones_like(objectness),
                            neg_cls_target=0,
                            pos_ratio=pos_ratio,
                            batch_size=batch_size)
        return anchor_xyxy, pos_reg_target, (cls_logits, cls_target), pos_anchor_indices

In [9]:
class Fast_RCNN(nn.Module):
    """Fast R-CNN detection head (feature extractor not included)."""
    def __init__(self,
                 in_channels,
                 hidden_channels,
                 num_layers,
                 roi_output_size,
                 downsample_rate,
                 num_classes):
        super().__init__()

        self.roi_output_size = roi_output_size      # side length of the RoI-pooled feature map
        self.num_classes = num_classes              # number of object classes (background excluded)
        self.conv_downsample_rate = downsample_rate # stride of the backbone CNN

        # Convolution layers that belong to Fast R-CNN only, followed by a flatten.
        stack = [CBR(in_channels, hidden_channels)]
        stack.extend(CBR(hidden_channels, hidden_channels) for _ in range(num_layers - 1))
        stack.append(nn.Flatten())
        self.conv = nn.Sequential(*stack)

        flat_features = hidden_channels * self.roi_output_size**2
        # Classification branch: num_classes + 1 scores per proposal (last one is background).
        self.branch_cls = nn.Linear(flat_features, num_classes + 1)
        # Regression branch: 4 offset coefficients per class per proposal.
        self.branch_reg = nn.Linear(flat_features, 4 * num_classes)

        # Weight initialization.
        for part in (self.conv, self.branch_reg, self.branch_cls):
            init_weight(part)

    def forward(self, feature_map, proposals):
        """Score and regress every proposal.

        Args:
            feature_map (Tensor): backbone output.
            proposals (Tensor): proposals in xyxy format, shape [num_proposals, 4].
        Returns:
            tuple: class scores of shape [num_proposals, num_classes + 1] and
            offsets of shape [num_proposals, num_classes, 4].
        """
        # torchvision.ops.roi_pool expects [N, 5] boxes: the index of the
        # feature map in the batch followed by the xyxy coordinates. All
        # proposals refer to feature map 0.
        batch_column = torch.zeros(proposals.shape[0], 1, device=proposals.device)
        rois = torch.concat([batch_column, proposals], dim=-1)
        pooled = torchvision.ops.roi_pool(feature_map, rois, self.roi_output_size, 1/self.conv_downsample_rate)
        feature = self.conv(pooled)
        scores = self.branch_cls(feature)
        offsets = self.branch_reg(feature)
        return scores, offsets.reshape(-1, self.num_classes, 4)

    def generate_training_data(self,
                               proposal,
                               category_confidence,
                               category_shift,
                               gtbox_labels,
                               gtbox_coords,
                               pos_threshold=0.5,
                               neg_threshold=(0.1, 0.5),
                               pos_ratio=0.5,
                               batch_size=64):
        """Build one mini-batch of Fast R-CNN training targets.

        Returns:
            tuple: ``(pos_reg_target, (cls_logits, cls_target),
            pos_proposal_indices, pos_cls_target)``.
        """
        # Match proposals with ground-truth boxes.
        matched_pos, matched_gt, matched_neg = assign_pred_to_gt(
            proposal,
            gtbox_coords,
            pos_threshold,
            neg_threshold,
            allow_low_quality_matches=True)
        # Sample the batch; negatives are labelled with the background index.
        reg_target, (logits, targets), sampled_pos = random_sampling(
            matched_pos,
            matched_gt,
            matched_neg,
            proposal,
            category_confidence,
            gtbox_coords,
            gtbox_labels,
            neg_cls_target=self.num_classes,
            pos_ratio=pos_ratio,
            batch_size=batch_size)
        # Positives come first in the targets: slice out their class indices.
        num_pos = sampled_pos.shape[0]
        pos_targets = targets[:num_pos]
        # No positive sample may be labelled as background.
        assert (pos_targets == self.num_classes).sum() == 0
        return reg_target, (logits, targets), sampled_pos, pos_targets

In [10]:
class FasterRCNN(nn.Module):
    """Faster R-CNN: an RPN and a Fast R-CNN head, each with its own backbone slot."""
    def __init__(self, rpn_backbone, frcn_backbone, cfg):
        """
        Args:
            rpn_backbone (nn.Module): backbone CNN feeding the RPN.
            frcn_backbone (nn.Module): backbone CNN feeding Fast R-CNN.
            cfg: configuration object; reads ``num_classes``,
                ``frcn_hidden_channels``, ``frcn_num_layers``,
                ``frcn_roi_output_size``, ``downsample_rate``,
                ``rpn_hidden_channels``, ``rpn_num_layers``, ``anchor_area``
                and ``anchor_ratio``.
        """
        super().__init__()
        # Backbone CNN used by Fast R-CNN.
        self.frcn_backbone = frcn_backbone
        # Backbone CNN used by the RPN.
        self.rpn_backbone = rpn_backbone
        # Number of object classes.
        self.num_classes = cfg.num_classes
        self.FRCN = Fast_RCNN(in_channels=self._infer_out_channels(self.frcn_backbone),
                              hidden_channels=cfg.frcn_hidden_channels,
                              num_layers=cfg.frcn_num_layers,
                              roi_output_size=cfg.frcn_roi_output_size,
                              downsample_rate=cfg.downsample_rate,
                              num_classes=self.num_classes)
        self.RPN = RPN(in_channels=self._infer_out_channels(self.rpn_backbone),
                       hidden_channels=cfg.rpn_hidden_channels,
                       num_layers=cfg.rpn_num_layers,
                       area=cfg.anchor_area,
                       ratio=cfg.anchor_ratio,
                       downsample_rate=cfg.downsample_rate)
        # Whether the RPN and Fast R-CNN can consume the feature map of the
        # same backbone. Must be maintained manually by the caller.
        self.same_backbone = None

    @staticmethod
    def _infer_out_channels(backbone):
        """Return the channel count of ``backbone``'s output via a dummy forward pass.

        The probe runs in eval mode and without autograd so that it cannot
        alter BatchNorm running statistics, and it is created on the
        backbone's own device. Per-module training flags are restored afterwards.
        """
        first_param = next(backbone.parameters(), None)
        probe_device = first_param.device if first_param is not None else torch.device('cpu')
        # Remember each submodule's mode individually; some may have been
        # put into eval mode by the caller on purpose.
        saved_modes = [(m, m.training) for m in backbone.modules()]
        backbone.eval()
        try:
            with torch.no_grad():
                probe = torch.zeros(1, 3, 64, 64, device=probe_device)
                return backbone(probe).shape[1]
        finally:
            for m, mode in saved_modes:
                m.training = mode

    def forward(self, input, proposal_iou_threshold):
        """Forward pass: proposals from the RPN, then class scores and offsets.

        Returns:
            tuple: ``(proposal, confidence, shift)``.
        """
        # Feature map of rpn_backbone and proposals from [rpn_backbone, RPN].
        conv_rpn_out, proposal, _ = self.get_proposal(input, proposal_iou_threshold)
        # With a shared backbone the features are extracted only once.
        if self.same_backbone:
            conv_frcn_out = conv_rpn_out
        # During training the two backbones may differ: extract features again.
        else:
            conv_frcn_out = self.frcn_backbone(input)
        # Fast R-CNN scores every proposal and predicts its offsets.
        confidence, shift = self.FRCN(conv_frcn_out, proposal)
        return proposal, confidence, shift

    def get_prediction(self,
                       input,
                       proposal_iou_threshold,
                       confidence_threshold,
                       prediction_iou_threshold):
        """
        Run detection on an image and return the final boxes.
        Args:
            input (Tensor): input image.
            proposal_iou_threshold (float): IoU threshold of the NMS applied to proposals.
            confidence_threshold (float): predictions whose highest class
                confidence is below this value are discarded.
            prediction_iou_threshold (float): IoU threshold of the per-class
                NMS applied to the predicted boxes.
        Returns:
            tuple: ``(boxes, categories, confidences)`` of the kept predictions.
        """
        # Forward pass: proposals, per-class confidences and offsets.
        with torch.no_grad():
            proposal, confidence, shift = self.forward(input, proposal_iou_threshold)
        confidence = confidence.softmax(dim=1)
        # Best class and its confidence for every proposal.
        max_values, max_indices = confidence.max(dim=1)
        # Keep proposals that are not background (index num_classes) and are
        # confident enough.
        not_background = (max_indices != self.num_classes) & (max_values >= confidence_threshold)
        selected_proposal_indices = torch.arange(0, confidence.shape[0],
                                                 device=confidence.device)[not_background]
        selected_proposals = proposal[selected_proposal_indices]
        selected_category = max_indices[selected_proposal_indices]
        selected_confidence = max_values[selected_proposal_indices]
        # Offsets of the best class of each selected proposal.
        selected_shift = shift[selected_proposal_indices, selected_category, :]
        # Refine the selected proposals with their offsets to get the predictions.
        prediction = cxcywh2xyxy(refine_box(xyxy2cxcywh(selected_proposals), selected_shift))
        # Per-class NMS removes redundant predictions.
        remained = batched_nms(prediction.float(),
                               selected_confidence,
                               selected_category,
                               prediction_iou_threshold)
        return prediction[remained], selected_category[remained], selected_confidence[remained]

    def get_proposal(self, input, proposal_iou_threshold=1.0):
        """Propose boxes with the RPN and remove redundant ones with NMS.

        Returns:
            tuple: ``(conv_rpn_out, proposals, objectness)`` after NMS.
        """
        with torch.no_grad():
            conv_rpn_out = self.rpn_backbone(input)
            proposal, objectness = self.RPN.get_proposal(conv_rpn_out)
        remained = torchvision.ops.nms(proposal.float(), objectness, proposal_iou_threshold)
        return conv_rpn_out, proposal[remained], objectness[remained]

    def load_params(self, version):
        """Load the four weight files saved under ``models/`` for ``version``."""
        # map_location='cpu' lets checkpoints saved on a GPU load on any
        # machine; load_state_dict then copies the values onto each
        # parameter's current device.
        # NOTE: torch.load unpickles the file — only load trusted checkpoints.
        self.rpn_backbone.load_state_dict(torch.load(f'models/{version}_rpn_backbone.pth', map_location='cpu'))
        self.RPN.load_state_dict(torch.load(f'models/{version}_rpn.pth', map_location='cpu'))
        self.frcn_backbone.load_state_dict(torch.load(f'models/{version}_frcn_backbone.pth', map_location='cpu'))
        self.FRCN.load_state_dict(torch.load(f'models/{version}_frcn.pth', map_location='cpu'))

In [11]:
def show_boxes(image, box1=None, box2=None, display=True, scale=2.0):
    """Draw ``box1`` in red and ``box2`` in green on the image.

    Args:
        image: normalized image tensor ([3, H, W] or [1, 3, H, W]) or anything
            ``np.array`` accepts. Tensors are de-normalized and resized by ``scale``.
        box1 (Tensor, optional): xyxy boxes drawn in red with a center dot.
        box2 (Tensor, optional): xyxy boxes drawn in green.
        display (bool): also show the result with matplotlib.
        scale (float): magnification applied to the image and the boxes.
    Returns:
        numpy.ndarray: the image with the boxes drawn on it.
    """
    if isinstance(image, torch.Tensor):
        tensor = image.squeeze(0) if image.dim() == 4 else image
        tensor = tensor.clone()
        # Undo the dataset normalization (multiply by std, add mean).
        std = torch.tensor([0.2391, 0.2351, 0.2397], device=tensor.device).reshape(3, 1, 1)
        mean = torch.tensor([0.4570, 0.4382, 0.4062], device=tensor.device).reshape(3, 1, 1)
        tensor *= std
        tensor += mean
        # Magnify the shorter side by the requested factor.
        short_side = min(tensor.shape[-1], tensor.shape[-2])
        tensor = T.Resize(int(scale * short_side))(tensor)
        image = T.ToPILImage()(tensor)
    canvas = np.array(image)
    # Green boxes are drawn first so that red boxes end up on top.
    if box2 is not None:
        for row in (box2 * scale).int().tolist():
            cv2.rectangle(canvas,
                          (row[0], row[1]),
                          (row[2], row[3]),
                          (0, 255, 0), int(2*scale))
    if box1 is not None:
        for row in (box1 * scale).int().tolist():
            cv2.rectangle(canvas,
                          (row[0], row[1]),
                          (row[2], row[3]),
                          (255, 0, 0), int(1*scale))
            # Mark the center of each red box with a filled dot.
            center = ((row[0] + row[2]) // 2, (row[1] + row[3]) // 2)
            cv2.circle(canvas, center, int(1*scale), (128, 128, 255), -1)
    if display:
        plt.figure(figsize=(10, 10), dpi=int(60*scale))
        plt.imshow(canvas)
    return canvas

In [12]:
def show_predictions(net,
                     data,
                     proposal_iou_threshold,
                     confidence_threshold,
                     prediction_iou_threshold,
                     display=True,
                     scale=2.0):
    """
    Run the model on one sample and draw the predicted boxes (with class name
    and confidence) together with the ground-truth boxes on the image.
    Returns the annotated image.
    """
    image, (box_labels, box_coords_xyxy) = data
    image, box_labels, box_coords_xyxy = (
        tensor.to(device) for tensor in (image, box_labels, box_coords_xyxy)
    )
    if image.dim() == 3:
        # A bare dataset sample: add the batch dimension the network expects.
        image = image.unsqueeze(0)
    else:
        # A DataLoader batch: drop the batch dimension from the annotations.
        box_labels = box_labels.squeeze(0)
        box_coords_xyxy = box_coords_xyxy.squeeze(0)
    net.eval()
    with torch.no_grad():
        output = net.get_prediction(image,
                                    proposal_iou_threshold,
                                    confidence_threshold,
                                    prediction_iou_threshold)
    label_text = ['person',
                  'bird', 'cat', 'cow', 'dog', 'horse', 'sheep',
                  'aeroplane', 'bicycle', 'boat', 'bus', 'car', 'motorbike', 'train',
                  'bottle', 'chair', 'diningtable', 'pottedplant', 'sofa', 'tvmonitor']
    has_predictions = output[0].numel() != 0
    # Without predictions only the ground-truth boxes are drawn.
    image = show_boxes(image,
                       output[0] if has_predictions else None,
                       box_coords_xyxy,
                       display=False,
                       scale=scale)
    if has_predictions:
        for prediction, category, confidence in zip(*output):
            # Put the class name and confidence just above the top-left corner.
            corner = (prediction * scale)[:2]
            corner[1] -= scale * 2
            text_pos = corner.int().cpu().numpy()
            cv2.putText(image, f'{label_text[category]} {confidence.item():.2f}',
                        text_pos,
                        cv2.FONT_HERSHEY_SIMPLEX,
                        0.4 * scale,
                        (255, 0, 0),
                        round(1 * scale))
    if display:
        plt.figure(figsize=(10, 10), dpi=int(60*scale))
        plt.imshow(image)
    return image

In [13]:
def train_rpn_one_step(net,
                       data,
                       cfg,
                       optimizer,
                       criterion_reg,
                       criterion_cls):
    """Run one forward and one backward pass to train the RPN.

    Returns:
        tuple: ``(regression loss, classification loss, weighted total)`` as floats.
    """
    image, (box_labels, box_coords_xyxy) = data
    image = image.to(device)
    # The labels are moved for parity with the other training steps; the RPN
    # itself only needs the boxes.
    box_labels = box_labels.to(device).squeeze(0)
    gt_boxes = box_coords_xyxy.to(device)
    if gt_boxes.dim() == 3:
        gt_boxes = gt_boxes.squeeze(0)

    # Forward pass.
    feature_map = net.rpn_backbone(image)
    logits, shift = net.RPN(feature_map)

    # Turn the forward-pass results into a sampled training batch.
    sampling_options = dict(pos_threshold=cfg.rpn_pos_threshold,
                            neg_threshold=cfg.rpn_neg_threshold,
                            pos_ratio=cfg.rpn_pos_ratio,
                            batch_size=cfg.rpn_batch_size)
    _, reg_target, (sampled_logits, sampled_target), pos_indices = \
        net.RPN.generate_training_data(feature_map, logits, gt_boxes, **sampling_options)

    # Losses of the two branches, combined with a weighting coefficient.
    loss_reg = criterion_reg(shift[pos_indices], reg_target)
    loss_cls = criterion_cls(sampled_logits, sampled_target)
    loss = loss_reg + cfg.rpn_alpha * loss_cls

    # Clear old gradients, back-propagate, update the parameters.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss_reg.item(), loss_cls.item(), loss.item()

In [14]:
def train_rpn(net, cfg, stage, lr, num_epochs, update_backbone=True):
    """Train the RPN (and optionally its backbone) on the VOC training split.

    Args:
        net: model exposing ``rpn_backbone``, ``RPN`` and ``get_proposal``.
        cfg: configuration object; reads ``rpn_pos_weight``,
            ``rpn_weight_decay``, ``scale_ratio``, ``num_workers`` and
            ``version`` here (more fields are read by ``train_rpn_one_step``).
        stage: name of the training stage, used in the TensorBoard log path.
        lr (float): initial learning rate.
        num_epochs (int): number of epochs; also the cosine-annealing period.
        update_backbone (bool): also optimize ``net.rpn_backbone`` when True.

    Side effects: writes TensorBoard logs under ``runs/`` and overwrites the
    RPN checkpoints under ``models/`` after every epoch.
    """
    # Box regression loss: smooth L1.
    criterion_reg = nn.SmoothL1Loss()
    # Objectness loss: binary cross entropy on logits.
    # Positive samples are fewer than negative ones, so their loss is
    # multiplied by a weight greater than 1 to balance the two.
    criterion_cls = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(cfg.rpn_pos_weight))
    # When the backbone CNN is trained as well, its parameters must also be
    # handed to the optimizer.
    if update_backbone:
        optimizer = torch.optim.SGD(nn.ModuleList([net.rpn_backbone, net.RPN]).parameters(),
                                    lr=lr, weight_decay=cfg.rpn_weight_decay, momentum=0.9)
    # Otherwise only the RPN-specific layers are optimized.
    else:
        optimizer = torch.optim.SGD(net.RPN.parameters(),
                                    lr=lr, weight_decay=cfg.rpn_weight_decay, momentum=0.9)
    # Cosine-annealed learning rate, stepped once per epoch.
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)
    
    # Local dataset built with the configured scale ratio (independent of any
    # module-level voc_train).
    voc_train = PascalVOC2012(train=True, scale_ratio=cfg.scale_ratio)
    voc_train_dataloader = torch.utils.data.DataLoader(voc_train,
                                                       batch_size=1,
                                                       shuffle=True,
                                                       num_workers=cfg.num_workers)
    writer = SummaryWriter(log_dir=f'runs/{cfg.version}/RPN/{stage}')
    # Freeze everything and switch to eval mode first ...
    freeze(net)
    net.eval()
    # ... then unlock only the parts of the network that are to be updated.
    unfreeze(net.RPN)
    net.RPN.train()
    if update_backbone:
        unfreeze(net.rpn_backbone)
        net.rpn_backbone.train()
    global_step = 0
    for epoch in range(num_epochs):
        epoch_loss = []
        for i, data in enumerate(voc_train_dataloader):
            # One optimization step.
            loss_reg, loss_cls, loss = train_rpn_one_step(net,
                                                          data,
                                                          cfg,
                                                          optimizer,
                                                          criterion_reg,
                                                          criterion_cls)
            # Log the losses every 5 steps.
            if (global_step+1) % 5 == 0:
                writer.add_scalars('train/loss', {'reg': loss_reg, 
                                                  'cls': loss_cls,
                                                  'weighted sum': loss}, global_step=global_step)
            global_step += 1
            epoch_loss.append(loss)
            
            # Five times per pass over the dataset: print the average loss of
            # the most recent steps and log an image with the current proposals.
            if (global_step+1) % (len(voc_train) // 5) == 0:
                moving_average = epoch_loss[-(len(voc_train) // 5):]
                print(f'epoch {epoch+1:4d}, iter {global_step+1:8d}, loss={sum(moving_average) / len(moving_average):8.4f}')
                
                # Reuse the sample of the current step for the visualization.
                image, (box_labels, box_coords_xyxy) = data
                image = image.to(device)
                box_labels = box_labels.to(device).squeeze(0)
                if box_coords_xyxy.dim() == 3:
                    box_coords_xyxy = box_coords_xyxy.squeeze(0)
                with torch.no_grad():
                    _, proposal, objectness = net.get_proposal(image, proposal_iou_threshold=0.8)
                # Only show proposals with an objectness above 0.6.
                proposal = proposal[objectness > 0.6]
                image_with_proposals = show_boxes(image, proposal, box_coords_xyxy, display=False)
                writer.add_image('train/images_with_proposals',
                                 image_with_proposals,
                                 global_step=global_step,
                                 dataformats='HWC')
        scheduler.step()
        # Export the parameters to disk after every epoch.
        # NOTE(review): presumably the models/ directory must already exist — confirm.
        torch.save(net.rpn_backbone.state_dict(), f'models/{cfg.version}_rpn_backbone.pth')
        torch.save(net.RPN.state_dict(), f'models/{cfg.version}_rpn.pth')
    writer.close()

In [15]:
def train_frcn_one_step(net,
                        data,
                        cfg,
                        optimizer,
                        criterion_reg,
                        criterion_cls):
    """Run one forward pass, one backward pass and one optimizer step for Fast R-CNN.

    Args:
        net: detector exposing ``get_proposal``, ``frcn_backbone`` and ``FRCN``.
        data: one sample ``(image, (box_labels, box_coords_xyxy))``.
        cfg: configuration; reads ``frcn_pos_threshold``, ``frcn_neg_threshold``,
            ``frcn_pos_ratio``, ``frcn_batch_size`` and ``frcn_alpha``.
        optimizer: optimizer holding the parameters to update.
        criterion_reg: loss applied to the box-regression outputs of the
            positive proposals.
        criterion_cls: loss applied to the classification logits.

    Returns:
        Tuple ``(loss_reg, loss_cls, loss)`` of Python floats, where
        ``loss = loss_reg + cfg.frcn_alpha * loss_cls``. ``loss_reg`` is 0.0
        when the sampled mini-batch contains no positive proposal.
    """
    image, (box_labels, box_coords_xyxy) = data
    image = image.to(device)
    # squeeze(0) drops a leading size-1 dimension, if there is one
    box_labels = box_labels.to(device).squeeze(0)
    box_coords_xyxy = box_coords_xyxy.to(device)
    if box_coords_xyxy.dim() == 3:
        box_coords_xyxy = box_coords_xyxy.squeeze(0)
    # Proposals come from the RPN; no gradient is needed for that part
    with torch.no_grad():
        _, proposal, _ = net.get_proposal(image, proposal_iou_threshold=1.0)
    # Forward pass
    feature_map = net.frcn_backbone(image)
    category_confidence, category_shift = net.FRCN(feature_map, proposal)
    # Build the training targets
    pos_reg_target, (cls_logits, cls_target), pos_proposal_indices, pos_cls_target = \
        net.FRCN.generate_training_data(proposal,
                                        category_confidence,
                                        category_shift,
                                        box_labels,
                                        box_coords_xyxy,
                                        pos_threshold=cfg.frcn_pos_threshold,
                                        neg_threshold=cfg.frcn_neg_threshold,
                                        pos_ratio=cfg.frcn_pos_ratio,
                                        batch_size=cfg.frcn_batch_size)
    # Loss of the regression branch
    if pos_reg_target.numel() == 0:
        # With no positive proposal, a mean-reduced criterion (such as the
        # nn.SmoothL1Loss() that train_frcn passes in) averages over an empty
        # tensor and yields NaN. Use a zero that is still attached to the graph
        # so the total loss stays finite and backward() behaves normally.
        loss_reg = category_shift.sum() * 0.0
    else:
        loss_reg = criterion_reg(category_shift[pos_proposal_indices, pos_cls_target, :], pos_reg_target)
    # Loss of the classification branch
    loss_cls = criterion_cls(cls_logits, cls_target)
    # Weighted sum balancing the influence of the two branches on the gradient
    loss = loss_reg + cfg.frcn_alpha * loss_cls
    # Clear old gradients, back-propagate, then update the parameters
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss_reg.item(), loss_cls.item(), loss.item()

In [16]:
def train_frcn(net, cfg, stage, lr, num_epochs, update_backbone=True):
    """Train the Fast R-CNN head, optionally together with its backbone.

    Trains on the PASCAL VOC 2012 training split one image at a time, logs the
    losses and a sample prediction image to TensorBoard under
    ``runs/{cfg.version}/FRCN/{stage}``, and saves the state dicts of
    ``net.frcn_backbone`` and ``net.FRCN`` into ``models/`` after every epoch.

    Args:
        net: detector exposing ``frcn_backbone`` and ``FRCN`` sub-modules.
        cfg: configuration object; this function reads ``version``,
            ``scale_ratio``, ``num_workers`` and ``frcn_weight_decay`` and
            forwards ``cfg`` to ``train_frcn_one_step``.
        stage: stage name, used as the TensorBoard sub-directory.
        lr: initial SGD learning rate (cosine-annealed over ``num_epochs``).
        num_epochs: number of passes over the training set.
        update_backbone: when True, ``net.frcn_backbone`` is optimised as
            well; otherwise only ``net.FRCN`` is trained.
    """
    import os  # local import: `os` is not among the notebook's top-level imports

    criterion_reg = nn.SmoothL1Loss()
    # Inverse of each class's share of the samples, computed for a 1:1
    # positive/negative ratio, then smoothed and used as per-class weights.
    # NOTE(review): 21 entries - presumably 20 classes plus background last; confirm.
    weight = torch.tensor([ 3.1429, 26.6453, 25.9015, 44.4338, 20.5391, 41.8409, 30.9902,
                           33.5617,  8.4732, 31.0512, 49.7603, 13.2443, 42.0640, 48.2385,
                           21.0601, 10.8264, 42.2895, 28.3196, 39.5338, 38.2864,  1.0000],
                          device=device) / 10 + 0.9
    # Alternatively, apply one uniform weight to all positive classes:
    # weight = torch.ones(cfg.num_classes + 1, device=device)
    # weight[:cfg.num_classes] *= cfg.frcn_pos_weight
    print(weight)
    criterion_cls = nn.CrossEntropyLoss(weight=weight)
    if update_backbone:
        optimizer = torch.optim.SGD(nn.ModuleList([net.frcn_backbone, net.FRCN]).parameters(),
                                    lr=lr, weight_decay=cfg.frcn_weight_decay, momentum=0.9)
    else:
        optimizer = torch.optim.SGD(net.FRCN.parameters(),
                                    lr=lr, weight_decay=cfg.frcn_weight_decay, momentum=0.9)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)
    voc_train = PascalVOC2012(train=True, scale_ratio=cfg.scale_ratio)
    voc_train_dataloader = torch.utils.data.DataLoader(voc_train,
                                                       batch_size=1,
                                                       shuffle=True,
                                                       num_workers=cfg.num_workers)
    # Report roughly five times per epoch; never let the interval reach 0,
    # which would make the modulo below raise on very small datasets.
    log_interval = max(1, len(voc_train) // 5)

    # Put the whole network in frozen/eval state, then re-enable only the
    # parts being trained (freeze/unfreeze are helpers defined elsewhere).
    freeze(net)
    net.eval()
    unfreeze(net.FRCN)
    net.FRCN.train()
    if update_backbone:
        unfreeze(net.frcn_backbone)
        net.frcn_backbone.train()

    # torch.save does not create missing directories
    os.makedirs('models', exist_ok=True)
    writer = SummaryWriter(log_dir=f'runs/{cfg.version}/FRCN/{stage}')
    global_step = 0
    try:
        for epoch in range(num_epochs):
            epoch_loss = []
            for data in voc_train_dataloader:
                loss_reg, loss_cls, loss = train_frcn_one_step(net,
                                                               data,
                                                               cfg,
                                                               optimizer,
                                                               criterion_reg,
                                                               criterion_cls)
                if (global_step+1) % 5 == 0:
                    writer.add_scalars('train/loss', {'reg': loss_reg,
                                                      'cls': loss_cls,
                                                      'weighted sum': loss}, global_step=global_step)
                global_step += 1
                # Keep NaN steps out of the running average
                if not np.isnan(loss):
                    epoch_loss.append(loss)
                # Print the loss and log an image (same cadence as train_rpn:
                # the check runs after global_step has been incremented)
                if (global_step+1) % log_interval == 0:
                    moving_average = epoch_loss[-log_interval:]
                    # The window can be empty if every recent step was NaN
                    average = sum(moving_average) / len(moving_average) if moving_average else float('nan')
                    print(f'epoch {epoch+1:4d}, iter {global_step+1:8d}, loss={average:8.4f}')

                    images_with_predictions = show_predictions(net,
                                                               data,
                                                               proposal_iou_threshold=0.6,
                                                               confidence_threshold=0.6,
                                                               prediction_iou_threshold=0.6,
                                                               display=False,
                                                               scale=2.0)
                    writer.add_image('train/images_with_predictions',
                                     images_with_predictions,
                                     global_step=global_step,
                                     dataformats='HWC')
            scheduler.step()
            # Export the parameters to disk after every epoch
            torch.save(net.frcn_backbone.state_dict(), f'models/{cfg.version}_frcn_backbone.pth')
            torch.save(net.FRCN.state_dict(), f'models/{cfg.version}_frcn.pth')
    finally:
        # Flush and close the event file even if training is interrupted
        writer.close()

In [17]:
class Configuration:
    """Hyperparameters for one version of the Faster R-CNN experiment."""

    def __init__(self, save=False):
        """Populate all hyperparameters; append them to configs.txt when ``save`` is true."""
        # ---------------- general ----------------
        self.version = 'version 14'
        # Backbone CNN in use
        self.backbone = 'resnet50'
        # Data augmentation applied
        self.augmentation = 'ColorJitter, RandomHorizontalFlip'
        # Downsampling rate of the backbone CNN (product of all layer strides)
        self.downsample_rate = 32
        self.num_classes = 20
        # Scaling factor applied to the dataset images
        self.scale_ratio = 1.0
        self.num_workers = 0

        # ---------------- RPN architecture ----------------
        # Anchor areas (squares of the listed side lengths)
        self.anchor_area = [side ** 2 for side in (32, 64, 128, 256, 512)]
        # Anchor aspect ratios
        self.anchor_ratio = [2., 1., 0.5]
        # Output channels of the convolution layers inside the RPN
        self.rpn_hidden_channels = 256
        # Number of convolution layers inside the RPN
        self.rpn_num_layers = 4

        # ---------------- Fast R-CNN architecture ----------------
        # Output feature-map size of ROI pooling for each proposal
        self.frcn_roi_output_size = 5
        # Channels of the convolution layers inside Fast R-CNN
        self.frcn_hidden_channels = 256
        # Number of convolution layers inside Fast R-CNN
        self.frcn_num_layers = 5

        # ---------------- RPN training ----------------
        # Learning rates for stages 1 and 3 of the four-stage training
        self.rpn_lr_stage_1 = 0.001
        self.rpn_lr_stage_3 = 0.0001
        # Weight decay
        self.rpn_weight_decay = 0.00001
        # Number of epochs for stages 1 and 3 of the four-stage training
        self.rpn_num_epochs_stage_1 = 25
        self.rpn_num_epochs_stage_3 = 10
        # Trade-off factor multiplied with the classification-branch loss when training the RPN
        self.rpn_alpha = 0.02
        # Loss gain for positive samples
        self.rpn_pos_weight = 6.5
        # IoU thresholds for positive and negative samples
        self.rpn_pos_threshold = 0.6
        self.rpn_neg_threshold = 0.3
        # Upper bound on the fraction of positives within a mini-batch
        self.rpn_pos_ratio = 0.5
        # Mini-batch size
        self.rpn_batch_size = 256

        # ---------------- Fast R-CNN training (stages 2 and 4) ----------------
        self.frcn_lr_stage_2 = 0.003
        self.frcn_lr_stage_4 = 0.0001
        self.frcn_weight_decay = 0.00001
        self.frcn_num_epochs_stage_2 = 20
        self.frcn_num_epochs_stage_4 = 10
        self.frcn_alpha = 0.01
        self.frcn_pos_weight = 4.5
        self.frcn_pos_threshold = 0.5
        self.frcn_neg_threshold = (0.1, 0.5)
        self.frcn_pos_ratio = 0.5
        self.frcn_batch_size = 64

        if save:
            self.save_config()

    def __str__(self):
        """Return the version label."""
        return self.version

    def save_config(self):
        """Append all attributes to configs.txt as ``name: value`` lines wrapped in braces."""
        entries = [f'{name}: {value!s}\n' for name, value in vars(self).items()]
        with open('configs.txt', 'a') as f:
            f.write('{\n' + ''.join(entries) + '}\n')

In [18]:
# Build the hyperparameter set without writing it to configs.txt yet
cfg = Configuration(save=False)

In [19]:
# Record the current hyperparameters in configs.txt. The file is opened in
# append mode, so every run of this cell adds another record.
cfg.save_config()

In [20]:
def get_resnet_backbone():
    """Build a ResNet-50 feature extractor from all but its last two top-level layers.

    Returns:
        An ``nn.Sequential`` holding, in order, every top-level sub-module of
        a ``resnet50(pretrained=True)`` model except the final two (in
        torchvision's ResNet these are the average pool and the
        fully connected classifier).
    """
    resnet = models.resnet50(pretrained=True)
    feature_layers = list(resnet.children())[:-2]
    return nn.Sequential(*feature_layers)

In [21]:
# Two separate pretrained backbones: one feeds the RPN, the other Fast R-CNN
rpn_backbone, frcn_backbone = get_resnet_backbone(), get_resnet_backbone()
net = FasterRCNN(rpn_backbone, frcn_backbone, cfg)
net = net.to(device)
# Flag kept on the model; False marks the two backbones as not sharing parameters
net.same_backbone = False
# Show the architecture of both heads
print(net.RPN, net.FRCN, sep='\n')

RPN(
  (conv): Sequential(
    (0): CBR(
      (0): Conv2d(2048, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (1): CBR(
      (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (2): CBR(
      (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (3): CBR(
      (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
  )
  (branch_reg): Conv2d(256, 60, kernel_size=(1, 1), stride=(1, 1))
  (branch_cls): C

In [None]:
"""第一阶段，RPN和rpn_backbone参与训练"""
# Stage 1 of the four-stage schedule: the RPN and rpn_backbone are both trained
# (update_backbone=True), using the stage-1 learning rate and epoch count.
train_rpn(net,
          cfg,
          'stage1',
          cfg.rpn_lr_stage_1,
          num_epochs=cfg.rpn_num_epochs_stage_1,
          update_backbone=True)
# After training, rpn_backbone's parameters have changed and differ from frcn_backbone's
net.same_backbone = False

epoch    1, iter     1143, loss=  0.0433
epoch    1, iter     2286, loss=  0.0369
epoch    1, iter     3429, loss=  0.0345
epoch    1, iter     4572, loss=  0.0330
epoch    1, iter     5715, loss=  0.0323
epoch    2, iter     6858, loss=  0.0291
epoch    2, iter     8001, loss=  0.0287
epoch    2, iter     9144, loss=  0.0276
epoch    2, iter    10287, loss=  0.0271
epoch    2, iter    11430, loss=  0.0255
epoch    3, iter    12573, loss=  0.0229
epoch    3, iter    13716, loss=  0.0234
epoch    3, iter    14859, loss=  0.0227
epoch    3, iter    16002, loss=  0.0232
epoch    3, iter    17145, loss=  0.0229
epoch    4, iter    18288, loss=  0.0202
epoch    4, iter    19431, loss=  0.0198
epoch    4, iter    20574, loss=  0.0202
epoch    4, iter    21717, loss=  0.0206
epoch    4, iter    22860, loss=  0.0192
epoch    5, iter    24003, loss=  0.0178
epoch    5, iter    25146, loss=  0.0175
epoch    5, iter    26289, loss=  0.0185
epoch    5, iter    27432, loss=  0.0181
epoch    5, iter

In [None]:
"""第二阶段，Fast R-CNN和frcn_backbone参与训练"""
# Stage 2 of the four-stage schedule: Fast R-CNN and frcn_backbone are both trained
train_frcn(
    net,
    cfg,
    stage='stage2',
    lr=cfg.frcn_lr_stage_2,
    num_epochs=cfg.frcn_num_epochs_stage_2,
    update_backbone=True,
)
# frcn_backbone's parameters have now changed too, yet they still differ from rpn_backbone's
net.same_backbone = False

In [None]:
# Restore the saved parameters for this version
# (load_params is defined elsewhere in the notebook)
net.load_params(cfg.version)
# Replace rpn_backbone's parameters with those of frcn_backbone.
# map_location keeps the load working when the checkpoint was written on a
# different device than the one in use now (e.g. saved on GPU, loaded on CPU).
net.rpn_backbone.load_state_dict(
    torch.load(f'models/{cfg.version}_frcn_backbone.pth', map_location=device))
# Both backbones now hold identical parameters
net.same_backbone = True

In [None]:
"""第三阶段，仅更新RPN的参数"""
# Stage 3 of the four-stage schedule: only the RPN's parameters are updated
# (update_backbone=False), using the stage-3 learning rate and epoch count.
train_rpn(net,
          cfg,
          'stage3',
          cfg.rpn_lr_stage_3,
          num_epochs=cfg.rpn_num_epochs_stage_3,
          update_backbone=False)
# rpn_backbone's parameters stay fixed, so the two backbones still match after training
net.same_backbone = True

In [None]:
"""第四阶段，仅更新Fast R-CNN的参数"""
# Stage 4 of the four-stage schedule: only Fast R-CNN's parameters are updated
train_frcn(
    net,
    cfg,
    stage='stage4',
    lr=cfg.frcn_lr_stage_4,
    num_epochs=cfg.frcn_num_epochs_stage_4,
    update_backbone=False,
)
# frcn_backbone's parameters stay fixed, so the two backbones still match after training
net.same_backbone = True

In [None]:
# Load the parameters saved for this version before running inference.
# NOTE(review): load_params is defined outside this view - presumably it reads
# the checkpoints written to models/ during training; confirm.
net.load_params(cfg.version)

In [31]:
import os  # used only to create the output directory below

# Run the detector over the validation split (train=False) and save one
# annotated image per sample.
voc_dataset = PascalVOC2012(False)
output_dir = f'./outputs/{cfg.version}'
# plt.imsave does not create missing directories, so make sure the target exists
os.makedirs(output_dir, exist_ok=True)
for i in range(len(voc_dataset)):
    data = voc_dataset[i]
    img = show_predictions(net,
                           data,
                           proposal_iou_threshold=0.3,
                           confidence_threshold=0.8,
                           prediction_iou_threshold=0.5,
                           scale=2.,
                           display=False)
    plt.imsave(f'{output_dir}/{i}.jpg', img)