# yolov2创新点
1. 使用预定义框（anchor box）进行训练：预定义框的尺寸由 k-means 聚类得到，模型输出的是相对于预定义框的偏移量

In [None]:
    # 生成预定义框
    def generate_anchors(self, fmp_size):
        """Build the anchor matrix for one feature map.

        Every row of the result pairs one grid-cell coordinate with one
        prior-box size, i.e. ``(x, y, w, h)`` for each cell/anchor pair.

        Args:
            fmp_size: ``(height, width)`` of the feature map, in cells.

        Returns:
            Tensor of shape ``[H * W * A, 4]`` placed on ``self.device``,
            where ``A`` is ``self.num_anchors``. Rows run row-major over the
            grid, with all anchors of one cell stored consecutively.
        """
        fmp_h, fmp_w = fmp_size

        # Grid-cell coordinates. ``indexing="ij"`` spells out the layout the
        # legacy implicit default produced (first output varies along rows),
        # so the result is unchanged while no longer depending on a default
        # that newer PyTorch releases warn about.
        anchor_y, anchor_x = torch.meshgrid(
            torch.arange(fmp_h), torch.arange(fmp_w), indexing="ij"
        )
        anchor_xy = torch.stack([anchor_x, anchor_y], dim=-1).float().view(-1, 2)
        # [HW, 2] -> [HW, A, 2] -> [M, 2]: one copy of the cell coordinate
        # per anchor.
        anchor_xy = anchor_xy.unsqueeze(1).repeat(1, self.num_anchors, 1)
        anchor_xy = anchor_xy.view(-1, 2).to(self.device)

        # [A, 2] -> [1, A, 2] -> [HW, A, 2] -> [M, 2]: the full set of prior
        # sizes is tiled once per grid cell (assumes ``self.anchor_size`` is
        # an [A, 2] tensor of (w, h) pairs).
        anchor_wh = self.anchor_size.unsqueeze(0).repeat(fmp_h * fmp_w, 1, 1)
        anchor_wh = anchor_wh.view(-1, 2).to(self.device)

        # Each row becomes (x, y, w, h).
        anchors = torch.cat([anchor_xy, anchor_wh], dim=-1)

        return anchors

    # 预测框解码，基于生成的预定义框
    def decode_boxes(self, anchors, pred_reg):
        """Decode raw regression outputs into corner-format boxes.

        Args:
            anchors: Tensor whose last dimension holds ``(x, y, w, h)``:
                the grid-cell coordinate followed by the prior-box size.
            pred_reg: Tensor ``[B, HxWxA, 4]`` or ``[HxWxA, 4]`` with the
                network outputs ``(tx, ty, tw, th)``.

        Returns:
            Tensor shaped like ``pred_reg`` whose last dimension is
            ``(x1, y1, x2, y2)``: top-left and bottom-right corners.
        """
        cell_xy, prior_wh = anchors[..., :2], anchors[..., 2:]
        t_xy, t_wh = pred_reg[..., :2], pred_reg[..., 2:]

        # Centre: sigmoid offset inside the cell plus the cell coordinate,
        # scaled by ``self.stride``.
        center = (torch.sigmoid(t_xy) + cell_xy) * self.stride
        # Size: exponential scaling of the prior size (no stride factor is
        # applied here).
        size = torch.exp(t_wh) * prior_wh

        # Centre/size -> corner representation.
        half_size = size * 0.5
        return torch.cat([center - half_size, center + half_size], dim=-1)

# Yolov3 主要创新点
1. 特征金字塔
   - 主干网络输出3个不同大小分辨率的特征图
   - 对第三个特征图进行SPP操作
   - 自顶向下融合：第三个特征图上采样后与第二个特征图拼接，融合结果再次上采样后与第一个特征图拼接

In [None]:
# 1. 主干网络输出3个不同分辨率的特征图
## DarkNet-Tiny
class DarkNetTiny(nn.Module):
    """Small DarkNet-style backbone that returns three feature maps.

    Five stages are stacked; each stage is a stride-2 3x3 ``Conv`` followed
    by a ``ResBlock``. ``forward`` returns the outputs of the last three
    stages, whose channel counts are listed in ``self.feat_dims``.
    """

    def __init__(self, act_type='silu', norm_type='BN'):
        super().__init__()
        self.feat_dims = [64, 128, 256]

        def make_stage(in_ch, out_ch, num_blocks):
            # One downsampling conv (s=2) followed by the residual blocks.
            return nn.Sequential(
                Conv(in_ch, out_ch, k=3, p=1, s=2, act_type=act_type, norm_type=norm_type),
                ResBlock(out_ch, out_ch, nblocks=num_blocks, act_type=act_type, norm_type=norm_type),
            )

        # Every stage halves the resolution through its stride-2 conv, so the
        # nominal cumulative strides are 2, 4, 8, 16 and 32 (assuming
        # ResBlock keeps the spatial size -- it is defined elsewhere).
        self.layer_1 = make_stage(3, 16, 1)
        self.layer_2 = make_stage(16, 32, 1)
        self.layer_3 = make_stage(32, 64, 3)
        self.layer_4 = make_stage(64, 128, 3)
        self.layer_5 = make_stage(128, 256, 2)

    def forward(self, x):
        """Run all five stages and return ``[c3, c4, c5]``."""
        c2 = self.layer_2(self.layer_1(x))
        c3 = self.layer_3(c2)
        c4 = self.layer_4(c3)
        c5 = self.layer_5(c4)
        return [c3, c4, c5]

In [None]:
# SPP（Spatial Pyramid Pooling，空间金字塔池化）：通过多次最大池化扩大感受野，并融合不同尺度的特征
class SPPF(nn.Module):
    """Fast spatial-pyramid-pooling block.

    A 1x1 conv reduces the channels, the result is max-pooled three times in
    sequence, and the reduced map plus the three pooled maps are concatenated
    along the channel axis and fused by a second 1x1 conv.

    The implementation follows the official YOLOv5 code:
    https://github.com/ultralytics/yolov5
    """

    def __init__(self, in_dim, out_dim, expand_ratio=0.5, pooling_size=5, act_type='lrelu', norm_type='BN'):
        super().__init__()
        hidden_dim = int(in_dim * expand_ratio)
        self.out_dim = out_dim
        # 1x1 conv that shrinks the channels before pooling.
        self.cv1 = Conv(in_dim, hidden_dim, k=1, act_type=act_type, norm_type=norm_type)
        # 1x1 conv that fuses the four concatenated maps.
        self.cv2 = Conv(hidden_dim * 4, out_dim, k=1, act_type=act_type, norm_type=norm_type)
        # Stride-1 pooling padded by half the kernel size.
        self.m = nn.MaxPool2d(
            kernel_size=pooling_size,
            stride=1,
            padding=pooling_size // 2,
        )

    def forward(self, x):
        """Return the fused map built from the reduced input and its pools."""
        levels = [self.cv1(x)]
        # Each pooling step is applied to the previous level's output.
        for _ in range(3):
            levels.append(self.m(levels[-1]))
        return self.cv2(torch.cat(levels, 1))


def build_neck(cfg, in_dim, out_dim):
    """Create the neck module selected by ``cfg['neck']``.

    Args:
        cfg: Mapping with the key ``'neck'``; for ``'sppf'`` it must also
            provide ``'expand_ratio'``, ``'pooling_size'``, ``'neck_act'``
            and ``'neck_norm'``.
        in_dim: Input channel count forwarded to the neck.
        out_dim: Output channel count forwarded to the neck.

    Returns:
        The constructed neck module.

    Raises:
        NotImplementedError: If ``cfg['neck']`` is not ``'sppf'``.
    """
    neck_name = cfg['neck']
    print('==============================')
    print('Neck: {}'.format(neck_name))

    # Only the SPPF neck is available; reject anything else up front.
    if neck_name != 'sppf':
        raise NotImplementedError('Neck {} not implemented.'.format(cfg['neck']))

    return SPPF(
        in_dim=in_dim,
        out_dim=out_dim,
        expand_ratio=cfg['expand_ratio'],
        pooling_size=cfg['pooling_size'],
        act_type=cfg['neck_act'],
        norm_type=cfg['neck_norm'],
    )

In [None]:
# Yolov3FPN
class Yolov3FPN(nn.Module):
    """Top-down feature pyramid used by the YOLOv3 model.

    The deepest backbone map is processed first; its channel-reduced,
    2x-upsampled copy is concatenated with the next shallower backbone map,
    and the same step is repeated once more. Optionally each pyramid level is
    projected to a common channel count.

    Args:
        in_dims: Channel counts ``(c3, c4, c5)`` of the three backbone maps,
            shallowest first.
        width: Multiplier applied to the internal channel counts
            (128 / 256 / 512).
        depth: Accepted for interface compatibility; not used by this class.
        out_dim: If given, every level is projected to this many channels by
            a 1x1 conv; if ``None`` the levels keep their own widths.
        act_type: Activation name forwarded to ``Conv`` / ``ConvBlocks``.
        norm_type: Normalisation name forwarded to ``Conv`` / ``ConvBlocks``.

    Attributes set here: ``in_dims`` (list copy of the argument) and
    ``out_dim`` (list with the channel count of each returned level).
    """

    def __init__(self,
                 in_dims=(256, 512, 1024),
                 width=1.0,
                 depth=1.0,
                 out_dim=None,
                 act_type='silu',
                 norm_type='BN'):
        super(Yolov3FPN, self).__init__()
        # The default is an immutable tuple and the value is copied, so
        # instances never share (or alias) one mutable list.
        self.in_dims = list(in_dims)
        c3, c4, c5 = self.in_dims

        # Internal channel counts of the three pyramid levels.
        p3_dim, p4_dim, p5_dim = int(128 * width), int(256 * width), int(512 * width)

        # P5 -> P4
        # top_down_layer_* fuse features, reduce_layer_* shrink the channels
        # before the upsampling that happens in forward().
        self.top_down_layer_1 = ConvBlocks(c5, p5_dim, act_type=act_type, norm_type=norm_type)
        self.reduce_layer_1 = Conv(p5_dim, p4_dim, k=1, act_type=act_type, norm_type=norm_type)
        # P4 -> P3
        self.top_down_layer_2 = ConvBlocks(c4 + p4_dim, p4_dim, act_type=act_type, norm_type=norm_type)
        self.reduce_layer_2 = Conv(p4_dim, p3_dim, k=1, act_type=act_type, norm_type=norm_type)
        # P3
        self.top_down_layer_3 = ConvBlocks(c3 + p3_dim, p3_dim, act_type=act_type, norm_type=norm_type)

        # Optional output projection layers, one 1x1 conv per level.
        level_dims = [p3_dim, p4_dim, p5_dim]
        if out_dim is not None:
            self.out_layers = nn.ModuleList([
                Conv(in_dim, out_dim, k=1, norm_type=norm_type, act_type=act_type)
                for in_dim in level_dims
            ])
            self.out_dim = [out_dim] * 3
        else:
            self.out_layers = None
            self.out_dim = level_dims

    def forward(self, features):
        """Fuse ``[c3, c4, c5]`` top-down and return ``[p3, p4, p5]``.

        Args:
            features: Sequence of the three backbone maps, shallowest first.

        Returns:
            List of three tensors; projected by ``self.out_layers`` when an
            ``out_dim`` was configured, otherwise the raw pyramid levels.
        """
        c3, c4, c5 = features

        # P5/32
        p5 = self.top_down_layer_1(c5)

        # P4/16: upsample the reduced P5 by 2 so it can be concatenated
        # with c4 along the channel axis.
        p5_up = F.interpolate(self.reduce_layer_1(p5), scale_factor=2.0)
        p4 = self.top_down_layer_2(torch.cat([c4, p5_up], dim=1))

        # P3/8: same step one level further down.
        p4_up = F.interpolate(self.reduce_layer_2(p4), scale_factor=2.0)
        p3 = self.top_down_layer_3(torch.cat([c3, p4_up], dim=1))

        out_feats = [p3, p4, p5]

        # Optional output projection layers.
        if self.out_layers is not None:
            return [layer(feat) for feat, layer in zip(out_feats, self.out_layers)]

        return out_feats