# DarkNet-53 主干网络实现

实现了一个简洁版的 DarkNet-53 主干网络。通过5个阶段的下采样和残差块堆叠，有效提取不同尺度的图像特征，增强网络的表达能力。残差连接设计缓解了梯度消失问题，提高了深层网络的训练稳定性。输出的多尺度特征图可以用于后续的目标检测任务，如 YOLO 系列模型。

In [2]:
import math
from collections import OrderedDict
import torch.nn as nn
import torch

# Residual structure
# Use a 1x1 convolution to reduce the number of channels, then use a 3x3 convolution to extract features and increase the number of channels,
# and finally add a residual connection.
class BasicBlock(nn.Module):
    def __init__(self, inplanes, planes): # inplanes输入特征图的通道数。
        super(BasicBlock, self).__init__()
        self.conv1  = nn.Conv2d(inplanes, planes[0], kernel_size=1, stride=1, padding=0, bias=False)
        self.bn1    = nn.BatchNorm2d(planes[0])
        self.relu1  = nn.LeakyReLU(0.1)
        
        self.conv2  = nn.Conv2d(planes[0], planes[1], kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2    = nn.BatchNorm2d(planes[1])
        self.relu2  = nn.LeakyReLU(0.1)

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu1(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu2(out)

        out += residual
        return out

class DarkNet(nn.Module):
    """DarkNet backbone: a stem convolution followed by five downsampling stages.

    Every stage halves the spatial resolution with a stride-2 3x3 convolution
    and then stacks residual blocks. For a 416x416 input the feature maps are:

        stem   : 416,416,3   -> 416,416,32
        layer1 : 416,416,32  -> 208,208,64
        layer2 : 208,208,64  -> 104,104,128
        layer3 : 104,104,128 -> 52,52,256
        layer4 : 52,52,256   -> 26,26,512
        layer5 : 26,26,512   -> 13,13,1024

    Args:
        layers: Sequence of five integers, the number of residual blocks in
            each of the five stages.
    """

    def __init__(self, layers):
        super().__init__()
        # Channel count entering the next stage; updated by _make_layer.
        self.inplanes = 32

        # Stem: 3x3 conv, stride 1, padding 1 keeps the spatial size unchanged.
        self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(self.inplanes)
        self.relu1 = nn.LeakyReLU(0.1)

        # (bottleneck channels, output channels) for layer1 .. layer5.
        stage_channels = [(32, 64), (64, 128), (128, 256), (256, 512), (512, 1024)]
        for idx, channels in enumerate(stage_channels):
            stage = self._make_layer(list(channels), layers[idx])
            setattr(self, "layer{}".format(idx + 1), stage)

        # Output channel count of each of the five stages.
        self.layers_out_filters = [out_ch for _, out_ch in stage_channels]

        # Weight initialization: zero-mean normal for convolutions with
        # std = sqrt(2 / (kh * kw * out_channels)); unit scale and zero shift
        # for batch normalization.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                kernel_h, kernel_w = module.kernel_size
                fan = kernel_h * kernel_w * module.out_channels
                module.weight.data.normal_(0, math.sqrt(2. / fan))
            elif isinstance(module, nn.BatchNorm2d):
                module.weight.data.fill_(1)
                module.bias.data.zero_()

    def _make_layer(self, planes, blocks):
        """Build one stage: stride-2 downsampling conv followed by residual blocks.

        Args:
            planes: ``[mid, out]`` channel pair; ``out`` is the stage's output
                channel count and ``planes`` is forwarded to each BasicBlock.
            blocks: Number of residual blocks to stack after downsampling.

        Returns:
            An ``nn.Sequential`` with named children ``ds_conv``, ``ds_bn``,
            ``ds_relu`` and ``residual_0`` .. ``residual_{blocks-1}``.
        """
        out_channels = planes[1]
        stage = OrderedDict()

        # Downsampling: 3x3 kernel with stride 2.
        stage["ds_conv"] = nn.Conv2d(self.inplanes, out_channels, kernel_size=3, stride=2, padding=1, bias=False)
        stage["ds_bn"] = nn.BatchNorm2d(out_channels)
        stage["ds_relu"] = nn.LeakyReLU(0.1)

        # The residual blocks (and the next stage) now receive out_channels.
        self.inplanes = out_channels
        for i in range(blocks):
            stage["residual_{}".format(i)] = BasicBlock(self.inplanes, planes)
        return nn.Sequential(stage)

    def forward(self, x):
        """Run the backbone and return the outputs of layer3, layer4 and layer5."""
        x = self.relu1(self.bn1(self.conv1(x)))
        x = self.layer2(self.layer1(x))

        out3 = self.layer3(x)
        out4 = self.layer4(out3)
        out5 = self.layer5(out4)
        return out3, out4, out5

def darknet53():
    """Build a DarkNet with 1, 2, 8, 8 and 4 residual blocks in its five stages."""
    return DarkNet([1, 2, 8, 8, 4])

# YOLOv3 检测头完整实现：结合 Darknet53 主干网络与多尺度特征融合

基于 Darknet-53 主干网络实现了 YOLOv3 的完整检测头模块，包括3个尺度的特征提取和预测输出。通过卷积模块、上采样和特征融合策略，有效提升了检测不同尺寸目标的能力。输出的三组特征图分别对应大、中、小物体检测，支撑后续边界框回归和分类任务。

In [3]:
def conv2d(filter_in, filter_out, kernel_size):
    """Return a conv -> batch-norm -> LeakyReLU(0.1) block with stride 1.

    Args:
        filter_in: Number of input channels.
        filter_out: Number of output channels.
        kernel_size: Square kernel size. Padding is ``(kernel_size - 1) // 2``
            so that odd kernels leave the spatial size unchanged.

    Returns:
        An ``nn.Sequential`` whose children are named ``conv``, ``bn`` and ``relu``.
    """
    if kernel_size:
        pad = (kernel_size - 1) // 2
    else:
        pad = 0

    conv = nn.Conv2d(filter_in, filter_out, kernel_size=kernel_size, stride=1, padding=pad, bias=False)
    norm = nn.BatchNorm2d(filter_out)
    activation = nn.LeakyReLU(0.1)
    return nn.Sequential(OrderedDict((("conv", conv), ("bn", norm), ("relu", activation))))

# In make_last_layers, there are a total of seven convolutions.
# The first five are used for feature extraction, and the last two are used to obtain the YOLO network's prediction results.
def make_last_layers(filters_list, in_filters, out_filter):
    """Build a seven-layer YOLO head as an ``nn.Sequential``.

    Layers 0-4 alternate 1x1 and 3x3 ``conv2d`` blocks and act as the feature
    extractor; layers 5-6 (a 3x3 ``conv2d`` block and a plain biased 1x1
    convolution) produce the raw prediction tensor.

    Args:
        filters_list: ``[narrow, wide]`` channel counts used by the 1x1 and
            3x3 blocks respectively, e.g. ``[512, 1024]``.
        in_filters: Channel count of the incoming feature map.
        out_filter: Channel count of the prediction tensor.

    Returns:
        The assembled ``nn.Sequential`` (indexable, so callers can split it).
    """
    narrow, wide = filters_list[0], filters_list[1]

    # Entry pair: in_filters -> narrow -> wide.
    stack = [conv2d(in_filters, narrow, 1), conv2d(narrow, wide, 3)]
    # Two more (1x1 squeeze, 3x3 expand) pairs.
    for _ in range(2):
        stack.append(conv2d(wide, narrow, 1))
        stack.append(conv2d(narrow, wide, 3))
    # Final 1x1 convolution emits the raw predictions (no BN / activation).
    stack.append(nn.Conv2d(wide, out_filter, kernel_size=1, stride=1, padding=0, bias=True))
    return nn.Sequential(*stack)

class YoloBody(nn.Module):
    """YOLOv3 network: DarkNet-53 backbone plus three detection heads.

    The backbone yields three feature maps (for a 416x416 input:
    52x52x256, 26x26x512 and 13x13x1024). The coarsest map feeds the first
    head; its intermediate features are reduced, upsampled 2x and concatenated
    with the next finer backbone map to feed the following head.

    Args:
        anchors_mask: One list of anchor indices per head; only the length of
            each list is used here, to size the head output.
        num_classes: Number of object classes.
        pretrained: If True, load backbone weights from
            ``model_data/darknet53_backbone_weights.pth``.
    """

    def __init__(self, anchors_mask, num_classes, pretrained=False):
        super(YoloBody, self).__init__()
        self.backbone = darknet53()
        if pretrained:
            # map_location="cpu" lets a checkpoint saved from CUDA tensors load on
            # CPU-only hosts; load_state_dict copies values into the existing
            # parameters, so the model's own device placement is unaffected.
            state_dict = torch.load("model_data/darknet53_backbone_weights.pth", map_location="cpu")
            self.backbone.load_state_dict(state_dict)

        # out_filters : [64, 128, 256, 512, 1024] (output channels of each backbone stage)
        out_filters = self.backbone.layers_out_filters

        # Every head outputs len(anchors_mask[k]) * (num_classes + 5) channels,
        # e.g. 3 * (20 + 5) = 75 for a 20-class dataset with 3 anchors per head.

        # Head 0 (coarsest map): input channels out_filters[-1] = 1024.
        self.last_layer0 = make_last_layers([512, 1024], out_filters[-1], len(anchors_mask[0]) * (num_classes + 5))

        # Head 1 (middle map): 512-channel branch from head 0 is reduced to 256,
        # upsampled 2x and concatenated with the 512-channel backbone map.
        self.last_layer1_conv = conv2d(512, 256, 1)
        self.last_layer1_upsample = nn.Upsample(scale_factor=2, mode='nearest')
        self.last_layer1 = make_last_layers([256, 512], out_filters[-2] + 256, len(anchors_mask[1]) * (num_classes + 5))

        # Head 2 (finest map): 256-channel branch from head 1 is reduced to 128,
        # upsampled 2x and concatenated with the 256-channel backbone map.
        self.last_layer2_conv = conv2d(256, 128, 1)
        self.last_layer2_upsample = nn.Upsample(scale_factor=2, mode='nearest')
        self.last_layer2 = make_last_layers([128, 256], out_filters[-3] + 128, len(anchors_mask[2]) * (num_classes + 5))

    def forward(self, x):
        """Return the raw predictions ``(out0, out1, out2)`` from coarse to fine.

        Shape comments below assume a 416x416 input.
        """
        # Backbone features: x2 = 52,52,256 ; x1 = 26,26,512 ; x0 = 13,13,1024
        x2, x1, x0 = self.backbone(x)

        #---------------------------------------------------#
        #   Head 0 works on the 13x13x1024 feature layer.
        #   out0 = (batch_size, anchors * (num_classes + 5), 13, 13)
        #---------------------------------------------------#
        # 13,13,1024 -> 13,13,512 -> 13,13,1024 -> 13,13,512 -> 13,13,1024 -> 13,13,512
        out0_branch = self.last_layer0[:5](x0)
        out0 = self.last_layer0[5:](out0_branch)

        # 13,13,512 -> 13,13,256 -> 26,26,256
        x1_in = self.last_layer1_conv(out0_branch)
        x1_in = self.last_layer1_upsample(x1_in)

        # 26,26,256 + 26,26,512 -> 26,26,768
        x1_in = torch.cat([x1_in, x1], 1)
        #---------------------------------------------------#
        #   Head 1 works on the fused 26x26x768 feature layer.
        #   out1 = (batch_size, anchors * (num_classes + 5), 26, 26)
        #---------------------------------------------------#
        # 26,26,768 -> 26,26,256 -> 26,26,512 -> 26,26,256 -> 26,26,512 -> 26,26,256
        out1_branch = self.last_layer1[:5](x1_in)
        out1 = self.last_layer1[5:](out1_branch)

        # 26,26,256 -> 26,26,128 -> 52,52,128
        x2_in = self.last_layer2_conv(out1_branch)
        x2_in = self.last_layer2_upsample(x2_in)

        # 52,52,128 + 52,52,256 -> 52,52,384
        x2_in = torch.cat([x2_in, x2], 1)
        #---------------------------------------------------#
        #   Head 2 works on the fused 52x52x384 feature layer.
        #   out2 = (batch_size, anchors * (num_classes + 5), 52, 52)
        #---------------------------------------------------#
        # 52,52,384 -> 52,52,128 -> 52,52,256 -> 52,52,128 -> 52,52,256 -> 52,52,128 -> prediction
        out2 = self.last_layer2(x2_in)
        return out0, out1, out2

<center>
  <img src="111.png" width="200"/>
</center>

# 对 YOLO 模型输出的特征图进行解码和后处理，得到最终用于评估的目标框坐标和类别预测结果。

实现了 YOLO 检测模型的输出特征图解码与后处理模块，包括如下功能：
- 多尺度特征图解码：将模型输出的特征图解码为边界框中心、宽高、物体置信度和类别概率，适配三种不同尺寸的特征图（如13×13、26×26、52×52）。
- 先验框调整与归一化：通过网格偏移与指数变换恢复边界框真实尺寸，并将其归一化至输入尺寸比例。
- 坐标反变换：将归一化坐标转换为原图尺寸的实际像素坐标，同时支持使用和不使用 LetterBox（等比例缩放后填充边框）两种预处理方式。
- 非极大值抑制（NMS）：基于 PyTorch 官方的 NMS 算法，去除重复冗余框，保留置信度高、重叠小的框。
- 最终输出格式标准化：返回每张图片的检测结果，每行依次为边界框左上角与右下角坐标（按 y_min、x_min、y_max、x_max 的顺序，单位为原图像素）、物体置信度、类别置信度和类别编号；没有检测结果的图片对应 None。

In [6]:
from torchvision.ops import nms
import numpy as np

class DecodeBox:
    """Decode raw YOLO head outputs into boxes and post-process them with NMS.

    Args:
        anchors: Anchor sizes as ``(width, height)`` pairs in input-image pixels;
            any array-like accepted by ``np.asarray`` (ndarray or nested list).
        num_classes: Number of object classes.
        input_shape: ``(height, width)`` of the network input.
        anchors_mask: For each feature map (in the order they are passed to
            ``decode_box``), the indices into ``anchors`` used by that map.
            Defaults to ``[[6, 7, 8], [3, 4, 5], [0, 1, 2]]``.
    """

    def __init__(self, anchors, num_classes, input_shape, anchors_mask=None):
        # Build the default per instance so instances never share one mutable list.
        if anchors_mask is None:
            anchors_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
        self.anchors = anchors
        self.num_classes = num_classes
        # Per-box attributes: x, y, w, h, objectness + one score per class.
        self.bbox_attrs = 5 + num_classes
        self.input_shape = input_shape
        self.anchors_mask = anchors_mask

    def decode_box(self, inputs):
        """Decode each head output into normalized boxes, objectness and class scores.

        Args:
            inputs: Sequence of tensors shaped
                ``(batch, num_anchors * (5 + num_classes), height, width)``,
                one per feature map, ordered like ``anchors_mask``.

        Returns:
            A list with one detached tensor per input, shaped
            ``(batch, num_anchors * height * width, 5 + num_classes)``. Each row
            is ``[cx, cy, w, h, objectness, class scores...]`` with the box
            expressed as a fraction of the network input size.
        """
        # Accept plain nested lists as well as ndarrays for mask-based indexing.
        anchors = np.asarray(self.anchors)
        outputs = []
        for i, feature in enumerate(inputs):
            batch_size = feature.size(0)
            input_height = feature.size(2)
            input_width = feature.size(3)
            mask = self.anchors_mask[i]
            num_anchors = len(mask)
            # Create helper tensors on the same device as the feature map.
            device = feature.device

            # Pixels of the network input covered by one feature-map cell,
            # e.g. 416 / 13 = 32.
            stride_h = self.input_shape[0] / input_height
            stride_w = self.input_shape[1] / input_width

            # Anchor sizes expressed in feature-map cells.
            scaled_anchors = torch.tensor(
                [(float(anchor_w) / stride_w, float(anchor_h) / stride_h)
                 for anchor_w, anchor_h in anchors[mask]],
                dtype=torch.float32, device=device)

            # (batch, anchors * attrs, H, W) -> (batch, anchors, H, W, attrs)
            prediction = feature.view(batch_size, num_anchors, self.bbox_attrs,
                                      input_height, input_width).permute(0, 1, 3, 4, 2).contiguous()

            # Centre offsets inside the cell, squashed to (0, 1).
            x = torch.sigmoid(prediction[..., 0])
            y = torch.sigmoid(prediction[..., 1])
            # Log-space width / height adjustments relative to the anchor.
            w = prediction[..., 2]
            h = prediction[..., 3]
            # Objectness and per-class scores.
            conf = torch.sigmoid(prediction[..., 4])
            pred_cls = torch.sigmoid(prediction[..., 5:])

            # Cell indices and anchor sizes, shaped to broadcast against
            # (batch, anchors, H, W).
            grid_x = torch.arange(input_width, dtype=torch.float32, device=device).view(1, 1, 1, input_width)
            grid_y = torch.arange(input_height, dtype=torch.float32, device=device).view(1, 1, input_height, 1)
            anchor_w = scaled_anchors[:, 0].view(1, num_anchors, 1, 1)
            anchor_h = scaled_anchors[:, 1].view(1, num_anchors, 1, 1)

            # Centre = cell index + offset; size = anchor * exp(adjustment).
            pred_boxes = torch.stack((x + grid_x,
                                      y + grid_y,
                                      torch.exp(w) * anchor_w,
                                      torch.exp(h) * anchor_h), dim=-1)

            # Divide by the feature-map size so boxes become fractions of the input.
            _scale = torch.tensor([input_width, input_height, input_width, input_height],
                                  dtype=torch.float32, device=device)
            output = torch.cat((pred_boxes.reshape(batch_size, -1, 4) / _scale,
                                conf.reshape(batch_size, -1, 1),
                                pred_cls.reshape(batch_size, -1, self.num_classes)), -1)
            outputs.append(output.detach())
        return outputs

    def yolo_correct_boxes(self, box_xy, box_wh, input_shape, image_shape, letterbox_image):
        """Map normalized centre/size boxes to corner boxes in original-image pixels.

        Args:
            box_xy: ndarray ``(..., 2)`` of normalized box centres ``(x, y)``.
            box_wh: ndarray ``(..., 2)`` of normalized box sizes ``(w, h)``.
            input_shape: ``(height, width)`` of the network input.
            image_shape: ``(height, width)`` of the original image.
            letterbox_image: True if the image was resized with preserved aspect
                ratio and padded; the padding is then removed from the boxes.

        Returns:
            ndarray ``(..., 4)`` with rows ``[y_min, x_min, y_max, x_max]`` in
            original-image pixels. Note the y-first order. The input arrays are
            left unmodified.
        """
        # y first, so the values line up with the (height, width) shape arrays.
        box_yx = box_xy[..., ::-1]
        box_hw = box_wh[..., ::-1]
        input_shape = np.array(input_shape)
        image_shape = np.array(image_shape)

        if letterbox_image:
            # new_shape: size of the image after aspect-preserving scaling into input_shape.
            # offset: padding on one side, as a fraction of the input size.
            # scale: factor that stretches the valid area back to the full [0, 1] range.
            new_shape = np.round(image_shape * np.min(input_shape / image_shape))
            offset = (input_shape - new_shape) / 2. / input_shape
            scale = input_shape / new_shape

            box_yx = (box_yx - offset) * scale
            # Not in place: box_hw is a view of the caller's box_wh.
            box_hw = box_hw * scale

        # Centre/size -> top-left and bottom-right corners.
        box_mins = box_yx - (box_hw / 2.)
        box_maxes = box_yx + (box_hw / 2.)
        boxes = np.concatenate([box_mins[..., 0:1], box_mins[..., 1:2],
                                box_maxes[..., 0:1], box_maxes[..., 1:2]], axis=-1)
        # Fractions -> pixels of the original image.
        boxes *= np.concatenate([image_shape, image_shape], axis=-1)
        return boxes

    def non_max_suppression(self, prediction, num_classes, input_shape, image_shape, letterbox_image, conf_thres=0.5, nms_thres=0.4):
        """Filter decoded predictions by score and apply per-class NMS.

        Args:
            prediction: Tensor ``(batch, num_boxes, 5 + num_classes)`` with rows
                ``[cx, cy, w, h, objectness, class scores...]`` in normalized
                coordinates. It is not modified.
            num_classes: Number of class-score columns.
            input_shape: ``(height, width)`` of the network input.
            image_shape: ``(height, width)`` of the original image.
            letterbox_image: Whether letterbox resizing was used.
            conf_thres: Minimum ``objectness * best class score`` to keep a box.
            nms_thres: IoU threshold passed to ``nms``.

        Returns:
            A list of length ``batch``. Each entry is None when no box passes
            the threshold, otherwise a NumPy array ``(num_detections, 7)`` with
            rows ``[y_min, x_min, y_max, x_max, objectness, class score,
            class id]``, box coordinates in original-image pixels.
        """
        # Convert centre/size to corners on a copy so the caller's tensor is untouched.
        prediction = prediction.clone()
        centers = prediction[:, :, 0:2].clone()
        half_wh = prediction[:, :, 2:4] / 2
        prediction[:, :, 0:2] = centers - half_wh
        prediction[:, :, 2:4] = centers + half_wh

        output = [None for _ in range(len(prediction))]
        for i, image_pred in enumerate(prediction):
            # Best class score and its index for every box: both (num_boxes, 1).
            class_conf, class_pred = torch.max(image_pred[:, 5:5 + num_classes], 1, keepdim=True)

            # First filter: overall score threshold. The mask is already 1-D;
            # squeezing it would collapse to 0-d when there is a single box.
            conf_mask = image_pred[:, 4] * class_conf[:, 0] >= conf_thres
            image_pred = image_pred[conf_mask]
            class_conf = class_conf[conf_mask]
            class_pred = class_pred[conf_mask]
            if not image_pred.size(0):
                continue

            # detections: (num_kept, 7) =
            # x1, y1, x2, y2, objectness, class score, class id
            detections = torch.cat((image_pred[:, :5], class_conf.float(), class_pred.float()), 1)

            # NMS is applied independently for every class that occurs.
            kept = []
            for c in detections[:, -1].unique():
                detections_class = detections[detections[:, -1] == c]
                keep = nms(
                    detections_class[:, :4],
                    detections_class[:, 4] * detections_class[:, 5],
                    nms_thres
                )
                kept.append(detections_class[keep])

            result = torch.cat(kept).detach().cpu().numpy()

            # Corners -> centre/size, as expected by yolo_correct_boxes.
            box_xy = (result[:, 0:2] + result[:, 2:4]) / 2
            box_wh = result[:, 2:4] - result[:, 0:2]
            result[:, :4] = self.yolo_correct_boxes(box_xy, box_wh, input_shape, image_shape, letterbox_image)
            output[i] = result

        return output