In [1]:
import os
import torch
os.getcwd()

# Compute device shared by the rest of the notebook: the first CUDA GPU when
# one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
# Data preprocessing: dataset locations and annotation file paths.
project_root = "/kaggle/working/conveyer_belt_detector/"
dataset_root = "/kaggle/input/conveyer-belt-detect/dataset/"

# Image directories, one train/val pair per category.
class1_train_pic_root = os.path.join(dataset_root, "class1/train")
class1_val_pic_root = os.path.join(dataset_root, "class1/val")
class2_train_pic_root = os.path.join(dataset_root, "class2/train")
class2_val_pic_root = os.path.join(dataset_root, "class2/val")
class3_train_pic_root = os.path.join(dataset_root, "class3/train")
class3_val_pic_root = os.path.join(dataset_root, "class3/val")
normal_train_pic_root = os.path.join(dataset_root, "normal/train")
normal_val_pic_root = os.path.join(dataset_root, "normal/val")

# Annotation files for task 1
task1_train_class1_orig_anno = os.path.join(dataset_root, "class1/train_infos.csv")
task1_train_class2_orig_anno = os.path.join(dataset_root, "class2/train_infos.csv")
task1_train_class3_orig_anno = os.path.join(dataset_root, "class3/train_infos.csv")
task1_train_normal_orig_anno = os.path.join(dataset_root, "normal/train_infos.csv")

task1_val_class1_orig_anno = os.path.join(dataset_root, "class1/val_infos.csv")
task1_val_class2_orig_anno = os.path.join(dataset_root, "class2/val_infos.csv")
task1_val_class3_orig_anno = os.path.join(dataset_root, "class3/val_infos.csv")
task1_val_normal_orig_anno = os.path.join(dataset_root, "normal/val_infos.csv")

# Annotation files for task 2
# The class3 files live under project_root/tmp rather than dataset_root; they are
# the destination paths passed to generate_task2_class3_anno further down.
task2_train_class1_orig_anno = os.path.join(dataset_root, "class1/class1_train.txt")
task2_train_class2_orig_anno = os.path.join(dataset_root, "class2/class2_train.txt")
task2_train_class3_orig_anno = os.path.join(project_root, "tmp/class3/class3_train.txt")
task2_train_normal_orig_anno = os.path.join(dataset_root, "normal/normal_train.txt")

task2_val_class1_orig_anno = os.path.join(dataset_root, "class1/class1_val.txt")
# NOTE(review): "clsss2_val.txt" looks like a typo for "class2_val.txt" — confirm
# against the actual file name in the dataset before changing it.
task2_val_class2_orig_anno = os.path.join(dataset_root, "class2/clsss2_val.txt")
task2_val_class3_orig_anno = os.path.join(project_root, "tmp/class3/class3_val.txt")
task2_val_normal_orig_anno = os.path.join(dataset_root, "normal/normal_val.txt")

# 186 2126 2507

# Merged (aggregated) annotation files
task1_train_detect_anno = "/kaggle/input/anno-data-new/train_info_all.csv"
task1_val_detect_anno = os.path.join(project_root, "data/task1/val/val_infos.csv")
task2_train_classes_anno = "/kaggle/input/anno-data-new/classes_all.txt"
task2_val_classes_anno = os.path.join(project_root, "data/task2/val/classes.txt")

# Path of the pretrained checkpoint file.
pretrain_path="/kaggle/input/nvidia-ssdpyt-fp32-190826pt/nvidia_ssdpyt_fp32_190826.pt"

def make_dir(path):
    """
    Ensure that the parent directory of ``path`` exists.

    Only ``os.path.dirname(path)`` is created, so ``path`` is expected to be a
    file path (or a directory path ending with a separator).

    Args:
        path: file path whose containing directory should exist.

    Returns:
        None. Prints a message when a directory is actually created.
    """
    dir_name = os.path.dirname(path)
    if not dir_name:
        # A bare file name has no directory component; os.makedirs('') would raise.
        return
    if not os.path.exists(dir_name):
        print("创建目录", dir_name)
        # exist_ok guards against the directory appearing between the check and the call.
        os.makedirs(dir_name, exist_ok=True)

# Make sure the parent directory of every output location exists before anything is written.
# NOTE(review): two of these paths are under /kaggle/input/ — presumably an existing,
# read-only mount, which would make those calls no-ops; confirm.
make_dir(path=project_root)
make_dir(path=task1_train_detect_anno)
make_dir(path=task1_val_detect_anno)
make_dir(path=task2_train_classes_anno)
make_dir(path=task2_val_classes_anno)

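# Preprocess the task 1 data:
# 1. Rewrite the `filename` field of train_infos.csv as an absolute path;
# 2. Convert the xmin/xmax/ymin/ymax coordinates into ratios relative to the image size;
# 3. Merge the train_infos.csv files of class1 and class2 into one combined train_infos.csv.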
import csv
from collections import defaultdict

def predeal_task1_anno_file(dest_file, orig_files, pic_roots):
    """
    Merge several task-1 annotation CSV files into a single normalized CSV.

    For every data row of every source file:
      * ``index``    is shifted so that indices stay unique across all files;
      * ``filename`` is prefixed with the matching entry of ``pic_roots``;
      * ``xmin``/``xmax`` are divided by column 8 and ``ymin``/``ymax`` by column 7;
      * every other column is copied unchanged.

    The header of the first non-empty source file is written once and is also
    used to interpret the rows of all following files.

    Args:
        dest_file: path of the merged CSV to write (its directory is created if missing).
        orig_files: iterable of source CSV paths.
        pic_roots: iterable of image root directories, paired with ``orig_files``.

    Returns:
        None. Prints a message once the file has been written.
    """
    dest_file_dir = os.path.dirname(dest_file)
    if dest_file_dir and not os.path.exists(dest_file_dir):
        print(f"创建目录{dest_file_dir}")
        os.makedirs(dest_file_dir, exist_ok=True)

    # Column positions used as divisors for the coordinate normalization.
    # NOTE(review): presumably column 7 is the image height and column 8 the width — confirm
    # against the CSV header.
    height_col, width_col = 7, 8

    with open(file=dest_file, mode="w", encoding="utf8", newline='') as dest_csv_file:
        header = None
        csv_writer = None
        # Offset added to the per-file index; accumulated over ALL files already read
        # so that a third (or later) file cannot reuse indices of earlier files.
        index_offset = 0
        for orig_file, pic_root in zip(orig_files, pic_roots):
            with open(file=orig_file, mode="r", encoding="utf8") as tmp_csv_file:
                # Read the header line; a completely empty file is skipped.
                raw_line = tmp_csv_file.readline()
                if len(raw_line) <= 0:
                    continue

                if header is None:
                    header = raw_line.strip().split(",")
                    csv_writer = csv.DictWriter(f=dest_csv_file, fieldnames=header)
                    csv_writer.writeheader()

                max_index_in_file = -1
                for raw_line in tmp_csv_file:
                    raw_line = raw_line.strip()
                    if not raw_line:
                        # Tolerate blank lines instead of failing on int('')/float('').
                        continue
                    line_content = raw_line.split(",")

                    new_row = {}
                    for i, column in enumerate(header):
                        if column == "index":
                            cur_index = int(line_content[i])
                            max_index_in_file = max(max_index_in_file, cur_index)
                            new_row[column] = index_offset + cur_index
                        elif column == "filename":
                            new_row[column] = os.path.join(pic_root, line_content[i])
                        elif column == "xmin" or column == "xmax":
                            new_row[column] = float(line_content[i]) / float(line_content[width_col])
                        elif column == "ymin" or column == "ymax":
                            new_row[column] = float(line_content[i]) / float(line_content[height_col])
                        else:
                            new_row[column] = line_content[i]

                    csv_writer.writerow(new_row)

                # The next file starts right after the largest index written so far.
                index_offset += max_index_in_file + 1
        print("导出 " + dest_file + " 成功")

# Source annotation files and image roots to merge for task 1.
# NOTE(review): the original comment mentioned class1, class2, class3 and normal,
# but only class1 and class2 are listed below.
train_anno_files = [task1_train_class1_orig_anno, task1_train_class2_orig_anno]
train_pic_roots = [class1_train_pic_root, class2_train_pic_root] 

val_anno_files = [task1_val_class1_orig_anno, task1_val_class2_orig_anno]
val_pic_roots = [class1_val_pic_root, class2_val_pic_root] 

# The merge calls below are currently disabled (commented out).
# predeal_task1_anno_file(dest_file=task1_train_detect_anno, orig_files=train_anno_files, pic_roots=train_pic_roots)
# predeal_task1_anno_file(dest_file=task1_val_detect_anno, orig_files=val_anno_files, pic_roots=val_pic_roots)

In [3]:
# Preprocess the task 2 data
def generate_task2_class3_anno(dest_files, orig_dirs, append_dirs):
    """
    Generate headerless ``path,class`` annotation files for the class-3 images.

    For each (dest_file, orig_dir, append_dir) triple, every entry found in
    ``orig_dir`` produces one row ``<append_dir>/<entry>,3`` in ``dest_file``.

    Args:
        dest_files: annotation files to write (parent directories are created).
        orig_dirs: directories whose entries are listed.
        append_dirs: path prefixes written in front of each entry name.

    Returns:
        None. Prints a message for each file written.
    """
    header = ["path", "class"]
    for dest_file, orig_dir, append_dir in zip(dest_files, orig_dirs, append_dirs):
        make_dir(dest_file)
        with open(file=dest_file, mode="w", encoding="utf8", newline="") as dest_file_handle:
            # No header row is written; only the data rows.
            csv_writer = csv.DictWriter(f=dest_file_handle, fieldnames=header)
            # os.listdir returns entries in arbitrary order; sort them so the
            # generated file is identical from run to run.
            for file in sorted(os.listdir(orig_dir)):
                csv_writer.writerow({"path": os.path.join(append_dir, file), "class": 3})
        print(f"导出{dest_file}成功")
                

def predeal_task2_anno_file(dest_file, orig_files, dataset_root):
    """
    Merge several task-2 ``path,class`` annotation files into one file.

    Each input line is ``<relative path>,<class id>``. A leading ``./`` (or ``.\\``)
    on the path is removed and the remainder is joined onto ``dataset_root``;
    the class id is written as an integer. No header row is written.

    Args:
        dest_file: merged annotation file to write (its directory is created if missing).
        orig_files: iterable of source annotation file paths.
        dataset_root: directory the relative paths are resolved against.

    Returns:
        None. Prints a message once the file has been written.
    """
    dest_file_dir = os.path.dirname(dest_file)
    if dest_file_dir and not os.path.exists(dest_file_dir):
        print(f"创建目录{dest_file_dir}")
        os.makedirs(dest_file_dir, exist_ok=True)

    with open(file=dest_file, mode="w", encoding="utf8", newline='') as dest_csv_file:
        header = ["path", "class"]
        csv_writer = csv.DictWriter(f=dest_csv_file, fieldnames=header)
        for orig_file in orig_files:
            with open(file=orig_file, mode="r", encoding="utf8") as tmp_csv_file:
                for raw_line in tmp_csv_file:
                    raw_line = raw_line.strip()
                    if not raw_line:
                        # A blank (e.g. trailing) line has no class column; skip it.
                        continue
                    line_content = raw_line.split(",")

                    rel_path = line_content[0]
                    # Only drop the "./" prefix when it is really there, so paths
                    # without it do not lose their first two characters.
                    if rel_path.startswith(("./", ".\\")):
                        rel_path = rel_path[2:]

                    csv_writer.writerow({
                        "path": os.path.join(dataset_root, rel_path),
                        "class": int(line_content[1]),
                    })
        print("导出 " + dest_file + " 成功")

# Inputs for generate_task2_class3_anno: output files, image directories to list,
# and the relative prefixes written in front of each file name.
dest_files = [task2_train_class3_orig_anno, task2_val_class3_orig_anno]
orig_dirs = [class3_train_pic_root, class3_val_pic_root]
append_dirs = ["./class3/train", "./class3/val"]
# Generate the task 2 annotation files for class 3 (currently disabled)
# generate_task2_class3_anno(dest_files=dest_files, orig_dirs=orig_dirs, append_dirs=append_dirs)
        
# Merge the class1, class2, class3 and normal annotation files (currently disabled)
# train_orig_files = [task2_train_class1_orig_anno, task2_train_class2_orig_anno, task2_train_class3_orig_anno, task2_train_normal_orig_anno] 
# val_orig_files = [task2_val_class1_orig_anno, task2_val_class2_orig_anno, task2_val_class3_orig_anno, task2_val_normal_orig_anno] 

# predeal_task2_anno_file(dest_file=task2_train_classes_anno, orig_files=train_orig_files, dataset_root=dataset_root)
# predeal_task2_anno_file(dest_file=task2_val_classes_anno, orig_files=val_orig_files, dataset_root=dataset_root)


In [4]:
!pip install pycocotools

Collecting pycocotools
  Downloading pycocotools-2.0.6.tar.gz (24 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: pycocotools
  Building wheel for pycocotools (pyproject.toml) ... [?25ldone
[?25h  Created wheel for pycocotools: filename=pycocotools-2.0.6-cp310-cp310-linux_x86_64.whl size=110796 sha256=019078fda887fd000777b96f70d3e30bc40739e131522b08976c111849002223
  Stored in directory: /root/.cache/pip/wheels/58/e6/f9/f87c8f8be098b51b616871315318329cae12cdb618f4caac93
Successfully built pycocotools
Installing collected packages: pycocotools
Successfully installed pycocotools-2.0.6


In [5]:
from torchvision.transforms import ToPILImage

from PIL import Image
from PIL.Image import fromarray
import PIL.ImageDraw as ImageDraw
import PIL.ImageFont as ImageFont
from PIL import ImageColor

import datetime
import matplotlib.pyplot as plt

import numpy as np

# Palette of colour names used when drawing detections. draw_objs selects
# STANDARD_COLORS[cls % len(STANDARD_COLORS)] and resolves the name with
# ImageColor.getrgb, so the ORDER of this list decides which colour a class gets.
# Note that 'Red' appears twice (once in the middle, once as the last entry).
STANDARD_COLORS = [
    'White', 'Cyan', 'Yellow', 'AliceBlue', 'Chartreuse', 'Aqua', 'Aquamarine', 'Azure',
    'BlanchedAlmond', 'BlueViolet', 'BurlyWood', 'CadetBlue', 'AntiqueWhite',
    'Chocolate', 'Coral', 'CornflowerBlue', 'Cornsilk', 'Crimson',
    'DarkCyan', 'DarkGoldenRod', 'DarkGrey', 'DarkKhaki', 'DarkOrange',
    'DarkOrchid', 'DarkSalmon', 'DarkSeaGreen', 'DarkTurquoise', 'DarkViolet',
    'DeepPink', 'DeepSkyBlue', 'DodgerBlue', 'FireBrick', 'FloralWhite',
    'ForestGreen', 'Fuchsia', 'Gainsboro', 'GhostWhite', 'Gold', 'GoldenRod',
    'Salmon', 'Tan', 'HoneyDew', 'HotPink', 'IndianRed', 'Ivory', 'Khaki',
    'Lavender', 'LavenderBlush', 'LawnGreen', 'LemonChiffon', 'LightBlue',
    'LightCoral', 'LightCyan', 'LightGoldenRodYellow', 'LightGray', 'LightGrey',
    'LightGreen', 'LightPink', 'LightSalmon', 'LightSeaGreen', 'LightSkyBlue',
    'LightSlateGray', 'LightSlateGrey', 'LightSteelBlue', 'LightYellow', 'Lime',
    'LimeGreen', 'Linen', 'Magenta', 'MediumAquaMarine', 'MediumOrchid',
    'MediumPurple', 'MediumSeaGreen', 'MediumSlateBlue', 'MediumSpringGreen',
    'MediumTurquoise', 'MediumVioletRed', 'MintCream', 'MistyRose', 'Moccasin',
    'NavajoWhite', 'OldLace', 'Olive', 'OliveDrab', 'Orange', 'OrangeRed',
    'Orchid', 'PaleGoldenRod', 'PaleGreen', 'PaleTurquoise', 'PaleVioletRed',
    'PapayaWhip', 'PeachPuff', 'Peru', 'Pink', 'Plum', 'PowderBlue', 'Purple',
    'Red', 'RosyBrown', 'RoyalBlue', 'SaddleBrown', 'SandyBrown',
    'SeaGreen', 'SeaShell', 'Sienna', 'Silver', 'SkyBlue', 'SlateBlue',
    'SlateGray', 'SlateGrey', 'Snow', 'SpringGreen', 'SteelBlue', 'GreenYellow',
    'Teal', 'Thistle', 'Tomato', 'Turquoise', 'Wheat', 'WhiteSmoke', 'YellowGreen',
    'Beige', 'Bisque', 'Violet', 'Green', 'Red'
]

def draw_text(draw,
              box: list,
              cls: int,
              score: float,
              category_index: dict,
              color: str,
              font: str = 'arial.ttf',
              font_size: int = 24):
    """
    Draw the "<category name>: <score>%" caption of one detection onto an image.

    The caption is placed directly above the box; when there is not enough room
    above it, the caption is placed just below the box instead. The string is
    rendered character by character, each on a ``color``-filled background.

    Args:
        draw: drawing context providing ``rectangle`` and ``text`` (e.g. ``ImageDraw.Draw``).
        box: ``[left, top, right, bottom]`` of the detection in pixels.
        cls: class id; ``category_index[str(cls)]`` gives the displayed name.
        score: confidence, shown as an integer percentage.
        category_index: mapping from class id (as string) to display name.
        color: fill colour of the caption background.
        font: font file name/path to load, or an already loaded font object
            (anything exposing ``getbbox`` or ``getsize``), which is used as is.
        font_size: size used when ``font`` has to be loaded from a file.
    """
    if not (hasattr(font, "getbbox") or hasattr(font, "getsize")):
        # `font` names a font file: load it, falling back to PIL's built-in font.
        try:
            font = ImageFont.truetype(font, font_size)
        except OSError:
            font = ImageFont.load_default()

    def _text_size(text):
        """Return (width, height) of ``text`` rendered with the resolved font."""
        # `getsize` was removed in Pillow 10; `getbbox` is its replacement and the
        # right/bottom of the bounding box match what `getsize` used to report.
        if hasattr(font, "getbbox"):
            _, _, bbox_right, bbox_bottom = font.getbbox(text)
            return bbox_right, bbox_bottom
        return font.getsize(text)

    left, top, right, bottom = box
    display_str = f"{category_index[str(cls)]}: {int(100 * score)}%"
    # Measure every character once; the tallest one fixes the caption height.
    char_sizes = [_text_size(ch) for ch in display_str]
    # The caption has a top and bottom margin of 0.05x.
    display_str_height = (1 + 2 * 0.05) * max(height for _, height in char_sizes)

    # If the caption would stick out above the image, put it below the box instead.
    if top > display_str_height:
        text_top = top - display_str_height
        text_bottom = top
    else:
        text_top = bottom
        text_bottom = bottom + display_str_height

    for ch, (text_width, _) in zip(display_str, char_sizes):
        margin = np.ceil(0.05 * text_width)
        draw.rectangle([(left, text_top),
                        (left + text_width + 2 * margin, text_bottom)], fill=color)
        draw.text((left + margin, text_top),
                  ch,
                  fill='black',
                  font=font)
        left += text_width


def draw_masks(image, masks, colors, thresh: float = 0.7, alpha: float = 0.5):
    """
    Overlay coloured masks on an image with alpha blending.

    Args:
        image: source image (anything ``np.array`` can convert).
        masks: per-object mask values; entries greater than ``thresh`` are painted.
        colors: one colour per mask, assigned to the masked pixels.
        thresh: threshold that turns mask values into a boolean mask.
        alpha: weight of the painted copy in the blend (``1 - alpha`` for the original).

    Returns:
        The blended picture as an image built with ``fromarray`` (uint8).
    """
    base = np.array(image)
    binary_masks = np.where(masks > thresh, True, False)

    # Paint every mask with its colour on a copy of the picture ...
    painted = np.copy(base)
    for binary_mask, rgb in zip(binary_masks, colors):
        painted[binary_mask] = rgb

    # ... then mix that copy with the untouched pixels.
    blended = (1 - alpha) * base + alpha * painted
    return fromarray(blended.astype(np.uint8))

def draw_objs(image: Image,
              boxes: np.ndarray = None,
              classes: np.ndarray = None,
              scores: np.ndarray = None,
              masks: np.ndarray = None,
              category_index: dict = None,
              box_thresh: float = 0.1,
              mask_thresh: float = 0.5,
              line_thickness: int = 8,
              font: str = 'arial.ttf',
              font_size: int = 24,
              draw_boxes_on_image: bool = True,
              draw_masks_on_image: bool = False):
    """
    Draw bounding boxes, class captions and (optionally) masks onto an image.

    Args:
        image: picture to draw on; boxes are drawn onto this object in place.
        boxes: bounding boxes, one ``(left, top, right, bottom)`` row per object.
        classes: class id of every object.
        scores: confidence of every object.
        masks: optional mask of every object.
        category_index: mapping from class id to display name; captions are
            skipped when it is None.
        box_thresh: objects whose score is not greater than this are dropped.
        mask_thresh: threshold handed to ``draw_masks``.
        line_thickness: width of the box outline.
        font: font passed on to ``draw_text``.
        font_size: font size passed on to ``draw_text``.
        draw_boxes_on_image: whether to draw boxes (and captions).
        draw_masks_on_image: whether to draw masks (needs ``masks``).

    Returns:
        The annotated image. It is the input object unless masks were drawn, in
        which case it is the new image returned by ``draw_masks``.
    """
    # Keep only the sufficiently confident objects.
    keep = np.greater(scores, box_thresh)
    boxes, classes, scores = boxes[keep], classes[keep], scores[keep]
    if masks is not None:
        masks = masks[keep]
    if len(boxes) == 0:
        return image

    palette_size = len(STANDARD_COLORS)
    colors = []
    for cls in classes:
        colors.append(ImageColor.getrgb(STANDARD_COLORS[cls % palette_size]))

    if draw_boxes_on_image:
        canvas = ImageDraw.Draw(image)
        # Objects are drawn from the last one to the first one.
        for box, cls, score, color in reversed(list(zip(boxes, classes, scores, colors))):
            left, top, right, bottom = box
            # Box outline as a closed polyline.
            outline = [(left, top), (left, bottom), (right, bottom),
                       (right, top), (left, top)]
            canvas.line(outline, width=line_thickness, fill=color)

            if category_index is not None:
                # Class name and confidence caption.
                draw_text(canvas, box.tolist(), int(cls), float(score), category_index, color, font, font_size)

    if draw_masks_on_image and (masks is not None):
        image = draw_masks(image, masks, colors, mask_thresh)

    return image

def draw_image(img_file, predict_boxes, predict_classes, predict_scores, category_index,
                show_image_flag=True, save_image_flag=False, save_image_dir = None):
    """
    Open an image file, draw the given predictions on it, then show and/or save it.

    Predictions are drawn with ``draw_objs`` using a fixed score threshold of 0.5,
    line thickness 2 and the 'simsun.ttc' font at size 20.

    Args:
        img_file: path of the image to open.
        predict_boxes: boxes handed to ``draw_objs``.
        predict_classes: class ids handed to ``draw_objs``.
        predict_scores: scores handed to ``draw_objs``.
        category_index: class-id -> name mapping handed to ``draw_objs``.
        show_image_flag: display the result with matplotlib.
        save_image_flag: save the result into ``save_image_dir`` under the
            base name of ``img_file``.
        save_image_dir: target directory; created if it does not exist.

    Raises:
        ValueError: if ``save_image_flag`` is set but ``save_image_dir`` is None.
    """
    if save_image_flag and save_image_dir is None:
        raise ValueError("save_image_dir must be provided when save_image_flag is True")

    # The context manager releases the image's file handle once we are done with it.
    with Image.open(img_file) as original_img:
        plot_img = draw_objs(original_img,
                             predict_boxes,
                             predict_classes,
                             predict_scores,
                             category_index=category_index,
                             box_thresh=0.5,
                             line_thickness=2,
                             font='simsun.ttc',
                             font_size=20)
        if show_image_flag:
            plt.imshow(plot_img)
            plt.show()

        if save_image_flag:
            save_file = os.path.join(save_image_dir, os.path.basename(img_file))
            # make_dir creates the PARENT of the path it receives, so pass the file
            # path: that guarantees save_image_dir itself exists.
            make_dir(path=save_file)

            plot_img.save(save_file)

def plot_loss_and_lr(train_loss, learning_rate):
    """
    Plot training loss and learning rate against the epoch index and save the figure.

    The loss uses the left y-axis and the learning rate a twin right y-axis. The
    figure is written to ``./loss_and_lr<timestamp>.png``. Any exception is
    caught and printed rather than propagated.

    Args:
        train_loss: sequence of loss values, one per epoch.
        learning_rate: sequence of learning-rate values, one per epoch.
    """
    try:
        epochs = list(range(len(train_loss)))
        fig, loss_axis = plt.subplots(1, 1)

        # Left axis: loss curve.
        loss_axis.plot(epochs, train_loss, 'r', label='loss')
        loss_axis.set_xlabel("epoch")
        loss_axis.set_ylabel("loss")
        loss_axis.set_title("Train Loss and lr")
        plt.legend(loc='best')

        # Right axis: learning-rate curve sharing the same x-axis.
        lr_axis = loss_axis.twinx()
        lr_axis.plot(epochs, learning_rate, label='lr')
        lr_axis.set_ylabel("learning rate")
        lr_axis.set_xlim(0, len(train_loss))  # x range covers all epochs
        plt.legend(loc='best')

        # One combined legend holding the entries of both axes.
        loss_handles, loss_labels = loss_axis.get_legend_handles_labels()
        lr_handles, lr_labels = lr_axis.get_legend_handles_labels()
        plt.legend(loss_handles + lr_handles, loss_labels + lr_labels, loc='upper right')

        fig.subplots_adjust(right=0.8)  # keep the saved picture from being cut off
        timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        fig.savefig('./loss_and_lr{}.png'.format(timestamp))
        plt.close()
        print("successful save loss curve! ")
    except Exception as err:
        print(err)


def plot_map(mAP):
    """
    Plot the evaluation mAP against the epoch index and save it to ``./mAP.png``.

    The y-axis ticks run from 0 to 1 in steps of 0.05. Any exception is caught
    and printed rather than propagated.

    Args:
        mAP: sequence of mAP values, one per epoch.
    """
    try:
        n_epochs = len(mAP)
        epochs = list(range(n_epochs))
        # 0.00, 0.05, ..., 1.00
        y_ticks = [step / 100 for step in range(0, 101, 5)]

        plt.plot(epochs, mAP, label='mAp')
        plt.xlabel('epoch')
        plt.yticks(y_ticks)
        plt.ylabel('mAP')
        plt.title('Eval mAP')
        plt.xlim(0, n_epochs)
        plt.legend(loc='best')
        plt.savefig('./mAP.png')
        plt.close()
        print("successful save mAP curve!")
    except Exception as err:
        print(err)

def save_pic(image, path, bboxes=None, labels=None, height=None, width=None, category_index=None, bboxes_scores=None):
    """
    Save an image to ``path`` and, when boxes and labels are given, draw them onto the saved file.

    Args:
        image: image object exposing ``save``.
        path: destination file; its parent directory is created if missing.
        bboxes: optional array of boxes as ``(xmin, ymin, xmax, ymax)`` columns,
            scaled to pixels here by multiplying x by ``width`` and y by ``height``.
        labels: optional labels, one per box.
        height: factor applied to the y coordinates.
        width: factor applied to the x coordinates.
        category_index: forwarded to ``draw_image``.
        bboxes_scores: optional scores per box; defaults to 1.0 for every label.
    """
    make_dir(path)
    image.save(path)

    if bboxes is None or labels is None:
        return

    # Scale a copy of the boxes so the caller's array stays untouched.
    pixel_boxes = bboxes.copy()
    pixel_boxes[:, 0] = bboxes[:, 0] * width
    pixel_boxes[:, 2] = bboxes[:, 2] * width
    pixel_boxes[:, 1] = bboxes[:, 1] * height
    pixel_boxes[:, 3] = bboxes[:, 3] * height

    if bboxes_scores is None:
        scores = np.array([1.0] * len(labels))
    else:
        scores = bboxes_scores

    # Re-open the file just written and overwrite it with the annotated version.
    draw_image(path, pixel_boxes, labels, scores, category_index, save_image_flag=True, save_image_dir=os.path.dirname(path))
    
def save_anchor_pic(image, path, bboxes, height, width, targets=None, targets_label=None, choice=None, bboxes_label=None, bboxes_scores=None):
    """
    Save one image of a batch together with its boxes (and optional target boxes).

    Args:
        image: batch tensor indexed as ``image[choice, :, :, :]``; the selected
            image is converted with ``ToPILImage``.
        path: destination file, handed to ``save_pic``.
        bboxes: boxes to draw (tensor or array).
        height: y scale handed to ``save_pic``.
        width: x scale handed to ``save_pic``.
        targets: optional target boxes (tensor); when given they are placed in
            front of ``bboxes``, which must then be a tensor as well.
        targets_label: optional labels of the targets; drawn with label
            ``len(STANDARD_COLORS) - targets_label``. Without it every target
            gets the last palette entry.
        choice: index of the image inside the batch (defaults to 0).
        bboxes_label: optional labels of ``bboxes``; defaults to 0 for every box.
        bboxes_scores: optional scores of ``bboxes``; targets get score 1.
    """
    def _as_numpy(values):
        """Convert a tensor / array / sequence to a numpy array on the CPU."""
        if isinstance(values, torch.Tensor):
            return values.detach().cpu().numpy()
        return np.asarray(values)

    if choice is None:
        choice = 0
    image = ToPILImage()(image[choice, :, :, :])

    # Labels are handled as numpy arrays throughout, so defaults (arrays) and
    # caller-supplied tensors can be mixed freely.
    if bboxes_label is None:
        labels = np.zeros(len(bboxes), dtype=np.int64)
    else:
        labels = _as_numpy(bboxes_label)

    if targets is not None:
        # Concatenate on the targets' own device instead of a global one.
        bboxes = torch.cat((targets, bboxes.to(device=targets.device)), dim=0)
        color_count = len(STANDARD_COLORS)
        if targets_label is None:
            target_labels = np.full(len(targets), color_count - 1, dtype=np.int64)
        else:
            target_labels = color_count - _as_numpy(targets_label)

        labels = np.append(target_labels, labels)

    if bboxes_scores is not None:
        # Targets come first in `bboxes`, so their scores (1) come first as well.
        target_count = 0 if targets is None else len(targets)
        scores = [1 for _ in range(target_count)]
        scores.extend(_as_numpy(bboxes_scores).tolist())
        bboxes_scores = np.array(scores)

    save_pic(image=image, path=path, bboxes=_as_numpy(bboxes), labels=labels, height=height, width=width, bboxes_scores=bboxes_scores)

In [6]:
# src/res50_backbone.py

import torch.nn as nn
import torch

class Bottleneck(nn.Module):
    """
    ResNet bottleneck residual block.

    Three convolutions (1x1 -> 3x3 -> 1x1), each followed by batch norm; the
    last one widens the channels by ``expansion``. The block input (optionally
    passed through ``downsample``) is added before the final ReLU.
    """
    # Ratio between the block's output channels and ``out_channel``.
    expansion = 4

    def __init__(self, in_channel, out_channel, stride=1, downsample=None):
        """
        Args:
            in_channel: channels of the block input.
            out_channel: channels of the two inner convolutions; the block
                outputs ``out_channel * expansion`` channels.
            stride: stride of the 3x3 convolution.
            downsample: optional module applied to the input to form the shortcut.
        """
        super().__init__()
        expanded_channel = out_channel * self.expansion

        # 1x1 convolution in front of the 3x3 one.
        self.conv1 = nn.Conv2d(in_channel, out_channel, kernel_size=1, stride=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channel)

        # 3x3 convolution; the only layer of the block that applies `stride`.
        self.conv2 = nn.Conv2d(out_channel, out_channel, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channel)

        # 1x1 convolution widening the channels by `expansion`.
        self.conv3 = nn.Conv2d(out_channel, expanded_channel, kernel_size=1, stride=1, bias=False)
        self.bn3 = nn.BatchNorm2d(expanded_channel)

        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample

    def forward(self, x):
        """Return ``relu(main_branch(x) + shortcut(x))``."""
        shortcut = x if self.downsample is None else self.downsample(x)

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        out += shortcut
        return self.relu(out)


class ResNet(nn.Module):
    """
    ResNet: a 7x7 convolution stem with max pooling, four stages of residual
    blocks and an optional average-pool + linear classification head.
    """

    def __init__(self, block, blocks_num, num_classes=1000, include_top=True):
        """
        Args:
            block: residual block class; must expose an ``expansion`` attribute
                and accept ``(in_channel, out_channel, stride=, downsample=)``.
            blocks_num: number of blocks in each of the four stages.
            num_classes: output size of the classification layer.
            include_top: whether to build and apply the pooling + linear head.
        """
        super().__init__()
        self.include_top = include_top
        self.in_channel = 64  # channels entering the next stage; updated by _make_layer

        # Stem.
        self.conv1 = nn.Conv2d(3, self.in_channel, kernel_size=7, stride=2,
                               padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(self.in_channel)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Four residual stages: (base channels, stride of the first block).
        stage_specs = ((64, 1), (128, 2), (256, 2), (512, 2))
        for stage_no, (channel, stride) in enumerate(stage_specs, start=1):
            stage = self._make_layer(block, channel, blocks_num[stage_no - 1], stride=stride)
            setattr(self, f"layer{stage_no}", stage)

        # Classification head.
        if self.include_top:
            self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
            self.fc = nn.Linear(512 * block.expansion, num_classes)

        # Weight initialisation of every convolution.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')

    def _make_layer(self, block, channel, block_num, stride=1):
        """
        Build one stage made of ``block_num`` blocks.

        Only the first block receives ``stride`` and, when the stride is not 1 or
        the channel count changes, a 1x1 conv + batch norm shortcut projection.
        """
        expanded_channel = channel * block.expansion
        needs_projection = stride != 1 or self.in_channel != expanded_channel

        downsample = None
        if needs_projection:
            downsample = nn.Sequential(
                nn.Conv2d(self.in_channel, expanded_channel, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(expanded_channel))

        layers = [block(self.in_channel, channel, downsample=downsample, stride=stride)]
        self.in_channel = expanded_channel
        layers.extend(block(self.in_channel, channel) for _ in range(1, block_num))

        return nn.Sequential(*layers)

    def forward(self, x):
        """Return the class scores, or the last stage's feature map when ``include_top`` is False."""
        x = self.maxpool(self.relu(self.bn1(self.conv1(x))))

        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)

        if not self.include_top:
            return x

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        return self.fc(x)


def resnet50(num_classes=1000, include_top=True):
    """Build a ResNet from Bottleneck blocks with 3, 4, 6 and 3 blocks in its four stages."""
    stage_depths = [3, 4, 6, 3]
    return ResNet(Bottleneck, stage_depths, num_classes=num_classes, include_top=include_top)

In [7]:
# src/utils.py

from math import sqrt
import itertools

import torch
import torch.nn.functional as F
from torch.jit.annotations import Tuple, List
from torch import nn, Tensor
import numpy as np

def box_area(boxes):
    """
    Area of every box in a set of (x1, y1, x2, y2) boxes.

    Arguments:
        boxes (Tensor[N, 4]): boxes in (x1, y1, x2, y2) format

    Returns:
        area (Tensor[N]): ``(x2 - x1) * (y2 - y1)`` for each box
    """
    widths = boxes[:, 2] - boxes[:, 0]
    heights = boxes[:, 3] - boxes[:, 1]
    return widths * heights


def calc_iou_tensor(boxes1, boxes2):
    """
    Pairwise intersection-over-union (Jaccard index) of two sets of boxes.

    Both sets are expected in (x1, y1, x2, y2) format.

    Arguments:
        boxes1 (Tensor[N, 4])
        boxes2 (Tensor[M, 4])

    Returns:
        iou (Tensor[N, M]): IoU of every box in boxes1 with every box in boxes2
    """
    area1 = box_area(boxes1)
    area2 = box_area(boxes2)

    # Broadcasting [N, 1, 2] against [M, 2] gives every pair of corners: [N, M, 2].
    top_left = torch.max(boxes1[:, None, :2], boxes2[:, :2])
    bottom_right = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])

    # Boxes that do not overlap get a zero-sized intersection.
    overlap_wh = (bottom_right - top_left).clamp(min=0)  # [N, M, 2]
    inter = overlap_wh[..., 0] * overlap_wh[..., 1]      # [N, M]

    union = area1[:, None] + area2 - inter
    return inter / union

# This function is from https://github.com/kuangliu/pytorch-ssd.
class Encoder(object):
    """
        Inspired by https://github.com/kuangliu/pytorch-src
        Transform between (bboxes, labels) <-> SSD output

        dboxes: default boxes (8732 x 4 for the standard SSD300 layout),
            encoder: input ltrb format, output xywh format
            decoder: input xywh format, output ltrb format

        encode:
            input  : bboxes_in (Tensor nboxes x 4), labels_in (Tensor nboxes)
            output : bboxes_out (Tensor 8732 x 4), labels_out (Tensor 8732)
            criteria : IoU threshold of bboxes

        decode:
            input  : bboxes_in (Tensor 8732 x 4), scores_in (Tensor 8732 x nitems)
            output : bboxes_out (Tensor nboxes x 4), labels_out (Tensor nboxes)
            criteria : IoU threshold of bboxes
            max_output : maximum number of output bboxes
    """
    def __init__(self, dboxes):
        """
        Args:
            dboxes: callable returning the default boxes for ``order='ltrb'`` and
                ``order='xywh'`` and exposing ``scale_xy`` / ``scale_wh``.
        """
        self.dboxes = dboxes(order='ltrb')
        self.dboxes_xywh = dboxes(order='xywh').unsqueeze(dim=0)
        self.nboxes = self.dboxes.size(0)  # number of default boxes
        self.scale_xy = dboxes.scale_xy
        self.scale_wh = dboxes.scale_wh

    def encode(self, bboxes_in, labels_in, criteria=0.5):
        """
        Assign ground-truth boxes and labels to the default boxes.

        encode:
            input  : bboxes_in (Tensor nboxes x 4, ltrb), labels_in (Tensor nboxes)
            output : bboxes_out (Tensor 8732 x 4, xywh), labels_out (Tensor 8732)
            criteria : IoU threshold of bboxes

        Default boxes without a match keep label 0 and their own coordinates.
        All computation and both outputs live on the module-level ``device``.
        With no ground-truth boxes every default box is left unmatched.
        """
        bboxes_in = bboxes_in.to(device=device)
        # The labels are indexed with tensors living on `device`, so move them too.
        labels_in = torch.as_tensor(labels_in).to(device=device)
        dboxes = self.dboxes.to(device=device)

        # [8732,] label per default box, 0 where nothing matched
        labels_out = torch.zeros(self.nboxes, dtype=torch.int64).to(device=device)
        # Start from the default boxes; matched positions are overwritten with the GT box.
        bboxes_out = self.dboxes.clone().to(device=device)

        if bboxes_in.numel() > 0:
            # [nboxes, 8732] IoU of every GT box with every default box
            ious = calc_iou_tensor(bboxes_in, dboxes)
            # [8732,] best IoU (and the GT achieving it) for every default box
            best_dbox_ious, best_dbox_idx = ious.max(dim=0)
            # [nboxes,] the best default box for every GT box
            _, best_bbox_idx = ious.max(dim=1)

            # Matching strategy, rule 1: the best default box of each GT is always
            # positive. Setting its IoU to 2.0 makes it pass any threshold.
            best_dbox_ious.index_fill_(0, best_bbox_idx, 2.0)  # dim, index, value
            # ... and make that default box point back to its GT.
            idx = torch.arange(0, best_bbox_idx.size(0), dtype=torch.int64).to(device=device)
            best_dbox_idx[best_bbox_idx[idx]] = idx

            # Matching strategy, rule 2: every default box whose IoU with some GT is
            # above `criteria` is positive (this includes the rule-1 matches).
            masks = best_dbox_ious > criteria
            labels_out[masks] = labels_in[best_dbox_idx[masks]]
            bboxes_out[masks, :] = bboxes_in[best_dbox_idx[masks], :]

        # Transform format to xywh format
        x = 0.5 * (bboxes_out[:, 0] + bboxes_out[:, 2])  # x
        y = 0.5 * (bboxes_out[:, 1] + bboxes_out[:, 3])  # y
        w = bboxes_out[:, 2] - bboxes_out[:, 0]  # w
        h = bboxes_out[:, 3] - bboxes_out[:, 1]  # h
        bboxes_out[:, 0] = x
        bboxes_out[:, 1] = y
        bboxes_out[:, 2] = w
        bboxes_out[:, 3] = h
        return bboxes_out, labels_out

    def scale_back_batch(self, bboxes_in, scores_in):
        """
            Turn the predicted regression parameters into ltrb boxes and apply
            softmax to the predicted class scores.
            Do scale and transform from xywh to ltrb
            suppose input N x 4 x num_bbox | N x label_num x num_bbox

            bboxes_in: xywh regression parameters predicted by the network
            scores_in: predicted class scores of every default box

            Note: ``bboxes_in`` is modified in place (through a permuted view).
        """
        # Keep the default boxes on the same device as the predictions (this also
        # covers GPUs other than the default CUDA device).
        target_device = bboxes_in.device
        self.dboxes = self.dboxes.to(target_device)
        self.dboxes_xywh = self.dboxes_xywh.to(target_device)

        # Returns a view of the original tensor with its dimensions permuted.
        bboxes_in = bboxes_in.permute(0, 2, 1)
        scores_in = scores_in.permute(0, 2, 1)

        bboxes_in[:, :, :2] = self.scale_xy * bboxes_in[:, :, :2]   # predicted x, y regression parameters
        bboxes_in[:, :, 2:] = self.scale_wh * bboxes_in[:, :, 2:]   # predicted w, h regression parameters

        # Apply the regression parameters to the default boxes to get the predicted boxes.
        bboxes_in[:, :, :2] = bboxes_in[:, :, :2] * self.dboxes_xywh[:, :, 2:] + self.dboxes_xywh[:, :, :2]
        bboxes_in[:, :, 2:] = bboxes_in[:, :, 2:].exp() * self.dboxes_xywh[:, :, 2:]

        # transform format to ltrb
        l = bboxes_in[:, :, 0] - 0.5 * bboxes_in[:, :, 2]
        t = bboxes_in[:, :, 1] - 0.5 * bboxes_in[:, :, 3]
        r = bboxes_in[:, :, 0] + 0.5 * bboxes_in[:, :, 2]
        b = bboxes_in[:, :, 1] + 0.5 * bboxes_in[:, :, 3]

        bboxes_in[:, :, 0] = l  # xmin
        bboxes_in[:, :, 1] = t  # ymin
        bboxes_in[:, :, 2] = r  # xmax
        bboxes_in[:, :, 3] = b  # ymax

        return bboxes_in, F.softmax(scores_in, dim=-1)

    def decode_batch(self, bboxes_in, scores_in, criteria=0.45, max_output=200):
        """
        Decode a whole batch of network outputs.

        Returns a list with one ``(bboxes, labels, scores)`` tuple per image, as
        produced by ``decode_single_new``.
        """
        # Convert boxes from xywh back to ltrb (needed for the IoU in NMS) and
        # softmax the class scores.
        bboxes, probs = self.scale_back_batch(bboxes_in, scores_in)

        outputs = []
        # Handle the images of the batch one by one.
        for bbox, prob in zip(bboxes.split(1, 0), probs.split(1, 0)):
            bbox = bbox.squeeze(0)
            prob = prob.squeeze(0)
            outputs.append(self.decode_single_new(bbox, prob, criteria, max_output))
        return outputs

    def decode_single_new(self, bboxes_in, scores_in, criteria, num_output=200):
        """
        decode:
            input  : bboxes_in (Tensor 8732 x 4), scores_in (Tensor 8732 x nitems)
            output : bboxes_out (Tensor nboxes x 4), labels_out (Tensor nboxes),
                     scores_out (Tensor nboxes)
            criteria : IoU threshold used by non-maximum suppression
            num_output : maximum number of output bboxes

        Relies on a ``batched_nms`` function that must be available in the
        notebook's global scope.
        """
        device = bboxes_in.device
        num_classes = scores_in.shape[-1]

        # Clip boxes that reach outside the image.
        bboxes_in = bboxes_in.clamp(min=0, max=1)

        # [8732, 4] -> [8732, num_classes, 4]
        bboxes_in = bboxes_in.repeat(1, num_classes).reshape(scores_in.shape[0], -1, 4)

        # create labels for each prediction
        labels = torch.arange(num_classes, device=device)
        labels = labels.view(1, -1).expand_as(scores_in)

        # remove prediction with the background label (class 0)
        bboxes_in = bboxes_in[:, 1:, :]
        scores_in = scores_in[:, 1:]
        labels = labels[:, 1:]

        # batch everything, by making every class prediction be a separate instance
        bboxes_in = bboxes_in.reshape(-1, 4)
        scores_in = scores_in.reshape(-1)
        labels = labels.reshape(-1)

        # remove low scoring boxes (score threshold 0.05)
        inds = torch.nonzero(scores_in > 0.05, as_tuple=False).squeeze(1).to(device=device)
        bboxes_in, scores_in, labels = bboxes_in[inds], scores_in[inds], labels[inds]

        # remove empty boxes
        ws, hs = bboxes_in[:, 2] - bboxes_in[:, 0], bboxes_in[:, 3] - bboxes_in[:, 1]
        keep = (ws >= 0.1 / 300) & (hs >= 0.1 / 300)
        keep = keep.nonzero(as_tuple=False).squeeze(1)
        bboxes_in, scores_in, labels = bboxes_in[keep], scores_in[keep], labels[keep]

        # non-maximum suppression
        keep = batched_nms(bboxes_in, scores_in, labels, iou_threshold=criteria)

        # keep only the first num_output predictions returned by NMS
        keep = keep[:num_output]
        bboxes_out = bboxes_in[keep, :]
        scores_out = scores_in[keep]
        labels_out = labels[keep]

        return bboxes_out, labels_out, scores_out

    # perform non-maximum suppression
    def decode_single(self, bboxes_in, scores_in, criteria, max_output, max_num=200):
        """
        decode:
            input  : bboxes_in (Tensor 8732 x 4), scores_in (Tensor 8732 x nitems)
            output : bboxes_out (Tensor nboxes x 4), labels_out (Tensor nboxes),
                     scores_out (Tensor nboxes)
            criteria : IoU threshold used by non-maximum suppression
            max_output : maximum number of output bboxes over all classes
            max_num : maximum number of candidates per class entering NMS

        All outputs are created on the device of ``bboxes_in``.
        """
        # Reference to https://github.com/amdegroot/ssd.pytorch
        out_device = bboxes_in.device
        bboxes_out = []
        scores_out = []
        labels_out = []

        # Non-maximum suppression, class by class.
        # scores_in (Tensor 8732 x nitems): every column holds one class' score for all boxes.
        for i, score in enumerate(scores_in.split(1, 1)):
            # skip background
            if i == 0:
                continue

            # [8732, 1] -> [8732]
            score = score.squeeze(1)

            # Drop predictions with a score of 0.05 or less.
            mask = score > 0.05
            bboxes, score = bboxes_in[mask, :], score[mask]
            if score.size(0) == 0:
                continue

            # Sort by score in ascending order.
            score_sorted, score_idx_sorted = score.sort(dim=0)

            # keep the max_num best candidates
            score_idx_sorted = score_idx_sorted[-max_num:]
            candidates = []

            while score_idx_sorted.numel() > 0:
                idx = score_idx_sorted[-1].item()
                # Boxes of all remaining candidates: [remaining, 4]
                bboxes_sorted = bboxes[score_idx_sorted, :]
                # Box of the current best candidate: [1, 4]
                bboxes_idx = bboxes[idx, :].unsqueeze(dim=0)
                # IoU of every remaining candidate with the best one. squeeze(1)
                # keeps the result 1-D even when a single candidate is left.
                iou_sorted = calc_iou_tensor(bboxes_sorted, bboxes_idx).squeeze(1)

                # we only need iou < criteria
                # Drop everything overlapping the best candidate too much (itself included).
                score_idx_sorted = score_idx_sorted[iou_sorted < criteria]
                # Remember the best candidate.
                candidates.append(idx)

            # Store what survived NMS for this class.
            bboxes_out.append(bboxes[candidates, :])   # box coordinates
            scores_out.append(score[candidates])       # scores
            labels_out.extend([i] * len(candidates))   # labels

        if not bboxes_out:
            # Nothing detected: return empty tensors of the right shape so that
            # downstream evaluation code keeps working.
            return [torch.empty(size=(0, 4), device=out_device),
                    torch.empty(size=(0,), dtype=torch.int64, device=out_device),
                    torch.empty(size=(0,), device=out_device)]

        bboxes_out = torch.cat(bboxes_out, dim=0).contiguous()
        scores_out = torch.cat(scores_out, dim=0).contiguous()
        # Same device as the boxes/scores, so the shared index below is valid for all three.
        labels_out = torch.as_tensor(labels_out, dtype=torch.long).to(device=out_device)

        # Sort all detections by score (whatever their class) and keep the best max_output.
        _, max_ids = scores_out.sort(dim=0)
        max_ids = max_ids[-max_output:]
        return bboxes_out[max_ids, :], labels_out[max_ids], scores_out[max_ids]


# figsize = 300  # 输入网络的图像大小
# feat_size = [38, 19, 10, 5, 3, 1]   # 每个预测层的feature map尺寸
# steps = [8, 16, 32, 64, 100, 300]   # 每个特征层上的一个cell在原图上的跨度
# scales = [21, 45, 99, 153, 207, 261, 315]  # 每个特征层上预测的default box的scale
# aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]]  # 每个预测特征层上预测的default box的ratios
class DefaultBoxes(object):
    """Generate the SSD default (anchor) boxes for a stack of square feature maps.

    All boxes are expressed relative to the input image. Two layouts are kept:
    ``dboxes`` as ``(cx, cy, w, h)`` and ``dboxes_ltrb`` as ``(xmin, ymin, xmax, ymax)``.

    Args:
        fig_size: side length of the network input in pixels (e.g. 300).
        feat_size: side length of every prediction feature map, e.g. ``[38, 19, 10, 5, 3, 1]``.
        steps: stride of one feature-map cell on the input image, per level,
            e.g. ``[8, 16, 32, 64, 100, 300]``.
        scales: default-box scale in pixels; level ``k`` reads ``scales[k]`` and
            ``scales[k + 1]``, so one more entry than ``feat_size`` is required.
        aspect_ratios: per-level list of extra aspect ratios, e.g. ``[[2], [2, 3], ...]``.
        scale_xy: value exposed through the read-only ``scale_xy`` property.
        scale_wh: value exposed through the read-only ``scale_wh`` property.
    """

    def __init__(self, fig_size, feat_size, steps, scales, aspect_ratios, scale_xy=0.1, scale_wh=0.2):
        self.fig_size = fig_size
        self.feat_size = feat_size
        self.scale_xy_ = scale_xy
        self.scale_wh_ = scale_wh
        self.steps = steps
        self.scales = scales
        self.aspect_ratios = aspect_ratios

        # Number of cells per side implied by each stride (may be fractional).
        fk = fig_size / np.array(steps)

        self.default_boxes = []
        for idx, sfeat in enumerate(self.feat_size):
            sk1 = scales[idx] / fig_size       # this level's scale, relative to the image
            sk2 = scales[idx + 1] / fig_size   # next level's scale, relative to the image
            sk3 = sqrt(sk1 * sk2)              # geometric mean of the two scales

            # Two square boxes first, then a (w, h) / (h, w) pair per extra aspect ratio.
            all_sizes = [(sk1, sk1), (sk3, sk3)]
            for alpha in aspect_ratios[idx]:
                w, h = sk1 * sqrt(alpha), sk1 / sqrt(alpha)
                all_sizes.append((w, h))
                all_sizes.append((h, w))

            # Box order is size-major, then row (i -> y), then column (j -> x).
            for w, h in all_sizes:
                for i, j in itertools.product(range(sfeat), repeat=2):
                    cx, cy = (j + 0.5) / fk[idx], (i + 0.5) / fk[idx]
                    self.default_boxes.append((cx, cy, w, h))

        # An explicit float32 dtype is required here (the original notes that omitting it errors).
        self.dboxes = torch.as_tensor(self.default_boxes, dtype=torch.float32)
        self.dboxes.clamp_(min=0, max=1)  # clamp cx, cy, w and h to [0, 1]

        # Corner form, used for IoU computation. It is derived from the clamped
        # centre form and is not clamped again.
        half_wh = 0.5 * self.dboxes[:, 2:]
        self.dboxes_ltrb = torch.cat((self.dboxes[:, :2] - half_wh,
                                      self.dboxes[:, :2] + half_wh), dim=1)

    @property
    def scale_xy(self):
        """Value passed as ``scale_xy`` at construction."""
        return self.scale_xy_

    @property
    def scale_wh(self):
        """Value passed as ``scale_wh`` at construction."""
        return self.scale_wh_

    def __call__(self, order='ltrb'):
        """Return the default boxes in the requested layout.

        Args:
            order: ``'ltrb'`` for ``(xmin, ymin, xmax, ymax)`` or ``'xywh'`` for
                ``(cx, cy, w, h)``.

        Returns:
            Tensor of shape ``[num_boxes, 4]``.

        Raises:
            ValueError: if ``order`` is neither ``'ltrb'`` nor ``'xywh'``.
        """
        if order == 'ltrb':
            return self.dboxes_ltrb
        if order == 'xywh':
            return self.dboxes
        # Previously fell through and returned None, hiding typos in `order`.
        raise ValueError("order must be 'ltrb' or 'xywh', got {!r}".format(order))

def dboxes300_coco():
    """Build a ``DefaultBoxes`` generator with the fixed 300x300, six-level configuration."""
    return DefaultBoxes(
        fig_size=300,                                  # network input size
        feat_size=[38, 19, 10, 5, 3, 1],               # feature-map size per prediction layer
        steps=[8, 16, 32, 64, 100, 300],               # stride of one cell on the input, per layer
        scales=[21, 45, 99, 153, 207, 261, 315],       # default-box scale per layer (one extra entry)
        aspect_ratios=[[2], [2, 3], [2, 3], [2, 3], [2], [2]],  # extra aspect ratios per layer
    )


def nms(boxes, scores, iou_threshold):
    # type: (Tensor, Tensor, float) -> Tensor
    """
    Performs non-maximum suppression (NMS) on the boxes according
    to their intersection-over-union (IoU).

    NMS iteratively removes lower scoring boxes which have an
    IoU greater than iou_threshold with another (higher scoring)
    box.

    This is a thin wrapper that forwards to ``torch.ops.torchvision.nms``
    (presumably torchvision must already be imported so that the op is
    registered -- confirm).

    Parameters
    ----------
    boxes : Tensor[N, 4]
        boxes to perform NMS on. They
        are expected to be in (x1, y1, x2, y2) format
    scores : Tensor[N]
        scores for each one of the boxes
    iou_threshold : float
        discards all overlapping
        boxes with IoU > iou_threshold

    Returns
    -------
    keep : Tensor
        int64 tensor with the indices
        of the elements that have been kept
        by NMS, sorted in decreasing order of scores
    """
    return torch.ops.torchvision.nms(boxes, scores, iou_threshold)


def batched_nms(boxes, scores, idxs, iou_threshold):
    # type: (Tensor, Tensor, Tensor, float) -> Tensor
    """
    Class-aware non-maximum suppression.

    Boxes that belong to different categories never suppress each other.

    Parameters
    ----------
    boxes : Tensor[N, 4]
        candidate boxes in (x1, y1, x2, y2) format
    scores : Tensor[N]
        score of every box
    idxs : Tensor[N]
        category index of every box
    iou_threshold : float
        a box is discarded when its IoU with a higher-scoring box of the
        same category is greater than this value

    Returns
    -------
    keep : Tensor
        int64 tensor with the indices of the boxes that survive,
        sorted in decreasing order of scores
    """
    # Nothing to suppress: return an empty index tensor on the boxes' device.
    if boxes.numel() == 0:
        return torch.empty((0,), dtype=torch.int64, device=boxes.device)

    # Translate each category into its own region of the plane: the shift is a
    # multiple of (largest coordinate + 1), so boxes from different categories
    # cannot overlap and one plain NMS pass handles every category at once.
    shift_unit = boxes.max() + 1
    # `.to(boxes)` only aligns dtype and device of the indices with `boxes`.
    per_box_shift = idxs.to(boxes) * shift_unit
    return nms(boxes + per_box_shift[:, None], scores, iou_threshold)


class PostProcess(nn.Module):
    """Convert raw SSD head outputs into per-image detections.

    The module decodes the regression output against the default boxes, applies
    softmax to the class logits, and then filters every image with a score
    threshold, a minimum box size and class-aware NMS.

    Args:
        dboxes: callable returning the default boxes as a ``[num_anchors, 4]`` tensor
            for ``order='xywh'`` and exposing ``scale_xy`` / ``scale_wh`` attributes.
    """

    def __init__(self, dboxes):
        super(PostProcess, self).__init__()
        # [num_anchors, 4] -> [1, num_anchors, 4]; kept as a frozen Parameter.
        self.dboxes_xywh = nn.Parameter(dboxes(order='xywh').unsqueeze(dim=0),
                                        requires_grad=False)
        self.scale_xy = dboxes.scale_xy  # 0.1 with the defaults of DefaultBoxes
        self.scale_wh = dboxes.scale_wh  # 0.2 with the defaults of DefaultBoxes

        self.criteria = 0.5      # IoU threshold handed to NMS
        self.max_output = 100    # maximum number of detections kept per image

    def scale_back_batch(self, bboxes_in, scores_in):
        # type: (Tensor, Tensor) -> Tuple[Tensor, Tensor]
        """
        Decode regression outputs into corner boxes and turn logits into probabilities.

        1) apply the predicted offsets to the default boxes,
        2) convert the boxes from (cx, cy, w, h) to (xmin, ymin, xmax, ymax),
        3) softmax the class scores.

        The inputs are left untouched (the previous implementation wrote the decoded
        boxes back into ``bboxes_in`` through a permuted view).

        bboxes_in: [N, 4, num_anchors] predicted xywh regression parameters
        scores_in: [N, label_num, num_anchors] class logits per default box
        returns:   ([N, num_anchors, 4] ltrb boxes, [N, num_anchors, label_num] probabilities)
        """
        # [batch, 4, num_anchors] -> [batch, num_anchors, 4] (a view, no copy)
        bboxes_in = bboxes_in.permute(0, 2, 1)
        # [batch, label_num, num_anchors] -> [batch, num_anchors, label_num]
        scores_in = scores_in.permute(0, 2, 1)

        anchor_xy = self.dboxes_xywh[:, :, :2]
        anchor_wh = self.dboxes_xywh[:, :, 2:]

        # Centre offsets are scaled by the anchor size; sizes are predicted in log space.
        center = self.scale_xy * bboxes_in[:, :, :2] * anchor_wh + anchor_xy
        size = (self.scale_wh * bboxes_in[:, :, 2:]).exp() * anchor_wh

        # (cx, cy, w, h) -> (xmin, ymin, xmax, ymax)
        half = 0.5 * size
        bboxes_out = torch.cat([center - half, center + half], dim=-1)

        return bboxes_out, F.softmax(scores_in, dim=-1)

    def decode_single_new(self, bboxes_in, scores_in, criteria, num_output):
        # type: (Tensor, Tensor, float, int) -> Tuple[Tensor, Tensor, Tensor]
        """
        Filter the decoded predictions of a single image.

        bboxes_in  : Tensor [num_anchors, 4], ltrb boxes
        scores_in  : Tensor [num_anchors, num_classes], class probabilities (column 0 = background)
        criteria   : IoU threshold used by NMS
        num_output : maximum number of detections to return

        returns    : (bboxes_out [n, 4], labels_out [n], scores_out [n])
        """
        device = bboxes_in.device
        num_classes = scores_in.shape[-1]

        # Clip boxes that stick out of the image.
        bboxes_in = bboxes_in.clamp(min=0, max=1)

        # Give every class its own copy of each box:
        # [num_anchors, 4] -> [num_anchors, num_classes, 4]
        bboxes_in = bboxes_in.repeat(1, num_classes).reshape(scores_in.shape[0], -1, 4)

        # Class id for every (anchor, class) pair: [num_classes] -> [num_anchors, num_classes]
        labels = torch.arange(num_classes, device=device)
        labels = labels.view(1, -1).expand_as(scores_in)

        # Drop the background column (class 0).
        bboxes_in = bboxes_in[:, 1:, :]
        scores_in = scores_in[:, 1:]
        labels = labels[:, 1:]

        # Flatten so that every (anchor, class) pair is a separate candidate.
        bboxes_in = bboxes_in.reshape(-1, 4)
        scores_in = scores_in.reshape(-1)
        labels = labels.reshape(-1)

        # Remove low-scoring candidates (score <= 0.05).
        inds = torch.where(torch.gt(scores_in, 0.05))[0]
        bboxes_in, scores_in, labels = bboxes_in[inds, :], scores_in[inds], labels[inds]

        # Remove degenerate boxes narrower or shorter than 1/300 of the image.
        ws, hs = bboxes_in[:, 2] - bboxes_in[:, 0], bboxes_in[:, 3] - bboxes_in[:, 1]
        keep = (ws >= 1 / 300) & (hs >= 1 / 300)
        keep = torch.where(keep)[0]
        bboxes_in, scores_in, labels = bboxes_in[keep], scores_in[keep], labels[keep]

        # Class-aware non-maximum suppression; indices come back sorted by score.
        keep = batched_nms(bboxes_in, scores_in, labels, iou_threshold=criteria)

        # Keep only the top-scoring `num_output` detections.
        keep = keep[:num_output]
        bboxes_out = bboxes_in[keep, :]
        scores_out = scores_in[keep]
        labels_out = labels[keep]

        return bboxes_out, labels_out, scores_out

    def forward(self, bboxes_in, scores_in):
        """Return one ``(boxes, labels, scores)`` tuple per image in the batch."""
        # Decode boxes and softmax the scores for the whole batch at once.
        bboxes, probs = self.scale_back_batch(bboxes_in, scores_in)

        outputs = torch.jit.annotate(List[Tuple[Tensor, Tensor, Tensor]], [])
        # Walk over the images: each split is [1, num_anchors, ...].
        for bbox, prob in zip(bboxes.split(1, 0), probs.split(1, 0)):  # split_size, split_dim
            bbox = bbox.squeeze(0)
            prob = prob.squeeze(0)
            outputs.append(self.decode_single_new(bbox, prob, self.criteria, self.max_output))
        return outputs

In [8]:
# src/ssd_model.py

import torch
from torch import nn, Tensor
from torch.jit.annotations import List

from torchvision import models

# from .res50_backbone import resnet50
# from .utils import dboxes300_coco, Encoder, PostProcess


class Backbone(nn.Module):
    """
    ResNet-50 feature extractor used by the detector.

    Only the first seven children of torchvision's ResNet-50 are kept, and the stride
    of the first block of the last kept stage is changed from 2 to 1.

    Args:
        pretrain_path: optional path to a ResNet-50 ``state_dict`` checkpoint. When it is
            ``None`` the torchvision pretrained weights are used instead.
    """

    def __init__(self, pretrain_path=None):
        super(Backbone, self).__init__()

        # Fetch the torchvision pretrained weights only when no local checkpoint is
        # supplied; otherwise they would be downloaded just to be overwritten below.
        net = models.resnet50(pretrained=pretrain_path is None, progress=True)

        # Channel counts of the six detection feature maps consumed by the SSD head.
        self.out_channels = [2048, 1024, 1024, 512, 512, 512]

        # Load user-provided weights. map_location="cpu" lets a checkpoint that was
        # saved from a GPU be restored on a CPU-only machine (`net` lives on CPU here).
        if pretrain_path is not None:
            net.load_state_dict(torch.load(pretrain_path, map_location="cpu"))

        # Keep only the feature-extraction part (first seven child modules).
        self.feature_extractor = nn.Sequential(*list(net.children())[:7])

        # First block of the last kept stage: change its stride from 2 to 1 so this
        # stage no longer down-samples.
        conv4_block1 = self.feature_extractor[-1][0]
        conv4_block1.conv1.stride = (1, 1)
        conv4_block1.conv2.stride = (1, 1)
        conv4_block1.downsample[0].stride = (1, 1)

    def forward(self, x):
        """Run the truncated ResNet-50 on ``x`` and return its feature map."""
        x = self.feature_extractor(x)
        return x
    
# 主模型
class SSD300(nn.Module):
    """
    SSD-style detector with residual shortcuts between its extra feature layers and an
    additional 2-way classification head (``belt_pos``, "task 2") on the last feature map.

    In training mode ``forward`` returns ``{"total_losses": loss}``. Otherwise it returns a
    dict holding the post-processed detections (key ``PREDICT_RESULT_TASK1``) and the raw
    task-2 logits (key ``PREDICT_RESULT_TASK2``).
    """

    def __init__(self, backbone=None, num_classes=21):
        """
        Args:
            backbone: feature extractor exposing ``out_channels``, the channel counts of
                the six detection feature maps.
            num_classes: number of detection classes predicted per default box.

        Raises:
            ValueError: if ``backbone`` is ``None``.
            AttributeError: if ``backbone`` has no ``out_channels`` attribute.
        """
        super(SSD300, self).__init__()

        # Argument validation (both exception types are still subclasses of Exception).
        if backbone is None:
            raise ValueError("backbone is None")

        if not hasattr(backbone, "out_channels"):
            raise AttributeError("the backbone not has attribute: out_channel")

        self.feature_extractor = backbone

        self.num_classes = num_classes

        # Stride-2 conv that halves the backbone map and widens it from 1024 to 2048 channels.
        self.conv = nn.Conv2d(in_channels=1024, out_channels=2048, kernel_size=3, padding=1, stride=2)

        # Extra feature layers and their shortcut projections.
        # out_channels = [2048, 1024, 1024, 512, 512, 512] for resnet50
        self._build_additional_features(self.feature_extractor.out_channels)

        # Number of default boxes per feature-map cell, for each detection level.
        self.num_defaults = [4, 6, 6, 6, 4, 4]

        # Task-2 head: 1x1 conv producing two logits from the last (512-channel) feature map.
        self.belt_pos = nn.Conv2d(in_channels=512, out_channels=2, kernel_size=1, padding=0)

        location_extractors = []
        confidence_extractors = []

        # nd = number of default boxes per cell, oc = channels of that feature map.
        # The heads are created pairwise so the module construction order stays unchanged.
        for nd, oc in zip(self.num_defaults, self.feature_extractor.out_channels):
            location_extractors.append(nn.Conv2d(oc, nd * 4, kernel_size=3, padding=1))
            confidence_extractors.append(nn.Conv2d(oc, nd * self.num_classes, kernel_size=3, padding=1))

        self.loc = nn.ModuleList(location_extractors)
        self.conf = nn.ModuleList(confidence_extractors)
        self._init_weights()

        # Default-box (anchor) generation strategy shared by the three helpers below.
        default_box = dboxes300_coco()

        self.compute_loss = Loss(default_box)
        self.encoder = Encoder(default_box)
        self.postprocess = PostProcess(default_box)

    def _build_additional_features(self, input_size):
        """
        Create the extra convolutional blocks stacked on top of the backbone, together
        with the 1x1 shortcut convolutions used by ``forward``.

        :param input_size: channel counts of the six detection feature maps,
            ``[2048, 1024, 1024, 512, 512, 512]`` for resnet50
        :return: None; sets ``self.additional_blocks`` and ``self.shortcut_blocks``
        """
        additional_blocks = []
        # Per block (values for resnet50):
        #   input channels  input_size[:-1] : [2048, 1024, 1024, 512, 512]
        #   middle channels                 : [512,  512,  256,  256, 256]
        #   output channels input_size[1:]  : [1024, 1024, 512,  512, 512]
        #   input spatial size (per the original notes): [38, 19, 10, 5, 3]
        middle_channels = [512, 512, 256, 256, 256]
        for i, (input_ch, output_ch, middle_ch) in enumerate(zip(input_size[:-1], input_size[1:], middle_channels)):
            # The first three blocks down-sample with stride 2; the last two shrink the
            # map by using an unpadded 3x3 convolution instead.
            padding, stride = (1, 2) if i < 3 else (0, 1)
            layer = nn.Sequential(
                nn.Conv2d(input_ch, middle_ch, kernel_size=1, bias=False),
                nn.BatchNorm2d(middle_ch),
                nn.ReLU(inplace=True),
                nn.Conv2d(middle_ch, output_ch, kernel_size=3, padding=padding, stride=stride, bias=False),
                nn.BatchNorm2d(output_ch),
                # No trailing ReLU: forward() applies the activation itself.
            )
            additional_blocks.append(layer)

        self.additional_blocks = nn.ModuleList(additional_blocks)

        # 1x1 strided projections used as shortcuts in forward(). Shapes below follow the
        # original notes for a 600x600 input (TODO confirm the 76x76 backbone output).
        shortcut_blocks = [
            # [b, 1024, 76, 76] -> [b, 1024, 19, 19]
            nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=1, stride=4),
            # [b, 2048, 38, 38] -> [b, 1024, 10, 10]
            nn.Conv2d(in_channels=2048, out_channels=1024, kernel_size=1, stride=4),
            # [b, 1024, 19, 19] -> [b, 512, 5, 5]
            nn.Conv2d(in_channels=1024, out_channels=512, kernel_size=1, stride=4),
            # [b, 1024, 10, 10] -> [b, 512, 3, 3]
            nn.Conv2d(in_channels=1024, out_channels=512, kernel_size=1, stride=4),
            # [b, 512, 5, 5] -> [b, 512, 3, 3]
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=1, stride=2)
        ]

        self.shortcut_blocks = nn.ModuleList(shortcut_blocks)

    def _init_weights(self):
        """
        Xavier-uniform initialise every weight tensor with more than one dimension in the
        layers added on top of the backbone (biases and BatchNorm vectors are skipped).
        """
        layers = [self.conv, *self.additional_blocks, *self.shortcut_blocks, *self.loc, *self.conf]
        for layer in layers:
            for param in layer.parameters():
                if param.dim() > 1:
                    nn.init.xavier_uniform_(param)

    # Shape the classifier to the view of bboxes
    def bbox_view(self, features, loc_extractor, conf_extractor):
        """
        Apply the per-level heads and concatenate their outputs over all default boxes.

        Returns ``(locs, confs)`` shaped ``[batch, 4, total_boxes]`` and
        ``[batch, num_classes, total_boxes]``.
        """
        locs = []
        confs = []
        for f, l, c in zip(features, loc_extractor, conf_extractor):
            # [batch, n*4, feat_size, feat_size] -> [batch, 4, -1]
            locs.append(l(f).view(f.size(0), 4, -1))
            # [batch, n*classes, feat_size, feat_size] -> [batch, classes, -1]
            confs.append(c(f).view(f.size(0), self.num_classes, -1))

        locs, confs = torch.cat(locs, 2).contiguous(), torch.cat(confs, 2).contiguous()
        return locs, confs

    def forward(self, eopch, image, targets=None):
        """
        Args:
            eopch: currently unused; the (misspelled) name is kept so existing callers
                keep working.
            image: input batch; the original notes assume ``[b, 3, 600, 600]``.
            targets: required in training mode; a dict read with the keys ``'boxes'``,
                ``'labels'`` and ``'task2_label'``.

        Returns:
            ``{"total_losses": loss}`` in training mode, otherwise a dict with the
            post-processed detections and the task-2 logits.

        Raises:
            ValueError: in training mode when ``targets`` is ``None``.
        """
        # Backbone features (per the original notes: [b, 1024, 76, 76]).
        x = self.feature_extractor(image)

        # shortcuts[k] is added to the input of additional_blocks[k + 1].
        shortcuts = []
        shortcuts.append(self.shortcut_blocks[0](x))

        # [b, 1024, H, W] -> [b, 2048, H/2, W/2]
        x = self.conv(x)

        # Feature maps handed to the detection heads, starting with the 2048-channel map.
        detection_features = torch.jit.annotate(List[Tensor], [])
        detection_features.append(x)

        relu_inplace = nn.ReLU(inplace=True)

        additional_blocks_count = len(self.additional_blocks)
        for i in range(additional_blocks_count):
            # Project the current map for use as a shortcut two blocks further on.
            if i < 4:
                shortcuts.append(self.shortcut_blocks[i + 1](x))

            if i >= 1:
                # Residual-style input: `x + shortcut` creates a new tensor, so the
                # in-place ReLU does not modify the map already stored in
                # detection_features.
                x = self.additional_blocks[i](relu_inplace(x + shortcuts[i - 1]))
                if i == additional_blocks_count - 1:
                    x = relu_inplace(x)
            else:
                x = relu_inplace(self.additional_blocks[i](x))

            detection_features.append(x)

        # Task-2 logits from the last feature map: [b, 512, 1, 1] -> [b, 2, 1, 1] -> [b, 2]
        ptask2_label = self.belt_pos(x)
        ptask2_label = ptask2_label.reshape(-1, 2)

        # Heads on every detection level: 38x38x4, 19x19x6, 10x10x6, 5x5x6, 3x3x4, 1x1x4 boxes
        # 38x38x4 + 19x19x6 + 10x10x6 + 5x5x6 + 3x3x4 + 1x1x4 = 8732 default boxes
        locs, confs = self.bbox_view(detection_features, self.loc, self.conf)

        if self.training:
            if targets is None:
                raise ValueError("In training mode, targets should be passed")
            # targets['boxes'] is transposed on dims (1, 2) to match the layout of `locs`
            # (presumably [N, 8732, 4] -> [N, 4, 8732]; confirm against the data pipeline).
            bboxes_out = targets['boxes']
            bboxes_out = bboxes_out.transpose(1, 2).contiguous()
            labels_out = targets['labels']

            # Debug hook: self.show_train_effect(image, locs, confs, bboxes_out, labels_out)
            # can be called here to dump predictions for one random image.

            # Arguments: ploc, plabel, gloc, glabel, ptask2_label, gtask2_label
            loss = self.compute_loss(locs,
                                     confs,
                                     bboxes_out,
                                     labels_out,
                                     ptask2_label,
                                     targets['task2_label'])
            return {"total_losses": loss}

        # Inference: decode the regression output against the default boxes and run NMS.
        results = {}
        results[PREDICT_RESULT_TASK1] = self.postprocess(locs, confs)
        results[PREDICT_RESULT_TASK2] = ptask2_label
        return results

    def show_train_effect(self, imgs, locs_pred, confs_pred, bboxes_out, labels_out):
        """
        Debug helper: pick one random image of the batch, decode the predictions of its
        positive default boxes and save them next to the targets via ``save_anchor_pic``.

        All temporary tensors are created on the device of the inputs, so the helper does
        not depend on the notebook-level ``device`` variable.
        """
        batch_size = locs_pred.shape[0]
        # Pick one image of the current batch at random.
        choice = random.randint(0, batch_size - 1)
        # [N, 4, 8732], [N, C, 8732] -> [4, 8732], [C, 8732]
        locs_pred, confs_pred = locs_pred[choice], confs_pred[choice]
        # [N, 4, 8732], [N, 8732] -> [4, 8732], [8732]
        bboxes_out, labels_out = bboxes_out[choice], labels_out[choice]

        # [4, 8732] -> [8732, 4]
        locs_pred = locs_pred.permute(1, 0)
        # [C, 8732] -> [8732, C]
        confs_pred = confs_pred.permute(1, 0)
        # [4, 8732] -> [8732, 4]
        bboxes_out = bboxes_out.permute(1, 0)

        # Positive default boxes only (label > 0). Boolean indexing returns copies, so
        # the in-place edits below never touch the network outputs.
        true_flag = labels_out > 0

        # [8732, 4] -> [N', 4]
        locs_pred = locs_pred[true_flag]
        # [8732, C] -> [N', C]
        confs_pred = confs_pred[true_flag]

        # Highest raw class score per box and the index of that class.
        pred_scores, confs_pred = torch.max(input=confs_pred, dim=1)
        # [8732, 4] -> [N', 4]
        bboxes_out = bboxes_out[true_flag]
        # [8732] -> [N']
        labels_out = labels_out[true_flag]

        # Targets: (cx, cy, w, h) -> (xmin, ymin, xmax, ymax)
        bboxes = torch.zeros_like(bboxes_out)
        bboxes[:, 0] = bboxes_out[:, 0] - bboxes_out[:, 2] * 0.5
        bboxes[:, 1] = bboxes_out[:, 1] - bboxes_out[:, 3] * 0.5
        bboxes[:, 2] = bboxes_out[:, 0] + bboxes_out[:, 2] * 0.5
        bboxes[:, 3] = bboxes_out[:, 1] + bboxes_out[:, 3] * 0.5

        dboxes_xywh = dboxes300_coco()(order='xywh').to(device=locs_pred.device)
        dboxes_xywh = dboxes_xywh[true_flag]
        locs_pred[:, :2] = 0.1 * locs_pred[:, :2]   # predicted x, y regression parameters
        locs_pred[:, 2:] = 0.2 * locs_pred[:, 2:]   # predicted w, h regression parameters

        # Apply the predicted offsets to the default boxes to get the predicted boxes.
        locs_pred[:, :2] = locs_pred[:, :2] * dboxes_xywh[:, 2:] + dboxes_xywh[:, :2]
        locs_pred[:, 2:] = locs_pred[:, 2:].exp() * dboxes_xywh[:, 2:]

        # transform format to ltrb
        l = locs_pred[:, 0] - 0.5 * locs_pred[:, 2]
        t = locs_pred[:, 1] - 0.5 * locs_pred[:, 3]
        r = locs_pred[:, 0] + 0.5 * locs_pred[:, 2]
        b = locs_pred[:, 1] + 0.5 * locs_pred[:, 3]

        locs_pred[:, 0] = l  # xmin
        locs_pred[:, 1] = t  # ymin
        locs_pred[:, 2] = r  # xmax
        locs_pred[:, 3] = b  # ymax

        save_anchor_pic(image=imgs,
                        path="./output/train_pred.jpg",
                        bboxes=locs_pred.detach(),
                        bboxes_label=confs_pred.detach(),
                        bboxes_scores=pred_scores,
                        height=600,
                        width=600,
                        targets=bboxes,
                        targets_label=labels_out.detach(),
                        choice=choice)

class Loss(nn.Module):
    """
        Implements the loss as the sum of the followings:
        1. Confidence Loss: All labels, with hard negative mining
        2. Localization Loss: Only on positive labels
        3. Task-2 classification loss (cross entropy on the extra head)
        Suppose input dboxes has the shape 8732x4

        The result is ``0.5 * (loc + conf) + 0.5 * task2``.
    """

    def __init__(self, dboxes):
        super(Loss, self).__init__()
        # Two factor are from following links
        # http://jany.st/post/2017-11-05-single-shot-detector-ssd-from-scratch-in-tensorflow.html
        self.scale_xy = 1.0 / dboxes.scale_xy  # 10 with the default scale_xy of 0.1
        self.scale_wh = 1.0 / dboxes.scale_wh  # 5 with the default scale_wh of 0.2

        self.location_loss = nn.SmoothL1Loss(reduction='none')
        # [num_anchors, 4] -> [4, num_anchors] -> [1, 4, num_anchors]
        self.dboxes = nn.Parameter(dboxes(order="xywh").transpose(0, 1).unsqueeze(dim=0),
                                   requires_grad=False)

        self.confidence_loss = nn.CrossEntropyLoss(reduction='none')
        self.task2_loss = nn.CrossEntropyLoss()

    def _location_vec(self, loc):
        # type: (Tensor) -> Tensor
        """
        Generate Location Vectors: the regression targets of the ground-truth boxes
        relative to the default boxes.

        :param loc: ground-truth box matched to every default box, Nx4x8732 in xywh form
        :return: Nx4x8732 tensor of (scaled xy offsets, scaled log wh ratios)
        """
        gxy = self.scale_xy * (loc[:, :2, :] - self.dboxes[:, :2, :]) / self.dboxes[:, 2:, :]  # Nx2x8732
        gwh = self.scale_wh * (loc[:, 2:, :] / self.dboxes[:, 2:, :]).log()  # Nx2x8732
        return torch.cat((gxy, gwh), dim=1).contiguous()

    def forward(self, ploc, plabel, gloc, glabel, ptask2_label, gtask2_label):
        # type: (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) -> Tensor
        """
            ploc, plabel: Nx4x8732, Nxlabel_numx8732
                predicted location and labels

            gloc, glabel: Nx4x8732, Nx8732
                ground truth location and labels (label 0 = background)

            ptask2_label: Nx2 logits of the task-2 head
            gtask2_label: task-2 target in any form accepted by nn.CrossEntropyLoss
                (NOTE(review): the original note says Nx2 -- confirm against the dataset)

            returns: scalar tensor, 0.5 * (loc_loss + con_loss) + 0.5 * task2_loss
        """
        # Positive-sample mask, Tensor: [N, 8732]
        mask = torch.gt(glabel, 0)
        # Number of positive default boxes per image, Tensor: [N]
        pos_num = mask.sum(dim=1)

        # Regression targets of the ground truth, Tensor: [N, 4, 8732]
        vec_gd = self._location_vec(gloc)

        # Localisation loss: summed over the four coordinates, positives only.
        loc_loss = self.location_loss(ploc, vec_gd).sum(dim=1)  # Tensor: [N, 8732]
        loc_loss = (mask.float() * loc_loss).sum(dim=1)  # Tensor: [N]

        # Per-box classification loss, Tensor: [N, 8732]
        con = self.confidence_loss(plabel, glabel)

        # Hard negative mining: zero the positives so they are never picked as negatives.
        con_neg = con.clone()
        con_neg[mask] = 0.0
        # Indices of the boxes ordered by decreasing loss, Tensor: [N, 8732]
        _, con_idx = con_neg.sort(dim=1, descending=True)
        # Sorting those indices again yields, for every box in its original position, its
        # rank in the ordering above (rank 0 = largest loss). For example a row of losses
        # [0.0, 0.3, 0.12, 0.0] gives con_idx [1, 2, 0, 3] and con_rank [2, 0, 1, 3].
        _, con_rank = con_idx.sort(dim=1)

        # Use three times as many negatives as positives (hard negative mining in the
        # SSD paper), but never more than the total number of default boxes.
        neg_num = torch.clamp(3 * pos_num, max=mask.size(1)).unsqueeze(-1)
        neg_mask = torch.lt(con_rank, neg_num)  # Tensor [N, 8732]

        # Confidence loss = loss of the positives + loss of the selected negatives.
        con_loss = (con * (mask.float() + neg_mask.float())).sum(dim=1)  # Tensor [N]

        # Images without any positive box are excluded from the detection losses:
        # eg. pos_num [15, 3, 5, 0] -> num_mask [1.0, 1.0, 1.0, 0.0]
        num_mask = torch.gt(pos_num, 0).float()
        pos_num = pos_num.float().clamp(min=1e-6)  # guards against division by zero

        # Normalise by the number of positives and average over the batch.
        loc_loss = (loc_loss * num_mask / pos_num).mean(dim=0)
        con_loss = (con_loss * num_mask / pos_num).mean(dim=0)

        task2_loss = self.task2_loss(ptask2_label, gtask2_label)

        # Detection and task-2 losses are weighted equally.
        total_loss = 0.5 * (loc_loss + con_loss) + 0.5 * task2_loss

        return total_loss

In [9]:
# train_utils/distributed_utils.py

from collections import defaultdict, deque
import datetime
import pickle
import time
import errno
import os

import torch
import torch.distributed as dist


class SmoothedValue(object):
    """Keep the most recent values in a bounded window together with running totals.

    The window feeds ``median``, ``avg``, ``max``, ``value`` and ``all_hist_data``;
    ``global_avg`` is computed from the totals over every update.
    """

    def __init__(self, window_size=20, fmt=None):
        # Template used by __str__; it may reference any of the properties below.
        self.fmt = "{value:.4f} ({global_avg:.4f}) hist: {all_hist_data}" if fmt is None else fmt
        self.deque = deque(maxlen=window_size)  # bounded window: oldest entries fall out
        self.total = 0.0
        self.count = 0

    def update(self, value, n=1):
        """Record ``value`` once in the window and ``n`` times in the running totals."""
        self.deque.append(value)
        self.count += n
        self.total += value * n

    def synchronize_between_processes(self):
        """
        Sum ``count`` and ``total`` across processes.

        Warning: does not synchronize the deque!
        """
        if is_dist_avail_and_initialized():
            stats = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda")
            dist.barrier()
            dist.all_reduce(stats)
            synced_count, synced_total = stats.tolist()
            self.count = int(synced_count)
            self.total = synced_total

    @property
    def median(self):
        """Median of the values currently in the window."""
        return torch.tensor(list(self.deque)).median().item()

    @property
    def avg(self):
        """Mean of the values currently in the window (computed in float32)."""
        return torch.tensor(list(self.deque), dtype=torch.float32).mean().item()

    @property
    def global_avg(self):
        """Mean over all updates, weighted by ``n``."""
        return self.total / self.count

    @property
    def max(self):
        """Largest value currently in the window."""
        return max(self.deque)

    @property
    def value(self):
        """Most recently recorded value."""
        return self.deque[-1]

    @property
    def all_hist_data(self):
        """Window contents rendered as ``[v1 v2 ...]`` with two decimals each."""
        return "[" + " ".join("{:.2f}".format(item) for item in self.deque) + "]"

    def __str__(self):
        fields = {
            "median": self.median,
            "avg": self.avg,
            "global_avg": self.global_avg,
            "max": self.max,
            "value": self.value,
            "all_hist_data": self.all_hist_data,
        }
        return self.fmt.format(**fields)


def all_gather(data):
    """
    Gather an arbitrary picklable object from every rank.

    Args:
        data: any picklable object
    Returns:
        list[data]: one entry per rank; ``[data]`` when the world size is 1
    """
    world_size = get_world_size()
    if world_size == 1:
        return [data]

    # Serialise the object into a byte tensor on the GPU.
    payload = torch.ByteTensor(torch.ByteStorage.from_buffer(pickle.dumps(data))).to("cuda")

    # Exchange the payload length of every rank.
    local_size = torch.tensor([payload.numel()], device="cuda")
    gathered_sizes = [torch.tensor([0], device="cuda") for _ in range(world_size)]
    dist.all_gather(gathered_sizes, local_size)
    size_list = [int(entry.item()) for entry in gathered_sizes]
    max_size = max(size_list)

    # torch all_gather cannot gather tensors of different shapes, so every rank
    # pads its payload up to the longest one before the exchange.
    recv_buffers = [torch.empty((max_size,), dtype=torch.uint8, device="cuda") for _ in size_list]
    if local_size != max_size:
        pad = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda")
        payload = torch.cat((payload, pad), dim=0)
    dist.all_gather(recv_buffers, payload)

    # Strip the padding and deserialise each rank's bytes.
    # NOTE(review): pickle.loads executes whatever the peers sent; only safe among trusted ranks.
    return [
        pickle.loads(received.cpu().numpy().tobytes()[:size])
        for size, received in zip(size_list, recv_buffers)
    ]


def reduce_dict(input_dict, average=True):
    """
    Combine the values of a dictionary across all distributed processes.

    Args:
        input_dict (dict): all the values will be reduced; with two or more
            processes they are stacked with ``torch.stack`` first
        average (bool): whether to do average or sum (averaging divides the
            all-reduced values by the world size)
    Returns:
        dict: ``input_dict`` itself when fewer than two processes are running,
        otherwise a new dict with the same keys holding the reduced values.
    """
    num_processes = get_world_size()
    if num_processes < 2:
        # single-GPU case: nothing to reduce
        return input_dict

    # multi-GPU case
    with torch.no_grad():
        # sort the keys so that every process stacks the values in the same order
        ordered_keys = sorted(input_dict.keys())
        stacked = torch.stack([input_dict[key] for key in ordered_keys], dim=0)
        dist.all_reduce(stacked)
        if average:
            stacked /= num_processes

        return dict(zip(ordered_keys, stacked))


class MetricLogger(object):
    """Keeps a set of named ``SmoothedValue`` meters and prints progress lines.

    Meters are created on first use (``self.meters`` is a ``defaultdict``) and
    can be read back as attributes, e.g. ``logger.loss``.
    """

    def __init__(self, delimiter="\t"):
        """
        Args:
            delimiter (str): string placed between the fields of a log line
        """
        self.meters = defaultdict(SmoothedValue)
        self.delimiter = delimiter

    def update(self, **kwargs):
        """Feed one new value into each named meter.

        Tensor values are converted with ``.item()``; every value must end up
        being a ``float`` or an ``int``.
        """
        for k, v in kwargs.items():
            if isinstance(v, torch.Tensor):
                v = v.item()
            assert isinstance(v, (float, int))
            self.meters[k].update(v)

    def __getattr__(self, attr):
        """Expose meters as attributes (only called when normal lookup fails).

        Raises:
            AttributeError: if ``attr`` is neither a meter nor an instance attribute
        """
        # Go through __dict__ rather than ``self.meters``: on an instance that
        # has no ``meters`` yet (e.g. while being copied or unpickled) the
        # attribute access would re-enter __getattr__ and recurse forever.
        meters = self.__dict__.get("meters")
        if meters is not None and attr in meters:
            return meters[attr]
        if attr in self.__dict__:
            return self.__dict__[attr]
        raise AttributeError("'{}' object has no attribute '{}'".format(
            type(self).__name__, attr))

    def __str__(self):
        """Return all meters as ``name: value`` pairs joined by the delimiter."""
        loss_str = []
        for name, meter in self.meters.items():
            loss_str.append(
                "{}: {}".format(name, str(meter))
            )
        return self.delimiter.join(loss_str)

    def synchronize_between_processes(self):
        """Ask every meter to synchronize itself between processes."""
        for meter in self.meters.values():
            meter.synchronize_between_processes()

    def add_meter(self, name, meter):
        """Register ``meter`` under ``name``, replacing any existing one."""
        self.meters[name] = meter

    def log_every(self, iterable, print_freq, header=None):
        """Yield the items of ``iterable`` while periodically printing progress.

        A line is printed every ``print_freq`` items and for the last item; a
        total-time summary is printed once the iterable is exhausted.

        Args:
            iterable: a sized iterable (``len()`` is used for ETA and formatting)
            print_freq (int): number of items between two progress lines
            header (str, optional): text printed at the start of each line
        Yields:
            the items of ``iterable``, unchanged and in order
        """
        if not header:
            header = ""
        total = len(iterable)
        cuda_available = torch.cuda.is_available()
        start_time = time.time()
        end = time.time()
        iter_time = SmoothedValue(fmt='{avg:.4f}')
        data_time = SmoothedValue(fmt='{avg:.4f}')
        # pad the running index to the width of the total count
        space_fmt = ":" + str(len(str(total))) + "d"
        fields = [header,
                  '[{0' + space_fmt + '}/{1}]',
                  'eta: {eta}',
                  '{meters}',
                  'time: {time}',
                  'data: {data}']
        if cuda_available:
            # peak GPU memory is only reported when CUDA is present
            fields.append('max mem: {memory:.0f}')
        log_msg = self.delimiter.join(fields)
        MB = 1024.0 * 1024.0
        for i, obj in enumerate(iterable):
            # time spent waiting for the item vs. the whole iteration
            data_time.update(time.time() - end)
            yield obj
            iter_time.update(time.time() - end)
            if i % print_freq == 0 or i == total - 1:
                eta_second = iter_time.global_avg * (total - i)
                eta_string = str(datetime.timedelta(seconds=eta_second))
                fmt_kwargs = dict(eta=eta_string,
                                  meters=str(self),
                                  time=str(iter_time),
                                  data=str(data_time))
                if cuda_available:
                    fmt_kwargs["memory"] = torch.cuda.max_memory_allocated() / MB
                print(log_msg.format(i, total, **fmt_kwargs))
            end = time.time()
        total_time = time.time() - start_time
        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
        # an empty iterable has no per-iteration time; avoid dividing by zero
        seconds_per_iter = total_time / total if total else 0.0
        print('{} Total time: {} ({:.4f} s / it)'.format(header,
                                                         total_time_str,
                                                         seconds_per_iter))


def warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor):
    """Create a ``LambdaLR`` scheduler whose multiplier ramps up linearly.

    Args:
        optimizer: optimizer whose learning rate is scaled
        warmup_iters: number of steps over which the multiplier grows
        warmup_factor: multiplier used at step 0
    Returns:
        torch.optim.lr_scheduler.LambdaLR: scheduler applying the multiplier
    """

    def lr_multiplier(step):
        """Return the learning-rate multiplier for the given step count."""
        if step < warmup_iters:
            progress = float(step) / warmup_iters
            # the multiplier moves from warmup_factor to 1 as progress goes 0 -> 1
            return warmup_factor * (1 - progress) + progress
        # once warmup_iters steps have passed the multiplier stays at 1
        return 1

    return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_multiplier)


def mkdir(path):
    """Create ``path`` and any missing parent directories.

    An already existing directory is not an error. Unlike a plain "ignore
    EEXIST" check, an existing *non-directory* at ``path`` is reported instead
    of being silently treated as success.

    Args:
        path: directory to create
    Raises:
        OSError: if the directory cannot be created (``FileExistsError`` when
            ``path`` exists but is not a directory)
    """
    os.makedirs(path, exist_ok=True)


def setup_for_distributed(is_master):
    """
    Replace the built-in ``print`` with a wrapper that stays silent unless
    ``is_master`` is true.

    Any call can still force output by passing ``force=True``; that keyword is
    removed before the original ``print`` is invoked.
    """
    import builtins

    original_print = builtins.print

    def print(*args, **kwargs):
        forced = kwargs.pop('force', False)
        if not (is_master or forced):
            return
        original_print(*args, **kwargs)

    builtins.print = print


def is_dist_avail_and_initialized():
    """Return True only if torch.distributed is both available and initialized."""
    return dist.is_available() and dist.is_initialized()


def get_world_size():
    """Return the distributed world size, or 1 when distributed mode is not active."""
    return dist.get_world_size() if is_dist_avail_and_initialized() else 1


def get_rank():
    """Return this process's distributed rank, or 0 when distributed mode is not active."""
    return dist.get_rank() if is_dist_avail_and_initialized() else 0


def is_main_process():
    """Return True when the rank reported by ``get_rank`` is 0."""
    current_rank = get_rank()
    return current_rank == 0


def save_on_master(*args, **kwargs):
    """Forward all arguments to ``torch.save``, but only on the main process."""
    if not is_main_process():
        return
    torch.save(*args, **kwargs)


def init_distributed_mode(args):
    """Configure ``args`` for distributed training and start the process group.

    The launch environment is detected from environment variables:
    ``RANK``/``WORLD_SIZE`` (together with ``LOCAL_RANK``) or ``SLURM_PROCID``.
    When neither is present, ``args.distributed`` is set to False and nothing
    else happens. Otherwise the CUDA device is selected, the NCCL process
    group is initialised from ``args.dist_url`` / ``args.world_size`` /
    ``args.rank``, all ranks synchronise at a barrier and printing is
    restricted to rank 0.

    Args:
        args: mutable namespace; ``rank``, ``gpu``, ``distributed`` and
            ``dist_backend`` (and ``world_size`` in the RANK/WORLD_SIZE case)
            are written to it
    """
    env = os.environ
    launched_with_rank_env = 'RANK' in env and 'WORLD_SIZE' in env

    if launched_with_rank_env:
        args.rank = int(env["RANK"])
        args.world_size = int(env['WORLD_SIZE'])
        args.gpu = int(env['LOCAL_RANK'])
    elif 'SLURM_PROCID' in env:
        args.rank = int(env['SLURM_PROCID'])
        # spread the ranks over the GPUs visible on this node
        args.gpu = args.rank % torch.cuda.device_count()
    else:
        print('Not using distributed mode')
        args.distributed = False
        return

    args.distributed = True
    torch.cuda.set_device(args.gpu)
    args.dist_backend = 'nccl'

    init_message = '| distributed init (rank {}): {}'.format(args.rank, args.dist_url)
    print(init_message, flush=True)
    torch.distributed.init_process_group(
        backend=args.dist_backend,
        init_method=args.dist_url,
        world_size=args.world_size,
        rank=args.rank,
    )
    torch.distributed.barrier()
    setup_for_distributed(args.rank == 0)

In [10]:
# train_utils/coco_eval.py
import json
import copy
from collections import defaultdict

import numpy as np
import torch

from pycocotools.cocoeval import COCOeval
from pycocotools.coco import COCO
import pycocotools.mask as mask_util

# from train_utils.distributed_utils import all_gather


class CocoEvaluator(object):
    """Accumulates per-image COCO evaluation results for one or more IoU types.

    One ``COCOeval`` object is kept per IoU type. Predictions are fed in with
    ``update``; ``synchronize_between_processes``, ``accumulate`` and
    ``summarize`` then produce the final metrics.
    """

    def __init__(self, coco_gt, iou_types):
        """
        Args:
            coco_gt: ground-truth COCO api object (a deep copy is stored)
            iou_types (list | tuple): IoU type names to evaluate
        """
        assert isinstance(iou_types, (list, tuple))
        self.coco_gt = copy.deepcopy(coco_gt)
        self.iou_types = iou_types
        self.coco_eval = {
            iou_type: COCOeval(self.coco_gt, iouType=iou_type)
            for iou_type in iou_types
        }

        self.img_ids = []
        self.eval_imgs = {iou_type: [] for iou_type in iou_types}

    def update(self, predictions):
        """Evaluate a batch of predictions and store the per-image results.

        Args:
            predictions (dict): maps an image id to that image's prediction dict
        """
        img_ids = list(np.unique(list(predictions.keys())))
        self.img_ids.extend(img_ids)

        for iou_type in self.iou_types:
            results = self.prepare(predictions, iou_type)
            if results:
                coco_dt = loadRes(self.coco_gt, results)
            else:
                # no detections in this batch: evaluate against an empty result set
                coco_dt = COCO()

            evaluator = self.coco_eval[iou_type]
            evaluator.cocoDt = coco_dt
            evaluator.params.imgIds = list(img_ids)
            # img_ids is rebound to the ids returned by the evaluation
            img_ids, per_image_results = evaluate_inner(evaluator)

            self.eval_imgs[iou_type].append(per_image_results)

    def synchronize_between_processes(self):
        """Concatenate the stored results along axis 2 and load them into each COCOeval."""
        for iou_type in self.iou_types:
            stacked = np.concatenate(self.eval_imgs[iou_type], 2)
            self.eval_imgs[iou_type] = stacked
            create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, stacked)

    def accumulate(self):
        """Call ``accumulate`` on every COCOeval object."""
        for evaluator in self.coco_eval.values():
            evaluator.accumulate()

    def summarize(self):
        """Print the metric summary of every IoU type."""
        for iou_type, evaluator in self.coco_eval.items():
            print("IoU metric: {}".format(iou_type))
            evaluator.summarize()

    def prepare(self, predictions, iou_type):
        """Convert predictions to COCO result dicts for the given IoU type.

        Raises:
            ValueError: if ``iou_type`` is not "bbox", "segm" or "keypoints"
        """
        if iou_type == "bbox":
            return self.prepare_for_coco_detection(predictions)
        if iou_type == "segm":
            return self.prepare_for_coco_segmentation(predictions)
        if iou_type == "keypoints":
            return self.prepare_for_coco_keypoint(predictions)
        raise ValueError("Unknown iou type {}".format(iou_type))

    def prepare_for_coco_detection(self, predictions):
        """Build one COCO bbox result dict per predicted box."""
        coco_results = []
        for original_id, prediction in predictions.items():
            if len(prediction) == 0:
                continue

            # boxes arrive as (xmin, ymin, xmax, ymax) and are stored as (x, y, w, h)
            xywh_boxes = convert_to_xywh(prediction["boxes"]).tolist()
            scores = prediction["scores"].tolist()
            labels = prediction["labels"].tolist()

            for k, box in enumerate(xywh_boxes):
                coco_results.append({
                    "image_id": original_id,
                    "category_id": labels[k],
                    "bbox": box,
                    "score": scores[k],
                })
        return coco_results

    def prepare_for_coco_segmentation(self, predictions):
        """Build one COCO segmentation result dict (RLE mask) per predicted mask."""
        coco_results = []
        for original_id, prediction in predictions.items():
            if len(prediction) == 0:
                continue

            scores = prediction["scores"]
            labels = prediction["labels"]
            # binarise the masks at 0.5
            binary_masks = prediction["masks"] > 0.5
            scores = scores.tolist()
            labels = labels.tolist()

            rles = []
            for mask in binary_masks:
                fortran_mask = np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F")
                rle = mask_util.encode(fortran_mask)[0]
                # store the RLE counts as text instead of bytes
                rle["counts"] = rle["counts"].decode("utf-8")
                rles.append(rle)

            for k, rle in enumerate(rles):
                coco_results.append({
                    "image_id": original_id,
                    "category_id": labels[k],
                    "segmentation": rle,
                    "score": scores[k],
                })
        return coco_results

    def prepare_for_coco_keypoint(self, predictions):
        """Build one COCO keypoint result dict per predicted instance."""
        coco_results = []
        for original_id, prediction in predictions.items():
            if len(prediction) == 0:
                continue

            # the converted boxes are computed but not part of the result dicts
            boxes = convert_to_xywh(prediction["boxes"]).tolist()
            scores = prediction["scores"].tolist()
            labels = prediction["labels"].tolist()
            flat_keypoints = prediction["keypoints"].flatten(start_dim=1).tolist()

            for k, keypoint in enumerate(flat_keypoints):
                coco_results.append({
                    "image_id": original_id,
                    "category_id": labels[k],
                    'keypoints': keypoint,
                    "score": scores[k],
                })
        return coco_results


def convert_to_xywh(boxes):
    """Convert boxes from (xmin, ymin, xmax, ymax) columns to (xmin, ymin, width, height).

    Args:
        boxes: tensor whose dimension 1 holds exactly the four box coordinates
    Returns:
        a new tensor stacked along dimension 1
    """
    left, top, right, bottom = boxes.unbind(1)
    width = right - left
    height = bottom - top
    return torch.stack((left, top, width, height), dim=1)


def merge(img_ids, eval_imgs):
    """Gather image ids and evaluation results from all ranks and de-duplicate them.

    Args:
        img_ids: image ids evaluated by this process
        eval_imgs: this process's evaluation array; axis 2 indexes the images
    Returns:
        tuple: (sorted unique image ids, evaluation array restricted along its
        last axis to the first occurrence of each id)
    """
    gathered_ids = all_gather(img_ids)
    gathered_evals = all_gather(eval_imgs)

    flat_ids = [img_id for rank_ids in gathered_ids for img_id in rank_ids]
    id_array = np.array(flat_ids)
    eval_array = np.concatenate(list(gathered_evals), 2)

    # keep only unique (and in sorted order) images
    unique_ids, first_positions = np.unique(id_array, return_index=True)
    return unique_ids, eval_array[..., first_positions]


def create_common_coco_eval(coco_eval, img_ids, eval_imgs):
    """Load merged evaluation results into ``coco_eval``.

    The ids and results are first combined with ``merge``; the flattened
    results become ``coco_eval.evalImgs``, the ids become
    ``coco_eval.params.imgIds`` and a deep copy of the params is stored as
    ``coco_eval._paramsEval``.
    """
    merged_ids, merged_evals = merge(img_ids, eval_imgs)

    coco_eval.evalImgs = list(merged_evals.flatten())
    coco_eval.params.imgIds = list(merged_ids)
    coco_eval._paramsEval = copy.deepcopy(coco_eval.params)


#################################################################
# From pycocotools, just removed the prints and fixed
# a Python3 bug about unicode not defined
#################################################################

# Ideally, pycocotools wouldn't have hard-coded prints
# so that we could avoid copy-pasting those two functions

def createIndex(self):
    """Build the lookup tables of a COCO-style object from ``self.dataset``.

    Sets ``self.anns`` (annotation id -> annotation), ``self.imgToAnns``
    (image id -> annotations), ``self.catToImgs`` (category id -> image ids),
    ``self.imgs`` (image id -> image) and ``self.cats`` (category id ->
    category). Sections missing from the dataset yield empty tables.
    """
    dataset = self.dataset
    has_annotations = 'annotations' in dataset

    ann_by_id = {}
    anns_by_image = defaultdict(list)
    images_by_category = defaultdict(list)

    if has_annotations:
        for ann in dataset['annotations']:
            anns_by_image[ann['image_id']].append(ann)
            ann_by_id[ann['id']] = ann

    img_by_id = {img['id']: img for img in dataset['images']} if 'images' in dataset else {}
    cat_by_id = {cat['id']: cat for cat in dataset['categories']} if 'categories' in dataset else {}

    # the category -> images table is only filled when categories are declared
    if has_annotations and 'categories' in dataset:
        for ann in dataset['annotations']:
            images_by_category[ann['category_id']].append(ann['image_id'])

    # create class members
    self.anns = ann_by_id
    self.imgToAnns = anns_by_image
    self.catToImgs = images_by_category
    self.imgs = img_by_id
    self.cats = cat_by_id


# Second name for the pycocotools mask module; loadRes refers to it as `maskUtils`.
maskUtils = mask_util


def loadRes(self, resFile):
    """
    Load results and return a result api object.

    Each result annotation is completed in place depending on its kind
    (decided from the first annotation): captions only get an ``id``; boxes
    get ``segmentation`` (if missing), ``area``, ``id`` and ``iscrowd``;
    segmentations get ``area``, ``bbox`` (if missing), ``id`` and ``iscrowd``;
    keypoints get ``area``, ``id`` and ``bbox``. An empty result list is
    accepted and produces a result object without annotations.

    :param   self              : ground-truth COCO api object the results belong to
    :param   resFile           : path of a JSON result file (str/bytes), a numpy
                                 array of results, or a list of result dicts
    :return: res (obj)         : result api object
    """
    res = COCO()
    res.dataset['images'] = [img for img in self.dataset['images']]

    if isinstance(resFile, (str, bytes)):
        # close the file deterministically instead of leaking the handle
        with open(resFile) as result_file:
            anns = json.load(result_file)
    elif isinstance(resFile, np.ndarray):
        anns = self.loadNumpyAnnotations(resFile)
    else:
        anns = resFile
    assert isinstance(anns, list), 'results in not an array of objects'
    annsImgIds = [ann['image_id'] for ann in anns]
    assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \
        'Results do not correspond to current coco set'

    # The first annotation decides how all of them are treated; with no
    # annotations at all none of the branches below applies.
    first = anns[0] if anns else {}
    if 'caption' in first:
        imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns])
        res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds]
        for id, ann in enumerate(anns):
            ann['id'] = id + 1
    elif 'bbox' in first and not first['bbox'] == []:
        res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
        for id, ann in enumerate(anns):
            bb = ann['bbox']  # (x, y, width, height)
            x1, x2, y1, y2 = [bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]]
            if 'segmentation' not in ann:
                # use the box outline as the segmentation polygon
                ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]]
            ann['area'] = bb[2] * bb[3]
            ann['id'] = id + 1
            ann['iscrowd'] = 0
    elif 'segmentation' in first:
        res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
        for id, ann in enumerate(anns):
            # now only support compressed RLE format as segmentation results
            ann['area'] = maskUtils.area(ann['segmentation'])
            if 'bbox' not in ann:
                ann['bbox'] = maskUtils.toBbox(ann['segmentation'])
            ann['id'] = id + 1
            ann['iscrowd'] = 0
    elif 'keypoints' in first:
        res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
        for id, ann in enumerate(anns):
            s = ann['keypoints']
            # keypoints are stored flat with a stride of 3: x at 0, y at 1
            x = s[0::3]
            y = s[1::3]
            x1, x2, y1, y2 = np.min(x), np.max(x), np.min(y), np.max(y)
            ann['area'] = (x2 - x1) * (y2 - y1)
            ann['id'] = id + 1
            ann['bbox'] = [x1, y1, x2 - x1, y2 - y1]

    res.dataset['annotations'] = anns
    createIndex(res)
    return res


def evaluate_inner(self):
    '''
    Run per image evaluation on the images listed in self.params.

    The params are normalised first (unique image/category ids, sorted
    maxDets), IoUs are stored in self.ious for every (image, category) pair and
    every (category, area range, image) combination is evaluated with the
    largest maxDets value.
    :return: (imgIds, evalImgs) where evalImgs is an array of shape
             (number of categories, number of area ranges, number of images)
    '''
    params = self.params
    # add backward compatibility if useSegm is specified in params
    if params.useSegm is not None:
        params.iouType = 'segm' if params.useSegm == 1 else 'bbox'
        print('useSegm (deprecated) is not None. Running {} evaluation'.format(params.iouType))
    params.imgIds = list(np.unique(params.imgIds))
    if params.useCats:
        params.catIds = list(np.unique(params.catIds))
    params.maxDets = sorted(params.maxDets)
    self.params = params

    self._prepare()
    # a single placeholder category is used when categories are ignored
    cat_ids = params.catIds if params.useCats else [-1]

    if params.iouType in ('segm', 'bbox'):
        computeIoU = self.computeIoU
    elif params.iouType == 'keypoints':
        computeIoU = self.computeOks

    ious = {}
    for img_id in params.imgIds:
        for cat_id in cat_ids:
            ious[(img_id, cat_id)] = computeIoU(img_id, cat_id)
    self.ious = ious

    evaluate_img = self.evaluateImg
    largest_max_det = params.maxDets[-1]
    # loop through category, area range, image (image varies fastest)
    per_image = []
    for cat_id in cat_ids:
        for area_rng in params.areaRng:
            for img_id in params.imgIds:
                per_image.append(evaluate_img(img_id, cat_id, area_rng, largest_max_det))

    # this is NOT in the pycocotools code, but could be done outside
    result_shape = (len(cat_ids), len(params.areaRng), len(params.imgIds))
    eval_imgs = np.asarray(per_image).reshape(*result_shape)
    self._paramsEval = copy.deepcopy(self.params)
    return params.imgIds, eval_imgs

#################################################################
# end of straight copy from pycocotools, just removing the prints
#################################################################

In [11]:
# train_utils/coco_utils.py

from tqdm import tqdm

import torch
import torchvision
import torch.utils.data
from pycocotools.coco import COCO


def convert_to_coco_api(ds):
    """Build a ``COCO`` api object from a dataset exposing ``coco_index``.

    For every index, ``ds.coco_index(i)`` supplies the targets; indices with
    empty targets are skipped. Boxes are converted from relative
    (xmin, ymin, xmax, ymax) to absolute (xmin, ymin, w, h) and areas are
    scaled to pixels.

    NOTE: the box tensor returned by ``coco_index`` is modified in place.

    Returns:
        COCO: api object with ``images``, ``categories`` and ``annotations``
        filled in and its index created
    """
    coco_ds = COCO()
    # annotation IDs need to start at 1, not 0
    next_ann_id = 1
    images = []
    annotations = []
    seen_labels = set()

    for img_idx in range(len(ds)):
        # find better way to get target
        targets = ds.coco_index(img_idx)
        if not targets:
            continue

        image_id = targets[TARGET_FIELD_IMAGE_ID].item()
        height = targets[TARGET_FIELD_HEIGHT_WIDTH][0]
        width = targets[TARGET_FIELD_HEIGHT_WIDTH][1]
        images.append({'id': image_id, 'height': height, 'width': width})

        # xmin, ymin, xmax, ymax
        bboxes = targets[TARGET_FIELD_TASK1_ANCHORS]
        # (xmin, ymin, xmax, ymax) to (xmin, ymin, w, h)
        bboxes[:, 2:] -= bboxes[:, :2]
        # convert the relative (0-1) box coordinates to absolute coordinates
        bboxes[:, [0, 2]] = bboxes[:, [0, 2]] * width
        bboxes[:, [1, 3]] = bboxes[:, [1, 3]] * height
        bboxes = bboxes.tolist()

        labels = targets[TARGET_FIELD_TASK1_LABELS].tolist()
        # the box areas must be scaled too, otherwise the
        # (small, medium, large) statistics are computed incorrectly
        areas = (targets[TARGET_FIELD_TASK1_AREA] * width * height).tolist()
        iscrowd = targets[TARGET_FIELD_TASK1_ISCROWD].tolist()

        for i, bbox in enumerate(bboxes):
            label = labels[i]
            seen_labels.add(label)
            annotations.append({
                'image_id': image_id,
                'bbox': bbox,
                'category_id': label,
                'area': areas[i],
                'iscrowd': iscrowd[i],
                'id': next_ann_id,
            })
            next_ann_id += 1

    coco_ds.dataset = {
        'images': images,
        'categories': [{'id': label} for label in sorted(seen_labels)],
        'annotations': annotations,
    }
    coco_ds.createIndex()
    return coco_ds


def get_coco_api_from_dataset(dataset):
    """Return a COCO api object for ``dataset``.

    Up to 10 levels of ``torch.utils.data.Subset`` wrapping are removed. If a
    ``torchvision.datasets.CocoDetection`` is found its ``coco`` attribute is
    returned; otherwise the dataset is converted with ``convert_to_coco_api``.
    """
    def is_coco_detection(candidate):
        return isinstance(candidate, torchvision.datasets.CocoDetection)

    remaining_unwraps = 10
    while remaining_unwraps > 0 and not is_coco_detection(dataset):
        if isinstance(dataset, torch.utils.data.Subset):
            dataset = dataset.dataset
        remaining_unwraps -= 1

    if is_coco_detection(dataset):
        return dataset.coco
    return convert_to_coco_api(dataset)


In [12]:
# group_by_aspect_ratio.py

import bisect
from collections import defaultdict
import copy
from itertools import repeat, chain
import math
import numpy as np

import torch
import torch.utils.data
from torch.utils.data.sampler import BatchSampler, Sampler
from torch.utils.model_zoo import tqdm
import torchvision

from PIL import Image


def _repeat_to_at_least(iterable, n):
    repeat_times = math.ceil(n / len(iterable))
    repeated = chain.from_iterable(repeat(iterable, repeat_times))
    return list(repeated)


class GroupedBatchSampler(BatchSampler):
    """
    Wraps another sampler to yield a mini-batch of indices.
    It enforces that the batch only contain elements from the same group.
    It also tries to provide mini-batches which follows an ordering which is
    as close as possible to the ordering from the original sampler.
    Arguments:
        sampler (Sampler): Base sampler.
        group_ids (list[int]): If the sampler produces indices in range [0, N),
            `group_ids` must be a list of `N` ints which contains the group id of each sample.
            The group ids must be a continuous set of integers starting from
            0, i.e. they must be in the range [0, num_groups).
        batch_size (int): Size of mini-batch.
    """
    def __init__(self, sampler, group_ids, batch_size):
        if not isinstance(sampler, Sampler):
            message = (
                "sampler should be an instance of "
                "torch.utils.data.Sampler, but got sampler={}".format(sampler)
            )
            raise ValueError(message)
        self.sampler = sampler
        self.group_ids = group_ids
        self.batch_size = batch_size

    def __iter__(self):
        """Yield lists of ``batch_size`` indices that all share one group id."""
        pending = defaultdict(list)   # indices waiting for their group's batch to fill
        history = defaultdict(list)   # every index seen so far, per group

        emitted = 0
        for idx in self.sampler:
            group = self.group_ids[idx]
            pending[group].append(idx)
            history[group].append(idx)
            if len(pending[group]) == self.batch_size:
                yield pending[group]
                emitted += 1
                del pending[group]
            # the lookup here re-creates an empty entry for a group that was
            # just flushed, so such groups still appear in `pending` below
            assert len(pending[group]) < self.batch_size

        # now we have run out of elements that satisfy
        # the group criteria, let's return the remaining
        # elements so that the size of the sampler is
        # deterministic
        shortfall = len(self) - emitted
        if shortfall > 0:
            # for the remaining batches, take first the buffers with largest number
            # of elements
            fullest_first = sorted(pending.items(),
                                   key=lambda item: len(item[1]), reverse=True)
            for group, _ in fullest_first:
                missing = self.batch_size - len(pending[group])
                # pad the partial batch with indices already seen for this group
                filler = _repeat_to_at_least(history[group], missing)
                pending[group].extend(filler[:missing])
                assert len(pending[group]) == self.batch_size
                yield pending[group]
                shortfall -= 1
                if shortfall == 0:
                    break
        assert shortfall == 0

    def __len__(self):
        """Number of batches: the sampler length floor-divided by the batch size."""
        return len(self.sampler) // self.batch_size


def _compute_aspect_ratios_slow(dataset, indices=None):
    print("Your dataset doesn't support the fast path for "
          "computing the aspect ratios, so will iterate over "
          "the full dataset and load every image instead. "
          "This might take some time...")
    if indices is None:
        indices = range(len(dataset))

    class SubsetSampler(Sampler):
        def __init__(self, indices):
            self.indices = indices

        def __iter__(self):
            return iter(self.indices)

        def __len__(self):
            return len(self.indices)

    sampler = SubsetSampler(indices)
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=1, sampler=sampler,
        num_workers=14,  # you might want to increase it for faster processing
        collate_fn=lambda x: x[0])
    aspect_ratios = []
    with tqdm(total=len(dataset)) as pbar:
        for _i, (img, _) in enumerate(data_loader):
            pbar.update(1)
            height, width = img.shape[-2:]
            aspect_ratio = float(width) / float(height)
            aspect_ratios.append(aspect_ratio)
    return aspect_ratios


def _compute_aspect_ratios_custom_dataset(dataset, indices=None):
    if indices is None:
        indices = range(len(dataset))
    aspect_ratios = []
    for i in indices:
        height, width = dataset.get_height_and_width(i)
        aspect_ratio = float(width) / float(height)
        aspect_ratios.append(aspect_ratio)
    return aspect_ratios


def _compute_aspect_ratios_coco_dataset(dataset, indices=None):
    if indices is None:
        indices = range(len(dataset))
    aspect_ratios = []
    for i in indices:
        img_info = dataset.coco.imgs[dataset.ids[i]]
        aspect_ratio = float(img_info["width"]) / float(img_info["height"])
        aspect_ratios.append(aspect_ratio)
    return aspect_ratios


def _compute_aspect_ratios_voc_dataset(dataset, indices=None):
    if indices is None:
        indices = range(len(dataset))
    aspect_ratios = []
    for i in indices:
        # this doesn't load the data into memory, because PIL loads it lazily
        width, height = Image.open(dataset.images[i]).size
        aspect_ratio = float(width) / float(height)
        aspect_ratios.append(aspect_ratio)
    return aspect_ratios


def _compute_aspect_ratios_subset_dataset(dataset, indices=None):
    if indices is None:
        indices = range(len(dataset))

    ds_indices = [dataset.indices[i] for i in indices]
    return compute_aspect_ratios(dataset.dataset, ds_indices)


def compute_aspect_ratios(dataset, indices=None):
    """Compute width/height ratios using the cheapest strategy the dataset supports.

    Checked in order: a ``get_height_and_width`` method, torchvision
    ``CocoDetection``, torchvision ``VOCDetection``, ``torch.utils.data.Subset``;
    anything else falls back to loading every image.

    Args:
        dataset: the dataset to inspect
        indices: positions to process; all of them when None
    """
    if hasattr(dataset, "get_height_and_width"):
        return _compute_aspect_ratios_custom_dataset(dataset, indices)

    typed_strategies = (
        (torchvision.datasets.CocoDetection, _compute_aspect_ratios_coco_dataset),
        (torchvision.datasets.VOCDetection, _compute_aspect_ratios_voc_dataset),
        (torch.utils.data.Subset, _compute_aspect_ratios_subset_dataset),
    )
    for dataset_type, strategy in typed_strategies:
        if isinstance(dataset, dataset_type):
            return strategy(dataset, indices)

    # slow path
    return _compute_aspect_ratios_slow(dataset, indices)


def _quantize(x, bins):
    bins = copy.deepcopy(bins)
    bins = sorted(bins)
    # bisect_right：寻找y元素按顺序应该排在bins中哪个元素的右边，返回的是索引
    quantized = list(map(lambda y: bisect.bisect_right(bins, y), x))
    return quantized


def create_aspect_ratio_groups(dataset, k=0):
    """Assign every sample of ``dataset`` to an aspect-ratio group.

    Args:
        dataset: dataset passed on to ``compute_aspect_ratios``
        k (int): with ``k > 0`` the bin edges are ``2 ** linspace(-1, 1, 2k + 1)``,
            i.e. 2k + 1 edges between 0.5 and 2; otherwise the single edge 1.0
    Returns:
        list[int]: the group (bin index) of each sample
    """
    # width / height ratio of every image in the dataset
    aspect_ratios = compute_aspect_ratios(dataset)

    # bin edges spanning [0.5, 2] (or just 1.0 when k is not positive)
    if k > 0:
        bins = (2 ** np.linspace(-1, 1, 2 * k + 1)).tolist()
    else:
        bins = [1.0]

    # position of every ratio within the bin edges
    groups = _quantize(aspect_ratios, bins)
    # count number of elements per group
    _, counts = np.unique(groups, return_counts=True)
    fbins = [0] + bins + [np.inf]
    print("Using {} as bins for aspect ratio quantization".format(fbins))
    print("Count of instances per bin: {}".format(counts))
    return groups

In [36]:
# train_utils/train_eval_utils.py

import math
import sys
import time

import torch
import torchvision
from torchvision.utils import save_image

# from train_utils import get_coco_api_from_dataset, CocoEvaluator
# import train_utils.distributed_utils as utils


def train_one_epoch(model, optimizer, data_loader, device, epoch,
                    print_freq=20, warmup=False, debug_grid_path="image.png"):
    """Train ``model`` for one pass over ``data_loader``.

    For every batch the images are stacked, the per-sample targets are merged
    into one dict of tensors, ``model(epoch, images, targets)`` is called and
    its ``"total_losses"`` entry is back-propagated. Training stops with
    ``sys.exit(1)`` if the (reduced) loss is not finite.

    Args:
        model: called as ``model(epoch, images, targets)``; must return a dict
            containing ``"total_losses"``
        optimizer: optimizer stepped once per batch
        data_loader: iterable of ``(images, targets)`` batches
        device: device the batch is moved to
        epoch (int): current epoch; warm-up is only applied when it is 0
        print_freq (int): number of iterations between progress lines
        warmup (bool): use a linear learning-rate warm-up during epoch 0
        debug_grid_path (str | None): file that receives an image grid of the
            current batch on every iteration; pass None to disable the dump
    Returns:
        tuple: (running mean of the loss as a one-element tensor, learning rate
        of the first parameter group after the last step)
    """
    model.train()
    metric_logger = MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', SmoothedValue(window_size=1, fmt='{value:.20f}'))
    header = 'Epoch: [{}]'.format(epoch)
    lr_scheduler = None
    if epoch == 0 and warmup is True:
        # warm-up is only used during the first epoch (epoch == 0)
        warmup_factor = 5.0 / 10000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    mloss = torch.zeros(1).to(device)  # mean losses
    # defined before the loop so the return value exists even if the loader
    # does not yield a single batch
    now_lr = optimizer.param_groups[0]["lr"]
    for i, [images, targets] in enumerate(metric_logger.log_every(data_loader, print_freq, header)):
        # batch inputs information
        images = torch.stack(images, dim=0)

        if debug_grid_path is not None:
            # debugging aid: overwrite the file with a grid of the current batch
            grid_image = torchvision.utils.make_grid(images, nrow=4, padding=5)
            save_image(grid_image, debug_grid_path)
            print("saving grid image")

        boxes = []
        labels = []
        img_id = []
        task2_label = []
        for t in targets:
            boxes.append(t[TARGET_FIELD_TASK1_ANCHORS])
            labels.append(t[TARGET_FIELD_TASK1_LABELS])
            img_id.append(t[TARGET_FIELD_IMAGE_ID])
            task2_label.append(t[TARGET_FIELD_TASK2_LABEL])
        targets = {"boxes": torch.stack(boxes, dim=0),
                   "labels": torch.stack(labels, dim=0),
                   "image_id": torch.as_tensor(img_id),
                   "task2_label": torch.as_tensor(task2_label)}

        images = images.to(device)

        targets = {k: v.to(device) for k, v in targets.items()}
        losses_dict = model(epoch, images, targets)
        losses = losses_dict["total_losses"]

        # reduce losses over all GPUs for logging purpose
        losses_dict_reduced = reduce_dict(losses_dict)
        losses_reduce = losses_dict_reduced["total_losses"]

        loss_value = losses_reduce.detach()
        # keep a running mean of the training loss
        mloss = (mloss * i + loss_value) / (i + 1)  # update mean losses

        if not math.isfinite(loss_value):
            # stop training when the loss is infinite or NaN
            print("Loss is {}, stopping training".format(loss_value))
            print(losses_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            # only set while warming up during the first epoch
            lr_scheduler.step()

        metric_logger.update(**losses_dict_reduced)
        now_lr = optimizer.param_groups[0]["lr"]
        metric_logger.update(lr=now_lr)

    return mloss, now_lr

@torch.no_grad()
def evaluate(epoch, model, data_loader, device, data_set=None):
    """Run the model over ``data_loader`` and report task-1 COCO stats and task-2 accuracy.

    Args:
        epoch: Current epoch number; forwarded to the model and used to decide
            whether a debug picture is written (every 5th epoch).
        model: Model called as ``model(epoch, images, targets=None)``; the result is
            indexed with ``PREDICT_RESULT_TASK1`` and ``PREDICT_RESULT_TASK2``.
        data_loader: Loader yielding ``(images, targets)`` batches.
        device: Device the image batch is moved to.
        data_set: Optional pre-built COCO API object; built from
            ``data_loader.dataset`` when omitted.

    Returns:
        list: ``stats`` of the COCO evaluator for the first IoU type.
    """
    cpu_device = torch.device("cpu")
    model.eval()
    metric_logger = MetricLogger(delimiter="  ")
    header = "Test: "

    if data_set is None:
        data_set = get_coco_api_from_dataset(data_loader.dataset)
    iou_types = _get_iou_types(model)
    coco_evaluator = CocoEvaluator(data_set, iou_types)

    task2_pred = []
    task2_target = []
    for images, targets in metric_logger.log_every(data_loader, 100, header):
        images = torch.stack(images, dim=0).to(device)

        if device != torch.device("cpu"):
            torch.cuda.synchronize(device)

        model_time = time.time()
        #  list((bboxes_out, labels_out, scores_out), ...)
        results = model(epoch, images, targets=None)
        task1_results = results[PREDICT_RESULT_TASK1]
        task2_results = results[PREDICT_RESULT_TASK2]

        task2_pred.extend(task2_results.argmax(dim=1).tolist())
        task2_target.extend([t[TARGET_FIELD_TASK2_LABEL].item() for t in targets])

        model_time = time.time() - model_time

        outputs = []
        for index, (bboxes_out, labels_out, scores_out) in enumerate(task1_results):
            # Convert relative box coordinates (0-1) to absolute (xmin, ymin, xmax, ymax).
            height_width = targets[index][TARGET_FIELD_HEIGHT_WIDTH]

            # Scale the ground-truth boxes back to the original image size.
            gt_boxes = targets[index][TARGET_FIELD_TASK1_ANCHORS]

            gt_boxes[:, [0, 2]] = gt_boxes[:, [0, 2]] * height_width[1]
            gt_boxes[:, [1, 3]] = gt_boxes[:, [1, 3]] * height_width[0]

            targets[index][TARGET_FIELD_TASK1_ANCHORS] = gt_boxes

            bboxes_out[:, [0, 2]] = bboxes_out[:, [0, 2]] * height_width[1]
            bboxes_out[:, [1, 3]] = bboxes_out[:, [1, 3]] * height_width[0]

            info = {"boxes": bboxes_out.to(cpu_device),
                    "labels": labels_out.to(cpu_device),
                    "scores": scores_out.to(cpu_device)}
            outputs.append(info)

        res = {target[TARGET_FIELD_IMAGE_ID].item(): output for target, output in zip(targets, outputs)}

        # ---- debug output for one randomly chosen image of the batch ----
        choice = random.randint(0, images.shape[0] - 1)

        height_width = targets[choice][TARGET_FIELD_HEIGHT_WIDTH]

        result = task1_results[choice]
        # Work on a copy: when everything already lives on the CPU, ``.to(cpu_device)``
        # above returns the very same tensor, so normalising in place would silently
        # un-scale the boxes stored in ``res`` before they reach the COCO evaluator.
        bboxes = result[0].clone()
        bboxes[:, [0, 2]] = bboxes[:, [0, 2]] / height_width[1]
        bboxes[:, [1, 3]] = bboxes[:, [1, 3]] / height_width[0]

        labels = result[1]

        # Use ``choice`` (not the stale loop variable ``index``) so the ground truth
        # shown belongs to the same image as the predictions and ``height_width``.
        gt_boxes = targets[choice][TARGET_FIELD_TASK1_ANCHORS].clone()
        gt_boxes[:, [0, 2]] = gt_boxes[:, [0, 2]] / height_width[1]
        gt_boxes[:, [1, 3]] = gt_boxes[:, [1, 3]] / height_width[0]

        gt_labels = targets[choice][TARGET_FIELD_TASK1_LABELS]
        filter_cond = gt_labels > 0
        print(f"evaluate: gtboxes:{gt_boxes[filter_cond]}, gt_labels:{gt_labels[filter_cond]}")
        print(f"evaluate: bboxes:{bboxes[:5].detach()}, labels:{labels[:5].detach()}")

        if epoch % 5 == 0:
            # Predictions and ground truth are both passed in normalised (0-1) form.
            save_anchor_pic(image=images, path="./output/evaluate_pred.jpg", bboxes=bboxes,
                            bboxes_label=labels, height=600, width=600,
                            targets=gt_boxes, targets_label=gt_labels,
                            choice=choice)

        evaluator_time = time.time()
        coco_evaluator.update(res)
        evaluator_time = time.time() - evaluator_time
        metric_logger.update(model_time=model_time, evaluator_time=evaluator_time)

    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    coco_evaluator.synchronize_between_processes()

    # accumulate predictions from all images
    coco_evaluator.accumulate()
    coco_evaluator.summarize()

    task2_pred = np.array(task2_pred)
    task2_target = np.array(task2_target)

    print(f"task2_pred:{task2_pred}")
    print(f"task2_target:{task2_target}")
    print(f"task2准确率：{np.array(task2_pred == task2_target).mean()}")

    coco_info = coco_evaluator.coco_eval[iou_types[0]].stats.tolist()  # numpy to list

    return coco_info


def _get_iou_types(model):
    model_without_ddp = model
    if isinstance(model, torch.nn.parallel.DistributedDataParallel):
        model_without_ddp = model.module
    iou_types = ["bbox"]
    return iou_types

In [37]:
from torch.utils.data import Dataset
import os
import torch
import csv
from collections import defaultdict
from PIL import Image

# Column indices of the task-1 (detection) annotation CSV
TASK1_CSV_FIELD_ID_INDEX = 0
TASK1_CSV_FIELD_FILENAME_INDEX = 1
TASK1_CSV_FIELD_LABEL_INDEX = 2
TASK1_CSV_FIELD_XMIN_INDEX = 3
TASK1_CSV_FIELD_YMIN_INDEX = 4
TASK1_CSV_FIELD_XMAX_INDEX = 5
TASK1_CSV_FIELD_YMAX_INDEX = 6
TASK1_CSV_FIELD_HEIGHT_INDEX = 7
TASK1_CSV_FIELD_WIDTH_INDEX = 8
TASK1_CSV_FIELD_CHANNEL_INDEX = 9
TASK1_CSV_FIELD_TASK_TYPE_INDEX=10
# Column indices of the task-2 (classification) annotation file
TASK2_CSV_FIELD_FILENAME_INDEX = 0
TASK2_CSV_FIELD_LABEL_INDEX = 1

# Keys of the per-image target dict produced by the dataset
TARGET_FIELD_TASK1_ANCHORS = "task1_anchors"
TARGET_FIELD_TASK1_LABELS = "task1_labels"
TARGET_FIELD_TASK2_LABEL = "task2_label"
TARGET_FIELD_IMAGE_ID = "image_id"
TARGET_FIELD_TASK1_AREA = "task1_area"
TARGET_FIELD_TASK1_ISCROWD = "task1_iscrowd"
TARGET_FIELD_HEIGHT_WIDTH = "height_width"
TARGET_FIELD_IMAGE_DATA = "image_data"
TARGET_FIELD_TASK_TYPE="task_type"

# Values of the task-type column
TASK_TYPE_ALL=1
TASK_TYPE_ONLY_TASK2=2

# Keys of the dict returned by the model at inference time
PREDICT_RESULT_TASK1 = "task1_result"
PREDICT_RESULT_TASK2 = "task2_result"

# Sentinel meaning "no image id seen yet" while parsing the task-1 CSV
INVALID_FILE_INDEX = "-1"

IMAGE_SIZE = 300

TASK2_LABEL_COUNT = 2

# Data preprocessing paths
# NOTE(review): project_root / dataset_root repeat the definitions of the first notebook cells.
project_root = "/kaggle/working/conveyer_belt_detector/"
dataset_root = "/kaggle/input/conveyer-belt-detect/dataset/"

# Aggregated annotation files
task1_train_detect_anno = "/kaggle/input/anno-data-new/train_info_all_orig.csv"
# task1_train_detect_anno = "/kaggle/input/anno-data-new/train_info_all.csv"
# task1_train_detect_anno = "/kaggle/input/anno-data-new/train_info_all_base_trans.csv"
task1_val_detect_anno = os.path.join(project_root, "data/task1/val/val_infos.csv")
task2_train_classes_anno = "/kaggle/input/anno-data-new/classes_all_orig.txt"
# task2_train_classes_anno = "/kaggle/input/anno-data-new/classes_all.txt"
# task2_train_classes_anno = "/kaggle/input/anno-data-new/classes_all_base_trans.txt"
task2_val_classes_anno = os.path.join(project_root, "data/task2/val/classes.txt")

# NOTE(review): create_model() below uses its own hard-coded checkpoint path rather than this one.
pretrain_path="/kaggle/input/nvidia-ssdpyt-fp32-190826pt/nvidia_ssdpyt_fp32_190826.pt"

def make_dir(path):
    """Ensure the parent directory of ``path`` exists.

    ``path`` is treated as a file path: only ``os.path.dirname(path)`` is created
    (a path ending in a separator therefore creates that directory itself).

    Args:
        path: File path whose containing directory should exist.
    """
    dir_name = os.path.dirname(path)
    if not dir_name:
        # Bare file name: it lives in the current directory, nothing to create
        # (os.makedirs("") would raise FileNotFoundError).
        return
    if not os.path.exists(dir_name):
        print("创建目录", dir_name)
        # exist_ok guards against another process creating it between the check and here.
        os.makedirs(dir_name, exist_ok=True)

# Create the parent directory of every annotation file (and the project root) if missing.
make_dir(path=project_root)
make_dir(path=task1_train_detect_anno)
make_dir(path=task1_val_detect_anno)
make_dir(path=task2_train_classes_anno)
make_dir(path=task2_val_classes_anno)

# Module-level device; MyDataset and the transforms below move tensors onto it.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

def transform_task2_label(label):
    """Map the raw task-2 label 3 to 1; every other label is returned unchanged."""
    if label == 3:
        return 1
    return label


def recover_task2_label(label):
    """Inverse of ``transform_task2_label``: map 1 back to 3, leave other labels as they are."""
    if label == 1:
        return 3
    return label


class MyDataset(Dataset):
    """Dataset combining task-1 box annotations with one task-2 label per image.

    ``task1_file`` is a CSV with a header row and one row per annotated box; the
    columns are addressed through the ``TASK1_CSV_FIELD_*`` indices. Consecutive
    rows with the same id column belong to the same image. ``task2_file`` is a
    header-less ``filename,label`` text file.

    Only images whose task-type column is absent or equal to ``TASK_TYPE_ALL``
    are kept.

    Args:
        task1_file: Path of the task-1 annotation CSV.
        task2_file: Path of the task-2 label file.
        transforms: Optional callable ``(image, target) -> (image, target)``.
        val_data_flag: When True the per-image task type is neither stored nor
            added to the target dict.
    """

    def __init__(self, task1_file="train_info_all.csv", task2_file="classes_all.txt", transforms=None,
                 val_data_flag=False):
        super(MyDataset, self).__init__()
        self.task1_file = task1_file
        self.task2_file = task2_file
        self.transforms = transforms

        # Parallel per-image lists, filled by _read().
        self.images = []
        self.task1_anchors = []

        self.task1_labels = []
        self.task2_labels = []

        self.height_widths = []
        # Task type per image (training data only); defaults to the full pipeline
        # when the annotation file has no task-type column.
        self.task_types = []

        # Largest number of boxes seen on a single image (including filtered images).
        self.max_anchors_count_in_image = 0

        self.val_data_flag = val_data_flag

        self._read()

    def _read_task2_labels(self):
        """Return ``{filename: int label}`` parsed from ``self.task2_file``."""
        task2_data = {}
        with open(file=self.task2_file, mode="r", encoding="utf8") as f:
            for raw_line in f:
                raw_line = raw_line.strip()
                if len(raw_line) <= 0:
                    # Parsing stops at the first blank line.
                    break
                fields = raw_line.split(",")
                task2_data[fields[TASK2_CSV_FIELD_FILENAME_INDEX]] = int(fields[TASK2_CSV_FIELD_LABEL_INDEX])
        return task2_data

    def _append_image(self, row, anchors, labels, task2_data):
        """Store one fully accumulated image.

        Args:
            row: First CSV row of the image; supplies file name, height/width and task type.
            anchors: List of ``[xmin, ymin, xmax, ymax]`` boxes of the image.
            labels: List of int task-1 labels, one per box.
            task2_data: Mapping from file name to task-2 label.

        Raises:
            KeyError: If the image has no entry in the task-2 label file.
        """
        if self.max_anchors_count_in_image < len(anchors):
            self.max_anchors_count_in_image = len(anchors)

        if len(row) > TASK1_CSV_FIELD_TASK_TYPE_INDEX:
            task_type = int(row[TASK1_CSV_FIELD_TASK_TYPE_INDEX])
        else:
            task_type = TASK_TYPE_ALL
        if task_type != TASK_TYPE_ALL:
            return

        file_name = row[TASK1_CSV_FIELD_FILENAME_INDEX]
        if file_name not in task2_data:
            raise KeyError("no task2 label for image {} in {}".format(file_name, self.task2_file))

        self.images.append(file_name)
        self.task1_anchors.append(anchors)
        self.task1_labels.append(labels)
        self.task2_labels.append(transform_task2_label(label=task2_data[file_name]))
        self.height_widths.append(
            [int(row[TASK1_CSV_FIELD_HEIGHT_INDEX]), int(row[TASK1_CSV_FIELD_WIDTH_INDEX])])

        if not self.val_data_flag:
            self.task_types.append(task_type)

    def _read(self):
        """Parse both annotation files and fill the per-image lists."""
        task2_data = self._read_task2_labels()

        with open(file=self.task1_file, mode="r", encoding="utf8") as f:
            f.readline()  # skip the header row

            first_row = None  # first CSV row of the image currently being accumulated
            task1_anchors = []
            task1_labels = []
            for raw_line in f:
                raw_line = raw_line.strip()
                if len(raw_line) <= 0:
                    # Parsing stops at the first blank line.
                    break

                line = raw_line.split(",")

                if first_row is None or line[TASK1_CSV_FIELD_ID_INDEX] != first_row[TASK1_CSV_FIELD_ID_INDEX]:
                    # A new image starts: flush the previous one using ITS OWN row,
                    # so label, size and task type are not taken from the next image.
                    if first_row is not None:
                        self._append_image(first_row, task1_anchors, task1_labels, task2_data)
                    first_row = line
                    task1_anchors = []
                    task1_labels = []

                anchor_str = line[TASK1_CSV_FIELD_XMIN_INDEX:TASK1_CSV_FIELD_YMAX_INDEX + 1]
                task1_anchors.append([float(ele) for ele in anchor_str])
                task1_labels.append(int(line[TASK1_CSV_FIELD_LABEL_INDEX]))

            # Flush the last image of the file, which no id change follows.
            if first_row is not None:
                self._append_image(first_row, task1_anchors, task1_labels, task2_data)

    def __getitem__(self, idx):
        """Load image ``idx`` and build its target dict (tensors are moved to ``device``).

        Returns:
            tuple: ``(image, target)`` after the optional transforms.
        """
        # Ensure a 3-channel RGB PIL image regardless of the file's mode.
        image = Image.open(fp=self.images[idx]).convert('RGB')

        # Ground-truth boxes, (N, 4) float32, N = number of annotated boxes.
        task1_anchors = torch.as_tensor(self.task1_anchors[idx], dtype=torch.float32).reshape(-1, 4)
        # Task-1 labels, (N,) int64.
        task1_labels = torch.as_tensor(data=self.task1_labels[idx], dtype=torch.int64)

        task2_label = torch.as_tensor(data=self.task2_labels[idx], dtype=torch.int64)
        # NOTE(review): the nested construction is kept from the original so the
        # tensor layout seen by downstream code does not change.
        image_id = torch.tensor([idx])
        image_id = torch.tensor(data=[image_id], dtype=torch.int64)
        height_width = torch.as_tensor(data=self.height_widths[idx], dtype=torch.int64)

        target = {}
        target[TARGET_FIELD_TASK1_ANCHORS] = task1_anchors.to(device=device)
        target[TARGET_FIELD_TASK1_LABELS] = task1_labels.to(device=device)
        target[TARGET_FIELD_TASK2_LABEL] = task2_label.to(device=device)
        target[TARGET_FIELD_IMAGE_ID] = image_id.to(device=device)
        target[TARGET_FIELD_HEIGHT_WIDTH] = height_width.to(device=device)

        if not self.val_data_flag:
            task_type = torch.as_tensor(self.task_types[idx], dtype=torch.int64)
            target[TARGET_FIELD_TASK_TYPE] = task_type.to(device=device)

        # Image / target preprocessing.
        if self.transforms is not None:
            image, target = self.transforms(image, target)

        return image, target

    def __len__(self):
        """Number of images kept after filtering."""
        return len(self.images)

    @staticmethod
    def collate_fn(batch):
        """Turn a list of ``(image, target)`` pairs into ``(images, targets)`` tuples."""
        images, targets = tuple(zip(*batch))
        return images, targets

    def coco_index(self, idx):
        """
        Build the target of image ``idx`` for pycocotools statistics without
        reading the image or applying any transform, which keeps this cheap.

        Args:
            idx: Index of the image whose annotations are requested.

        Returns:
            dict: boxes, labels, image id, box areas, iscrowd flags (all zero)
            and height/width, as tensors on ``device``.
        """
        boxes = []
        labels = []

        boxes.extend(self.task1_anchors[idx])
        labels.extend(self.task1_labels[idx])
        iscrowds = [0 for _ in range(len(boxes))]

        # convert everything into a torch.Tensor
        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        labels = torch.as_tensor(labels, dtype=torch.int64)
        iscrowds = torch.as_tensor(iscrowds, dtype=torch.int64)
        height_width = torch.as_tensor(self.height_widths[idx], dtype=torch.int64)
        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])

        target = {}
        target[TARGET_FIELD_TASK1_ANCHORS] = boxes.to(device=device)
        target[TARGET_FIELD_TASK1_LABELS] = labels.to(device=device)
        target[TARGET_FIELD_IMAGE_ID] = image_id.to(device=device)
        target[TARGET_FIELD_TASK1_AREA] = area.to(device=device)
        target[TARGET_FIELD_TASK1_ISCROWD] = iscrowds.to(device=device)
        target[TARGET_FIELD_HEIGHT_WIDTH] = height_width.to(device=device)

        return target

In [38]:
# transforms.py

import random

import numpy as np
import torch
import torchvision.transforms as t
from torchvision.transforms import functional as F_torchvision

# from src import dboxes300_coco, calc_iou_tensor, Encoder


class Compose(object):
    """Chain several ``(image, target) -> (image, target)`` transforms into one callable."""
    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, image, target=None):
        # Feed the output pair of each step into the next one, in list order.
        current_image, current_target = image, target
        for step in self.transforms:
            current_image, current_target = step(current_image, current_target)
        return current_image, current_target


class ToTensor(object):
    """Convert a PIL image into a contiguous tensor; the target is passed through unchanged."""
    def __call__(self, image, target):
        tensor_image = F_torchvision.to_tensor(image)
        return tensor_image.contiguous(), target
    
class RandomHorizontalFlip(object):
    """Randomly mirror the image and its boxes horizontally; must be placed after ToTensor."""
    def __init__(self, prob=0.5):
        self.prob = prob

    def __call__(self, image, target):
        # Leave the sample untouched unless the random draw falls below ``prob``.
        if not (random.random() < self.prob):
            return image, target

        flipped = image.flip(-1)  # mirror along the last (width) axis
        boxes = target[TARGET_FIELD_TASK1_ANCHORS]
        # Boxes are (xmin, ymin, xmax, ymax) in relative coordinates:
        # the mirrored xmin is 1 - xmax and the mirrored xmax is 1 - xmin.
        boxes[:, [0, 2]] = 1.0 - boxes[:, [2, 0]]
        target[TARGET_FIELD_TASK1_ANCHORS] = boxes
        return flipped, target


class SSDCropping(object):
    """
    Random crop of the image following the SSD paper; must be placed before ToTensor
    (the image is cropped through its ``crop`` method).
    Cropping for SSD, according to original paper
    Choose between following 3 conditions:
    1. Preserve the original image
    2. Random crop minimum IoU is among 0.1, 0.3, 0.5, 0.7, 0.9
    3. Random crop
    Reference to https://github.com/chauhan-utk/src.DomainAdaptation
    """
    def __init__(self):
        self.sample_options = (
            # no cropping
            None,
            # (minimum IoU, maximum IoU)
            (0.1, None),
            (0.3, None),
            (0.5, None),
            (0.7, None),
            (0.9, None),
            # no IoU constraint
            (None, None),
        )
        # NOTE(review): stored but not read by __call__ in this class.
        self.dboxes = dboxes300_coco()
        
    

    def __call__(self, image, target):
        # Loop until one option yields a result ("None" returns the input unchanged).
        while True:
            mode = random.choice(self.sample_options)

            if mode is None:  # keep the original image, no random crop
                return image, target

            htot, wtot = target[TARGET_FIELD_HEIGHT_WIDTH]

            min_iou, max_iou = mode
            min_iou = float('-inf') if min_iou is None else min_iou
            max_iou = float('+inf') if max_iou is None else max_iou
            # Implementation use 5 iteration to find possible candidate
            for _ in range(5):
                # 0.3*0.3 approx. 0.1
                w = random.uniform(0.3, 1.0)
                h = random.uniform(0.3, 1.0)
                if w/h < 0.5 or w/h > 2:  # keep the aspect ratio within 0.5-2
                    continue
#                 w = random.uniform(0.4, 1.0)
#                 h = w

                # left 0 ~ 1 - w, top 0 ~ 1 - h (relative coordinates)
                left = random.uniform(0, 1.0 - w)
                top = random.uniform(0, 1.0 - h)

                right = left + w
                bottom = top + h

                # box coordinates are expected to be relative (0-1) -- per the original comment
                bboxes = target[TARGET_FIELD_TASK1_ANCHORS]
                new_box = torch.tensor([[left, top, right, bottom]]).to(device=device)
                ious = calc_iou_tensor(bboxes, new_box)
                # tailor all the bboxes and return
                # all(): Returns True if all elements in the tensor are True, False otherwise.
                if not ((ious > min_iou) & (ious < max_iou)).all():
                    continue

                # centre point of every ground-truth box
                xc = 0.5 * (bboxes[:, 0] + bboxes[:, 2])
                yc = 0.5 * (bboxes[:, 1] + bboxes[:, 3])
                # True for boxes whose centre lies inside the sampled patch
                masks = (xc > left) & (xc < right) & (yc > top) & (yc < bottom)

                # if no ground-truth centre falls inside the patch, sample again
                if not masks.any():
                    continue

                # clamp box coordinates to the patch borders (in place on the target tensor)
                bboxes[bboxes[:, 0] < left, 0] = left
                bboxes[bboxes[:, 1] < top, 1] = top
                bboxes[bboxes[:, 2] > right, 2] = right
                bboxes[bboxes[:, 3] > bottom, 3] = bottom

                # drop the boxes whose centre is outside the patch
                bboxes = bboxes[masks, :]

                # keep the labels of the remaining boxes
                labels = target[TARGET_FIELD_TASK1_LABELS]
                labels = labels[masks]

                # crop the patch, converting relative coordinates to pixels via height/width
                left_idx = int(left * wtot)
                top_idx = int(top * htot)
                right_idx = int(right * wtot)
                bottom_idx = int(bottom * htot)
                image = image.crop((left_idx, top_idx, right_idx, bottom_idx))
                # re-express the box coordinates relative to the cropped patch

                bboxes[:, 0] = (bboxes[:, 0] - left) / w
                bboxes[:, 1] = (bboxes[:, 1] - top) / h
                bboxes[:, 2] = (bboxes[:, 2] - left) / w
                bboxes[:, 3] = (bboxes[:, 3] - top) / h
                
#                 image, bboxes = self.pad_img(image, bboxes)

                # store the cropped boxes and their labels back into the target
                target[TARGET_FIELD_TASK1_ANCHORS] = bboxes
                target[TARGET_FIELD_TASK1_LABELS] = labels
                return image, target


class Resize(object):
    """Resize the image to a fixed size; must be placed before ToTensor. The target is not modified."""
    def __init__(self, size=(600, 600)):
        self.resize = t.Resize(size)

    def __call__(self, image, target):
        resized_image = self.resize(image)
        return resized_image, target


class ColorJitter(object):
    """Randomly perturb the image colours; must be placed before ToTensor. The target is not modified."""
    def __init__(self, brightness=0.125, contrast=0.5, saturation=0.5, hue=0.05):
        jitter = t.ColorJitter(brightness, contrast, saturation, hue)
        self.trans = jitter

    def __call__(self, image, target):
        jittered_image = self.trans(image)
        return jittered_image, target


class Normalization(object):
    """Standardise the image tensor channel-wise; must be placed after ToTensor.

    When ``mean`` / ``std`` are omitted, the per-channel values hard-coded below are used.
    """
    def __init__(self, mean=None, std=None):
        channel_mean = [0.3441988, 0.34242108, 0.3464927] if mean is None else mean
        channel_std = [0.19682558, 0.19890308, 0.2000567] if std is None else std
        self.normalize = t.Normalize(mean=channel_mean, std=channel_std)

    def __call__(self, image, target):
        normalized_image = self.normalize(image)
        return normalized_image, target


class AssignGTtoDefaultBox(object):
    """Match the ground-truth boxes against the default boxes via ``Encoder.encode``."""
    def __init__(self):
        self.default_box = dboxes300_coco()
        self.encoder = Encoder(self.default_box)

    def __call__(self, image, target):
        gt_boxes = target[TARGET_FIELD_TASK1_ANCHORS]
        gt_labels = target[TARGET_FIELD_TASK1_LABELS]
        # Per the original note the encoder returns one box and one label per
        # default box: (8732 x 4) and (8732).
        encoded_boxes, encoded_labels = self.encoder.encode(gt_boxes, gt_labels)
        # The encoded tensors replace the raw ground truth in the target.
        target[TARGET_FIELD_TASK1_ANCHORS] = encoded_boxes
        target[TARGET_FIELD_TASK1_LABELS] = encoded_labels
        return image, target

In [39]:
# plot_curv.py

import datetime
import matplotlib.pyplot as plt


def plot_loss_and_lr(train_loss, learning_rate):
    """Plot the per-epoch training loss and learning rate on twin y-axes and save it as a PNG.

    The file is written to ``./loss_and_lr<timestamp>.png``. Plotting is best-effort:
    any exception is printed instead of being raised, so a failure here does not
    abort the caller.

    Args:
        train_loss: Sequence of per-epoch loss values.
        learning_rate: Sequence of per-epoch learning rates (same length as ``train_loss``).
    """
    fig = None
    try:
        x = list(range(len(train_loss)))
        fig, ax1 = plt.subplots(1, 1)
        ax1.plot(x, train_loss, 'r', label='loss')
        ax1.set_xlabel("epoch")
        ax1.set_ylabel("loss")
        ax1.set_title("Train Loss and lr")

        ax2 = ax1.twinx()
        ax2.plot(x, learning_rate, label='lr')
        ax2.set_ylabel("learning rate")
        ax2.set_xlim(0, len(train_loss))  # x axis spans the epochs

        # One combined legend for both axes; separate per-axes legends would
        # show the "loss" entry twice.
        handles1, labels1 = ax1.get_legend_handles_labels()
        handles2, labels2 = ax2.get_legend_handles_labels()
        ax2.legend(handles1 + handles2, labels1 + labels2, loc='upper right')

        fig.subplots_adjust(right=0.8)  # keep the right-hand axis label inside the saved image
        fig.savefig('./loss_and_lr{}.png'.format(datetime.datetime.now().strftime("%Y%m%d-%H%M%S")))
        print("successful save loss curve! ")
    except Exception as e:
        print(e)
    finally:
        # Release the figure even when plotting or saving failed.
        if fig is not None:
            plt.close(fig)


def plot_map(mAP):
    """Plot the per-epoch validation mAP and save it to ``./mAP.png``.

    Best-effort: any exception raised while plotting is printed, not propagated.

    Args:
        mAP: Sequence of per-epoch mAP values.
    """
    try:
        epoch_count = len(mAP)
        epochs = list(range(epoch_count))
        plt.plot(epochs, mAP, label='mAp')
        plt.xlabel('epoch')
        plt.ylabel('mAP')
        plt.title('Eval mAP')
        plt.xlim(0, epoch_count)
        plt.legend(loc='best')
        plt.savefig('./mAP.png')
        plt.close()
        print("successful save mAP curve!")
    except Exception as err:
        print(err)

In [40]:
# train_ssd300.py
import os
import datetime

import torch

from torch.utils.data import DataLoader

from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

def create_model(num_classes=2 + 1,
                 pre_ssd_path="/kaggle/input/nvidia-ssdpyt-fp32-190826pt/pretrain-ssd300-54.pth"):
    """Build an SSD300 model and initialise it from a pretrained checkpoint.

    All checkpoint weights are loaded except those whose dotted key contains a
    ``conf`` component (the class predictors, whose shape depends on
    ``num_classes``); the box-regression weights are reused.

    Args:
        num_classes: Number of classes including background.
        pre_ssd_path: Path of the pretrained checkpoint; it must contain a
            ``"model"`` entry holding the state dict.

    Returns:
        The SSD300 model with the pretrained weights loaded (non-strict).

    Raises:
        FileNotFoundError: If ``pre_ssd_path`` does not exist.
    """
    # Build the backbone first, then the SSD300 on top of it.
    backbone = Backbone()
    model = SSD300(backbone=backbone, num_classes=num_classes)

    if not os.path.exists(pre_ssd_path):
        raise FileNotFoundError("pretrained weights not found: {}".format(pre_ssd_path))
    # NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
    pre_model_dict = torch.load(pre_ssd_path, map_location='cpu')
    pre_weights_dict = pre_model_dict["model"]

    # Drop the class-predictor weights; the regression predictors do not depend
    # on num_classes and can be reused.
    del_conf_loc_dict = {k: v for k, v in pre_weights_dict.items() if "conf" not in k.split(".")}

    missing_keys, unexpected_keys = model.load_state_dict(del_conf_loc_dict, strict=False)
    if len(missing_keys) != 0 or len(unexpected_keys) != 0:
        print("missing_keys: ", missing_keys)
        print("unexpected_keys: ", unexpected_keys)

    return model

def main(parser_data):
    """Train the SSD300 model, evaluating and saving a checkpoint after every epoch.

    Args:
        parser_data: Namespace providing ``num_classes``, ``batch_size``, ``resume``,
            ``start_epoch`` and ``epochs``; ``output_dir`` is used for checkpoints
            when present (default ``./save_weights``).
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Directory receiving the per-epoch checkpoints.
    output_dir = getattr(parser_data, "output_dir", "./save_weights")
    os.makedirs(output_dir, exist_ok=True)

    # Text file collecting the per-epoch metrics.
    results_file = "results{}.txt".format(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

    # 1. Processing applied after a sample is read: type conversion and augmentation.
    # NOTE(review): the validation pipeline normalises the image while the training
    # pipeline currently does not (Normalization is commented out) -- confirm this is intended.
    data_transform = {
        "train": Compose([
#             SSDCropping(), # random crop
                          Resize(),   # uniform size
#                           ColorJitter(), # colour jitter
                          ToTensor(),  # to tensor
#                           RandomHorizontalFlip(), # horizontal flip
#                           Normalization(), # standardisation
                          AssignGTtoDefaultBox()]), # match ground truth with default boxes

        "val": Compose([Resize(),
                        ToTensor(),
                        Normalization()])
    }

    # Training set
    train_dataset = MyDataset(task1_file=task1_train_detect_anno,
                              task2_file=task2_train_classes_anno,
                              transforms=data_transform["train"])

    # Training requires a batch size greater than 1.
    batch_size = parser_data.batch_size

    assert batch_size > 1, "batch size must be greater than 1"

    # Drop the last batch when it would contain a single sample.
    drop_last = len(train_dataset) % batch_size == 1

    # num_workers is left at its default (0), as in the original code.
    train_data_loader = DataLoader(train_dataset,
                                   batch_size=batch_size,
                                   shuffle=True,
                                   collate_fn=train_dataset.collate_fn,
                                   drop_last=drop_last)

    # Validation set
    val_dataset = MyDataset(task1_file=task1_val_detect_anno,
                            task2_file=task2_val_classes_anno,
                            transforms=data_transform["val"],
                            val_data_flag=True)
    val_data_loader = torch.utils.data.DataLoader(val_dataset,
                                                  batch_size=batch_size,
                                                  shuffle=False,
                                                  collate_fn=val_dataset.collate_fn)
    # Model: use the argument passed in, not the module-level ``args``.
    model = create_model(num_classes=parser_data.num_classes + 1)
    model.to(device)

    # Optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params=params, lr=0.001,
                                momentum=0.9, weight_decay=0.0005)
    # Learning-rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer,
                                                   step_size=5,
                                                   gamma=0.8)

    # Resume from a previous checkpoint when one is given.
    if parser_data.resume != "":
        checkpoint = torch.load(parser_data.resume, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        parser_data.start_epoch = checkpoint['epoch'] + 1
        print("the training process from epoch{}...".format(parser_data.start_epoch))

    # NOTE(review): unconditionally overrides the learning rate, including the one
    # restored from the checkpoint -- kept from the original, confirm it is still wanted.
    optimizer.param_groups[0]["lr"] = 0.000512

    train_loss = []
    learning_rate = []
    val_map = []

    # Build the COCO view of the validation set once instead of at every evaluation.
    val_data = get_coco_api_from_dataset(val_data_loader.dataset)

    for epoch in range(parser_data.start_epoch, parser_data.epochs):
        mean_loss, lr = train_one_epoch(model=model, optimizer=optimizer,
                                        data_loader=train_data_loader,
                                        device=device, epoch=epoch,
                                        print_freq=10)
        train_loss.append(mean_loss.item())
        learning_rate.append(lr)

        print(f"Epoch:{epoch} finish, mean_loss:{mean_loss}, lr:{lr}")

        # Update the learning rate
        lr_scheduler.step()

        # Evaluate on the validation set
        coco_info = evaluate(epoch=epoch, model=model, data_loader=val_data_loader,
                             device=device, data_set=val_data)

        # write into txt
        with open(results_file, "a") as f:
            # COCO metrics followed by the mean loss and the learning rate
            result_info = [str(round(i, 4)) for i in coco_info + [mean_loss.item()]] + [str(round(lr, 6))]
            txt = "epoch:{} {}".format(epoch, '  '.join(result_info))
            f.write(txt + "\n")

        val_map.append(coco_info[1])  # pascal mAP

        # save weights
        save_files = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'epoch': epoch}
        torch.save(save_files, os.path.join(output_dir, "ssd300-{}.pth".format(epoch)))

    # The plot helpers are defined in this notebook; there is no ``plot_curve``
    # module to import them from, so they are called directly.
    # plot loss and lr curve
    if len(train_loss) != 0 and len(learning_rate) != 0:
        plot_loss_and_lr(train_loss, learning_rate)

    # plot mAP curve
    if len(val_map) != 0:
        plot_map(val_map)

if __name__ == '__main__':

    import argparse

    parser = argparse.ArgumentParser(description=__doc__)

    # Number of object classes to detect, excluding background (replace with your own class count)
    parser.add_argument('--num_classes', default=2, type=int, help='num_classes')
    # Directory where files are saved
    parser.add_argument('--output-dir', default='./save_weights', help='path where to save')
    # To continue a previous run, point this at the saved checkpoint, e.g.
    # ./save_weights/ssd300-136.pth ./save_weights/ssd300-11.pth /kaggle/input/ssd300-train/ssd300-49.pth
    parser.add_argument('--resume', default='./save_weights/ssd300-120.pth', type=str, help='resume from checkpoint')
    # Epoch to start training from (main() overwrites it when a checkpoint is resumed)
    parser.add_argument('--start_epoch', default=121, type=int, help='start epoch')
    # Total number of training epochs
    parser.add_argument('--epochs', default=300, type=int, metavar='N',
                        help='number of total epochs to run')
    # Training batch size
    parser.add_argument('--batch_size', default=16, type=int, metavar='N',
                        help='batch size when training.')

    # An explicit empty argument list: the notebook kernel's own argv is ignored.
    args = parser.parse_args(args=[])

    # Create the checkpoint directory if it does not exist yet
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    main(args)


missing_keys:  ['belt_pos.weight', 'belt_pos.bias', 'conf.0.weight', 'conf.0.bias', 'conf.1.weight', 'conf.1.bias', 'conf.2.weight', 'conf.2.bias', 'conf.3.weight', 'conf.3.bias', 'conf.4.weight', 'conf.4.bias', 'conf.5.weight', 'conf.5.bias']
unexpected_keys:  []
the training process from epoch121...
creating index...
index created!
saving grid image


KeyboardInterrupt: 

In [None]:
import os
# 导出预测文件
def export_predict_result(image_root, train_weights, task1_file, task2_file, score_threshold=0.10):
    """Run the trained detector over a directory of images and export the results.

    Every file in ``image_root`` is opened as an image, passed through the
    Resize/ToTensor/Normalization pipeline and fed to the model one at a time.
    Each detection whose score is at least ``score_threshold`` becomes one row
    of the task-1 CSV with the columns
    ``filename, label, xmin, ymin, width, height, confidence``.

    The task-2 export is currently disabled (its code is kept commented out
    below). ``task2_file`` is still opened for writing, so it is created or
    truncated and left empty.

    Args:
        image_root: Directory whose entries are the images to run inference on.
        train_weights: Path to a checkpoint loadable with ``torch.load`` that
            holds the model state dict under the ``'model'`` key.
        task1_file: Output CSV path for the task-1 detections.
        task2_file: Output path for the task-2 result (left empty for now).
        score_threshold: Detections scoring below this value are dropped.
            Defaults to 0.10, the value that used to be hard-coded.
    """
    # make_dir is defined elsewhere in the notebook; presumably it creates the
    # parent directory of the given path -- TODO confirm.
    make_dir(path=task1_file)
    make_dir(path=task2_file)

    # Pick the inference device.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Build the model: number of detection classes + background.
    num_classes = 2 + 1
    model = create_model(num_classes=num_classes)

    # Load the trained weights on CPU first, then move the model to the device.
    model.load_state_dict(torch.load(train_weights, map_location='cpu')['model'])
    model.to(device)

    # The preprocessing pipeline does not depend on the image, so build it once
    # instead of once per file.
    data_transform = Compose([Resize(),
                              ToTensor(),
                              Normalization()])

    task1_header = ["filename", "label", "xmin", "ymin", "width", "height", "confidence"]

    model.eval()
    with torch.no_grad():
        # Forward pass on a dummy image before the real loop. The original
        # author left a note asking why this is needed; presumably a warm-up so
        # the first timed inference is not skewed -- TODO confirm.
        init_img = torch.zeros((1, 3, 600, 600), device=device)
        model(1, init_img)

        # task2_csv_file is intentionally unused while the task-2 export is
        # disabled; opening it keeps the original side effect of creating or
        # truncating the file.
        with open(file=task1_file, mode="w", encoding="utf8", newline='') as task1_csv_file, \
                open(file=task2_file, mode="w", encoding="utf8", newline='') as task2_csv_file:
            task1_csv_writer = csv.DictWriter(f=task1_csv_file, fieldnames=task1_header)
            task1_csv_writer.writeheader()

            # Disabled task-2 export (kept for reference):
            # task2_header = ["filename", "label"]
            # task2_csv_writer = csv.DictWriter(f=task2_csv_file, fieldnames=task2_header)
            # task2_csv_writer.writeheader()

            # sorted() makes the row order reproducible; os.listdir returns
            # entries in arbitrary order.
            for image_file_name in sorted(os.listdir(image_root)):
                # Read the image; the context manager closes the underlying
                # file as soon as the pixels have been turned into a tensor.
                with Image.open(os.path.join(image_root, image_file_name)) as original_img:
                    img_width, img_height = original_img.size
                    img, _ = data_transform(original_img)

                # Add a batch dimension (batch of one).
                img = torch.unsqueeze(img, dim=0)

                time_start = time_synchronized()
                results = model(1, img.to(device))
                time_end = time_synchronized()
                print("{}, inference+NMS time: {}".format(image_file_name, time_end - time_start))

                # Per-image tuple: (bboxes_out, labels_out, scores_out).
                predictions = results[PREDICT_RESULT_TASK1][0]
                predict_boxes = predictions[0].to("cpu").numpy()
                if len(predict_boxes) > 0:
                    # Scale x coordinates by the image width and y coordinates
                    # by the image height (boxes are presumably normalized
                    # xmin, ymin, xmax, ymax -- TODO confirm).
                    predict_boxes[:, [0, 2]] = predict_boxes[:, [0, 2]] * img_width
                    predict_boxes[:, [1, 3]] = predict_boxes[:, [1, 3]] * img_height

                    predict_classes = predictions[1].to("cpu").numpy()
                    predict_scores = predictions[2].to("cpu").numpy()

                    for i in range(len(predict_boxes)):
                        if predict_scores[i] < score_threshold:
                            continue

                        # Convert corner format to (xmin, ymin, width, height).
                        task1_csv_writer.writerow({
                            "filename": image_file_name,
                            "label": predict_classes[i],
                            "xmin": int(predict_boxes[i, 0]),
                            "ymin": int(predict_boxes[i, 1]),
                            "width": int(predict_boxes[i, 2] - predict_boxes[i, 0]),
                            "height": int(predict_boxes[i, 3] - predict_boxes[i, 1]),
                            "confidence": round(predict_scores[i], 2),
                        })

                # Disabled task-2 export (kept for reference):
                # task2_result = results[PREDICT_RESULT_TASK2].argmax(dim=1)[0]
                # task2_new_row = {}
                # task2_new_row["filename"] = image_file_name
                # task2_new_row["label"] = recover_task2_label(task2_result.item())
                # task2_csv_writer.writerow(task2_new_row)

        print("导出 " + task1_file + " 成功")
        print("导出 " + task2_file + " 成功")

# test_image_root = os.path.join(dataset_root, "test", "images")
# export_predict_result(image_root=test_image_root, 
#                       train_weights="./save_weights/ssd300-20.pth",
#                       task1_file="submit/submit1.csv",
#                       task2_file="submit/submit2.csv")