In [5]:
import numpy as np
import torch
from torch import nn
from d2l import torch as d2l
from tqdm import tqdm

In [6]:
""" 
# 卷积朴素实现
def conv2d(input, weight, bias=None, stride=1, padding=0):
    batch_size, in_channels, in_height, in_width = input.size()
    out_channels, _, kernel_height, kernel_width = weight.size()
    out_height = int((in_height + 2 * padding - kernel_height) / stride) + 1
    out_width = int((in_width + 2 * padding - kernel_width) / stride) + 1
    padded_input = torch.nn.functional.pad(input, (padding, padding, padding, padding))
    output = torch.zeros(batch_size, out_channels, out_height, out_width)
    for b in range(batch_size):
        for c_out in range(out_channels):
            for h_out in range(out_height):
                for w_out in range(out_width):
                    h_start = h_out * stride
                    w_start = w_out * stride
                    h_end = h_start + kernel_height
                    w_end = w_start + kernel_width
                    input_patch = padded_input[b, :, h_start:h_end, w_start:w_end]
                    output[b, c_out, h_out, w_out] = torch.sum(input_patch * weight[c_out]) + (bias[c_out] if bias is not None else 0)
    return output
"""

# """
def conv2d(inputs, weight, bias=None, stride=1, padding=0):
    padded_input = torch.nn.functional.pad(inputs, (padding, padding, padding, padding))
    output = torch.nn.functional.conv2d(padded_input, weight, bias=bias, stride=stride)
    return output
# """

In [7]:
# """
# max 池化朴素实现
def max_pool2d(inputs, kernel_size, stride=None, padding=0):
    batch_size, channels, height, width = inputs.size()
    output_height = int((height + 2 * padding - kernel_size) / stride) + 1
    output_width = int((width + 2 * padding - kernel_size) / stride) + 1
    unfolded = torch.nn.functional.unfold(
        inputs,
        kernel_size=kernel_size,
        dilation=1,
        padding=padding,
        stride=stride
    )
    unfolded = unfolded.view(batch_size, channels, -1, output_height, output_width)
    output, _ = torch.max(unfolded, dim=2)
    return output
# """

"""
def max_pool2d(input, kernel_size, stride=1, padding=0):
    output = torch.nn.functional.max_pool2d(input, kernel_size, stride=stride, padding=padding)
    return output
"""

'\ndef max_pool2d(input, kernel_size, stride=1, padding=0):\n    output = torch.nn.functional.max_pool2d(input, kernel_size, stride=stride, padding=padding)\n    return output\n'

In [8]:
def dropout(X, p):
    return X * (torch.rand_like(X) > p).float() / (1 - p)

In [9]:
def relu(x):
    return torch.max(torch.zeros_like(x), x)

def leaky_reLu(x, negative_slope=0.01):
    return torch.where(x >= 0, x, negative_slope * x)

In [10]:
def sigmoid(x):
    return 1 / (1 + torch.exp(-x))

In [11]:
# """
# 全局平均汇聚层朴素实现
def AdaptiveAvgPool2d(x, output_size):
    batch_size, channels, height, width = x.size()
    output_h, output_w = output_size
    stride_h = height // output_h
    stride_w = width // output_w
    output = []
    for i in range(output_h):
        for j in range(output_w):
            h_start = i * stride_h
            h_end = min(h_start + stride_h, height)
            w_start = j * stride_w
            w_end = min(w_start + stride_w, width)
            pool_region = x[:, :, h_start:h_end, w_start:w_end]
            pool_avg = torch.mean(pool_region, dim=(2, 3))
            output.append(pool_avg)
    output = torch.stack(output, dim=2)
    output = output.view(batch_size, channels, output_h, output_w)

    return output
# """

"""
def AdaptiveAvgPool2d(x, output_size):
    return torch.nn.AdaptiveAvgPool2d(output_size)(x)
"""

'\ndef AdaptiveAvgPool2d(x, output_size):\n    return torch.nn.AdaptiveAvgPool2d(output_size)(x)\n'

In [12]:
import torchvision.models as tvmodel
resnet = tvmodel.resnet34(pretrained=True)
resnet_out_channel = resnet.fc.in_features
resnet = nn.Sequential(*list(resnet.children())[:-2])
GL_CLASSES = ['person', 'bird', 'cat', 'cow', 'dog', 'horse', 'sheep',
           'aeroplane', 'bicycle', 'boat', 'bus', 'car', 'motorbike', 'train',
           'bottle', 'chair', 'diningtable', 'pottedplant', 'sofa', 'tvmonitor']
GL_NUMBBOX = 2
GL_NUMGRID = 7



In [116]:
class YOLOv1Net:
    def __init__(self):
        self._init_params()
        
    def _init_params(self):
        """初始化参数"""
        def normal(shape):
            return torch.nn.init.xavier_uniform_(torch.empty(shape))
        """输出通道 * 输入通道 * 卷积核边长 * 卷积核边长"""
        """卷积层"""
        W_1 = normal((1024, resnet_out_channel, 3, 3)); b_1 = torch.zeros(1024, device=device)
        W_2 = normal((1024, 1024, 3, 3)); b_2 = torch.zeros(1024, device=device)
        W_3 = normal((1024, 1024, 3, 3)); b_3 = torch.zeros(1024, device=device)
        W_4 = normal((1024, 1024, 3, 3)); b_4 = torch.zeros(1024, device=device)
        convs = [W_1, b_1, W_2, b_2, W_3, b_3, W_4, b_4]
        W_1 = normal((GL_NUMGRID * GL_NUMGRID * 1024, 4096)); b_1 = torch.zeros(4096, device=device)
        W_2 = normal((4096, GL_NUMGRID * GL_NUMGRID * (5 * GL_NUMBBOX + len(GL_CLASSES)))); 
        b_2 = torch.zeros(GL_NUMGRID * GL_NUMGRID * (5 * GL_NUMBBOX + len(GL_CLASSES)), device=device)
        linears = [W_1, b_1, W_2, b_2]
        self.params = [convs, linears]; self.flt_params = []
        for params in [convs, linears]:
            for param in params:
                self.flt_params.append(param)
        for param in self.flt_params:
            param.requires_grad_(True)        

    def _forward(self, X):
        """推理函数"""
        convs, linears = self.params
        X = resnet(X)
        W_1, b_1, W_2, b_2, W_3, b_3, W_4, b_4 = convs
        X = relu(conv2d(X, W_1, b_1, padding=1))
        X = relu(conv2d(X, W_2, b_2, stride=2, padding=1))
        X = relu(conv2d(X, W_3, b_3, padding=1))
        X = relu(conv2d(X, W_4, b_4, padding=1))
        X = X.reshape(X.size(0), -1)
        W_1, b_1, W_2, b_2 = linears
        X = relu(X @ W_1 + b_1)
        X = sigmoid(X @ W_2 + b_2)
        Y = X.reshape(-1, (5 * GL_NUMBBOX + len(GL_CLASSES)), GL_NUMGRID, GL_NUMGRID)
        return Y
    
    def __call__(self, X):
        return self._forward(X)
    
    def update(self, X, y, lr):
        """更新函数"""
        y_hat = self._forward(X)
        l = self._loss(y_hat, y)
        l.mean().backward()
        self.grad_clipping(1)
        """sgd"""
        with torch.no_grad():
            for param in self.flt_params:
                param -= lr * param.grad / X.shape[0]
                param.grad.zero_()
        return l
            
    def _loss(self, pred, labels):
        """
        pred: (batchsize, 30, 7, 7) 的网络输出数据
        labels: (batchsize, 30, 7, 7) 的样本标签数据
        return: 当前批次样本的平均损失
        """
        num_gridx, num_gridy = labels.size()[-2:]  
        num_b = 2  # 每个网格的 bbox 数量
        num_cls = 20  # 类别数量
        noobj_confi_loss = 0.  # 不含目标的网格损失(只有置信度损失)
        coor_loss = 0.  # 含有目标的bbox的坐标损失
        obj_confi_loss = 0.  # 含有目标的bbox的置信度损失
        class_loss = 0.  # 含有目标的网格的类别损失
        n_batch = labels.size()[0]  # batchsize的大小
        
        def calculate_iou(bbox1, bbox2):
            """计算 bbox1 = (x1, y1, x2, y2) 和 bbox2 = (x3, y3, x4, y4) 两个 bbox 的 iou"""
            x1, y1, x2, y2 = bbox1
            x3, y3, x4, y4 = bbox2
            intersect_width = max(0, min(x2, x4) - max(x1, x3))
            intersect_height = max(0, min(y2, y4) - max(y1, y3))
            intersection_area = intersect_width * intersect_height
            area1 = (x2 - x1) * (y2 - y1)
            area2 = (x4 - x3) * (y4 - y3)
            iou = intersection_area / (area1 + area2 - intersection_area) if intersection_area > 0 else 0
            return iou
        for i in range(n_batch): 
            for n in range(7):
                for m in range(7):
                    if labels[i, 4, m, n] == 1: # 如果包含物体
                        bbox1_pred_xyxy = ((pred[i, 0, m, n] + n) / num_gridx - pred[i, 2, m, n] / 2,
                                           (pred[i, 1, m, n] + m) / num_gridy - pred[i, 3, m, n] / 2,
                                           (pred[i, 0, m, n] + n) / num_gridx + pred[i, 2, m, n] / 2,
                                           (pred[i, 1, m, n] + m) / num_gridy + pred[i, 3, m, n] / 2)
                        bbox2_pred_xyxy = ((pred[i, 5, m, n] + n) / num_gridx - pred[i, 7, m, n] / 2,
                                           (pred[i, 6, m, n] + m) / num_gridy - pred[i, 8, m, n] / 2,
                                           (pred[i, 5, m, n] + n) / num_gridx + pred[i, 7, m, n] / 2,
                                           (pred[i, 6, m, n] + m) / num_gridy + pred[i, 8, m, n] / 2)
                        bbox_gt_xyxy = ((labels[i, 0, m, n] + n) / num_gridx - labels[i, 2, m, n] / 2,
                                        (labels[i, 1, m, n] + m) / num_gridy - labels[i, 3, m, n] / 2,
                                        (labels[i, 0, m, n] + n) / num_gridx + labels[i, 2, m, n] / 2,
                                        (labels[i, 1, m, n] + m) / num_gridy + labels[i, 3, m, n] / 2)
                        iou1 = calculate_iou(bbox1_pred_xyxy, bbox_gt_xyxy)
                        iou2 = calculate_iou(bbox2_pred_xyxy, bbox_gt_xyxy)
                        coor_loss += 5 * (torch.sum((pred[i, [0, 1], m, n] - labels[i, [0, 1], m, n]) ** 2)
                                          + torch.sum((pred[i, [2, 3], m, n].sqrt() - 
                                                       labels[i, [2, 3], m, n].sqrt()) ** 2))
                        obj_confi_loss += (pred[i, 4, m, n] - iou1) ** 2 \
                                        if iou1 >= iou2 else (pred[i, 9, m, n] - iou2) ** 2
                        noobj_confi_loss += 0.5 * ((pred[i, 9, m, n] - iou2) ** 2 \
                                        if iou1 >= iou2 else (pred[i, 4, m, n] - iou1) ** 2)
                        class_loss += torch.sum((pred[i, 10:, m, n] - labels[i, 10:, m, n]) ** 2)
                    else:
                        noobj_confi_loss += 0.5 * torch.sum(pred[i, [4, 9], m, n] ** 2)
        loss = coor_loss + obj_confi_loss + noobj_confi_loss + class_loss
        return loss / n_batch
    
    def grad_clipping(self, theta):
        norm = torch.sqrt(sum([torch.sum(p ** 2) for p in self.flt_params]))
        if norm > theta:
            for param in self.flt_params:
                param.grad[:] *= theta / norm

In [117]:
from torch.utils.data import Dataset, DataLoader
import numpy as np
import os
import random
from PIL import Image
import torchvision.transforms as transforms

In [118]:
class MyDataset(Dataset):
    def __init__(self, dataset_dir, seed=None, mode="train", train_val_ratio=0.9, trans=None):
        random.seed(seed)
        self.dataset_dir = dataset_dir
        self.mode = mode
        img_list_txt = os.path.join(dataset_dir, mode.replace("val", "train") + ".txt")
        label_csv = os.path.join(dataset_dir, mode.replace("val", "train") + ".csv")
        self.img_list = open(img_list_txt).read().splitlines()
        self.label = np.loadtxt(label_csv, dtype=np.float32)
        self.num_all_data = len(self.img_list)
        all_ids = list(range(self.num_all_data))
        num_train = int(train_val_ratio * self.num_all_data)
        self.use_ids = all_ids[:num_train] if mode == "train" else all_ids[num_train:]
        self.trans = trans

    def __len__(self):
        return len(self.use_ids)

    def __getitem__(self, item):
        id = self.use_ids[item]
        label = torch.tensor(self.label[id, :])
        img_path = self.img_list[id]
        img = Image.open(img_path)
        if self.trans is None:
            trans = transforms.Compose([transforms.ToTensor(),])
        else:
            trans = self.trans
        img = trans(img)
        return img, label


In [119]:
epoch, batch_size, lr, num_epochs = 50, 5, 0.1, 1
dataset_dir = "../DataSet/VOCdevkit/VOC2012/voc2012_forYolov1/"
dataset = MyDataset(dataset_dir)
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
len(train_dataloader)

1703

In [120]:
net = YOLOv1Net()

In [121]:
for epoch in range(num_epochs):
    metrics = [0, 0]
    for i, (X, y) in enumerate(train_dataloader):
        y = y.view(batch_size, GL_NUMGRID, GL_NUMGRID, -1)
        y = y.permute(0, 3, 1, 2)
        l = net.update(X, y, lr=lr)
        metrics[0] += l * batch_size; metrics[1] += batch_size
        print('i %d loss %f' % (i + 1, metrics[0] / metrics[1]))
    print('epoch %d loss %f' % (epoch + 1, metrics[0] / metrics[1]))

i 1 loss 18.734579
i 2 loss 18.448498
i 3 loss 17.891035
i 4 loss 17.330986
i 5 loss 17.035566
i 6 loss 16.567728
i 7 loss 16.269590
i 8 loss 15.972232
i 9 loss 15.714748
i 10 loss 15.333399
i 11 loss 15.174784
i 12 loss 14.869288
i 13 loss 14.533778
i 14 loss 14.218907
i 15 loss 14.013656


KeyboardInterrupt: 

In [51]:
weight_path = '../YOLOv1/weights/net.pkl'
torch.save(net, weight_path)

In [53]:
net = torch.load(weight_path)

In [122]:
def nms_1cls(dets, thresh):
    """
    单类别NMS
    dets: ndarray, nx5, dets[i, 0:4] 分别是 bbox 坐标；dets[i, 4] 是置信度 score
    thresh: NMS 算法设置的 iou 阈值
    """
    x1, y1, x2, y2, scores = dets[:, 0], dets[:, 1], dets[:, 2], dets[:, 3], dets[:, 4]
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort()[::-1]
    keep = []

    while order.size > 0:
        i = order[0]
        keep.append(i)
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])

        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        iou = inter / (areas[i] + areas[order[1:]] - inter)

        inds = np.where(iou <= thresh)[0]
        order = order[inds + 1]
    return keep

def nms_multi_cls(dets, thresh, n_cls):
    """
    多类别的NMS算法
    dets: ndarray, nx6, dets[i, 0:4] 是 bbox 坐标；dets[i, 4] 是置信度 score；dets[i, 5] 是类别序号；
    thresh: NMS 算法的阈值；
    n_cls: 是类别总数
    """
    keeps_index = []
    for i in range(n_cls):
        order_i = np.where(dets[:, 5] == i)[0]
        det = dets[dets[:, 5] == i, 0:5]
        if det.shape[0] == 0:
            keeps_index.append([])
            continue
        keep = nms_1cls(det, thresh)
        keeps_index.append(order_i[keep])
    return keeps_index

In [138]:
def labels2bbox(matrix):
    """
    将网络输出的 7*7*30 的数据转换为 bbox 的 (98, 25) 的格式，然后再将 NMS 处理后的结果返回
    matrix: 注意，输入的数据中，bbox坐标的格式是 (px,py,w,h)，需要转换为 (x1,y1,x2,y2) 的格式再输入NMS
    return: 返回NMS处理后的结果,bboxes.shape = (-1, 6), 0:4 是(x1,y1,x2,y2), 4是conf， 5是cls
    """
    if matrix.size()[0:2] != (7,7):
        raise ValueError("Error: Wrong labels size: ", matrix.size(), " != (7,7)")
    matrix = matrix.numpy()
    bboxes = np.zeros((98, 6))
    matrix = matrix.reshape(49, -1)
    bbox = matrix[:, :10].reshape(98, 5)
    r_grid = np.array(list(range(7)))
    r_grid = np.repeat(r_grid, repeats=14, axis=0)
    c_grid = np.array(list(range(7)))
    c_grid = np.repeat(c_grid, repeats=2, axis=0)[np.newaxis, :]
    c_grid = np.repeat(c_grid, repeats=7, axis=0).reshape(-1)
    bboxes[:, 0] = np.maximum((bbox[:, 0] + c_grid) / 7.0 - bbox[:, 2] / 2.0, 0)
    bboxes[:, 1] = np.maximum((bbox[:, 1] + r_grid) / 7.0 - bbox[:, 3] / 2.0, 0)
    bboxes[:, 2] = np.minimum((bbox[:, 0] + c_grid) / 7.0 + bbox[:, 2] / 2.0, 1)
    bboxes[:, 3] = np.minimum((bbox[:, 1] + r_grid) / 7.0 + bbox[:, 3] / 2.0, 1)
    bboxes[:, 4] = bbox[:, 4]
    cls = np.argmax(matrix[:, 10:], axis=1)
    cls = np.repeat(cls, repeats=2, axis=0)
    bboxes[:, 5] = cls
    keepid = nms_multi_cls(bboxes, thresh=0.01, n_cls=20)
    ids = []
    for x in keepid:
        ids = ids + list(x)
    ids = sorted(ids)
    return bboxes[ids, :]

In [139]:
def draw_bbox(img, bbox):
    """
    根据bbox的信息在图像上绘制 bounding box
    :param img: 绘制bbox的图像
    :param bbox: 是(n,6)的尺寸，0:4是(x1,y1,x2,y2), 4是conf， 5是cls
    """
    h, w = img.shape[0:2]
    n = bbox.shape[0]
    for i in range(n):
        confidence = bbox[i, 4]
        if confidence<0.2:
            continue
        p1 = (int(w * bbox[i, 0]), int(h * bbox[i, 1]))
        p2 = (int(w * bbox[i, 2]), int(h * bbox[i, 3]))
        cls_name = GL_CLASSES[int(bbox[i, 5])]
        print(cls_name, p1, p2)
        cv2.rectangle(img, p1, p2, COLOR[int(bbox[i, 5])])
        cv2.putText(img, cls_name, p1, cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255))
        cv2.putText(img, str(confidence), (p1[0],p1[1]-10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255))
    cv2.imshow("bbox", img)
    cv2.waitKey(0)

In [140]:
import cv2
COLOR = [(255,0,0),(255,125,0),(255,255,0),(255,0,125),(255,0,250),
         (255,125,125),(255,125,250),(125,125,0),(0,255,125),(255,0,0),
         (0,0,255),(125,0,255),(0,125,255),(0,255,255),(125,125,255),
         (0,255,0),(125,255,125),(255,255,255),(100,100,100),(0,0,0),]  # 用来标识20个类别的bbox颜色，可自行设定
test_image_dir = './Test_Images/'
img_list = os.listdir(test_image_dir)
trans = transforms.Compose([transforms.ToTensor(),])
for img_name in img_list:
    img_path = os.path.join(test_image_dir, img_name)
    img = Image.open(img_path).convert('RGB')
    img = trans(img)
    img = torch.unsqueeze(img, dim=0)
    print(img_name, img.shape)
    preds = torch.squeeze(net(img), dim=0).detach().cpu()
    preds = preds.permute(1,2,0)
    bbox = labels2bbox(preds)
    draw_img = cv2.imread(img_path)
    draw_bbox(draw_img, bbox)

2007_000027.jpg torch.Size([1, 3, 448, 448])
person (0, 0) (116, 174)
horse (0, 0) (222, 184)
cat (235, 0) (448, 144)
bicycle (330, 0) (448, 167)
bird (0, 0) (238, 261)
sheep (303, 0) (448, 205)
dog (200, 62) (380, 254)
diningtable (0, 121) (100, 299)
train (160, 62) (422, 359)
bus (63, 124) (276, 438)
boat (95, 158) (334, 400)
chair (160, 180) (414, 391)
pottedplant (64, 222) (238, 448)
sofa (0, 337) (138, 448)
tvmonitor (12, 324) (184, 448)
bottle (105, 350) (318, 448)
car (130, 342) (448, 448)
cow (235, 302) (448, 448)
