# Build the dataset

In [9]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from yaml import safe_load
from os.path import join, exists, getsize
from os import listdir
from PIL import Image

def collate_fn(data):
    """Merge a list of ``(image, label)`` samples into one batch.

    Column 0 of every per-image label tensor is overwritten in place with
    the position of that image inside the batch, so each row of the
    concatenated label tensor still identifies the image it belongs to
    (this is the index later consumed by target building / grouping).

    Args:
        data: Sequence of ``(image_tensor, label_tensor)`` pairs. Each label
            tensor is 2-D and reserves column 0 for the batch index.

    Returns:
        Tuple ``(images, targets)``: the images stacked along a new leading
        dimension and all label rows concatenated along dimension 0.
    """
    images, targets = zip(*data)
    assert len(images) == len(targets)

    # Tag every label row with the index of its image within this batch.
    for batch_index, target in enumerate(targets):
        target[:, 0] = batch_index

    merged_targets = torch.cat(targets, 0)
    batched_images = torch.stack(images)
    return batched_images, merged_targets


class URPC2020_DataSet(Dataset):
    """Detection dataset in YOLO text-label format, described by a data YAML.

    The YAML file must provide ``nc``, ``path``, ``names`` and the image
    sub-directory of the requested split (``train`` / ``val`` / ``test``).
    Labels are read from ``<path>/<type>/labels``; every label file holds one
    ``cls x y w h`` row per object. Empty label files are skipped.

    Args:
        data_yaml_path: Path to the dataset YAML file.
        type: Split name. ``"train"`` and ``"val"`` select the matching YAML
            entry; any other value selects the ``"test"`` entry. The value is
            also used verbatim as the directory name under ``path`` that
            holds the ``labels`` folder.
        transforms: Optional callable applied to the RGB ``PIL.Image``.
            When ``None`` the image is returned untransformed.
        image_ext: Extension (with or without a leading dot) of the image
            that corresponds to each label file. Defaults to ``"jpeg"``.
    """

    def __init__(self, data_yaml_path="hg.yaml", type="val", transforms=None, image_ext="jpeg"):
        self.data_yaml_path = data_yaml_path
        self.transforms = transforms
        self.image_ext = image_ext.lstrip(".")
        with open(self.data_yaml_path, 'r') as file:
            self.data_dict = safe_load(file)
        self.nc = self.data_dict['nc']
        self.root_path = self.data_dict["path"]
        self.classes = self.data_dict["names"]

        # Anything that is not "train"/"val" falls back to the "test" entry.
        split_key = type if type in ("train", "val") else "test"
        self.images_path = join(self.root_path, self.data_dict[split_key])

        self.labels_path = f"{self.root_path}/{type}/labels"
        self.labels = self.filter_labels(listdir(self.labels_path))

    def filter_labels(self, labels):
        """Return the label file names whose file on disk is not empty."""
        return [label for label in labels if getsize(join(self.labels_path, label)) != 0]

    def __len__(self):
        return len(self.labels)

    @staticmethod
    def _read_label_file(label_path):
        """Parse one label file into a ``(n, 6)`` float32 tensor.

        Each row is ``[0, cls, x, y, w, h]``; column 0 is a placeholder that
        the collate function later replaces with the batch index. Blank lines
        are ignored and fields may be separated by any run of whitespace.
        Only the first five fields of a line are used. A file without any
        usable line yields a ``(0, 6)`` tensor so it can still be concatenated.
        """
        rows = []
        with open(label_path, "r") as file:
            for line in file:
                fields = line.split()
                if not fields:
                    continue  # tolerate blank / trailing empty lines
                cls_id, x, y, w, h = fields[:5]
                rows.append([0.0, float(int(cls_id)), float(x), float(y), float(w), float(h)])
        if not rows:
            return torch.zeros((0, 6), dtype=torch.float32)
        return torch.tensor(rows, dtype=torch.float32)

    def __getitem__(self, index):
        """Return ``(image, labels)`` for the sample at ``index``."""
        label_name = self.labels[index]
        # Swap only the final extension; file stems may themselves contain dots.
        stem = label_name.rsplit(".", 1)[0]
        image_path = f"{self.images_path}/{stem}.{self.image_ext}"
        label_path = f"{self.labels_path}/{label_name}"

        img = Image.open(image_path).convert("RGB")
        if self.transforms is not None:
            img = self.transforms(img)
        return img, self._read_label_file(label_path)


def group_adjust(labels, batch_size):
    """Split a collated label tensor back into one tensor per image.

    Args:
        labels: 2-D tensor whose column 0 holds the index of the image each
            row belongs to (as written by ``collate_fn``).
        batch_size: Number of images in the batch; row indices must be
            smaller than this value.

    Returns:
        List of ``batch_size`` tensors. Entry ``i`` stacks the rows of image
        ``i`` with the index column removed. Images without any row get an
        empty ``(0, columns - 1)`` tensor, so every entry is always a tensor.
    """
    buckets = [[] for _ in range(batch_size)]
    for row in labels:
        buckets[int(row[0].item())].append(row[1:])

    # torch.stack rejects an empty list, so images without rows need an
    # explicit empty placeholder of matching width.
    if isinstance(labels, torch.Tensor) and labels.dim() > 1:
        empty = labels.new_zeros((0, max(labels.shape[-1] - 1, 0)))
    else:
        empty = torch.zeros((0, 5))

    return [torch.stack(rows) if rows else empty.clone() for rows in buckets]


def build_dataset(yaml_path="hg.yaml", batch_size=16, data_transform=None,
                  split="val", shuffle=True, num_workers=4):
    """Create a ``DataLoader`` over ``URPC2020_DataSet``.

    Args:
        yaml_path: Path of the dataset YAML file.
        batch_size: Number of samples per batch.
        data_transform: Callable applied to each PIL image. When ``None`` the
            image is converted to a tensor and resized to 640x640.
        split: Dataset split forwarded as the dataset's ``type`` argument.
            Defaults to ``"val"``, the split the dataset itself defaults to.
        shuffle: Whether the loader reshuffles samples every epoch.
        num_workers: Number of loader worker processes.

    Returns:
        A ``DataLoader`` yielding ``(images, labels)`` batches assembled by
        ``collate_fn``.
    """
    if data_transform is None:
        data_transform = transforms.Compose([
            transforms.ToTensor(),
            # Box labels are stored as ratios of the image size, so resizing
            # the image does not require touching the labels.
            transforms.Resize((640, 640), antialias=True),
        ])

    dataset = URPC2020_DataSet(yaml_path, type=split, transforms=data_transform)

    return DataLoader(dataset,
                      batch_size=batch_size,
                      num_workers=num_workers,
                      shuffle=shuffle,
                      pin_memory=True,
                      collate_fn=collate_fn)

In [10]:
# Smoke test: build the loader with default settings and print the shapes of
# the first few batches.
# data_loader = build_dataset(yaml_path="/home/szt/projects/ultralytics/urpc2020.yaml")
data_loader = build_dataset()
for i, (imgs, labels) in enumerate(data_loader):
    # print(imgs[0] * 255)
    print(imgs.size())    # stacked image batch
    print(labels.size())  # all label rows of the batch, concatenated
    print(labels[0])      # first row: batch index, class, x, y, w, h
    if i > 2: break  # stop after four batches (i = 0..3)

torch.Size([16, 3, 640, 640])
torch.Size([18, 6])
tensor([0.0000, 1.0000, 0.8356, 0.4021, 0.0234, 0.0309])
torch.Size([16, 3, 640, 640])
torch.Size([18, 6])
tensor([0.0000, 1.0000, 0.2440, 0.4335, 0.0316, 0.0382])
torch.Size([16, 3, 640, 640])
torch.Size([16, 6])
tensor([0.0000, 1.0000, 0.7019, 0.5897, 0.0340, 0.0421])
torch.Size([16, 3, 640, 640])
torch.Size([16, 6])
tensor([0.0000, 1.0000, 0.4841, 0.8573, 0.0328, 0.0509])


# build a network

In [11]:
import torch
from torch import nn

class Conv(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=3, stride=2, padding=1, bias=False, groups=1): # default half the size
        super(Conv, self).__init__()
        # (w-k+2p)/s + 1
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, bias=bias, groups=groups)
        self.bn = nn.BatchNorm2d(out_channels)
        self.act = nn.LeakyReLU()

    def forward(self, x):
        return self.act(self.bn(self.conv(x)))

class ResidualBlock(nn.Module):
    """Stack of bottleneck units with optional skip connections.

    Each unit is a 1x1 ``Conv`` that halves the channel count, a 3x3 ``Conv``
    that restores it, and a trailing LeakyReLU. All convolutions use stride 1.

    Args:
        channels: Number of input (and output) channels.
        use_residual: When true, each unit's output is added to its input.
        num_repeats: Number of units applied one after another.
    """

    def __init__(self, channels, use_residual=True, num_repeats=3):
        super().__init__()
        units = []
        for _ in range(num_repeats):
            units.append(nn.Sequential(
                Conv(channels, channels // 2, kernel_size=1, stride=1, padding=0),
                Conv(channels // 2, channels, kernel_size=3, stride=1, padding=1),
                nn.LeakyReLU(),
            ))
        self.layers = nn.ModuleList(units)

        self.use_residual = use_residual
        self.num_repeats = num_repeats

    def forward(self, x):
        """Run every unit in order, adding the skip path when enabled."""
        if not self.use_residual:
            for unit in self.layers:
                x = unit(x)
            return x

        for unit in self.layers:
            x = x + unit(x)
        return x

class Head(nn.Module):
    """Detection head with separate box and class branches.

    Both branches are a stride-1 3x3 ``Conv`` that halves the channels,
    followed by a 1x1 convolution and a sigmoid.

    Args:
        in_channels: Channels of the incoming feature map.
        num_classes: Output channels of the class branch.
        num_anchors: The box branch emits ``num_anchors * 5`` channels.
    """

    def __init__(self, in_channels, num_classes=4, num_anchors=3):
        super().__init__()
        hidden = in_channels // 2

        # Box branch: 5 values per anchor.
        self.cv1 = nn.Sequential(
            Conv(in_channels, hidden, stride=1),
            nn.Conv2d(hidden, num_anchors * 5, kernel_size=1),
            nn.Sigmoid(),
        )
        # Class branch: one score per class.
        self.cv2 = nn.Sequential(
            Conv(in_channels, hidden, stride=1),
            nn.Conv2d(hidden, num_classes, kernel_size=1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Return ``(pred_bbox, pred_cls)`` computed from the same features."""
        box_out = self.cv1(x)
        cls_out = self.cv2(x)
        return box_out, cls_out

class MyYolo(nn.Module):
    """Small single-scale detector: five stride-2 ``Conv`` stages and a head.

    Args:
        in_channels: Channels of the input image.
        num_classes: Number of classes predicted by the head.
        num_anchors: Number of anchors predicted per grid cell.
    """

    def __init__(self, in_channels=3, num_classes=4, num_anchors=3):
        super().__init__()

        # Stage specs. A tuple starting with "R" builds a ResidualBlock, one
        # starting with "H" builds the Head, anything else is a
        # (in_channels, out_channels) pair for a default (stride-2) Conv.
        # No residual stage is currently listed. Sizes in the comments are
        # for a 640x640 input.
        self.architecture = [
            (in_channels, 64),   # 320
            (64, 128),           # 160
            (128, 256),          # 80
            (256, 512),          # 40
            (512, 256),          # 20
            ("H", 256, num_classes, num_anchors),  # bbox, cls
        ]

        stages = []
        for spec in self.architecture:
            stages.append(self._make_layers(spec))
        self.layers = nn.ModuleList(stages)

    def _make_layers(self, param):
        """Build the module described by one entry of ``self.architecture``."""
        kind = param[0]
        if kind == "R":
            return ResidualBlock(param[1])
        if kind == "H":
            return Head(param[1], param[2], param[3])
        return Conv(param[0], param[1])

    def forward(self, x):
        """Feed ``x`` through every stage; the final stage's output is returned."""
        for stage in self.layers:
            x = stage(x)
        return x

# Instantiate the network with a single anchor per cell and run a random
# batch through it to check the output shapes of the two head branches.
model = MyYolo(num_anchors=1)
x = torch.randn(16, 3, 640, 640)
out = model(x)  # tuple: (box branch output, class branch output)
print(out[0].shape, out[1].shape)

torch.Size([16, 5, 20, 20]) torch.Size([16, 4, 20, 20])


In [12]:
# Export the model to ONNX, using the random batch `x` as the example input.
torch.onnx.export(model, x, "model.onnx")
# Show the box-branch output of the first sample.
print(out[0][0])

tensor([[[0.6075, 0.4703, 0.6192,  ..., 0.7241, 0.6771, 0.6529],
         [0.6089, 0.4849, 0.4578,  ..., 0.3859, 0.5285, 0.6075],
         [0.5780, 0.4753, 0.4189,  ..., 0.6317, 0.5106, 0.5558],
         ...,
         [0.6844, 0.4472, 0.5856,  ..., 0.5703, 0.5543, 0.4631],
         [0.6090, 0.5100, 0.4512,  ..., 0.5842, 0.6372, 0.5784],
         [0.6323, 0.5441, 0.4877,  ..., 0.3968, 0.5896, 0.6099]],

        [[0.5469, 0.4532, 0.6111,  ..., 0.5280, 0.4268, 0.4072],
         [0.5500, 0.5316, 0.5149,  ..., 0.5007, 0.5976, 0.4753],
         [0.5883, 0.4758, 0.4316,  ..., 0.5264, 0.7196, 0.5177],
         ...,
         [0.5940, 0.5914, 0.6142,  ..., 0.6765, 0.6312, 0.5553],
         [0.5340, 0.6161, 0.5746,  ..., 0.4790, 0.4905, 0.3956],
         [0.5785, 0.5229, 0.5069,  ..., 0.5929, 0.6472, 0.4688]],

        [[0.5237, 0.5032, 0.4582,  ..., 0.4228, 0.5642, 0.5198],
         [0.5171, 0.4279, 0.4945,  ..., 0.6745, 0.4865, 0.4765],
         [0.5206, 0.5412, 0.5162,  ..., 0.3948, 0.4515, 0.

In [13]:
# Build the data loader that the training loop iterates over.
batch_size = 16
# data_loader = build_dataset(yaml_path="/home/szt/projects/ultralytics/urpc2020.yaml", batch_size=batch_size, data_transform=None)
# data_transform=None selects build_dataset's default transform.
data_loader = build_dataset(batch_size=batch_size, data_transform=None)

# utils functions

In [14]:
# copy from yolov8

import numpy as np
def clip_boxes(boxes, shape):
    """
    Clamp corner-format boxes (x1, y1, x2, y2) to an image of the given shape.

    The input is modified in place and also returned.

    Args:
        boxes (torch.Tensor | numpy.ndarray): boxes whose last dimension holds x1, y1, x2, y2
        shape (tuple): image shape as (height, width)

    Returns:
        (torch.Tensor | numpy.ndarray): the same object, with x limited to [0, width] and y to [0, height]
    """
    height, width = shape[0], shape[1]

    if not isinstance(boxes, torch.Tensor):
        # numpy: clip both x columns and both y columns in one go
        boxes[..., [0, 2]] = boxes[..., [0, 2]].clip(0, width)  # x1, x2
        boxes[..., [1, 3]] = boxes[..., [1, 3]].clip(0, height)  # y1, y2
        return boxes

    # torch: clamp each column separately with the out-of-place clamp
    for column, upper in enumerate((width, height, width, height)):  # x1, y1, x2, y2
        boxes[..., column] = boxes[..., column].clamp(0, upper)
    return boxes

def xyxy2xywh(x):
    """
    Turn corner-format boxes (x1, y1, x2, y2) into centre-format boxes (x, y, width, height).

    (x1, y1) is the top-left corner and (x2, y2) the bottom-right corner. The input is left untouched.

    Args:
        x (np.ndarray | torch.Tensor): boxes with a last dimension of size 4 in (x1, y1, x2, y2) order.

    Returns:
        y (np.ndarray | torch.Tensor): new array/tensor of the same type and shape in (x, y, width, height) order.
    """
    assert x.shape[-1] == 4, f'input shape last dimension expected 4 but input shape is {x.shape}'
    if isinstance(x, torch.Tensor):
        converted = torch.empty_like(x)
    else:
        converted = np.empty_like(x)

    left, top, right, bottom = x[..., 0], x[..., 1], x[..., 2], x[..., 3]
    converted[..., 0] = (left + right) / 2  # x center
    converted[..., 1] = (top + bottom) / 2  # y center
    converted[..., 2] = right - left  # width
    converted[..., 3] = bottom - top  # height
    return converted


def xywh2xyxy(x):
    """
    Turn centre-format boxes (x, y, width, height) into corner-format boxes (x1, y1, x2, y2).

    (x1, y1) is the top-left corner and (x2, y2) the bottom-right corner. The input is left untouched.

    Args:
        x (np.ndarray | torch.Tensor): boxes with a last dimension of size 4 in (x, y, width, height) order.

    Returns:
        y (np.ndarray | torch.Tensor): new array/tensor of the same type and shape in (x1, y1, x2, y2) order.
    """
    assert x.shape[-1] == 4, f'input shape last dimension expected 4 but input shape is {x.shape}'
    if isinstance(x, torch.Tensor):
        corners = torch.empty_like(x)
    else:
        corners = np.empty_like(x)

    center_x, center_y = x[..., 0], x[..., 1]
    half_w = x[..., 2] / 2
    half_h = x[..., 3] / 2

    corners[..., 0] = center_x - half_w  # top left x
    corners[..., 1] = center_y - half_h  # top left y
    corners[..., 2] = center_x + half_w  # bottom right x
    corners[..., 3] = center_y + half_h  # bottom right y
    return corners

def xyxy2xywhn(x, w=640, h=640, clip=False, eps=0.0):
    """
    Turn corner-format boxes (x1, y1, x2, y2) into centre-format boxes (x, y, width, height) that are
    normalised by the image width and height.

    Args:
        x (np.ndarray | torch.Tensor): boxes with a last dimension of size 4 in (x1, y1, x2, y2) order.
        w (int): image width used for normalisation. Defaults to 640
        h (int): image height used for normalisation. Defaults to 640
        clip (bool): if True, the boxes are first passed to clip_boxes with limits (h - eps, w - eps).
            Defaults to False
        eps (float): margin subtracted from the image size when clipping. Defaults to 0.0

    Returns:
        y (np.ndarray | torch.Tensor): new array/tensor holding normalised (x, y, width, height)
    """
    if clip:
        x = clip_boxes(x, (h - eps, w - eps))
    assert x.shape[-1] == 4, f'input shape last dimension expected 4 but input shape is {x.shape}'

    if isinstance(x, torch.Tensor):
        normalised = torch.empty_like(x)
    else:
        normalised = np.empty_like(x)

    left, top, right, bottom = x[..., 0], x[..., 1], x[..., 2], x[..., 3]
    normalised[..., 0] = ((left + right) / 2) / w  # x center
    normalised[..., 1] = ((top + bottom) / 2) / h  # y center
    normalised[..., 2] = (right - left) / w  # width
    normalised[..., 3] = (bottom - top) / h  # height
    return normalised


# post process

In [25]:
import numpy as np
def bbox_iou(bbox1, bbox2, eps=1e-8):
    """
    Compute the Intersection over Union (IoU) of two centre-format boxes.

    Each box is an iterable starting with (x_center, y_center, width, height).
    Any further elements (for example a confidence score) are ignored, so
    both 4-element and 5-element boxes are accepted.

    :param bbox1: bounding box No.1.
    :param bbox2: bounding box No.2.
    :param eps: small constant added to the union area to avoid dividing by zero.
    :return: IoU of bbox1 and bbox2.
    """
    x1, y1, w1, h1, *_ = bbox1
    xmin1, ymin1 = x1 - w1 / 2.0, y1 - h1 / 2.0
    xmax1, ymax1 = x1 + w1 / 2.0, y1 + h1 / 2.0
    x2, y2, w2, h2, *_ = bbox2
    xmin2, ymin2 = x2 - w2 / 2.0, y2 - h2 / 2.0
    xmax2, ymax2 = x2 + w2 / 2.0, y2 + h2 / 2.0

    # Corners of the intersection rectangle
    xx1 = np.max([xmin1, xmin2])
    yy1 = np.max([ymin1, ymin2])
    xx2 = np.min([xmax1, xmax2])
    yy2 = np.min([ymax1, ymax2])

    # Intersection area; a negative extent means the boxes do not overlap
    w = np.max([0.0, xx2 - xx1])
    h = np.max([0.0, yy2 - yy1])
    area_intersection = w * h

    # Union area (subtract the overlap so it is not counted twice)
    area1 = (xmax1 - xmin1) * (ymax1 - ymin1)
    area2 = (xmax2 - xmin2) * (ymax2 - ymin2)
    area_union = area1 + area2 - area_intersection

    iou = area_intersection / (area_union + eps)
    return iou

def bboxes_iou(bbox1, bboxes):
    """Return the IoU between one reference box and every box in ``bboxes``.

    Args:
        bbox1: Reference box tensor in the format expected by ``bbox_iou``.
        bboxes: Iterable of box tensors to compare against the reference.

    Returns:
        1-D tensor holding one IoU value per entry of ``bboxes`` (empty when
        ``bboxes`` is empty).
    """
    # Convert the reference box once. `.cpu()` is a no-op for CPU tensors and
    # makes the numpy conversion valid for tensors living on another device.
    reference = bbox1.detach().cpu().numpy()
    ious = [bbox_iou(reference, candidate.detach().cpu().numpy()) for candidate in bboxes]
    return torch.tensor(ious)



# loss
# Criteria are created once at module level and reused by calculate_loss.
box_loss = nn.MSELoss()
cls_loss = nn.CrossEntropyLoss()


def calculate_loss(pred_bbox, pred_cls, gt_bboxes, gt_clss):
    """Return the sum of the box regression loss and the classification loss.

    Args:
        pred_bbox: Predicted boxes, compared with ``gt_bboxes`` by ``box_loss`` (MSE).
        pred_cls: Class predictions, compared with ``gt_clss`` by ``cls_loss`` (cross entropy).
        gt_bboxes: Target boxes.
        gt_clss: Target classes.

    Returns:
        Scalar tensor: ``box_loss(pred_bbox, gt_bboxes) + cls_loss(pred_cls, gt_clss)``.
    """
    regression_term = box_loss(pred_bbox, gt_bboxes)
    classification_term = cls_loss(pred_cls, gt_clss)
    return regression_term + classification_term


# assigner
def pred_gt_assigner(pred_boxes, gt_boxes, iou_threshold=0.5, max_num_negatives=3):
    """Assign to every ground-truth box the predictions that overlap it.

    Args:
        pred_boxes: Iterable of predicted boxes in the format accepted by
            ``bbox_iou``.
        gt_boxes: Sized iterable of ground-truth boxes in the same format.
        iou_threshold: A prediction is assigned when its IoU with the
            ground-truth box is strictly greater than this value.
        max_num_negatives: Currently unused; kept for interface compatibility.

    Returns:
        Dict mapping each ground-truth index to the list of indices of the
        predictions assigned to it (an empty list when none qualifies).
    """
    assigned = {gt_index: [] for gt_index in range(len(gt_boxes))}

    # Compare every ground-truth box with every predicted box.
    for gt_index, gt_box in enumerate(gt_boxes):
        for pred_index, pred_box in enumerate(pred_boxes):
            if bbox_iou(gt_box, pred_box) > iou_threshold:
                assigned[gt_index].append(pred_index)

    return assigned

# train

In [28]:

from torchvision.ops import nms
from torch.optim import SGD
from tqdm import tqdm
import time

# Train `model` on `data_loader` with a YOLOv1-style grid loss.
optimizer = SGD(model.parameters(), lr=0.01, weight_decay=0.005)  # stochastic gradient descent optimizer
epochs = 10  # train for 10 epochs

for epoch in tqdm(range(epochs)):
    start_time = time.time()

    loss_t = []  # per-batch loss values of this epoch
    for e, batch in enumerate(data_loader):
        features, labels = batch
        b_t = features.shape[0]  # actual batch size (the last batch may be smaller)
        labels = group_adjust(labels, b_t) # one entry per image; each row is (cls, x, y, w, h)
        # NOTE(review): group_adjust builds exactly b_t groups, so this guard
        # looks like it can never fire - confirm.
        if(len(labels) < b_t):
            continue

        # model outputs: b, 5, 20, 20 (boxes) and b, 4, 20, 20 (classes)
        pred_boxes, pred_classes = model(features) # box channels are treated as center offset, width, height, confidence
        # move the channel dimension last so one grid cell is indexed as [b, i, j, :]
        pred_boxes = pred_boxes.permute(0, 2, 3, 1)
        pred_classes = pred_classes.permute(0, 2, 3, 1)

        # dense targets: b, 20, 20, 5 and b, 20, 20, 4
        # NOTE(review): the 20x20 grid and the 4 classes are hard-coded here
        # and must match the model output for a 640x640 input.
        gt_boxes = torch.zeros(b_t, 20, 20, 5)
        gt_classes = torch.zeros(b_t, 20, 20, 4)
        for i, label in enumerate(labels):
            for j, lable_ in enumerate(label):
                # grid cell that contains the box center (labels are ratios in [0, 1])
                x_idx, y_idx = int(lable_[1] * 20), int(lable_[2] * 20)
                x_idx = np.clip(x_idx, 0, 19)  # a coordinate of exactly 1.0 would index cell 20
                y_idx = np.clip(y_idx, 0, 19)
                # NOTE(review): x indexes the first spatial axis and y the second;
                # the decoding code must use the same convention.
                gt_boxes[i, x_idx, y_idx, 0] = lable_[1] * 20.0 - x_idx  # center offset inside the cell
                gt_boxes[i, x_idx, y_idx, 1] = lable_[2] * 20.0 - y_idx
                gt_boxes[i, x_idx, y_idx, 2:4] = lable_[3:]  # width, height as image ratios
                gt_boxes[i, x_idx, y_idx, 4] = 1  # objectness target
                gt_classes[i, x_idx, y_idx, int(lable_[0])] = 1  # one-hot class target

        have_obg = (gt_boxes[..., 4] == 1)  # cells that contain an object
        no_obj = ~have_obg

        # print(gt_boxes.shape, gt_classes.shape)
        # print(have_obg.shape)

        # print(pred_boxes[0], gt_boxes[0])
        # coordinate loss: squared error on the center offsets plus squared error
        # on the square roots of width/height, counted for object cells only
        loss_coor = ((gt_boxes[..., :2] - pred_boxes[..., :2]) ** 2 \
                    + (torch.sqrt(pred_boxes[..., 2:4]) - torch.sqrt(gt_boxes[..., 2:4])) ** 2).sum(dim=-1) * have_obg

        # confidence loss over every cell
        loss_confidence = (gt_boxes[..., 4] - pred_boxes[..., 4]) ** 2
        # print(loss_coor.shape)

        # class loss: squared error on the class scores, object cells only
        loss_class = ((pred_classes - gt_classes) ** 2).sum(dim=-1) * have_obg
        # print(loss_class.shape)

        # the same confidence error again, restricted to cells without an object
        loss_noOb = (gt_boxes[..., 4] - pred_boxes[..., 4]) ** 2 * no_obj

        # NOTE(review): loss_confidence already covers empty cells, so they are
        # effectively weighted 1.1 here - confirm this is intended.
        loss = (1.0 * loss_coor + loss_confidence + loss_class + 0.1 * loss_noOb).mean()

        loss_t.append(loss.item())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    end_time = time.time()
    # report the epoch duration in seconds and the mean batch loss
    print("epoch: {epoch:4}, time: {time:^10.4f}, loss: {loss:^10.4f}".format(epoch=epoch, time=end_time-start_time, loss=np.mean(loss_t)))

# bbox1 = torch.tensor([100, 100, 50, 50])
# bbox2 = torch.tensor([75, 75, 50, 50])
# iou = bbox_iou(bbox1, bbox2)
# print(iou)

 10%|█         | 1/10 [08:09<1:13:23, 489.27s/it]

epoch:    0, time:  489.2695 , loss:   0.0893  


 20%|██        | 2/10 [16:16<1:05:06, 488.30s/it]

epoch:    1, time:  487.6144 , loss:   0.0270  


 30%|███       | 3/10 [24:24<56:55, 487.96s/it]  

epoch:    2, time:  487.5502 , loss:   0.0182  


 40%|████      | 4/10 [32:28<48:37, 486.26s/it]

epoch:    3, time:  483.6625 , loss:   0.0148  


 50%|█████     | 5/10 [40:29<40:22, 484.50s/it]

epoch:    4, time:  481.3634 , loss:   0.0129  


 60%|██████    | 6/10 [48:32<32:15, 483.83s/it]

epoch:    5, time:  482.5470 , loss:   0.0120  


 70%|███████   | 7/10 [56:34<24:09, 483.30s/it]

epoch:    6, time:  482.1927 , loss:   0.0112  


 80%|████████  | 8/10 [1:04:36<16:05, 482.94s/it]

epoch:    7, time:  482.1649 , loss:   0.0106  


 90%|█████████ | 9/10 [1:12:38<08:02, 482.76s/it]

epoch:    8, time:  482.3536 , loss:   0.0101  


100%|██████████| 10/10 [1:20:40<00:00, 484.04s/it]

epoch:    9, time:  481.6631 , loss:   0.0098  





In [29]:
# Save the whole model object (not just a state_dict) to disk.
# NOTE(review): loading this file back needs the model classes to be
# importable under the same names - confirm this is acceptable.
torch.save(model, "hg_model.pth") # save model

# predict and visualize

In [36]:
import cv2

# Load one sample twice: as a PIL image that is fed to the network and as an
# OpenCV (BGR) image that the detections are drawn on.
image_path = "/home/szt/datasets/hg_underwater_target/纯色曝光1000us距离1m正常水下不同角度位置橙色背景/1702521983.607.24.jpeg"
img = Image.open(image_path).convert("RGB")
# cv2.imread takes an IMREAD_* flag as second argument, not a colour
# conversion code; the default flag loads the 3-channel BGR image that
# cv2.imwrite expects.
original_img = cv2.imread(image_path)
original_img = cv2.resize(original_img, (640, 640))
data_transform = transforms.Compose([transforms.ToTensor(),
                                     # labels are ratios, so resizing needs no label change
                                     transforms.Resize((640, 640), antialias=True),
                                     ])
# NOTE(review): torch.load unpickles the whole model object; only load
# checkpoint files from a trusted source.
model = torch.load("./hg_model.pth")
# Inference mode: BatchNorm uses its running statistics and no autograd graph
# is recorded.
model.eval()
img = data_transform(img)
img = torch.unsqueeze(img, 0)
with torch.no_grad():
    out = model(img)
bboxes, clss = out

bt = bboxes.shape[0]
print(bboxes[0].shape, clss[0].shape)
# Channels last: one grid cell is addressed as box[i][j] / cs[i][j].
box = bboxes[0].permute(1,2,0)
cs = clss[0].permute(1,2,0)

# Decode every grid cell into a pixel-space box (cx, cy, w, h, confidence).
pred_boxes = torch.zeros_like(box)
pred_score = torch.zeros(20, 20, 1)
pred_class = torch.zeros(20, 20, 1)
for i in range(20):
    for j in range(20):
        # The first spatial index carries x and the second carries y, matching
        # how the training targets were written. Both centers are
        # (offset + cell index) / grid size, scaled to the 640-pixel image.
        x = (box[i][j][0] + i) / 20.0 * 640
        y = (box[i][j][1] + j) / 20.0 * 640
        w = box[i][j][2] * 640
        h = box[i][j][3] * 640
        confidence = box[i][j][4]
        pred_boxes[i, j, :] = torch.stack([x, y, w, h, confidence])
        score, cata = cs[i][j].max(dim=-1)  # best class score and its index
        pred_class[i, j, :] = cata
        pred_score[i, j, :] = score

# Flatten the grid: one row per cell.
pred_boxes = pred_boxes.view(-1, 5)
pred_class = pred_class.view(-1, 1)
pred_score = pred_score.view(-1, 1)
print(pred_class.shape, pred_boxes.shape)

torch.Size([5, 20, 20]) torch.Size([4, 20, 20])
torch.Size([400, 1]) torch.Size([400, 5])


In [40]:
# nms, non-maximum suppression
def nms_(pred_bbox, iou_threshold=0.5, score_threshold=0.3):
    """Class-agnostic non-maximum suppression on centre-format boxes.

    Args:
        pred_bbox: Tensor of shape ``(n, 5)`` whose rows are
            ``(cx, cy, w, h, confidence)``.
        iou_threshold: Candidates whose IoU with an already kept box is not
            below this value are suppressed.
        score_threshold: Boxes with a confidence below this value are never
            kept.

    Returns:
        1-D integer tensor with the indices of the kept rows, ordered by
        decreasing confidence. It is empty (``numel() == 0``) when no box
        passes the score threshold, so the result can always be used for
        indexing.
    """
    confidence = pred_bbox[..., 4]
    order = torch.argsort(confidence, descending=True)

    keep = []
    while order.size(0) > 0:
        best = order[0]
        # Sorted by confidence: once one box is too weak, all remaining are.
        if confidence[best] < score_threshold:
            break
        keep.append(best)

        rest = order[1:]
        if rest.size(0) == 0:
            break
        # Drop every remaining candidate that overlaps the kept box too much.
        iou = bboxes_iou(pred_bbox[best], pred_bbox[rest])
        order = rest[(iou < iou_threshold).to(rest.device)]

    if not keep:
        return torch.empty(0, dtype=torch.long, device=pred_bbox.device)
    return torch.stack(keep)

# Run NMS on the decoded boxes (IoU threshold 0.5, score threshold 0.05)
keep_indices = nms_(pred_boxes, 0.5, 0.05)
# print(keep_indices)
# print(pred_boxes[keep_indices], pred_class[keep_indices], pred_score[keep_indices])

# keep_indices holds the indices of the boxes that survived NMS;
# they select the final detections from pred_boxes / pred_class.
# Draw each kept box with its class index at the top-left corner.
for box, s in zip(pred_boxes[keep_indices], pred_class[keep_indices]):
    # box is (cx, cy, w, h, confidence) in pixels; convert to corner points for drawing
    cv2.putText(original_img, str(int(s.item())), (int(box[0] - box[2] / 2), int(box[1] - box[3] / 2)), cv2.FONT_ITALIC, 1.0, (0, 255, 0), 1)
    cv2.rectangle(original_img, (int(box[0] - box[2] / 2), int(box[1] - box[3] / 2)), (int(box[0] + box[2] / 2), int(box[1] + box[3] / 2)), (0, 255, 0), 1)
# Write the annotated image to disk.
cv2.imwrite("test.jpg", original_img)

True

In [42]:
import cv2
import numpy as np
# cv2.imread takes an IMREAD_* flag as second argument, not a colour
# conversion code; the default flag loads a 3-channel BGR image.
original_img = cv2.imread("/home/szt/datasets/hg_underwater_target/hard_samples_test/1702521478.434.3239.jpeg")
original_img = cv2.resize(original_img, (640, 640))

# Draw the ground-truth boxes of the first image of the first batch and save
# the result, as a visual check of the data pipeline.
for i, (imgs, labels) in enumerate(data_loader):
    # Size of the current batch (taken from this batch's images, not from a
    # variable left over by the training cell).
    b_t = imgs.shape[0]
    labels = group_adjust(labels, b_t) # one entry per image; each row is (cls, x, y, w, h)
    if(len(labels) < b_t):
        continue

    # b, 5, 20, 20; b, 4, 20, 20
    # pred_boxes, pred_classes = model(features) # first! consider output box as center and width, height
    # Convert the first image from a CHW float tensor in [0, 1] to an HWC
    # uint8 BGR array that OpenCV can draw on.
    img = imgs[0].permute(1,2,0)
    img = img.detach().cpu().numpy()
    img = img * 255
    img = img.astype(np.uint8)
    print(img.shape)
    img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

    for j, lable_ in enumerate(labels[0]):
        (c, x, y, w, h) = lable_
        # centre format -> corner points, still as ratios of the image size
        minx, miny = x - w / 2, y - h / 2
        maxx, maxy = x + w / 2, y + h / 2

        # scale the ratios to the 640x640 image
        minx, miny = minx.item() * 640, miny.item() * 640
        maxx, maxy = maxx.item() * 640, maxy.item() * 640
        cv2.rectangle(img, (int(minx), int(miny)), (int(maxx), int(maxy)), (255, 255, 0), 1)

    cv2.imwrite("test.jpg", img)

    break

(640, 640, 3)
