In [3]:
import sys
!{sys.executable} -m pip install import-ipynb



# YOLO目标检测训练数据加载器

将训练/验证用的数据（图片路径 + 边框信息）转换成模型可以使用的格式，支持图像增强、归一化、标签转换等操作，并可以批量打包数据用于训练。

In [4]:
import cv2
import numpy as np
import torch
from PIL import Image
from torch.utils.data.dataset import Dataset

import import_ipynb
from utils import cvtColor, preprocess_input


class YoloDataset(Dataset):
    """Dataset that turns YOLO annotation lines into (image, boxes) samples.

    Each annotation line has the form
    ``image_path x1,y1,x2,y2,class_id x1,y1,x2,y2,class_id ...``.
    Fields are split on whitespace, so image paths must not contain spaces.

    Args:
        annotation_lines: Sequence of annotation strings in the format above.
        input_shape: Network input size; index 0 is the height and index 1
            is the width.
        num_classes: Number of object classes (stored, not used in this class).
        train: When true, random augmentation is applied to every sample;
            otherwise the image is only letterboxed to ``input_shape``.
    """

    def __init__(self, annotation_lines, input_shape, num_classes, train):
        super().__init__()
        self.annotation_lines = annotation_lines
        self.input_shape = input_shape
        self.num_classes = num_classes
        self.length = len(self.annotation_lines)
        self.train = train

    def __len__(self):
        """Return the number of annotation lines."""
        return self.length

    def __getitem__(self, index):
        """Load one sample.

        Returns:
            A tuple ``(image, box)``. ``image`` is a channel-first array
            produced by ``preprocess_input`` (project helper; presumably
            pixel-value normalization -- confirm in ``utils``). ``box`` is a
            float32 array of shape ``(N, 5)`` holding
            ``[x_center, y_center, w, h, class_id]`` with the geometry
            normalized by the input width/height.
        """
        index = index % self.length
        # Random augmentation during training, plain letterboxing otherwise.
        image, box = self.get_random_data(
            self.annotation_lines[index], self.input_shape[0:2], random=self.train
        )
        image = np.transpose(preprocess_input(np.array(image, dtype=np.float32)), (2, 0, 1))
        box = self._to_normalized_cxcywh(box, self.input_shape)
        return image, box

    @staticmethod
    def _to_normalized_cxcywh(box, input_shape):
        """Convert pixel ``[x1, y1, x2, y2, cls]`` rows to normalized ``[cx, cy, w, h, cls]``.

        An input without any boxes is returned with shape ``(0, 5)`` so that
        the label array always has the same rank, whether the image had no
        annotations at all or every box was filtered out during augmentation.
        """
        box = np.array(box, dtype=np.float32)
        if len(box) == 0:
            return box.reshape(-1, 5)
        # Normalize x by the input width and y by the input height.
        box[:, [0, 2]] = box[:, [0, 2]] / input_shape[1]
        box[:, [1, 3]] = box[:, [1, 3]] / input_shape[0]
        # w = x2 - x1, h = y2 - y1
        box[:, 2:4] = box[:, 2:4] - box[:, 0:2]
        # x_center = x1 + w / 2, y_center = y1 + h / 2
        box[:, 0:2] = box[:, 0:2] + box[:, 2:4] / 2
        return box

    def rand(self, a=0, b=1):
        """Return a uniform random float in ``[a, b)``."""
        return np.random.rand() * (b - a) + a

    @staticmethod
    def _adjust_boxes(box, src_size, new_size, offset, canvas_size, flip=False):
        """Map boxes from the source image onto the padded output canvas.

        Args:
            box: Array of ``[x1, y1, x2, y2, class_id]`` rows; modified in place.
            src_size: ``(iw, ih)`` of the source image.
            new_size: ``(nw, nh)`` of the resized image.
            offset: ``(dx, dy)`` paste position of the resized image.
            canvas_size: ``(w, h)`` of the output canvas.
            flip: Whether the canvas was mirrored left-right after pasting.

        Returns:
            The shuffled, transformed and clipped boxes; boxes whose width or
            height is not greater than one pixel are discarded.
        """
        if len(box) == 0:
            return box
        iw, ih = src_size
        nw, nh = new_size
        dx, dy = offset
        w, h = canvas_size

        np.random.shuffle(box)
        box[:, [0, 2]] = box[:, [0, 2]] * nw / iw + dx
        box[:, [1, 3]] = box[:, [1, 3]] * nh / ih + dy
        if flip:
            # Mirroring swaps the roles of x1 and x2.
            box[:, [0, 2]] = w - box[:, [2, 0]]
        # Clip to the canvas.
        box[:, 0:2][box[:, 0:2] < 0] = 0
        box[:, 2][box[:, 2] > w] = w
        box[:, 3][box[:, 3] > h] = h
        box_w = box[:, 2] - box[:, 0]
        box_h = box[:, 3] - box[:, 1]
        return box[np.logical_and(box_w > 1, box_h > 1)]

    @staticmethod
    def _apply_hsv_gains(image_data, gains):
        """Scale the H, S and V channels of an RGB uint8 image by ``gains``.

        Hue wraps modulo 180 (OpenCV's 8-bit hue range); saturation and value
        are clipped to ``[0, 255]``.
        """
        hue_chan, sat_chan, val_chan = cv2.split(cv2.cvtColor(image_data, cv2.COLOR_RGB2HSV))
        dtype = image_data.dtype
        # Build one lookup table per channel instead of transforming per pixel.
        x = np.arange(0, 256, dtype=gains.dtype)
        lut_hue = ((x * gains[0]) % 180).astype(dtype)
        lut_sat = np.clip(x * gains[1], 0, 255).astype(dtype)
        lut_val = np.clip(x * gains[2], 0, 255).astype(dtype)
        merged = cv2.merge(
            (cv2.LUT(hue_chan, lut_hue), cv2.LUT(sat_chan, lut_sat), cv2.LUT(val_chan, lut_val))
        )
        return cv2.cvtColor(merged, cv2.COLOR_HSV2RGB)

    def get_random_data(self, annotation_line, input_shape, jitter=.3, hue=.1, sat=0.7, val=0.4, random=True):
        """Read one annotated image and resize/augment it to ``input_shape``.

        Args:
            annotation_line: ``image_path x1,y1,x2,y2,class_id ...``.
            input_shape: ``(height, width)`` of the output image.
            jitter: Maximum relative aspect-ratio distortion.
            hue: Maximum relative gain change of the hue channel.
            sat: Maximum relative gain change of the saturation channel.
            val: Maximum relative gain change of the value channel.
            random: Apply random augmentation when true; otherwise only
                letterbox the image onto a gray canvas.

        Returns:
            ``(image_data, box)``. ``image_data`` is an ``(h, w, 3)`` array
            (float32 without augmentation, uint8 with augmentation). ``box``
            holds the surviving boxes as ``[x1, y1, x2, y2, class_id]`` rows
            in output-image pixel coordinates.
        """
        line = annotation_line.split()
        # cvtColor is a project helper; it is expected to return an RGB PIL image.
        image = Image.open(line[0])
        image = cvtColor(image)
        iw, ih = image.size
        h, w = input_shape
        # Ground-truth boxes, one integer row per object.
        box = np.array([np.array(list(map(int, item.split(',')))) for item in line[1:]])

        if not random:
            # Keep the aspect ratio and center the image on a gray canvas.
            scale = min(w / iw, h / ih)
            nw = int(iw * scale)
            nh = int(ih * scale)
            dx = (w - nw) // 2
            dy = (h - nh) // 2

            image = image.resize((nw, nh), Image.BICUBIC)
            new_image = Image.new('RGB', (w, h), (128, 128, 128))
            new_image.paste(image, (dx, dy))
            image_data = np.array(new_image, np.float32)

            box = self._adjust_boxes(box, (iw, ih), (nw, nh), (dx, dy), (w, h))
            return image_data, box

        # Random scale with a jittered aspect ratio.
        new_ar = iw / ih * self.rand(1 - jitter, 1 + jitter) / self.rand(1 - jitter, 1 + jitter)
        scale = self.rand(.25, 2)
        if new_ar < 1:
            nh = int(scale * h)
            nw = int(nh * new_ar)
        else:
            nw = int(scale * w)
            nh = int(nw / new_ar)
        image = image.resize((nw, nh), Image.BICUBIC)

        # Paste at a random offset; uncovered areas stay gray.
        dx = int(self.rand(0, w - nw))
        dy = int(self.rand(0, h - nh))
        new_image = Image.new('RGB', (w, h), (128, 128, 128))
        new_image.paste(image, (dx, dy))
        image = new_image

        # Random horizontal flip.
        flip = self.rand() < .5
        if flip:
            image = image.transpose(Image.FLIP_LEFT_RIGHT)

        image_data = np.array(image, np.uint8)

        # Random per-channel gains in [1 - x, 1 + x] for hue, saturation, value.
        gains = np.random.uniform(-1, 1, 3) * [hue, sat, val] + 1
        image_data = self._apply_hsv_gains(image_data, gains)

        box = self._adjust_boxes(box, (iw, ih), (nw, nh), (dx, dy), (w, h), flip=flip)
        return image_data, box

def yolo_dataset_collate(batch):
    """Collate ``(image, boxes)`` samples into one batch.

    Images are stacked into a single float tensor. The box arrays are kept as
    a list of float tensors, one per sample, because each image can have a
    different number of boxes.
    """
    samples = list(batch)
    stacked = np.array([img for img, _ in samples])
    images = torch.from_numpy(stacked).type(torch.FloatTensor)
    bboxes = [torch.from_numpy(box).type(torch.FloatTensor) for _, box in samples]
    return images, bboxes