# 语义分割和数据集

最重要的语义分割数据集之一是Pascal VOC2012

In [1]:
%matplotlib inline
import os
import torch
import torchvision
from d2l import torch as d2l

# Register the raw dataset archive with d2l: download URL plus the expected
# file hash used to validate the download.
d2l.DATA_HUB['voc2012'] = (d2l.DATA_URL + 'VOCtrainval_11-May-2012.tar',
                           '4e443f8a2eca6b1dac8a6c57641b67dd40621a49')
# Download and extract the archive. The sub-folder is built with os.path.join
# so it matches the way load_data_voc locates the same directory.
voc_dir = d2l.download_extract('voc2012',
                               os.path.join('VOCdevkit', 'VOC2012'))

Downloading ..\data\VOCtrainval_11-May-2012.tar from http://d2l-data.s3-accelerate.amazonaws.com/VOCtrainval_11-May-2012.tar...


ConnectionError: ('Connection aborted.', ConnectionResetError(10054, '远程主机强迫关闭了一个现有的连接。', None, 10054, None))

## 将所有输入的图像和标签读入内存

In [2]:
# Data layout: Pascal VOC
def read_voc_images(voc_dir, is_train=True):
    """Read all VOC feature images and their segmentation label images.

    Args:
        voc_dir: Root of the extracted VOC2012 directory. It must contain
            ``ImageSets/Segmentation``, ``JPEGImages`` and
            ``SegmentationClass``.
        is_train: When true, the image names are taken from ``train.txt``;
            otherwise from ``val.txt``.

    Returns:
        A ``(features, labels)`` pair of equally long lists. ``features[k]``
        is the JPEG image and ``labels[k]`` the PNG label image (read in RGB
        mode) for the k-th name in the split file.
    """
    txt_fname = os.path.join(voc_dir, 'ImageSets', 'Segmentation',
                             'train.txt' if is_train else 'val.txt')
    # Label images are decoded as color (RGB) images.
    mode = torchvision.io.image.ImageReadMode.RGB
    with open(txt_fname, 'r') as f:
        images = f.read().split()
    features, labels = [], []
    for fname in images:
        features.append(
            torchvision.io.read_image(
                os.path.join(voc_dir, 'JPEGImages', f'{fname}.jpg')))
        # Per-pixel labels are stored as an image of the same size as the
        # input. PNG is used rather than JPEG because JPEG compression would
        # blur the edges; each pixel's value encodes that pixel's class.
        labels.append(
            torchvision.io.read_image(
                os.path.join(voc_dir, 'SegmentationClass', f'{fname}.png'),
                mode))
    return features, labels

# Load the whole training split (images and label images) into memory.
train_features, train_labels = read_voc_images(voc_dir, is_train=True)

NameError: name 'voc_dir' is not defined

## 绘制前五个输入图像及其标签

In [3]:
n = 5
# The tensors are permuted so that dimension 0 (the channel) comes last,
# which is the layout used for plotting.
imgs = [img.permute(1, 2, 0)
        for img in train_features[0:n] + train_labels[0:n]]
# First n entries are input images, the next n are their label images;
# in the label images each class is drawn in its own color.
d2l.show_images(imgs, 2, n);

NameError: name 'train_features' is not defined

![image.png](attachment:image.png)

## 列举RGB颜色值和类名

In [4]:
# RGB color used for each class in the label images (the dataset's readme
# documents this mapping). Entry k here belongs to VOC_CLASSES[k].
VOC_COLORMAP = [
    [0, 0, 0],        # 0
    [128, 0, 0],      # 1
    [0, 128, 0],      # 2
    [128, 128, 0],    # 3
    [0, 0, 128],      # 4
    [128, 0, 128],    # 5
    [0, 128, 128],    # 6
    [128, 128, 128],  # 7
    [64, 0, 0],       # 8
    [192, 0, 0],      # 9
    [64, 128, 0],     # 10
    [192, 128, 0],    # 11
    [64, 0, 128],     # 12
    [192, 0, 128],    # 13
    [64, 128, 128],   # 14
    [192, 128, 128],  # 15
    [0, 64, 0],       # 16
    [128, 64, 0],     # 17
    [0, 192, 0],      # 18
    [128, 192, 0],    # 19
    [0, 64, 128],     # 20
]

# Class names, index-aligned with VOC_COLORMAP.
VOC_CLASSES = [
    'background',
    'aeroplane',
    'bicycle',
    'bird',
    'boat',
    'bottle',
    'bus',
    'car',
    'cat',
    'chair',
    'cow',
    'diningtable',
    'dog',
    'horse',
    'motorbike',
    'person',
    'potted plant',
    'sheep',
    'sofa',
    'train',
    'tv/monitor',
]

## RGB值和标签之间的换算

In [5]:
def voc_colormap2label():
    """Build the lookup table from packed RGB values to VOC class indices.

    Returns:
        A 1-D ``torch.long`` tensor with ``256**3`` entries. The entry at
        position ``(R * 256 + G) * 256 + B`` holds the index of the class
        whose color in ``VOC_COLORMAP`` is ``[R, G, B]``; all other entries
        are 0.
    """
    lookup = torch.zeros(256 ** 3, dtype=torch.long)
    for class_idx, (red, green, blue) in enumerate(VOC_COLORMAP):
        # Pack the RGB triple into a single integer and store the class
        # index at that position.
        lookup[(red * 256 + green) * 256 + blue] = class_idx
    return lookup

# colormap is the label image to convert
def voc_label_indices(colormap, colormap2label):
    """Map the RGB values of a VOC label image to their class indices.

    Args:
        colormap: Label image tensor whose first dimension holds the R, G
            and B channels (presumably ``(3, H, W)`` uint8 as produced by
            ``read_voc_images`` -- confirm against the caller).
        colormap2label: Lookup tensor indexed by the packed value
            ``(R * 256 + G) * 256 + B``, e.g. the result of
            ``voc_colormap2label()``.

    Returns:
        A tensor with one entry per pixel, holding ``colormap2label`` looked
        up at each pixel's packed RGB value.
    """
    # Widen to int64 before packing so the packed value (up to 256**3 - 1)
    # fits. Staying in torch avoids the NumPy round trip, which only works
    # for CPU tensors.
    channels = colormap.long()
    # Pack RGB into a single integer per pixel.
    idx = (channels[0] * 256 + channels[1]) * 256 + channels[2]
    # Look up the class index of every pixel.
    return colormap2label[idx]

In [6]:
# Convert the first training label image from RGB colors to class indices.
y = voc_label_indices(train_labels[0], voc_colormap2label())
# Show a small region of per-pixel class indices next to the class name at index 1.
y[105:115, 130:140], VOC_CLASSES[1]

NameError: name 'train_labels' is not defined

![image.png](attachment:image.png)

## 使用图像增广中的随机裁剪，裁剪输入图像和标签的相同区域

In [7]:
# When the input image is cropped, the label must be cropped identically or
# pixels and labels no longer line up. The random crop window is therefore
# drawn once and applied to both.
# VOC images differ in size. Cropping is used instead of resizing because
# resizing interpolates pixel values, which is fine for the image but not
# valid for label values.
def voc_rand_crop(feature, label, height, width):
    """Randomly crop the same region from a feature image and its label.

    Args:
        feature: Input image to crop.
        label: Label image to crop with the same window.
        height: Height of the crop.
        width: Width of the crop.

    Returns:
        A ``(feature, label)`` pair, both cropped to the same window.
    """
    crop = torchvision.transforms.functional.crop
    # Draw the window from the feature image, then reuse it for the label.
    top, left, crop_h, crop_w = torchvision.transforms.RandomCrop.get_params(
        feature, (height, width))
    cropped_feature = crop(feature, top, left, crop_h, crop_w)
    cropped_label = crop(label, top, left, crop_h, crop_w)
    return cropped_feature, cropped_label

# Take n random 200x300 crops of the first training image and its label.
crops = [voc_rand_crop(train_features[0], train_labels[0], 200, 300)
         for _ in range(n)]
# Flatten to [feature, label, feature, label, ...] with the channel moved last.
imgs = [img.permute(1, 2, 0) for pair in crops for img in pair]
# Even positions are features, odd positions are labels.
d2l.show_images(imgs[::2] + imgs[1::2], 2, n);

NameError: name 'train_features' is not defined

![image.png](attachment:image.png)

## 自定义语义分割数据集类

In [11]:
class VOCSegDataset(torch.utils.data.Dataset):
    """Custom dataset for loading the VOC semantic segmentation data.

    All images of the chosen split are read into memory at construction.
    Images smaller than ``crop_size`` are dropped, the remaining feature
    images are normalized, and every access returns a fresh random crop
    together with its per-pixel class indices.

    Args:
        is_train: Forwarded to ``read_voc_images`` to select the split.
        crop_size: ``(height, width)`` of the crops returned by indexing.
        voc_dir: Root directory of the extracted VOC2012 data.
    """

    def __init__(self, is_train, crop_size, voc_dir):
        self.transform = torchvision.transforms.Normalize(
            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        self.crop_size = crop_size
        features, labels = read_voc_images(voc_dir, is_train=is_train)
        self.features = [
            self.normalize_image(feature)
            for feature in self.filter(features)]
        # The same size filter is applied to the labels so that features and
        # labels stay aligned index by index.
        self.labels = self.filter(labels)
        # Building the lookup table has a cost, so it is done once here.
        self.colormap2label = voc_colormap2label()
        print(f'read {len(self.features)} examples')

    def normalize_image(self, img):
        """Scale ``img`` from 0-255 to 0-1 and apply ``self.transform``.

        The mean/std passed to ``Normalize`` in ``__init__`` are on a 0-1
        scale, so the pixel values must be divided by 255 first.
        """
        return self.transform(img.float() / 255)

    def filter(self, imgs):
        """Return the images at least as large as the crop size.

        An image is kept when ``shape[1] >= crop_size[0]`` and
        ``shape[2] >= crop_size[1]``; smaller images cannot be cropped and
        are discarded.
        """
        min_height, min_width = self.crop_size[0], self.crop_size[1]
        return [
            img for img in imgs
            if img.shape[1] >= min_height and img.shape[2] >= min_width]

    def __getitem__(self, idx):
        """Return a random crop of example ``idx`` and its class-index map."""
        feature, label = voc_rand_crop(self.features[idx], self.labels[idx],
                                       *self.crop_size)
        # The RGB colors of the label crop are converted to class indices.
        return (feature, voc_label_indices(label, self.colormap2label))

    def __len__(self):
        """Return the number of examples kept after size filtering."""
        return len(self.features)

## 读取数据集

In [9]:
# Crop size as (height, width), shared by the training and test datasets.
crop_size = (320, 480)
voc_train = VOCSegDataset(is_train=True, crop_size=crop_size, voc_dir=voc_dir)
voc_test = VOCSegDataset(is_train=False, crop_size=crop_size, voc_dir=voc_dir)

NameError: name 'voc_dir' is not defined

In [12]:
batch_size = 64
train_iter = torch.utils.data.DataLoader(
    dataset=voc_train, batch_size=batch_size, shuffle=True, drop_last=True,
    num_workers=d2l.get_dataloader_workers())
# Print the shapes of the first minibatch only, then stop iterating.
for X, Y in train_iter:
    print(X.shape, Y.shape, sep='\n')
    break

NameError: name 'voc_train' is not defined

## 整合所有组件

In [None]:
def load_data_voc(batch_size, crop_size):
    """Download VOC2012 if needed and return its training and test loaders.

    Args:
        batch_size: Number of examples per minibatch for both loaders.
        crop_size: ``(height, width)`` passed to ``VOCSegDataset``.

    Returns:
        A ``(train_iter, test_iter)`` pair of ``DataLoader`` objects. Both
        drop the last incomplete batch; only the training loader shuffles.
    """
    data_root = d2l.download_extract(
        'voc2012', os.path.join('VOCdevkit', 'VOC2012'))
    worker_count = d2l.get_dataloader_workers()
    train_set = VOCSegDataset(True, crop_size, data_root)
    train_loader = torch.utils.data.DataLoader(
        train_set, batch_size, shuffle=True, drop_last=True,
        num_workers=worker_count)
    test_set = VOCSegDataset(False, crop_size, data_root)
    test_loader = torch.utils.data.DataLoader(
        test_set, batch_size, drop_last=True, num_workers=worker_count)
    return train_loader, test_loader