## 作业三：图像理解————简单图像语义分割
本次作业目的是让同学们体验基本的图像理解任务。
环境依赖：
- Python
- jittor
- tqdm
- pillow

需要完成的内容：
- 补全TODO标记的内容
- 试着自己调整网络结构以及各项网络参数，看看不同网络的效果。

需要提交的内容：
- 补全后的代码（.ipynb文件），只需要提交初始U-NET的版本。
- `merged_val_img/`目录下的实验结果。提交5张图片即可。
- 实验报告，包括调整网络结构对实验结果的影响等。

In [2]:
# 环境配置。
import jittor as jt
import jittor.nn as nn
from jittor.dataset import Dataset
from typing import List
import numpy as np
import os
import shutil
import random
from PIL import Image
from tqdm import tqdm

# Ask Jittor to run on the GPU through CUDA.
jt.flags.use_cuda = True


[38;5;2m[i 1122 15:16:25.934901 96 log.cc:351] Load log_sync: 1[m
[38;5;2m[i 1122 15:16:25.938353 96 compiler.py:956] Jittor(1.3.10.0) src: /home/ubuntu/.local/lib/python3.10/site-packages/jittor[m
[38;5;2m[i 1122 15:16:25.942656 96 compiler.py:957] g++ at /usr/bin/g++(11.4.0)[m
[38;5;2m[i 1122 15:16:25.943340 96 compiler.py:958] cache_path: /home/ubuntu/.cache/jittor/jt1.3.10/g++11.4.0/py3.10.18/Linux-5.15.0-1xdd/AMDEPYC740224-xf1/4253/default[m


[38;5;2m[i 1122 15:16:25.971524 96 install_cuda.py:96] cuda_driver_version: [12, 4][m
[38;5;2m[i 1122 15:16:25.978317 96 __init__.py:412] Found /home/ubuntu/.cache/jittor/jtcuda/cuda12.2_cudnn8_linux/bin/nvcc(12.2.140) at /home/ubuntu/.cache/jittor/jtcuda/cuda12.2_cudnn8_linux/bin/nvcc.[m
[38;5;2m[i 1122 15:16:25.983354 96 __init__.py:412] Found addr2line(2.38) at /usr/bin/addr2line.[m
[38;5;2m[i 1122 15:16:26.121561 96 compiler.py:1013] cuda key:cu12.2.140_sm_86[m
[38;5;2m[i 1122 15:16:26.605663 96 __init__.py:227] Total mem: 31.34GB, using 10 procs for compiling.[m
[38;5;2m[i 1122 15:16:26.710428 96 jit_compiler.cc:28] Load cc_path: /usr/bin/g++[m
[38;5;2m[i 1122 15:16:26.817277 96 init.cc:63] Found cuda archs: [86,][m
[38;5;2m[i 1122 15:16:27.062265 96 cuda_flags.cc:55] CUDA enabled.[m


### 模型搭建
本次实验中，我们选用U-NET作为分割模型。

网络结构如下：
![](imgs/UNET_architecture.png)

In [3]:
class Block(nn.Module):
    def __init__(self, in_channels, out_channels):
        """Build the double-convolution unit used at every U-Net stage.

        Args:
            in_channels (int): Number of channels of the input feature map.
            out_channels (int): Number of channels of the output feature map.
        """
        super().__init__()
        # Two Conv(3x3, stride 1, padding 1, no bias) -> BatchNorm -> ReLU
        # stages. Only the first stage changes the channel count; the second
        # one maps out_channels to out_channels.
        stages = []
        for stage_in in (in_channels, out_channels):
            stages.append(nn.Conv2d(stage_in, out_channels, 3, 1, 1, bias=False))
            stages.append(nn.BatchNorm2d(out_channels))
            stages.append(nn.ReLU())
        self.conv = nn.Sequential(*stages)

    def execute(self, x):
        """Run the block on a batch of feature maps.

        Args:
            x (jt.Var): Input of size [N, in_channels, H, W].

        Returns:
            jt.Var: Output of size [N, out_channels, H, W]; the 3x3
            convolutions use padding 1 and stride 1, so H and W are kept.
        """
        features = self.conv(x)
        return features

class UNET(nn.Module):
    def __init__(self, in_channels: int=3, out_channels: int=1, features: List=None):
        """Initialize the U-Net.

        Args:
            in_channels (int, optional): Number of image channels. Defaults to 3.
            out_channels (int, optional): Number of output channels. Defaults to 1.
            features (List, optional): Channel widths of the encoder stages,
                from shallowest to deepest. ``None`` (the default) selects
                ``[64, 128, 256, 512]``.

        Raises:
            ValueError: If ``features`` is empty.
        """
        super().__init__()
        # A list literal as a default argument is shared by every call, so the
        # default is resolved here instead of in the signature.
        if features is None:
            features = [64, 128, 256, 512]
        else:
            features = list(features)
        if not features:
            raise ValueError("features must contain at least one channel width")

        self.down = nn.ModuleList()
        self.up = nn.ModuleList()
        self.pool = nn.MaxPool2d(2, 2)

        # Encoder: one Block per entry of `features`.
        for feature in features:
            self.down.append(Block(in_channels, feature))
            in_channels = feature

        # Decoder: `self.up` interleaves (upsampling, Block) pairs, deepest
        # stage first, so even indices upsample and odd indices convolve.
        for feature in reversed(features):
            self.up.append(
                nn.ConvTranspose2d(feature*2, feature, 2, 2)
            )
            self.up.append(
                Block(feature*2, feature)  # input is skip + upsampled map, i.e. 2x channels
            )

        self.bottleneck = Block(features[-1], features[-1]*2)
        self.final_conv = nn.Conv2d(features[0], out_channels, 1)

    def execute(self, x):
        """Forward pass of the U-Net.

        Args:
            x (jt.Var): Input images of size [N, in_channels, H, W].

        Returns:
            jt.Var: Output of size [N, out_channels, H, W]. These are raw
            values; no sigmoid is applied here.
        """
        # Encoder: remember each stage's output before it is pooled.
        skip_connections = []
        for down in self.down:
            x = down(x)
            skip_connections.append(x)
            x = self.pool(x)

        x = self.bottleneck(x)

        # The decoder consumes the skips from deepest to shallowest.
        skip_connections = skip_connections[::-1]
        for idx in range(0, len(self.up), 2):
            x = self.up[idx](x)
            skip_connection = skip_connections[idx//2]
            # The upsampled map can differ in size from the skip (for example
            # when a pooled dimension was odd); resize it to the skip's H x W
            # so the two can be concatenated.
            if x.shape != skip_connection.shape:
                x = nn.resize(x, size=skip_connection.shape[2:], mode='bicubic')
            concat_skip = jt.concat((skip_connection, x), dim=1)  # concat along channels (b, c, h, w)
            x = self.up[idx+1](concat_skip)
        return self.final_conv(x)

# Build the model with the default U-Net configuration.
model = UNET()

# Sanity-check the input/output sizes: a [2, 3, 320, 280] batch is pushed
# through the network and the resulting shape is printed.
dummy_input = jt.random((2, 3, 320, 280))
dummy_output = model(dummy_input)
print(dummy_output.shape)


[2,1,320,280,]


### 数据集

我们使用[Carvana](https://www.kaggle.com/datasets/ipythonx/carvana-image-masking-png)作为实验数据集。
这个数据集是一个较为简单的车辆分割数据集，包含不同类型、不同朝向的车辆。

数据集下载网站：https://www.kaggle.com/datasets/ipythonx/carvana-image-masking-png

数据集下载后会有`train_images`和`train_masks`两个目录，其中`train_images`中为训练数据，`train_masks`为Ground Truth。

我们将数据集分为训练集和验证集两个部分。这两个部分的图像名称列表分别保存在`train.txt`和`val.txt`中，每行一个图像名称。


In [4]:
class SegmentationDataset(Dataset):
    def __init__(self, image_path, mask_path, label_path=None, use_aug=False,
                 batch_size=16, num_workers=2, shuffle=False):
        """Initialize the dataset.

        Args:
            image_path (str): Directory that holds the images.
            mask_path (str): Directory that holds the segmentation masks.
            label_path (str): Path of the image-set file; every line of the
                file is the name of one image. Required despite the ``None``
                default.
            use_aug (bool, optional): Whether to apply data augmentation.
            batch_size (int, optional): Batch size N.
            num_workers (int, optional): Number of parallel worker processes.
            shuffle (bool, optional): Whether to iterate in random order.

        Raises:
            ValueError: If ``label_path`` is ``None``.
        """
        # Validate with an explicit exception: an `assert` would be stripped
        # when Python runs with -O.
        if label_path is None:
            raise ValueError("label_path must be provided")
        super().__init__(batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)

        # TODO(1) (completed): build the path of every image and mask.
        # With image_path '/data/image' and mask_path '/data/mask' this yields
        # ['/data/image/name1.jpg', ...] and ['/data/mask/name1.png', ...],
        # where name1, name2, ... are the names read from label_path.
        with open(label_path, "r") as f:
            names = [line.strip() for line in f]
        names = [name for name in names if name]  # skip blank lines

        self.images = [os.path.join(image_path, name + ".jpg") for name in names]
        self.masks = [os.path.join(mask_path, name + ".png") for name in names]
        self.use_aug = use_aug
        self.total_len = len(self.images)

    def apply_aug(self, image, mask):
        """Randomly augment an image together with its mask.

        The same flips are applied to both inputs so they stay aligned.

        Args:
            image (Image.Image): The image.
            mask (Image.Image): The segmentation mask.

        Returns:
            Tuple[Image.Image, Image.Image]: The augmented image and mask.
        """
        if random.random() < 0.7:
            # TODO(2) (completed): flip image and mask horizontally.
            image = image.transpose(Image.FLIP_LEFT_RIGHT)
            mask = mask.transpose(Image.FLIP_LEFT_RIGHT)

        if random.random() < 0.5:
            # TODO(3) (completed): flip image and mask vertically.
            image = image.transpose(Image.FLIP_TOP_BOTTOM)
            mask = mask.transpose(Image.FLIP_TOP_BOTTOM)
        return image, mask

    def __getitem__(self, index):
        """Load the sample with the given index.

        Args:
            index (int): Sample index.

        Returns:
            Tuple[jt.Var, jt.Var]: The float32 image and mask, of size
            [3, 360, 480] and [1, 360, 480] respectively.
        """
        image = np.array(Image.open(self.images[index]).convert('RGB'))
        mask = np.array(Image.open(self.masks[index]))
        # Map foreground pixels stored as 255 to label 1.
        mask[mask == 255.0] = 1.0

        # Preprocess: PIL sizes are (width, height), so this yields 360 x 480
        # arrays. The mask uses nearest-neighbour so no new values are
        # interpolated into it.
        image = Image.fromarray(image).resize((480, 360), resample=Image.BILINEAR)
        mask = Image.fromarray(mask).resize((480, 360), resample=Image.NEAREST)
        if self.use_aug:
            image, mask = self.apply_aug(image, mask)

        # PIL images are laid out as [H, W, C], while the network expects
        # [C, H, W].
        image = jt.array(np.transpose(image, (2, 0, 1)), dtype=jt.float32)
        mask = jt.unsqueeze(jt.array(mask, dtype=jt.float32), 0)
        return image, mask

# Dataset locations: image directory, mask directory, and the two files that
# list the image names of the training and validation splits.
image_path = 'train_images'
mask_path = 'train_masks'
train_label = './train.txt'
val_label = './val.txt'

# The training set uses data augmentation and is shuffled.
train_dataset = SegmentationDataset(
    image_path, mask_path, train_label, use_aug=True,
    batch_size=9, num_workers=2, shuffle=True,
)
# The validation set is read one image at a time, in order, without augmentation.
val_dataset = SegmentationDataset(
    image_path, mask_path, val_label, use_aug=False,
    batch_size=1, num_workers=2, shuffle=False,
)

# Check that the datasets were built correctly: print their lengths and the
# shapes of the first batch of each.
print(len(train_dataset))
print(len(val_dataset))
for image, mask in train_dataset:
    print(image.shape, mask.shape)
    break
for image, mask in val_dataset:
    print(image.shape, mask.shape)
    break


509
508
[9,3,360,480,] [9,1,360,480,]
[1,3,360,480,] [1,1,360,480,]


### 模型的训练与评估
本次实验中我们使用AdamW优化器训练模型。默认训练轮数为8轮，可能需要训练30分钟以上。同学们调试代码时可以减少训练轮数以加快训练。

我们采用两个标准来评估模型，acc和dice score。这两个指标均为逐像素指标。

假设$P_i$为图像$i$预测掩码，$G_i$为图像$i$的实际掩码，$N$为图像总数，$Count(x)$表示计算掩码$x$中1的数量，$Size(x)$表示计算掩码$x$中像素总数， 则acc的计算标准为：
$$
    acc=(\sum_{i} \frac{Count(P_i == G_i)}{Size(G_i)}) / N
$$
dice score的计算标准为：
$$
    score=(\sum_{i} \frac{2 \times Count(P_i * G_i)}{Count(P_i) + Count(G_i)}) / N
$$


In [None]:
def training(model:UNET, data_loader:SegmentationDataset, optimizer:jt.optim.AdamW, epoch_id:int):
    """Train the model for one epoch and save a checkpoint.

    The model's state dict is written to ``epoch_<epoch_id>.jt`` in the
    current working directory once the epoch is finished.

    Args:
        model (UNET): Model to train.
        data_loader (SegmentationDataset): Training data.
        optimizer (jt.optim.AdamW): Optimizer.
        epoch_id (int): Index of the current epoch.
    """
    model.train()
    # The loss takes raw model outputs, so no sigmoid is applied here.
    criterion = nn.binary_cross_entropy_with_logits
    for images, targets in tqdm(data_loader):
        logits = model(images)
        # TODO(4) (completed): compute the loss and take one optimizer step.
        batch_loss = criterion(logits, targets)
        optimizer.step(batch_loss)

    # Save the model.
    checkpoint_path = 'epoch_{}.jt'.format(epoch_id)
    print(f"Training Epoch {epoch_id}")
    print("saving model...")
    jt.save(model.state_dict(), checkpoint_path)
    print("model saved.\n")

def validation(model:UNET, data_loader:SegmentationDataset, epoch_id:int):
    """Evaluate the model on the validation data for one epoch.

    For every image the pixel accuracy and the dice score are computed from
    the thresholded prediction; the averages over all images are printed.

    Args:
        model (UNET): Model to evaluate.
        data_loader (SegmentationDataset): Validation data.
        epoch_id (int): Index of the current epoch.

    Returns:
        float: Mean dice score of the model, or 0.0 when the data loader
        yields no images.
    """
    sum_acc, sum_dice_score, num_imgs = 0.0, 0.0, 0

    model.eval()
    for x, y in tqdm(data_loader):
        # Evaluation needs no gradients; this matches save_images().
        with jt.no_grad():
            pred = jt.sigmoid(model(x))
            pred = (pred > 0.5).float()

        # TODO(5) (completed): accumulate the per-image metrics.
        # sum_acc / sum_dice_score hold the sums over all images seen so far
        # and num_imgs the number of images.
        for i in range(pred.shape[0]):
            p = pred[i]
            gt = y[i]
            acc = (p == gt).float().mean().item()
            inter = (p * gt).sum().item()
            denom = p.sum().item() + gt.sum().item()
            # When both masks are empty the prediction is a perfect match, so
            # it scores 1 instead of the 0 an epsilon-padded division gives.
            dice = (2.0 * inter) / denom if denom > 0 else 1.0
            sum_acc += acc
            sum_dice_score += dice
            num_imgs += 1

    if num_imgs == 0:
        print("No validation images found!")
        return 0.0

    print(f"Validation Epoch {epoch_id}: val_acc={sum_acc/num_imgs * 100}, val_dice_score={sum_dice_score / num_imgs}")
    return sum_dice_score / num_imgs

# Total number of training epochs; adjust it to the available compute.
num_epochs = 3
best_score, best_epoch = 0, 0
optimizer = jt.optim.AdamW(params = model.parameters(), lr = 1.5e-3, weight_decay=0.3)
for epoch_id in range(num_epochs):
    # training() writes the checkpoint epoch_<epoch_id>.jt for this epoch.
    training(model, train_dataset, optimizer, epoch_id=epoch_id)
    score = validation(model, val_dataset, epoch_id=epoch_id)
    # Remember the epoch with the highest dice score.
    if score > best_score:
        best_score = score
        best_epoch = epoch_id
# Keep a copy of the best epoch's checkpoint under a fixed name.
shutil.copyfile('epoch_{}.jt'.format(best_epoch), 'epoch_best.jt')


  0%|          | 0/509 [00:00<?, ?it/s]
Compiling Operators(2/2) used: 4.31s eta:    0s 
  0%|          | 1/509 [00:04<36:51,  4.35s/it]
Compiling Operators(1/34) used: 3.32s eta:  110s 6/34) used: 5.33s eta: 24.9s 9/34) used: 6.33s eta: 17.6s 10/34) used: 7.33s eta: 17.6s 11/34) used: 8.34s eta: 17.4s 13/34) used: 9.34s eta: 15.1s 18/34) used: 10.3s eta:  9.2s 19/34) used: 11.4s eta: 8.96s 24/34) used: 12.4s eta: 5.15s 26/34) used: 13.4s eta: 4.11s 29/34) used: 14.4s eta: 2.48s 30/34) used: 15.4s eta: 2.05s 32/34) used: 16.4s eta: 1.02s 33/34) used: 16.4s eta: 0.497s 34/34) used: 40.4s eta:    0s 
  0%|          | 2/509 [00:50<4:04:18, 28.91s/it]
Compiling Operators(3/3) used: 4.31s eta:    0s 
100%|██████████| 509/509 [04:14<00:00,  2.00it/s]


Training Epoch 0
saving model...
model saved.



100%|██████████| 508/508 [00:42<00:00, 11.84it/s]


Validation Epoch 0: val_acc=95.78699210262674, val_dice_score=0.8799901075951495


100%|██████████| 509/509 [03:15<00:00,  2.60it/s]


Training Epoch 1
saving model...
model saved.



100%|██████████| 508/508 [00:09<00:00, 53.28it/s]


Validation Epoch 1: val_acc=62.76139148224996, val_dice_score=0.5343483091466725


100%|██████████| 509/509 [03:16<00:00,  2.59it/s]


Training Epoch 2
saving model...
model saved.



100%|██████████| 508/508 [00:09<00:00, 53.50it/s]

Validation Epoch 2: val_acc=99.19457406274915, val_dice_score=0.982086423369237





'epoch_best.jt'

### 图像分割的可视化
我们在验证集上选取部分图像生成可视化结果，用以直观查看模型的能力。

In [None]:
def save_image_with_mask(image:jt.Var, mask:jt.Var, fp:str):
    """Blend a segmentation mask over an image and save the result.

    Args:
        image (jt.Var): Image of size (3, H, W).
        mask (jt.Var): Segmentation mask of size (1, H, W).
        fp (str): Path the blended image is saved to.
    """
    # Bring both inputs to (H, W, C) with values limited to [0, 255].
    hwc_image = jt.permute(jt.clamp(image, 0, 255), (1, 2, 0))
    scaled_mask = jt.clamp(mask * 255 + 0.5, 0, 255)

    # The mask fills only the first channel of the overlay; the other
    # channels stay zero.
    overlay = jt.zeros_like(hwc_image)
    overlay[:, :, :1] = jt.permute(scaled_mask, (1, 2, 0))

    # Weighted blend: 40% image, 60% overlay.
    blended = hwc_image * 0.4 + overlay * 0.6
    pixels = np.asarray(blended.numpy(), dtype=np.uint8)
    Image.fromarray(pixels).save(fp)

def save_images(model:UNET, loader:SegmentationDataset, folder:str='val_img'):
    """Visualize the segmentation of the first few batches of a dataset.

    For each of the first five batches, the first image is saved twice into
    ``folder``: blended with the predicted mask (``pred_<idx>.png``) and
    blended with the ground-truth mask (``mask_<idx>.png``).

    Args:
        model (UNET): The model.
        loader (SegmentationDataset): The dataset.
        folder (str, optional): Output directory; created when missing.
    """
    model.eval()
    if not os.path.isdir(folder):
        os.mkdir(folder)

    for idx, (images, targets) in enumerate(loader):
        with jt.no_grad():
            # Threshold the sigmoid output at 0.5 to get a binary mask.
            binary_preds = (jt.sigmoid(model(images)) > 0.5).float()

        pred_fp = os.path.join(folder, f"pred_{idx}.png")
        mask_fp = os.path.join(folder, f"mask_{idx}.png")
        save_image_with_mask(images[0], binary_preds[0], pred_fp)
        save_image_with_mask(images[0], targets[0], mask_fp)

        # Stop after five batches (idx 0..4).
        if idx >= 4:
            break

def get_concat_v(im1:Image.Image, im2:Image.Image):
    """Stack two images vertically into a single image.

    The result is as wide as the wider input and as tall as both inputs
    together. When the inputs have different modes, both are converted to
    RGB first.

    Args:
        im1 (Image.Image): Image placed at the top.
        im2 (Image.Image): Image placed at the bottom.

    Returns:
        Image.Image: The merged image.
    """
    # TODO(6) (completed): merge the two images into one.
    if im1.mode != im2.mode:
        im1, im2 = im1.convert('RGB'), im2.convert('RGB')

    top_w, top_h = im1.size
    bottom_w, bottom_h = im2.size

    canvas = Image.new(im1.mode, (max(top_w, bottom_w), top_h + bottom_h))
    canvas.paste(im1, (0, 0))
    canvas.paste(im2, (0, top_h))  # directly below the first image
    return canvas

def merge_photos(src_folder: str='./val_img', dst_folder: str='./merged_val_img', remove_single: bool=True):
    """Merge each prediction with its ground truth for side-by-side comparison.

    ``src_folder`` is expected to hold pairs ``pred_<i>.png`` /
    ``mask_<i>.png`` numbered from 0. Every pair is stacked vertically
    (prediction on top) and saved as ``merged_pred_mask_<i>.png`` in
    ``dst_folder``.

    Args:
        src_folder (str, optional): Directory with the source images.
        dst_folder (str, optional): Directory for the merged images; created
            (including parent directories) when missing.
        remove_single (bool, optional): Whether to delete the source images
            after they have been merged.
    """
    # Every pair contributes two PNG files, so half the PNG count is the
    # number of pairs.
    png_count = sum(1 for fname in os.listdir(src_folder) if fname.endswith('.png'))
    os.makedirs(dst_folder, exist_ok=True)

    for i in range(png_count // 2):
        pred_fp = os.path.join(src_folder, f'pred_{i}.png')
        mask_fp = os.path.join(src_folder, f'mask_{i}.png')
        merged_fp = os.path.join(dst_folder, f'merged_pred_mask_{i}.png')
        # Close both files before they may be deleted below.
        with Image.open(pred_fp) as pred_img, Image.open(mask_fp) as mask_img:
            get_concat_v(pred_img, mask_img).save(merged_fp)
        if remove_single:
            # Delete from src_folder itself, not from a hard-coded directory.
            os.remove(pred_fp)
            os.remove(mask_fp)

# Restore the best checkpoint, save blended prediction / ground-truth images
# for part of the validation set, then merge each pair into one picture.
model.load_state_dict(jt.load('epoch_best.jt'))
save_images(model, val_dataset)
merge_photos()
