# Лабораторная работа №7 (Проведение исследований моделями семантической сегментации)

## 1.	Выбор начальных условий
## Был выбран подходящий для задачи [датасет](https://www.kaggle.com/datasets/ashish2001/semantic-segmentation-of-underwater-imagery-suim). Модель обученная на нём, может исспользоваться в подводных исследованиях: загрязнение дна водоёмов, отслеживание популяции рыб - также есть потенциал в проведение спасательных операциях под водой, поиск потерянных вещей итп. 

## метрики, которые будем использовать: IoU


In [1]:
!pip install segmentation-models-pytorch albumentations timm


Collecting segmentation-models-pytorch
  Downloading segmentation_models_pytorch-0.5.0-py3-none-any.whl.metadata (17 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.8->segmentation-models-pytorch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.8->segmentation-models-pytorch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.8->segmentation-models-pytorch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.8->segmentation-models-pytorch)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.8->segmentation-models-pytorch)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014

# 2.	Создание бейзлайна и оценка качества


In [2]:
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
import torch
import segmentation_models_pytorch as smp
import albumentations as A
from albumentations.pytorch import ToTensorV2
from sklearn.model_selection import train_test_split


  check_for_updates()


In [3]:
DATA_DIR = "/kaggle/input/semantic-segmentation-of-underwater-imagery-suim/train_val"

IMG_DIR = os.path.join(DATA_DIR, "images")
MASK_DIR = os.path.join(DATA_DIR, "masks")

image_paths = sorted([os.path.join(IMG_DIR, img) for img in os.listdir(IMG_DIR)])
mask_paths = sorted([os.path.join(MASK_DIR, mask) for mask in os.listdir(MASK_DIR)])

train_imgs, val_imgs, train_masks, val_masks = train_test_split(image_paths, mask_paths, test_size=0.2, random_state=42)


In [4]:
SUIM_COLORS = {
    (0, 0, 0): 0,        
    (0, 0, 255): 1,     
    (0, 255, 0): 2,      
    (255, 0, 0): 3,     
    (255, 255, 0): 4,  
    (255, 0, 255): 5,   
    (0, 255, 255): 6,    
    (255, 255, 255): 7  
}


In [5]:
def convert_mask(mask):
    h, w, _ = mask.shape
    new_mask = np.zeros((h, w), dtype=np.uint8)

    for rgb, class_id in SUIM_COLORS.items():
        match = np.all(mask == rgb, axis=-1)
        new_mask[match] = class_id

    return new_mask


In [6]:
class SUIMDataset(Dataset):
    def __init__(self, image_paths, mask_paths, transform=None):
        self.image_paths = image_paths
        self.mask_paths = mask_paths
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image = cv2.imread(self.image_paths[idx])
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    
        mask = cv2.imread(self.mask_paths[idx])
        mask = cv2.cvtColor(mask, cv2.COLOR_BGR2RGB)
        mask = convert_mask(mask) 
    
        
        image = cv2.resize(image, (256, 256), interpolation=cv2.INTER_LINEAR)
        mask = cv2.resize(mask, (256, 256), interpolation=cv2.INTER_NEAREST)
    
        if self.transform:
            augmented = self.transform(image=image, mask=mask)
            image = augmented["image"]
            mask = augmented["mask"].long()
    
        return image, mask




In [7]:
transform = A.Compose([
    A.Resize(256, 256),
    A.HorizontalFlip(p=0.5),
    A.RandomBrightnessContrast(p=0.2),
    A.Normalize(),
    ToTensorV2()
])

train_dataset = SUIMDataset(train_imgs, train_masks, transform=transform)
val_dataset = SUIMDataset(val_imgs, val_masks, transform=transform)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)


## В качестве CNN модели будем использовать связку UNET + Resnet предобученую на imagenet.

In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_resnet34 = smp.Unet(
    encoder_name="resnet34",
    encoder_weights="imagenet",
    in_channels=3,
    classes=8, 
).to(device)

loss_fn = smp.losses.DiceLoss(mode='multiclass')
optimizer = torch.optim.Adam(model_resnet34.parameters(), lr=1e-4)


In [19]:
for epoch in range(10):
    model.train()
    total_loss = 0

    for imgs, masks in train_loader:
        imgs, masks = imgs.to(device), masks.to(device)
        preds = model_resnet34(imgs)
        loss = loss_fn(preds, masks)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")


Epoch 1, Loss: 0.6206
Epoch 2, Loss: 0.4639
Epoch 3, Loss: 0.3966
Epoch 4, Loss: 0.3676
Epoch 5, Loss: 0.3504
Epoch 6, Loss: 0.3282
Epoch 7, Loss: 0.3119
Epoch 8, Loss: 0.3141
Epoch 9, Loss: 0.2907
Epoch 10, Loss: 0.2762


In [21]:
def multiclass_iou_score(preds, targets, num_classes=8, eps=1e-6):
    ious = []
    preds = preds.argmax(dim=1)

    preds = preds.cpu().numpy() 
    targets = targets.cpu().numpy()

    for cls in range(num_classes):
        pred_inds = (preds == cls)
        target_inds = (targets == cls)

        intersection = (pred_inds & target_inds).sum()
        union = (pred_inds | target_inds).sum()

        if union == 0:
            continue

        ious.append((intersection + eps) / (union + eps))

    return np.nanmean(ious)


In [22]:
model_resnet34.eval()
ious = []

with torch.no_grad():
    for imgs, masks in val_loader:
        imgs, masks = imgs.to(device), masks.to(device)
        preds = model_resnet34(imgs)
        iou = multiclass_iou_score(preds, masks)
        ious.append(iou)

print("Mean IoU:", np.mean(ious))


Mean IoU: 0.4414588024476484


## В качестве трансформенной модели у нас будет UNET + MiT предобученую на imagenet.

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_mit0 = smp.Unet(
    encoder_name="mit_b0",
    encoder_weights="imagenet",
    in_channels=3,
    classes=8,
).to(device)



loss_fn = smp.losses.DiceLoss(mode='multiclass')
optimizer = torch.optim.Adam(model_mit0.parameters(), lr=1e-4)


In [16]:
for epoch in range(10):
    model_mit0.train()
    total_loss = 0

    for imgs, masks in train_loader:
        imgs, masks = imgs.to(device), masks.to(device)
        preds = model_mit0(imgs)
        loss = loss_fn(preds, masks)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")


Epoch 1, Loss: 0.6338
Epoch 2, Loss: 0.4705
Epoch 3, Loss: 0.3929
Epoch 4, Loss: 0.3544
Epoch 5, Loss: 0.3248
Epoch 6, Loss: 0.3087
Epoch 7, Loss: 0.3073
Epoch 8, Loss: 0.2782
Epoch 9, Loss: 0.2497
Epoch 10, Loss: 0.2423


In [17]:
def multiclass_iou_score(preds, targets, num_classes=8, eps=1e-6):
    ious = []
    preds = preds.argmax(dim=1) 

    preds = preds.cpu().numpy()
    targets = targets.cpu().numpy()

    for cls in range(num_classes):
        pred_inds = (preds == cls)
        target_inds = (targets == cls)

        intersection = (pred_inds & target_inds).sum()
        union = (pred_inds | target_inds).sum()

        if union == 0:
            continue

        ious.append((intersection + eps) / (union + eps))

    return np.nanmean(ious)


In [19]:
model_mit0.eval()
ious = []

with torch.no_grad():
    for imgs, masks in val_loader:
        imgs, masks = imgs.to(device), masks.to(device)
        preds = model_mit0(imgs)
        iou = multiclass_iou_score(preds, masks)
        ious.append(iou)

print("Mean IoU:", np.mean(ious))


Mean IoU: 0.45577886832064907


## По лоссу и метрики качества нетрудно заметить, что трансформенная модель показывает лучшие результаты по сранению с CNN. Учитывая разницу между resnet34 и mit_b0, а именно, в размере, который по скромным оценкам меньше в 2 раза, то можно сделать вывод, что MIT вне конкуренции в этой задаче при наших вводных.

# 3.	Улучшение бейзлайна
## Гипотеза: увеличим количество эпох и возьмём старшие модели.

## RESNET50

In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_resnet50 = smp.Unet(
    encoder_name="resnet50",
    encoder_weights="imagenet",
    in_channels=3,
    classes=8,
).to(device)

loss_fn = smp.losses.DiceLoss(mode='multiclass')
optimizer = torch.optim.Adam(model_resnet50.parameters(), lr=1e-4)


config.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

In [24]:
for epoch in range(15):
    model_resnet50.train()
    total_loss = 0

    for imgs, masks in train_loader:
        imgs, masks = imgs.to(device), masks.to(device)
        preds = model_resnet50(imgs)
        loss = loss_fn(preds, masks)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")


Epoch 1, Loss: 0.3982
Epoch 2, Loss: 0.3517
Epoch 3, Loss: 0.3341
Epoch 4, Loss: 0.3239
Epoch 5, Loss: 0.2999
Epoch 6, Loss: 0.2819
Epoch 7, Loss: 0.2717
Epoch 8, Loss: 0.2642
Epoch 9, Loss: 0.2507
Epoch 10, Loss: 0.2410
Epoch 11, Loss: 0.2437
Epoch 12, Loss: 0.2400
Epoch 13, Loss: 0.2299
Epoch 14, Loss: 0.2477
Epoch 15, Loss: 0.2302


In [25]:
model_resnet50.eval()
ious = []

with torch.no_grad():
    for imgs, masks in val_loader:
        imgs, masks = imgs.to(device), masks.to(device)
        preds = model_resnet50(imgs)
        iou = multiclass_iou_score(preds, masks)
        ious.append(iou)

print("Mean IoU:", np.mean(ious))

Mean IoU: 0.47541707360553875


## MIT_B1

In [26]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_mit1 = smp.Unet(
    encoder_name="mit_b1",
    encoder_weights="imagenet",
    in_channels=3,
    classes=8,  # SUIM: 8 классов
).to(device)

loss_fn = smp.losses.DiceLoss(mode='multiclass')
optimizer = torch.optim.Adam(model_mit1.parameters(), lr=1e-4)


config.json:   0%|          | 0.00/135 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/54.7M [00:00<?, ?B/s]

In [27]:
for epoch in range(15):
    model_mit1.train()
    total_loss = 0

    for imgs, masks in train_loader:
        imgs, masks = imgs.to(device), masks.to(device)
        preds = model_mit1(imgs)
        loss = loss_fn(preds, masks)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")


Epoch 1, Loss: 0.5932
Epoch 2, Loss: 0.4386
Epoch 3, Loss: 0.3834
Epoch 4, Loss: 0.3343
Epoch 5, Loss: 0.3250
Epoch 6, Loss: 0.3140
Epoch 7, Loss: 0.2826
Epoch 8, Loss: 0.2715
Epoch 9, Loss: 0.2487
Epoch 10, Loss: 0.2317
Epoch 11, Loss: 0.2292
Epoch 12, Loss: 0.2154
Epoch 13, Loss: 0.2055
Epoch 14, Loss: 0.1997
Epoch 15, Loss: 0.1845


In [28]:
model_mit1.eval()
ious = []

with torch.no_grad():
    for imgs, masks in val_loader:
        imgs, masks = imgs.to(device), masks.to(device)
        preds = model_mit1(imgs)
        iou = multiclass_iou_score(preds, masks)
        ious.append(iou)

print("Mean IoU:", np.mean(ious))

Mean IoU: 0.49538002662882064


## Обе модели показали аналогичный рост точности по задаче, по сравнению с прошлыми результатами, что подтверждает нашу гипотезу. Опять же преимущество на стороне MIT архитектуры выводы в этом соперничестве аналогичны прошлым.

# 4.	Имплементация алгоритма машинного обучения 

In [29]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class ConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.double_conv = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, 3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            
            nn.Conv2d(out_channels, out_channels, 3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        return self.double_conv(x)


class UNet(nn.Module):
    def __init__(self, in_channels=3, num_classes=8, features=[64, 128, 256, 512]):
        super().__init__()
        
        self.downs = nn.ModuleList()
        self.ups = nn.ModuleList()
        
        # Downsampling
        for feature in features:
            self.downs.append(ConvBlock(in_channels, feature))
            in_channels = feature
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        
        # Bottleneck
        self.bottleneck = ConvBlock(features[-1], features[-1] * 2)
        
        # Upsampling
        for feature in reversed(features):
            self.ups.append(
                nn.ConvTranspose2d(feature * 2, feature, kernel_size=2, stride=2)
            )
            self.ups.append(ConvBlock(feature * 2, feature))  

        # Final classifier
        self.final_conv = nn.Conv2d(features[0], num_classes, kernel_size=1)

    def forward(self, x):
        skip_connections = []

        for down in self.downs:
            x = down(x)
            skip_connections.append(x)
            x = self.pool(x)
        
        x = self.bottleneck(x)
        skip_connections = skip_connections[::-1]
        
        for idx in range(0, len(self.ups), 2):
            x = self.ups[idx](x)  
            skip = skip_connections[idx // 2]
            
            if x.shape != skip.shape:
                x = F.interpolate(x, size=skip.shape[2:])

            x = torch.cat((skip, x), dim=1)
            x = self.ups[idx + 1](x) 

        return self.final_conv(x)


In [33]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = UNet(in_channels=3, num_classes=8).to(device)


In [34]:
for epoch in range(10):
    model.train()
    total_loss = 0

    for imgs, masks in train_loader:
        imgs, masks = imgs.to(device), masks.to(device)
        preds = model(imgs)
        loss = loss_fn(preds, masks)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")


Epoch 1, Loss: 0.7587
Epoch 2, Loss: 0.7715
Epoch 3, Loss: 0.7601
Epoch 4, Loss: 0.7682
Epoch 5, Loss: 0.7497
Epoch 6, Loss: 0.7523
Epoch 7, Loss: 0.7687
Epoch 8, Loss: 0.7550
Epoch 9, Loss: 0.7610
Epoch 10, Loss: 0.7691


In [35]:
model.eval()
ious = []

with torch.no_grad():
    for imgs, masks in val_loader:
        imgs, masks = imgs.to(device), masks.to(device)
        preds = model(imgs)
        iou = multiclass_iou_score(preds, masks)
        ious.append(iou)

print("Mean IoU:", np.mean(ious))

Mean IoU: 0.03634133792457782


In [36]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SimpleConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        )
    def forward(self, x):
        return self.conv(x)

class SimpleUNet(nn.Module):
    def __init__(self, in_channels=3, num_classes=8, features=[32, 64, 128]):
        super().__init__()
        self.downs = nn.ModuleList()
        self.ups = nn.ModuleList()
        self.pool = nn.MaxPool2d(2)

        for feature in features:
            self.downs.append(SimpleConvBlock(in_channels, feature))
            in_channels = feature

        self.bottleneck = SimpleConvBlock(features[-1], features[-1] * 2)

        for feature in reversed(features):
            self.ups.append(nn.ConvTranspose2d(feature * 2, feature, kernel_size=2, stride=2))
            self.ups.append(SimpleConvBlock(feature * 2, feature))

        self.final_conv = nn.Conv2d(features[0], num_classes, kernel_size=1)

    def forward(self, x):
        skip_connections = []

        for down in self.downs:
            x = down(x)
            skip_connections.append(x)
            x = self.pool(x)

        x = self.bottleneck(x)
        skip_connections = skip_connections[::-1]

        for idx in range(0, len(self.ups), 2):
            x = self.ups[idx](x)
            skip = skip_connections[idx // 2]

            if x.shape != skip.shape:
                x = F.interpolate(x, size=skip.shape[2:])

            x = torch.cat((skip, x), dim=1)
            x = self.ups[idx + 1](x)

        return self.final_conv(x)


In [37]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_lite = SimpleUNet(in_channels=3, num_classes=8).to(device)

In [38]:
for epoch in range(50):
    model_lite.train()
    total_loss = 0

    for imgs, masks in train_loader:
        imgs, masks = imgs.to(device), masks.to(device)
        preds = model_lite(imgs)
        loss = loss_fn(preds, masks)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")


Epoch 1, Loss: 0.7651
Epoch 2, Loss: 0.7634
Epoch 3, Loss: 0.7500
Epoch 4, Loss: 0.7563
Epoch 5, Loss: 0.7642
Epoch 6, Loss: 0.7493
Epoch 7, Loss: 0.7677
Epoch 8, Loss: 0.7615
Epoch 9, Loss: 0.7694
Epoch 10, Loss: 0.7510
Epoch 11, Loss: 0.7592
Epoch 12, Loss: 0.7614
Epoch 13, Loss: 0.7592
Epoch 14, Loss: 0.7638


KeyboardInterrupt: 

In [39]:
model_lite.eval()
ious = []

with torch.no_grad():
    for imgs, masks in val_loader:
        imgs, masks = imgs.to(device), masks.to(device)
        preds = model_lite(imgs)
        iou = multiclass_iou_score(preds, masks)
        ious.append(iou)

print("Mean IoU:", np.mean(ious))

Mean IoU: 0.028478867086896457


## По лоссу видно что модели крайне неохотно хотят обучаться в тех же условиях что и готовые, но даже в случае хорошего обучения модели бы не показали тех же результатов, потому что готовые модели используют предобученные веса на imagenet.