# **Imports**

In [1]:
!pip install -q segmentation-models-pytorch torchmetrics transformers timm

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.8/154.8 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m93.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m65.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m53.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/

In [3]:
import json
from pathlib import Path

import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset

import segmentation_models_pytorch as smp
from torchmetrics import JaccardIndex, Accuracy, F1Score
from torchmetrics.segmentation import DiceScore
from torchvision import transforms
from torchvision.transforms import InterpolationMode
from torch.amp import GradScaler, autocast

from transformers import SegformerForSemanticSegmentation

import torch.nn.functional as F

from PIL import Image
import numpy as np

from IPython.display import FileLink

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device:", device)


Device: cuda


# **Setup**

## **Dataset**

In [None]:
# Initial setup: dataset locations and experiment configuration.
DATA_ROOT = Path("/kaggle/input/data-clean")
IMG_DIR, MSK_DIR = DATA_ROOT / "images", DATA_ROOT / "masks"

# One image directory and one mask directory per split.
train_img_dir, val_img_dir, test_img_dir = (
    IMG_DIR / split for split in ("train", "validation", "test")
)
train_msk_dir, val_msk_dir, test_msk_dir = (
    MSK_DIR / split for split in ("train", "validation", "test")
)

# Experiment description; it is also stored alongside checkpoints and results.
CONFIG = dict(
    experiment_name="segformer_b0_dacl10k_512",
    model="SegFormer-B0",
    encoder="segformer-b0",
    encoder_weights="ade20k",
    num_classes=14,
    image_size=[512, 512],
    batch_size=4,
    epochs=40,
    learning_rate=6e-5,
    loss="CrossEntropyLoss",
    optimizer="AdamW",
    scheduler="ReduceLROnPlateau",
    metrics=["mean_iou", "f1_macro", "global_pixel_accuracy"],
)

# Short aliases for the values the rest of the notebook reads most often.
NUM_CLASSES, BATCH_SIZE, EPOCHS, LR, IMAGE_SIZE = (
    CONFIG[key]
    for key in ("num_classes", "batch_size", "epochs", "learning_rate", "image_size")
)

In [None]:
# Dataset setup
class Dacl10kDataset(Dataset):
    """Image / segmentation-mask pairs read from two parallel directories.

    Every file in ``img_dir`` is treated as an image; its mask is looked up in
    ``msk_dir`` under the same file name with the extension replaced by
    ``.png``.

    Args:
        img_dir: Directory containing the input images.
        msk_dir: Directory containing the masks.
        image_size: ``(height, width)`` that images and masks are resized to.

    Each item is ``(img, mask)``: ``img`` is the resized, ImageNet-normalized
    image tensor produced by ``ToTensor``; ``mask`` is an ``int64`` tensor of
    the resized mask's pixel values.
    """

    def __init__(self, img_dir, msk_dir, image_size=(512, 512)):
        self.img_dir = Path(img_dir)
        self.msk_dir = Path(msk_dir)
        self.image_size = image_size

        # Keep regular files only (a stray sub-directory would crash
        # Image.open) and sort so that indices are deterministic across runs.
        self.img_paths = sorted(p for p in self.img_dir.iterdir() if p.is_file())

        # Image pipeline, identical for every split (no augmentation here).
        self.img_transform = transforms.Compose([
            transforms.Resize(self.image_size, interpolation=transforms.InterpolationMode.BILINEAR),
            transforms.ToTensor(),
            transforms.Normalize(  # Normalize each channel with ImageNet normalization values
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225],
            ),
        ])

        # Nearest-neighbour resizing never blends neighbouring pixels, so the
        # mask keeps its original integer values.
        self.mask_resize = transforms.Resize(
            self.image_size,
            interpolation=transforms.InterpolationMode.NEAREST,
        )

    def __len__(self):
        """Return the number of images found in ``img_dir``."""
        return len(self.img_paths)

    def _mask_path(self, img_path):
        """Return the mask path for ``img_path``: same stem, ``.png`` suffix.

        Only the extension is swapped, so a "jpg" substring elsewhere in the
        file name is left alone and ``.jpeg`` / ``.JPG`` images resolve too.
        """
        return self.msk_dir / Path(img_path).with_suffix(".png").name

    def __getitem__(self, idx):
        """Load, resize and convert the ``idx``-th image and its mask."""
        img_path = self.img_paths[idx]
        msk_path = self._mask_path(img_path)

        # Image
        img = Image.open(img_path).convert("RGB")
        img = self.img_transform(img)

        # Mask
        # NOTE(review): assumes the stored pixel values are already class
        # indices valid for the model/metrics — confirm against the dataset.
        mask = Image.open(msk_path)
        mask = self.mask_resize(mask)
        mask = torch.from_numpy(np.array(mask, dtype=np.int64))

        return img, mask

In [6]:
# Dataset definition
# One dataset per split, both resized to the configured image size.
train_dataset, val_dataset = (
    Dacl10kDataset(images, masks, IMAGE_SIZE)
    for images, masks in ((train_img_dir, train_msk_dir), (val_img_dir, val_msk_dir))
)

# The loaders share every setting except shuffling: the training split is
# reshuffled each epoch, the validation split keeps a fixed order.
train_loader, val_loader = (
    DataLoader(
        split,
        batch_size=BATCH_SIZE,
        shuffle=reshuffle,
        num_workers=2,
        pin_memory=True,
    )
    for split, reshuffle in ((train_dataset, True), (val_dataset, False))
)

print("Train samples:", len(train_dataset), " | batches:", len(train_loader))
print("Val samples:  ", len(val_dataset),   " | batches:", len(val_loader))


Train samples: 5895  | batches: 1474
Val samples:   1040  | batches: 260


## **Model, loss, optimizer, metrics**

In [None]:
# Model: SegFormer-B0 pre-trained on ADE20K with a freshly initialised
# classification head sized for this dataset's classes.
id2label = {i: f"class_{i}" for i in range(NUM_CLASSES)}
label2id = {v: k for k, v in id2label.items()}

model = SegformerForSemanticSegmentation.from_pretrained(
    "nvidia/segformer-b0-finetuned-ade-512-512",
    num_labels=NUM_CLASSES,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,  # head resized from 150 -> 14 classes
).to(device)


# Optimizer and scheduler. The scheduler halves the learning rate when the
# value passed to scheduler.step() has not improved for 3 epochs.
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
# `verbose` is deprecated on PyTorch LR schedulers, so it is not passed; the
# training loop already logs the current learning rate after every epoch.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode="min",
    factor=0.5,
    patience=3,
)

# Metrics
# NOTE(review): none of the metrics sets ignore_index — confirm the masks
# contain no void/ignore label outside [0, NUM_CLASSES).
# 1) Mean IoU (Jaccard index, macro-averaged over classes)
miou_metric = JaccardIndex(
    task="multiclass",
    num_classes=NUM_CLASSES,
    average="macro"
).to(device)

# 2) Dice score (macro over classes)
dice_metric = DiceScore(
    num_classes=NUM_CLASSES,
    include_background=True,
    average="macro",
    input_format="index",
    aggregation_level="global",
).to(device)

# 3) F1 Score (macro over classes)
f1_metric = F1Score(
    task="multiclass",
    num_classes=NUM_CLASSES,
    average="macro",
).to(device)

# 4) Global Pixel Accuracy
acc_metric = Accuracy(
    task="multiclass",
    num_classes=NUM_CLASSES,
).to(device)

print("SegFormerB0 params (M):", sum(p.numel() for p in model.parameters()) / 1e6)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/15.0M [00:00<?, ?B/s]

Some weights of SegformerForSemanticSegmentation were not initialized from the model checkpoint at nvidia/segformer-b0-finetuned-ade-512-512 and are newly initialized because the shapes did not match:
- decode_head.classifier.bias: found shape torch.Size([150]) in the checkpoint and torch.Size([14]) in the model instantiated
- decode_head.classifier.weight: found shape torch.Size([150, 256, 1, 1]) in the checkpoint and torch.Size([14, 256, 1, 1]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


SegFormerB0 params (M): 3.717742


In [None]:
CHECKPOINT_EVERY = 2  # epochs

# Gradient scaler for mixed-precision (AMP) training; it is only enabled when
# running on CUDA.
scaler = GradScaler(enabled=device.type == "cuda")

def train_one_epoch(model, loader, optimizer, epoch):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Uses the notebook-level ``device`` and ``scaler``. Mixed precision is
    enabled only when ``device`` is CUDA.

    Args:
        model: Model called as ``model(pixel_values=..., labels=...)`` whose
            output exposes ``.loss``.
        loader: Iterable of ``(images, masks)`` batches that supports ``len()``.
        optimizer: Optimizer updating ``model``'s parameters.
        epoch: Epoch number, used only in the progress messages.

    Returns:
        The average of the per-batch losses over the epoch.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    num_batches = len(loader)
    if num_batches == 0:
        raise ValueError("train_one_epoch received an empty loader")

    use_amp = device.type == "cuda"
    model.train()  # set model to training mode
    running_loss = 0.0  # sum of per-batch losses

    for step, (images, masks) in enumerate(loader, start=1):
        images = images.to(device, non_blocking=True)
        masks = masks.to(device, non_blocking=True)

        optimizer.zero_grad()  # zero the gradients

        # Mixed-precision forward pass. The autocast device type follows the
        # actual device instead of being hard-coded to "cuda".
        with autocast(device_type=device.type, enabled=use_amp):
            outputs = model(pixel_values=images, labels=masks)
            loss = outputs.loss

        # Scaled backward pass and optimizer step.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Read the loss back once per step; every .item() call forces a
        # device synchronisation.
        step_loss = loss.item()
        running_loss += step_loss

        if step % 50 == 0 or step == 1:
            print(f"[Epoch {epoch}] Step {step}/{num_batches} - Loss: {step_loss:.4f}")

    return running_loss / num_batches  # mean loss across the epoch


@torch.no_grad()
def evaluate(model, loader):
    """Compute the mean loss and segmentation metrics of ``model`` on ``loader``.

    Uses the notebook-level ``device`` and metric objects (``miou_metric``,
    ``dice_metric``, ``f1_metric``, ``acc_metric``). The metrics are reset at
    the start of every call. The model is left in eval mode.

    Args:
        model: Model called as ``model(pixel_values=..., labels=...)`` whose
            output exposes ``.loss`` and ``.logits``.
        loader: Iterable of ``(images, masks)`` batches that supports ``len()``.

    Returns:
        ``(val_loss, miou, dice, mf1, acc)`` as Python floats, where
        ``val_loss`` is the average of the per-batch losses.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    num_batches = len(loader)
    if num_batches == 0:
        raise ValueError("evaluate received an empty loader")

    use_amp = device.type == "cuda"
    model.eval()
    val_loss = 0.0

    # Reset metrics each evaluation
    miou_metric.reset()
    dice_metric.reset()
    f1_metric.reset()
    acc_metric.reset()

    for images, masks in loader:
        images = images.to(device, non_blocking=True)
        masks = masks.to(device, non_blocking=True)

        # Mixed-precision forward pass. The autocast device type follows the
        # actual device instead of being hard-coded to "cuda".
        with autocast(device_type=device.type, enabled=use_amp):
            outputs = model(pixel_values=images, labels=masks)
            loss = outputs.loss
            logits = outputs.logits  # [B, num_labels, H/4, W/4]

        val_loss += loss.item()

        # Upsample logits back to the mask resolution for the metrics
        # (SegFormer predicts at 1/4 of the input resolution).
        logits_upsampled = F.interpolate(
            logits,
            size=masks.shape[-2:],
            mode="bilinear",
            align_corners=False,
        )

        preds = torch.argmax(logits_upsampled, dim=1)

        # Update metrics
        miou_metric.update(preds, masks)
        dice_metric.update(preds, masks)
        f1_metric.update(preds, masks)
        acc_metric.update(preds, masks)

    val_loss /= num_batches  # mean loss over batches

    miou = miou_metric.compute().item()
    dice = dice_metric.compute().item()
    mf1 = f1_metric.compute().item()
    acc = acc_metric.compute().item()

    return val_loss, miou, dice, mf1, acc


# **Train**

In [None]:
history = []      # one dict of metrics per completed epoch
best_miou = 0.0   # best validation mIoU seen so far

# Training log (config + per-epoch history) in JSON format
output_path = Path("/kaggle/working/segformer_results.json")


def _checkpoint_payload(epoch):
    """Bundle the state saved in a full checkpoint taken at ``epoch``."""
    return {
        "config": CONFIG,
        "epoch": epoch,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "scheduler_state_dict": scheduler.state_dict(),
        "best_miou": best_miou,
        "history": history,
    }


def _write_results():
    """Write the config and the history so far to ``output_path``; return the dict."""
    payload = {
        "config": CONFIG,
        "history": history,
    }
    with open(output_path, "w") as f:
        json.dump(payload, f, indent=2)
    return payload


for epoch in range(1, EPOCHS + 1):
    print(f"\n===== Epoch {epoch}/{EPOCHS} =====")

    # Training epoch
    train_loss = train_one_epoch(model, train_loader, optimizer, epoch)

    # Metrics for train and validation sets
    train_loss_eval, train_miou, train_dice, train_f1, train_acc = evaluate(model, train_loader)
    val_loss, val_miou, val_dice, val_f1, val_acc = evaluate(model, val_loader)

    # step scheduler on val_loss
    scheduler.step(val_loss)

    current_lr = optimizer.param_groups[0]["lr"]

    print(
        f"Epoch {epoch:03d} | "
        f"TrainLoss(step): {train_loss:.4f} | "
        f"TrainLoss(eval): {train_loss_eval:.4f} | "
        f"ValLoss: {val_loss:.4f} | "
        f"Train mIoU: {train_miou:.4f} | "
        f"Val mIoU: {val_miou:.4f} | "
        f"Val Dice: {val_dice:.4f} | "
        f"Val F1: {val_f1:.4f} | "
        f"Val Acc: {val_acc:.4f} | "
        f"LR: {current_lr:.6f}"
    )

    # Store metrics
    history.append({
        "epoch": epoch,
        # training loss from the actual training loop
        "train_loss_step": float(train_loss),
        # training loss recomputed in eval mode
        "train_loss_eval": float(train_loss_eval),
        "train_miou": float(train_miou),
        "train_dice": float(train_dice),
        "train_f1_macro": float(train_f1),
        "train_global_pixel_accuracy": float(train_acc),
        "val_loss": float(val_loss),
        "val_miou": float(val_miou),
        "val_dice": float(val_dice),
        "val_f1_macro": float(val_f1),
        "val_global_pixel_accuracy": float(val_acc),
        "lr": float(current_lr),
    })

    # Persist the log after every epoch so an interrupted session still
    # leaves the metrics collected so far on disk.
    _write_results()

    # save best model by mIoU
    if val_miou > best_miou:
        best_miou = val_miou
        torch.save(model.state_dict(), "segformer_best_miou.pth")
        print("  -> New best mIoU; weights saved to segformer_best_miou.pth")

    # periodic full checkpoint save
    if epoch % CHECKPOINT_EVERY == 0:
        ckpt_path = f"segformer_checkpoint_epoch_{epoch}.pth"
        torch.save(_checkpoint_payload(epoch), ckpt_path)
        print(f"  -> Checkpoint saved to {ckpt_path}")

# final "last" checkpoint
torch.save(_checkpoint_payload(EPOCHS), "segformer_last.pth")

print("\nTraining complete. Best mIoU:", best_miou)

# Final write of the training log
results = _write_results()

print("Saved metrics to:", output_path)

# Download link for results
FileLink(str(output_path))



===== Epoch 1/40 =====
[Epoch 1] Step 1/1474 - Loss: 2.6916
[Epoch 1] Step 50/1474 - Loss: 2.3649
[Epoch 1] Step 100/1474 - Loss: 1.8830
[Epoch 1] Step 150/1474 - Loss: 1.6227
[Epoch 1] Step 200/1474 - Loss: 1.2534
[Epoch 1] Step 250/1474 - Loss: 0.9543
[Epoch 1] Step 300/1474 - Loss: 1.4341
[Epoch 1] Step 350/1474 - Loss: 1.1236
[Epoch 1] Step 400/1474 - Loss: 0.8278
[Epoch 1] Step 450/1474 - Loss: 0.7750
[Epoch 1] Step 500/1474 - Loss: 0.5660
[Epoch 1] Step 550/1474 - Loss: 1.4163
[Epoch 1] Step 600/1474 - Loss: 1.6932
[Epoch 1] Step 650/1474 - Loss: 1.1965
[Epoch 1] Step 700/1474 - Loss: 1.2037
[Epoch 1] Step 750/1474 - Loss: 0.5148
[Epoch 1] Step 800/1474 - Loss: 0.4693
[Epoch 1] Step 850/1474 - Loss: 1.5832
[Epoch 1] Step 900/1474 - Loss: 0.5555
[Epoch 1] Step 950/1474 - Loss: 0.6670
[Epoch 1] Step 1000/1474 - Loss: 0.9260
[Epoch 1] Step 1050/1474 - Loss: 0.6068
[Epoch 1] Step 1100/1474 - Loss: 0.6969
[Epoch 1] Step 1150/1474 - Loss: 1.4538
[Epoch 1] Step 1200/1474 - Loss: 0.7707