# **Imports**

In [2]:
!pip install -q segmentation-models-pytorch torchmetrics transformers

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.8/154.8 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m92.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m65.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m46.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/

In [2]:
import json
from pathlib import Path
from tqdm import tqdm

import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset

import segmentation_models_pytorch as smp
from torchmetrics import JaccardIndex, Accuracy, F1Score
from torchmetrics.segmentation import DiceScore
from torchvision import transforms
from torchvision.transforms import InterpolationMode
from torch.amp import GradScaler, autocast

from PIL import Image
import numpy as np

from IPython.display import FileLink

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)


Device: cuda


# **Setup**

## **Dataset**

In [6]:
# Initial setup
DATA_ROOT = Path("/kaggle/input/data-clean")
IMG_DIR = DATA_ROOT / "images"
MSK_DIR = DATA_ROOT / "masks"

train_img_dir = IMG_DIR / "train"
val_img_dir   = IMG_DIR / "validation"
test_img_dir  = IMG_DIR / "test"

train_msk_dir = MSK_DIR / "train"
val_msk_dir   = MSK_DIR / "validation"
test_msk_dir  = MSK_DIR / "test"

CONFIG = {
    "experiment_name": "deeplabv3plus_resnet50_dacl10k_512",
    "model": "DeepLabV3Plus",
    "encoder": "resnet50",
    "encoder_weights": "imagenet",
    "num_classes": 14,
    "image_size": [512, 512],
    "batch_size": 8,
    "epochs": 40,
    "learning_rate": 1e-4,
    "loss": "CrossEntropyLoss",
    "optimizer": "Adam",
    "scheduler": "ReduceLROnPlateau",
    "metrics": [
        "mean_iou_per_class",
        "dice_macro",
        "f1_macro",
        "global_pixel_accuracy",
    ],
}


NUM_CLASSES = CONFIG["num_classes"]
BATCH_SIZE  = CONFIG["batch_size"]
EPOCHS      = CONFIG["epochs"]
LR          = CONFIG["learning_rate"]
IMAGE_SIZE  = CONFIG["image_size"]

In [None]:
# Dataset setup
class Dacl10kDataset(Dataset):
    def __init__(self, img_dir, msk_dir, image_size=(512, 512)):
        self.img_dir = Path(img_dir)
        self.msk_dir = Path(msk_dir)
        self.image_size = image_size

        self.img_paths = sorted([p for p in self.img_dir.iterdir()])

        # Transformations for training images
        self.img_transform = transforms.Compose([
            transforms.Resize(self.image_size, interpolation=transforms.InterpolationMode.BILINEAR),
            transforms.ToTensor(),
            transforms.Normalize(  # Normalize each channel with ImageNet normalization values
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225],
            ),
        ])

        self.mask_resize = transforms.Resize(
            self.image_size,
            interpolation=transforms.InterpolationMode.NEAREST, # Change interpolation value to keep integers
        )

    def __len__(self):
        return len(self.img_paths)

    def __getitem__(self, idx):
        img_path = self.img_paths[idx]
        msk_path = self.msk_dir / img_path.name.replace("jpg", "png") # same filename, jpg -> png

        # Image
        img = Image.open(img_path).convert("RGB")
        img = self.img_transform(img)

        # Mask
        mask = Image.open(msk_path)
        mask = self.mask_resize(mask)
        mask = torch.from_numpy(np.array(mask, dtype=np.int64))

        return img, mask

In [8]:
# Dataset definition
train_dataset = Dacl10kDataset(train_img_dir, train_msk_dir, IMAGE_SIZE)
val_dataset   = Dacl10kDataset(val_img_dir,   val_msk_dir,   IMAGE_SIZE)

train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2,
    pin_memory=True,
)

val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=2,
    pin_memory=True,
)

print("Train samples:", len(train_dataset), " | batches:", len(train_loader))
print("Val samples:  ", len(val_dataset),   " | batches:", len(val_loader))


Train samples: 5895  | batches: 737
Val samples:   1040  | batches: 130


## **Model, loss, optimizer, metrics**

In [None]:
# Model
model = smp.DeepLabV3Plus(
    encoder_name=CONFIG["encoder"],
    encoder_weights=CONFIG["encoder_weights"],
    in_channels=3,
    classes=NUM_CLASSES,
).to(device)

# Loss and optimizer. Scheduler will decrease learning rate by half when val_loss does not improve for 3 epochs
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode="min",
    factor=0.5,
    patience=3,
    verbose=True,
)

# Metrics
# 1) Mean IoU per class (Jaccard)
miou_metric = JaccardIndex(
    task="multiclass",
    num_classes=NUM_CLASSES,
).to(device)

# 2) Dice score (macro over classes)
dice_metric = DiceScore(
    num_classes=NUM_CLASSES,
    average="macro",
).to(device)

# 3) F1 Score (macro over classes)
f1_metric = F1Score(
    task="multiclass",
    num_classes=NUM_CLASSES,
    average="macro",
).to(device)

# 4) Global Pixel Accuracy
acc_metric = Accuracy(
    task="multiclass",
    num_classes=NUM_CLASSES,
).to(device)

print("DeepLabV3+ params (M):", sum(p.numel() for p in model.parameters()) / 1e6)

config.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

DeepLabV3+ params (M): 26.680926


# **Train**

In [None]:
scaler = GradScaler(enabled=(device.type == "cuda")) # Uses AMP to speed up training

CHECKPOINT_EVERY = 2  # epochs

def train_one_epoch(model, loader, optimizer, criterion, epoch):
    model.train() # set model to training mode
    running_loss = 0.0 # start total loss at 0.0

    for step, (images, masks) in enumerate(loader, start=1):
        images = images.to(device, non_blocking=True)
        masks  = masks.to(device, non_blocking=True)

        optimizer.zero_grad() # zero the gradients

         # Uses AMP to speed up training
        with autocast(device_type="cuda", enabled=(device.type == "cuda")):
            outputs = model(images)           
            loss = criterion(outputs, masks)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        running_loss += loss.item() # add losses in each step

        if step % 50 == 0 or step == 1:
            print(f"[Epoch {epoch}] Step {step}/{len(loader)} - Loss: {loss.item():.4f}")

    return running_loss / len(loader) # return mean loss across an epoch


@torch.no_grad()
def evaluate(model, loader, criterion):
    model.eval()
    val_loss = 0.0

    # Reset metrics each evaluation
    miou_metric.reset()
    dice_metric.reset()
    f1_metric.reset()
    acc_metric.reset()

    for images, masks in loader:
        images = images.to(device, non_blocking=True)
        masks  = masks.to(device, non_blocking=True)

        # Uses AMP to speed up training
        with autocast(device_type="cuda", enabled=(device.type == "cuda")):
            outputs = model(images)
            loss = criterion(outputs, masks)

        val_loss += loss.item()

        preds = torch.argmax(outputs, dim=1)  # Get class index with highest probability

        # Update metrics 
        miou_metric.update(preds, masks)
        dice_metric.update(preds, masks)
        f1_metric.update(preds, masks)
        acc_metric.update(preds, masks)

    val_loss /= len(loader) # compute mean loss

    miou = miou_metric.compute().item() 
    dice = dice_metric.compute().item()   
    mf1  = f1_metric.compute().item()   
    acc  = acc_metric.compute().item() 

    return val_loss, miou, dice, mf1, acc


In [None]:
history = [] 
best_miou = 0.0

for epoch in range(1, EPOCHS + 1):
    print(f"\n===== Epoch {epoch}/{EPOCHS} =====")

    # Training epoch
    train_loss = train_one_epoch(model, train_loader, optimizer, criterion, epoch)

    # Metrics for train and validation sets
    train_loss_eval, train_miou, train_dice, train_f1, train_acc = evaluate(model, train_loader, criterion)
    val_loss, val_miou, val_dice, val_f1, val_acc = evaluate(model, val_loader, criterion)

    # step scheduler on val_loss
    scheduler.step(val_loss)

    current_lr = optimizer.param_groups[0]["lr"]
    
    print(
        f"Epoch {epoch:03d} | "
        f"TrainLoss(step): {train_loss:.4f} | "
        f"TrainLoss(eval): {train_loss_eval:.4f} | "
        f"ValLoss: {val_loss:.4f} | "
        f"Train mIoU: {train_miou:.4f} | "
        f"Val mIoU: {val_miou:.4f} | "
        f"Val Dice: {val_dice:.4f} | "
        f"Val F1: {val_f1:.4f} | "
        f"Val Acc: {val_acc:.4f} | "
        f"LR: {current_lr:.6f}"
    )

    # Store metrics
    history.append({
        "epoch": epoch,
        # training loss from the actual training loop
        "train_loss_step": float(train_loss),
        # training loss recomputed in eval mode
        "train_loss_eval": float(train_loss_eval),
        "train_miou": float(train_miou),
        "train_dice": float(train_dice),
        "train_f1_macro": float(train_f1),
        "train_global_pixel_accuracy": float(train_acc),
        "val_loss": float(val_loss),
        "val_miou": float(val_miou),
        "val_dice": float(val_dice),
        "val_f1_macro": float(val_f1),
        "val_global_pixel_accuracy": float(val_acc),
        "lr": float(current_lr),
    })

    # save best model by mIoU
    if val_miou > best_miou:
        best_miou = val_miou
        torch.save(model.state_dict(), "deeplab_best_miou.pth")
        print("  -> New best mIoU; weights saved to deeplab_best_miou.pth")

    # periodic full checkpoint save
    if epoch % CHECKPOINT_EVERY == 0:
        ckpt_path = f"deeplab_checkpoint_epoch_{epoch}.pth"
        torch.save({
            "config": CONFIG,
            "epoch": epoch,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "scheduler_state_dict": scheduler.state_dict(),
            "best_miou": best_miou,
            "history": history,
        }, ckpt_path)
        print(f"  -> Checkpoint saved to {ckpt_path}")

# final "last" checkpoint
torch.save({
    "config": CONFIG,
    "epoch": EPOCHS,
    "model_state_dict": model.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
    "scheduler_state_dict": scheduler.state_dict(),
    "best_miou": best_miou,
    "history": history,
}, "deeplab_last.pth")

print("\nTraining complete. Best mIoU:", best_miou)

# Store training logs in JSON format

output_path = Path("/kaggle/working/deeplab_results.json")

results = {
    "config": CONFIG,
    "history": history,
}

with open(output_path, "w") as f:
    json.dump(results, f, indent=2)

print("Saved metrics to:", output_path)

# Download link for results
FileLink('/kaggle/working/deeplab_results.json')



===== Epoch 1/40 =====
[Epoch 1] Step 1/737 - Loss: 0.6450
[Epoch 1] Step 50/737 - Loss: 0.6377
[Epoch 1] Step 100/737 - Loss: 0.4635
[Epoch 1] Step 150/737 - Loss: 0.7103
[Epoch 1] Step 200/737 - Loss: 0.5733
[Epoch 1] Step 250/737 - Loss: 0.8020
[Epoch 1] Step 300/737 - Loss: 0.7949
[Epoch 1] Step 350/737 - Loss: 0.6793
[Epoch 1] Step 400/737 - Loss: 0.6238
[Epoch 1] Step 450/737 - Loss: 0.8973
[Epoch 1] Step 500/737 - Loss: 0.6295
[Epoch 1] Step 550/737 - Loss: 0.9504
[Epoch 1] Step 600/737 - Loss: 0.7980
[Epoch 1] Step 650/737 - Loss: 0.7801
[Epoch 1] Step 700/737 - Loss: 0.7992
Epoch 001 | TrainLoss(step): 0.8234 | TrainLoss(eval): 0.7466 | ValLoss: 0.7847 | Train mIoU: 0.1897 | Val mIoU: 0.1705 | Val Dice: 1.6069 | Val F1: 0.2452 | Val Acc: 0.7593 | LR: 0.000100
  -> New best mIoU; weights saved to deeplab_best_miou.pth

===== Epoch 2/40 =====
[Epoch 2] Step 1/737 - Loss: 0.7732
[Epoch 2] Step 50/737 - Loss: 0.6691
[Epoch 2] Step 100/737 - Loss: 0.5755
[Epoch 2] Step 150/737 - L

# **Resume training with weighted CE loss**

In [10]:
# Load checkpoint
CKPT_PATH = "/kaggle/input/deeplab-last/deeplab_last.pth"

ckpt = torch.load(CKPT_PATH, map_location=device)

print("Checkpoint keys:", ckpt.keys())
print("Last epoch:", ckpt["epoch"])
print("Best mIoU so far:", ckpt["best_miou"])


Checkpoint keys: dict_keys(['config', 'epoch', 'model_state_dict', 'optimizer_state_dict', 'scheduler_state_dict', 'best_miou', 'history'])
Last epoch: 40
Best mIoU so far: 0.3128804564476013


In [12]:
# Define model, optimizer and metrics in the same way as in previous training
# Model
model = smp.DeepLabV3Plus(
    encoder_name=CONFIG["encoder"],
    encoder_weights=CONFIG["encoder_weights"],
    in_channels=3,
    classes=NUM_CLASSES,
).to(device)

model.load_state_dict(ckpt["model_state_dict"])

# Optimizer and scheduler
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=CONFIG["learning_rate"],
)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode="min",
    factor=0.5,
    patience=3,
    verbose=True,
)

optimizer.load_state_dict(ckpt["optimizer_state_dict"])
scheduler.load_state_dict(ckpt["scheduler_state_dict"])

best_miou = ckpt["best_miou"]
history   = ckpt["history"]    
start_epoch = ckpt["epoch"] + 1
print("Resuming from epoch", start_epoch)

# Metrics
# 1) Mean IoU per class (Jaccard)
miou_metric = JaccardIndex(
    task="multiclass",
    num_classes=NUM_CLASSES,
).to(device)

# 2) Dice score (macro over classes)
dice_metric = DiceScore(
    num_classes=NUM_CLASSES,
    average="macro",
).to(device)

# 3) F1 Score (macro over classes)
f1_metric = F1Score(
    task="multiclass",
    num_classes=NUM_CLASSES,
    average="macro",
).to(device)

# 4) Global Pixel Accuracy
acc_metric = Accuracy(
    task="multiclass",
    num_classes=NUM_CLASSES,
).to(device)


Resuming from epoch 41


In [13]:
# Compute class imbalance (pixel-wise) to define class weights
class_counts = torch.zeros(NUM_CLASSES, dtype=torch.long)

# Set model to evaluation
model.eval()

with torch.no_grad():
    for _, masks in tqdm(train_loader, desc="Computing class frequencies"):
        # masks: [B, H, W]
        masks = masks.view(-1)  # flatten
        values, freqs = torch.unique(masks, return_counts=True)
        class_counts[values] += freqs # accumulate pixel count per class

print("Class counts:", class_counts.tolist())

# Avoid division by zero
freq = class_counts.float() / class_counts.sum()
inv_freq = 1.0 / (freq + 1e-6)

# Normalize so average weight ~ 1
class_weights = inv_freq * (NUM_CLASSES / inv_freq.sum())

print("Class weights:", class_weights.tolist())

Computing class frequencies: 100%|██████████| 737/737 [04:03<00:00,  3.03it/s]

Class counts: [1148926007, 5129526, 2217252, 37482497, 30849929, 19727449, 4548455, 34674896, 24742215, 10756736, 32937080, 161365569, 3466853, 28514416]
Class weights: [0.008133492432534695, 1.8212167024612427, 4.2116475105285645, 0.24930056929588318, 0.3028961420059204, 0.47365784645080566, 2.053800582885742, 0.26948535442352295, 0.3776625692844391, 0.8686138987541199, 0.28370317816734314, 0.057910144329071045, 2.694267511367798, 0.32770389318466187]





In [14]:
# Define new loss: Same CrossEntropy but adding class weights
criterion = nn.CrossEntropyLoss(weight=class_weights.to(device))

In [None]:
EXTRA_EPOCHS = 40
END_EPOCH = start_epoch + EXTRA_EPOCHS - 1

for epoch in range(start_epoch, END_EPOCH + 1):
    print(f"\n===== Epoch {epoch}/{END_EPOCH} (resumed with weighted CE) =====")

    # Training epoch
    train_loss = train_one_epoch(model, train_loader, optimizer, criterion, epoch)

    # Metrics for train and validation sets
    train_loss_eval, train_miou, train_dice, train_f1, train_acc = evaluate(model, train_loader, criterion)
    val_loss, val_miou, val_dice, val_f1, val_acc = evaluate(model, val_loader, criterion)

    # step scheduler on val_loss
    scheduler.step(val_loss)

    current_lr = optimizer.param_groups[0]["lr"]
    
    print(
        f"Epoch {epoch:03d} | "
        f"TrainLoss(step): {train_loss:.4f} | "
        f"TrainLoss(eval): {train_loss_eval:.4f} | "
        f"ValLoss: {val_loss:.4f} | "
        f"Train mIoU: {train_miou:.4f} | "
        f"Val mIoU: {val_miou:.4f} | "
        f"Val Dice: {val_dice:.4f} | "
        f"Val F1: {val_f1:.4f} | "
        f"Val Acc: {val_acc:.4f} | "
        f"LR: {current_lr:.6f}"
    )

    # Store metrics
    history.append({
        "epoch": epoch,
        # training loss from the actual training loop
        "train_loss_step": float(train_loss),
        # training loss recomputed in eval mode
        "train_loss_eval": float(train_loss_eval),
        "train_miou": float(train_miou),
        "train_dice": float(train_dice),
        "train_f1_macro": float(train_f1),
        "train_global_pixel_accuracy": float(train_acc),
        "val_loss": float(val_loss),
        "val_miou": float(val_miou),
        "val_dice": float(val_dice),
        "val_f1_macro": float(val_f1),
        "val_global_pixel_accuracy": float(val_acc),
        "lr": float(current_lr),
    })

    # save best model by mIoU
    if val_miou > best_miou:
        best_miou = val_miou
        torch.save(model.state_dict(), "deeplab_best_miou_wce.pth")
        print("  -> New best mIoU; weights saved to deeplab_best_miou_wce.pth")

    # periodic full checkpoint save
    if epoch % CHECKPOINT_EVERY == 0:
        ckpt_path = f"deeplab_checkpoint_epoch_{epoch}_wce.pth"
        torch.save({
            "config": CONFIG,
            "epoch": epoch,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "scheduler_state_dict": scheduler.state_dict(),
            "best_miou": best_miou,
            "history": history,
        }, ckpt_path)
        print(f"  -> Checkpoint saved to {ckpt_path}")

# final "last" checkpoint
torch.save({
    "config": CONFIG,
    "epoch": END_EPOCH,
    "model_state_dict": model.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
    "scheduler_state_dict": scheduler.state_dict(),
    "best_miou": best_miou,
    "history": history,
}, "deeplab_last_wce.pth")

print("\nTraining complete. Best mIoU:", best_miou)

# Store training logs in JSON format

output_path = Path("/kaggle/working/deeplab_results_wce.json")

results = {
    "config": CONFIG,
    "history": history,
}

with open(output_path, "w") as f:
    json.dump(results, f, indent=2)

print("Saved metrics to:", output_path)

# Download link for results
FileLink('/kaggle/working/deeplab_results_wce.json')



===== Epoch 41/80 (resumed with weighted CE) =====
[Epoch 41] Step 1/737 - Loss: 0.7688
[Epoch 41] Step 50/737 - Loss: 0.7984
[Epoch 41] Step 100/737 - Loss: 0.5377
[Epoch 41] Step 150/737 - Loss: 0.6618
[Epoch 41] Step 200/737 - Loss: 0.5389
[Epoch 41] Step 250/737 - Loss: 0.4518
[Epoch 41] Step 300/737 - Loss: 0.4877
[Epoch 41] Step 350/737 - Loss: 0.4679
[Epoch 41] Step 400/737 - Loss: 0.2205
[Epoch 41] Step 450/737 - Loss: 0.6048
[Epoch 41] Step 500/737 - Loss: 0.5975
[Epoch 41] Step 550/737 - Loss: 0.3584
[Epoch 41] Step 600/737 - Loss: 0.4334
[Epoch 41] Step 650/737 - Loss: 0.3878
[Epoch 41] Step 700/737 - Loss: 0.4248
Epoch 041 | TrainLoss(step): 0.4884 | TrainLoss(eval): 0.4021 | ValLoss: 2.2341 | Train mIoU: 0.7101 | Val mIoU: 0.3139 | Val Dice: 2.9194 | Val F1: 0.4527 | Val Acc: 0.7678 | LR: 0.000000
  -> New best mIoU; weights saved to deeplab_best_miou_wce.pth

===== Epoch 42/80 (resumed with weighted CE) =====
[Epoch 42] Step 1/737 - Loss: 0.1921
[Epoch 42] Step 50/737 - 