# Setup

In [None]:
# Install the necessary dependencies
!pip install torch torchvision torchaudio
!pip install datasets
!pip install evaluate
!pip install albumentations
!pip install git+https://github.com/huggingface/transformers.git

# We will use this to push our trained model to HF Hub
!pip install huggingface_hub
!pip install torchmetrics
!pip install 'accelerate>=1.1.0'
!pip install matplotlib
!pip install pycocotools


In [None]:
# Import the necessary packages
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import json
from pathlib import Path
from PIL import Image, ImageDraw
import torch
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import albumentations as A
import numpy as np
import pandas as pd
from datasets import DatasetDict, Dataset, load_from_disk
from transformers import (
    MaskFormerForInstanceSegmentation,
    MaskFormerImageProcessor
)
from transformers.trainer import EvalPrediction
from torchmetrics.detection.mean_ap import MeanAveragePrecision
import evaluate
from huggingface_hub import notebook_login
from dataclasses import dataclass, field
from typing import Any, Optional
import logging
import transformers
import sys

# Load Dataset

In [None]:
# Load COCO-style annotations from the 'buildings' dataset and convert to the Instance Segmentations format

# ------------------------------
# SKIP IF DATASET ALREADY IN HF
# ------------------------------

def coco2seg(dataset_dir, splits=('train', 'val', 'test')):
    """
    Convert COCO-style polygon annotations into per-image instance-segmentation
    PNGs and build a Hugging Face ``DatasetDict`` that references them.

    For every split, ``<dataset_dir>/<split>/<split>_512.json`` is read and one
    annotation PNG per image is written to ``<dataset_dir>/<split>/annotation/``.
    The resulting DatasetDict is also saved to ``<dataset_dir>/hf``.

    Args:
        dataset_dir: path to the dataset root.
        splits: names of the splits to load and convert.

    Returns:
        DatasetDict with one entry per converted split. Annotated splits store
        file paths (not decoded images) to keep memory usage low:
            - 'image': path of the source image
            - 'annotation': path of a PNG where
                            R channel = category_id + 1 (0 = background)
                            G channel = instance id (1..255, unique per image,
                                        0 = background)
                            B channel = 0
        A 'test' split that has images but no annotation JSON holds decoded
        RGB ``PIL.Image`` objects under 'image' only.
    """
    dataset_dir = Path(dataset_dir)
    image_suffixes = (".jpg", ".jpeg", ".png")
    # Instance ids live in a single uint8 channel and 0 is reserved for background.
    max_instances = 255
    result = {}

    for split in splits:
        coco_path = dataset_dir / f"{split}/{split}_512.json"
        img_dir = dataset_dir / split / "image_512"

        if not coco_path.exists() or not img_dir.exists():
            if split == "test" and img_dir.exists():
                print(f"⚠️ No annotation file found for '{split}' — loading images only.")
                images = []
                # iterdir() already yields full paths; sorted for a reproducible order.
                for img_file in sorted(img_dir.iterdir()):
                    if not img_file.name.lower().endswith(image_suffixes):
                        continue
                    # convert() returns an independent copy, so the file handle
                    # can be released right away.
                    with Image.open(img_file) as img:
                        images.append({"image": img.convert("RGB")})
                result[split] = Dataset.from_list(images)
                continue
            print(f"WARNING: Missing split '{split}', skipping.")
            continue

        print(f"Processing split '{split}'...")

        # Load COCO annotation JSON
        with open(coco_path, "r") as f:
            coco = json.load(f)

        images = {img["id"]: img for img in coco["images"]}

        # Group annotations by image_id
        anns_by_img = {}
        for ann in coco["annotations"]:
            anns_by_img.setdefault(ann["image_id"], []).append(ann)

        mask_dir = dataset_dir / split / "annotation"
        mask_dir.mkdir(parents=True, exist_ok=True)

        records = []
        for img_id, img_info in tqdm(images.items()):
            file_name = Path(img_info["file_name"]).name
            width, height = img_info["width"], img_info["height"]
            image_path = img_dir / file_name

            if not image_path.exists():
                continue

            # One single-channel canvas per annotation channel.
            r = Image.new("L", (width, height), 0)  # Category
            g = Image.new("L", (width, height), 0)  # Instance
            draw_r = ImageDraw.Draw(r)
            draw_g = ImageDraw.Draw(g)

            instance_counter = 1
            for ann in anns_by_img.get(img_id, []):
                polygons = ann.get("segmentation", [])
                # Only polygon lists are rasterised (e.g. RLE dicts are skipped).
                if not polygons or not isinstance(polygons, list):
                    continue

                # Warn only when an instance is really being dropped.
                if instance_counter > max_instances:
                    print(f"WARNING: Too many instances in {file_name}, clipping to 255.")
                    break

                cat_id = int(ann["category_id"])

                # Each polygon in COCO is a flat list [x1, y1, x2, y2, ...]
                for poly in polygons:
                    if len(poly) < 6:  # fewer than 3 points: invalid polygon
                        continue
                    # `len(poly) - 1` ignores a dangling coordinate in odd-length lists.
                    xy = [(poly[i], poly[i + 1]) for i in range(0, len(poly) - 1, 2)]
                    # +1 keeps 0 free for background in the category channel.
                    draw_r.polygon(xy, fill=cat_id + 1)
                    draw_g.polygon(xy, fill=instance_counter)

                instance_counter += 1

            # Merge the category (R) and instance (G) channels into an RGB image.
            ann_img = np.stack([
                np.array(r),
                np.array(g),
                np.zeros((height, width), np.uint8)],
                axis=-1)

            # PNG is lossless, which is required to keep the ids intact.
            mask_path = mask_dir / f"{Path(img_info['file_name']).stem}.png"
            Image.fromarray(ann_img.astype(np.uint8)).save(mask_path)

            records.append({
                "image": str(image_path),
                "annotation": str(mask_path)
            })

        result[split] = Dataset.from_list(records)

    dataset = DatasetDict(result)

    dataset.save_to_disk(dataset_dir / "hf")

    return dataset

# DATASET_DIR = Path("./building-extraction-generalization-2024")

# dataset = coco2seg(DATASET_DIR)

In [None]:
# ------------------------------
# SKIP IF DATASET ALREADY IN HF
# ------------------------------

# Upload Dataset to Hugging Face HUB
import glob
import os

# `datasets.Image` is imported under an alias so that it does not rebind the
# bare name `Image`, which other cells use as `PIL.Image` (e.g. `Image.fromarray`).
from datasets import Dataset, DatasetDict, Features
from datasets import Image as HFImage
from huggingface_hub import login

# Never hard-code an access token in the notebook: use the HF_TOKEN environment
# variable when it is set, otherwise `login` prompts for the token.
login(token=os.environ.get("HF_TOKEN"))

DATASET_ROOT = "./building-extraction-generalization-2024"

features = Features({
    "image": HFImage(),
    "annotation": HFImage()
})


def gen_examples(image_paths, annotation_paths):
    """Yield one example per (image path, annotation path) pair.

    Paths are passed through as `{"path": ...}` so that `datasets` loads the
    files itself. An annotation path of `None` produces a null annotation.
    """
    for img_path, ann_path in zip(image_paths, annotation_paths):
        yield {
            "image": {"path": img_path},
            "annotation": None if ann_path is None else {"path": ann_path},
        }


def build_split(split, require_annotations=True):
    """Build the `Dataset` of one split from `<DATASET_ROOT>/<split>/`.

    Every `image_512/*.jpg` is paired with `annotation/<same stem>.png`, so a
    missing file cannot silently shift all following pairs.

    Args:
        split: split directory name ("train", "val" or "test").
        require_annotations: when True a missing annotation PNG raises
            FileNotFoundError; when False the annotation is stored as null.
    """
    split_dir = os.path.join(DATASET_ROOT, split)
    image_paths = sorted(glob.glob(os.path.join(split_dir, "image_512", "*.jpg")))
    if not image_paths:
        raise FileNotFoundError(f"No images found for split '{split}' in {split_dir}")

    annotation_paths = []
    for img_path in image_paths:
        stem = os.path.splitext(os.path.basename(img_path))[0]
        ann_path = os.path.join(split_dir, "annotation", f"{stem}.png")
        if not os.path.isfile(ann_path):
            if require_annotations:
                raise FileNotFoundError(f"Missing annotation for {img_path}: {ann_path}")
            ann_path = None
        annotation_paths.append(ann_path)

    # The paths are passed via gen_kwargs instead of being read from globals,
    # so each split gets its own inputs.
    return Dataset.from_generator(
        gen_examples,
        gen_kwargs={"image_paths": image_paths, "annotation_paths": annotation_paths},
        features=features,
    )


# Each split is read from its own directory.
train_ds = build_split("train")
val_ds = build_split("val")
# The test split may come without annotation PNGs.
test_ds = build_split("test", require_annotations=False)

# Upload to HUB
DatasetDict({"train": train_ds, "val": val_ds, "test": test_ds}).push_to_hub("tomascanivari/building_extraction")

In [None]:
from datasets import load_dataset
# PIL's Image is imported under an alias: the dataset-upload cell rebinds the
# bare name `Image` to `datasets.Image`, which has no `fromarray`.
from PIL import Image as PILImage

# Load Converted Dataset
DATASET_HF_DIR = "tomascanivari/building_extraction"

dataset = load_dataset(DATASET_HF_DIR)

print(dataset)

# Let's check the first train image and annotation
example = dataset["train"][0]
img = example["image"]
ann = example["annotation"]

# Convert the PIL images to arrays
image = np.array(img.convert("RGB"))
annotation = np.array(ann)

# These print the distinct ids found in each channel, not a count.
print("Category IDs present: ", np.unique(annotation[..., 0]))  # Red channel: category IDs
print("Instance IDs present: ", np.unique(annotation[..., 1]))  # Green channel: instance IDs

# Plot the original image next to the two annotation channels
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
panels = [
    ("Original", image),
    ("Class Map (R)", annotation[..., 0]),
    ("Instance Map (G)", annotation[..., 1]),
]
for ax, (title, plot_image) in zip(axes, panels):
    ax.imshow(plot_image)
    ax.set_title(title)
    ax.axis("off")

# Let's check the instance with id 1
print("Instance 1")
mask = (annotation[..., 1] == 1)
visual_mask = (mask * 255).astype(np.uint8)
PILImage.fromarray(visual_mask)

In [None]:
# ------------------------------
# SKIP IF MODEL ALREADY IN HF
# ------------------------------

# Label ids start from 0 here while the annotation PNGs start from 1;
# `do_reduce_labels=True` in the processor bridges the two.
id2label = {0: 'building'}
label2id = {'building': 0}

# Load pre-trained weights. Both mappings are passed so that the saved config
# does not keep the checkpoint's original label2id next to the new id2label.
model = MaskFormerForInstanceSegmentation.from_pretrained(
    "facebook/maskformer-swin-base-coco",
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

# Build the processor. Resizing, rescaling and normalisation are switched off
# here; the albumentations pipeline defined later performs those steps.
processor = MaskFormerImageProcessor(
    do_reduce_labels=True,
    size=(512, 512),
    ignore_index=255,
    do_resize=False,
    do_rescale=False,
    do_normalize=False,
)

In [None]:
# Load Model from HUB
id2label = {0: 'building'}
label2id = {'building': 0}

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Grab the trained model and processor from the hub. Both label mappings are
# passed so the config stays consistent in both directions.
model = MaskFormerForInstanceSegmentation.from_pretrained(
    "tomascanivari/maskformer-swin-base-building-instance",
    id2label=id2label,
    label2id=label2id,
).to(device)

processor = MaskFormerImageProcessor.from_pretrained(
    "tomascanivari/maskformer-swin-base-building-instance")

# Resizing, rescaling and normalisation are disabled on the processor;
# the albumentations pipeline defined later performs those steps.
processor.do_resize = False
processor.do_rescale = False
processor.do_normalize = False

In [None]:
# Re-import the PyTorch Dataset: the name `Dataset` was rebound by
# `from datasets import ... Dataset ...` in the import cell, and
# ImageSegmentationDataset below must subclass the PyTorch one.
from torch.utils.data import Dataset

# Per-channel mean / std used for normalisation, given on the 0-255 scale and
# divided by 255 to express them on the 0-1 scale.
ADE_MEAN = np.array([123.675, 116.280, 103.530]) / 255
ADE_STD = np.array([58.395, 57.120, 57.375]) / 255
# Build the augmentation pipeline: resize to 512x512, random horizontal flip
# (30% of the time), normalise, convert to float.
# NOTE(review): Normalize presumably already returns float data, which would
# make the trailing ToFloat a no-op — confirm for the installed version.
train_val_transform = A.Compose([
    A.Resize(width=512, height=512),
    A.HorizontalFlip(p=0.3),
    A.Normalize(mean=ADE_MEAN, std=ADE_STD),
    A.ToFloat()
])

class ImageSegmentationDataset(Dataset):
    """PyTorch dataset that turns (image, annotation) rows into model inputs.

    Each row of `dataset` must provide:
        - "image": an object with `.convert("RGB")` (a PIL image)
        - "annotation": an H x W x C array-like where channel 0 is the class-id
          map and channel 1 is the instance-id map

    Args:
        dataset: indexable collection of rows as described above.
        processor: callable image processor; called as
            `processor([image], [instance_map], instance_id_to_semantic_id=..., return_tensors="pt")`.
        transform: optional callable `transform(image=..., mask=...)` returning
            a dict with "image" and "mask" (albumentations style).
    """

    def __init__(self, dataset, processor, transform=None):
        self.dataset = dataset
        self.processor = processor
        self.transform = transform

    def __len__(self):
        """Return the number of rows."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the processor output for row `idx` without the batch dimension."""
        # Index the dataset once: every access decodes the whole row again.
        sample = self.dataset[idx]
        image = np.array(sample["image"].convert("RGB"))

        # Pixel-wise instance id and category id maps of shape (height, width)
        annotation = np.array(sample["annotation"])
        instance_seg = annotation[..., 1]
        class_id_map = annotation[..., 0]
        class_labels = np.unique(class_id_map)

        # Map every instance id to the class id found under it. Plain ints are
        # used so later arithmetic on the ids cannot wrap around as uint8.
        inst2class = {}
        for label in class_labels:
            instance_ids = np.unique(instance_seg[class_id_map == label])
            inst2class.update({int(i): int(label) for i in instance_ids})

        # Apply transforms
        if self.transform is not None:
            transformed = self.transform(image=image, mask=instance_seg)
            image, instance_seg = transformed["image"], transformed["mask"]

            # Convert from channels last to channels first
            image = image.transpose(2, 0, 1)

        if class_labels.shape[0] == 1 and class_labels[0] == 0:
            # Image without any object: it is still returned, with empty
            # targets (no class label and no mask) so that both agree.
            inputs = self.processor([image], return_tensors="pt")
            # squeeze(0) removes only the batch axis, never a real size-1 axis.
            inputs = {k: v.squeeze(0) for k, v in inputs.items()}
            height, width = inputs["pixel_values"].shape[-2:]
            inputs["class_labels"] = torch.zeros(0, dtype=torch.int64)
            inputs["mask_labels"] = torch.zeros((0, height, width))
        else:
            # Process the image together with its instance map
            inputs = self.processor(
                [image],
                [instance_seg],
                instance_id_to_semantic_id=inst2class,
                return_tensors="pt"
            )
            # Tensors lose their batch axis; list-valued entries (one element
            # per image) are unwrapped.
            inputs = {
                k: v.squeeze(0) if isinstance(v, torch.Tensor) else v[0] for k, v in inputs.items()
            }
        return inputs

# Validation has to be deterministic, so it gets the same resize and
# normalisation steps as training but without the random horizontal flip.
val_transform = A.Compose([
    A.Resize(width=512, height=512),
    A.Normalize(mean=ADE_MEAN, std=ADE_STD),
    A.ToFloat()
])

# Build the train and validation instance segmentation datasets
train_dataset = ImageSegmentationDataset(
    dataset["train"],
    processor=processor,
    transform=train_val_transform
)
val_dataset = ImageSegmentationDataset(
    dataset["val"],
    processor=processor,
    transform=val_transform
)

In [None]:
# Print the shape of every entry produced for the first two training samples
for sample_idx in (0, 1):
    header = f"Train Instance {sample_idx}"
    # A blank line separates the second report from the first one.
    print(header if sample_idx == 0 else "\n" + header)
    inputs = train_dataset[sample_idx]
    for k, v in inputs.items():
        print(k, v.shape)

In [None]:
def collate_fn(batch):
    """Merge a list of per-sample dicts into one batch dict.

    "pixel_values" and "pixel_mask" are stacked into single tensors, while
    "class_labels" and "mask_labels" stay lists with one entry per sample
    (their first dimension differs from sample to sample).
    """
    def column(key):
        # Collect the value stored under `key` in every sample, in batch order.
        return [example[key] for example in batch]

    return {
        "pixel_values": torch.stack(column("pixel_values")),
        "pixel_mask": torch.stack(column("pixel_mask")),
        "class_labels": column("class_labels"),
        "mask_labels": column("mask_labels"),
    }

# One sample per batch for both loaders; only the training loader shuffles.
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)

In [None]:
# Fetch one training batch and report each entry: the shape for stacked
# tensors, the number of elements for per-sample lists.
batch = next(iter(train_dataloader))
for k, v in batch.items():
    print(k, v.shape if isinstance(v, torch.Tensor) else len(v))

In [None]:
# Sanity check: one forward pass with labels must produce a loss.
# The batch is moved to wherever the model currently lives, because the model
# may already have been moved to the GPU while the DataLoader yields CPU tensors.
model_device = model.device
outputs = model(
    pixel_values=batch["pixel_values"].to(model_device),
    mask_labels=[labels.to(model_device) for labels in batch["mask_labels"]],
    class_labels=[labels.to(model_device) for labels in batch["class_labels"]],
)
outputs.loss

In [None]:
# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Initialize Adam optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)

# Epochs start_epoch .. num_epochs - 1 are run. A non-zero start_epoch only
# shifts the epoch counter (e.g. when continuing from saved weights); the
# optimizer above is always created fresh. Set it to 0 for a run from scratch.
start_epoch = 8
num_epochs = 100


def _batch_loss(batch):
    """Run one labelled forward pass on `device` and return the loss tensor."""
    outputs = model(
        pixel_values=batch["pixel_values"].to(device),
        mask_labels=[labels.to(device) for labels in batch["mask_labels"]],
        class_labels=[labels.to(device) for labels in batch["class_labels"]],
    )
    return outputs.loss


def _mean(values):
    """Average of `values`; NaN for an empty list instead of a ZeroDivisionError."""
    return sum(values) / len(values) if values else float("nan")


for epoch in range(start_epoch, num_epochs):
    print(f"Epoch {epoch} | Training")

    # Training pass
    model.train()
    train_losses = []
    for batch in tqdm(train_dataloader):
        # Reset the parameter gradients
        optimizer.zero_grad()

        loss = _batch_loss(batch)
        train_losses.append(loss.item())

        # Backward propagation and optimization step
        loss.backward()
        optimizer.step()

    # Average train epoch loss
    train_loss = _mean(train_losses)

    # Validation pass: no gradients are needed for the whole loop
    model.eval()
    print(f"Epoch {epoch} | Validation")
    val_losses = []
    with torch.no_grad():
        for batch in tqdm(val_dataloader):
            val_losses.append(_batch_loss(batch).item())

    # Average validation epoch loss
    val_loss = _mean(val_losses)

    # Print epoch losses
    print(f"Epoch {epoch} | train_loss: {train_loss} | validation_loss: {val_loss}")

    # Overwrite the local checkpoint after every epoch
    model.save_pretrained("models/mf")
    processor.save_pretrained("models/mf_p")

In [None]:
# Write the current model and processor to their local checkpoint directories
for artifact, target_dir in ((model, "models/mf"), (processor, "models/mf_p")):
    artifact.save_pretrained(target_dir)

In [None]:
import os

from transformers import MaskFormerForInstanceSegmentation, MaskFormerImageProcessor
from huggingface_hub import login

# SECURITY: a personal access token used to be hard-coded in this cell. It must
# be treated as leaked and revoked. The token is now taken from the HF_TOKEN
# environment variable; when it is not set, `login` prompts for it.
login(token=os.environ.get("HF_TOKEN"))

# We won't be using albumentations to preprocess images for inference, so the
# processor has to resize, rescale and normalise on its own.
# NOTE: this changes the shared `processor` object in place. Training cells
# rely on these flags being False, so reset them before training again.
processor.do_normalize = True
processor.do_resize = True
processor.do_rescale = True

# Push your model and preprocessor to the Hub
model.push_to_hub("maskformer-swin-base-building-instance")
processor.push_to_hub("maskformer-swin-base-building-instance")

# Inference

In [None]:
import torch
import random
import evaluate

import matplotlib.pyplot as plt
import numpy as np

from PIL import Image
from tqdm import tqdm
from datasets import load_dataset
from transformers import (
    MaskFormerForInstanceSegmentation,
    MaskFormerImageProcessor
)


In [None]:
# Run on the GPU when one is present, otherwise on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# The fine-tuned model and its processor live in the same Hub repository
checkpoint = "tomascanivari/maskformer-swin-base-building-instance"
processor = MaskFormerImageProcessor.from_pretrained(checkpoint)
model = MaskFormerForInstanceSegmentation.from_pretrained(checkpoint).to(device)


In [None]:
# Hub repository holding the converted dataset
DATASET_HF_DIR = "tomascanivari/building_extraction"

# Download (or reuse the cached copy of) every split
dataset = load_dataset(DATASET_HF_DIR)

# The annotation of the test split is only a place-holder
print(dataset)

In [None]:
# Let's check the first train image and annotation
example = dataset["train"][0]
img = example["image"]
ann = example["annotation"]

# Convert the PIL images to arrays
image = np.array(img.convert("RGB"))
annotation = np.array(ann)

# These print the distinct ids found in each channel, not a count.
print("Category IDs present: ", np.unique(annotation[..., 0]))  # Red channel: category IDs
print("Instance IDs present: ", np.unique(annotation[..., 1]))  # Green channel: instance IDs

# Plot the original image next to the two annotation channels
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
panels = [
    ("Original", image),
    ("Class Map (R)", annotation[..., 0]),
    ("Instance Map (G)", annotation[..., 1]),
]
for ax, (title, plot_image) in zip(axes, panels):
    ax.imshow(plot_image)
    ax.set_title(title)
    ax.axis("off")

# Let's check the instance with id 1
print("Instance 1")
mask = (annotation[..., 1] == 1)
visual_mask = (mask * 255).astype(np.uint8)
Image.fromarray(visual_mask)

In [None]:
# Visualize RLE obtained from instance segmentation annotation
from pycocotools import mask as mask_utils

def instance_mask_to_rle(instance_mask, background_id=255):
    """
    Encode every instance of a 2D instance-id map as a COCO run-length encoding.

    Args:
        instance_mask: H x W array in which every distinct value is one
            instance id.
        background_id: value that marks pixels belonging to no instance; it is
            not encoded. Defaults to 255.

    Returns:
        List of RLE dicts, one per instance id in ascending id order. The
        `counts` field is decoded to `str` so the result is JSON-serialisable.

    Raises:
        ValueError: if `instance_mask` is not two-dimensional.
    """
    instance_mask = np.asarray(instance_mask)
    if instance_mask.ndim != 2:
        raise ValueError("Instance mask must be a 2D array")

    instance_ids = np.unique(instance_mask)
    instance_ids = instance_ids[instance_ids != background_id]  # exclude background

    rles = []
    for inst_id in instance_ids:
        # pycocotools expects a Fortran-ordered uint8 mask.
        binary = np.asfortranarray((instance_mask == inst_id).astype(np.uint8))
        rle = mask_utils.encode(binary)
        rle["counts"] = rle["counts"].decode("utf-8")  # bytes -> str for JSON compatibility
        rles.append(rle)
    return rles

def visualize_rle_on_image(image, rle_list, alpha=0.5):
    """
    Show `image` with every RLE mask blended in as a blue tint.

    Args:
        image: H x W x 3 NumPy array (original image); it is not modified
        rle_list: list of RLEs (as accepted by pycocotools' decode)
        alpha: weight of the tint inside a mask (0 = invisible, 1 = solid)
    """
    blended = image.copy()
    tint = np.array([0, 0, 255], dtype=np.uint8)

    for rle in rle_list:
        inside = mask_utils.decode(rle) == 1  # H x W boolean selection
        blended[inside] = (1 - alpha) * blended[inside] + alpha * tint

    plt.figure(figsize=(10, 10))
    plt.imshow(blended)
    plt.axis('off')
    plt.show()

# Validation sample used for the demo
idx = 1

val_sample_image = dataset["val"][idx]["image"]
image = np.array(val_sample_image.convert("RGB"))

val_sample_annotation = dataset["val"][idx]["annotation"]
annotation = np.array(val_sample_annotation)
annotation -= 1  # Reduce labels
annotation[annotation == -1] = 255  # ignore_index

# Encode the instance channel and compare the RLE count with the id count
instance_channel = annotation[..., 1]
rles = instance_mask_to_rle(instance_channel)

print(len(rles), len(np.unique(instance_channel))-1)

visualize_rle_on_image(image, rles)

# Plot the original image followed by the two annotation channels
plt.figure(figsize=(15, 5))
channel_titles = ["Class Map (R)", "Instance Map (G)"]
for plot_index in range(3):
    if plot_index > 0:
        # Panels 1 and 2 show annotation channels 0 and 1
        plot_image = annotation[..., plot_index - 1]
        title = channel_titles[plot_index - 1]
    else:
        # Panel 0 shows the image itself
        plot_image = image
        title = "Original"
    plt.subplot(1, 3, plot_index + 1)
    plt.imshow(plot_image)
    plt.title(title)
    plt.axis("off")


In [None]:
# Let's check the prediction on the val image selected by `idx`

image = dataset["val"][idx]["image"].convert("RGB")
# PIL reports (width, height); post-processing expects (height, width).
target_size = image.size[::-1]

# Preprocess image
inputs = processor(images=image, return_tensors="pt").to(device)

# Inference
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Let's print the items returned by our model and their shapes
print("Outputs...")
for key, value in outputs.items():
    print(f"  {key}: {value.shape}")

# Post-process results to retrieve instance segmentation maps
result = processor.post_process_instance_segmentation(
    outputs,
    threshold=0.5,
    target_sizes=[target_size],
)[0] # we pass a single output therefore we take the first result (single)

# Pixels marked -1 are re-labelled 255, the value instance_mask_to_rle skips.
instance_seg_mask = result["segmentation"].cpu().detach().numpy()
instance_seg_mask[instance_seg_mask == -1] = 255

rles = instance_mask_to_rle(instance_seg_mask)

# Both numbers must agree: encoded RLEs vs. distinct non-background ids.
# (The previous `len(np.unique(mask) - 1)` subtracted 1 from the ids, not from the count.)
num_instances = int(np.count_nonzero(np.unique(instance_seg_mask) != 255))
print(len(rles), num_instances)

visualize_rle_on_image(np.array(image), rles)


print(f"Final mask shape: {instance_seg_mask.shape}")
print("Segments Information...")
for info in result["segments_info"]:
    print(f"  {info}")

print(np.unique(instance_seg_mask))

In [None]:
# Obtain GT and PRED of VAL

gt_rles = {}
pred_rles = {}

model.eval()
for idx in tqdm(range(len(dataset["val"]))):
    # Index the dataset once: every access decodes the whole row again.
    sample = dataset["val"][idx]
    image = np.array(sample["image"].convert("RGB"))
    # (height, width) of THIS image; a value left over from an earlier cell
    # would be wrong whenever image sizes differ.
    target_size = image.shape[:2]

    # Reduce labels explicitly instead of relying on uint8 wrap-around:
    # background 0 -> 255 (ignored), instance k -> k - 1.
    instance_ids = np.array(sample["annotation"])[..., 1].astype(np.int32)
    gt_instances = np.where(instance_ids == 0, 255, instance_ids - 1)

    gt_rles[idx] = instance_mask_to_rle(gt_instances)

    # Preprocess image
    inputs = processor(images=image, return_tensors="pt").to(device)

    # Inference
    with torch.no_grad():
        outputs = model(**inputs)

    # Post-process results to retrieve instance segmentation maps
    result = processor.post_process_instance_segmentation(
        outputs,
        threshold=0.7,
        target_sizes=[target_size],
    )[0] # we pass a single output therefore we take the first result (single)

    # Pixels marked -1 are re-labelled 255, the value instance_mask_to_rle skips.
    instance_seg_mask = result["segmentation"].cpu().detach().numpy()
    instance_seg_mask[instance_seg_mask == -1] = 255

    pred_rles[idx] = instance_mask_to_rle(instance_seg_mask)



In [None]:
from scipy.optimize import linear_sum_assignment

# --- Compute F1 ---
def compute_f1(gt_rles, pred_rles, iou_thresh=0.5):
    """Instance-level precision / recall / F1 accumulated over all images.

    Args:
        gt_rles: dict image id -> list of ground-truth RLEs.
        pred_rles: dict image id -> list of predicted RLEs.
        iou_thresh: minimum IoU for a matched pair to count as a true positive.

    Returns:
        Dict with the keys "TP", "FP", "FN", "precision", "recall" and "f1".
        A ratio with an empty denominator is reported as 0.0.
    """
    tp = fp = fn = 0

    for image_key in gt_rles.keys() | pred_rles.keys():
        truths = gt_rles.get(image_key, [])
        preds = pred_rles.get(image_key, [])

        # With one side empty nothing can match: all predictions are false
        # positives and all ground truths are false negatives.
        if len(truths) == 0 or len(preds) == 0:
            fp += len(preds)
            fn += len(truths)
            continue

        # IoU of every prediction (rows) against every ground truth (columns)
        ious = mask_utils.iou(preds, truths, [0] * len(truths))

        # One-to-one assignment with maximal total IoU (cost = negated IoU)
        rows, cols = linear_sum_assignment(-ious)

        # Every row and column appears at most once, so counting accepted
        # pairs gives both the matched predictions and the matched truths.
        hits = sum(1 for r, c in zip(rows, cols) if ious[r, c] >= iou_thresh)

        tp += hits
        fp += len(preds) - hits
        fn += len(truths) - hits

    n_predicted = tp + fp
    n_actual = tp + fn
    precision = tp / n_predicted if n_predicted > 0 else 0.0
    recall = tp / n_actual if n_actual > 0 else 0.0
    pr_sum = precision + recall
    f1 = 2 * precision * recall / pr_sum if pr_sum > 0 else 0.0

    return {
        "TP": tp,
        "FP": fp,
        "FN": fn,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

# Show the raw ground-truth and predicted RLEs of image 1
for rle_by_image in (gt_rles, pred_rles):
    print(rle_by_image[1])

# Score the whole validation split
metrics = compute_f1(gt_rles, pred_rles, iou_thresh=0.5)
print("F1-score evaluation (IoU >= 0.5):")
print(metrics)

In [None]:
import cv2
import pandas as pd
# --- Paths ---
# File name of the submission CSV written at the end of this cell.
output_csv = "submission.csv"

# --- Helper: binary mask to polygon coordinates ---
def mask_to_coords(mask):
    """Trace the outer contours of a HxW binary mask (numpy array).

    Returns a list with one entry per contour of at least 3 points; each entry
    is a list of (x, y) integer tuples.
    """
    binary = mask.astype(np.uint8)
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # Each contour point is stored as [[x, y]]; fewer than 3 points is no polygon.
    return [
        [(int(x), int(y)) for [[x, y]] in contour]
        for contour in contours
        if len(contour) >= 3
    ]

def extract_instance_masks(segmentation_mask, exclude_background=True):
    """
    Split a labelled segmentation map into one boolean mask per id.

    Args:
        segmentation_mask (np.ndarray): 2D array (H, W) in which every distinct
            value is treated as one instance id.
        exclude_background (bool): If True, the id 255 is treated as
            background and produces no mask.

    Returns:
        list of np.ndarray: boolean (H, W) masks, one per id in ascending
            id order.

    Raises:
        ValueError: if `segmentation_mask` is not two-dimensional.
    """
    if segmentation_mask.ndim != 2:
        raise ValueError("Segmentation mask must be a 2D array")

    return [
        segmentation_mask == inst_id
        for inst_id in np.unique(segmentation_mask)
        if not (exclude_background and inst_id == 255)
    ]

# --- Run inference & prepare CSV ---
rows = []
model.eval()
for idx in tqdm(range(len(dataset["test"]))):
    image = np.array(dataset["test"][idx]["image"].convert("RGB"))
    # (height, width) of THIS image; a value left over from an earlier cell
    # would be wrong whenever image sizes differ.
    target_size = image.shape[:2]

    # Preprocess image
    inputs = processor(images=image, return_tensors="pt").to(device)

    # Inference
    with torch.no_grad():
        outputs = model(**inputs)

    # Post-process results to retrieve instance segmentation maps
    result = processor.post_process_instance_segmentation(
        outputs,
        threshold=0.7,
        target_sizes=[target_size],
    )[0] # we pass a single output therefore we take the first result (single)

    # Pixels marked -1 are re-labelled 255, the id extract_instance_masks skips.
    instance_seg_mask = result["segmentation"].cpu().detach().numpy()
    instance_seg_mask[instance_seg_mask == -1] = 255

    # Collect the polygons of every instance; stays [] when nothing is detected.
    all_coords = []
    for mask in extract_instance_masks(instance_seg_mask):
        all_coords.extend(mask_to_coords(mask))

    # The image id is its position in the test split.
    rows.append({"ImageID": idx, "Coordinates": str(all_coords)})

# --- Save CSV ---
# Explicit columns keep the header even when the test split is empty.
df = pd.DataFrame(rows, columns=["ImageID", "Coordinates"])
df.to_csv(output_csv, index=False)
print(f"Submission CSV saved as {output_csv}")

In [None]:
# Display the test image at index 11 as RGB (the cell's last expression is rendered).
dataset["test"][11]["image"].convert("RGB")

In [None]:
# Load Mean IoU metric
metrics = evaluate.load("mean_iou")

model.eval()

for idx in tqdm(range(len(dataset["val"]))):
    data = dataset["val"][idx]
    image = data["image"].convert("RGB")
    # PIL reports (width, height); post-processing expects (height, width).
    target_size = image.size[::-1]

    # Semantic references come from the class channel (index 0). The instance
    # channel (index 1) holds per-object ids, which are not class labels and
    # can exceed num_labels.
    class_map = np.array(data["annotation"])[:, :, 0].astype(np.int32)
    # Reduce labels explicitly: background 0 -> 255 (ignore_index), class k -> k - 1.
    annotation = np.where(class_map == 0, 255, class_map - 1).astype(np.uint8)
    # NOTE(review): background pixels are ignored by the metric, so only pixels
    # annotated as building are scored — confirm this is the intended protocol.

    inputs = processor(images=image, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    result = processor.post_process_semantic_segmentation(
        outputs, target_sizes=[target_size]
    )[0]
    semantic_seg_mask = result.cpu().numpy().astype(np.uint8)

    # Update metric incrementally
    metrics.add_batch(
        predictions=[semantic_seg_mask],
        references=[annotation]
    )

    # Free up GPU memory
    del outputs, result
    torch.cuda.empty_cache()

# Compute final result
results = metrics.compute(num_labels=2, ignore_index=255)
print(f"Mean IoU: {results['mean_iou']} | Mean Accuracy: {results['mean_accuracy']} | Overall Accuracy: {results['overall_accuracy']}")

