Define Paths and Load Dataset

In [1]:
import os
import cv2
from pathlib import Path
import matplotlib.pyplot as plt
from detectron2.utils.visualizer import Visualizer, ColorMode
from detectron2.data import MetadataCatalog, DatasetCatalog

def register_dataset_splits(dataset_name="building", dataset_classes=["building"]):
    
    def load_coco_json(json_file, image_root):
        """Load dataset in COCO format for Detectron2 visualization."""
        from detectron2.data.datasets import load_coco_json
        dataset_dicts = load_coco_json(json_file, image_root)
        return dataset_dicts

    # --- PATHS ---
    SPLITS = ["train", "val"]

    for split in SPLITS:
        DATA_DIR = Path("building-extraction-generalization-2024") / split
        ANN_PATH = DATA_DIR / f"{split}.json"

        # Register dataset (if not already registered)
        dataset_split_name = f"{dataset_name}_{split}"
        if dataset_split_name not in DatasetCatalog.list():
            DatasetCatalog.register(
                dataset_split_name,
                lambda ann_path=ANN_PATH, data_dir=DATA_DIR: load_coco_json(ann_path, data_dir)
            )
            MetadataCatalog.get(dataset_split_name).set(thing_classes=dataset_classes)
            print(DATA_DIR, ANN_PATH, dataset_split_name)


register_dataset_splits()

building-extraction-generalization-2024/train building-extraction-generalization-2024/train/train.json building_train
building-extraction-generalization-2024/val building-extraction-generalization-2024/val/val.json building_val


Visualize Train Images

In [None]:
def visualize_polygons(img, dataset_dict, metadata, outputs=None):
    """
    Visualize polygons from a dataset dict.    
    Args:
        img: BGR image as np.array
        dataset_dict: one dataset dict from DatasetCatalog
        metadata: MetadataCatalog entry for the dataset
    Returns:
        img_vis: RGB image with drawn polygons
    """    

    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    v = Visualizer(
        img,  # RGB
        metadata=metadata,
        scale=2.0,
        instance_mode=ColorMode.IMAGE
    )
    
    if outputs:
        out = v.draw_instance_predictions(outputs["instances"].to("cpu"))
    else:
        out = v.draw_dataset_dict(dataset_dict)
    
    img_vis = out.get_image() 

    plt.figure(figsize=(10, 10))
    plt.imshow(img_vis)
    plt.title(f"Image Name: {os.path.basename(img_path)} - ({len(dataset_dict['annotations'])} Annotations)")
    plt.axis("off")
    plt.show()

    return img_vis


# Load training dataset
dataset_dicts = DatasetCatalog.get("building_train")
metadata = MetadataCatalog.get("building_train")
# Visualize first three image
for d in dataset_dicts[0:3]:
    img_path = d["file_name"]
    img = cv2.imread(img_path)
    print(img_path)
    img_vis = visualize_polygons(img, d, metadata)



Train Dataset

In [None]:
from detectron2.engine import DefaultTrainer
from detectron2.config import get_cfg
from detectron2.projects.deeplab import add_deeplab_config
from mask2former import add_maskformer2_config
import os

import torch

# Clear cache
torch.cuda.empty_cache()


cfg = get_cfg()
add_deeplab_config(cfg)
add_maskformer2_config(cfg)
cfg.merge_from_file("Mask2Former/configs/coco/instance-segmentation/swin/maskformer2_swin_small_bs16_50ep.yaml")

cfg.DATASETS.TRAIN = ("building_train",)
cfg.DATASETS.TEST = ("building_val",)

cfg.MODEL.WEIGHTS = "https://dl.fbaipublicfiles.com/maskformer/mask2former/coco/instance/maskformer2_swin_small_bs16_50ep/model_final_1e7f22.pkl"
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 1
cfg.OUTPUT_DIR = "./train/mask2former_building"
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)



cfg.MODEL.MASK_ON = True
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 1  # single class
cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 256 

# --- Solver ---
cfg.SOLVER.IMS_PER_BATCH = 1    # Batch Size
cfg.SOLVER.BASE_LR = 0.00025    # Learning Rate
cfg.SOLVER.MAX_ITER = 2000      # Number of Epochs
cfg.SOLVER.STEPS = (1500,)      # Reduce LR at Epoch=1500
cfg.SOLVER.GAMMA = 0.1

cfg.SOLVER.CLIP_GRADIENTS.ENABLED = True
cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE = "norm"  # or "value"
cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE = 1.0   # typical default

# --- Input Augmentation ---
cfg.INPUT.MIN_SIZE_TRAIN = (640, 672, 704, 736) # Apply different resizes
cfg.INPUT.MAX_SIZE_TRAIN = 1333                 
cfg.INPUT.MIN_SIZE_TEST = 800                   # On validation always same resize

# --- Output ---
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.7 
cfg.OUTPUT_DIR = "./train/augmented_m2f"
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)


trainer = DefaultTrainer(cfg)
trainer.resume_or_load(resume=False)
trainer.train()

[32m[10/19 18:29:39 d2.engine.defaults]: [0mModel:
MaskFormer(
  (backbone): D2SwinTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 96, kernel_size=(4, 4), stride=(4, 4))
      (norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0): BasicLayer(
        (blocks): ModuleList(
          (0): SwinTransformerBlock(
            (norm1): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
            (attn): WindowAttention(
              (qkv): Linear(in_features=96, out_features=288, bias=True)
              (attn_drop): Dropout(p=0.0, inplace=False)
              (proj): Linear(in_features=96, out_features=96, bias=True)
              (proj_drop): Dropout(p=0.0, inplace=False)
              (softmax): Softmax(dim=-1)
            )
            (drop_path): Identity()
            (norm2): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
            (mlp): Mlp(
             

  grad_scaler = GradScaler()
Weight format of MultiScaleMaskedTransformerDecoder have changed! Please upgrade your models. Applying automatic conversion now ...


[32m[10/19 18:29:41 d2.engine.train_loop]: [0mStarting training from iteration 0
[4m[5m[31mERROR[0m [32m[10/19 18:29:42 d2.engine.train_loop]: [0mException during training:
Traceback (most recent call last):
  File "/home/tomas/Downloads/Siemens/building_env/lib/python3.10/site-packages/detectron2/engine/train_loop.py", line 155, in train
    self.run_step()
  File "/home/tomas/Downloads/Siemens/building_env/lib/python3.10/site-packages/detectron2/engine/defaults.py", line 530, in run_step
    self._trainer.run_step()
  File "/home/tomas/Downloads/Siemens/building_env/lib/python3.10/site-packages/detectron2/engine/train_loop.py", line 494, in run_step
    loss_dict = self.model(data)
  File "/home/tomas/Downloads/Siemens/building_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/tomas/Downloads/Siemens/building_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line

  with autocast(dtype=self.precision):


In [None]:
# Some basic setup:
# Setup detectron2 logger
from detectron2.utils.logger import setup_logger
setup_logger()

from detectron2.config import get_cfg
from detectron2 import model_zoo
from detectron2.engine import DefaultTrainer
from detectron2.data import build_detection_train_loader
from detectron2.data import transforms as T
from detectron2.evaluation import COCOEvaluator
import os


import torch

# Clear cache
torch.cuda.empty_cache()

class AugmentedTrainer(DefaultTrainer):
    @classmethod
    def build_train_loader(cls, cfg):
        # Define the augmentations
        augmentation_list = [
            T.RandomFlip(prob=0.5, horizontal=True, vertical=False),
            T.RandomBrightness(0.8, 1.2),
            T.RandomContrast(0.8, 1.2),
            T.RandomSaturation(0.8, 1.2),
        ]
        
        # Use a custom DatasetMapper with these augmentations
        from detectron2.data import DatasetMapper
        mapper = DatasetMapper(cfg, is_train=True, augmentations=augmentation_list)
        
        # Build the train loader
        return build_detection_train_loader(cfg, mapper=mapper)

    @classmethod
    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
        if output_folder is None:
            output_folder = os.path.join(cfg.OUTPUT_DIR, "coco_eval")
        return COCOEvaluator(dataset_name, output_dir=output_folder)

cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file(
    "COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x.yaml"
))

# --- Dataset ---
cfg.DATASETS.TRAIN = ("building_train",)
cfg.DATASETS.TEST = ("building_val",)

# --- DataLoader ---
cfg.DATALOADER.NUM_WORKERS = 2

# --- Model ---
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(
    "COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x.yaml"
)
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 1  # single class
cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512 

# --- Solver ---
cfg.SOLVER.IMS_PER_BATCH = 2    # Batch Size
cfg.SOLVER.BASE_LR = 0.00025    # Learning Rate
cfg.SOLVER.MAX_ITER = 2000      # Number of Epochs
cfg.SOLVER.STEPS = (1500,)      # Reduce LR at Epoch=1500
cfg.SOLVER.GAMMA = 0.1

# --- Input Augmentation ---
cfg.INPUT.MIN_SIZE_TRAIN = (640, 672, 704, 736) # Apply different resizes
cfg.INPUT.MAX_SIZE_TRAIN = 1333                 
cfg.INPUT.MIN_SIZE_TEST = 800                   # On validation always same resize

# --- Output ---
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.7 
cfg.OUTPUT_DIR = "./train/augmented_X101_512"
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)

In [None]:
trainer = DefaultTrainer(cfg)
trainer.resume_or_load(resume=False)
trainer.train()

Inference

In [None]:
import random
from detectron2.utils.visualizer import ColorMode
from detectron2.engine import DefaultPredictor

# Inference should use the config with parameters that are used in training
# cfg now already contains everything we've set previously. We changed it a little bit for inference:
cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")  # path to the model we just trained
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.7   # set a custom testing threshold
predictor = DefaultPredictor(cfg)

# Load validation
dataset_dicts = DatasetCatalog.get("building_val")
metadata = MetadataCatalog.get("building_val")

for d in random.sample(dataset_dicts, 3):    
    im = cv2.imread(d["file_name"])
    outputs = predictor(im)  # format is documented at https://detectron2.readthedocs.io/tutorials/models.html#model-output-format
    visualize_polygons(im, d, metadata, outputs)

In [None]:
from detectron2.evaluation import COCOEvaluator, inference_on_dataset
from detectron2.data import build_detection_test_loader

def compute_max_f1(metrics, task="segm"):
    """
    Compute max F1 from COCO precision-recall curve.
    task: "segm" or "bbox"
    """
    precision = metrics[task]["precision"]  # shape: [IoU, recall, ...]
    recall = metrics[task]["recall"]        # shape: [IoU, recall, ...]
    
    # AP50 corresponds to IoU=0.5, which is index 0 in COCOEvaluator (0.50:0.05:0.95)
    iou_idx = 0
    # Take precision and recall arrays for IoU=0.5
    prec = precision[iou_idx].flatten()
    rec = recall[iou_idx].flatten()
    
    f1_scores = 2 * (prec * rec) / (prec + rec + 1e-8)
    return f1_scores.max()

from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

# Paths to your files
gt_json = f"{cfg.OUTPUT_DIR}/coco_eval/building_val_coco_format.json"      # ground truth
pred_json = f"{cfg.OUTPUT_DIR}/coco_eval/coco_instances_results.json"     # model predictions

# Load COCO objects
coco_gt = COCO(gt_json)
coco_dt = coco_gt.loadRes(pred_json)  # this works because pred_json is in COCO format

# Run COCO evaluation for segmentation ('segm') or bbox ('bbox')
coco_eval = COCOeval(coco_gt, coco_dt, iouType='segm')  # or 'bbox'
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()

# val_f1 = compute_max_f1(coco_eval)

# print("Validation metrics:")
# print("AP50 (segm):", val_metrics["segm"]["AP50"])
# print("Max F1 (segm):", val_f1)

# # Evaluate on training set
# train_evaluator = COCOEvaluator("building_train", output_dir="./output")
# train_loader = build_detection_test_loader(cfg, "building_train")
# train_metrics = inference_on_dataset(trainer.model, train_loader, train_evaluator)
# train_f1 = compute_max_f1(train_metrics)

# print("\nTraining metrics:")
# print("AP50 (segm):", train_metrics["segm"]["AP50"])
# print("Max F1 (segm):", train_f1)

In [3]:
# compute_f1_from_coco.py
import json
import numpy as np
from pycocotools import mask as mask_utils
from shapely.geometry import Polygon
from scipy.optimize import linear_sum_assignment

gt_json = f"./train/augmented_X101_512/coco_eval/building_val_coco_format.json"      # ground truth
pred_json = f"./train/augmented_X101_512/coco_eval/coco_instances_results.json"     # model predictions

with open(gt_json, "r") as f:
    gt_coco = json.load(f)

with open(pred_json, "r") as f:
    pred_coco = json.load(f)

# --- Build image_id -> (height, width) mapping ---
img_sizes = {img["id"]: (img["height"], img["width"]) for img in gt_coco["images"]}

# --- Convert COCO JSON to {image_id: [RLEs]} format ---
def coco_to_rle(coco_json, img_sizes):
    rle_dict = {}
    for ann in coco_json["annotations"]:
        image_id = ann["image_id"]
        seg = ann["segmentation"]
        height, width = img_sizes[image_id]

        # Convert polygon to RLE if needed
        if isinstance(seg, dict) and "counts" in seg:
            rle = seg  # already RLE
        else:
            # polygon -> RLE
            rle = mask_utils.frPyObjects(seg, height, width)
            if isinstance(rle, list):
                rle = mask_utils.merge(rle)
        rle_dict.setdefault(image_id, []).append(rle)
    return rle_dict

gt_rles = coco_to_rle(gt_coco, img_sizes)

print(gt_rles[1])

pred_rles = {}
for ann in pred_coco:
    image_id = ann["image_id"]
    seg = ann["segmentation"]
    if isinstance(seg, dict) and "counts" in seg:
        rle = seg
    else:
        rle = mask_utils.frPyObjects(seg, ann["height"], ann["width"])
        rle = mask_utils.merge(rle) if isinstance(rle, list) else rle
    pred_rles.setdefault(image_id, []).append(rle)

# --- Compute F1 ---
def compute_f1(gt_rles, pred_rles, iou_thresh=0.5):
    total_TP = total_FP = total_FN = 0
    for img_id in set(list(gt_rles.keys()) + list(pred_rles.keys())):
        gt_list = gt_rles.get(img_id, [])
        pr_list = pred_rles.get(img_id, [])

        if len(gt_list) == 0 and len(pr_list) == 0:
            continue

        if len(gt_list) == 0:
            total_FP += len(pr_list)
            continue
        if len(pr_list) == 0:
            total_FN += len(gt_list)
            continue

        # Build IoU matrix
        ious = mask_utils.iou(pr_list, gt_list, [0]*len(gt_list))  # [pred, gt]

        # Hungarian assignment to maximize IoU
        cost = -ious
        row_ind, col_ind = linear_sum_assignment(cost)

        matched_pred = set()
        matched_gt = set()
        for r, c in zip(row_ind, col_ind):
            if ious[r, c] >= iou_thresh:
                matched_pred.add(r)
                matched_gt.add(c)

        total_TP += len(matched_gt)
        total_FP += len(pr_list) - len(matched_pred)
        total_FN += len(gt_list) - len(matched_gt)

    precision = total_TP / (total_TP + total_FP) if (total_TP + total_FP) > 0 else 0.0
    recall = total_TP / (total_TP + total_FN) if (total_TP + total_FN) > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

    return {
        "TP": total_TP,
        "FP": total_FP,
        "FN": total_FN,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

metrics = compute_f1(gt_rles, pred_rles, iou_thresh=0.5)
print("F1-score evaluation (IoU >= 0.5):")
print(metrics)

[{'size': [512, 512], 'counts': b'o_a31n?1O11O1OQP\\4'}, {'size': [512, 512], 'counts': b'i_d31n?3M2O2M2O0O1O100O1O100O100O1O11O1O1O2N1O2N1O2N1O1O2N1OQ`m3'}, {'size': [512, 512], 'counts': b'V_`41n?2N2N2N3M2O1N2N3M2N2N3N1N2N2N3M2N2O0O00000002N2N2O2O00001O0O1O2M2N2N3M2N2N3N1N2N3M2N2Nc`i2'}, {'size': [512, 512], 'counts': b'o_U71n?1O1O1O1001O1O1OO1O1O1O100O1O1O100O1O1O1O100O1O1O1002N1O1O1O1O1O1O1O1O1O1O1O1OR`6'}, {'size': [512, 512], 'counts': b'P_\\61n?2N2N3N1N2N3N1N3M2N2O2M2N3O000010O0001PASOk>l0SAUOn>n01O2N1O2N1O1O2N1O1O2N1O1O2N1O1O2N1O2M2N2NV`n0'}, {'size': [512, 512], 'counts': b'k^g62n?1N2N2N2N3M2O1N2N3M2N2N2O2M2N2N2N3M2O1N2N1O1O0001O2N3M2O1N2N2N2N2N2N2O1N2N2N2N2N2N2O1N2N2N2N2NSa`0'}, {'size': [512, 512], 'counts': b'Y]Z73l?2N2N2O1N3M2N2N2O1N3M2N2N2O1N3M2N2N2O1N3M2N2N2O1N3M2N2N2O1N3M2N2N2O1O2O0O1O1O101N1N2N2aC'}, {'size': [512, 512], 'counts': b'k_41n?2N3M2O0O1O100O1O1O100O1O1002N1O1O1O2N1O1O1O2N1O1OQ`^7'}, {'size': [512, 512], 'counts': b'oon01o?0O1FOd@2[?0c@1\\?2a@O^?3`@N_?:00O1O

Submission

In [None]:
import pandas as pd

cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")  # path to the model we just trained
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.7
predictor = DefaultPredictor(cfg)

# --- Paths ---
test_folder = Path("building-extraction-generalization-2024/test/image")
output_csv = "submission.csv"

# --- Helper: binary mask to polygon coordinates ---
def mask_to_coords(mask):
    """Convert a HxW binary mask (numpy array) to polygon coordinates."""
    mask = mask.astype(np.uint8)
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    coords_list = []
    for cnt in contours:
        if len(cnt) >= 3:  # valid polygon
            coords_list.append([(int(x), int(y)) for [[x, y]] in cnt])
    return coords_list

# --- Run inference & prepare CSV ---
rows = []
for img_path in sorted(test_folder.iterdir()):
    print(img_path)
    img_id = int(img_path.stem)  # ImageID
    img = cv2.imread(str(img_path))
    
    outputs = predictor(img)["instances"]

    masks = outputs.pred_masks.cpu().numpy() if len(outputs) > 0 else []
    
    all_coords = []
    for mask in masks:
        polys = mask_to_coords(mask)
        all_coords.extend(polys)
    
    # If no detection
    if not all_coords:
        all_coords = []

    rows.append({"ImageID": img_id, "Coordinates": str(all_coords)})

# --- Save CSV ---
df = pd.DataFrame(rows)
df.to_csv(output_csv, index=False)
print(f"Submission CSV saved as {output_csv}")