# Final

In [1]:
# --- imports
import os, re, json, glob, copy
from typing import List, Dict, Any, Tuple

import torch
from PIL import Image, ImageDraw
# from transformers import AutoProcessor, Qwen3VLMoeForConditionalGeneration
from transformers import AutoModelForImageTextToText, AutoProcessor
from qwen_vl_utils import process_vision_info

# --- small utils
def to_file_url(p: str) -> str:
    """Return ``p`` in a form usable as the ``image`` field of a vision message.

    Strings that already start with ``http://``, ``https://`` or ``file://``
    are returned untouched. Anything else is treated as a local path: it is
    made absolute, checked for existence, and returned as a ``file://`` URL
    with backslashes replaced by forward slashes.

    Raises:
        FileNotFoundError: if the local path does not exist.
    """
    already_url = any(
        p.startswith(scheme) for scheme in ("http://", "https://", "file://")
    )
    if already_url:
        return p

    absolute = os.path.abspath(p)
    if os.path.exists(absolute):
        return "file://" + absolute.replace("\\", "/")
    raise FileNotFoundError(f"Missing image: {p}")

def extract_json(text: str) -> Dict[str, Any]:
    """Pull the JSON object out of raw model output and parse it.

    The span from the first ``{`` to the last ``}`` is taken, any
    ``<|...|>`` markers inside it are removed, newlines are replaced by
    spaces, and the result is handed to ``json.loads``.

    Raises:
        ValueError: if the text contains no ``{...}`` span (``json.loads``
            may additionally raise its own decode error).
    """
    found = re.search(r'\{.*\}', text, flags=re.S)
    if found is None:
        raise ValueError("No JSON object found in model output.")

    payload = re.sub(r'<\|.*?\|>', '', found.group(0))
    # Same effect as replacing every newline with a single space.
    payload = " ".join(payload.split("\n"))
    return json.loads(payload)

def natural_key(s: str):
    """Sort key that orders file names by numeric value of their digit runs.

    Only the basename of ``s`` is considered. It is split into alternating
    text / digit chunks; digit chunks become ``int`` and text chunks are
    lower-cased, so ``2.png`` sorts before ``10.png``.
    """
    key = []
    for chunk in re.split(r'(\d+)', os.path.basename(s)):
        key.append(int(chunk) if chunk.isdigit() else chunk.lower())
    return key

# --- draw overlays (solid red outlines; very visible)
def draw_and_save(image_path: str,
                  boxes_xyxy_px: List[Tuple[int,int,int,int]],
                  labels: List[str],
                  out_path: str,
                  outline_width: int = 5):
    """Draw red box outlines with label chips on an image and save the result.

    Args:
        image_path: image to annotate; it is loaded and converted to RGB.
        boxes_xyxy_px: boxes as ``(x1, y1, x2, y2)`` in pixel coordinates.
        labels: one label per box. Missing entries (``None``, an empty list,
            or a list shorter than ``boxes_xyxy_px``) are filled with
            ``box1``, ``box2``, ... so that no box is silently skipped.
        out_path: destination file. Its parent directory is created if needed.
        outline_width: thickness of the rectangle outline in pixels.
    """
    boxes = list(boxes_xyxy_px)
    labels = list(labels or [])
    # zip() would otherwise drop every box past the end of a short label list.
    labels.extend(f"box{i+1}" for i in range(len(labels), len(boxes)))

    # convert() returns a fully loaded image, so the source file can be closed right away.
    with Image.open(image_path) as src:
        img = src.convert("RGB")
    d = ImageDraw.Draw(img)

    pad = 4
    for (x1, y1, x2, y2), lab in zip(boxes, labels):
        d.rectangle([x1, y1, x2, y2], outline=(255,0,0), width=outline_width)

        # Label chip: a filled red rectangle sitting on top of the box.
        try:
            l, t, r, b = d.textbbox((0,0), lab)
            tw, th = r - l, b - t
        except Exception:
            # Best effort: fall back to a rough size estimate when the text
            # cannot be measured (e.g. textbbox is unavailable).
            tw, th = max(8*len(lab), 16), 16
        chip = [x1, max(0, y1 - th - 2*pad), x1 + tw + 2*pad, y1]
        d.rectangle(chip, fill=(255,0,0))
        d.text((chip[0]+pad, chip[1]+pad), lab, fill=(255,255,255))

    # dirname is "" for a bare file name, and os.makedirs("") raises.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    img.save(out_path, quality=95)

# --- convert the model's click_boxes -> pixel xyxy (malformed boxes are skipped)
def boxes_px_from_res_json(res_json: dict, image_path: str):
    """Convert the boxes of a parsed model response to pixel coordinates.

    Each entry of ``res_json["click_boxes"]`` is expected to look like
    ``{"label": str, "box": [x1, y1, x2, y2]}``.

    Coordinates are rescaled to the size of the image at ``image_path``:
      * values are divided by 999 and multiplied by the image width/height
        (the scale this pipeline has always assumed for the model output);
      * if every value of a box is <= 1.5 the box is taken to be in the
        [0, 1] range the prompt asks for and is scaled directly. This is the
        same "looks normalized" heuristic the evaluation code uses.

    Entries whose ``box`` is not a 4-element list/tuple of numbers are
    skipped instead of raising.

    Returns:
        ``(boxes, labels)`` where ``boxes`` is a list of
        ``(x1, y1, x2, y2)`` int tuples and ``labels`` the matching labels
        (``"box"`` when an entry has none).
    """
    with Image.open(image_path) as im:
        W, H = im.size

    out_boxes, out_labels = [], []
    for item in res_json.get("click_boxes") or []:
        if not isinstance(item, dict):
            continue
        box = item.get("box")
        # Validate BEFORE indexing: an empty or short box used to raise IndexError.
        if not (isinstance(box, (list, tuple)) and len(box) == 4):
            continue
        try:
            x1, y1, x2, y2 = (float(v) for v in box)
        except (TypeError, ValueError):
            continue

        # NOTE(review): 999 is kept from the original code; confirm against the
        # model's documented coordinate range.
        scale = 1.0 if max(x1, y1, x2, y2) <= 1.5 else 999.0

        out_boxes.append((
            int(x1 / scale * W),
            int(y1 / scale * H),
            int(x2 / scale * W),
            int(y2 / scale * H),
        ))
        out_labels.append(item.get("label", "box"))

    return out_boxes, out_labels


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_task_meta(task_dir: str):
    """Read the task query and per-image instructions of one task folder.

    Looks for ``annotations.json`` inside ``task_dir``. When it is absent the
    folder name is used as the task query and no instructions are returned.

    Returns:
      task_query: ``task_query`` from the file, else ``task_name``, else the
        folder name
      instr_by_file: dict like {"1.png": "Open the View tab", ...}; images
        without an instruction map to "" and entries without a file name are
        ignored
    """
    fallback_query = os.path.basename(task_dir)
    annot_path = os.path.join(task_dir, "annotations.json")
    if not os.path.exists(annot_path):
        return fallback_query, {}

    with open(annot_path, "r", encoding="utf-8") as fh:
        meta = json.load(fh)

    query = meta.get("task_query") or meta.get("task_name") or fallback_query
    instructions = {
        entry.get("file_name"): entry.get("instruction") or ""
        for entry in meta.get("images", [])
        if entry.get("file_name")
    }
    return query, instructions


In [3]:
def build_messages_for_image(image_path: str, task_query: str, image_instruction: str) -> List[Dict[str, Any]]:
    """
    Build the chat messages for one screenshot.

    We pass BOTH:
      - the global task (task_query), and
      - the per-image instruction (image_instruction).
    We also ask the model to set label==instruction, though we will enforce it downstream anyway.

    Returns a two-element list: the system message FIRST, then the user
    message (text parts, the image as a URL from ``to_file_url``, and the
    expected JSON schema). The system message used to be appended after the
    user turn, where it does not act as a leading system prompt.

    NOTE(review): the prompt asks for coordinates in [0,1], while
    ``boxes_px_from_res_json`` in this file rescales by 999 — confirm which
    range the model actually emits and align the two.
    """
    sys_txt = (
        "You are a precise UI grounding assistant. "
        "Given a GUI screenshot, a global task, and the step instruction for THIS image, "
        "output STRICT JSON only with bounding boxes needed on THIS screen. "
        "Use normalized coordinates (xyxy_norm) in [0,1] relative to this image. "
        "Set each box's 'label' EXACTLY to the provided instruction."
    )
    system = {"role": "system", "content": [{"type": "text", "text": sys_txt}]}

    schema = (
        "{\n"
        '  "image": "<filename>",\n'
        '  "box_format": "xyxy_norm",\n'
        '  "click_boxes": [\n'
        '     {"label": "<instruction>", "box": [x1,y1,x2,y2]},\n'
        "     ...\n"
        "  ]\n"
        "}\n"
        "Only valid JSON. Coordinates 0..1. If no action is needed on this screen, return an empty list."
    )

    user_content = [
        {"type": "text", "text": f"Global task: {task_query}"},
        {"type": "text", "text": f"Instruction for this image: {image_instruction}"},
        {"type": "text", "text": f"Image filename: {os.path.basename(image_path)}"},
        {"type": "image", "image": to_file_url(image_path)},
        {"type": "text", "text": "Return JSON with this schema:"},
        {"type": "text", "text": schema},
    ]
    user = {"role": "user", "content": user_content}
    # System prompt must lead the conversation.
    return [system, user]


In [4]:
@torch.inference_mode()
def infer_one(model, processor, image_path: str, task_query: str, image_instruction: str,
              temperature: float = 0.0, max_new_tokens: int = 256):
    """Run the model on one screenshot and return its parsed prediction.

    Args:
        model, processor: the loaded generation model and its processor.
        image_path: screenshot to ground.
        task_query: global task description.
        image_instruction: step instruction for this screenshot; every
            returned label is set to this string.
        temperature: ``<= 0`` selects greedy decoding; a positive value
            enables sampling at that temperature. (Previously this argument
            was accepted but never passed to ``generate``.)
        max_new_tokens: generation budget.

    Returns:
        dict with keys ``raw_text``, ``prediction_raw``, ``prediction``,
        ``label_overridden``, ``boxes_px``, ``labels`` and ``image_size``.
        If the output cannot be parsed as JSON, an error is printed and an
        empty prediction is used.
    """
    messages = build_messages_for_image(image_path, task_query, image_instruction)
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    images, videos, vkw = process_vision_info(messages, return_video_kwargs=True)
    inputs = processor(
        text=[text],
        images=images, videos=videos,
        return_tensors="pt", padding=True, **(vkw or {})
    )
    for k, v in list(inputs.items()):
        if hasattr(v, "to"):
            inputs[k] = v.to(model.device)
    inputs.pop("token_type_ids", None)

    # Honour the requested temperature instead of whatever sampling defaults
    # ship with the checkpoint.
    gen_kwargs = {"max_new_tokens": max_new_tokens}
    if temperature and temperature > 0:
        gen_kwargs.update(do_sample=True, temperature=float(temperature))
    else:
        gen_kwargs["do_sample"] = False

    out = model.generate(**inputs, **gen_kwargs)
    # Keep only the newly generated tokens (drop the echoed prompt).
    new_tokens = [o[len(i):] for i, o in zip(inputs["input_ids"], out)]
    txt = processor.batch_decode(new_tokens, skip_special_tokens=True)[0]

    # parse model JSON; deliberately best-effort so one bad output does not stop a run
    try:
        parsed = extract_json(txt)
    except Exception as e:
        print(f"[ERROR] JSON parse failed for {os.path.basename(image_path)}: {e}\nRAW[:400]: {txt[:400]}")
        parsed = {"image": os.path.basename(image_path), "box_format": "xyxy_norm", "click_boxes": []}

    # convert to pixel xyxy; the model's own labels are discarded on purpose
    boxes_px, _ = boxes_px_from_res_json(parsed, image_path)

    # copy of the prediction with every label replaced by the instruction
    pred_with_instruction, changed = enforce_instruction_label(parsed, image_instruction)

    with Image.open(image_path) as im:
        W, H = im.size

    return {
        "raw_text": txt,
        "prediction_raw": parsed,                   # keep raw internally; caller can drop it
        "prediction": pred_with_instruction,        # labels set to the instruction
        "label_overridden": bool(changed),
        "boxes_px": boxes_px,
        "labels": [image_instruction] * len(boxes_px),
        "image_size": (W, H),
    }
import copy

def enforce_instruction_label(parsed_json: dict, instruction: str):
    """
    Return a deep copy of ``parsed_json`` whose click boxes all carry
    ``instruction`` as their label.

    Returns (pred_with_instruction, changed: bool)
    - the input dict is left untouched
    - `changed` is True if at least one original label differed
    """
    result = copy.deepcopy(parsed_json)
    click_boxes = result.get("click_boxes", [])

    changed = any(entry.get("label") != instruction for entry in click_boxes)
    for entry in click_boxes:
        entry["label"] = instruction

    return result, changed


In [6]:
# Inference configuration and model loading.
# ROOT_DIR is the top-level directory that contains the task folders; all
# outputs are written below PRED_ROOT inside it.
import os
# NOTE(review): torch is already imported in an earlier cell; presumably this
# still takes effect as long as CUDA has not been initialised yet — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
ROOT_DIR   = "/share_2/users/umair_nawaz/DOMINO-Evaluation/Medical/OHIF/SS"  # folder that contains the task folders
MODEL_ID = "Qwen/Qwen3-VL-32B-Instruct"  # checkpoint passed to from_pretrained below
# MODEL_ID = "Qwen/Qwen2.5-VL-32B-Instruct"
# MODEL_ID = "Qwen/Qwen2-VL-7B"
PRED_ROOT  = os.path.join(ROOT_DIR, "Predictions-Qwen3-32B-VL")  # <-- all outputs go here
FILE_EXTS  = (".png", ".jpg", ".jpeg", ".PNG", ".JPG", ".JPEG")  # image extensions globbed in each task folder
TEMP       = 0.0   # passed to infer_one as `temperature`
MAXTOK     = 256   # passed to infer_one as `max_new_tokens`

# float16 when CUDA is available, bfloat16 otherwise.
# NOTE(review): device_map is hard-coded to "cuda", so the non-CUDA branch
# looks unusable in practice — confirm.
# NOTE(review): the recorded cell output warns "`torch_dtype` is deprecated!
# Use `dtype` instead!" — switch the keyword once the installed transformers
# version is confirmed to accept it.
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.bfloat16
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID, torch_dtype=torch_dtype, device_map="cuda", attn_implementation="sdpa"
)
processor = AutoProcessor.from_pretrained(MODEL_ID)

# Make sure the prediction root exists before anything is written to it.
os.makedirs(PRED_ROOT, exist_ok=True)
print("Model ready on:", model.device)
print("Predictions root:", PRED_ROOT)


`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 14/14 [00:24<00:00,  1.72s/it]


Model ready on: cuda:0
Predictions root: /share_2/users/umair_nawaz/DOMINO-Evaluation/Medical/OHIF/SS/Predictions-Qwen3-32B-VL


In [11]:
def is_task_dir(path: str) -> bool:
    """Return True if ``path`` is a task folder, i.e. a directory with images.

    A directory qualifies when it directly contains at least one file whose
    extension is listed in ``FILE_EXTS``. Prediction output folders are never
    treated as tasks: any directory whose name starts with ``predictions``
    (case-insensitive) is rejected. The previous check compared against one
    hard-coded folder name that no longer matched the folder this notebook
    writes to.
    """
    if not os.path.isdir(path):
        return False
    # normpath drops a trailing separator so basename is never empty.
    folder = os.path.basename(os.path.normpath(path)).lower()
    if folder.startswith("predictions"):
        return False
    return any(glob.glob(os.path.join(path, f"*{ext}")) for ext in FILE_EXTS)

def list_task_dirs(root_dir: str) -> List[str]:
    """List the task folders directly under ``root_dir`` in natural order.

    Entries are sorted with ``natural_key`` and kept when ``is_task_dir``
    accepts them; full paths are returned.
    """
    found = []
    for entry in sorted(os.listdir(root_dir), key=natural_key):
        candidate = os.path.join(root_dir, entry)
        if is_task_dir(candidate):
            found.append(candidate)
    return found

def run_all_tasks(root_dir: str, pred_root: str, out_img_suffix="_pred.png"):
    """Run inference on every image of every task folder under ``root_dir``.

    For each task folder a ``predictions.jsonl`` (one JSON record per image)
    and one overlay image per screenshot are written to
    ``<pred_root>/<task folder name>/``. Every record is also appended to
    ``<pred_root>/all_predictions.jsonl``.

    Uses the module-level ``model``, ``processor``, ``TEMP``, ``MAXTOK`` and
    ``FILE_EXTS``.

    Args:
        root_dir: directory that contains the task folders.
        pred_root: output directory; assumed to exist already.
        out_img_suffix: appended to the image stem to name the overlay file.
    """
    task_dirs = list_task_dirs(root_dir)
    print(f"Found {len(task_dirs)} task(s).")

    # Truncate the cross-task aggregate file once; it is re-opened in append
    # mode for each task below.
    agg_path = os.path.join(pred_root, "all_predictions.jsonl")
    with open(agg_path, "w", encoding="utf-8") as _:
        pass

    for task_dir in task_dirs:
        task_label = os.path.basename(task_dir)
        task_query, instr_by_file = load_task_meta(task_dir)

        out_task_dir = os.path.join(pred_root, task_label)
        os.makedirs(out_task_dir, exist_ok=True)
        pred_jsonl_path = os.path.join(out_task_dir, "predictions.jsonl")

        # collect images for this task; set() removes paths matched by more
        # than one extension pattern
        image_paths = []
        for ext in FILE_EXTS:
            image_paths += glob.glob(os.path.join(task_dir, f"*{ext}"))
        image_paths = sorted(set(image_paths), key=natural_key)

        print(f"\n=== {task_label} :: {task_query} ===")
        # Per-task file is overwritten; the aggregate file is appended to.
        with open(pred_jsonl_path, "w", encoding="utf-8") as f_out, \
             open(os.path.join(pred_root, "all_predictions.jsonl"), "a", encoding="utf-8") as agg_out:
            for img_path in image_paths:
                fname = os.path.basename(img_path)
                instruction = instr_by_file.get(fname, "")  # may be empty if not in annotations.json

                # --- inference using per-image instruction
                res = infer_one(
                    model, processor,
                    image_path=img_path,
                    task_query=task_query,
                    image_instruction=instruction,
                    temperature=TEMP, max_new_tokens=MAXTOK,
                )
                

                # NOTE(review): an earlier, commented-out variant drew overlays from
                # res["prediction_with_instruction"]; infer_one returns no such key,
                # so that dead code was dropped from this comment block.

                # Enforce the label again on what we write (defensive; cheap)
                pred_with_instruction, changed = enforce_instruction_label(res["prediction_raw"], instruction)

                # Boxes and labels are derived from the label-enforced prediction.
                boxes_px, labels = boxes_px_from_res_json(pred_with_instruction, img_path)


                # --- build the JSON line (labels overridden; raw attached only when it differs)
                record = {
                    "task": task_label,
                    "task_query": task_query,
                    "image": fname,
                    "instruction": instruction,                 # <-- the label we want on disk
                    "image_relpath": os.path.relpath(img_path, root_dir),
                    "image_size": res["image_size"],
                    "prediction": pred_with_instruction,        # <-- labels == instruction
                    "boxes_px": boxes_px,
                    "labels": labels,                            # == instruction repeated
                    "label_overridden": bool(changed),
                }

                # Only attach raw when it differs (changed=True)
                if changed:
                    record["prediction_raw"] = res["prediction_raw"]

                # save overlay using enforced labels
                overlay_path = os.path.join(out_task_dir, f"{os.path.splitext(fname)[0]}{out_img_suffix}")
                draw_and_save(img_path, boxes_px, labels, overlay_path)

                
                # write the same line to the per-task and the aggregate file
                line = json.dumps(record, ensure_ascii=False)
                f_out.write(line + "\n")
                agg_out.write(line + "\n")

                print(f"  {fname} -> {os.path.relpath(overlay_path, out_task_dir)} "
                      f"(boxes: {len(boxes_px)})")

        print(f"Saved: {pred_jsonl_path}")

# Kick off inference for every task folder under ROOT_DIR; outputs go to PRED_ROOT.
run_all_tasks(ROOT_DIR, PRED_ROOT)


Found 50 task(s).

=== 1 :: Sort by newest study and expand first row ===
  1.png -> 1_pred.png (boxes: 1)
  2.png -> 2_pred.png (boxes: 1)
  3.png -> 3_pred.png (boxes: 1)
  4.png -> 4_pred.png (boxes: 1)
  5.png -> 5_pred.png (boxes: 1)
Saved: /share_2/users/umair_nawaz/DOMINO-Evaluation/Medical/OHIF/SS/Predictions-Qwen3-32B-VL/1/predictions.jsonl

=== 2 :: Sort by Patient Name then inspect a row ===
  1.png -> 1_pred.png (boxes: 1)
  2.png -> 2_pred.png (boxes: 1)
  3.png -> 3_pred.png (boxes: 1)
  4.png -> 4_pred.png (boxes: 1)
  5.png -> 5_pred.png (boxes: 1)
Saved: /share_2/users/umair_nawaz/DOMINO-Evaluation/Medical/OHIF/SS/Predictions-Qwen3-32B-VL/2/predictions.jsonl

=== 3 :: Sort by Description and preview a different row ===
  1.png -> 1_pred.png (boxes: 1)
  2.png -> 2_pred.png (boxes: 1)
  3.png -> 3_pred.png (boxes: 1)
  4.png -> 4_pred.png (boxes: 1)
  5.png -> 5_pred.png (boxes: 1)
Saved: /share_2/users/umair_nawaz/DOMINO-Evaluation/Medical/OHIF/SS/Predictions-Qwen3-32B

## Evaluation Metrics

In [12]:

# Evaluation configuration.
# ROOT_DIR contains the task folders; PRED_DIR is the prediction folder inside it.
ROOT_DIR  = "/share_2/users/umair_nawaz/DOMINO-Evaluation/Medical/OHIF/SS"
PRED_DIR  = os.path.join(ROOT_DIR, "Predictions-Qwen3-32B-VL")

# IoU thresholds reported as Acc@t (you can change these)
THRESHOLDS = [0.3, 0.5, 0.75, 0.9]
PRIMARY_TAU = 0.3   # IoU threshold used for step success / Task Completion@τ

# Require predicted label to equal the per-image instruction?
# (The inference pipeline already enforces this, so keep True. Set False to ignore label text.)
REQUIRE_LABEL_MATCH = True

In [13]:
import os, json, re, glob
from typing import List, Dict, Tuple, Any
from pathlib import Path

def xywh_to_xyxy(b: List[float]) -> Tuple[float,float,float,float]:
    """Convert a ``[x, y, width, height]`` box to ``(x1, y1, x2, y2)``."""
    left, top, width, height = b
    right = left + width
    bottom = top + height
    return (left, top, right, bottom)

def iou_xyxy(a: Tuple[float,float,float,float],
             b: Tuple[float,float,float,float]) -> float:
    """Intersection-over-union of two ``(x1, y1, x2, y2)`` boxes.

    Negative extents are treated as zero, and the result is 0.0 whenever the
    union area is not positive.
    """
    def _extent(lo, hi):
        # Length of an interval, floored at zero.
        return max(0.0, hi - lo)

    ax1, ay1, ax2, ay2 = a
    bx1, by1, bx2, by2 = b

    overlap = _extent(max(ax1, bx1), min(ax2, bx2)) * _extent(max(ay1, by1), min(ay2, by2))
    area_first = _extent(ax1, ax2) * _extent(ay1, ay2)
    area_second = _extent(bx1, bx2) * _extent(by1, by2)

    union = area_first + area_second - overlap
    if not union > 0:
        return 0.0
    return overlap / union

def load_annotations(task_dir: str) -> Dict[str, Any]:
    """Load the ground truth of one task from ``<task_dir>/annotations.json``.

    Returns a dict with
      - ``task_query``: ``task_query`` from the file, else ``task_name``,
        else the folder name
      - ``by_image``: {file_name: {boxes: [xyxy...], instruction: str,
        size: (W, H), step_index}} where the boxes are converted from the
        file's ``[x, y, w, h]`` ``bbox`` entries
    """
    with open(os.path.join(task_dir, "annotations.json"), "r", encoding="utf-8") as fh:
        payload = json.load(fh)

    per_image = {}
    for entry in payload.get("images", []):
        size = (entry["width"], entry["height"])
        gt_boxes = [xywh_to_xyxy(bb["bbox"]) for bb in entry.get("bboxes", [])]
        per_image[entry["file_name"]] = {
            "boxes": gt_boxes,
            "instruction": entry.get("instruction", ""),
            "size": size,
            "step_index": entry.get("step_index"),
        }

    query = payload.get("task_query") or payload.get("task_name") or os.path.basename(task_dir)
    return {"task_query": query, "by_image": per_image}

def boxes_px_from_prediction_record(rec: Dict[str, Any], root_dir: str, task_name: str) -> Tuple[List[Tuple[int,int,int,int]], List[str], Tuple[int,int]]:
    """
    Pull predicted boxes (pixel xyxy) + labels from a predictions.jsonl record.

    Prefers a non-empty ``boxes_px`` list; otherwise converts the boxes in
    ``prediction`` (or ``prediction_raw``) according to ``box_format``:
      - ``xyxy_norm`` / ``x1y1x2y2_norm``: scaled by the image size when all
        values are <= 1.5, otherwise taken as pixels already
      - ``xywh_norm``: same rule, then converted to corner form
      - ``xywh``: pixel ``x, y, w, h``
      - anything else: pixel ``x1, y1, x2, y2``

    The image size comes from ``image_size`` or, failing that, from the image
    at ``root_dir/image_relpath``. Boxes are clamped to the image only when
    the size is actually known; with an unknown size the former clamp against
    a 1x1 placeholder collapsed every pixel box to ``(0, 0, 0, 0)``.

    ``task_name`` is accepted for signature compatibility and is not used.

    Returns: (boxes_px, labels, image_size); ``image_size`` is ``(1, 1)``
    when the size could not be determined on the conversion path.
    """
    # Fast path: the record already carries pixel boxes.
    stored = rec.get("boxes_px")
    if isinstance(stored, list) and len(stored) > 0:
        boxes = [tuple(map(int, b)) for b in stored]
        labels = rec.get("labels", [])
        size = tuple(rec.get("image_size") or (0, 0))
        return boxes, labels, size

    # Convert from normalized/other formats
    pred = rec.get("prediction") or rec.get("prediction_raw") or {}
    box_format = (pred.get("box_format") or "").lower()

    # Determine size (``or`` guards against an explicit null in the record).
    size = tuple(rec.get("image_size") or (0, 0))
    size_known = len(size) == 2 and size[0] > 0 and size[1] > 0
    if not size_known:
        # fall back to reading the original image file if relpath is available
        rel = rec.get("image_relpath")
        if rel:
            try:
                from PIL import Image
                with Image.open(os.path.join(root_dir, rel)) as im:
                    size = im.size
                size_known = size[0] > 0 and size[1] > 0
            except (ImportError, OSError):
                # Best effort: missing Pillow or an unreadable image leaves the size unknown.
                pass
    W, H = size if size_known else (1, 1)

    def _clamp(value: float, upper: int) -> int:
        # Lower bound always applies; the upper bound only when the size is known.
        pixel = max(0, int(round(value)))
        return min(pixel, upper) if size_known else pixel

    boxes_px, labels = [], []
    for item in pred.get("click_boxes", []):
        lab = item.get("label", "box")
        box = item.get("box", [])
        if not (isinstance(box, (list, tuple)) and len(box) == 4):
            continue
        try:
            x1, y1, x2, y2 = [float(v) for v in box]
        except (TypeError, ValueError):
            continue  # non-numeric coordinates: skip the box rather than abort scoring

        if box_format in ("xyxy_norm", "x1y1x2y2_norm"):
            if max(x1, y1, x2, y2) <= 1.5:  # likely normalized
                X1, Y1, X2, Y2 = x1 * W, y1 * H, x2 * W, y2 * H
            else:  # mislabeled; already pixels
                X1, Y1, X2, Y2 = x1, y1, x2, y2
        elif box_format in ("xywh_norm",):
            if max(x1, y1, x2, y2) <= 1.5:
                X1, Y1, X2, Y2 = x1 * W, y1 * H, (x1 + x2) * W, (y1 + y2) * H
            else:
                X1, Y1, X2, Y2 = x1, y1, x1 + x2, y1 + y2
        elif box_format in ("xywh",):
            X1, Y1, X2, Y2 = x1, y1, x1 + x2, y1 + y2
        else:  # xyxy or unknown → assume pixels
            X1, Y1, X2, Y2 = x1, y1, x2, y2

        X1, Y1 = _clamp(X1, W - 1), _clamp(Y1, H - 1)
        X2, Y2 = _clamp(X2, W - 1), _clamp(Y2, H - 1)
        # Keep boxes from being inverted/empty where the image bounds allow it.
        if X2 <= X1:
            X2 = min(W - 1, X1 + 2) if size_known else X1 + 2
        if Y2 <= Y1:
            Y2 = min(H - 1, Y1 + 2) if size_known else Y1 + 2

        boxes_px.append((X1, Y1, X2, Y2))
        labels.append(lab)

    return boxes_px, labels, (W, H)


In [14]:
def greedy_match_iou(gt_boxes: List[Tuple[float,float,float,float]],
                     pred_boxes: List[Tuple[int,int,int,int]],
                     label_ok: bool = True) -> List[Tuple[int,int,float]]:
    """
    One-to-one matching of ground-truth and predicted boxes, best IoU first.

    Every (gt, pred) pair is scored with ``iou_xyxy``; pairs are then taken
    in descending IoU order, skipping any pair whose ground-truth or
    predicted box is already matched. ``label_ok`` is accepted but not used.

    Returns a list of (gt_idx, pred_idx, IoU).
    """
    candidates = [
        (g_idx, p_idx, iou_xyxy(g_box, p_box))
        for g_idx, g_box in enumerate(gt_boxes)
        for p_idx, p_box in enumerate(pred_boxes)
    ]
    ranked = sorted(candidates, key=lambda cand: cand[2], reverse=True)

    taken_gt, taken_pred = set(), set()
    matches = []
    for g_idx, p_idx, score in ranked:
        if g_idx not in taken_gt and p_idx not in taken_pred:
            taken_gt.add(g_idx)
            taken_pred.add(p_idx)
            matches.append((g_idx, p_idx, score))
    return matches

def score_task(task_dir: str, pred_task_dir: str,
               primary_tau: float = 0.25,
               thresholds: List[float] = [0.25, 0.5, 0.75, 0.9],
               require_label_match: bool = True) -> Dict[str, Any]:
    """
    Score one task: compare ``<pred_task_dir>/predictions.jsonl`` with
    ``<task_dir>/annotations.json``.

    Args:
        task_dir: task folder holding the ground truth.
        pred_task_dir: folder holding this task's ``predictions.jsonl``.
        primary_tau: IoU threshold for step success and task completion.
        thresholds: IoU thresholds reported as ``Acc@t`` (not modified here).
        require_label_match: when True, only predictions whose label equals
            the image's instruction (after stripping whitespace) are scored.

    Returns:
        ``{"task", "exists": False, "reason"}`` when the predictions file is
        missing; otherwise a dict with ``task``, ``task_query``, ``exists``,
        ``task_completed@<tau>``, ``num_images``, ``num_gt_boxes``, ``mIoU``,
        one ``Acc@<t>`` entry per threshold and ``per_image`` rows. Every
        per-image row carries ``ious_per_gt`` (one IoU per ground-truth box),
        including images without a prediction, so callers that aggregate
        over rows see every ground-truth box.

    NOTE(review): ``Acc@{:.1f}`` labels the 0.75 threshold as ``Acc@0.8``;
    the key format is left unchanged because downstream readers may rely on it.
    """
    gt = load_annotations(task_dir)
    task_query = gt["task_query"]
    gt_by_img = gt["by_image"]
    task_name = os.path.basename(task_dir)

    pred_file = os.path.join(pred_task_dir, "predictions.jsonl")
    if not os.path.exists(pred_file):
        return {"task": task_name, "exists": False, "reason": "missing predictions.jsonl"}

    # load predictions (a later record for the same image replaces an earlier one)
    preds_by_img = {}
    with open(pred_file, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            rec = json.loads(line)
            preds_by_img[rec["image"]] = rec

    # Records store image_relpath relative to the folder that contains the
    # task folders, so derive it from task_dir instead of a module global.
    data_root = os.path.dirname(os.path.normpath(task_dir))
    step_key = "step_success@{:.3f}".format(primary_tau)

    per_image_rows = []
    total_gt_boxes = 0
    iou_list_best_per_gt = []  # one value per GT box (best matched IoU; 0 if none)
    step_pass = {}  # fname -> bool at primary_tau

    for fname, meta in gt_by_img.items():
        gt_boxes = meta["boxes"]
        total_gt_boxes += len(gt_boxes)
        instruction = meta.get("instruction", "")

        prec = preds_by_img.get(fname)
        if not prec:
            # no prediction => every GT box on this image scores zero
            zeros = [0.0] * len(gt_boxes)
            per_image_rows.append({
                "image": fname, "instruction": instruction,
                "best_iou": 0.0, "gt_count": len(gt_boxes),
                "pred_count": 0,
                "ious_per_gt": zeros,
                step_key: False,
                "matched": 0, "note": "no_prediction"
            })
            iou_list_best_per_gt.extend(zeros)
            step_pass[fname] = False
            continue

        # collect predicted boxes (pixels) + labels
        pred_boxes, pred_labels, _sz = boxes_px_from_prediction_record(prec, data_root, task_name)

        # optionally drop predictions whose label != instruction
        if require_label_match:
            wanted = str(instruction).strip()
            kept = [i for i, lab in enumerate(pred_labels) if str(lab).strip() == wanted]
            pred_boxes = [pred_boxes[i] for i in kept]
            pred_labels = [pred_labels[i] for i in kept]

        # match
        matched_iou = [0.0] * len(gt_boxes)
        for gi, _pi, iou in greedy_match_iou(gt_boxes, pred_boxes):
            matched_iou[gi] = max(matched_iou[gi], iou)

        # step success = every GT box on this image matched at >= primary_tau
        # (an image without GT boxes passes trivially)
        step_success = all(v >= primary_tau for v in matched_iou) if gt_boxes else True
        step_pass[fname] = step_success

        # per-image logging
        per_image_rows.append({
            "image": fname,
            "instruction": instruction,
            "gt_count": len(gt_boxes),
            "pred_count": len(pred_boxes),
            "best_iou": max(matched_iou) if matched_iou else 0.0,
            "ious_per_gt": matched_iou,
            step_key: bool(step_success),
        })
        # Exactly one value per GT box: an image without GT boxes contributes
        # nothing (it used to add a phantom 0.0 that lowered mIoU/Acc).
        iou_list_best_per_gt.extend(matched_iou)

    # task completion = every step passed
    task_completed = all(step_pass.values()) if step_pass else False

    # aggregate metrics over this task (across all GT boxes in task)
    denom = max(1, len(iou_list_best_per_gt))
    mIoU = sum(iou_list_best_per_gt) / denom
    acc_at = {
        "Acc@{:.1f}".format(t): sum(1 for v in iou_list_best_per_gt if v >= t) / denom
        for t in thresholds
    }

    return {
        "task": task_name,
        "task_query": task_query,
        "exists": True,
        "task_completed@{:.2f}".format(primary_tau): bool(task_completed),
        "num_images": len(gt_by_img),
        "num_gt_boxes": int(total_gt_boxes),
        "mIoU": mIoU,
        **acc_at,
        "per_image": per_image_rows,
    }


In [None]:
def list_task_dirs(root_dir: str) -> List[str]:
    """List the task folders under ``root_dir`` that can be evaluated.

    A task folder is a sub-directory containing ``annotations.json`` (the
    ground truth). Prediction output folders are skipped: any name starting
    with ``predictions`` (case-insensitive) is ignored. The previous check
    compared against a single hard-coded folder name that did not match the
    prediction folder used by this notebook.

    Returns full paths, sorted by folder name.
    """
    out = []
    for name in sorted(os.listdir(root_dir)):
        task_dir = os.path.join(root_dir, name)
        if not os.path.isdir(task_dir):
            continue
        if name.lower().startswith("predictions"):
            continue
        if os.path.exists(os.path.join(task_dir, "annotations.json")):
            out.append(task_dir)
    return out

def evaluate_all(root_dir: str, pred_dir: str,
                 primary_tau: float, thresholds: List[float],
                 require_label_match: bool = True):
    """Score every task under ``root_dir`` and write the evaluation files.

    For each task folder returned by ``list_task_dirs`` the result of
    ``score_task`` is saved to ``<pred_dir>/<task>/eval.json``. An overall
    summary (task completion rate, mIoU and ``Acc@t`` over all ground-truth
    boxes) is printed and saved, together with the per-task results, to
    ``<pred_dir>/_eval_summary.json``.

    Tasks without a ``predictions.jsonl`` count as not completed; a warning
    is printed for them because their ground-truth boxes cannot be included
    in the overall IoU statistics from here.
    """
    tasks = list_task_dirs(root_dir)
    print(f"Found {len(tasks)} task(s) with annotations.")

    per_task_results = []
    overall_iou_values = []
    completed = 0

    for task_dir in tasks:
        task_name = os.path.basename(task_dir)
        pred_task_dir = os.path.join(pred_dir, task_name)
        res = score_task(
            task_dir, pred_task_dir,
            primary_tau=primary_tau,
            thresholds=thresholds,
            require_label_match=require_label_match
        )
        per_task_results.append(res)

        # accumulate for overall
        if res.get("exists"):
            for row in res.get("per_image", []):
                ious = row.get("ious_per_gt")
                if ious is None:
                    # Rows without per-box IoUs (image had no prediction)
                    # still own `gt_count` boxes that must count as misses.
                    ious = [0.0] * int(row.get("gt_count", 0))
                overall_iou_values.extend(float(v) for v in ious)
            if res.get(f"task_completed@{primary_tau:.2f}"):
                completed += 1
        else:
            print(f"  [WARN] {task_name}: {res.get('reason', 'no predictions')} "
                  f"(counted as not completed; excluded from overall IoU stats)")

        # save per-task file; the folder may not exist when predictions are missing
        os.makedirs(pred_task_dir, exist_ok=True)
        out_task_json = os.path.join(pred_task_dir, "eval.json")
        with open(out_task_json, "w", encoding="utf-8") as f:
            json.dump(res, f, indent=2)
        print(f"  Saved per-task eval: {out_task_json}")

    # overall metrics
    overall_gt_count = len(overall_iou_values)
    overall_mIoU = (sum(overall_iou_values) / overall_gt_count) if overall_gt_count else 0.0
    overall_acc = {
        f"Acc@{t:.1f}": (sum(1 for v in overall_iou_values if v >= t) / overall_gt_count) if overall_gt_count else 0.0
        for t in thresholds
    }
    task_completion_rate = (completed / len(tasks)) if tasks else 0.0

    overall = {
        "num_tasks": len(tasks),
        "overall_num_gt_boxes": int(overall_gt_count),
        "task_completion_rate@{:.3f}".format(primary_tau): task_completion_rate,
        "overall_mIoU": overall_mIoU,
        **overall_acc,
    }

    # save summary
    os.makedirs(pred_dir, exist_ok=True)
    out_summary = os.path.join(pred_dir, "_eval_summary.json")
    with open(out_summary, "w", encoding="utf-8") as f:
        json.dump({"overall": overall, "per_task": per_task_results}, f, indent=2)
    print("\n=== Overall Summary ===")
    for k, v in overall.items():
        print(f"{k}: {v:.4f}" if isinstance(v, float) else f"{k}: {v}")
    print(f"\nSaved: {out_summary}")

# Run the evaluation with the thresholds and label policy configured above.
evaluate_all(
    ROOT_DIR, PRED_DIR,
    primary_tau=PRIMARY_TAU,
    thresholds=THRESHOLDS,
    require_label_match=REQUIRE_LABEL_MATCH
)
