In [6]:
#!/usr/bin/env python3
"""
Make a flat list of consolidated annotations using **ID-only** pairing:
[{"id": "...", "type": "...", "value": "...", "bbox": [l,t,r,b]}]

- Uses rectanglelabels for (type + bbox) and textarea for (value)
- Pairs ONLY when result IDs are equal (no IoU fallback)
- Converts percentage coords to a pixel bbox, then rescales it to an 840x840 frame
- Optional: include_filename=True adds "file_name" to each item
"""

import json
import os
from typing import Any, Dict, List

def _pct_to_px_bbox(x: float, y: float, w: float, h: float, W: int, H: int) -> List[int]:
    l = round(x * W / 100.0)
    t = round(y * H / 100.0)
    r = round((x + w) * W / 100.0)
    b = round((y + h) * H / 100.0)
    # clamp
    l = max(0, min(l, W)); r = max(0, min(r, W))
    t = max(0, min(t, H)); b = max(0, min(b, H))
    return [l, t, r, b]

def _resize_bbox_to_840(bbox: List[int], original_width: int, original_height: int, target_size: int = 840) -> List[int]:
    """
    Resize bbox coordinates from original dimensions to target_size x target_size.
    
    Args:
        bbox: [left, top, right, bottom] in original image coordinates
        original_width: Original image width
        original_height: Original image height  
        target_size: Target square size (default 840)
        
    Returns:
        Resized bbox [left, top, right, bottom] in target coordinates
    """
    l, t, r, b = bbox
    
    # Calculate scale factors
    scale_x = target_size / original_width
    scale_y = target_size / original_height
    
    # Apply scaling
    new_l = round(l * scale_x)
    new_t = round(t * scale_y)
    new_r = round(r * scale_x)
    new_b = round(b * scale_y)
    
    # Clamp to target dimensions
    new_l = max(0, min(new_l, target_size))
    new_t = max(0, min(new_t, target_size))
    new_r = max(0, min(new_r, target_size))
    new_b = max(0, min(new_b, target_size))
    
    return [new_l, new_t, new_r, new_b]

def _basename_from_task(task: Dict[str, Any]) -> str:
    data = task.get("data", {}) or {}
    for key in ("ocr", "image", "url"):
        v = data.get(key)
        if isinstance(v, str) and v:
            return os.path.basename(v)
    v = task.get("file_upload")
    if isinstance(v, str) and v:
        return os.path.basename(v)
    return f"task_{task.get('id','unknown')}"

def consolidate_flat_id_only(ls_export_path: str, include_filename: bool = False) -> List[Dict[str, Any]]:
    """
    Read a Label Studio export JSON and return a flat list:
    [{"id","type","value","bbox"}]  (+ "file_name" if include_filename=True)

    Within each annotation, a "rectanglelabels" result supplies "type" (its
    first label) and "bbox", and "textarea" results supply "value" (several
    texts for the same id are joined with a space). An item is emitted only
    when a rectangle and at least one textarea share the same result id.

    "bbox" is the percentage geometry converted to pixels by _pct_to_px_bbox
    and then rescaled by _resize_bbox_to_840 (default target size).

    Results of either kind are skipped when they have no id, lack numeric
    x/y/width/height/original_width/original_height, or report an image
    dimension that is not positive. Tasks, annotations and results that are
    not JSON objects are ignored instead of raising.

    Args:
        ls_export_path: Path to the export JSON. The top level may be a list
            of tasks or an object holding them under "tasks" or "results".
        include_filename: When True, add "file_name" (from
            _basename_from_task) to every item.

    Returns:
        The paired items, in export order.
    """
    with open(ls_export_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    if isinstance(data, list):
        tasks = data
    elif isinstance(data, dict):
        tasks = data.get("tasks") or data.get("results") or []
    else:
        # a bare scalar/string at top level carries no tasks
        tasks = []

    flat: List[Dict[str, Any]] = []

    for task in tasks:
        if not isinstance(task, dict):
            continue
        file_name = _basename_from_task(task)

        for ann in (task.get("annotations") or []):
            if not isinstance(ann, dict):
                continue
            rect_by_id: Dict[str, Dict[str, Any]] = {}
            text_by_id: Dict[str, List[str]] = {}

            for r in (ann.get("result") or []):
                if not isinstance(r, dict):
                    continue
                r_id = r.get("id") or ""
                if not r_id:
                    continue

                v = r.get("value") or {}
                W = r.get("original_width"); H = r.get("original_height")
                x, y, w, h = v.get("x"), v.get("y"), v.get("width"), v.get("height")

                # need dimensions + geometry
                if not all(isinstance(t, (int, float)) for t in (W, H, x, y, w, h)):
                    continue

                # int() truncates, so a width/height such as 0 or 0.5 becomes 0
                # and would divide by zero when the bbox is rescaled
                width_px, height_px = int(W), int(H)
                if width_px <= 0 or height_px <= 0:
                    continue

                if r.get("type") == "rectanglelabels":
                    labels = v.get("rectanglelabels") or []
                    if labels:
                        # only rectangles contribute a bbox, so compute it here
                        bbox = _pct_to_px_bbox(float(x), float(y), float(w), float(h), width_px, height_px)
                        resized_bbox = _resize_bbox_to_840(bbox, width_px, height_px)
                        rect_by_id[r_id] = {"label": str(labels[0]), "bbox": resized_bbox}

                elif r.get("type") == "textarea":
                    txt_list = v.get("text") or []
                    text = " ".join(map(str, txt_list)) if isinstance(txt_list, list) else str(txt_list)
                    text_by_id.setdefault(r_id, []).append(text)

            # emit only when ids match on both sides
            for r_id, rect in rect_by_id.items():
                if r_id not in text_by_id:
                    continue
                item = {
                    "id": r_id,
                    "type": rect["label"],
                    "value": " ".join(text_by_id[r_id]).strip(),
                    "bbox": rect["bbox"],
                }
                if include_filename:
                    item["file_name"] = file_name
                flat.append(item)

    return flat

def consolidate_by_filename(ls_export_path: str) -> Dict[str, List[Dict[str, Any]]]:
    """
    Read a Label Studio export JSON and return a dictionary grouped by filename:
    {"filename.ext": [{"id","type","value","bbox"}, ...]}

    The pairing itself is done by consolidate_flat_id_only, so both entry
    points always apply the same rules; this function only regroups its
    items. File names with no paired items do not appear in the result, and
    items of several tasks/annotations that resolve to the same file name
    are collected in one list, in export order.

    Args:
        ls_export_path: Path to the Label Studio export JSON.

    Returns:
        Mapping of file name to that file's items (without "file_name").
    """
    by_filename: Dict[str, List[Dict[str, Any]]] = {}

    for item in consolidate_flat_id_only(ls_export_path, include_filename=True):
        # the file name becomes the grouping key, so drop it from the item
        file_name = item.pop("file_name")
        by_filename.setdefault(file_name, []).append(item)

    return by_filename

def save_labels_by_file(ls_export_path: str, output_dir: str = "labels_output") -> None:
    """
    Write one JSON file of labels per source file found in a Label Studio export.

    The export is grouped with consolidate_by_filename; each group is dumped
    to "<output_dir>/<file name without extension>.json". A line is printed
    for every file written, followed by a summary line.
    """
    grouped = consolidate_by_filename(ls_export_path)

    # make sure the destination exists before writing into it
    os.makedirs(output_dir, exist_ok=True)

    written = 0
    for written, (source_name, items) in enumerate(grouped.items(), start=1):
        # swap the source extension for .json
        stem, _ext = os.path.splitext(source_name)
        target = os.path.join(output_dir, f"{stem}.json")

        with open(target, "w", encoding="utf-8") as out:
            json.dump(items, out, ensure_ascii=False, indent=2)

        print(f"Saved {len(items)} annotations for {source_name} -> {target}")

    print(f"\nTotal: Saved labels for {written} files in '{output_dir}' directory")

# Label Studio export to process (absolute, machine-specific path).
path = "/Users/vishnuprasad/Downloads/export_179167_project-179167-at-2025-08-26-01-58-2cd3a2e9.json"

# Option 1: write one JSON label file per source file into the
# "ad-buy-form-test-labels" directory (created relative to the working directory)
save_labels_by_file(path, output_dir="ad-buy-form-test-labels")

# # Option 2: Keep the original flat list approach (ORIGINAL APPROACH)
# out = consolidate_flat_id_only(path, include_filename=True)
# with open("test-adbuy-34-samples-with-filename.json", "w", encoding="utf-8") as f:
#     json.dump(out, f, ensure_ascii=False, indent=2)
# print(f"Wrote {len(out)} items to combined file")


Saved 9 annotations for 2fac4856-ff9c72fa-8267-5701-5492-e8cbf336e101_page2.png -> ad-buy-form-test-labels/2fac4856-ff9c72fa-8267-5701-5492-e8cbf336e101_page2.json
Saved 27 annotations for 981b4837-fdf168ea-c840-1465-710a-762427b285c3_page1.png -> ad-buy-form-test-labels/981b4837-fdf168ea-c840-1465-710a-762427b285c3_page1.json
Saved 15 annotations for 423f9ea6-faa55a77-9090-22ac-fe9b-32ab3f026300_page1.png -> ad-buy-form-test-labels/423f9ea6-faa55a77-9090-22ac-fe9b-32ab3f026300_page1.json
Saved 24 annotations for 71662f51-f7635eed-5555-27b6-780f-8386863b25ca_page3.png -> ad-buy-form-test-labels/71662f51-f7635eed-5555-27b6-780f-8386863b25ca_page3.json
Saved 15 annotations for 156320b0-f9a59888-b508-9792-d356-1d36de82c212_page2.png -> ad-buy-form-test-labels/156320b0-f9a59888-b508-9792-d356-1d36de82c212_page2.json
Saved 26 annotations for 25fab5c0-f9a59888-b508-9792-d356-1d36de82c212_page1.png -> ad-buy-form-test-labels/25fab5c0-f9a59888-b508-9792-d356-1d36de82c212_page1.json
Saved 38 an

In [7]:
def create_consolidated_test_data(labels_dir: str, output_file: str) -> None:
    """
    Convert individual label files to consolidated test data format.

    Format: [{"file_name": "...", "labels": [{"entity_type": "...", "value": "...", "bbox": [...]}]}]

    Every "*.json" file directly inside labels_dir is read in sorted order.
    Each annotation object that has "type", "value" and "bbox" becomes a
    label ("type" is renamed to "entity_type"; other keys such as "id" are
    dropped). "file_name" is the label file's name without its ".json"
    suffix. Files that cannot be read or parsed, whose top level is not a
    list, or that yield no valid label are reported/skipped and left out.

    If labels_dir does not exist, an error is printed and nothing is written.

    Args:
        labels_dir: Directory holding the per-file label JSON files.
        output_file: Path of the consolidated JSON file to write.
    """
    # imported locally so the function works without relying on imports
    # made elsewhere in the notebook
    import json
    from pathlib import Path

    required_keys = ("type", "value", "bbox")

    # Read all JSON files in the labels directory
    labels_path = Path(labels_dir)
    if not labels_path.exists():
        print(f"Error: Directory {labels_dir} does not exist")
        return

    json_files = sorted(labels_path.glob("*.json"))
    print(f"Found {len(json_files)} JSON files in {labels_dir}")

    consolidated_data = []

    for json_file in json_files:
        file_name = json_file.stem  # filename without extension

        # Only reading/parsing is expected to fail; keep the try narrow so
        # genuine programming errors are not swallowed.
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                annotations = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError
            print(f"Error processing {json_file}: {e}")
            continue

        if not isinstance(annotations, list):
            print(f"Error processing {json_file}: expected a JSON list of annotations")
            continue

        # Convert format: "type" -> "entity_type", remove "id"
        labels = [
            {
                "entity_type": ann["type"],
                "value": ann["value"],
                "bbox": ann["bbox"],
            }
            for ann in annotations
            if isinstance(ann, dict) and all(key in ann for key in required_keys)
        ]

        if labels:  # Only add if there are valid labels
            consolidated_data.append({
                "file_name": file_name,
                "labels": labels
            })
            print(f"Processed {file_name}: {len(labels)} labels")

    # Save consolidated data
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(consolidated_data, f, ensure_ascii=False, indent=4)

    print(f"\nSaved consolidated data to {output_file}")
    print(f"Total files: {len(consolidated_data)}")
    print(f"Total labels: {sum(len(item['labels']) for item in consolidated_data)}")

# Create the consolidated test data file.
# Both paths are absolute and machine-specific: the first is the directory of
# per-file label JSONs to read, the second is the combined JSON file to write.
labels_directory = "/Volumes/MyDataDrive/thesis/code-2/src/labelrix/ad-buy-form-test-labels"
output_filename = "/Volumes/MyDataDrive/thesis/code-2/src/labelrix/ad-buy-form-testset-labels-consolidated.json"

create_consolidated_test_data(labels_directory, output_filename)


Found 34 JSON files in /Volumes/MyDataDrive/thesis/code-2/src/labelrix/ad-buy-form-test-labels
Processed 0c6862cc-f56e2fc1-7f41-68e3-bfe5-06176b9a2e8a_page1: 41 labels
Processed 15294724-eff55361-0e39-53a1-da0b-a337a361b66b_page3: 44 labels
Processed 156320b0-f9a59888-b508-9792-d356-1d36de82c212_page2: 15 labels
Processed 163d14b9-efca8764-0dfb-3f1c-beb9-f629991435bb_page2: 18 labels
Processed 1eb6480c-f7635eed-5555-27b6-780f-8386863b25ca_page2: 38 labels
Processed 25fab5c0-f9a59888-b508-9792-d356-1d36de82c212_page1: 26 labels
Processed 2c197804-f21fcdd1-214c-c145-29cc-362e0b0ef1e3_page1: 45 labels
Processed 2fac4856-ff9c72fa-8267-5701-5492-e8cbf336e101_page2: 9 labels
Processed 3aa8ab38-ef640e66-1f79-701f-61d7-e968acb9e3fc_page2: 38 labels
Processed 4043cd85-e8d41204-64eb-9f4b-608e-5593933aca41_page4: 35 labels
Processed 423f9ea6-faa55a77-9090-22ac-fe9b-32ab3f026300_page1: 15 labels
Processed 4827ce5e-eabf486e-2ff6-c060-68b7-fcf5363bde66_page1: 33 labels
Processed 591b2144-e910d31f-0c