In [2]:
import os
import json
import glob

# === Set output path for your .jsonl dataset description ===
# output_jsonl_path = "/data/usr/yb107/colon_data/colon_dataset.jsonl"
output_jsonl_path = "/home/yb107/cvpr2025/DukeDiffSeg/data/json/colon_dataset.jsonl"

# === Fixed base paths ===
image_dir = "/data/usr/yb107/colon_data/CT"
mask_root = "/data/usr/yb107/colon_data/masks"

# Extension shared by every image file; used both for the glob pattern and
# for deriving the patient name from the file name.
image_suffix = ".nii.gz"

entries = []

# === Go through each image ===
# Sorted so the resulting .jsonl has a stable order across runs.
image_files = sorted(glob.glob(os.path.join(image_dir, "*" + image_suffix)))

for image_path in image_files:
    # Strip only the trailing extension: str.replace would also remove any
    # earlier occurrence of ".nii.gz" inside the file name.
    patient_name = os.path.basename(image_path)[: -len(image_suffix)]
    # Each patient's mask lives at <mask_root>/<patient_name>/colon.nii.gz.
    mask_path = os.path.join(mask_root, patient_name, "colon.nii.gz")

    # === Check if mask exists, raise error if not ===
    if not os.path.exists(mask_path):
        raise FileNotFoundError(f"Missing mask for: {patient_name} → {mask_path}")

    entries.append({"image": image_path, "mask": mask_path})

# === Write to .jsonl ===
# Make sure the destination folder exists so the write cannot fail after the
# whole dataset has already been scanned.
output_dir = os.path.dirname(output_jsonl_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

with open(output_jsonl_path, "w") as f:
    # One JSON object per line: {"image": <path>, "mask": <path>}.
    for entry in entries:
        f.write(json.dumps(entry) + "\n")

print(f"[✓] Wrote {len(entries)} valid entries to {output_jsonl_path}")

[✓] Wrote 217 valid entries to /home/yb107/cvpr2025/DukeDiffSeg/data/json/colon_dataset.jsonl


In [3]:
def divide_jsonl_train_val_test(
    jsonl_file, train_file, val_file, test_file, train_ratio=0.7, val_ratio=0.2
):
    """Divide the JSONL dataset into train, validation, and test sets.

    Entries are split in file order (no shuffling): the first ``train_ratio``
    fraction goes to ``train_file``, the next ``val_ratio`` fraction to
    ``val_file``, and everything that remains to ``test_file``.

    Args:
        jsonl_file: Path of the input file with one JSON object per line.
            Blank lines are ignored.
        train_file: Output path for the training split.
        val_file: Output path for the validation split.
        test_file: Output path for the test split.
        train_ratio: Fraction of entries assigned to the training split.
        val_ratio: Fraction of entries assigned to the validation split.

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more
            than 1.
    """
    # Tolerance that absorbs binary floating-point error. The sum of the
    # default ratios (0.7 + 0.2) is slightly below 0.9, so the product with
    # the entry count can land just under a whole number and a bare int()
    # would then drop one entry from the validation split.
    eps = 1e-9

    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + eps:
        raise ValueError(
            "train_ratio and val_ratio must be non-negative and sum to at most 1, "
            f"got train_ratio={train_ratio}, val_ratio={val_ratio}"
        )

    with open(jsonl_file, "r") as f:
        # Skip blank lines (e.g. a trailing newline), which json.loads rejects.
        entries = [json.loads(line) for line in f if line.strip()]

    total_entries = len(entries)
    # Split boundaries, clamped so the tolerance can never push them past the end.
    train_end = min(total_entries, int(total_entries * train_ratio + eps))
    val_end = min(
        total_entries, int(total_entries * (train_ratio + val_ratio) + eps)
    )

    # Written in the same order as before: train, then validation, then test.
    splits = (
        (train_file, entries[:train_end]),
        (val_file, entries[train_end:val_end]),
        (test_file, entries[val_end:]),
    )
    for split_path, split_entries in splits:
        with open(split_path, "w") as f:
            for entry in split_entries:
                f.write(json.dumps(entry) + "\n")


# Split the full dataset index into train / val / test files using the
# function's default ratios; arguments are named so each path's role is explicit.
divide_jsonl_train_val_test(
    jsonl_file=output_jsonl_path,
    train_file="/home/yb107/cvpr2025/DukeDiffSeg/data/json/colon_train.jsonl",
    val_file="/home/yb107/cvpr2025/DukeDiffSeg/data/json/colon_val.jsonl",
    test_file="/home/yb107/cvpr2025/DukeDiffSeg/data/json/colon_test.jsonl",
)