# In [4]:
# Directory that is scanned for image volumes (".nii.gz" files).
IMAGE_PATH = "/NAS/user_data/user/yb107/abdomen_1k/unzipped/AbdomenCT-1K/"
# Directory in which the mask file matching each image is looked up.
MASK_PATH = "/NAS/user_data/user/yb107/abdomen_1k/unzipped/Mask/"

import os
import json


def create_jsonl_dataset(image_path, mask_path, output_file):
    """Pair every image in ``image_path`` with its mask and write a JSONL index.

    Each ``*.nii.gz`` file found directly inside ``image_path`` is matched to a
    mask of the same name inside ``mask_path``, after removing a trailing
    ``_0000`` channel suffix from the image name (for example
    ``Case_00001_0000.nii.gz`` -> ``Case_00001.nii.gz``). Images whose mask
    does not exist are reported with a warning on stdout and skipped.

    Args:
        image_path: Directory containing the image volumes.
        mask_path: Directory containing the mask volumes.
        output_file: Path of the JSONL file to write (overwritten if present).
            One line per pair: ``{"image": <path>, "mask": <path>}``.

    Returns:
        None. The result is written to ``output_file``.
    """
    extension = ".nii.gz"
    channel_suffix = "_0000"
    dataset = []

    # os.listdir() yields entries in arbitrary order; sorting keeps the output
    # (and any positional split derived from it) reproducible.
    for filename in sorted(os.listdir(image_path)):
        if not filename.endswith(extension):
            continue

        # Strip only the trailing channel suffix. A blanket
        # str.replace("_0000", "") would also eat the "_0000" inside an id
        # such as "_00001" and produce a mask name that never exists.
        stem = filename[: -len(extension)]
        if stem.endswith(channel_suffix):
            stem = stem[: -len(channel_suffix)]

        image_file = os.path.join(image_path, filename)
        mask_file = os.path.join(mask_path, stem + extension)

        if os.path.exists(mask_file):
            dataset.append({"image": image_file, "mask": mask_file})
        else:
            print(f"Warning: Mask file for {filename} not found.")

    with open(output_file, "w") as f:
        f.writelines(json.dumps(entry) + "\n" for entry in dataset)


# Build the full image/mask index for the AbdomenCT-1K data.
create_jsonl_dataset(
    image_path=IMAGE_PATH,
    mask_path=MASK_PATH,
    output_file="/home/yb107/cvpr2025/DukeDiffSeg/data/json/abdomen_1k.jsonl",
)

# JSONL file format (one JSON object per line):
# {"image": "/path/to/image.nii.gz", "mask": "/path/to/mask.nii.gz"}


def divide_jsonl_train_val_test(
    jsonl_file, train_file, val_file, test_file, train_ratio=0.7, val_ratio=0.2
):
    """Split a JSONL dataset into train, validation, and test JSONL files.

    Entries are split positionally, in the order they appear in
    ``jsonl_file``; no shuffling is performed. The first ``train_ratio``
    fraction goes to ``train_file``, the next ``val_ratio`` fraction to
    ``val_file``, and whatever remains to ``test_file``. Blank lines in the
    input are ignored.

    Args:
        jsonl_file: Path of the JSONL file to split.
        train_file: Output path for the training split.
        val_file: Output path for the validation split.
        test_file: Output path for the test split.
        train_ratio: Fraction of entries assigned to the training split.
        val_ratio: Fraction of entries assigned to the validation split.

    Raises:
        ValueError: If a ratio lies outside ``[0, 1]`` or the two ratios sum
            to more than 1.
    """
    if not (0 <= train_ratio <= 1 and 0 <= val_ratio <= 1):
        raise ValueError("train_ratio and val_ratio must each be within [0, 1].")
    if train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError("train_ratio + val_ratio must not exceed 1.")

    with open(jsonl_file, "r") as f:
        # Skip blank lines (e.g. a trailing newline) instead of crashing in
        # json.loads.
        entries = [json.loads(line) for line in f if line.strip()]

    total_entries = len(entries)
    # Round before truncating: binary floats make 0.7 + 0.2 evaluate to
    # 0.8999999999999999, so a bare int() would cut 10 * 0.9 down to 8.
    train_end = min(total_entries, int(round(total_entries * train_ratio, 6)))
    val_end = min(
        total_entries, int(round(total_entries * (train_ratio + val_ratio), 6))
    )

    splits = (
        (train_file, entries[:train_end]),
        (val_file, entries[train_end:val_end]),
        (test_file, entries[val_end:]),
    )
    for path, subset in splits:
        with open(path, "w") as f:
            f.writelines(json.dumps(entry) + "\n" for entry in subset)


# Split the full index into train/val/test files using the default ratios.
divide_jsonl_train_val_test(
    jsonl_file="/home/yb107/cvpr2025/DukeDiffSeg/data/json/abdomen_1k.jsonl",
    train_file="/home/yb107/cvpr2025/DukeDiffSeg/data/json/abdomen_1k_train.jsonl",
    val_file="/home/yb107/cvpr2025/DukeDiffSeg/data/json/abdomen_1k_val.jsonl",
    test_file="/home/yb107/cvpr2025/DukeDiffSeg/data/json/abdomen_1k_test.jsonl",
)

