In [8]:
import os
import json
import glob

# === Set output path for your .jsonl dataset description ===
# Alternative location (currently unused):
# output_jsonl_path = "/data/usr/yb107/colon_data/colon_dataset.jsonl"
output_jsonl_path = "/home/yb107/cvpr2025/DukeDiffSeg/data/mobina_mixed_colon_dataset/mobina_mixed_colon_dataset.jsonl"

# === Mask directories that are merged into a single dataset index ===
mask_roots = [
    "/data/usr/yb107/colon_data/refined_by_mobina/a_grade_colons_not_in_refined_by_md/masks/",
    "/data/usr/yb107/colon_data/refined_by_mobina/colon_refined_by_mobina/masks/",
    "/data/usr/yb107/colon_data/refined_by_mobina/female_cases_refined_by_md/masks/",
    "/data/usr/yb107/colon_data/refined_by_mobina/male_cases_refined_by_md/masks/",
]

# === Collect every .nii.gz mask, one root at a time ===
# Looping over mask_roots (instead of indexing [0]..[3]) keeps this correct when
# roots are added or removed. glob does not guarantee any ordering, so each
# root's listing is sorted to make the resulting .jsonl stable between runs.
files = []
for mask_root in mask_roots:
    files.extend(sorted(glob.glob(os.path.join(mask_root, "*.nii.gz"))))

print(f"Found {len(files)} files in total across all mask roots.")
if not files:
    # Fail with a clear message rather than an IndexError on files[0] below.
    raise FileNotFoundError(f"No .nii.gz masks found under any of: {mask_roots}")
print(f"Files: {files[0]}")

# === Build one record per mask ===
# Only the mask path is stored; no image path is paired with it here.
entries = []
for mask_path in files:
    # glob may return paths whose target is gone (e.g. a dangling symlink),
    # so confirm the file is really there before indexing it.
    if not os.path.exists(mask_path):
        raise FileNotFoundError(f"Missing mask for: {mask_path}")
    entries.append({"mask": mask_path})

# === Write to .jsonl (one JSON object per line) ===
with open(output_jsonl_path, "w") as f:
    for entry in entries:
        f.write(json.dumps(entry) + "\n")

print(f"[✓] Wrote {len(entries)} valid entries to {output_jsonl_path}")

Found 419 files in total across all mask roots.
Files: /data/usr/yb107/colon_data/refined_by_mobina/a_grade_colons_not_in_refined_by_md/masks/Patient_00101_Study_73554_Series_04.nii.gz
[✓] Wrote 419 valid entries to /home/yb107/cvpr2025/DukeDiffSeg/data/mobina_mixed_colon_dataset/mobina_mixed_colon_dataset.jsonl


In [9]:
import json
import random

# Combined mask index that the split below reads from (the same path the
# dataset-building cell writes to).
output_jsonl_path = "/home/yb107/cvpr2025/DukeDiffSeg/data/mobina_mixed_colon_dataset/mobina_mixed_colon_dataset.jsonl"


def divide_jsonl_train_val_test(
    jsonl_file,
    train_file,
    val_file,
    test_file,
    train_ratio=0.85,
    val_ratio=0.10,
    seed=None,
):
    """Shuffle a JSONL dataset and write train, validation and test splits.

    Args:
        jsonl_file: Path of the input file, one JSON object per line. Blank
            lines are ignored.
        train_file: Output path for the training split.
        val_file: Output path for the validation split.
        test_file: Output path for the test split (everything left over after
            the train and validation entries have been taken).
        train_ratio: Fraction of entries assigned to the training split.
        val_ratio: Fraction of entries assigned to the validation split.
        seed: Optional seed for the shuffle. When given, the same input always
            yields the same split and the global ``random`` state is left
            untouched. When ``None`` the module-level ``random`` generator is
            used, as before.

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more
            than 1.
    """
    # Small tolerance so float sums that should equal exactly 1 are not rejected.
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            "train_ratio and val_ratio must be non-negative and sum to at most 1, "
            f"got train_ratio={train_ratio}, val_ratio={val_ratio}"
        )

    with open(jsonl_file, "r") as f:
        # Skip blank lines; json.loads would raise on them.
        entries = [json.loads(line) for line in f if line.strip()]

    # A private generator keeps a seeded split from disturbing global state.
    rng = random.Random(seed) if seed is not None else random
    rng.shuffle(entries)

    total_entries = len(entries)
    train_end = int(total_entries * train_ratio)
    val_end = int(total_entries * (train_ratio + val_ratio))

    splits = (
        (train_file, entries[:train_end]),
        (val_file, entries[train_end:val_end]),
        (test_file, entries[val_end:]),
    )
    for split_path, split_entries in splits:
        with open(split_path, "w") as f:
            for entry in split_entries:
                f.write(json.dumps(entry) + "\n")


# Split the combined index into the train / val / test files that sit next to
# it, using the function's default ratios.
divide_jsonl_train_val_test(
    jsonl_file=output_jsonl_path,
    train_file="/home/yb107/cvpr2025/DukeDiffSeg/data/mobina_mixed_colon_dataset/mobina_mixed_colon_train.jsonl",
    val_file="/home/yb107/cvpr2025/DukeDiffSeg/data/mobina_mixed_colon_dataset/mobina_mixed_colon_val.jsonl",
    test_file="/home/yb107/cvpr2025/DukeDiffSeg/data/mobina_mixed_colon_dataset/mobina_mixed_colon_test.jsonl",
)

# Copy Data from Drcc to Capri

In [None]:
import os
import subprocess

# === Set these ===
# Local source directory plus the SSH user, host and destination directory
# referenced by the (currently commented-out) scp loop further down.
base_image_dir = "/scratch/railabs/ld258/projects/3d_vlsm/data/images/"
remote_user = "yb107"
remote_host = "plp-capri"
remote_target_dir = "/data/usr/yb107/colon_data/3d_vlsm/"

# NOTE(review): `common_all` is not defined in any cell visible here, so this
# line depends on kernel state created elsewhere — confirm where it is built
# before running this cell on a fresh kernel.
case_names = sorted(common_all)

print(f"Total cases to copy: {len(case_names)}")
# The copy loop below is disabled; as written, this cell only reports the
# number of cases.
# # === Loop and copy ===
# for case in case_names:
#     # Find matching files in base_image_dir
#     matching_files = [f for f in os.listdir(base_image_dir) if f.startswith(case)]

#     for file in matching_files:
#         local_path = os.path.join(base_image_dir, file)

#         # Check if file exists
#         if not os.path.isfile(local_path):
#             print(f"[!] File not found: {local_path}")
#             continue

#         remote_path = f"{remote_user}@{remote_host}:{remote_target_dir}"

#         print(f"Copying {file} → {remote_path}")

#         # Run scp (will ask for password unless SSH keys are set up)
#         subprocess.run(["scp", local_path, remote_path])

In [5]:
# Directories that are scanned for *.nrrd files to convert.
nrrd_base_paths = [
    f"/data/usr/yb107/colon_data/refined_by_mobina/{subdir}/"
    for subdir in (
        "colon_refined_by_mobina",
        "female_cases_refined_by_md",
        "male_cases_refined_by_md",
    )
]

import SimpleITK as sitk
from multiprocessing import Pool
import os


def process_nrrd_file(nrrd_path):
    """Convert one NRRD file to NIfTI inside a ``masks`` sub-directory.

    The output is written to ``<dir of nrrd_path>/masks/<stem>.nii.gz``. The
    stem is everything before the first ``.nii.gz`` in the file name (so
    ``X.nii.gz.nii.seg.nrrd`` becomes ``X``). If the name contains no
    ``.nii.gz``, a trailing ``.seg.nrrd`` or ``.nrrd`` is stripped instead, so
    the NRRD extension never leaks into the output name.

    Args:
        nrrd_path: Path of the NRRD file to convert.

    Returns:
        The path of the written NIfTI file, or ``None`` if the conversion
        failed. Failures are printed, not raised, so one bad file does not
        abort a batch run.
    """
    try:
        # Read first, so a file that cannot be read leaves no empty masks/ dir.
        image = sitk.ReadImage(nrrd_path)

        basename = os.path.basename(nrrd_path)
        if ".nii.gz" in basename:
            filename = basename.split(".nii.gz")[0]
        else:
            filename = basename
            # Check the longer suffix first so "X.seg.nrrd" becomes "X", not "X.seg".
            for suffix in (".seg.nrrd", ".nrrd"):
                if filename.endswith(suffix):
                    filename = filename[: -len(suffix)]
                    break

        masks_dir = os.path.join(os.path.dirname(nrrd_path), "masks")
        os.makedirs(masks_dir, exist_ok=True)
        nifti_path = os.path.join(masks_dir, f"{filename}.nii.gz")

        sitk.WriteImage(image, nifti_path)
        print(f"Converted {nrrd_path} to {nifti_path}")
        return nifti_path
    except Exception as e:
        print(f"Error processing {nrrd_path}: {e}")
        return None


def convert_nrrd_to_nifti(nrrd_base_paths):
    """Convert every ``*.nrrd`` file directly inside the given directories.

    Each file is handed to ``process_nrrd_file`` in a worker pool.

    Args:
        nrrd_base_paths: Iterable of directory paths to scan (non-recursive).

    Returns:
        The list of NRRD paths that were found and submitted for conversion,
        in directory order and sorted within each directory. The list is
        empty when nothing matched. Per-file outcomes are not reported here.
    """
    # Imported locally: this cell's import block does not bring `glob` in, so
    # relying on an earlier cell having imported it fails on a fresh kernel.
    import glob

    nrrd_files = []
    for base_path in nrrd_base_paths:
        # glob does not guarantee ordering; sort for a stable processing order.
        nrrd_files.extend(sorted(glob.glob(os.path.join(base_path, "*.nrrd"))))

    print(f"Found {len(nrrd_files)} NRRD files to convert.")

    if not nrrd_files:
        # Nothing to do, so do not spin up a process pool.
        return nrrd_files

    # Never start more workers than there are files; cpu_count() may be None.
    worker_count = min(os.cpu_count() or 1, len(nrrd_files))
    with Pool(processes=worker_count) as pool:
        pool.map(process_nrrd_file, nrrd_files)

    return nrrd_files


# Run the conversion for every directory listed in nrrd_base_paths.
convert_nrrd_to_nifti(nrrd_base_paths)

Found 245 NRRD files to convert.
Converted /data/usr/yb107/colon_data/refined_by_mobina/colon_refined_by_mobina/Patient_00927_Study_17475_Series_03.nii.gz.nii.seg.nrrd to /data/usr/yb107/colon_data/refined_by_mobina/colon_refined_by_mobina/masks/Patient_00927_Study_17475_Series_03.nii.gz
Converted /data/usr/yb107/colon_data/refined_by_mobina/colon_refined_by_mobina/Patient_00351_Study_77248_Series_03.nii.gz.nii.seg.nrrd to /data/usr/yb107/colon_data/refined_by_mobina/colon_refined_by_mobina/masks/Patient_00351_Study_77248_Series_03.nii.gz
Converted /data/usr/yb107/colon_data/refined_by_mobina/colon_refined_by_mobina/Patient_00895_Study_77626_Series_03.nii.gz.nii.seg.nrrd to /data/usr/yb107/colon_data/refined_by_mobina/colon_refined_by_mobina/masks/Patient_00895_Study_77626_Series_03.nii.gzConverted /data/usr/yb107/colon_data/refined_by_mobina/colon_refined_by_mobina/Patient_00138_Study_73227_Series_03.nii.gz.nii.seg.nrrd to /data/usr/yb107/colon_data/refined_by_mobina/colon_refined_by_