In [4]:
import os
import json
import glob

# === Set output path for your .jsonl dataset description ===
output_jsonl_path = "/home/yb107/cvpr2025/DukeDiffSeg/data/json/3d_vlsm_dataset.jsonl"

# === Fixed base paths ===
image_dir = "/data/usr/yb107/3d_vlsm/images"
mask_root = "/data/usr/yb107/3d_vlsm/labels"

# Collect every .nii.gz mask in the labels directory.
# glob returns files in arbitrary order, so sort to make the resulting
# JSONL (and any order-based split derived from it) reproducible.
files = sorted(glob.glob(os.path.join(mask_root, "*.nii.gz")))
print(f"Found {len(files)} files in {mask_root}")
if not files:
    # Fail with a clear message instead of an IndexError on files[0].
    raise FileNotFoundError(f"No .nii.gz masks found in {mask_root}")
print(f"Files: {files[0]}")

entries = []
for mask_path in files:
    # The image shares the mask's file name but lives in image_dir.
    # Building the path from the basename avoids rewriting any other
    # "labels" substring that might occur in the path or file name.
    image_path = os.path.join(image_dir, os.path.basename(mask_path))

    # The mask came from glob, so it exists; the image is what can be missing.
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"Missing image for mask: {mask_path}")

    entries.append({"image": image_path, "mask": mask_path})

# === Write to .jsonl (one JSON object per line) ===
os.makedirs(os.path.dirname(output_jsonl_path), exist_ok=True)
with open(output_jsonl_path, "w") as f:
    for entry in entries:
        f.write(json.dumps(entry) + "\n")

print(f"[✓] Wrote {len(entries)} valid entries to {output_jsonl_path}")

Found 558 files in /data/usr/yb107/3d_vlsm/labels
Files: /data/usr/yb107/3d_vlsm/labels/Patient_00552_Study_84140_Series_03.nii.gz
[✓] Wrote 558 valid entries to /home/yb107/cvpr2025/DukeDiffSeg/data/json/3d_vlsm_dataset.jsonl


In [None]:
import json

# Source JSONL that gets divided into train/val/test below.
# NOTE(review): this is not the same file the dataset-building cell writes
# (.../data/json/3d_vlsm_dataset.jsonl) — confirm this is the intended input.
output_jsonl_path = "/home/yb107/cvpr2025/DukeDiffSeg/data/3d_vlsm_v2/3d_vlsm_v2_dataset.jsonl"


def divide_jsonl_train_val_test(
    jsonl_file,
    train_file,
    val_file,
    test_file,
    train_ratio=0.7,
    val_ratio=0.2,
    seed=None,
):
    """Divide a JSONL dataset into train, validation, and test JSONL files.

    Entries are read from ``jsonl_file`` (one JSON object per line; blank
    lines are ignored) and written, in order, to the three output files.
    The first ``train_ratio`` fraction goes to ``train_file``, the next
    ``val_ratio`` fraction to ``val_file``, and the remainder to
    ``test_file``.

    Args:
        jsonl_file: Path of the input JSONL file.
        train_file: Output path for the training subset.
        val_file: Output path for the validation subset.
        test_file: Output path for the test subset.
        train_ratio: Fraction of entries assigned to the training subset.
        val_ratio: Fraction of entries assigned to the validation subset.
        seed: If not None, entries are shuffled with ``random.Random(seed)``
            before splitting. The default (None) keeps the file order.

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more
            than 1.
    """
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            "train_ratio and val_ratio must be non-negative and sum to at most 1, "
            f"got train_ratio={train_ratio}, val_ratio={val_ratio}"
        )

    with open(jsonl_file, "r") as f:
        # Skip blank lines (e.g. a trailing newline) instead of crashing in json.loads.
        entries = [json.loads(line) for line in f if line.strip()]

    if seed is not None:
        import random

        random.Random(seed).shuffle(entries)

    total_entries = len(entries)

    def _cut(fraction):
        # Round before truncating: 0.7 + 0.2 is 0.8999999999999999 in binary
        # floating point, so a plain int(10 * (0.7 + 0.2)) yields 8, not 9.
        return min(total_entries, int(round(total_entries * fraction, 6)))

    train_end = _cut(train_ratio)
    val_end = _cut(train_ratio + val_ratio)

    subsets = (
        (train_file, entries[:train_end]),
        (val_file, entries[train_end:val_end]),
        (test_file, entries[val_end:]),
    )
    for path, subset in subsets:
        with open(path, "w") as f:
            for entry in subset:
                f.write(json.dumps(entry) + "\n")

# Split the dataset JSONL using the function's default ratios
# (70% train, 20% validation, remainder test).
divide_jsonl_train_val_test(
    jsonl_file=output_jsonl_path,
    train_file="/home/yb107/cvpr2025/DukeDiffSeg/data/json/3d_vlsm_train.jsonl",
    val_file="/home/yb107/cvpr2025/DukeDiffSeg/data/json/3d_vlsm_val.jsonl",
    test_file="/home/yb107/cvpr2025/DukeDiffSeg/data/json/3d_vlsm_test.jsonl",
)

# Copy Data from Drcc to Capri

In [None]:
import os
import subprocess

# === Copy settings (edit for your environment) ===
# Remote destination.
remote_host = "plp-capri"
remote_user = "yb107"
remote_target_dir = "/data/usr/yb107/colon_data/3d_vlsm/"
# Local directory holding the source images.
base_image_dir = "/scratch/railabs/ld258/projects/3d_vlsm/data/images/"

# NOTE(review): `common_all` is not defined in this cell; it must already
# exist in the kernel namespace — confirm which cell creates it.
case_names = list(common_all)
case_names.sort()

print(f"Total cases to copy: {len(case_names)}")
# # === Loop and copy ===
# for case in case_names:
#     # Find matching files in base_image_dir
#     matching_files = [f for f in os.listdir(base_image_dir) if f.startswith(case)]

#     for file in matching_files:
#         local_path = os.path.join(base_image_dir, file)

#         # Check if file exists
#         if not os.path.isfile(local_path):
#             print(f"[!] File not found: {local_path}")
#             continue

#         remote_path = f"{remote_user}@{remote_host}:{remote_target_dir}"

#         print(f"Copying {file} → {remote_path}")

#         # Run scp (will ask for password unless SSH keys are set up)
#         subprocess.run(["scp", local_path, remote_path])