In [9]:
import monai
from monai import transforms
import os
import functools
from monai.utils.enums import CommonKeys as Keys
from utils.data import (
    add_spacing,
    binary_mask_labels,
    remove_labels,
    transform_labels,
    list_from_jsonl,
    dataset_depended_transform_labels,
    mask_to_sdf,
    sdf_to_mask,
    MaskToSDFd,
)
from utils.monai_transforms import CropForegroundAxisd, SmoothColonMaskd
import torch

In [None]:
def get_transforms(train=True):
    """Get the MONAI transforms for training or validation."""

    def custom_name_formatter(meta_dict, saver):
        full_path = meta_dict["filename_or_obj"]
        base = os.path.basename(full_path)
        # If the filename itself contains "colon", pull the parent folder as the ID

        if "labels" in full_path.lower():
            postfix = "_label"
        else:
            postfix = "_image"

        return {"filename": f"{base.replace('.nii.gz', '')}"}

    data_keys = [Keys.IMAGE, Keys.LABEL]

    custom_transforms = [
        transforms.LoadImaged(keys=data_keys),
        transforms.EnsureChannelFirstd(keys=data_keys),
        transforms.Spacingd(
            keys=data_keys,
            pixdim=[1.5, 1.5, 2.0],
            mode="nearest",
        ),
        transforms.Orientationd(
            keys=data_keys,
            axcodes="RAS",
        ),
        transforms.KeepLargestConnectedComponentd(
            keys=[Keys.LABEL, Keys.IMAGE],
        ),
        transforms.Lambdad(
            keys=[Keys.IMAGE, Keys.LABEL],
            func=functools.partial(
                dataset_depended_transform_labels,
            ),
        ),
    ]

    return transforms.Compose(custom_transforms)

In [None]:
import os
import json
import numpy as np
import nibabel as nib
from scipy import ndimage
from skimage.morphology import skeletonize
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path


def compute_colon_length_from_skeleton(mask_array, spacing):
    """
    Compute colon length using skeletonization method.

    Args:
        mask_array: Binary mask of colon (numpy array)
        spacing: Voxel spacing in mm [x, y, z]

    Returns:
        length_mm: Estimated colon length in millimeters
    """
    # Ensure binary mask
    binary_mask = (mask_array > 0).astype(np.uint8)

    # Skeletonize the colon
    skeleton = skeletonize(binary_mask)

    # Get skeleton coordinates
    coords = np.argwhere(skeleton > 0)

    if len(coords) < 2:
        return 0.0

    # Calculate length by summing distances between consecutive skeleton points
    # This is an approximation; more sophisticated methods trace the actual path
    length_voxels = np.sum(skeleton)

    # Convert to physical distance using spacing
    # Average spacing for isotropic approximation
    avg_spacing = np.mean(spacing)
    length_mm = length_voxels * avg_spacing

    return length_mm


def compute_colon_length_centerline(mask_array, spacing):
    """
    More sophisticated centerline-based length calculation.
    Uses distance transform and skeleton to trace the colon path.

    Args:
        mask_array: Binary mask of colon
        spacing: Voxel spacing [x, y, z] in mm

    Returns:
        length_mm: Colon length in millimeters
    """
    binary_mask = (mask_array > 0).astype(np.uint8)

    # Compute skeleton
    skeleton = skeletonize(binary_mask)
    skeleton_coords = np.argwhere(skeleton > 0)

    if len(skeleton_coords) < 2:
        return 0.0

    # Build a graph of skeleton points and find the longest path
    # For simplicity, we'll sum edge lengths in the skeleton
    # Scale by spacing to get physical distance

    total_length = 0.0

    # For each skeleton voxel, check 26-connected neighbors
    for i in range(len(skeleton_coords) - 1):
        coord1 = skeleton_coords[i]
        coord2 = skeleton_coords[i + 1]

        # Calculate Euclidean distance in physical space
        diff = (coord1 - coord2) * spacing
        distance = np.linalg.norm(diff)
        total_length += distance

    return total_length


def load_and_measure_colon(file_path, label_value=None, spacing=[1.5, 1.5, 2.0]):
    """
    Load a NIfTI file and measure colon length.

    Args:
        file_path: Path to .nii.gz file
        label_value: If multi-label, specify colon label (1 for your case).
                     If None, treats as binary mask
        spacing: Physical spacing after preprocessing

    Returns:
        length_mm: Colon length in millimeters
        volume_ml: Colon volume in milliliters
    """
    try:
        # Load NIfTI
        nii = nib.load(file_path)
        data = nii.get_fdata()

        # Extract colon mask
        if label_value is not None:
            colon_mask = data == label_value
        else:
            colon_mask = data > 0

        # Calculate volume
        voxel_volume_mm3 = np.prod(spacing)
        volume_ml = np.sum(colon_mask) * voxel_volume_mm3 / 1000.0

        # Calculate length
        length_mm = compute_colon_length_centerline(colon_mask, spacing)

        return length_mm, volume_ml

    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None, None


def process_training_data(jsonl_path, spacing=[1.5, 1.5, 2.0]):
    """
    Process training data and extract colon measurements.

    Args:
        jsonl_path: Path to training data JSONL file
        spacing: Physical spacing

    Returns:
        DataFrame with measurements
    """
    results = []

    with open(jsonl_path, "r") as f:
        for line in f:
            data = json.loads(line)
            mask_path = data["mask"]

            # Extract patient ID
            patient_id = os.path.basename(mask_path).replace(".nii.gz", "")

            # Measure colon (assuming label 1 is colon after dataset_depended_transform_labels)
            length_mm, volume_ml = load_and_measure_colon(
                mask_path, label_value=1, spacing=spacing  # Colon label
            )

            if length_mm is not None:
                results.append(
                    {
                        "patient_id": patient_id,
                        "dataset": "training_a_grade",
                        "length_mm": length_mm,
                        "length_cm": length_mm / 10.0,
                        "volume_ml": volume_ml,
                        "file_path": mask_path,
                    }
                )

    return pd.DataFrame(results)


def process_c_grade_data(jsonl_path, pred_dir=None, spacing=[1.5, 1.5, 2.0]):
    """
    Process C-grade data (original masks and predictions if available).

    Args:
        jsonl_path: Path to C-grade JSONL file
        pred_dir: Directory containing _pred.nii.gz files
        spacing: Physical spacing

    Returns:
        DataFrame with measurements
    """
    results = []

    with open(jsonl_path, "r") as f:
        for line in f:
            data = json.loads(line)
            mask_path = data["mask"]

            # Extract patient ID
            patient_id = os.path.basename(mask_path).replace(".nii.gz", "")

            # Measure original C-grade mask
            length_mm, volume_ml = load_and_measure_colon(
                mask_path, label_value=2, spacing=spacing
            )

            if length_mm is not None:
                results.append(
                    {
                        "patient_id": patient_id,
                        "dataset": "c_grade_original",
                        "length_mm": length_mm,
                        "length_cm": length_mm / 10.0,
                        "volume_ml": volume_ml,
                        "file_path": mask_path,
                    }
                )

            # Measure predicted mask if available
            if pred_dir is not None:
                pred_path = os.path.join(pred_dir, f"{patient_id}_pred.nii.gz")
                if os.path.exists(pred_path):
                    length_mm_pred, volume_ml_pred = load_and_measure_colon(
                        pred_path,
                        label_value=None,  # Binary prediction
                        spacing=spacing,
                    )

                    if length_mm_pred is not None:
                        results.append(
                            {
                                "patient_id": patient_id,
                                "dataset": "c_grade_predicted",
                                "length_mm": length_mm_pred,
                                "length_cm": length_mm_pred / 10.0,
                                "volume_ml": volume_ml_pred,
                                "file_path": pred_path,
                            }
                        )

    return pd.DataFrame(results)


def create_comparison_plots(df, output_dir="plots"):
    """
    Create comprehensive visualization plots for colon length distributions.

    Args:
        df: DataFrame with measurements
        output_dir: Directory to save plots
    """
    os.makedirs(output_dir, exist_ok=True)

    # Plot 1: Distribution comparison (box plot)
    plt.figure(figsize=(12, 6))
    datasets = df["dataset"].unique()
    data_to_plot = [df[df["dataset"] == d]["length_cm"].dropna() for d in datasets]

    plt.boxplot(data_to_plot, labels=datasets)
    plt.ylabel("Colon Length (cm)", fontsize=12)
    plt.title(
        "Colon Length Distribution Across Datasets", fontsize=14, fontweight="bold"
    )
    plt.xticks(rotation=15, ha="right")
    plt.grid(axis="y", alpha=0.3)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, "length_distribution_boxplot.png"), dpi=300)
    plt.close()

    # Plot 2: Histogram comparison
    plt.figure(figsize=(12, 6))
    for dataset in datasets:
        subset = df[df["dataset"] == dataset]["length_cm"].dropna()
        plt.hist(subset, bins=20, alpha=0.5, label=dataset, edgecolor="black")

    plt.xlabel("Colon Length (cm)", fontsize=12)
    plt.ylabel("Frequency", fontsize=12)
    plt.title("Colon Length Distribution Histogram", fontsize=14, fontweight="bold")
    plt.legend()
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, "length_distribution_histogram.png"), dpi=300)
    plt.close()

    # Plot 3: Paired comparison for C-grade (original vs predicted)
    c_grade_orig = df[df["dataset"] == "c_grade_original"].set_index("patient_id")
    c_grade_pred = df[df["dataset"] == "c_grade_predicted"].set_index("patient_id")

    # Find common patients
    common_patients = c_grade_orig.index.intersection(c_grade_pred.index)

    if len(common_patients) > 0:
        plt.figure(figsize=(10, 10))

        orig_lengths = c_grade_orig.loc[common_patients, "length_cm"]
        pred_lengths = c_grade_pred.loc[common_patients, "length_cm"]

        plt.scatter(orig_lengths, pred_lengths, alpha=0.6, s=100)

        # Add diagonal line (perfect agreement)
        min_val = min(orig_lengths.min(), pred_lengths.min())
        max_val = max(orig_lengths.max(), pred_lengths.max())
        plt.plot(
            [min_val, max_val],
            [min_val, max_val],
            "r--",
            linewidth=2,
            label="Perfect Agreement",
        )

        plt.xlabel("Original C-Grade Length (cm)", fontsize=12)
        plt.ylabel("Predicted Length (cm)", fontsize=12)
        plt.title(
            "C-Grade: Original vs Predicted Colon Length",
            fontsize=14,
            fontweight="bold",
        )
        plt.legend()
        plt.grid(alpha=0.3)
        plt.axis("equal")
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, "c_grade_comparison_scatter.png"), dpi=300)
        plt.close()

        # Plot 4: Improvement plot (difference)
        differences = pred_lengths - orig_lengths

        plt.figure(figsize=(12, 6))
        plt.bar(
            range(len(differences)),
            differences.values,
            color=["green" if d > 0 else "red" for d in differences.values],
            alpha=0.7,
            edgecolor="black",
        )
        plt.axhline(y=0, color="black", linestyle="-", linewidth=1)
        plt.xlabel("Patient Index", fontsize=12)
        plt.ylabel("Length Difference (Predicted - Original) cm", fontsize=12)
        plt.title(
            "Colon Length Change: Predicted vs Original C-Grade",
            fontsize=14,
            fontweight="bold",
        )
        plt.grid(axis="y", alpha=0.3)
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, "length_improvement.png"), dpi=300)
        plt.close()

    # Generate summary statistics
    summary = (
        df.groupby("dataset")["length_cm"]
        .agg(["count", "mean", "std", "min", "max", "median"])
        .round(2)
    )

    print("\n" + "=" * 60)
    print("COLON LENGTH SUMMARY STATISTICS")
    print("=" * 60)
    print(summary)
    print("=" * 60 + "\n")

    # Save summary to CSV
    summary.to_csv(os.path.join(output_dir, "length_summary_statistics.csv"))

    return summary


def main():
    """
    Main execution function.
    """
    # Define paths - UPDATE THESE TO YOUR ACTUAL PATHS
    training_jsonl = "/home/yb107/cvpr2025/DukeDiffSeg/data/mobina_mixed_colon_dataset/mobina_mixed_colon_dataset_with_body_filled.jsonl"
    c_grade_jsonl = "/home/yb107/cvpr2025/DukeDiffSeg/data/c_grade_colons/3d_vlsmv2_c_grade_colon_dataset_with_body_filled.jsonl"
    pred_dir = "/home/yb107/cvpr2025/DukeDiffSeg/outputs/diffunet-binary-colon/5.1/inference_c_grade_550_gs_2.0_final_small_with_skeletonization"

    spacing = [1.5, 1.5, 2.0]  # From your transforms

    print("Processing training data...")
    df_training = process_training_data(training_jsonl, spacing)

    print("Processing C-grade data...")
    df_c_grade = process_c_grade_data(c_grade_jsonl, pred_dir, spacing)

    # Combine all results
    df_all = pd.concat([df_training, df_c_grade], ignore_index=True)

    # Save raw data
    df_all.to_csv("colon_length_measurements.csv", index=False)
    print(
        f"\nSaved measurements for {len(df_all)} cases to colon_length_measurements.csv"
    )

    # Create plots
    print("\nGenerating visualization plots...")
    summary = create_comparison_plots(df_all, output_dir="plots")

    print("\nAnalysis complete! Check the 'plots' directory for visualizations.")


main()

Processing training data...
Error processing /data/usr/yb107/colon_data/refined_by_mobina/a_grade_colons_not_in_refined_by_md/masks/Patient_00101_Study_73554_Series_04.nii.gz: name 'skeletonize_3d' is not defined
Error processing /data/usr/yb107/colon_data/refined_by_mobina/a_grade_colons_not_in_refined_by_md/masks/Patient_02190_Study_01823_Series_03.nii.gz: name 'skeletonize_3d' is not defined
Error processing /data/usr/yb107/colon_data/refined_by_mobina/a_grade_colons_not_in_refined_by_md/masks/Patient_01769_Study_52102_Series_03.nii.gz: name 'skeletonize_3d' is not defined
Error processing /data/usr/yb107/colon_data/refined_by_mobina/a_grade_colons_not_in_refined_by_md/masks/Patient_00248_Study_02847_Series_03.nii.gz: name 'skeletonize_3d' is not defined
Error processing /data/usr/yb107/colon_data/refined_by_mobina/a_grade_colons_not_in_refined_by_md/masks/Patient_02065_Study_50544_Series_05.nii.gz: name 'skeletonize_3d' is not defined
Error processing /data/usr/yb107/colon_data/ref

KeyboardInterrupt: 

In [13]:
!pip install seaborn

[0mCollecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[0mInstalling collected packages: seaborn
Successfully installed seaborn-0.13.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [15]:
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import functools
from typing import Dict, List, Tuple

import torch
from monai import transforms
from monai.data import Dataset, DataLoader
from monai.transforms import MapTransform
from scipy import ndimage
from skimage.morphology import skeletonize
from scipy.spatial.distance import pdist, squareform


# Your existing transform setup
class Keys:
    IMAGE = "image"
    LABEL = "label"
    MASK = "mask"
    BODY_FILLED_MASK = "body_filled_mask"


def dataset_depended_transform_labels(data, label_mapping=None):
    """
    Your existing label transformation function.
    Maps labels to standardized organ labels:
      1: colon
      2: rectum
      3: small_bowel
      etc.
    """
    # Placeholder - implement your actual transform logic here
    # This should handle the label mapping from your multi-organ segmentation
    return data


def get_data_dicts_from_jsonl(
    jsonl_path: str, dataset_type: str = "training"
) -> List[Dict]:
    """
    Load data from JSONL and create MONAI-compatible data dictionaries.

    Args:
        jsonl_path: Path to JSONL file
        dataset_type: "training", "c_grade_original", or "c_grade_predicted"

    Returns:
        List of data dictionaries
    """
    data_dicts = []

    with open(jsonl_path, "r") as f:
        for line in f:
            data = json.loads(line.strip())

            if dataset_type == "training":
                # Training data has mask and body_filled_mask
                # Need to infer image path from mask path
                mask_path = data["mask"]
                image_path = mask_path.replace("/masks/", "/images/").replace(
                    "_mask", ""
                )

                data_dict = {
                    Keys.IMAGE: image_path,
                    Keys.LABEL: mask_path,
                    Keys.MASK: mask_path,
                    Keys.BODY_FILLED_MASK: data["body_filled_mask"],
                    "dataset_type": dataset_type,
                    "patient_id": os.path.basename(mask_path).replace(".nii.gz", ""),
                }

            elif dataset_type == "c_grade_original":
                # C-grade data has image, mask, and body_filled_mask
                data_dict = {
                    Keys.IMAGE: data["image"],
                    Keys.LABEL: data["mask"],
                    Keys.MASK: data["mask"],
                    Keys.BODY_FILLED_MASK: data["body_filled_mask"],
                    "dataset_type": dataset_type,
                    "patient_id": os.path.basename(data["mask"]).replace(".nii.gz", ""),
                }

            data_dicts.append(data_dict)

    return data_dicts


def get_c_grade_prediction_dicts(jsonl_path: str, pred_dir: str) -> List[Dict]:
    """
    Create data dicts for C-grade predictions.

    Args:
        jsonl_path: Path to original C-grade JSONL
        pred_dir: Directory containing _pred.nii.gz files

    Returns:
        List of data dictionaries with predictions
    """
    data_dicts = []

    with open(jsonl_path, "r") as f:
        for line in f:
            data = json.loads(line.strip())
            patient_id = os.path.basename(data["mask"]).replace(".nii.gz", "")
            pred_path = os.path.join(pred_dir, f"{patient_id}_pred.nii.gz")

            if os.path.exists(pred_path):
                # For predictions, we use the pred as the label (binary mask)
                data_dict = {
                    Keys.IMAGE: data["image"],
                    Keys.LABEL: pred_path,  # Use prediction as label
                    Keys.MASK: pred_path,
                    "dataset_type": "c_grade_predicted",
                    "patient_id": patient_id,
                    "is_binary_pred": True,  # Flag to skip label transformation
                }
                data_dicts.append(data_dict)

    return data_dicts


def get_transforms_for_analysis(is_binary_pred=False):
    """
    Get MONAI transforms for analysis (modified from your get_transforms).

    Args:
        is_binary_pred: If True, skip label transformation for binary predictions

    Returns:
        Composed transforms
    """
    data_keys = [Keys.IMAGE, Keys.LABEL]

    custom_transforms = [
        transforms.LoadImaged(keys=data_keys, image_only=False),
        transforms.EnsureChannelFirstd(keys=data_keys),
        transforms.Spacingd(
            keys=data_keys,
            pixdim=[1.5, 1.5, 2.0],
            mode=("bilinear", "nearest"),
        ),
        transforms.Orientationd(
            keys=data_keys,
            axcodes="RAS",
        ),
    ]

    # Only apply label transformation and connected component for non-binary predictions
    if not is_binary_pred:
        custom_transforms.extend(
            [
                transforms.KeepLargestConnectedComponentd(
                    keys=[Keys.LABEL],
                    applied_labels=[1],  # Only for colon label
                ),
                transforms.Lambdad(
                    keys=[Keys.LABEL],
                    func=functools.partial(dataset_depended_transform_labels),
                ),
            ]
        )
    else:
        # For binary predictions, just ensure it's binary
        custom_transforms.append(
            transforms.Lambdad(
                keys=[Keys.LABEL],
                func=lambda x: (x > 0).astype(np.float32),
            )
        )

    return transforms.Compose(custom_transforms)


class ColonLengthMetricsd(MapTransform):
    """
    MONAI transform to compute colon length and volume metrics.
    """

    def __init__(
        self,
        keys: List[str],
        colon_label: int = 1,
        spacing: List[float] = [1.5, 1.5, 2.0],
        is_binary: bool = False,
        allow_missing_keys: bool = False,
    ):
        """
        Args:
            keys: Keys to compute metrics for (typically [Keys.LABEL])
            colon_label: Label value for colon (1 by default)
            spacing: Physical spacing in mm
            is_binary: If True, treat as binary mask instead of multi-label
            allow_missing_keys: Whether to allow missing keys
        """
        super().__init__(keys, allow_missing_keys)
        self.colon_label = colon_label
        self.spacing = np.array(spacing)
        self.is_binary = is_binary

    def _compute_skeleton_length(self, binary_mask: np.ndarray) -> float:
        """
        Compute colon length using 3D skeletonization.

        Args:
            binary_mask: Binary colon mask

        Returns:
            Length in millimeters
        """
        if binary_mask.sum() == 0:
            return 0.0

        # Ensure 3D array (remove channel dimension if present)
        if binary_mask.ndim == 4:
            binary_mask = binary_mask[0]

        # Skeletonize
        skeleton = skeletonize(binary_mask.astype(np.uint8), method="lee")

        # Get skeleton coordinates
        skeleton_points = np.argwhere(skeleton > 0)

        if len(skeleton_points) < 2:
            return 0.0

        # Method 1: Simple skeleton voxel count scaled by average spacing
        # This is fast but less accurate
        # length_mm = skeleton.sum() * np.mean(self.spacing)

        # Method 2: Sum of distances between consecutive points
        # Sort points to create a path (simplified - not optimal path)
        # For better results, you'd use minimum spanning tree or graph algorithms

        # Calculate pairwise distances in physical space
        skeleton_points_physical = skeleton_points * self.spacing

        # Use minimum spanning tree approach for better path estimation
        length_mm = self._compute_path_length_mst(skeleton_points_physical)

        return length_mm

    def _compute_path_length_mst(self, points: np.ndarray) -> float:
        """
        Compute path length using Minimum Spanning Tree approach.
        This provides a better approximation of the actual colon path.

        Args:
            points: Skeleton points in physical coordinates (N x 3)

        Returns:
            Total path length in mm
        """
        if len(points) < 2:
            return 0.0

        # For very long paths, sample points to avoid memory issues
        max_points = 5000
        if len(points) > max_points:
            indices = np.random.choice(len(points), max_points, replace=False)
            points = points[indices]

        # Compute pairwise distances
        try:
            distances = pdist(points, metric="euclidean")
            dist_matrix = squareform(distances)

            # Simple MST using Prim's algorithm
            n_points = len(points)
            visited = np.zeros(n_points, dtype=bool)
            visited[0] = True
            total_length = 0.0

            for _ in range(n_points - 1):
                min_dist = np.inf
                min_idx = -1

                for i in range(n_points):
                    if visited[i]:
                        for j in range(n_points):
                            if not visited[j] and dist_matrix[i, j] < min_dist:
                                min_dist = dist_matrix[i, j]
                                min_idx = j

                if min_idx != -1:
                    visited[min_idx] = True
                    total_length += min_dist

            return total_length

        except MemoryError:
            # Fallback: simple consecutive point distances
            sorted_indices = np.lexsort((points[:, 2], points[:, 1], points[:, 0]))
            sorted_points = points[sorted_indices]
            diffs = np.diff(sorted_points, axis=0)
            distances = np.linalg.norm(diffs, axis=1)
            return np.sum(distances)

    def _compute_centerline_length(self, binary_mask: np.ndarray) -> float:
        """
        Alternative method: Compute length using distance transform and centerline.
        More accurate for tubular structures.

        Args:
            binary_mask: Binary colon mask

        Returns:
            Length in millimeters
        """
        if binary_mask.sum() == 0:
            return 0.0

        if binary_mask.ndim == 4:
            binary_mask = binary_mask[0]

        # Compute distance transform
        distance = ndimage.distance_transform_edt(binary_mask, sampling=self.spacing)

        # Find centerline by skeletonizing the mask
        skeleton = skeletonize(binary_mask.astype(np.uint8), method="lee")

        # Get skeleton coordinates
        skeleton_coords = np.argwhere(skeleton > 0)

        if len(skeleton_coords) < 2:
            return 0.0

        # Convert to physical coordinates
        skeleton_physical = skeleton_coords * self.spacing

        # Compute path length
        length_mm = self._compute_path_length_mst(skeleton_physical)

        return length_mm

    def _compute_volume(self, binary_mask: np.ndarray) -> float:
        """
        Compute colon volume in milliliters.

        Args:
            binary_mask: Binary colon mask

        Returns:
            Volume in milliliters
        """
        if binary_mask.ndim == 4:
            binary_mask = binary_mask[0]

        voxel_volume_mm3 = np.prod(self.spacing)
        volume_ml = binary_mask.sum() * voxel_volume_mm3 / 1000.0

        return float(volume_ml)

    def _compute_surface_area(self, binary_mask: np.ndarray) -> float:
        """
        Compute approximate surface area using voxel counting.

        Args:
            binary_mask: Binary colon mask

        Returns:
            Surface area in cm²
        """
        if binary_mask.ndim == 4:
            binary_mask = binary_mask[0]

        # Count boundary voxels (simple approach)
        # More sophisticated: use marching cubes for actual surface
        eroded = ndimage.binary_erosion(binary_mask)
        boundary = binary_mask & ~eroded

        # Approximate surface area
        voxel_face_area = np.mean(
            [
                self.spacing[i] * self.spacing[j]
                for i in range(3)
                for j in range(i + 1, 3)
            ]
        )
        surface_area_cm2 = boundary.sum() * voxel_face_area / 100.0

        return float(surface_area_cm2)

    def __call__(self, data: Dict) -> Dict:
        """
        Apply the transform to compute metrics.

        Args:
            data: Data dictionary

        Returns:
            Data dictionary with added metrics
        """
        d = dict(data)

        for key in self.key_iterator(d):
            label_array = d[key]

            # Convert to numpy if tensor
            if torch.is_tensor(label_array):
                label_array = label_array.numpy()

            # Extract colon mask
            if self.is_binary:
                colon_mask = (label_array > 0).astype(np.uint8)
            else:
                colon_mask = (label_array == self.colon_label).astype(np.uint8)

            # Compute metrics
            length_skeleton = self._compute_skeleton_length(colon_mask)
            length_centerline = self._compute_centerline_length(colon_mask)
            volume = self._compute_volume(colon_mask)
            surface_area = self._compute_surface_area(colon_mask)

            # Add metrics to data dictionary
            d["colon_length_skeleton_mm"] = length_skeleton
            d["colon_length_centerline_mm"] = length_centerline
            d["colon_length_cm"] = length_centerline / 10.0  # Use centerline as primary
            d["colon_volume_ml"] = volume
            d["colon_surface_area_cm2"] = surface_area

            # Additional derived metrics
            if volume > 0 and length_centerline > 0:
                d["colon_avg_diameter_mm"] = np.sqrt(
                    volume * 1000 / (np.pi * length_centerline)
                )
            else:
                d["colon_avg_diameter_mm"] = 0.0

        return d


def process_dataset_with_monai(
    data_dicts: List[Dict],
    batch_size: int = 1,
    num_workers: int = 4,
    is_binary_pred: bool = False,
) -> pd.DataFrame:
    """
    Process dataset using MONAI pipeline to compute colon metrics.

    Args:
        data_dicts: List of data dictionaries
        batch_size: Batch size for processing
        num_workers: Number of workers for data loading
        is_binary_pred: Whether this is binary prediction data

    Returns:
        DataFrame with all computed metrics
    """
    # Get transforms
    base_transforms = get_transforms_for_analysis(is_binary_pred=is_binary_pred)

    # Add metric computation transform
    metric_transform = ColonLengthMetricsd(
        keys=[Keys.LABEL],
        colon_label=1,
        spacing=[1.5, 1.5, 2.0],
        is_binary=is_binary_pred,
    )

    # Combine transforms
    all_transforms = transforms.Compose([base_transforms, metric_transform])

    # Create dataset
    dataset = Dataset(data=data_dicts, transform=all_transforms)

    # Create dataloader
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        collate_fn=list,  # Return list of dicts instead of batched dict
    )

    # Process all data
    results = []

    print(f"Processing {len(data_dicts)} cases...")
    for batch_idx, batch_data in enumerate(dataloader):
        for item in batch_data:
            result = {
                "patient_id": item["patient_id"],
                "dataset_type": item["dataset_type"],
                "length_skeleton_mm": item["colon_length_skeleton_mm"],
                "length_centerline_mm": item["colon_length_centerline_mm"],
                "length_cm": item["colon_length_cm"],
                "volume_ml": item["colon_volume_ml"],
                "surface_area_cm2": item["colon_surface_area_cm2"],
                "avg_diameter_mm": item["colon_avg_diameter_mm"],
            }
            results.append(result)

        if (batch_idx + 1) % 10 == 0:
            print(f"  Processed {(batch_idx + 1) * batch_size}/{len(data_dicts)} cases")

    return pd.DataFrame(results)


def create_comprehensive_plots(df: pd.DataFrame, output_dir: str = "plots"):
    """
    Create comprehensive visualization plots.

    Args:
        df: DataFrame with all measurements
        output_dir: Output directory for plots
    """
    os.makedirs(output_dir, exist_ok=True)

    # Set style
    sns.set_style("whitegrid")
    sns.set_palette("husl")

    # Plot 1: Length distribution comparison (box + violin)
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # Box plot
    df.boxplot(column="length_cm", by="dataset_type", ax=axes[0])
    axes[0].set_title(
        "Colon Length Distribution (Box Plot)", fontsize=14, fontweight="bold"
    )
    axes[0].set_xlabel("Dataset", fontsize=12)
    axes[0].set_ylabel("Colon Length (cm)", fontsize=12)
    axes[0].get_figure().suptitle("")  # Remove automatic title
    plt.setp(axes[0].xaxis.get_majorticklabels(), rotation=15, ha="right")

    # Violin plot
    sns.violinplot(data=df, x="dataset_type", y="length_cm", ax=axes[1])
    axes[1].set_title(
        "Colon Length Distribution (Violin Plot)", fontsize=14, fontweight="bold"
    )
    axes[1].set_xlabel("Dataset", fontsize=12)
    axes[1].set_ylabel("Colon Length (cm)", fontsize=12)
    plt.setp(axes[1].xaxis.get_majorticklabels(), rotation=15, ha="right")

    plt.tight_layout()
    plt.savefig(
        os.path.join(output_dir, "01_length_distribution.png"),
        dpi=300,
        bbox_inches="tight",
    )
    plt.close()

    # Plot 2: Histogram overlays
    plt.figure(figsize=(12, 6))
    for dataset in df["dataset_type"].unique():
        subset = df[df["dataset_type"] == dataset]["length_cm"].dropna()
        plt.hist(
            subset, bins=30, alpha=0.5, label=dataset, edgecolor="black", linewidth=0.5
        )

    plt.xlabel("Colon Length (cm)", fontsize=12)
    plt.ylabel("Frequency", fontsize=12)
    plt.title(
        "Colon Length Distribution - Histogram Overlay", fontsize=14, fontweight="bold"
    )
    plt.legend(fontsize=10)
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.savefig(
        os.path.join(output_dir, "02_length_histogram.png"),
        dpi=300,
        bbox_inches="tight",
    )
    plt.close()

    # Plot 3: Paired comparison for C-grade (original vs predicted)
    c_orig = df[df["dataset_type"] == "c_grade_original"].set_index("patient_id")
    c_pred = df[df["dataset_type"] == "c_grade_predicted"].set_index("patient_id")

    common_patients = c_orig.index.intersection(c_pred.index)

    if len(common_patients) > 0:
        fig, axes = plt.subplots(1, 2, figsize=(16, 7))

        orig_lengths = c_orig.loc[common_patients, "length_cm"].values
        pred_lengths = c_pred.loc[common_patients, "length_cm"].values

        # Scatter plot
        axes[0].scatter(
            orig_lengths,
            pred_lengths,
            alpha=0.6,
            s=100,
            edgecolors="black",
            linewidth=0.5,
        )

        # Perfect agreement line
        min_val = min(orig_lengths.min(), pred_lengths.min())
        max_val = max(orig_lengths.max(), pred_lengths.max())
        axes[0].plot(
            [min_val, max_val],
            [min_val, max_val],
            "r--",
            linewidth=2,
            label="Perfect Agreement",
            alpha=0.7,
        )

        # Add correlation coefficient
        correlation = np.corrcoef(orig_lengths, pred_lengths)[0, 1]
        axes[0].text(
            0.05,
            0.95,
            f"r = {correlation:.3f}",
            transform=axes[0].transAxes,
            fontsize=12,
            verticalalignment="top",
            bbox=dict(boxstyle="round", facecolor="wheat", alpha=0.5),
        )

        axes[0].set_xlabel("Original C-Grade Length (cm)", fontsize=12)
        axes[0].set_ylabel("Predicted Length (cm)", fontsize=12)
        axes[0].set_title(
            "Original vs Predicted: Correlation", fontsize=14, fontweight="bold"
        )
        axes[0].legend()
        axes[0].grid(alpha=0.3)
        axes[0].set_aspect("equal", adjustable="box")

        # Bland-Altman plot
        mean_lengths = (orig_lengths + pred_lengths) / 2
        diff_lengths = pred_lengths - orig_lengths
        mean_diff = np.mean(diff_lengths)
        std_diff = np.std(diff_lengths)

        axes[1].scatter(
            mean_lengths,
            diff_lengths,
            alpha=0.6,
            s=100,
            edgecolors="black",
            linewidth=0.5,
        )
        axes[1].axhline(
            mean_diff,
            color="blue",
            linestyle="-",
            linewidth=2,
            label=f"Mean Diff: {mean_diff:.2f} cm",
        )
        axes[1].axhline(
            mean_diff + 1.96 * std_diff,
            color="red",
            linestyle="--",
            linewidth=2,
            label=f"+1.96 SD: {mean_diff + 1.96*std_diff:.2f} cm",
        )
        axes[1].axhline(
            mean_diff - 1.96 * std_diff,
            color="red",
            linestyle="--",
            linewidth=2,
            label=f"-1.96 SD: {mean_diff - 1.96*std_diff:.2f} cm",
        )
        axes[1].axhline(0, color="black", linestyle="-", linewidth=1, alpha=0.3)

        axes[1].set_xlabel("Mean Length (cm)", fontsize=12)
        axes[1].set_ylabel("Difference (Predicted - Original) cm", fontsize=12)
        axes[1].set_title("Bland-Altman Plot", fontsize=14, fontweight="bold")
        axes[1].legend(fontsize=9)
        axes[1].grid(alpha=0.3)

        plt.tight_layout()
        plt.savefig(
            os.path.join(output_dir, "03_c_grade_comparison.png"),
            dpi=300,
            bbox_inches="tight",
        )
        plt.close()

        # Plot 4: Individual patient improvement bars
        differences = pred_lengths - orig_lengths

        plt.figure(figsize=(14, 6))
        colors = ["green" if d > 0 else "red" for d in differences]
        bars = plt.bar(
            range(len(differences)),
            differences,
            color=colors,
            alpha=0.7,
            edgecolor="black",
            linewidth=0.5,
        )

        plt.axhline(y=0, color="black", linestyle="-", linewidth=1.5)
        plt.axhline(
            y=mean_diff,
            color="blue",
            linestyle="--",
            linewidth=2,
            label=f"Mean: {mean_diff:.2f} cm",
        )

        plt.xlabel("Patient Index", fontsize=12)
        plt.ylabel("Length Difference (Predicted - Original) cm", fontsize=12)
        plt.title(
            "Per-Patient Length Change: Improvement Analysis",
            fontsize=14,
            fontweight="bold",
        )
        plt.legend()
        plt.grid(axis="y", alpha=0.3)
        plt.tight_layout()
        plt.savefig(
            os.path.join(output_dir, "04_improvement_bars.png"),
            dpi=300,
            bbox_inches="tight",
        )
        plt.close()

        # Plot 5: Volume vs Length relationship
        fig, axes = plt.subplots(1, 2, figsize=(16, 6))

        for dataset in df["dataset_type"].unique():
            subset = df[df["dataset_type"] == dataset]
            axes[0].scatter(
                subset["length_cm"],
                subset["volume_ml"],
                label=dataset,
                alpha=0.6,
                s=100,
                edgecolors="black",
                linewidth=0.5,
            )

        axes[0].set_xlabel("Colon Length (cm)", fontsize=12)
        axes[0].set_ylabel("Colon Volume (ml)", fontsize=12)
        axes[0].set_title(
            "Length vs Volume Relationship", fontsize=14, fontweight="bold"
        )
        axes[0].legend()
        axes[0].grid(alpha=0.3)

        # Average diameter distribution
        for dataset in df["dataset_type"].unique():
            subset = df[df["dataset_type"] == dataset]["avg_diameter_mm"].dropna()
            axes[1].hist(
                subset,
                bins=20,
                alpha=0.5,
                label=dataset,
                edgecolor="black",
                linewidth=0.5,
            )

        axes[1].set_xlabel("Average Colon Diameter (mm)", fontsize=12)
        axes[1].set_ylabel("Frequency", fontsize=12)
        axes[1].set_title(
            "Average Diameter Distribution", fontsize=14, fontweight="bold"
        )
        axes[1].legend()
        axes[1].grid(alpha=0.3)

        plt.tight_layout()
        plt.savefig(
            os.path.join(output_dir, "05_volume_diameter_analysis.png"),
            dpi=300,
            bbox_inches="tight",
        )
        plt.close()

    print(f"\nAll plots saved to '{output_dir}/' directory")


def generate_statistical_summary(df: pd.DataFrame, output_dir: str = "plots"):
    """
    Generate comprehensive statistical summary and tests.

    Args:
        df: DataFrame with measurements
        output_dir: Output directory
    """
    from scipy import stats

    print("\n" + "=" * 80)
    print("COMPREHENSIVE STATISTICAL ANALYSIS")
    print("=" * 80)

    # Overall summary statistics
    summary = (
        df.groupby("dataset_type")
        .agg(
            {
                "length_cm": ["count", "mean", "std", "min", "median", "max"],
                "volume_ml": ["mean", "std", "median"],
                "avg_diameter_mm": ["mean", "std", "median"],
            }
        )
        .round(2)
    )

    print("\n1. DESCRIPTIVE STATISTICS")
    print("-" * 80)
    print(summary)

    # Statistical tests for C-grade comparison
    c_orig = df[df["dataset_type"] == "c_grade_original"].set_index("patient_id")
    c_pred = df[df["dataset_type"] == "c_grade_predicted"].set_index("patient_id")

    common_patients = c_orig.index.intersection(c_pred.index)

    if len(common_patients) > 0:
        orig_lengths = c_orig.loc[common_patients, "length_cm"].values
        pred_lengths = c_pred.loc[common_patients, "length_cm"].values

        # Paired t-test
        t_stat, p_value = stats.ttest_rel(pred_lengths, orig_lengths)

        # Wilcoxon signed-rank test (non-parametric alternative)
        w_stat, w_p_value = stats.wilcoxon(pred_lengths, orig_lengths)

        # Effect size (Cohen's d)
        differences = pred_lengths - orig_lengths
        cohens_d = np.mean(differences) / np.std(differences)

        print("\n2. PAIRED COMPARISON: C-GRADE ORIGINAL vs PREDICTED")
        print("-" * 80)
        print(f"Number of paired samples: {len(common_patients)}")
        print(f"Mean difference (Pred - Orig): {np.mean(differences):.2f} cm")
        print(f"Std of differences: {np.std(differences):.2f} cm")
        print(f"\nPaired t-test:")
        print(f"  t-statistic: {t_stat:.4f}")
        print(f"  p-value: {p_value:.4f}")
        print(f"  Significant at α=0.05: {'Yes' if p_value < 0.05 else 'No'}")
        print(f"\nWilcoxon signed-rank test:")
        print(f"  W-statistic: {w_stat:.4f}")
        print(f"  p-value: {w_p_value:.4f}")
        print(f"\nEffect size (Cohen's d): {cohens_d:.4f}")

        # Correlation
        correlation = np.corrcoef(orig_lengths, pred_lengths)[0, 1]
        print(f"Pearson correlation: {correlation:.4f}")

        # Improvement metrics
        improved = (differences > 0).sum()
        worsened = (differences < 0).sum()
        unchanged = (differences == 0).sum()

        print(f"\n3. IMPROVEMENT ANALYSIS")
        print("-" * 80)
        print(
            f"Cases with increased length: {improved} ({100*improved/len(differences):.1f}%)"
        )
        print(
            f"Cases with decreased length: {worsened} ({100*worsened/len(differences):.1f}%)"
        )
        print(f"Cases unchanged: {unchanged} ({100*unchanged/len(differences):.1f}%)")

        # Mean absolute error and relative error
        mae = np.mean(np.abs(differences))
        mape = np.mean(np.abs(differences) / orig_lengths) * 100

        print(f"\nMean Absolute Error: {mae:.2f} cm")
        print(f"Mean Absolute Percentage Error: {mape:.2f}%")

    # ANOVA across all groups
    groups = [
        df[df["dataset_type"] == dt]["length_cm"].dropna().values
        for dt in df["dataset_type"].unique()
    ]

    if len(groups) > 2:
        f_stat, anova_p = stats.f_oneway(*groups)

        print(f"\n4. ONE-WAY ANOVA (ALL GROUPS)")
        print("-" * 80)
        print(f"F-statistic: {f_stat:.4f}")
        print(f"p-value: {anova_p:.4f}")
        print(f"Significant at α=0.05: {'Yes' if anova_p < 0.05 else 'No'}")

    print("\n" + "=" * 80 + "\n")

    # Save summary to CSV
    summary.to_csv(os.path.join(output_dir, "statistical_summary.csv"))

    # Save detailed comparison if available
    if len(common_patients) > 0:
        comparison_df = pd.DataFrame(
            {
                "patient_id": common_patients,
                "original_length_cm": orig_lengths,
                "predicted_length_cm": pred_lengths,
                "difference_cm": differences,
                "percent_change": (differences / orig_lengths) * 100,
            }
        )
        comparison_df.to_csv(
            os.path.join(output_dir, "c_grade_paired_comparison.csv"), index=False
        )
        print(
            f"Detailed comparison saved to '{output_dir}/c_grade_paired_comparison.csv'"
        )


def main():
    """
    Main execution function integrating MONAI transforms.
    """
    # ==================== CONFIGURATION ====================
    # UPDATE THESE PATHS TO YOUR ACTUAL DATA
    # TRAINING_JSONL = "/path/to/training_data.jsonl"
    # C_GRADE_JSONL = "/path/to/c_grade_data.jsonl"
    # PRED_DIR = "/path/to/predictions"
    # OUTPUT_DIR = "colon_analysis_results"

    TRAINING_JSONL = "/home/yb107/cvpr2025/DukeDiffSeg/data/mobina_mixed_colon_dataset/mobina_mixed_colon_dataset_with_body_filled.jsonl"
    C_GRADE_JSONL = "/home/yb107/cvpr2025/DukeDiffSeg/data/c_grade_colons/3d_vlsmv2_c_grade_colon_dataset_with_body_filled.jsonl"
    PRED_DIR = "/home/yb107/cvpr2025/DukeDiffSeg/outputs/diffunet-binary-colon/5.1/inference_c_grade_550_gs_2.0_final_small_with_skeletonization"
    OUTPUT_DIR = "/home/yb107/cvpr2025/DukeDiffSeg/notebook/length_analysis"

    BATCH_SIZE = 4
    NUM_WORKERS = 4
    # =======================================================

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    print("=" * 80)
    print("COLON LENGTH ANALYSIS USING MONAI TRANSFORMS")
    print("=" * 80)

    # Step 1: Load data dictionaries
    print("\n[1/5] Loading data dictionaries...")

    training_dicts = get_data_dicts_from_jsonl(TRAINING_JSONL, dataset_type="training")
    print(f"  Loaded {len(training_dicts)} training cases")

    c_grade_dicts = get_data_dicts_from_jsonl(
        C_GRADE_JSONL, dataset_type="c_grade_original"
    )
    print(f"  Loaded {len(c_grade_dicts)} C-grade original cases")

    c_pred_dicts = get_c_grade_prediction_dicts(C_GRADE_JSONL, PRED_DIR)
    print(f"  Loaded {len(c_pred_dicts)} C-grade prediction cases")

    # Step 2: Process datasets with MONAI
    print("\n[2/5] Processing datasets with MONAI transforms...")

    print("\n  Processing training data...")
    df_training = process_dataset_with_monai(
        training_dicts,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
        is_binary_pred=False,
    )

    print("\n  Processing C-grade original data...")
    df_c_grade = process_dataset_with_monai(
        c_grade_dicts,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
        is_binary_pred=False,
    )

    print("\n  Processing C-grade predictions...")
    df_c_pred = process_dataset_with_monai(
        c_pred_dicts,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
        is_binary_pred=True,
    )

    # Step 3: Combine results
    print("\n[3/5] Combining results...")
    df_all = pd.concat([df_training, df_c_grade, df_c_pred], ignore_index=True)

    # Save raw measurements
    output_csv = os.path.join(OUTPUT_DIR, "colon_measurements_all.csv")
    df_all.to_csv(output_csv, index=False)
    print(f"  Saved {len(df_all)} measurements to '{output_csv}'")

    # Step 4: Generate visualizations
    print("\n[4/5] Generating visualizations...")
    create_comprehensive_plots(df_all, output_dir=OUTPUT_DIR)

    # Step 5: Statistical analysis
    print("\n[5/5] Performing statistical analysis...")
    generate_statistical_summary(df_all, output_dir=OUTPUT_DIR)

    print("\n" + "=" * 80)
    print("ANALYSIS COMPLETE!")
    print(f"All results saved to '{OUTPUT_DIR}/' directory")
    print("=" * 80 + "\n")


main()

COLON LENGTH ANALYSIS USING MONAI TRANSFORMS

[1/5] Loading data dictionaries...
  Loaded 419 training cases
  Loaded 67 C-grade original cases
  Loaded 11 C-grade prediction cases

[2/5] Processing datasets with MONAI transforms...

  Processing training data...
Processing 419 cases...


RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/yb107/.local/share/virtualenvs/DukeDiffSeg-HooVw7aP/lib/python3.10/site-packages/nibabel/loadsave.py", line 101, in load
    stat_result = os.stat(filename)
FileNotFoundError: [Errno 2] No such file or directory: '/data/usr/yb107/colon_data/refined_by_mobina/a_grade_colons_not_in_refined_by_md/images/Patient_00101_Study_73554_Series_04.nii.gz'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/yb107/.local/share/virtualenvs/DukeDiffSeg-HooVw7aP/lib/python3.10/site-packages/monai/transforms/transform.py", line 150, in apply_transform
    return _apply_transform(transform, data, unpack_items, lazy, overrides, log_stats)
  File "/home/yb107/.local/share/virtualenvs/DukeDiffSeg-HooVw7aP/lib/python3.10/site-packages/monai/transforms/transform.py", line 98, in _apply_transform
    return transform(data, lazy=lazy) if isinstance(transform, LazyTrait) else transform(data)
  File "/home/yb107/.local/share/virtualenvs/DukeDiffSeg-HooVw7aP/lib/python3.10/site-packages/monai/transforms/io/dictionary.py", line 163, in __call__
    data = self._loader(d[key], reader)
  File "/home/yb107/.local/share/virtualenvs/DukeDiffSeg-HooVw7aP/lib/python3.10/site-packages/monai/transforms/io/array.py", line 263, in __call__
    img = reader.read(filename)
  File "/home/yb107/.local/share/virtualenvs/DukeDiffSeg-HooVw7aP/lib/python3.10/site-packages/monai/data/image_reader.py", line 1088, in read
    img = nib.load(name, **kwargs_)
  File "/home/yb107/.local/share/virtualenvs/DukeDiffSeg-HooVw7aP/lib/python3.10/site-packages/nibabel/loadsave.py", line 103, in load
    raise FileNotFoundError(f"No such file or no access: '{filename}'")
FileNotFoundError: No such file or no access: '/data/usr/yb107/colon_data/refined_by_mobina/a_grade_colons_not_in_refined_by_md/images/Patient_00101_Study_73554_Series_04.nii.gz'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/yb107/.local/share/virtualenvs/DukeDiffSeg-HooVw7aP/lib/python3.10/site-packages/monai/transforms/transform.py", line 150, in apply_transform
    return _apply_transform(transform, data, unpack_items, lazy, overrides, log_stats)
  File "/home/yb107/.local/share/virtualenvs/DukeDiffSeg-HooVw7aP/lib/python3.10/site-packages/monai/transforms/transform.py", line 98, in _apply_transform
    return transform(data, lazy=lazy) if isinstance(transform, LazyTrait) else transform(data)
  File "/home/yb107/.local/share/virtualenvs/DukeDiffSeg-HooVw7aP/lib/python3.10/site-packages/monai/transforms/compose.py", line 346, in __call__
    result = execute_compose(
  File "/home/yb107/.local/share/virtualenvs/DukeDiffSeg-HooVw7aP/lib/python3.10/site-packages/monai/transforms/compose.py", line 116, in execute_compose
    data = apply_transform(
  File "/home/yb107/.local/share/virtualenvs/DukeDiffSeg-HooVw7aP/lib/python3.10/site-packages/monai/transforms/transform.py", line 180, in apply_transform
    raise RuntimeError(f"applying transform {transform}") from e
RuntimeError: applying transform <monai.transforms.io.dictionary.LoadImaged object at 0x7fd5366ce290>

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/yb107/.local/share/virtualenvs/DukeDiffSeg-HooVw7aP/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 349, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
  File "/home/yb107/.local/share/virtualenvs/DukeDiffSeg-HooVw7aP/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/yb107/.local/share/virtualenvs/DukeDiffSeg-HooVw7aP/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/yb107/.local/share/virtualenvs/DukeDiffSeg-HooVw7aP/lib/python3.10/site-packages/monai/data/dataset.py", line 108, in __getitem__
    return self._transform(index)
  File "/home/yb107/.local/share/virtualenvs/DukeDiffSeg-HooVw7aP/lib/python3.10/site-packages/monai/data/dataset.py", line 94, in _transform
    return self.transform(data_i)
  File "/home/yb107/.local/share/virtualenvs/DukeDiffSeg-HooVw7aP/lib/python3.10/site-packages/monai/transforms/compose.py", line 346, in __call__
    result = execute_compose(
  File "/home/yb107/.local/share/virtualenvs/DukeDiffSeg-HooVw7aP/lib/python3.10/site-packages/monai/transforms/compose.py", line 116, in execute_compose
    data = apply_transform(
  File "/home/yb107/.local/share/virtualenvs/DukeDiffSeg-HooVw7aP/lib/python3.10/site-packages/monai/transforms/transform.py", line 180, in apply_transform
    raise RuntimeError(f"applying transform {transform}") from e
RuntimeError: applying transform <monai.transforms.compose.Compose object at 0x7fd4d969d540>
