In the segfm3d dataset, search for cases that help evaluate IC segmentation:
- multiple cases with the same tasks

In [2]:
import os
from src.config import config
import pandas as pd
import ast


# Image and label folders of the validation split, both under the configured DATA_DIR.
origin_image_dir = os.path.join(config["DATA_DIR"], "3D_val_npz")
origin_label_dir = os.path.join(config["DATA_DIR"], "3D_val_gt_interactive_seg")

# The per-case statistics CSV is resolved against the kernel's working directory.
current_dir = os.getcwd()
segfm3d_stats_bycase = pd.read_csv(
    os.path.join(current_dir, "ic_segmentation/notebooks/segfm3d_stats_bycase.csv")
)

# 'ours_object_shapes' is stored as text: parse it as a Python literal and keep
# its length per row in the new 'nb_obj' column.
segfm3d_stats_bycase['nb_obj'] = segfm3d_stats_bycase['ours_object_shapes'].map(
    lambda shapes: len(ast.literal_eval(shapes))
)


In [4]:
# Number of rows in the statistics table, then the number of rows with nb_obj == 1.
print(len(segfm3d_stats_bycase))
print(int((segfm3d_stats_bycase['nb_obj'] == 1).sum()))


2130
375


In [25]:
from collections import defaultdict
import re

def extract_dataset_name(filename):
    """
    Derive the dataset name from a case filename.

    The text before the first underscore is taken as the modality. The rest is
    tokenised on both '_' and '-', and tokens are collected until the first one
    that `is_case_identifier` accepts. The result is the modality and the
    collected tokens joined with underscores, so hyphens of the original name
    come back as underscores.

    A filename without any underscore is returned unchanged. When the very first
    token is already a case identifier, the modality followed by that token is
    returned.
    """
    modality, separator, rest = filename.partition('_')
    if not separator:
        return filename

    tokens = re.split(r'[_-]', rest)

    # Position of the first token that looks like a case identifier
    # (len(tokens) when none of them does).
    cut = next(
        (position for position, token in enumerate(tokens) if is_case_identifier(token)),
        len(tokens),
    )

    if cut == 0:
        # Nothing precedes the case identifier: fall back to the first token.
        return modality + '_' + tokens[0]
    return modality + '_' + '_'.join(tokens[:cut])

# Sequence / imaging descriptors and cohort or collection names. These tokens are
# never case identifiers. Entries are lowercase because they are compared against
# the lowercased token.
_NON_CASE_TOKENS = frozenset({
    't1', 't2', 't1n', 't1c', 't2f', 't2w', 'adc', 'dwi', 'flair',
    'spi', 'ivd', 'vet', 'myo', 'lvrv', 'sequence', 'cet1',
    't1dual', 't2spir', 'mr', 'space',
    'half', 'prert', 'midrt', 'amos', 'bratsgli', 'atm',
    'coronacases', 'segrap', 'msd', 'prostate', 'nci',
    'prostatex', 'pcampmri', 'fdg', 'psma', 'etz', 'ldn', 'ukm',
})

# Lowercase prefixes that mark the start of a case identifier.
_CASE_ID_PREFIXES = ('case', 'patient', 'sub', 'volume', 'acdc', 'lung1', 'lctsc')


def is_case_identifier(part):
    """
    Determine if a part looks like a case identifier rather than part of dataset name.

    Known descriptor tokens are excluded first, so that a descriptor can never be
    claimed by one of the case-identifier patterns below (e.g. "CeT1" would
    otherwise match the "word + capital letter + digits" pattern).

    Args:
        part: one token of a filename, as produced by splitting on '_' / '-'.

    Returns:
        True if the token looks like a case identifier, False otherwise.
    """
    lowered = part.lower()

    # --- Exclusions: descriptors are never case identifiers ---
    if lowered in _NON_CASE_TOKENS:
        return False
    # Cardiac views such as "2CH", "4CH"
    if re.match(r'^\d+CH$', part, re.IGNORECASE):
        return False

    # --- Case-identifier patterns ---
    # Pure numbers (with optional leading zeros)
    if re.match(r'^\d+$', part):
        return True

    # Patterns like AortaD10, AortaR5: capitalised word + capital letter + digits
    if re.match(r'^[A-Z][a-z]+[A-Z]\d+$', part):
        return True

    # At least two digits, optionally preceded by a single letter: "0018", "s0001"
    if re.match(r'^[a-z]?\d{2,}$', part, re.IGNORECASE):
        return True

    # Hex-like strings (more than 8 characters, hexadecimal digits only)
    if len(part) > 8 and re.match(r'^[0-9a-f]+$', part, re.IGNORECASE):
        return True

    # Date patterns (YYYY-MM-DD or DD-MM-YYYY)
    if re.match(r'^\d{4}-\d{2}-\d{2}$', part) or re.match(r'^\d{2}-\d{2}-\d{4}$', part):
        return True

    # Frame numbers
    if re.match(r'^frame\d+$', part, re.IGNORECASE):
        return True

    # Common case prefixes ("case_00004", "patient001", ...)
    return lowered.startswith(_CASE_ID_PREFIXES)

# Group the filenames
def group_by_dataset(filenames):
    """
    Bucket filenames by the dataset name returned by `extract_dataset_name`.

    Returns a plain dict whose keys are in sorted order; each value lists the
    filenames of that dataset in the order they were given.
    """
    buckets = {}
    for name in filenames:
        buckets.setdefault(extract_dataset_name(name), []).append(name)

    return {key: buckets[key] for key in sorted(buckets)}

# Regression check on filenames whose case identifiers are not plain numbers:
# letter+digit suffixes, hyphen-separated IDs, trailing sequence descriptors.
test_cases = [
    'CT_Aorta_AortaD10',
    'CT_Aorta_AortaD6',
    'CT_Aorta_AortaK12',
    'CT_Aorta_AortaR5',
    'CT_Aorta_AortaR9',
    'CT_LNQ_LymphNode-0380',
    'CT_LNQ_LymphNode-0895',
    'MR_Spider_101_t2_spi',
    'MR_Spider_11_t2_spi',
]

# Grouping these filenames are expected to produce.
expected_groups = {
    'CT_Aorta': [
        'CT_Aorta_AortaD10',
        'CT_Aorta_AortaD6',
        'CT_Aorta_AortaK12',
        'CT_Aorta_AortaR5',
        'CT_Aorta_AortaR9',
    ],
    'CT_LNQ_LymphNode': ['CT_LNQ_LymphNode-0380', 'CT_LNQ_LymphNode-0895'],
    'MR_Spider': ['MR_Spider_101_t2_spi', 'MR_Spider_11_t2_spi'],
}

result = group_by_dataset(test_cases)
for dataset, files in result.items():
    print(f"{dataset}: {files}")

# Fail loudly if the grouping heuristics regress instead of relying on reading the printout.
assert result == expected_groups, f"unexpected grouping: {result}"

CT_Aorta: ['CT_Aorta_AortaD10', 'CT_Aorta_AortaD6', 'CT_Aorta_AortaK12', 'CT_Aorta_AortaR5', 'CT_Aorta_AortaR9']
CT_LNQ_LymphNode: ['CT_LNQ_LymphNode-0380', 'CT_LNQ_LymphNode-0895']
MR_Spider: ['MR_Spider_101_t2_spi', 'MR_Spider_11_t2_spi']


In [26]:
# Group the cases that contain exactly one object by dataset.
dataset_group = group_by_dataset(
    segfm3d_stats_bycase.loc[segfm3d_stats_bycase['nb_obj'] == 1, 'CaseName'].tolist()
)

# Show the datasets that are represented by a single case only.
for dataset, files in dataset_group.items():
    if len(files) == 1:
        print(f"{dataset}: {files}")

# Overall number of grouped cases, across all datasets.
total_nb_case = sum(map(len, dataset_group.values()))
print(f"Total number of cases: {total_nb_case}")

CT_COVID19_CT_Seg_Bench_coronacases: ['CT_COVID19-CT-Seg-Bench_coronacases_006']
MR_BraTS_T2w_bratsgli: ['MR_BraTS-T2w_bratsgli_0811']
MR_ProstateADC_ADC_MSD_prostate: ['MR_ProstateADC_ADC-MSD-prostate_06']
MR_totalseg_mr: ['MR_totalseg_mr_s0058']
Microscopy_SELMA3D_patchvolume_ADplaques: ['Microscopy_SELMA3D_patchvolume_ADplaques_019']
Total number of cases: 375


In [28]:
# Retain only the datasets that contribute three or more cases.
dataset_group_more_than_2 = {
    dataset_name: case_names
    for dataset_name, case_names in dataset_group.items()
    if len(case_names) >= 3
}
dataset_group_more_than_2

{'CT_AbdTumor': ['CT_AbdTumor_case_00004',
  'CT_AbdTumor_case_00007',
  'CT_AbdTumor_case_00039',
  'CT_AbdTumor_case_00048',
  'CT_AbdTumor_case_00071',
  'CT_AbdTumor_case_00108',
  'CT_AbdTumor_case_00132',
  'CT_AbdTumor_case_00162',
  'CT_AbdTumor_case_00175',
  'CT_AbdTumor_case_00208',
  'CT_AbdTumor_case_00209',
  'CT_AbdTumor_case_00239',
  'CT_AbdTumor_case_00245',
  'CT_AbdTumor_case_00251',
  'CT_AbdTumor_case_00295',
  'CT_AbdTumor_case_00406',
  'CT_AbdTumor_case_00415',
  'CT_AbdTumor_case_00443',
  'CT_AbdTumor_case_00445',
  'CT_AbdTumor_case_00460',
  'CT_AbdTumor_case_00461',
  'CT_AbdTumor_case_00466',
  'CT_AbdTumor_case_00489',
  'CT_AbdTumor_case_00517',
  'CT_AbdTumor_case_00569',
  'CT_AbdTumor_case_00572',
  'CT_AbdTumor_case_00580'],
 'CT_AbdTumor_Adrenal_Ki67_Seg': ['CT_AbdTumor_Adrenal_Ki67_Seg_003',
  'CT_AbdTumor_Adrenal_Ki67_Seg_004',
  'CT_AbdTumor_Adrenal_Ki67_Seg_018',
  'CT_AbdTumor_Adrenal_Ki67_Seg_025',
  'CT_AbdTumor_Adrenal_Ki67_Seg_030'],
 'CT_