In [None]:
import os
from collections import defaultdict

# Root holding one subdirectory per dataset; slides are counted from
# <data_path>/<dataset>/output.
data_path = '/data4/processed_data'
# Root of the embeddings, read as <embedding_path>/<dataset>/<model>/.
embedding_path = '/data4/embedding'

def get_total_slides(dataset_path):
    """Return the number of entries in ``<dataset_path>/output``.

    Every directory entry is counted (files and subdirectories alike); no
    filtering by name or extension is applied.

    Args:
        dataset_path: Path of a dataset directory.

    Returns:
        The number of entries directly inside the ``output`` subdirectory,
        or 0 when that subdirectory is missing or is not a directory.
    """
    output_path = os.path.join(dataset_path, 'output')
    # isdir rather than exists: a regular file named "output" would pass an
    # existence check and then make os.listdir raise NotADirectoryError.
    if not os.path.isdir(output_path):
        return 0
    return len(os.listdir(output_path))

def analyze_embeddings():
    """Print, per dataset and model, how many slides lack an embedding.

    For every dataset under ``data_path`` that also has a directory under
    ``embedding_path``, each model subdirectory is compared against the
    dataset's slide count (as returned by ``get_total_slides``). Datasets
    without an embedding directory are skipped and do not appear in the
    report. Models seen for any dataset but absent from a given dataset are
    reported as missing for that dataset.

    The report is written to stdout; nothing is returned.
    """
    # dataset -> model -> (total slides - entries in the model's directory)
    dataset_missing_counts = defaultdict(dict)
    # Union of model names seen across all datasets.
    all_models = set()

    # Sorted so the report order is stable between runs.
    for dataset in sorted(os.listdir(data_path)):
        total_slides = get_total_slides(os.path.join(data_path, dataset))

        dataset_embedding_path = os.path.join(embedding_path, dataset)
        if not os.path.isdir(dataset_embedding_path):
            continue

        for model in os.listdir(dataset_embedding_path):
            model_path = os.path.join(dataset_embedding_path, model)
            # Stray files next to the model directories (e.g. .DS_Store)
            # are not models; listing them would raise NotADirectoryError.
            if not os.path.isdir(model_path):
                continue

            all_models.add(model)
            processed_slides = len(os.listdir(model_path))
            # NOTE: this is a difference of raw entry counts, so it goes
            # negative when the model directory holds more entries than the
            # dataset's output directory (including when that is missing).
            dataset_missing_counts[dataset][model] = total_slides - processed_slides

    print("Summary of missing slides per dataset/model:")
    for dataset, models in dataset_missing_counts.items():
        print(f'Dataset: {dataset}')
        # Sorted: plain set iteration order is not stable across runs.
        for model in sorted(all_models):
            if model not in models:
                print(f'  Missing Model: {model} (0 slides processed)')
            else:
                print(f'  Model: {model}, Missing: {models[model]} slides')

        print('---------------------------')

# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    analyze_embeddings()


Summary of missing slides per dataset/model:
Dataset: private_chunk_9
  Missing Model: FMBC (0 slides processed)
  Missing Model: CONCH (0 slides processed)
  Missing Model: UNI (0 slides processed)
  Missing Model: Virchow (0 slides processed)
  Missing Model: Gigapath_tile (0 slides processed)
  Missing Model: TITAN (0 slides processed)
  Missing Model: CHIEF_tile (0 slides processed)
  Model: UNI-2, Missing: -2695 slides
  Missing Model: CHIEF (0 slides processed)
  Missing Model: Gigapath (0 slides processed)
  Missing 9 models to match max (9)
---------------------------
Dataset: TCGA-BRCA
  Model: FMBC, Missing: 935 slides
  Model: CONCH, Missing: -2126 slides
  Model: UNI, Missing: -992 slides
  Missing Model: Virchow (0 slides processed)
  Model: Gigapath_tile, Missing: -991 slides
  Model: TITAN, Missing: -1604 slides
  Model: CHIEF_tile, Missing: -991 slides
  Model: UNI-2, Missing: 0 slides
  Model: CHIEF, Missing: -991 slides
  Model: Gigapath, Missing: -991 slides
  Missin