In [None]:
#Imports
import pandas as pd
import sys
import os
import json
from transformers import AutoTokenizer

# Make the sibling ``utils`` directory importable.
# The location is resolved as ``<current working directory>/../utils``, so the
# project imports below only resolve when the working directory sits next to it.
utils_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'utils'))
if utils_path not in sys.path:  # avoid stacking duplicate entries when the cell is re-run
    sys.path.insert(0, utils_path)

from general_utils import (
    parse_jsonish,
    load_data,
    prepare_sample,
    prepare_all_samples,
    get_entity_date_pairs,
    calculate_metrics
)

from bert_training_utils import create_training_pairs
from naive_extractor_utils import naive_extraction

In [None]:
# Dataset under test: test_load_data() reads this module-level path.
# Alternative dataset: "../data/training_dataset.csv"
test_df_path = "../data/training_dataset_synthetic2.csv"

In [None]:
#Test load_data
def test_load_data():
    """Test loading and parsing of training dataset"""
    df = load_data(test_df_path)
    
    print("Dataset Overview:")
    print(f"Number of documents: {len(df)}")
    print("\nColumns present:")
    for col in df.columns:
        print(f"- {col}")
    
    # Check first row
    first_row = df.iloc[0]
    print("\nFirst row contents:")
    print(f"Document ID: {first_row.get('doc_id')}")
    print(f"Text length: {len(first_row['note_text'])} characters")
    print(f"Number of entities: {len(first_row['entities_json'])}")
    print(f"Number of dates: {len(first_row['dates_json'])}")
    
    # Sample of parsed content
    print("\nSample entities (first 3):")
    for e in first_row['entities_json'][:3]:
        print(f"- {e['value']} (Position: {e['start']}-{e['end']})")
    
    print("\nSample dates (first 3):")
    for d in first_row['dates_json'][:3]:
        print(f"- {d['value']} (Position: {d['start']}-{d['end']})")
    
    return df

# Run the check and keep the loaded frame as the module-level `df`;
# test_prepare_sample() and test_prepare_all_samples() read it.
df = test_load_data()

In [None]:
#Test prepare_sample
def test_prepare_sample():
    """Test preparation of a single sample"""
    # Get first row
    row = df.iloc[0]
    
    # Prepare sample
    note_text, entities_list, dates = prepare_sample(row)
    
    print("Sample Preparation Results:")
    print(f"\nText length: {len(note_text)} characters")
    print(f"Number of entities: {len(entities_list)}")
    print(f"Number of dates: {len(dates)}")
    
    print("\nFirst 3 entities:")
    for e in entities_list[:3]:
        print(f"- {e['value']} (Position: {e['start']}-{e['end']})")
    
    print("\nFirst 3 dates:")
    for d in dates[:3]:
        print(f"- {d['value']} (Position: {d['start']}-{d['end']})")
    
    return note_text, entities_list, dates

# Run the check and keep the prepared text, entities and dates at module level.
note_text, entities_list, dates = test_prepare_sample()

In [None]:
#Test prepare_all_samples
def test_prepare_all_samples():
    """Test preparation of all samples"""
    samples = prepare_all_samples(df)
    
    print("All Samples Preparation Results:")
    print(f"Number of samples prepared: {len(samples)}")
    
    # Check first sample
    first_sample = samples[0]
    print("\nFirst sample contents:")
    print(f"- doc_id: {first_sample['doc_id']}")
    print(f"- Text length: {len(first_sample['note_text'])} characters")
    print(f"- Number of entities: {len(first_sample['entities_list'])}")
    print(f"- Number of dates: {len(first_sample['dates'])}")
    print(f"- Number of relative dates: {len(first_sample['relative_dates'])}")
    
    # Print first few entities and dates
    print("\nFirst 3 entities:")
    for e in first_sample['entities_list'][:3]:
        print(f"- {e['value']} (Position: {e['start']}-{e['end']})")
    
    print("\nFirst 3 dates:")
    for d in first_sample['dates'][:3]:
        print(f"- {d['value']} (Position: {d['start']}-{d['end']})")
    
    print("\nFirst 3 relative dates:")
    for rd in first_sample['relative_dates'][:3]:
        print(f"- {rd['value']} (Position: {rd['start']}-{rd['end']})")
    
    return samples

# Run the check and keep the result as the module-level `samples`;
# the cells below read it (some as a global, some as an argument).
samples = test_prepare_all_samples()

In [None]:
#Test get_entity_date_pairs
def test_get_entity_date_pairs():
    """Test creation of entity-date pairs"""
    # Get first sample
    sample = samples[0]
    
    # Get pairs
    pairs = get_entity_date_pairs(
        sample['entities_list'],
        sample['dates'],
        sample['relative_dates']
    )
    
    print("Entity-Date Pairs Results:")
    print(f"Total pairs generated: {len(pairs)}")
    
    print("\nFirst 5 pairs:")
    for i, pair in enumerate(pairs[:5]):
        print(f"\nPair {i+1}:")
        print(f"Entity: {pair['entity_label']} ({pair['entity']['start']}-{pair['entity']['end']})")
        print(f"Date: {pair['date']} ({pair['date_info']['start']}-{pair['date_info']['end']})")
        print(f"Distance: {pair['distance']} chars")
        print(f"Date type: {pair['date_type']}")
    
    return pairs

# Run the check and keep the generated pairs as the module-level `pairs`.
pairs = test_get_entity_date_pairs()

In [None]:
#Test Relative Date Handling
def test_relative_dates():
    """Test specific handling of relative dates in pairs"""
    
    # Get first sample
    sample = samples[0]
    
    print("Relative Date Analysis:")
    print(f"Total relative dates: {len(sample['relative_dates'])}")
    
    # Show all relative dates
    print("\nAll relative dates:")
    for rd in sample['relative_dates']:
        print(f"- {rd['value']} (Position: {rd['start']}-{rd['end']})")
    
    # Find pairs with relative dates
    pairs = get_entity_date_pairs(
        sample['entities_list'],
        sample['dates'],
        sample['relative_dates']
    )
    
    relative_pairs = [p for p in pairs if p['date_type'] == 'relative']
    print(f"\nPairs using relative dates: {len(relative_pairs)}")
    print("\nFirst 3 relative date pairs:")
    for p in relative_pairs[:3]:
        print(f"\nEntity: {p['entity_label']} ({p['entity']['start']}-{p['entity']['end']})")
        print(f"Date: {p['date']} ({p['date_info']['start']}-{p['date_info']['end']})")
        print(f"Distance: {p['distance']} chars")

# Run the check; it reads the module-level `samples` and only prints.
test_relative_dates()

In [None]:
#Test Duplicate Relations
def test_duplicate_relations(samples):
    """Investigate potential duplicate relations in training pairs"""
    print("=== Duplicate Relations Analysis ===")
    
    # Get first sample
    sample = samples[0]
    
    # Analyze relations
    print("\nOriginal Relations:")
    relations = sample['relations_json']
    print(f"Total relations in gold set: {len(relations)}")
    
    # Check for duplicates in relations
    entity_date_pairs = [(r['entity'], r['date']) for r in relations]
    unique_pairs = set(entity_date_pairs)
    print(f"Unique entity-date pairs: {len(unique_pairs)}")
    
    if len(entity_date_pairs) != len(unique_pairs):
        print("\nFound duplicate relations:")
        from collections import Counter
        duplicates = Counter(entity_date_pairs)
        for pair, count in duplicates.items():
            if count > 1:
                print(f"- {pair[0]} -> {pair[1]} (appears {count} times)")
    
    # Create training pairs and analyze
    df = create_training_pairs([sample])
    positive_pairs = df[df['label'] == 1]
    
    print("\nTraining Pairs Analysis:")
    print(f"Total pairs created: {len(df)}")
    print(f"Positive pairs: {len(positive_pairs)}")
    
    # Check if same entity-date pair appears multiple times
    pair_texts = [(row['marked_text'].split('[E1]')[1].split('[/E1]')[0].strip(),
                  row['marked_text'].split('[E2]')[1].split('[/E2]')[0].strip())
                  for _, row in positive_pairs.iterrows()]
    unique_pair_texts = set(pair_texts)
    
    print(f"Unique positive pairs: {len(unique_pair_texts)}")
    
    if len(pair_texts) != len(unique_pair_texts):
        print("\nFound duplicate pairs in training data:")
        duplicates = Counter(pair_texts)
        for pair, count in duplicates.items():
            if count > 1:
                print(f"- {pair[0]} -> {pair[1]} (appears {count} times)")

# Run the check on the module-level `samples`; only the first sample is inspected.
test_duplicate_relations(samples)

In [None]:
#Test Text Length and Distance Analysis
def test_text_lengths_and_distances():
    """Analyze document lengths and distances between entities/dates"""
    print("=== Text Length and Distance Analysis ===")
    
    # Document Length Analysis across all samples
    print("\nDocument Length Analysis:")
    text_lengths = [len(s['note_text']) for s in samples]
    word_counts = [len(s['note_text'].split()) for s in samples]
    
    print(f"Total documents: {len(samples)}")
    print(f"Mean document length: {sum(text_lengths)/len(text_lengths):.1f} characters")
    print(f"Min document length: {min(text_lengths)} characters")
    print(f"Max document length: {max(text_lengths)} characters")
    print(f"Mean words per document: {sum(word_counts)/len(word_counts):.1f} words")
    print(f"Min words: {min(word_counts)} words")
    print(f"Max words: {max(word_counts)} words")
    
    # Distance Analysis for Relations across all documents
    print("\nDistance Analysis for Relations:")
    
    all_distances = []
    abs_distances = []
    rel_distances = []
    
    for sample in samples:
        for relation in sample['relations_json']:
            # Find corresponding entity and date objects
            entity = next((e for e in sample['entities_list'] if str(e['id']) == str(relation['entity_id'])), None)
            date = next((d for d in sample['dates'] if str(d['id']) == str(relation['date_id'])), None)
            rel_date = next((rd for rd in sample['relative_dates'] if str(rd['id']) == str(relation['date_id'])), None)
            
            if entity:
                if date:
                    distance = abs(entity['start'] - date['start'])
                    all_distances.append(distance)
                    abs_distances.append(distance)
                elif rel_date:
                    distance = abs(entity['start'] - rel_date['start'])
                    all_distances.append(distance)
                    rel_distances.append(distance)
    
    print("\nAll Relations:")
    print(f"Total relations: {len(all_distances)}")
    if all_distances:
        print(f"Mean distance: {sum(all_distances)/len(all_distances):.1f} chars")
        print(f"Min distance: {min(all_distances)} chars")
        print(f"Max distance: {max(all_distances)} chars")
    
    print("\nAbsolute Date Relations:")
    print(f"Total relations: {len(abs_distances)}")
    if abs_distances:
        print(f"Mean distance: {sum(abs_distances)/len(abs_distances):.1f} chars")
        print(f"Min distance: {min(abs_distances)} chars")
        print(f"Max distance: {max(abs_distances)} chars")
    
    print("\nRelative Date Relations:")
    print(f"Total relations: {len(rel_distances)}")
    if rel_distances:
        print(f"Mean distance: {sum(rel_distances)/len(rel_distances):.1f} chars")
        print(f"Min distance: {min(rel_distances)} chars")
        print(f"Max distance: {max(rel_distances)} chars")
    
    # Show examples of closest and furthest pairs across all documents
    if all_distances:
        print("\nExample Relations:")
        closest_global = min(all_distances)
        furthest_global = max(all_distances)
        
        # Find examples of closest and furthest pairs
        for sample in samples:
            for relation in sample['relations_json']:
                entity = next((e for e in sample['entities_list'] if str(e['id']) == str(relation['entity_id'])), None)
                date = next((d for d in sample['dates'] if str(d['id']) == str(relation['date_id'])), None)
                rel_date = next((rd for rd in sample['relative_dates'] if str(rd['id']) == str(relation['date_id'])), None)
                
                if entity and (date or rel_date):
                    date_obj = date if date else rel_date
                    distance = abs(entity['start'] - date_obj['start'])
                    
                    if distance == closest_global:
                        print(f"\nClosest Pair (across all documents):")
                        print(f"Distance: {distance} chars")
                        print(f"Entity: {entity['value']} (Position: {entity['start']}-{entity['end']})")
                        print(f"Date: {date_obj['value']} (Position: {date_obj['start']}-{date_obj['end']})")
                        print(f"Document ID: {sample['doc_id']}")
                    
                    if distance == furthest_global:
                        print(f"\nFurthest Pair (across all documents):")
                        print(f"Distance: {distance} chars")
                        print(f"Entity: {entity['value']} (Position: {entity['start']}-{entity['end']})")
                        print(f"Date: {date_obj['value']} (Position: {date_obj['start']}-{date_obj['end']})")
                        print(f"Document ID: {sample['doc_id']}")

# Run the analysis; it reads the module-level `samples` and only prints.
test_text_lengths_and_distances()

In [None]:
#Test Metrics Calculation
def test_metrics_calculation(samples):
    """Compare metrics calculation with position-based vs unique pairs"""
    print("=== Metrics Calculation Analysis ===")
    
    # Get first sample
    sample = samples[0]
    
    # Create training pairs (position-based)
    df = create_training_pairs([sample])
    
    # Analyze position-based predictions
    print("\nPosition-based Training Pairs:")
    print(f"Total pairs: {len(df)}")
    print(f"Positive pairs: {len(df[df['label'] == 1])}")
    print(f"Negative pairs: {len(df[df['label'] == 0])}")
    
    # Convert to unique entity-date pairs
    unique_pairs = set()
    unique_positive_pairs = set()
    for _, row in df.iterrows():
        try:
            # Find all marker positions
            e1_start = row['marked_text'].find('[E1]')
            e1_end = row['marked_text'].find('[/E1]')
            e2_start = row['marked_text'].find('[E2]')
            e2_end = row['marked_text'].find('[/E2]')
            
            # Only process if all markers are found
            if all(pos != -1 for pos in [e1_start, e1_end, e2_start, e2_end]):
                entity = row['marked_text'][e1_start+4:e1_end].strip()
                date = row['marked_text'][e2_start+4:e2_end].strip()
                pair = (entity, date)
                unique_pairs.add(pair)
                if row['label'] == 1:
                    unique_positive_pairs.add(pair)
        except Exception as e:
            print(f"Warning: Could not process row due to {str(e)}")
            continue
    
    print("\nUnique Entity-Date Pairs:")
    print(f"Total unique pairs: {len(unique_pairs)}")
    print(f"Unique positive pairs: {len(unique_positive_pairs)}")
    print(f"Unique negative pairs: {len(unique_pairs) - len(unique_positive_pairs)}")
    
    # Show example of how same pair appears in different positions
    if unique_positive_pairs:
        print("\nExample of position variations for same pair:")
        example_pair = next(iter(unique_positive_pairs))
        positions = []
        for _, row in df[df['label'] == 1].iterrows():
            try:
                entity = row['marked_text'][row['marked_text'].find('[E1]')+4:row['marked_text'].find('[/E1]')].strip()
                date = row['marked_text'][row['marked_text'].find('[E2]')+4:row['marked_text'].find('[/E2]')].strip()
                if (entity, date) == example_pair:
                    positions.append({
                        'entity_pos': (row['ent1_start'], row['ent1_end']),
                        'date_pos': (row['ent2_start'], row['ent2_end']),
                        'distance': row['distance']
                    })
            except Exception:
                continue
        
        print(f"\nPositions for pair '{example_pair[0]} -> {example_pair[1]}':")
        for pos in positions:
            print(f"- Entity at {pos['entity_pos']}, Date at {pos['date_pos']}, Distance: {pos['distance']} chars")
    
    print("\nMetrics Calculation:")
    
    # Position-based predictions
    position_predictions = [
        {'entity_label': row['marked_text'][row['marked_text'].find('[E1]')+4:row['marked_text'].find('[/E1]')].strip(),
         'date': row['marked_text'][row['marked_text'].find('[E2]')+4:row['marked_text'].find('[/E2]')].strip()}
        for _, row in df[df['label'] == 1].iterrows()
    ]
    
    print("\nMetrics when using all position-based pairs:")
    metrics = calculate_metrics(position_predictions, pd.DataFrame([sample]))
    print(f"Precision: {metrics['precision']:.3f}")
    print(f"Recall: {metrics['recall']:.3f}")
    print(f"F1: {metrics['f1']:.3f}")
    print(f"True Positives: {metrics['tp']}")
    print(f"False Positives: {metrics['fp']}")
    print(f"False Negatives: {metrics['fn']}")
    
    # Unique pair predictions
    unique_predictions = [
        {'entity_label': entity, 'date': date}
        for entity, date in unique_positive_pairs
    ]
    
    print("\nMetrics when using unique pairs:")
    metrics = calculate_metrics(unique_predictions, pd.DataFrame([sample]))
    print(f"Precision: {metrics['precision']:.3f}")
    print(f"Recall: {metrics['recall']:.3f}")
    print(f"F1: {metrics['f1']:.3f}")
    print(f"True Positives: {metrics['tp']}")
    print(f"False Positives: {metrics['fp']}")
    print(f"False Negatives: {metrics['fn']}")

# Run the comparison on the module-level `samples`; only the first sample is scored.
test_metrics_calculation(samples)

In [None]:
#Test Dataset Statistics
def test_dataset_statistics(samples):
    """Compare dataset statistics across different approaches"""
    print("=== Dataset Statistics Comparison ===")
    
    print("\n=== Single Document Analysis ===")
    sample = samples[0]
    
    print("\n1. Create Training Dataset Approach:")
    n_entities = len(sample['entities_list'])
    n_abs_dates = len(sample['dates'])
    n_rel_dates = len(sample['relative_dates'])
    n_relations = len(sample['relations_json'])
    total_possible = n_entities * (n_abs_dates + n_rel_dates)
    
    print(f"Number of entities: {n_entities}")
    print(f"Number of absolute dates: {n_abs_dates}")
    print(f"Number of relative dates: {n_rel_dates}")
    print(f"Total possible unique pairs: {total_possible}")
    print(f"Number of actual relations: {n_relations}")
    print(f"Percentage positive class: {(n_relations / total_possible) * 100:.2f}%")
    
    print("\n2. BERT Training Approach:")
    df = create_training_pairs([sample])
    
    print("\nPosition-based pairs:")
    print(f"Total pairs: {len(df)}")
    print(f"Positive pairs: {len(df[df['label'] == 1])}")
    print(f"Negative pairs: {len(df[df['label'] == 0])}")
    print(f"Percentage positive: {(len(df[df['label'] == 1]) / len(df) * 100):.2f}%")
    
    unique_pairs = set()
    unique_positive_pairs = set()
    for _, row in df.iterrows():
        try:
            e1_start = row['marked_text'].find('[E1]')
            e1_end = row['marked_text'].find('[/E1]')
            e2_start = row['marked_text'].find('[E2]')
            e2_end = row['marked_text'].find('[/E2]')
            
            if all(pos != -1 for pos in [e1_start, e1_end, e2_start, e2_end]):
                entity = row['marked_text'][e1_start+4:e1_end].strip()
                date = row['marked_text'][e2_start+4:e2_end].strip()
                pair = (entity, date)
                unique_pairs.add(pair)
                if row['label'] == 1:
                    unique_positive_pairs.add(pair)
        except Exception:
            continue
    
    print("\nUnique pairs:")
    print(f"Total unique pairs: {len(unique_pairs)}")
    print(f"Unique positive pairs: {len(unique_positive_pairs)}")
    print(f"Unique negative pairs: {len(unique_pairs) - len(unique_positive_pairs)}")
    print(f"Percentage positive: {(len(unique_positive_pairs) / len(unique_pairs) * 100):.2f}%")
    
    print("\n3. Naive Approach:")
    all_dates = sample['dates'] + sample['relative_dates']
    naive_pairs = naive_extraction(sample['entities_list'], all_dates, max_distance=25)
    print(f"Total pairs predicted: {len(naive_pairs)}")
    print(f"Total possible pairs: {len(sample['entities_list']) * len(all_dates)}")
    
    print("\n4. RelCAT Approach:")
    relcat_total = n_entities * (n_abs_dates + n_rel_dates)
    print(f"Total possible pairs: {relcat_total}")
    
    print("\n5. LLM Approach:")
    print("\nBinary method:")
    pairs = get_entity_date_pairs(sample['entities_list'], sample['dates'], sample['relative_dates'])
    print(f"Total pairs to evaluate: {len(pairs)}")
    print(f"Total possible pairs: {n_entities * (n_abs_dates + n_rel_dates)}")
    
    print("\nMulti method:")
    print(f"Total possible pairs: {n_entities * (n_abs_dates + n_rel_dates)}")
    
    print("\n=== Full Dataset Analysis ===")
    
    print("\n1. Create Training Dataset Approach:")
    total_entities = sum(len(s['entities_list']) for s in samples)
    total_abs_dates = sum(len(s['dates']) for s in samples)
    total_rel_dates = sum(len(s['relative_dates']) for s in samples)
    total_relations = sum(len(s['relations_json']) for s in samples)
    total_possible_pairs = sum(len(s['entities_list']) * (len(s['dates']) + len(s['relative_dates'])) for s in samples)
    
    print(f"Total entities across all documents: {total_entities}")
    print(f"Total absolute dates: {total_abs_dates}")
    print(f"Total relative dates: {total_rel_dates}")
    print(f"Total possible pairs: {total_possible_pairs}")
    print(f"Total relations: {total_relations}")
    print(f"Percentage positive class: {(total_relations / total_possible_pairs) * 100:.2f}%")
    
    print("\n2. BERT Training Approach:")
    df_full = create_training_pairs(samples)
    
    print("\nPosition-based pairs:")
    print(f"Total pairs: {len(df_full)}")
    print(f"Positive pairs: {len(df_full[df_full['label'] == 1])}")
    print(f"Negative pairs: {len(df_full[df_full['label'] == 0])}")
    print(f"Percentage positive: {(len(df_full[df_full['label'] == 1]) / len(df_full) * 100):.2f}%")
    
    unique_pairs_full = set()
    unique_positive_pairs_full = set()
    for _, row in df_full.iterrows():
        try:
            e1_start = row['marked_text'].find('[E1]')
            e1_end = row['marked_text'].find('[/E1]')
            e2_start = row['marked_text'].find('[E2]')
            e2_end = row['marked_text'].find('[/E2]')
            
            if all(pos != -1 for pos in [e1_start, e1_end, e2_start, e2_end]):
                entity = row['marked_text'][e1_start+4:e1_end].strip()
                date = row['marked_text'][e2_start+4:e2_end].strip()
                pair = (entity, date)
                unique_pairs_full.add(pair)
                if row['label'] == 1:
                    unique_positive_pairs_full.add(pair)
        except Exception:
            continue
    
    print("\nUnique pairs:")
    print(f"Total unique pairs: {len(unique_pairs_full)}")
    print(f"Unique positive pairs: {len(unique_positive_pairs_full)}")
    print(f"Unique negative pairs: {len(unique_pairs_full) - len(unique_positive_pairs_full)}")
    print(f"Percentage positive: {(len(unique_positive_pairs_full) / len(unique_pairs_full) * 100):.2f}%")
    
    print("\n3. Naive Approach:")
    naive_pairs_full = []
    for s in samples:
        all_dates = s['dates'] + s['relative_dates']
        pairs = naive_extraction(s['entities_list'], all_dates, max_distance=25)
        naive_pairs_full.extend(pairs)
    print(f"Total pairs predicted: {len(naive_pairs_full)}")
    print(f"Total possible pairs: {total_possible_pairs}")
    
    print("\n4. RelCAT Approach:")
    relcat_total = total_possible_pairs
    print(f"Total possible pairs: {relcat_total}")
    
    print("\n5. LLM Approach:")
    print("\nBinary method:")
    llm_pairs_full = []
    for s in samples:
        pairs = get_entity_date_pairs(s['entities_list'], s['dates'], s['relative_dates'])
        llm_pairs_full.extend(pairs)
    print(f"Total pairs to evaluate: {len(llm_pairs_full)}")
    print(f"Total possible pairs: {total_possible_pairs}")
    
    print("\nMulti method:")
    print(f"Total possible pairs: {total_possible_pairs}")

# Run the comparison on the module-level `samples` (first document, then the full set).
test_dataset_statistics(samples)

In [None]:
#Test Entity Distribution Analysis
def test_entity_distribution():
    """Analyze the distribution and frequency of entities across all documents"""
    print("=== Entity Distribution Analysis ===")
    
    # Define CUIs to exclude (date-related)
    DATE_CUIS = {'118578006', '410671006', '410670007'}
    
    # Collect all entities across documents, grouped by CUI
    cui_entities = {}
    for sample in samples:
        for entity in sample['entities_list']:
            cui = entity.get('cui', 'N/A')
            if cui in DATE_CUIS:  # Skip date-related CUIs
                continue
            if cui not in cui_entities:
                cui_entities[cui] = {
                    'mentions': 0,
                    'values': set(),  # Track unique text representations
                    'examples': []    # Keep some examples of the text
                }
            cui_entities[cui]['mentions'] += 1
            cui_entities[cui]['values'].add(entity['value'])
            if len(cui_entities[cui]['examples']) < 3:  # Keep up to 3 examples
                if entity['value'] not in cui_entities[cui]['examples']:
                    cui_entities[cui]['examples'].append(entity['value'])
    
    # Sort by mention count
    sorted_cuis = sorted(cui_entities.items(), key=lambda x: x[1]['mentions'], reverse=True)
    
    # Print summary stats
    all_mentions = sum(data['mentions'] for data in cui_entities.values())
    print(f"Total CUIs: {len(cui_entities)}")
    print(f"Total mentions: {all_mentions}")
    print(f"Average mentions per CUI: {all_mentions/len(cui_entities):.1f}")
    
    # Print table in tab-separated format for easy copying to Word
    print("\nCopy the following table (including header row) and paste into Word:\n")
    print("Rank\tCount\tCUI\tExample terms")
    for rank, (cui, data) in enumerate(sorted_cuis[:20], 1):
        examples = ', '.join(data['examples'])
        print(f"{rank}\t{data['mentions']}\t{cui}\t{examples}")

# Run the analysis; it reads the module-level `samples` and only prints.
test_entity_distribution()