In [9]:
# Cell 1: Imports (unchanged)
import pandas as pd
import sys
import os
import json
from transformers import AutoTokenizer

# Make the sibling ``utils`` directory importable. The path is resolved
# relative to the current working directory (os.getcwd()), so this only finds
# the helpers when the kernel runs one level below the folder holding
# ``utils``. This must run before the project imports that follow.
utils_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'utils'))
# Guard against inserting the same entry again when the cell is re-run.
if utils_path not in sys.path:
    sys.path.insert(0, utils_path)

from general_utils import (
    parse_jsonish,
    load_data,
    prepare_sample,
    prepare_all_samples,
    get_entity_date_pairs,
    calculate_metrics
)

from bert_training_utils import create_training_pairs
from naive_extractor_utils import naive_extraction

In [10]:
# Cell 2: Test load_data
def test_load_data():
    """Test loading and parsing of training dataset"""
    df = load_data("../data/training_dataset.csv")
    
    print("Dataset Overview:")
    print(f"Number of documents: {len(df)}")
    print("\nColumns present:")
    for col in df.columns:
        print(f"- {col}")
    
    # Check first row
    first_row = df.iloc[0]
    print("\nFirst row contents:")
    print(f"Document ID: {first_row.get('doc_id')}")
    print(f"Text length: {len(first_row['note_text'])} characters")
    print(f"Number of entities: {len(first_row['entities_json'])}")
    print(f"Number of dates: {len(first_row['dates_json'])}")
    
    # Sample of parsed content
    print("\nSample entities (first 3):")
    for e in first_row['entities_json'][:3]:
        print(f"- {e['value']} (Position: {e['start']}-{e['end']})")
    
    print("\nSample dates (first 3):")
    for d in first_row['dates_json'][:3]:
        print(f"- {d['value']} (Position: {d['start']}-{d['end']})")
    
    return df

# Run the check and keep the loaded frame as the notebook-level `df`;
# the prepare_sample and prepare_all_samples cells read it as a global.
df = test_load_data()

Dataset Overview:
Number of documents: 119

Columns present:
- doc_id
- note_text
- entities_json
- dates_json
- relative_dates_json
- relations_json

First row contents:
Document ID: 26342
Text length: 5643 characters
Number of entities: 117
Number of dates: 27

Sample entities (first 3):
- LYMPHOCYTES (Position: 4748-4759)
- createnine (Position: 1611-1621)
- gliclazide (Position: 1862-1872)

Sample dates (first 3):
-  11/9/2019 (Position: 1773-1783)
- Dec 2018 (Position: 1629-1637)
- 10/09/2019 (Position: 4482-4492)


In [11]:
# Cell 3: Test prepare_sample
def test_prepare_sample():
    """Test preparation of a single sample"""
    # Get first row
    row = df.iloc[0]
    
    # Prepare sample
    note_text, entities_list, dates = prepare_sample(row)
    
    print("Sample Preparation Results:")
    print(f"\nText length: {len(note_text)} characters")
    print(f"Number of entities: {len(entities_list)}")
    print(f"Number of dates: {len(dates)}")
    
    print("\nFirst 3 entities:")
    for e in entities_list[:3]:
        print(f"- {e['value']} (Position: {e['start']}-{e['end']})")
    
    print("\nFirst 3 dates:")
    for d in dates[:3]:
        print(f"- {d['value']} (Position: {d['start']}-{d['end']})")
    
    return note_text, entities_list, dates

# Run the check and keep the first row's prepared pieces at notebook level.
note_text, entities_list, dates = test_prepare_sample()

Sample Preparation Results:

Text length: 5643 characters
Number of entities: 117
Number of dates: 27

First 3 entities:
- LYMPHOCYTES (Position: 4748-4759)
- createnine (Position: 1611-1621)
- gliclazide (Position: 1862-1872)

First 3 dates:
-  11/9/2019 (Position: 1773-1783)
- Dec 2018 (Position: 1629-1637)
- 10/09/2019 (Position: 4482-4492)


In [12]:
# Cell 4: Test prepare_all_samples
def test_prepare_all_samples():
    """Test preparation of all samples"""
    samples = prepare_all_samples(df)
    
    print("All Samples Preparation Results:")
    print(f"Number of samples prepared: {len(samples)}")
    
    # Check first sample
    first_sample = samples[0]
    print("\nFirst sample contents:")
    print(f"- doc_id: {first_sample['doc_id']}")
    print(f"- Text length: {len(first_sample['note_text'])} characters")
    print(f"- Number of entities: {len(first_sample['entities_list'])}")
    print(f"- Number of dates: {len(first_sample['dates'])}")
    print(f"- Number of relative dates: {len(first_sample['relative_dates'])}")
    
    # Print first few entities and dates
    print("\nFirst 3 entities:")
    for e in first_sample['entities_list'][:3]:
        print(f"- {e['value']} (Position: {e['start']}-{e['end']})")
    
    print("\nFirst 3 dates:")
    for d in first_sample['dates'][:3]:
        print(f"- {d['value']} (Position: {d['start']}-{d['end']})")
    
    print("\nFirst 3 relative dates:")
    for rd in first_sample['relative_dates'][:3]:
        print(f"- {rd['value']} (Position: {rd['start']}-{rd['end']})")
    
    return samples

# Run the check and keep the prepared samples as the notebook-level `samples`;
# the later cells index samples[0] or receive it as an argument.
samples = test_prepare_all_samples()

All Samples Preparation Results:
Number of samples prepared: 119

First sample contents:
- doc_id: 26342
- Text length: 5643 characters
- Number of entities: 117
- Number of dates: 27
- Number of relative dates: 5

First 3 entities:
- LYMPHOCYTES (Position: 4748-4759)
- createnine (Position: 1611-1621)
- gliclazide (Position: 1862-1872)

First 3 dates:
-  11/9/2019 (Position: 1773-1783)
- Dec 2018 (Position: 1629-1637)
- 10/09/2019 (Position: 4482-4492)

First 3 relative dates:
- last few month (Position: 196-210)
-  last 3 months (Position: 317-331)
-  start of 2018 (Position: 292-306)


In [13]:
# Cell 5: Test get_entity_date_pairs
def test_get_entity_date_pairs():
    """Test creation of entity-date pairs"""
    # Get first sample
    sample = samples[0]
    
    # Get pairs
    pairs = get_entity_date_pairs(
        sample['entities_list'],
        sample['dates'],
        sample['relative_dates']
    )
    
    print("Entity-Date Pairs Results:")
    print(f"Total pairs generated: {len(pairs)}")
    
    print("\nFirst 5 pairs:")
    for i, pair in enumerate(pairs[:5]):
        print(f"\nPair {i+1}:")
        print(f"Entity: {pair['entity_label']} ({pair['entity']['start']}-{pair['entity']['end']})")
        print(f"Date: {pair['date']} ({pair['date_info']['start']}-{pair['date_info']['end']})")
        print(f"Distance: {pair['distance']} chars")
        print(f"Date type: {pair['date_type']}")
    
    return pairs

# Run the check and keep the generated pairs at notebook level.
pairs = test_get_entity_date_pairs()

Entity-Date Pairs Results:
Total pairs generated: 3744

First 5 pairs:

Pair 1:
Entity: LYMPHOCYTES (4748-4759)
Date:  11/9/2019 (1773-1783)
Distance: 2975 chars
Date type: absolute

Pair 2:
Entity: LYMPHOCYTES (4748-4759)
Date: Dec 2018 (1629-1637)
Distance: 3119 chars
Date type: absolute

Pair 3:
Entity: LYMPHOCYTES (4748-4759)
Date: 10/09/2019 (4482-4492)
Distance: 266 chars
Date type: absolute

Pair 4:
Entity: LYMPHOCYTES (4748-4759)
Date: 10/09/2019 (4508-4518)
Distance: 240 chars
Date type: absolute

Pair 5:
Entity: LYMPHOCYTES (4748-4759)
Date: 10/09/2019 (4533-4543)
Distance: 215 chars
Date type: absolute


In [14]:
# Cell 6: Test Relative Date Handling
def test_relative_dates():
    """Test specific handling of relative dates in pairs"""
    
    # Get first sample
    sample = samples[0]
    
    print("Relative Date Analysis:")
    print(f"Total relative dates: {len(sample['relative_dates'])}")
    
    # Show all relative dates
    print("\nAll relative dates:")
    for rd in sample['relative_dates']:
        print(f"- {rd['value']} (Position: {rd['start']}-{rd['end']})")
    
    # Find pairs with relative dates
    pairs = get_entity_date_pairs(
        sample['entities_list'],
        sample['dates'],
        sample['relative_dates']
    )
    
    relative_pairs = [p for p in pairs if p['date_type'] == 'relative']
    print(f"\nPairs using relative dates: {len(relative_pairs)}")
    print("\nFirst 3 relative date pairs:")
    for p in relative_pairs[:3]:
        print(f"\nEntity: {p['entity_label']} ({p['entity']['start']}-{p['entity']['end']})")
        print(f"Date: {p['date']} ({p['date_info']['start']}-{p['date_info']['end']})")
        print(f"Distance: {p['distance']} chars")

# Run the check (it only prints; the function returns nothing).
test_relative_dates()

Relative Date Analysis:
Total relative dates: 5

All relative dates:
- last few month (Position: 196-210)
-  last 3 months (Position: 317-331)
-  start of 2018 (Position: 292-306)
- 2/7 (Position: 421-424)
- today (Position: 664-669)

Pairs using relative dates: 585

First 3 relative date pairs:

Entity: LYMPHOCYTES (4748-4759)
Date: last few month (196-210)
Distance: 4552 chars

Entity: LYMPHOCYTES (4748-4759)
Date:  last 3 months (317-331)
Distance: 4431 chars

Entity: LYMPHOCYTES (4748-4759)
Date:  start of 2018 (292-306)
Distance: 4456 chars


In [None]:
# Cell 7: Test Duplicate Relations
def test_duplicate_relations(samples):
    """Investigate potential duplicate relations in training pairs"""
    print("=== Test 9: Duplicate Relations Analysis ===")
    
    # Get first sample
    sample = samples[0]
    
    # Analyze relations
    print("\nOriginal Relations:")
    relations = sample['relations_json']
    print(f"Total relations in gold set: {len(relations)}")
    
    # Check for duplicates in relations
    entity_date_pairs = [(r['entity'], r['date']) for r in relations]
    unique_pairs = set(entity_date_pairs)
    print(f"Unique entity-date pairs: {len(unique_pairs)}")
    
    if len(entity_date_pairs) != len(unique_pairs):
        print("\nFound duplicate relations:")
        from collections import Counter
        duplicates = Counter(entity_date_pairs)
        for pair, count in duplicates.items():
            if count > 1:
                print(f"- {pair[0]} -> {pair[1]} (appears {count} times)")
    
    # Create training pairs and analyze
    df = create_training_pairs([sample])
    positive_pairs = df[df['label'] == 1]
    
    print("\nTraining Pairs Analysis:")
    print(f"Total pairs created: {len(df)}")
    print(f"Positive pairs: {len(positive_pairs)}")
    
    # Check if same entity-date pair appears multiple times
    pair_texts = [(row['marked_text'].split('[E1]')[1].split('[/E1]')[0].strip(),
                  row['marked_text'].split('[E2]')[1].split('[/E2]')[0].strip())
                  for _, row in positive_pairs.iterrows()]
    unique_pair_texts = set(pair_texts)
    
    print(f"Unique positive pairs: {len(unique_pair_texts)}")
    
    if len(pair_texts) != len(unique_pair_texts):
        print("\nFound duplicate pairs in training data:")
        duplicates = Counter(pair_texts)
        for pair, count in duplicates.items():
            if count > 1:
                print(f"- {pair[0]} -> {pair[1]} (appears {count} times)")

# Run the check on the notebook-level `samples` (only samples[0] is inspected).
test_duplicate_relations(samples)

=== Test 9: Duplicate Relations Analysis ===

Original Relations:
Total relations in gold set: 30
Unique entity-date pairs: 29

Found duplicate relations:
- insulin aspart biphasic -> 11/9/2019 (appears 2 times)

Training Pairs Analysis:
Total pairs created: 3744
Positive pairs: 349
Unique positive pairs: 29

Found duplicate pairs in training data:
- LYMPHOCYTES -> 10/09/2019 (appears 19 times)
- insulin aspart biphasic -> 11/9/2019 (appears 8 times)
- rosuvastatin -> 11/9/2019 (appears 4 times)
- linagliptin -> 11/9/2019 (appears 4 times)
- HAEMOGLOBIN -> 10/09/2019 (appears 19 times)
- NEUTROPHILS -> 10/09/2019 (appears 19 times)
- EOSINOPHILS -> 10/09/2019 (appears 19 times)
- CREATININE -> 10/09/2019 (appears 19 times)
- PLATELET -> 10/09/2019 (appears 19 times)
- PCO2 -> 10/09/2019 (appears 19 times)
- HCO3 -> 10/09/2019 (appears 19 times)
- APTT -> 10/09/2019 (appears 19 times)
- MCV -> 10/09/2019 (appears 19 times)
- CRP -> 10/09/2019 (appears 19 times)
- PH -> 10/09/2019 (appea

In [19]:
# Cell 8: Test Token Length and Distance Analysis
def test_text_lengths_and_distances():
    """Analyze document lengths and distances between entities/dates"""
    print("=== Text Length and Distance Analysis ===")
    
    # Get first sample
    sample = samples[0]
    
    # 1. Document Length Analysis
    print("\nDocument Length Analysis:")
    text_length = len(sample['note_text'])
    print(f"Document length: {text_length} characters")
    print(f"Average words per document: {len(sample['note_text'].split())} words")
    
    # 2. Distance Analysis for Relations
    print("\nDistance Analysis for Relations:")
    
    # Analyze all relations
    distances = []
    abs_distances = []
    rel_distances = []
    
    for relation in sample['relations_json']:
        # Find corresponding entity and date objects
        entity = next((e for e in sample['entities_list'] if str(e['id']) == str(relation['entity_id'])), None)
        date = next((d for d in sample['dates'] if str(d['id']) == str(relation['date_id'])), None)
        rel_date = next((rd for rd in sample['relative_dates'] if str(rd['id']) == str(relation['date_id'])), None)
        
        if entity:
            if date:
                distance = abs(entity['start'] - date['start'])
                distances.append(distance)
                abs_distances.append(distance)
            elif rel_date:
                distance = abs(entity['start'] - rel_date['start'])
                distances.append(distance)
                rel_distances.append(distance)
    
    print("\nAll Relations:")
    print(f"Total relations: {len(distances)}")
    if distances:
        print(f"Mean distance: {sum(distances)/len(distances):.1f} chars")
        print(f"Min distance: {min(distances)} chars")
        print(f"Max distance: {max(distances)} chars")
    
    print("\nAbsolute Date Relations:")
    print(f"Total relations: {len(abs_distances)}")
    if abs_distances:
        print(f"Mean distance: {sum(abs_distances)/len(abs_distances):.1f} chars")
        print(f"Min distance: {min(abs_distances)} chars")
        print(f"Max distance: {max(abs_distances)} chars")
    
    print("\nRelative Date Relations:")
    print(f"Total relations: {len(rel_distances)}")
    if rel_distances:
        print(f"Mean distance: {sum(rel_distances)/len(rel_distances):.1f} chars")
        print(f"Min distance: {min(rel_distances)} chars")
        print(f"Max distance: {max(rel_distances)} chars")
    
    # Show examples of closest and furthest pairs
    if distances:
        print("\nExample Relations:")
        closest_idx = distances.index(min(distances))
        furthest_idx = distances.index(max(distances))
        
        # Get relation objects
        closest_relation = sample['relations_json'][closest_idx]
        furthest_relation = sample['relations_json'][furthest_idx]
        
        # Find entities and dates for these relations
        for relation, distance_type in [(closest_relation, "Closest"), (furthest_relation, "Furthest")]:
            entity = next((e for e in sample['entities_list'] if str(e['id']) == str(relation['entity_id'])), None)
            date = next((d for d in sample['dates'] if str(d['id']) == str(relation['date_id'])), None)
            rel_date = next((rd for rd in sample['relative_dates'] if str(rd['id']) == str(relation['date_id'])), None)
            
            if entity and (date or rel_date):
                date_obj = date if date else rel_date
                distance = abs(entity['start'] - date_obj['start'])
                print(f"\n{distance_type} Pair:")
                print(f"Distance: {distance} chars")
                print(f"Entity: {entity['value']} (Position: {entity['start']}-{entity['end']})")
                print(f"Date: {date_obj['value']} (Position: {date_obj['start']}-{date_obj['end']})")
                
# Run the analysis; it takes no arguments and reads the notebook-level `samples`.
test_text_lengths_and_distances()

=== Text Length and Distance Analysis ===

Document Length Analysis:
Document length: 5643 characters
Average words per document: 858 words

Distance Analysis for Relations:

All Relations:
Total relations: 30
Mean distance: 17.8 chars
Min distance: 8 chars
Max distance: 60 chars

Absolute Date Relations:
Total relations: 23
Mean distance: 12.9 chars
Min distance: 8 chars
Max distance: 21 chars

Relative Date Relations:
Total relations: 7
Mean distance: 33.9 chars
Min distance: 8 chars
Max distance: 60 chars

Example Relations:

Closest Pair:
Distance: 8 chars
Entity: CT scan (Position: 656-663)
Date: today (Position: 664-669)

Furthest Pair:
Distance: 60 chars
Entity: ventriculomegaly (Position: 724-740)
Date: today (Position: 664-669)


In [None]:
# Cell 9: Test Metrics Calculation
def test_metrics_calculation(samples):
    """Compare metrics calculation with position-based vs unique pairs"""
    print("=== Test 10: Metrics Calculation Analysis ===")
    
    # Get first sample
    sample = samples[0]
    
    # Create training pairs (position-based)
    df = create_training_pairs([sample])
    
    # Analyze position-based predictions
    print("\nPosition-based Training Pairs:")
    print(f"Total pairs: {len(df)}")
    print(f"Positive pairs: {len(df[df['label'] == 1])}")
    print(f"Negative pairs: {len(df[df['label'] == 0])}")
    
    # Convert to unique entity-date pairs
    unique_pairs = set()
    unique_positive_pairs = set()
    for _, row in df.iterrows():
        try:
            # Find all marker positions
            e1_start = row['marked_text'].find('[E1]')
            e1_end = row['marked_text'].find('[/E1]')
            e2_start = row['marked_text'].find('[E2]')
            e2_end = row['marked_text'].find('[/E2]')
            
            # Only process if all markers are found
            if all(pos != -1 for pos in [e1_start, e1_end, e2_start, e2_end]):
                entity = row['marked_text'][e1_start+4:e1_end].strip()
                date = row['marked_text'][e2_start+4:e2_end].strip()
                pair = (entity, date)
                unique_pairs.add(pair)
                if row['label'] == 1:
                    unique_positive_pairs.add(pair)
        except Exception as e:
            print(f"Warning: Could not process row due to {str(e)}")
            continue
    
    print("\nUnique Entity-Date Pairs:")
    print(f"Total unique pairs: {len(unique_pairs)}")
    print(f"Unique positive pairs: {len(unique_positive_pairs)}")
    print(f"Unique negative pairs: {len(unique_pairs) - len(unique_positive_pairs)}")
    
    # Show example of how same pair appears in different positions
    if unique_positive_pairs:
        print("\nExample of position variations for same pair:")
        example_pair = next(iter(unique_positive_pairs))
        positions = []
        for _, row in df[df['label'] == 1].iterrows():
            try:
                entity = row['marked_text'][row['marked_text'].find('[E1]')+4:row['marked_text'].find('[/E1]')].strip()
                date = row['marked_text'][row['marked_text'].find('[E2]')+4:row['marked_text'].find('[/E2]')].strip()
                if (entity, date) == example_pair:
                    positions.append({
                        'entity_pos': (row['ent1_start'], row['ent1_end']),
                        'date_pos': (row['ent2_start'], row['ent2_end']),
                        'distance': row['distance']
                    })
            except Exception:
                continue
        
        print(f"\nPositions for pair '{example_pair[0]} -> {example_pair[1]}':")
        for pos in positions:
            print(f"- Entity at {pos['entity_pos']}, Date at {pos['date_pos']}, Distance: {pos['distance']} chars")
    
    print("\nMetrics Calculation:")
    
    # Position-based predictions
    position_predictions = [
        {'entity_label': row['marked_text'][row['marked_text'].find('[E1]')+4:row['marked_text'].find('[/E1]')].strip(),
         'date': row['marked_text'][row['marked_text'].find('[E2]')+4:row['marked_text'].find('[/E2]')].strip()}
        for _, row in df[df['label'] == 1].iterrows()
    ]
    
    print("\nMetrics when using all position-based pairs:")
    metrics = calculate_metrics(position_predictions, pd.DataFrame([sample]))
    print(f"Precision: {metrics['precision']:.3f}")
    print(f"Recall: {metrics['recall']:.3f}")
    print(f"F1: {metrics['f1']:.3f}")
    print(f"True Positives: {metrics['tp']}")
    print(f"False Positives: {metrics['fp']}")
    print(f"False Negatives: {metrics['fn']}")
    
    # Unique pair predictions
    unique_predictions = [
        {'entity_label': entity, 'date': date}
        for entity, date in unique_positive_pairs
    ]
    
    print("\nMetrics when using unique pairs:")
    metrics = calculate_metrics(unique_predictions, pd.DataFrame([sample]))
    print(f"Precision: {metrics['precision']:.3f}")
    print(f"Recall: {metrics['recall']:.3f}")
    print(f"F1: {metrics['f1']:.3f}")
    print(f"True Positives: {metrics['tp']}")
    print(f"False Positives: {metrics['fp']}")
    print(f"False Negatives: {metrics['fn']}")

# Run the comparison on the notebook-level `samples` (only samples[0] is used).
test_metrics_calculation(samples)

=== Test 10: Metrics Calculation Analysis ===

Position-based Training Pairs:
Total pairs: 3744
Positive pairs: 349
Negative pairs: 3395

Unique Entity-Date Pairs:
Total unique pairs: 931
Unique positive pairs: 29
Unique negative pairs: 902

Example of position variations for same pair:

Positions for pair 'rosuvastatin -> 11/9/2019':
- Entity at (3258, 3270), Date at (2561, 2570), Distance: 697 chars
- Entity at (3258, 3270), Date at (2735, 2744), Distance: 523 chars
- Entity at (3258, 3270), Date at (3150, 3159), Distance: 108 chars
- Entity at (3258, 3270), Date at (3247, 3256), Distance: 11 chars

Metrics Calculation:

Metrics when using all position-based pairs:
Precision: 0.897
Recall: 0.897
F1: 0.897
True Positives: 26
False Positives: 3
False Negatives: 3

Metrics when using unique pairs:
Precision: 0.897
Recall: 0.897
F1: 0.897
True Positives: 26
False Positives: 3
False Negatives: 3


In [18]:
# Cell 10: Test Dataset Statistics
def test_dataset_statistics(samples):
    """Compare dataset statistics across different approaches"""
    print("=== Dataset Statistics Comparison ===")
    
    print("\n=== Single Document Analysis ===")
    sample = samples[0]
    
    print("\n1. Create Training Dataset Approach:")
    n_entities = len(sample['entities_list'])
    n_abs_dates = len(sample['dates'])
    n_rel_dates = len(sample['relative_dates'])
    n_relations = len(sample['relations_json'])
    total_possible = n_entities * (n_abs_dates + n_rel_dates)
    
    print(f"Number of entities: {n_entities}")
    print(f"Number of absolute dates: {n_abs_dates}")
    print(f"Number of relative dates: {n_rel_dates}")
    print(f"Total possible unique pairs: {total_possible}")
    print(f"Number of actual relations: {n_relations}")
    print(f"Percentage positive class: {(n_relations / total_possible) * 100:.2f}%")
    
    print("\n2. BERT Training Approach:")
    df = create_training_pairs([sample])
    
    print("\nPosition-based pairs:")
    print(f"Total pairs: {len(df)}")
    print(f"Positive pairs: {len(df[df['label'] == 1])}")
    print(f"Negative pairs: {len(df[df['label'] == 0])}")
    print(f"Percentage positive: {(len(df[df['label'] == 1]) / len(df) * 100):.2f}%")
    
    unique_pairs = set()
    unique_positive_pairs = set()
    for _, row in df.iterrows():
        try:
            e1_start = row['marked_text'].find('[E1]')
            e1_end = row['marked_text'].find('[/E1]')
            e2_start = row['marked_text'].find('[E2]')
            e2_end = row['marked_text'].find('[/E2]')
            
            if all(pos != -1 for pos in [e1_start, e1_end, e2_start, e2_end]):
                entity = row['marked_text'][e1_start+4:e1_end].strip()
                date = row['marked_text'][e2_start+4:e2_end].strip()
                pair = (entity, date)
                unique_pairs.add(pair)
                if row['label'] == 1:
                    unique_positive_pairs.add(pair)
        except Exception:
            continue
    
    print("\nUnique pairs:")
    print(f"Total unique pairs: {len(unique_pairs)}")
    print(f"Unique positive pairs: {len(unique_positive_pairs)}")
    print(f"Unique negative pairs: {len(unique_pairs) - len(unique_positive_pairs)}")
    print(f"Percentage positive: {(len(unique_positive_pairs) / len(unique_pairs) * 100):.2f}%")
    
    print("\n3. Naive Approach:")
    all_dates = sample['dates'] + sample['relative_dates']
    naive_pairs = naive_extraction(sample['entities_list'], all_dates, max_distance=25)
    print(f"Total pairs predicted: {len(naive_pairs)}")
    print(f"Total possible pairs: {len(sample['entities_list']) * len(all_dates)}")
    
    print("\n4. RelCAT Approach:")
    relcat_total = n_entities * (n_abs_dates + n_rel_dates)
    print(f"Total possible pairs: {relcat_total}")
    
    print("\n5. LLM Approach:")
    print("\nBinary method:")
    pairs = get_entity_date_pairs(sample['entities_list'], sample['dates'], sample['relative_dates'])
    print(f"Total pairs to evaluate: {len(pairs)}")
    print(f"Total possible pairs: {n_entities * (n_abs_dates + n_rel_dates)}")
    
    print("\nMulti method:")
    print(f"Total possible pairs: {n_entities * (n_abs_dates + n_rel_dates)}")
    
    print("\n=== Full Dataset Analysis ===")
    
    print("\n1. Create Training Dataset Approach:")
    total_entities = sum(len(s['entities_list']) for s in samples)
    total_abs_dates = sum(len(s['dates']) for s in samples)
    total_rel_dates = sum(len(s['relative_dates']) for s in samples)
    total_relations = sum(len(s['relations_json']) for s in samples)
    total_possible_pairs = sum(len(s['entities_list']) * (len(s['dates']) + len(s['relative_dates'])) for s in samples)
    
    print(f"Total entities across all documents: {total_entities}")
    print(f"Total absolute dates: {total_abs_dates}")
    print(f"Total relative dates: {total_rel_dates}")
    print(f"Total possible pairs: {total_possible_pairs}")
    print(f"Total relations: {total_relations}")
    print(f"Percentage positive class: {(total_relations / total_possible_pairs) * 100:.2f}%")
    
    print("\n2. BERT Training Approach:")
    df_full = create_training_pairs(samples)
    
    print("\nPosition-based pairs:")
    print(f"Total pairs: {len(df_full)}")
    print(f"Positive pairs: {len(df_full[df_full['label'] == 1])}")
    print(f"Negative pairs: {len(df_full[df_full['label'] == 0])}")
    print(f"Percentage positive: {(len(df_full[df_full['label'] == 1]) / len(df_full) * 100):.2f}%")
    
    unique_pairs_full = set()
    unique_positive_pairs_full = set()
    for _, row in df_full.iterrows():
        try:
            e1_start = row['marked_text'].find('[E1]')
            e1_end = row['marked_text'].find('[/E1]')
            e2_start = row['marked_text'].find('[E2]')
            e2_end = row['marked_text'].find('[/E2]')
            
            if all(pos != -1 for pos in [e1_start, e1_end, e2_start, e2_end]):
                entity = row['marked_text'][e1_start+4:e1_end].strip()
                date = row['marked_text'][e2_start+4:e2_end].strip()
                pair = (entity, date)
                unique_pairs_full.add(pair)
                if row['label'] == 1:
                    unique_positive_pairs_full.add(pair)
        except Exception:
            continue
    
    print("\nUnique pairs:")
    print(f"Total unique pairs: {len(unique_pairs_full)}")
    print(f"Unique positive pairs: {len(unique_positive_pairs_full)}")
    print(f"Unique negative pairs: {len(unique_pairs_full) - len(unique_positive_pairs_full)}")
    print(f"Percentage positive: {(len(unique_positive_pairs_full) / len(unique_pairs_full) * 100):.2f}%")
    
    print("\n3. Naive Approach:")
    naive_pairs_full = []
    for s in samples:
        all_dates = s['dates'] + s['relative_dates']
        pairs = naive_extraction(s['entities_list'], all_dates, max_distance=25)
        naive_pairs_full.extend(pairs)
    print(f"Total pairs predicted: {len(naive_pairs_full)}")
    print(f"Total possible pairs: {total_possible_pairs}")
    
    print("\n4. RelCAT Approach:")
    relcat_total = total_possible_pairs
    print(f"Total possible pairs: {relcat_total}")
    
    print("\n5. LLM Approach:")
    print("\nBinary method:")
    llm_pairs_full = []
    for s in samples:
        pairs = get_entity_date_pairs(s['entities_list'], s['dates'], s['relative_dates'])
        llm_pairs_full.extend(pairs)
    print(f"Total pairs to evaluate: {len(llm_pairs_full)}")
    print(f"Total possible pairs: {total_possible_pairs}")
    
    print("\nMulti method:")
    print(f"Total possible pairs: {total_possible_pairs}")

# Run the comparison on the notebook-level `samples`: samples[0] feeds the
# single-document section, the whole collection the full-dataset section.
test_dataset_statistics(samples)

=== Dataset Statistics Comparison ===

=== Single Document Analysis ===

1. Create Training Dataset Approach:
Number of entities: 117
Number of absolute dates: 27
Number of relative dates: 5
Total possible unique pairs: 3744
Number of actual relations: 30
Percentage positive class: 0.80%

2. BERT Training Approach:

Position-based pairs:
Total pairs: 3744
Positive pairs: 349
Negative pairs: 3395
Percentage positive: 9.32%

Unique pairs:
Total unique pairs: 931
Unique positive pairs: 29
Unique negative pairs: 902
Percentage positive: 3.11%

3. Naive Approach:
Total pairs predicted: 39
Total possible pairs: 3744

4. RelCAT Approach:
Total possible pairs: 3744

5. LLM Approach:

Binary method:
Total pairs to evaluate: 3744
Total possible pairs: 3744

Multi method:
Total possible pairs: 3744

=== Full Dataset Analysis ===

1. Create Training Dataset Approach:
Total entities across all documents: 4944
Total absolute dates: 620
Total relative dates: 342
Total possible pairs: 47219
Total rela