Imports

In [None]:
#imports
import sys
import os

# Resolve the sibling ``utils`` directory (one level above the current working
# directory) and put it at the front of the import search path, so the
# project-local modules imported below can be found. Skipped if the path is
# already registered, which keeps the cell safe to re-run.
utils_path = os.path.abspath(os.path.join(os.pardir, 'utils'))

if utils_path not in sys.path:
    sys.path[:0] = [utils_path]

from utils import load_data, prepare_all_samples
from bert_training import build_gold_lookup, get_label_for_pair
from bert_extractor import preprocess_input, mark_entities_full_text

Test BERT Pre-Processing & Utility Functions

In [7]:
# Small synthetic clinical note used by the hand-written checks below.
# The four sentences are joined with single spaces.
note_text = " ".join([
    "Patient diagnosed with asthma on 2024-08-02.",
    "Diabetes was ruled out on 2024-08-02.",
    "Family history of hypertension, last reviewed in 2022.",
    "Patient may have pneumonia, last seen on 2024-08-02.",
])

# Character spans (start inclusive, end exclusive) into note_text for the
# first entity mention and the first date mention.
entity = dict(start=23, end=29, value='asthma')
date = dict(start=33, end=43, value='2024-08-02')

print(f"Example note: {note_text}")
print(f"Entity: {entity}")
print(f"Date: {date}")

Example note: Patient diagnosed with asthma on 2024-08-02. Diabetes was ruled out on 2024-08-02. Family history of hypertension, last reviewed in 2022. Patient may have pneumonia, last seen on 2024-08-02.
Entity: {'start': 23, 'end': 29, 'value': 'asthma'}
Date: {'start': 33, 'end': 43, 'value': '2024-08-02'}


In [8]:
# Exercise build_gold_lookup on the gold links of the first prepared sample.
# NOTE(review): `samples` is not assigned in any cell visible in this notebook
# (load_data / prepare_all_samples are imported but never called here), so this
# cell fails with NameError on a fresh kernel. TODO: add a data-loading cell
# above this one that builds `samples`.
print("Testing build_gold_lookup...")
gold_map = build_gold_lookup(samples[0]['links_json'])
print("Gold map:", gold_map)

Testing build_gold_lookup...
Gold map: {('rheumatoid_arthritis', "16 Sep'24"), ('pituitary_adenoma', '12nd Sep 2024'), ('GERD', '17.12.24'), ('headache', '23rd Oct 2024')}


In [9]:
print("Testing get_label_for_pair...")
# gold_map holds (entity value, date value) string pairs, so the lookup has to
# be given values. The previous character-offset arguments (57, 311) could
# never match an entry, which made the "no_link" result meaningless.
# Negative control: a pair from the example note that is not in the gold set.
label = get_label_for_pair("asthma", "2024-08-02", gold_map)
print(f"Label: {label}")

# Positive control: any pair taken from the gold set itself must be labelled
# differently from the non-gold pair above.
if gold_map:
    gold_entity_value, gold_date_value = next(iter(gold_map))
    gold_label = get_label_for_pair(gold_entity_value, gold_date_value, gold_map)
    assert gold_label != label, (
        f"gold pair ({gold_entity_value!r}, {gold_date_value!r}) "
        f"was labelled {gold_label!r}, same as a non-gold pair"
    )

Testing get_label_for_pair...
Label: no_link


In [10]:
# Wrap the "asthma" span (23-29) and the first date span (33-43) of the
# example note with entity markers and show the result.
print("Testing mark_entities_full_text...")
marked = mark_entities_full_text(
    note_text,
    23, 29,  # entity start / end
    33, 43,  # date start / end
    "asthma",
    "2024-08-02",
)
print("Marked text:", marked)

Testing mark_entities_full_text...
Marked text: Patient diagnosed with [E1] asthma [/E1] on [E2] 2024-08-02 [/E2]. Diabetes was ruled out on 2024-08-02. Family history of hypertension, last reviewed in 2022. Patient may have pneumonia, last seen on 2024-08-02.


In [11]:
# Run the full preprocessing step on the example note with the example
# entity/date spans and display the marked-up text it produces.
preprocessed = preprocess_input(note_text, entity, date)
print("\nPreprocessed input:", preprocessed['marked_text'], sep="\n")


Preprocessed input:
Patient diagnosed with [E1] asthma [/E1] on [E2] 2024-08-02 [/E2]. Diabetes was ruled out on 2024-08-02. Family history of hypertension, last reviewed in 2022. Patient may have pneumonia, last seen on 2024-08-02.


In [12]:
# ============================================================================
# COMPREHENSIVE ENTITY MARKING AND PREPROCESSING TESTS (new schema)
# ============================================================================
# Loop variables are named sample_entity / sample_date on purpose: reusing
# `entity` / `date` would overwrite the example spans defined for `note_text`
# earlier in the notebook and break re-running the cells that use them.

# Test with the full first sample from our data
sample = samples[0]
print("=" * 60)
print("TESTING WITH FULL SAMPLE")
print("=" * 60)
print(f"Sample note length: {len(sample['note_text'])}")
print(f"Number of entities: {len(sample['entities_list'])}")
print(f"Number of dates: {len(sample['dates'])}")
print(f"Number of gold relationships: {len(sample['links_json'])}")

# The gold lookup depends only on the sample, so build it once up front
# instead of once per entity-date combination.
gold_set = build_gold_lookup(sample['links_json'])

# Test all entity-date combinations
print("\nTesting all entity-date combinations:")
for i, sample_entity in enumerate(sample['entities_list'], start=1):
    for j, sample_date in enumerate(sample['dates'], start=1):
        print(f"\n--- Combination {i}-{j}: {sample_entity['value']} + {sample_date['value']} ---")

        # Test preprocessing
        processed = preprocess_input(sample['note_text'], sample_entity, sample_date)

        # Show the marked text (truncated for readability)
        marked_text = processed['marked_text']
        print(f"Original text length: {len(sample['note_text'])}")
        print(f"Marked text length: {len(marked_text)}")

        # Show a snippet around the marked entities
        ent_start = processed['ent1_start']
        date_start = processed['ent2_start']

        # Find the context around both entities: 50 characters before the
        # earlier marker and 100 after the later one, clamped to the text.
        context_start = max(0, min(ent_start, date_start) - 50)
        context_end = min(len(marked_text), max(ent_start, date_start) + 100)
        context = marked_text[context_start:context_end]
        print(f"Context snippet: ...{context}...")

        # Test gold lookup (value-based)
        label = get_label_for_pair(sample_entity['value'], sample_date['value'], gold_set)
        print(f"Gold label: {label}")

# Test edge cases
print("\n" + "=" * 60)
print("TESTING EDGE CASES")
print("=" * 60)

# Pair the first listed entity with the last listed date
print("Testing entities at text boundaries...")
first_entity = sample['entities_list'][0]
last_date = sample['dates'][-1]

processed_edge = preprocess_input(sample['note_text'], first_entity, last_date)
print(f"First entity position: {first_entity['start']}-{first_entity['end']}")
print(f"Last date position: {last_date['start']}-{last_date['end']}")

# Show beginning and end of marked text
print(f"Marked text start: {processed_edge['marked_text'][:100]}...")
print(f"Marked text end: ...{processed_edge['marked_text'][-100:]}")

# Test for potential overlapping entities
print("\nTesting for potential overlapping entities...")
for sample_entity in sample['entities_list']:
    for sample_date in sample['dates']:
        if abs(sample_entity['start'] - sample_date['start']) < 10:  # Close entities
            print(f"Close entities found: {sample_entity['value']} at {sample_entity['start']}, {sample_date['value']} at {sample_date['start']}")
            processed_close = preprocess_input(sample['note_text'], sample_entity, sample_date)
            close_marked = processed_close['marked_text']

            # The spans are offsets into the unmarked note. Clamp the start at 0
            # (a negative slice start would wrap around to the end of the text),
            # cover whichever span comes first/last, and widen the end by the
            # number of characters the markers added.
            marker_overhead = max(0, len(close_marked) - len(sample['note_text']))
            snippet_start = max(0, min(sample_entity['start'], sample_date['start']) - 20)
            snippet_end = max(sample_entity['end'], sample_date['end']) + marker_overhead + 20
            print(f"Marked text: {close_marked[snippet_start:snippet_end]}")

# Test gold relationship mapping (value pairs)
print("\n" + "=" * 60)
print("TESTING GOLD RELATIONSHIP MAPPING")
print("=" * 60)

print(f"Gold set size: {len(gold_set)}")

# Show each gold relationship
for rel in sample['links_json']:
    print(f"Gold relationship: {rel['entity']} <-> {rel['date']}")

TESTING WITH FULL SAMPLE
Sample note length: 1319
Number of entities: 64
Number of dates: 6
Number of gold relationships: 4

Testing all entity-date combinations:

--- Combination 1-1: history of meningitis + 30nd Jun 2024 ---
Original text length: 1319
Marked text length: 1341
Context snippet: ...Ultrasound ([E2] 30nd Jun 2024 [/E2]): no significant findings.imp: asthma

She denies any nausea, vomiting, or diarrhea.
C Patient reports compliance with current medication regimen. Basic metabolic panel within normal limits with sodium 140, potassium 4.2, creatinine 0.9.
Patient is afebrile with normal vital signs. T (02nd Aug 2024): reveals asthma.imp: asthma

X-ray (12nd Sep 2024): shows 3.1cm mass in brain.imp: pituitary_adenoma

CLINIC VISIT (16 Sep'24): nausea/vomiting worsening confirmed rheumatoid_arthritis switch to aspirin

Past medical history is non-contributory.
URGENT REVIEW (23rd Oct 2024): headache x1 day.r Will order additional laboratory studies at next visit if symptoms p