Setup

In [1]:
#Imports
import pandas as pd
import os
import sys
import json

In [2]:
# Add project root to Python path
# Assuming the notebook is in a subdirectory of the project root (e.g., notebooks/)
try:
    # Get the absolute path of the current notebook (if running in an environment that supports it)
    notebook_path = os.path.abspath(__file__) # Fails in some interactive environments
except NameError:
    # Fallback for interactive environments like Jupyter
    notebook_path = os.path.abspath('.')

project_root = os.path.dirname(notebook_path) # If notebook is in root
if os.path.basename(project_root) == 'notebooks': # Check if we are in the 'notebooks' subdir
    project_root = os.path.dirname(project_root) # Go one level up to the actual project root

if project_root not in sys.path:
    sys.path.insert(0, project_root)
    print(f"Project root added to path: {project_root}")
else:
    print(f"Project root already in path: {project_root}")

Project root added to path: c:\Users\Satyam\Downloads\pituitary_adenoma


Config

In [3]:
# Inspect the active configuration.
# NOTE(review): this import presumably relies on the project root having been added to sys.path.
import config

for label, attr_name in (
    ("Using device", "DEVICE"),
    ("Using extraction method", "EXTRACTION_METHOD"),
    ("Using run mode", "RUN_MODE"),
    ("Dataset path", "DATASET_PATH"),
):
    print(f"{label}: {getattr(config, attr_name)}")

Using device: cpu
Using extraction method: llm
Using run mode: single
Dataset path: data/synthetic_data.json


Extraction on Single Note

In [4]:
# Load the bundled sample clinical note and display it.
# `text` is the input used by the single-note extraction cells.
from data.sample_note import CLINICAL_NOTE
text=CLINICAL_NOTE
text

"\nInitialPresentation (15-01-2023)[date]  \npt presents w severe HA x2/52,getting worse.no relief w/regular painkillers\nimp: chronic_migraine[diagnosis]vs tension_headache[diagnosis]??? \nplan- start sumatriptan,f/u 2/52\n\nURGENT REVIEW(20.01.23)[date]:pt called- new onset visual symps.  \nrapid worsening,Now with diplopia.urgent referral to ophth\n-> ?pituitary mass vs IIH[diagnosis]\n\nophth notes(Jan 25th 2023)[date]  \nbilat_papilloedema[diagnosis]+++\nvf loss temp sides\nurgent mri requested..   \n\nradiology(02/02/2023 @1445)[date]\nfindings:large sellar/suprasellar mass 2.8x3.1x2.9cm\nimp:macroadenoma[diagnosis],severe mass effect\nneeds urgent endo r/v\n\nendocrine assessment (3rd Feb'23)[date]\nhormonal panel results:\nprolactin>2000\ncortisol-low\nimp:giant_prolactinoma[diagnosis] w/partial_hypopituitarism[diagnosis]\nstarted cab 0.25mg 2x/wk\nstress dose steroids given\n\nf/u (17/2/23)[date]-phone\nHA better.no more diplopia\ncont cab\n\nclinic rv - (2023-03-15)[date]\ndo

In [7]:
# Base extractor interface:
# - extract() takes the note text and, optionally, the entities as a (diagnoses, dates) tuple.
# - It returns a list of dictionaries, each holding a diagnosis and a date.
# - The date is returned as a string.

In [5]:
# Start by extracting the entities from the sample note.
# extract_entities() returns a (diagnoses, dates) pair; parse_date_string is imported here
# for use by the prediction loop on the synthetic data.
from utils.extraction_utils import extract_entities, parse_date_string
diagnoses, dates = extract_entities(text)
print(f"Found {len(diagnoses)} diagnoses and {len(dates)} dates")

Found 10 diagnoses and 9 dates


In [6]:
# Inspect the extracted entities.
# NOTE(review): the trailing integer in each tuple looks like a character offset into the note — confirm.
diagnoses, dates

([('chronic_migraine', 123),
  ('tension_headache', 153),
  ('iih', 367),
  ('bilat_papilloedema', 418),
  ('macroadenoma', 588),
  ('giant_prolactinoma', 750),
  ('partial_hypopituitarism', 782),
  ('tumor', 1118),
  ('adrenal_insufficiency', 1213),
  ('adrenal_insufficiency', 1334)],
 [('2023-01-15', '15-01-2023', 22),
  ('2023-01-20', '20.01.23', 233),
  ('2023-02-02', '02/02/2023 @1445', 507),
  ('2023-02-03', "3rd Feb'23", 676),
  ('2023-02-17', '17/2/23', 875),
  ('2023-03-15', '2023-03-15', 946),
  ('2023-04-12', '12 apr 23', 1044),
  ('2023-05-15', '15-5-2023', 1166),
  ('2023-05-16', '16/5/23', 1300)])

In [7]:
# Create the extractor selected by config.EXTRACTION_METHOD; the config module itself is
# handed to the factory as the second argument.
from extractors.extractor_factory import create_extractor
extractor = create_extractor(config.EXTRACTION_METHOD, config)

In [None]:
# Custom extractor only: re-anchor its model and vocab paths at the project root.
if config.EXTRACTION_METHOD == 'custom':
    # Drop leading '/' characters first; os.path.join would otherwise discard project_root.
    relative_model_path, relative_vocab_path = (
        path.lstrip('/') for path in (extractor.model_path, extractor.vocab_path)
    )

    extractor.model_path = os.path.join(project_root, relative_model_path)
    extractor.vocab_path = os.path.join(project_root, relative_vocab_path)

In [11]:
# Load the extractor's resources (models, API clients, etc.) and report the outcome.
if extractor.load():
    print(f"Successfully loaded {extractor.name} extractor")
else:
    print(f"Failed to load {extractor.name} extractor. Check configuration and dependencies.")

LLM Extractor: OpenAI client initialized successfully.
Successfully loaded LLM (gpt-4o) extractor


In [12]:
# Run relationship extraction on the sample note, passing the pre-extracted entities,
# then report how many relationships came back and display them.
relationships = extractor.extract(text, entities=(diagnoses, dates))
print(f"Found {len(relationships)} relationships")
relationships

Found 9 relationships


[{'diagnosis': 'chronic_migraine', 'date': '2023-01-15', 'confidence': 0.9},
 {'diagnosis': 'tension_headache', 'date': '2023-01-15', 'confidence': 0.8},
 {'diagnosis': 'iih', 'date': '2023-01-20', 'confidence': 0.85},
 {'diagnosis': 'bilat_papilloedema', 'date': '2023-01-25', 'confidence': 0.9},
 {'diagnosis': 'macroadenoma', 'date': '2023-02-02', 'confidence': 0.95},
 {'diagnosis': 'giant_prolactinoma', 'date': '2023-02-03', 'confidence': 0.95},
 {'diagnosis': 'partial_hypopituitarism',
  'date': '2023-02-03',
  'confidence': 0.9},
 {'diagnosis': 'tumor', 'date': '2023-04-12', 'confidence': 0.9},
 {'diagnosis': 'adrenal_insufficiency',
  'date': '2023-05-16',
  'confidence': 0.95}]

In [13]:
# Reshape the relationships into (date, diagnosis) pairs ordered by date.
output_tuples = sorted(
    ((rel['date'], rel['diagnosis']) for rel in relationships),
    key=lambda pair: pair[0],
)
output_tuples

[('2023-01-15', 'chronic_migraine'),
 ('2023-01-15', 'tension_headache'),
 ('2023-01-20', 'iih'),
 ('2023-01-25', 'bilat_papilloedema'),
 ('2023-02-02', 'macroadenoma'),
 ('2023-02-03', 'giant_prolactinoma'),
 ('2023-02-03', 'partial_hypopituitarism'),
 ('2023-04-12', 'tumor'),
 ('2023-05-16', 'adrenal_insufficiency')]

Extraction on Synthetic Data

In [14]:
# Load the synthetic dataset referenced by the configuration.
dataset_path = config.DATASET_PATH
# Resolve the configured path against project_root.
absolute_dataset_path = os.path.join(project_root, dataset_path)
print(f"Attempting to open: {absolute_dataset_path}")
# Read as UTF-8 explicitly: without `encoding`, open() uses the platform's locale codec
# (e.g., cp1252 on Windows), which can mis-decode or reject non-ASCII characters in the notes.
with open(absolute_dataset_path, 'r', encoding='utf-8') as f:
    dataset = json.load(f)

dataset

Attempting to open: c:\Users\Satyam\Downloads\pituitary_adenoma\data/synthetic_data.json


[{'id': 0,
  'clinical_note': "Ultrasound (30nd Jun 2024)[date]: no significant findings.imp: asthma[dx]\n\nCT (02nd Aug 2024)[date]: reveals asthma[dx].imp: asthma[dx]\n\nX-ray (12nd Sep 2024)[date]: shows 3.1cm mass in brain.imp: pituitary_adenoma[dx]\n\nCLINIC VISIT (16 Sep'24)[date]: nausea/vomiting worsening confirmed rheumatoid_arthritis[dx] switch to aspirin\n\nURGENT REVIEW (23rd Oct 2024)[date]: headache x1 day.resolving pneumonia[dx]\n\nF/U (16st Nov 2024)[date]: improved.rule out pituitary_adenoma[dx] observe and reassess\n\nX-ray (17.12.24)[date]: reveals GERD[dx].imp: GERD[dx]",
  'ground_truth': [{'date': '2024-06-30',
    'date_position': 13,
    'diagnoses': [{'diagnosis': 'asthma', 'position': 64}]},
   {'date': '2024-08-02',
    'date_position': 5,
    'diagnoses': [{'diagnosis': 'asthma', 'position': 34},
     {'diagnosis': 'asthma', 'position': 58}]},
   {'date': '2024-09-12',
    'date_position': 8,
    'diagnoses': [{'diagnosis': 'pituitary_adenoma', 'position': 6

In [15]:
# Evaluate on a small subset to keep the run short.
num_test_samples = 10
if len(dataset) > num_test_samples:
    test_data = dataset[:num_test_samples]
else:
    # The dataset is already small enough; use it as is.
    test_data = dataset
print(f"Using {len(test_data)} samples for evaluation.")
test_data

Using 10 samples for evaluation.


[{'id': 0,
  'clinical_note': "Ultrasound (30nd Jun 2024)[date]: no significant findings.imp: asthma[dx]\n\nCT (02nd Aug 2024)[date]: reveals asthma[dx].imp: asthma[dx]\n\nX-ray (12nd Sep 2024)[date]: shows 3.1cm mass in brain.imp: pituitary_adenoma[dx]\n\nCLINIC VISIT (16 Sep'24)[date]: nausea/vomiting worsening confirmed rheumatoid_arthritis[dx] switch to aspirin\n\nURGENT REVIEW (23rd Oct 2024)[date]: headache x1 day.resolving pneumonia[dx]\n\nF/U (16st Nov 2024)[date]: improved.rule out pituitary_adenoma[dx] observe and reassess\n\nX-ray (17.12.24)[date]: reveals GERD[dx].imp: GERD[dx]",
  'ground_truth': [{'date': '2024-06-30',
    'date_position': 13,
    'diagnoses': [{'diagnosis': 'asthma', 'position': 64}]},
   {'date': '2024-08-02',
    'date_position': 5,
    'diagnoses': [{'diagnosis': 'asthma', 'position': 34},
     {'diagnosis': 'asthma', 'position': 58}]},
   {'date': '2024-09-12',
    'date_position': 8,
    'diagnoses': [{'diagnosis': 'pituitary_adenoma', 'position': 6

In [16]:
# Flatten the ground truth into one record per (note, diagnosis, date).
# note_id is the note's index within test_data; diagnoses are lower-cased.
gold_standard = [
    {'note_id': i, 'diagnosis': diag['diagnosis'].lower(), 'date': section['date']}
    for i, entry in enumerate(test_data)
    for section in entry['ground_truth']
    for diag in section['diagnoses']
]
print(f"Prepared gold standard with {len(gold_standard)} relationships.")

gold_standard

Prepared gold standard with 62 relationships.


[{'note_id': 0, 'diagnosis': 'asthma', 'date': '2024-06-30'},
 {'note_id': 0, 'diagnosis': 'asthma', 'date': '2024-08-02'},
 {'note_id': 0, 'diagnosis': 'asthma', 'date': '2024-08-02'},
 {'note_id': 0, 'diagnosis': 'pituitary_adenoma', 'date': '2024-09-12'},
 {'note_id': 0, 'diagnosis': 'rheumatoid_arthritis', 'date': '2024-09-16'},
 {'note_id': 0, 'diagnosis': 'pneumonia', 'date': '2024-10-23'},
 {'note_id': 0, 'diagnosis': 'pituitary_adenoma', 'date': '2024-11-16'},
 {'note_id': 0, 'diagnosis': 'gerd', 'date': '2024-12-17'},
 {'note_id': 0, 'diagnosis': 'gerd', 'date': '2024-12-17'},
 {'note_id': 1, 'diagnosis': 'multiple_sclerosis', 'date': '2024-09-27'},
 {'note_id': 1, 'diagnosis': 'bronchitis', 'date': '2024-12-26'},
 {'note_id': 1, 'diagnosis': 'multiple_sclerosis', 'date': '2025-01-26'},
 {'note_id': 1, 'diagnosis': 'tension_headache', 'date': '2025-01-29'},
 {'note_id': 1, 'diagnosis': 'gerd', 'date': '2025-02-16'},
 {'note_id': 1, 'diagnosis': 'rheumatoid_arthritis', 'date': 

In [17]:
# Extract the entities of every test note up front, keeping each note's text alongside them.
prepared_test_data = []
for entry in test_data:
    prepared_test_data.append({
        'text': entry['clinical_note'],
        'entities': extract_entities(entry['clinical_note']),
    })

prepared_test_data

[{'text': "Ultrasound (30nd Jun 2024)[date]: no significant findings.imp: asthma[dx]\n\nCT (02nd Aug 2024)[date]: reveals asthma[dx].imp: asthma[dx]\n\nX-ray (12nd Sep 2024)[date]: shows 3.1cm mass in brain.imp: pituitary_adenoma[dx]\n\nCLINIC VISIT (16 Sep'24)[date]: nausea/vomiting worsening confirmed rheumatoid_arthritis[dx] switch to aspirin\n\nURGENT REVIEW (23rd Oct 2024)[date]: headache x1 day.resolving pneumonia[dx]\n\nF/U (16st Nov 2024)[date]: improved.rule out pituitary_adenoma[dx] observe and reassess\n\nX-ray (17.12.24)[date]: reveals GERD[dx].imp: GERD[dx]",
  'entities': ([('asthma', 63),
    ('asthma', 109),
    ('asthma', 125),
    ('pituitary_adenoma', 197),
    ('rheumatoid_arthritis', 288),
    ('pneumonia', 395),
    ('pituitary_adenoma', 455),
    ('gerd', 531),
    ('gerd', 545)],
   [('2024-06-30', '30nd Jun 2024', 12),
    ('2024-08-02', '02nd Aug 2024', 79),
    ('2024-09-12', '12nd Sep 2024', 144),
    ('2024-09-16', "16 Sep'24", 234),
    ('2024-10-23', '23r

In [18]:
# Create a fresh extractor for the synthetic-data run and display the instance.
extractor = create_extractor(config.EXTRACTION_METHOD, config)
extractor

<extractors.llm_extractor.LLMExtractor at 0x222cb2cbf40>

In [19]:
# Custom extractor only: re-anchor its model and vocab paths at the project root.
if config.EXTRACTION_METHOD == 'custom':
    # Drop leading '/' characters first; os.path.join would otherwise discard project_root.
    relative_model_path, relative_vocab_path = (
        path.lstrip('/') for path in (extractor.model_path, extractor.vocab_path)
    )

    extractor.model_path = os.path.join(project_root, relative_model_path)
    extractor.vocab_path = os.path.join(project_root, relative_vocab_path)

In [20]:
# Load the extractor's resources (models, API clients, etc.) and report the outcome.
if extractor.load():
    print(f"Successfully loaded {extractor.name} extractor")
else:
    print(f"Failed to load {extractor.name} extractor. Check configuration and dependencies.")

LLM Extractor: OpenAI client initialized successfully.
Successfully loaded LLM (gpt-4o) extractor


In [21]:
# Generate predictions for every prepared note.
# Each relationship is normalized (date parsed, diagnosis stripped and lower-cased) before it
# is stored; relationships that cannot be normalized are counted in skipped_rels.
print(f"Generating predictions using {extractor.name}...")
all_predictions = []
skipped_rels = 0
for i, note_entry in enumerate(prepared_test_data):
    # Best effort per note: a failing extractor call is reported and the note is skipped.
    try:
        relationships = extractor.extract(note_entry['text'], entities=note_entry['entities'])
    except Exception as e:
        print(f"Extraction error on note {i}: {e}")
        continue

    for rel in relationships or []:
        # Normalize each relationship on its own so that one malformed item (missing key,
        # non-string value, ...) no longer discards the rest of the note's relationships.
        try:
            parsed_date = parse_date_string(rel['date'])
            normalized_diagnosis = rel['diagnosis'].strip().lower()
        except Exception as e:
            print(f"Skipping malformed relationship on note {i}: {e}")
            skipped_rels += 1
            continue

        if parsed_date and normalized_diagnosis:
            all_predictions.append({
                'note_id': i,  # index of the note within prepared_test_data
                'diagnosis': normalized_diagnosis,
                'date': parsed_date,
                'confidence': rel.get('confidence', 1.0),  # default when the extractor gives none
            })
        else:
            skipped_rels += 1
print(f"Generated {len(all_predictions)} predictions. Skipped {skipped_rels} due to parsing issues.")

Generating predictions using LLM (gpt-4o)...
Generated 53 predictions. Skipped 0 due to parsing issues.


In [22]:
# Inspect the normalized predictions (note_id, diagnosis, date, confidence).
all_predictions

[{'note_id': 0,
  'diagnosis': 'asthma',
  'date': '2024-08-02',
  'confidence': 0.9},
 {'note_id': 0,
  'diagnosis': 'pituitary_adenoma',
  'date': '2024-09-12',
  'confidence': 0.8},
 {'note_id': 0,
  'diagnosis': 'rheumatoid_arthritis',
  'date': '2024-09-16',
  'confidence': 0.9},
 {'note_id': 0,
  'diagnosis': 'pneumonia',
  'date': '2024-10-23',
  'confidence': 0.9},
 {'note_id': 0, 'diagnosis': 'gerd', 'date': '2024-12-17', 'confidence': 0.9},
 {'note_id': 1,
  'diagnosis': 'multiple_sclerosis',
  'date': '2024-09-27',
  'confidence': 0.9},
 {'note_id': 1,
  'diagnosis': 'bronchitis',
  'date': '2024-12-26',
  'confidence': 0.9},
 {'note_id': 1,
  'diagnosis': 'multiple_sclerosis',
  'date': '2025-01-26',
  'confidence': 0.9},
 {'note_id': 1,
  'diagnosis': 'tension_headache',
  'date': '2025-01-29',
  'confidence': 0.9},
 {'note_id': 1, 'diagnosis': 'gerd', 'date': '2025-02-16', 'confidence': 0.9},
 {'note_id': 1,
  'diagnosis': 'rheumatoid_arthritis',
  'date': '2025-03-29',
 

Evaluation (i.e., compare_with_gold_standard)

In [23]:
# Represent predictions and gold standard as sets of (note_id, diagnosis, date) tuples.
# Sets make the comparison cheap and collapse duplicate triples.
pred_set = {(p['note_id'], p['diagnosis'], p['date']) for p in all_predictions}
gold_set = {(g['note_id'], g['diagnosis'], g['date']) for g in gold_standard}

pred_set, gold_set

({(0, 'asthma', '2024-08-02'),
  (0, 'gerd', '2024-12-17'),
  (0, 'pituitary_adenoma', '2024-09-12'),
  (0, 'pneumonia', '2024-10-23'),
  (0, 'rheumatoid_arthritis', '2024-09-16'),
  (1, 'bronchitis', '2024-12-26'),
  (1, 'gerd', '2025-02-16'),
  (1, 'multiple_sclerosis', '2024-09-27'),
  (1, 'multiple_sclerosis', '2025-01-26'),
  (1, 'rheumatoid_arthritis', '2025-03-29'),
  (1, 'tension_headache', '2025-01-29'),
  (2, 'anemia', '2025-02-23'),
  (2, 'anemia', '2025-05-19'),
  (2, 'bronchitis', '2024-11-10'),
  (2, 'copd', '2025-05-07'),
  (2, 'depression', '2025-04-04'),
  (2, 'diabetes_mellitus', '2025-01-12'),
  (2, 'gerd', '2024-11-21'),
  (2, 'hyperlipidemia', '2025-05-07'),
  (2, 'ibs', '2025-05-19'),
  (2, 'osteoarthritis', '2024-10-04'),
  (3, 'copd', '2025-03-05'),
  (3, 'ibs', '2025-06-27'),
  (3, 'microadenoma', '2025-04-09'),
  (3, 'multiple_sclerosis', '2025-05-28'),
  (3, 'schizophrenia', '2025-02-13'),
  (3, 'stroke', '2025-04-09'),
  (4, 'diabetes_mellitus', '2025-02-12'

In [24]:
# Count matches and mismatches between the predicted and gold triples.
true_positives = len(pred_set.intersection(gold_set))
false_positives = len(pred_set.difference(gold_set))
false_negatives = len(gold_set.difference(pred_set))
true_negatives = 0  # Cannot be accurately calculated here.

# Precision / recall / F1, each falling back to 0 when its denominator is zero.
predicted_total = true_positives + false_positives
actual_total = true_positives + false_negatives

if predicted_total > 0:
    precision = true_positives / predicted_total
else:
    precision = 0

if actual_total > 0:
    recall = true_positives / actual_total
else:
    recall = 0

if (precision + recall) > 0:
    f1 = 2 * precision * recall / (precision + recall)
else:
    f1 = 0

# Confusion matrix values in the order [TN, FP, FN, TP]
conf_matrix_values = [true_negatives, false_positives, false_negatives, true_positives]

precision, recall, f1

(0.9811320754716981, 0.9122807017543859, 0.9454545454545454)

LLM Extractor

In [26]:
# Set up the OpenAI client used to walk through the LLM extractor step by step.
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()  # load variables from a .env file, if one is present
api_key = os.environ.get('OPENAI_API_KEY')
client = OpenAI(api_key=api_key)
# Use the configured model when config defines one; otherwise default to 'gpt-4o'.
model_name = getattr(config, 'OPENAI_MODEL', 'gpt-4o')

In [27]:
# Keep only the first element of each entity tuple for the prompt.
diagnoses_list = [d[0] for d in diagnoses]
dates_list = [d[0] for d in dates]

# Construct the prompt.
# Doubled braces ({{ }}) render as literal braces in the f-string; {text}, {diagnoses_list}
# and {dates_list} are interpolated.
prompt = f"""

You are tasked with doing relationship extraction between diagnoses and dates from unstructured medical text.

The diagnoses and date entities have already been extracted from the text and are provided to you.

For each diagnosis, identify the most relevant date based on the context.

Return ONLY a JSON array where each object has the following structure:
{{
    "diagnosis": "name of diagnosis",
    "date": "associated date",
    "confidence": a number between 0 and 1 indicating your confidence in this association
}}

The full clinical note and the lists of extracted diagnoses and dates are provided below.

Clinical Note: {text}

Extracted Diagnoses: {diagnoses_list}
Extracted Dates: {dates_list}

Provide ONLY the JSON array, no other explanation.
"""

prompt

'\n\nYou are tasked with doing relationship extraction between diagnoses and dates from unstructured medical text.\n\nThe diagnoses and date entities have already been extracted from the text and are provided to you.\n\nFor each diagnosis, identify the most relevant date based on the context.\n\nReturn ONLY a JSON array where each object has the following structure:\n{\n    "diagnosis": "name of diagnosis",\n    "date": "associated date",\n    "confidence": a number between 0 and 1 indicating your confidence in this association\n}\n\nThe full clinical note and the lists of extracted diagnoses and dates are provided below.\n\nClinical Note: \nInitialPresentation (15-01-2023)[date]  \npt presents w severe HA x2/52,getting worse.no relief w/regular painkillers\nimp: chronic_migraine[diagnosis]vs tension_headache[diagnosis]??? \nplan- start sumatriptan,f/u 2/52\n\nURGENT REVIEW(20.01.23)[date]:pt called- new onset visual symps.  \nrapid worsening,Now with diplopia.urgent referral to ophth\

In [28]:
# Send the prompt to the chat completions endpoint.
# temperature=0 is requested to reduce run-to-run variation; max_tokens caps the reply length.
response = client.chat.completions.create(
    model=model_name,
    messages=[
        {"role": "system", "content": "You are a medical AI assistant specialized in analyzing unstructured clinical notes."},
        {"role": "user", "content": prompt}
    ],
    temperature=0,
    max_tokens=2000
)

# message.content can be None (for example when the model returns no text), so fall back
# to an empty string instead of failing on None.strip().
response_text = (response.choices[0].message.content or "").strip()

In [29]:
# Parse the model's reply into a list of relationship dictionaries.
# Start from an empty list so that a failed parse can never leave a stale `relationships`
# from an earlier cell on display.
relationships = []

# Slice from the first '[' to the last ']' so that any text surrounding the JSON array
# (explanations, markdown fences) is ignored.
start_idx = response_text.find('[')
end_idx = response_text.rfind(']') + 1

if start_idx >= 0 and end_idx > start_idx:
    json_str = response_text[start_idx:end_idx]
    try:
        parsed = json.loads(json_str)
    except json.JSONDecodeError as e:
        print(f"Could not parse LLM response as JSON: {e}")
        parsed = []

    # Keep only JSON objects; other array items cannot carry diagnosis/date/confidence keys.
    relationships = [rel for rel in parsed if isinstance(rel, dict)]
    for rel in relationships:
        if 'confidence' in rel:
            try:
                rel['confidence'] = float(rel['confidence'])
            except (ValueError, TypeError):
                rel['confidence'] = 0.0  # confidence value could not be converted to a number
        else:
            rel['confidence'] = 1.0  # no confidence reported by the model
else:
    print("No JSON array found in LLM response.")

relationships

[{'diagnosis': 'chronic_migraine', 'date': '2023-01-15', 'confidence': 0.9},
 {'diagnosis': 'tension_headache', 'date': '2023-01-15', 'confidence': 0.8},
 {'diagnosis': 'iih', 'date': '2023-01-20', 'confidence': 0.85},
 {'diagnosis': 'bilat_papilloedema', 'date': '2023-01-25', 'confidence': 0.9},
 {'diagnosis': 'macroadenoma', 'date': '2023-02-02', 'confidence': 0.95},
 {'diagnosis': 'giant_prolactinoma', 'date': '2023-02-03', 'confidence': 0.95},
 {'diagnosis': 'partial_hypopituitarism',
  'date': '2023-02-03',
  'confidence': 0.9},
 {'diagnosis': 'tumor', 'date': '2023-04-12', 'confidence': 0.9},
 {'diagnosis': 'adrenal_insufficiency',
  'date': '2023-05-16',
  'confidence': 0.95}]