Imports

In [6]:
#imports
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, set_seed
from tqdm import tqdm

# Import our modules
import sys
import os

# Resolve the sibling ``utils`` and ``models`` directories (one level above the
# current working directory) so the project modules imported below can be found.
utils_path, models_path = (
    os.path.abspath(os.path.join(os.getcwd(), '..', folder))
    for folder in ('utils', 'models')
)

# Prepend each directory once; re-running the cell does not add duplicates.
# models_path is inserted last, so it ends up ahead of utils_path on sys.path.
for module_dir in (utils_path, models_path):
    if module_dir not in sys.path:
        sys.path.insert(0, module_dir)

from general_utils import load_data, prepare_all_samples, get_entity_date_pairs
from bert_training_utils import add_special_tokens
from bert_extractor_utils import preprocess_input, bert_extraction, mark_entities_full_text
from bert_model import BertRC

Data Loading

In [2]:
# Read the inference dataset and report how many records were loaded
df = load_data("../data/inference_dataset.csv")
n_records = len(df)
print(f"Loaded {n_records} records")

Loaded 101 records


In [3]:
# Preview the first few rows of df to sanity-check the loaded columns
df.head()

Unnamed: 0,doc_id,note_text,entities_json,dates_json,relative_dates_json
0,0,Ultrasound (30nd Jun 2024): no significant fin...,"[{'id': 'ent_1', 'value': 'Ultrasound', 'cui':...","[{'id': 'abs_1', 'value': '30nd Jun 2024', 'st...",[]
1,1,Labs (27th Sep 2024): anemia. resolving Skin:...,"[{'id': 'ent_1', 'value': 'anemia', 'cui': 'C0...","[{'id': 'abs_1', 'value': '27th Sep 2024', 'st...",[]
2,2,URGENT REVIEW (2024-10-04): cough. suspect ost...,"[{'id': 'ent_1', 'value': 'REVIEW', 'cui': 'C1...","[{'id': 'abs_1', 'value': '2024-10-04', 'start...",[]
3,3,URGENT REVIEW (13rd Feb 2025) MRI of the brain...,"[{'id': 'ent_1', 'value': 'REVIEW', 'cui': 'C0...","[{'id': 'abs_1', 'value': '13rd Feb 2025', 'st...",[]
4,4,New pt((18/11/24)): pt presents with nausea/vo...,"[{'id': 'ent_1', 'value': 'nausea', 'cui': 'C0...","[{'id': 'abs_1', 'value': '18/11/24', 'start':...",[]


In [4]:
# Convert the loaded records into the samples consumed by the prediction loop
samples = prepare_all_samples(df)
n_samples = len(samples)
print(f"Prepared {n_samples} samples")
# Uncomment to inspect the first prepared sample:
# samples[0]

Prepared 101 samples


BERT Inference

In [7]:
# Set seed for reproducibility, using the set_seed helper imported from transformers.
# Run this before loading the model so repeated runs start from the same RNG state.
set_seed(42)

In [9]:
# Directory the trained model and its tokenizer are loaded from.
# The path is relative, so it is resolved against the notebook's working directory.
model_path = '../models/bert_models/'

In [None]:
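# Reference only: name of a base BERT checkpoint on HuggingFace (any model listed at
# https://huggingface.co/google-bert can be used). This notebook loads the trained model
# from model_path, so model_name is left commented out.
#model_name = "google/bert_uncased_L-2_H-128_A-2"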

In [12]:
# Load the trained model & tokenizer saved under model_path.
#
# ignore_mismatched_sizes is deliberately NOT set: with ignore_mismatched_sizes=True,
# weights whose shape differs from the checkpoint (e.g. a classification head saved with
# a different number of labels) are silently re-initialised, so inference would run on
# untrained weights. Without it, a shape mismatch raises an error instead.
#
# NOTE(review): the last recorded run of this cell failed with
# "ValueError: The state dictionary of the model you are trying to load is corrupted".
# Presumably the checkpoint in model_path was not written by save_pretrained() of a
# compatible sequence-classification model (BertRC is imported at the top of the
# notebook but never used) - verify how the model was saved before relying on this cell.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2)

ValueError: The state dictionary of the model you are trying to load is corrupted. Are you sure it was properly saved?

In [None]:
# Classify every candidate (entity, date) pair of every sample and keep the positive ones
predictions = []

for sample in tqdm(samples, desc="Samples"):
    entities = sample['entities_list']

    # Candidate pairs built from the absolute dates are always included
    pairs = get_entity_date_pairs(entities, sample['dates'])

    # Relative dates are optional: extend the candidates only when the sample has some
    relative_dates = sample.get('relative_dates')
    if relative_dates and len(relative_dates) > 0:
        pairs = pairs + get_entity_date_pairs(entities, [], relative_dates)

    for pair in pairs:
        entity, date = pair['entity'], pair['date_info']
        pred, conf = bert_extraction(sample['note_text'], entity, date, model, tokenizer)
        # Only pairs predicted as class 1 are recorded, together with the returned confidence
        if pred == 1:
            predictions.append({
                'entity_label': entity['value'],
                'date': date['value'],
                'confidence': conf,
            })


Testing base model (no finetuning):


Base model testing: 100%|██████████| 5/5 [00:05<00:00,  1.00s/it]

Base model predictions: 756





In [None]:
# Look at the predictions: print the total, then show only the first ten.
# Displaying the whole list floods the notebook once it holds hundreds of entries.
print(f"{len(predictions)} positive predictions")
predictions[:10]

[{'entity_label': 'history of meningitis',
  'date': '30nd Jun 2024',
  'confidence': 0.5474071502685547},
 {'entity_label': 'history of meningitis',
  'date': '12nd Sep 2024',
  'confidence': 0.5467742085456848},
 {'entity_label': 'history of meningitis',
  'date': "16 Sep'24",
  'confidence': 0.5474152565002441},
 {'entity_label': 'history of meningitis',
  'date': '23rd Oct 2024',
  'confidence': 0.5460683703422546},
 {'entity_label': 'history of meningitis',
  'date': '16st Nov 2024',
  'confidence': 0.5437679886817932},
 {'entity_label': 'history of meningitis',
  'date': '17.12.24',
  'confidence': 0.5437679886817932},
 {'entity_label': 'rheumatoid_arthritis',
  'date': '30nd Jun 2024',
  'confidence': 0.5458624362945557},
 {'entity_label': 'rheumatoid_arthritis',
  'date': '12nd Sep 2024',
  'confidence': 0.5452666282653809},
 {'entity_label': 'rheumatoid_arthritis',
  'date': "16 Sep'24",
  'confidence': 0.546122670173645},
 {'entity_label': 'rheumatoid_arthritis',
  'date': '2