# UniversalNER

UniversalNER is a prompt-based NER model, where entitites are given the model which are then extracted from the text. It can recognize diverse types of entities or concepts in text corpora from a wide range of domains. Here I'm experimenting with using it to recognise words in text that might be relevant or interesting

[More information here](https://universal-ner.github.io/)

UniversalNER is a large model - equating to roughly 28GB - quantising this model reduces this down to just under 4GB which allows us to pull in a model locally, and run queries using langchain. 

Guidance on quantising it [here](https://medium.com/vendi-ai/efficiently-run-your-fine-tuned-llm-locally-using-llama-cpp-66e2a7c51300)

In [17]:
notebook_name = 'explore_UniversalNER'

In [21]:
import os
import pandas as pd
import sys

from pathlib import Path

# Import the variables that have been set in the init.py folder in the root directory
# These include a constant called PROJECT_ROOT which stores the absolute path to this folder
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))
import init
PROJECT_ROOT = os.getenv("PROJECT_ROOT")

In [2]:
import json
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

In [137]:
from langchain.llms import LlamaCpp
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")




## Load data from other experiments

In [117]:
# structured data
with open('../../privfp-experiments/data/llm_dataset.json') as f:
    data = json.load(f)

# unstuctured data
with open('../src/data_exports/explore_llama_20240225_1202.json') as f:
    notes_data = json.load(f)


FIHR_DIR = '../../privfp-poc-zk/experiments/02_generate_dataset'



with open(FIHR_DIR + '/' + 'long_visit_dict.json') as f:
   long_visit_dict = json.load(f)

print(len(long_visit_dict.keys()))
   
ex = list(long_visit_dict.keys())[0]

import re
import datetime
import os
for key, value in long_visit_dict.items():

   encounter_id = long_visit_dict[key]['Encounter']['Encounter id']
   encounter_started = long_visit_dict[key]['Encounter']["Encounter Started"]
   uuid  = key[9:]

   patient_file = [file for file in os.listdir( FIHR_DIR + '/synthea/fhir') if re.search(uuid, file, re.IGNORECASE)]
   with open(FIHR_DIR + '/synthea/fhir/' + patient_file[0]) as f:
      patient_fhir = json.load(f)

   patient_dict = {}
   patient_uuid = long_visit_dict[key]['Patient']
   patient_dict['name'] = ' '.join(patient_fhir['entry'][0]['resource']['name'][0]['given']) + ' ' + patient_fhir['entry'][0]['resource']['name'][0]['family']
   patient_dict['birthDate'] = patient_fhir['entry'][0]['resource']['birthDate']
   patient_dict['gender'] = patient_fhir['entry'][0]['resource']['gender']
   patient_dict['age'] = int((datetime.datetime.fromisoformat(encounter_started).date() - datetime.datetime.strptime(patient_dict['birthDate'], '%Y-%m-%d').date()).days / 365.25)
   long_visit_dict[key]['Patient'] = patient_dict

### Set up environment for UniversalNER

In [None]:
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

n_gpu_layers = 1
n_batch = 512
llm = LlamaCpp(
    model_path="../../llama.cpp/models/quantized_q4_1.gguf",
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    f16_kv=True,  # MUST set to True, otherwise you will run into problem after a couple of calls
    callback_manager=callback_manager,
    verbose=True
)

### Review what we know about this patient

In [119]:
# structured text - generated from Synthea
long_visit_dict['urn:uuid:811ef822-fab7-3f91-35dd-106227ca25c1']

{'Patient': {'name': 'Margeret29 Hauck852',
  'birthDate': '1988-08-25',
  'gender': 'female',
  'age': 33},
 'Encounter': {'Encounter id': 'urn:uuid:ff4668b4-92d0-53a6-4fa2-534e7bfc1edc',
  'Encounter Started': '2022-05-20 19:37:52',
  'Encounter Ended': '2022-06-25 19:52:52',
  'Encounter Duration': 3111300,
  'Hospital Staff': 'Dr. Delcie812 Casper496',
  'Type of admission': 'Admission to skilled nursing facility (procedure)'},
 'Condition': {},
 'Procedure': {'1': {'Text': 'History AND physical examination (procedure)',
   'Started': '2022-05-20T19:37:52+01:00',
   'Ended': '2022-05-20T19:52:52+01:00'},
  '2': {'Text': 'Initial patient assessment (procedure)',
   'Started': '2022-05-20T19:37:52+01:00',
   'Ended': '2022-05-20T19:52:52+01:00'},
  '3': {'Text': 'Development of individualized plan of care (procedure)',
   'Started': '2022-05-20T19:37:52+01:00',
   'Ended': '2022-05-20T19:52:52+01:00'},
  '4': {'Text': 'Nursing care/supplementary surveillance (regime/therapy)',
   'St

In [39]:
# unstructured text - an admission note generated by Llama
notes_data[0]['urn:uuid:811ef822-fab7-3f91-35dd-106227ca25c1']

{'history_of_present_illness': 'The patient is a 45-year-old male who presents with a 2-day history of worsening headache, nausea, and vomiting. The patient states that the symptoms started suddenly and have been getting worse over time. He also reports feeling dizzy and disoriented, with difficulty walking and maintaining balance. The patient has no significant medical history or recent travel. He is currently working as a manual laborer and denies any trauma or recent injuries.',
 'physical_examination': "On examination, the patient is pale and sweaty. Vital signs show a temperature of 38.5 degrees Celsius, pulse rate of 120 beats per minute, respiratory rate of 24 breaths per minute, and blood pressure of 100/60 mmHg. The patient's headache is severe and localized to the frontal region, with no other focal neurological signs. There are no signs of dehydration or electrolyte imbalance. The patient's movements are slow and deliberate, with a slight tremor in both hands. The patient's 

### Truncate the text into a length suitable for UniversalNER

The suggested max length for UniversalNER (based on error messages) is 512. It is not obvious what pads out the text. We want to input instructions, plus patient information, and ideally some instruction words wrap around the patient information so we can remind the LLM at the end of the prompt of the output. 

Therefore I'm truncating the note information so that it, plus the instructions, fit into 512 tokens. This may not be necessary 

In [None]:
# A function to shorten the note
def shorten_note(patient_note, instructions, max_length):

    # Tokenize the instructions to find out how long they are
    instructions_tokens = tokenizer(instructions, truncation=True, return_tensors="pt")

    # count tokens in patient note 
    patient_note_tokens = tokenizer(patient_note, truncation=True, return_tensors="pt")
    num_patient_note_tokens = len(patient_note_tokens['input_ids'][0])

    # work out how many tokens the patient note can occupy
    truncate_note_tokens_at = max_length - len(instructions_tokens['input_ids'][0]) 

    # find which word to truncate the non-tokenized input at, by (clumsily) taking the proportion of the two token lengths
    truncate_note_words_at = round(len(patient_note.split()) * truncate_note_tokens_at/ num_patient_note_tokens)

    # get truncated note
    result_list = patient_note.split()[:truncate_note_words_at]
    truncated_note = " ".join(result_list) + '...} ' # add this to show that the note continues

    return(truncated_note)

In [120]:
# The full prompt looks like this. But as noted above, {input_text} needs to be truncated 

prompt_template = """A virtual assistant answers questions from a user based on the provided text.
USER: Text: {input_text}
ASSISTANT: I’ve read this text.
USER: What describes {entity_name} in the text?
ASSISTANT: (model's predictions in JSON format)
"""


instructions = ''' } ASSISTANT: I’ve read this text.
USER: What describes {entity_name} in the text?
ASSISTANT: (model's predictions in JSON format)'''



# Load the unstructured text 
patient_note = json.dumps(notes_data[0]['urn:uuid:811ef822-fab7-3f91-35dd-106227ca25c1']).replace("_", " ")
prompt = prompt_template.format_map({"input_text": shorten_note(patient_note, max_length), "entity_name": "symptoms"})
print(prompt)
output = llm(prompt)
output


In [136]:
# Load the structured text 
patient_note = json.dumps(long_visit_dict['urn:uuid:811ef822-fab7-3f91-35dd-106227ca25c1'])

# count tokens in patient note 
patient_note_tokens = tokenizer(patient_note, truncation=True, return_tensors="pt")
num_patient_note_tokens = len(patient_note_tokens['input_ids'][0])
# prompt = prompt_template.format_map({"input_text": shorten_note(patient_note, max_length), "entity_name": "symptoms"})
# print(prompt)
# output = llm(prompt)
# output
patient_note_tokens
truncate_note_tokens_at = max_length - len(instructions_tokens['input_ids'][0]) 
truncate_note_words_at = round(len(patient_note.split()) * truncate_note_tokens_at/ num_patient_note_tokens)
truncate_note_words_at

522

In [130]:
patient_note

'{"Patient": {"name": "Margeret29 Hauck852", "birthDate": "1988-08-25", "gender": "female", "age": 33}, "Encounter": {"Encounter id": "urn:uuid:ff4668b4-92d0-53a6-4fa2-534e7bfc1edc", "Encounter Started": "2022-05-20 19:37:52", "Encounter Ended": "2022-06-25 19:52:52", "Encounter Duration": 3111300, "Hospital Staff": "Dr. Delcie812 Casper496", "Type of admission": "Admission to skilled nursing facility (procedure)"}, "Condition": {}, "Procedure": {"1": {"Text": "History AND physical examination (procedure)", "Started": "2022-05-20T19:37:52+01:00", "Ended": "2022-05-20T19:52:52+01:00"}, "2": {"Text": "Initial patient assessment (procedure)", "Started": "2022-05-20T19:37:52+01:00", "Ended": "2022-05-20T19:52:52+01:00"}, "3": {"Text": "Development of individualized plan of care (procedure)", "Started": "2022-05-20T19:37:52+01:00", "Ended": "2022-05-20T19:52:52+01:00"}, "4": {"Text": "Nursing care/supplementary surveillance (regime/therapy)", "Started": "2022-05-20T19:37:52+01:00", "Ended":