In [1]:
from datasets import load_dataset 
import numpy as np
import spacy
from spacy.tokens import Span
from spacy.training import Example
import json
from spacy.scorer import Scorer

In [2]:
def get_random_entry(dataset, seed=None):
    """Return one randomly chosen record from the ``"train"`` split of ``dataset``.

    Args:
        dataset: Object indexable by ``"train"``; that split must expose
            ``num_rows`` and support integer indexing.
        seed: Optional seed for the draw. The same seed always selects the
            same row; ``None`` lets NumPy seed itself from fresh entropy.

    Returns:
        The record stored at the sampled row index of the train split.

    Raises:
        IndexError: If the train split contains no rows.
    """
    train_split = dataset["train"]
    num_rows = train_split.num_rows
    if num_rows <= 0:
        raise IndexError("cannot draw a random entry from an empty train split")

    # A private legacy RandomState produces the same first draw as
    # np.random.seed(seed) followed by np.random.random(), so seeded picks are
    # unchanged, but the process-wide NumPy RNG is no longer reseeded as a
    # side effect of sampling.
    rng = np.random.RandomState(seed)
    random_entry = int(num_rows * rng.random_sample())
    return train_split[random_entry]

### Identify NER Types

In [3]:
# Load the "argilla/medical-domain" dataset; later cells read its "train" split.
medical_dataset = load_dataset(path="argilla/medical-domain")

In [4]:
# Manual data inspection for NER types by printing random health records.
# The record is drawn once so that the printed medical field and the printed
# text belong to the same entry (two separate unseeded draws would mix the
# label of one record with the text of another).
inspected_entry = get_random_entry(medical_dataset)
print("Medical Field: " + str(inspected_entry["prediction"][0]["label"]))
print(inspected_entry["text"])

Medical Field:  Surgery
Assessment for peripheral vestibular function follows:,OTOSCOPY:, showed bilateral intact tympanic membranes with central  Weber test and bilateral positive Rinne.,ROMBERG TEST:, maintained postural stability.,FRENZEL GLASSES EXAMINATION:, no spontaneous, end gaze nystagmus.,HEAD SHAKING:, No provocation nystagmus.,DIX-HALLPIKE:, showed no positional nystagmus excluding benign paroxysmal positional vertigo.,VESTIBULOCULAR REFLEX [HALMAGYI TEST]:, showed corrective saccades giving the impression of decompensated vestibular hypofunction.,IMPRESSION: , The patient was advised to continue her vestibular rehabilitation exercises and the additional medical treatment of betahistine at 24 mg dose bid.  ,PLAN:  ,Planned for  electronystagmography to document the degree of vestibular hypofunction.,


#### Identified NER Types by manual data inspection
General
- Person
- Organization
- Time
- Date
- Event

Specific
- Healthcare profession 
- Administrative event (e.g. hospitalization)
- Care environment
- Demographic
- Anatomy
- Diseases
- Symptom
- Examination
- Measurement unit
- Measurement value
- Treatment
- Medication
- Medication Dosage
- Medication form (e.g. tablet)
- Medication route





### Apply standard NER classifier of spaCy

In [5]:
# Reference: the NER labels available in spaCy are listed in its glossary:
# https://github.com/explosion/spaCy/blob/master/spacy/glossary.py

In [6]:
# Load the spaCy pipeline that later cells use for entity recognition.
nlp = spacy.load("en_core_web_sm")

# Draw two records with fixed seeds so the same documents are selected on every run.
random_entry_1 = get_random_entry(medical_dataset, seed=1234)  # id=32519f69-7893-4b1a-868e-2cdca510a2d6
random_entry_2 = get_random_entry(medical_dataset, seed=1294)  # id=f759108e-4428-403b-a1cc-c364bb3fc5b9

# The annotations for document 1 were made on a text that began with a '
# character, which shifted every annotation by one; prepend it here so the
# text lines up with those annotations.
random_entry_1["text"] = "'" + random_entry_1["text"]

print(
    "First 1000 Char of Doc 1 with id: " + random_entry_1["id"] + ": ",
    random_entry_1["text"][:1000],
    sep="\n",
)

First 1000 Char of Doc 1 with id: 32519f69-7893-4b1a-868e-2cdca510a2d6: 
'REASON FOR REFERRAL:,  The patient is a 58-year-old African-American right-handed female with 16 years of education who was referred for a neuropsychological evaluation by Dr. X. She is presenting for a second opinion following a recent neuropsychological evaluation that was ordered by her former place of employment that suggested that she was in the "early stages of a likely dementia" and was thereafter terminated from her position as a psychiatric nurse.  A comprehensive evaluation was requested to assess current cognitive functioning and assist with diagnostic decisions and treatment planning.  Note that this evaluation was undertaken as a clinical exam and intended for the purposes of aiding with treatment planning.  The patient was fully informed about the nature of this evaluation and intended use of the results.,RELEVANT BACKGROUND INFORMATION:  ,Historical information was obtained from a review of availab

In [7]:
#Retrieve named entities
def retrieve_NE(text):
    """Run the module-level ``nlp`` pipeline on ``text`` and print the entities found.

    The entities are collected into a dict keyed by entity text, so a text
    that occurs several times is kept once, with the label of its last
    occurrence. All pairs are printed on a single line as ``"text"->LABEL, ``.

    Args:
        text: The raw string to process.

    Returns:
        A tuple ``(doc, ents)``: the processed document and the
        entity-text -> label dict.
    """
    doc = nlp(text)
    ents = {entity.text: entity.label_ for entity in doc.ents}

    # One write for the whole line; print() supplies the trailing newline.
    print("".join(f'"{surface}"->{label}, ' for surface, label in ents.items()))
    return doc, ents

# Run the entity extraction on both sampled documents, announcing each one first.
ner_runs = (
    ("Found NE for Doc 1:", random_entry_1),
    ("Found NE for Doc 2:", random_entry_2),
)
ner_results = []
for header, entry in ner_runs:
    print(header)
    ner_results.append(retrieve_NE(entry["text"]))

# Bind the results to the names used by the later cells.
(doc1, ents_1), (doc2, ents_2) = ner_results

Found NE for Doc 1:
"58-year-old"->DATE, "African-American"->NORP, "16 years"->DATE, "X."->PERSON, "second"->ORDINAL, "Hospital Center"->FAC, "four years"->DATE, "September 2009"->DATE, "hours"->TIME, "three"->CARDINAL, "90 days"->DATE, "State Services"->ORG, "these 90 days"->DATE, "around the end of November"->DATE, "one"->CARDINAL, "days"->DATE, "September 2008"->DATE, "two sick days"->DATE, "July of this year"->DATE, "Y"->PERSON, "Ph.D."->WORK_OF_ART, "08/14/2009"->DATE, "Henry Fein"->ORG, "M.D."->GPE, "09/23/2009"->LAW, "20/30"->CARDINAL, "09/14/2009"->DATE, "daily"->DATE, "1991"->DATE, "approximately five"->CARDINAL, "the past year"->DATE, "C7"->CARDINAL, "C6-C7"->PRODUCT, "GERD"->ORG, "1976"->DATE, "1974"->DATE, "2007"->DATE, "seven to eight hours"->TIME, "NovoLog"->ORG, "Topamax, Lortab"->WORK_OF_ART, "Naprosyn"->ORG, "two to four"->CARDINAL, "North Carolina"->GPE, "sixth"->ORDINAL, "nine"->CARDINAL, "third"->ORDINAL, "60"->CARDINAL, "93 years old"->DATE, "fourth"->ORDINAL, "Sta

In [8]:
#Get ground truth from annotations which were manually labelled with https://tecoholic.github.io/ner-annotator/
def get_ground_truth(file_name):
    """Load an annotation JSON file and return the annotation payload of its first entry.

    Args:
        file_name: Path to the JSON file to read.

    Returns:
        ``annotations['annotations'][0][1]`` of the parsed file, i.e. the second
        element of the first item under the top-level ``"annotations"`` key.
        NOTE(review): presumably the ``{"entities": [...]}`` dict of the first
        annotated document in an NER Annotator export; confirm against the files.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError, IndexError: If the expected structure is missing.
    """
    # The context manager guarantees the handle is closed even when parsing
    # fails. JSON is read as UTF-8 instead of the platform's locale encoding.
    with open(file_name, "r", encoding="utf-8") as file:
        annotations = json.load(file)
    return annotations['annotations'][0][1]

# Load the manually created annotations for both sampled documents.
ground_truth_1, ground_truth_2 = (
    get_ground_truth(annotation_file)
    for annotation_file in ("annotations_1.json", "annotations_2.json")
)

In [9]:
#Calculate scores
def print_scores(docs_with_ground_truth):
    """Score the given examples with a spaCy ``Scorer`` and print the entity metrics.

    Prints the overall precision, recall and F1 (``"ents_p"``, ``"ents_r"``,
    ``"ents_f"``), followed by one line per entity type taken from
    ``"ents_per_type"``.

    Args:
        docs_with_ground_truth: The examples handed to ``Scorer.score``.
    """
    results = Scorer().score(docs_with_ground_truth)

    # Read every metric up front so a missing key fails before anything is printed.
    precision, recall, f1, per_type = (
        results[key] for key in ("ents_p", "ents_r", "ents_f", "ents_per_type")
    )

    #print scores
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-Score: {f1}")
    print("Scores per Entity:")
    for entity, entity_scores in per_type.items():
        metrics = "".join(f"{name}: {value} " for name, value in entity_scores.items())
        print(f"{entity} -> " + metrics)

# Only document 2 is scored here. The two-document variant is kept for reference:
#docs_with_ground_truth = [Example.from_dict(doc1, ground_truth_1), Example.from_dict(doc2, ground_truth_2)]
docs_with_ground_truth = [
    Example.from_dict(predicted_doc, gold_annotations)
    for predicted_doc, gold_annotations in [(doc2, ground_truth_2)]
]

print("Scores:")
print_scores(docs_with_ground_truth)


Scores:




Precision: 0.0975609756097561
Recall: 0.22857142857142856
F1-Score: 0.13675213675213677
Scores per Entity:
DATE -> p: 0.3333333333333333 r: 0.42857142857142855 f: 0.375 
NORP -> p: 0.0 r: 0.0 f: 0.0 
PERSON -> p: 0.0 r: 0.0 f: 0.0 
ORDINAL -> p: 1.0 r: 1.0 f: 1.0 
FAC -> p: 0.0 r: 0.0 f: 0.0 
TIME -> p: 0.0 r: 0.0 f: 0.0 
CARDINAL -> p: 0.2 r: 1.0 f: 0.33333333333333337 
ORG -> p: 0.0 r: 0.0 f: 0.0 
WORK_OF_ART -> p: 0.0 r: 0.0 f: 0.0 
GPE -> p: 0.0 r: 0.0 f: 0.0 
LAW -> p: 0.0 r: 0.0 f: 0.0 
PRODUCT -> p: 0.0 r: 0.0 f: 0.0 
MONEY -> p: 0.0 r: 0.0 f: 0.0 
QUANTITY -> p: 0.0 r: 0.0 f: 0.0 


In [10]:
#playground vinz