In [1]:
from datasets import load_dataset 
import numpy as np
import spacy
from spacy.tokens import Span
from spacy.training import Example
import json
from spacy.scorer import Scorer

In [2]:
def get_random_entry(dataset, seed=None):
    """Return one pseudo-randomly selected row of the ``"train"`` split.

    Args:
        dataset: Mapping with a ``"train"`` entry that exposes ``num_rows``
            and supports integer indexing.
        seed: Forwarded to ``np.random.seed``. The same seed always selects
            the same row for a split of the same size.

    Returns:
        The row stored at the drawn index of the train split.

    Note:
        The call re-seeds NumPy's global random generator as a side effect.
    """
    train_split = dataset["train"]
    np.random.seed(seed=seed)
    # np.random.random() lies in [0, 1), so the truncated product is always a
    # valid index in [0, num_rows - 1].
    row_index = int(np.random.random() * train_split.num_rows)
    return train_split[row_index]

### Identify NER Types

In [3]:
# Load the "argilla/medical-domain" dataset through the Hugging Face `datasets`
# library; later cells read records from its "train" split.
medical_dataset = load_dataset("argilla/medical-domain")

Downloading readme:   0%|          | 0.00/1.71k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/4966 [00:00<?, ? examples/s]

In [4]:
# Manual data inspection for NER types: print one random health record.
# The entry is drawn exactly once so that the printed medical field and the
# printed text belong to the same record. Two separate unseeded calls to
# get_random_entry are independent draws and usually return different rows.
inspected_entry = get_random_entry(medical_dataset)
print("Medical Field: " + str(inspected_entry["prediction"][0]["label"]))
print(inspected_entry["text"])

Medical Field:  Neurology
CT HEAD WITHOUT CONTRAST, CT FACIAL BONES WITHOUT CONTRAST, AND CT CERVICAL SPINE WITHOUT CONTRAST,REASON FOR EXAM: , Motor vehicle collision.,CT HEAD,TECHNIQUE: , Noncontrast axial CT images of the head were obtained without contrast.,FINDINGS: , There is no acute intracranial hemorrhage, mass effect, midline shift, or extra-axial fluid collection.  The ventricles and cortical sulci are normal in shape and configuration.  The gray/white matter junctions are well preserved.  No calvarial fracture is seen.,IMPRESSION:  ,Negative for acute intracranial disease.,CT FACIAL BONES WITHOUT CONTRAST,TECHNIQUE:  ,Noncontrast axial CT images of the facial bones were obtained with coronal reconstructions.,FINDINGS:,  There is no facial bone fracture.  The maxilla and mandible are intact.  The visualized paranasal sinuses are clear.  The temporomandibular joints are intact.  The nasal bone is intact.  The orbits are intact.  The extra-ocular muscles and orbital nerves are

#### Identified NER Types by manual data inspection
General
- Person
- Organization
- Time
- Date
- Event

Specific
- Healthcare profession 
- Administrative event (e.g. hospitalization)
- Care environment
- Demographic
- Anatomy
- Diseases
- Symptom
- Examination
- Measurement unit
- Measurement value
- Treatment
- Medication
- Medication Dosage
- Medication form (e.g. tablet)
- Medication route





### Apply standard NER classifier of spaCy

In [31]:
# Reference: the NER labels available in spaCy are listed in the glossary:
# https://github.com/explosion/spaCy/blob/master/spacy/glossary.py

In [32]:
# Load spaCy's small English pipeline, used here as the standard NER classifier.
nlp = spacy.load("en_core_web_sm")

# A fixed seed always selects the same record
# (id=32519f69-7893-4b1a-868e-2cdca510a2d6).
random_entry = get_random_entry(medical_dataset, seed=1234)

# The manual annotations were made on a text that started with a ' character,
# which shifted every annotated offset by one. Prepend the same character so
# the offsets line up with the text that is analysed below.
random_entry["text"] = "'" + random_entry["text"]

entry_id = random_entry["id"]
print("First 1000 Char of Entry with id: " + entry_id + ": ")
print(random_entry["text"][:1000])

First 1000 Char of Entry with id: 32519f69-7893-4b1a-868e-2cdca510a2d6: 
'REASON FOR REFERRAL:,  The patient is a 58-year-old African-American right-handed female with 16 years of education who was referred for a neuropsychological evaluation by Dr. X. She is presenting for a second opinion following a recent neuropsychological evaluation that was ordered by her former place of employment that suggested that she was in the "early stages of a likely dementia" and was thereafter terminated from her position as a psychiatric nurse.  A comprehensive evaluation was requested to assess current cognitive functioning and assist with diagnostic decisions and treatment planning.  Note that this evaluation was undertaken as a clinical exam and intended for the purposes of aiding with treatment planning.  The patient was fully informed about the nature of this evaluation and intended use of the results.,RELEVANT BACKGROUND INFORMATION:  ,Historical information was obtained from a review of availab

In [33]:
# Retrieve the named entities: run the spaCy pipeline on the prefixed text.
doc = nlp(random_entry["text"])

# Map entity text -> label. Because the mapping is keyed by the entity text,
# a string that is recognised several times appears only once, carrying the
# label of its last occurrence.
ents = {entity.text: entity.label_ for entity in doc.ents}

print("Found NE:")
for surface_text, label in ents.items():
    print(f"\"{surface_text}\"->{label}, ")

Found NE:
"58-year-old"->DATE, 
"African-American"->NORP, 
"16 years"->DATE, 
"X."->PERSON, 
"second"->ORDINAL, 
"Hospital Center"->FAC, 
"four years"->DATE, 
"September 2009"->DATE, 
"hours"->TIME, 
"three"->CARDINAL, 
"90 days"->DATE, 
"State Services"->ORG, 
"these 90 days"->DATE, 
"around the end of November"->DATE, 
"one"->CARDINAL, 
"days"->DATE, 
"September 2008"->DATE, 
"two sick days"->DATE, 
"July of this year"->DATE, 
"Y"->PERSON, 
"Ph.D."->WORK_OF_ART, 
"08/14/2009"->DATE, 
"Henry Fein"->ORG, 
"M.D."->GPE, 
"09/23/2009"->LAW, 
"20/30"->CARDINAL, 
"09/14/2009"->DATE, 
"daily"->DATE, 
"1991"->DATE, 
"approximately five"->CARDINAL, 
"the past year"->DATE, 
"C7"->CARDINAL, 
"C6-C7"->PRODUCT, 
"GERD"->ORG, 
"1976"->DATE, 
"1974"->DATE, 
"2007"->DATE, 
"seven to eight hours"->TIME, 
"NovoLog"->ORG, 
"Topamax, Lortab"->WORK_OF_ART, 
"Naprosyn"->ORG, 
"two to four"->CARDINAL, 
"North Carolina"->GPE, 
"sixth"->ORDINAL, 
"nine"->CARDINAL, 
"third"->ORDINAL, 
"60"->CARDINAL, 
"93 year

In [34]:
# Get the ground truth from annotations that were labelled manually with
# https://tecoholic.github.io/ner-annotator/
# The context manager closes the file handle as soon as the JSON is parsed
# (the previous bare open() never closed it). JSON is read explicitly as UTF-8
# so the result does not depend on the platform's default encoding.
with open("annotations.json", "r", encoding="utf-8") as file:
    annotations = json.load(file)

# First annotated document; element [1] is the annotation dict whose
# "entities" list is scored against spaCy's predictions
# (element [0] is presumably the annotated text -- TODO confirm).
ground_truth = annotations['annotations'][0][1]

In [35]:
# Calculate scores: pair the spaCy-processed `doc` (predictions) with the
# manually annotated ground truth and let spaCy's Scorer evaluate the pair.
doc_with_ground_truth = Example.from_dict(doc, ground_truth)
scorer = Scorer()
scores = scorer.score([doc_with_ground_truth])

# Pull the entity-level metrics out of the score dictionary.
entities_precision = scores["ents_p"]
entities_recall = scores["ents_r"]
entities_f1 = scores["ents_f"]
entities_per_type = scores["ents_per_type"]

# Print the overall scores.
overall_scores = (
    ("Precision", entities_precision),
    ("Recall", entities_recall),
    ("F1-Score", entities_f1),
)
for metric_name, metric_value in overall_scores:
    print(f"{metric_name}: {metric_value}")

# Print one line per entity label with all of its metrics.
print("Scores per Entity:")
for entity, entity_scores in entities_per_type.items():
    formatted_scores = "".join(
        f"{score_name}: {score_value} "
        for score_name, score_value in entity_scores.items()
    )
    print(f"{entity} -> {formatted_scores}")

Precision: 0.046511627906976744
Recall: 0.6666666666666666
F1-Score: 0.08695652173913045
Scores per Entity:
DATE -> p: 0.1111111111111111 r: 1.0 f: 0.19999999999999998 
NORP -> p: 0.0 r: 0.0 f: 0.0 
PERSON -> p: 0.14285714285714285 r: 1.0 f: 0.25 
ORDINAL -> p: 0.0 r: 0.0 f: 0.0 
FAC -> p: 0.0 r: 0.0 f: 0.0 
TIME -> p: 0.0 r: 0.0 f: 0.0 
CARDINAL -> p: 0.0 r: 0.0 f: 0.0 
ORG -> p: 0.0 r: 0.0 f: 0.0 
WORK_OF_ART -> p: 0.0 r: 0.0 f: 0.0 
GPE -> p: 0.0 r: 0.0 f: 0.0 
LAW -> p: 0.0 r: 0.0 f: 0.0 
PRODUCT -> p: 0.0 r: 0.0 f: 0.0 


In [None]:
#playground vinz

In [6]:
# Length in characters of the text of the entry selected with seed=1234.
len(get_random_entry(medical_dataset, seed=1234)["text"])

12779

In [15]:
# Show every entity span spaCy found in the document, in order of appearance
# (repeated entity texts are listed each time they occur).
doc.ents

(58-year-old,
 African-American,
 16 years,
 X.,
 second,
 Hospital Center,
 four years,
 September 2009,
 hours,
 three,
 90 days,
 State Services,
 these 90 days,
 around the end of November,
 one,
 days,
 September 2008,
 two sick days,
 July of this year,
 Y,
 Ph.D.,
 08/14/2009,
 Henry Fein,
 M.D.,
 second,
 09/23/2009,
 20/30,
 one,
 three,
 09/14/2009,
 daily,
 1991,
 approximately five,
 the past year,
 C7,
 C6-C7,
 1991,
 GERD,
 1976,
 1974,
 2007,
 seven to eight hours,
 NovoLog,
 Topamax, Lortab,
 Naprosyn,
 two to four,
 North Carolina,
 sixth,
 nine,
 third,
 60,
 93 years old,
 fourth,
 State University,
 1979,
 two years,
 1980,
 two children ages 43 and 30,
 30,
 New York,
 Hospital Center,
 four years,
 2-1/2 years,
 Walter P. Carter Center,
 21 years,
 two,
 between 1991 and 1997,
 Prozac,
 Prozac,
 Wechsler Test,
 Mental Status Exam,
 Repeatable Battery for the Assessment of Neuropsychological Status,
 Form XX),Mattis Dementia Rating Scale,
 2nd Edition,
 Assessment 

In [38]:
# Number of entities in the manually annotated ground truth.
len(ground_truth["entities"])

62