In [40]:
from datasets import load_dataset 
import numpy as np
import spacy
import re
from spacy.tokens import Span
from spacy.training import Example
import json
from spacy.scorer import Scorer

In [2]:
def get_random_entry(dataset, seed=None):
    """Return one pseudo-randomly chosen record from the "train" split of `dataset`.

    Parameters
    ----------
    dataset : mapping
        Object whose ``dataset["train"]`` exposes ``num_rows`` and supports
        integer indexing.
    seed : int or None, optional
        Seed for the draw. The same seed always selects the same row index;
        ``None`` lets NumPy choose a fresh, non-reproducible seed.

    Returns
    -------
    The record stored at the drawn index, i.e. ``dataset["train"][index]``.
    """
    train_split = dataset["train"]
    # A private legacy RandomState produces the same first sample that
    # np.random.seed(seed) followed by np.random.random() would, but it does
    # not reseed NumPy's global generator behind the caller's back.
    rng = np.random.RandomState(seed)
    random_entry = int(train_split.num_rows * rng.random_sample())
    return train_split[random_entry]

### Identify NER Types

In [3]:
# Load the "argilla/medical-domain" dataset; later cells read records from its "train" split.
medical_dataset = load_dataset("argilla/medical-domain")

In [4]:
# Manual data inspection for NER types: print one random health record together
# with the medical field stored in its first prediction.
# The record is drawn once so that the printed label and text belong to the
# same entry (two separate draws would pair the label of one record with the
# text of another).
inspected_entry = get_random_entry(medical_dataset)
print("Medical Field: " + str(inspected_entry["prediction"][0]["label"]))
print(inspected_entry["text"])

Medical Field:  Surgery
HISTORY: , Patient is a 21-year-old white woman who presented with a chief complaint of chest pain.  She had been previously diagnosed with hyperthyroidism.  Upon admission, she had complaints of constant left sided chest pain that radiated to her left arm.  She had been experiencing palpitations and tachycardia.  She had no diaphoresis, no nausea, vomiting, or dyspnea.,She had a significant TSH of 0.004 and a free T4 of 19.3.  Normal ranges for TSH and free T4 are 0.5-4.7 µIU/mL and 0.8-1.8 ng/dL, respectively.  Her symptoms started four months into her pregnancy as tremors, hot flashes, agitation, and emotional inconsistency.  She gained 16 pounds during her pregnancy and has lost 80 pounds afterwards.  She complained of sweating, but has experienced no diarrhea and no change in appetite.  She was given isosorbide mononitrate and IV steroids in the ER.,FAMILY HISTORY:,  Diabetes, Hypertension, Father had a Coronary Artery Bypass Graph (CABG) at age 34.,SOCIAL 

#### Identified NER Types by manual data inspection
General
- Person
- Organization
- Time
- Date
- Event

Specific
- Healthcare profession 
- Administrative event (e.g. hospitalization)
- Care environment
- Demographic
- Anatomy
- Diseases
- Symptom
- Examination
- Measurement unit
- Measurement value
- Treatment
- Medication
- Medication Dosage
- Medication form (e.g. tablet)
- Medication route





### Apply standard NER classifier of spaCy

In [5]:
#literature
#available NER Types in spaCy: https://github.com/explosion/spaCy/blob/master/spacy/glossary.py

In [51]:
def clean(text):
    """Lower-case `text`, strip non-word characters and drop English stop words.

    Steps, in order:
    1. lower-case the text and replace every run of non-word characters
       (regex non-word class) with a single space;
    2. rewrite the contraction remnants left by step 1
       ("hadn t", "wasn t", "didn t") as "had not", "was not", "did not";
    3. split on whitespace and discard every word contained in spaCy's
       English ``STOP_WORDS`` set;
    4. join the remaining words with single spaces.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text. Character offsets of the input are not preserved.
    """
    # Import the stop-word set explicitly instead of reaching through
    # `spacy.lang.en...` as an attribute, which only works if that submodule
    # has already been imported elsewhere (e.g. by loading an English pipeline).
    from spacy.lang.en.stop_words import STOP_WORDS

    text = re.sub(r'[\W]+', ' ', text.lower())
    text = text.replace('hadn t' , 'had not')\
               .replace('wasn t', 'was not')\
               .replace('didn t', 'did not')

    # Tokenize on whitespace and keep only the non-stop-words
    words = [word for word in text.split() if word not in STOP_WORDS]

    # Join the words back into a single string
    return ' '.join(words)

def retrieve_NE(text):
    """Run the notebook-global `nlp` pipeline on `text` and report its entities.

    Prints every distinct entity text with its label on a single line and
    returns ``(doc, ents)``, where ``ents`` maps entity text to label. Because
    the mapping is keyed by text, a later occurrence of the same text replaces
    the label of an earlier one.
    """
    doc = nlp(text)

    ents = {}
    for entity in doc.ents:
        ents[entity.text] = entity.label_

    # One output line: all pairs back to back, followed by a newline.
    summary = "".join(f"\"{key}\"->{value}, " for key, value in ents.items())
    print(summary)
    return doc, ents

def get_ground_truth(file_name):
    """Load the annotation dict of the first document from a JSON annotation file.

    The file must contain a top-level "annotations" list whose entries can be
    indexed as ``[text, annotation_dict]``; the second element of the first
    entry is returned.

    Parameters
    ----------
    file_name : str or path-like
        Path to the JSON annotation file.

    Returns
    -------
    The annotation object stored at ``["annotations"][0][1]``.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file is not valid JSON.
    """
    # Context manager guarantees the handle is closed even if parsing fails;
    # JSON is read as UTF-8 regardless of the platform's default encoding.
    with open(file_name, "r", encoding="utf-8") as file:
        annotations = json.load(file)
    return annotations['annotations'][0][1]

def print_scores(docs_with_ground_truth):
    """Score the given examples with spaCy's `Scorer` and print the entity metrics.

    Prints overall entity precision, recall and F1, followed by one line per
    entity type listing each metric name and value of that type.
    """
    scores = Scorer().score(docs_with_ground_truth)

    # Read every metric up front so a missing key fails before anything is printed.
    precision, recall, f1 = (scores[key] for key in ("ents_p", "ents_r", "ents_f"))
    per_type = scores["ents_per_type"]

    # Overall metrics
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-Score: {f1}")

    # Per-type metrics, one line per entity label
    print("Scores per Entity:")
    for entity, entity_scores in per_type.items():
        rendered = "".join(f"{name}: {value} " for name, value in entity_scores.items())
        print(f"{entity} -> {rendered}")

In [7]:
# Load spaCy's "en_core_web_sm" pipeline and draw two reproducible sample records.
nlp = spacy.load("en_core_web_sm")
random_entry_1 = get_random_entry(medical_dataset, seed=1234)  # id=32519f69-7893-4b1a-868e-2cdca510a2d6
random_entry_2 = get_random_entry(medical_dataset, seed=1294)  # id=f759108e-4428-403b-a1cc-c364bb3fc5b9

# Preview the beginning of the first record.
print(f"First 1000 Char of Doc 1 with id: {random_entry_1['id']}: ")
print(random_entry_1["text"][:1000])

First 1000 Char of Doc 1 with id: 32519f69-7893-4b1a-868e-2cdca510a2d6: 
REASON FOR REFERRAL:,  The patient is a 58-year-old African-American right-handed female with 16 years of education who was referred for a neuropsychological evaluation by Dr. X. She is presenting for a second opinion following a recent neuropsychological evaluation that was ordered by her former place of employment that suggested that she was in the "early stages of a likely dementia" and was thereafter terminated from her position as a psychiatric nurse.  A comprehensive evaluation was requested to assess current cognitive functioning and assist with diagnostic decisions and treatment planning.  Note that this evaluation was undertaken as a clinical exam and intended for the purposes of aiding with treatment planning.  The patient was fully informed about the nature of this evaluation and intended use of the results.,RELEVANT BACKGROUND INFORMATION:  ,Historical information was obtained from a review of availabl

In [37]:
# Show the entity labels (NER types) that the "ner" component of the loaded spaCy pipeline can predict
nlp.get_pipe("ner").labels

('CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART')

In [None]:
! pip install scispacy
! pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_sm-0.5.3.tar.gz


In [52]:
# Preprocess the text of both sample records on copies, leaving the originals untouched.
# TODO: cleaning changes the text, so the character offsets of the manual
# annotations no longer match the cleaned documents.
cleaned_entries = []
for raw_entry in (random_entry_1, random_entry_2):
    cleaned_entry = raw_entry.copy()
    cleaned_entry["text"] = clean(raw_entry["text"])
    cleaned_entries.append(cleaned_entry)
cleaned_text_1, cleaned_text_2 = cleaned_entries



In [53]:
# Inspect the cleaned text of the second sample record
cleaned_text_2["text"] 


'preoperative diagnosis persistent abnormal uterine bleeding endometrial ablation postoperative diagnosis persistent abnormal uterine bleeding endometrial ablation procedure performed total abdominal hysterectomy tah right salpingo oophorectomy complications estimated blood loss 250 cc fluids 1500 cc crystalloids urine 125 cc clear urine end procedure findings exam anesthesia obese female enlarged fibroid uterus freely movable pelvis operative findings demonstrated normal appearing tubes bilaterally right ovary contained right ovarian cyst left ovary appeared normal limits peritoneal surfaces noted normal limits bowel noted normal limits indications procedure patient 44 year old female endometrial ablation showed submucosal fibroids history anemia iron therapy started having bleeding weeks ago intermittent bouts flooding desired permanent definitive therapy felt appropriate patient total abdominal hysterectomy uterus cervix right tube ovary sent pathology review procedure informed cons

In [57]:
# Retrieve the named entities of both sample records from their raw (uncleaned) text.
# retrieve_NE prints the entities it found and returns the spaCy Doc plus a text -> label dict.
print("Found NE for Doc 1:")
doc1, ents_1 = retrieve_NE(random_entry_1["text"])
print("Found NE for Doc 2:")
doc2, ents_2 = retrieve_NE(random_entry_2["text"])

Found NE for Doc 1:
"REASON FOR REFERRAL"->ORG, "58-year-old"->DATE, "African-American"->NORP, "16 years"->DATE, "X."->PERSON, "second"->ORDINAL, "Hospital Center"->ORG, "four years"->DATE, "September 2009"->DATE, "three"->CARDINAL, "90 days"->DATE, "State Services"->ORG, "these 90 days"->DATE, "the end of November"->DATE, "one"->CARDINAL, "any days"->DATE, "workdays"->DATE, "September 2008"->DATE, "two sick days"->DATE, "July of this year"->DATE, "Y"->PERSON, "Ph.D."->WORK_OF_ART, "Comprehensive Independent Medical Evaluation"->ORG, "08/27/2009"->DATE, "Henry Fein"->PERSON, "09/23/2009"->FAC, ",CURRENT FUNCTIONING"->PERSON, "daily"->DATE, "1991"->DATE, "approximately five"->CARDINAL, "the past year"->DATE, "X"->PERSON, "04/02/2009"->DATE, "C7"->PRODUCT, "C6-C7"->PRODUCT, "1976"->DATE, "1974"->DATE, "2007"->DATE, "approximately seven"->CARDINAL, ",CURRENT MEDICATIONS"->PERSON, "NovoLog"->GPE, "Topamax"->ORG, "Lortab"->ORG, "Naprosyn"->ORG, "two to four"->CARDINAL, "North Carolina"->GPE

In [55]:
# Load the ground-truth annotations of both sample records. They were labelled
# manually with https://tecoholic.github.io/ner-annotator/
ground_truth_1 = get_ground_truth("annotations_1.json")
ground_truth_2 = get_ground_truth("annotations_2.json")

In [58]:
# Calculate scores: pair each predicted Doc with its ground-truth annotations and evaluate.
docs_with_ground_truth = [
    Example.from_dict(predicted_doc, gold_annotations)
    for predicted_doc, gold_annotations in ((doc1, ground_truth_1), (doc2, ground_truth_2))
]
print("Scores:")
print_scores(docs_with_ground_truth)

Scores:
Precision: 0.2892561983471074
Recall: 0.3153153153153153
F1-Score: 0.3017241379310345
Scores per Entity:
ORG -> p: 0.0 r: 0.0 f: 0.0 
DATE -> p: 0.48484848484848486 r: 0.7619047619047619 f: 0.5925925925925926 
NORP -> p: 0.0 r: 0.0 f: 0.0 
PERSON -> p: 0.11538461538461539 r: 0.6 f: 0.1935483870967742 
ORDINAL -> p: 1.0 r: 0.26666666666666666 f: 0.4210526315789474 
CARDINAL -> p: 0.5833333333333334 r: 0.5384615384615384 f: 0.5599999999999999 
WORK_OF_ART -> p: 0.0 r: 0.0 f: 0.0 
FAC -> p: 0.0 r: 0.0 f: 0.0 
PRODUCT -> p: 0.0 r: 0.0 f: 0.0 
GPE -> p: 0.0 r: 0.0 f: 0.0 
QUANTITY -> p: 0.25 r: 0.16666666666666666 f: 0.2 
TIME -> p: 0.0 r: 0.0 f: 0.0 
MONEY -> p: 0.0 r: 0.0 f: 0.0 


In [59]:
# Repeat the evaluation with the extended annotations.
# TODO: the extended annotations add 4 new types (SYMPTOM, MEDICAL_CONDITION,
# MEDICAL_PROCEDURE, ANATOMY), but we don't have a model trained on these NER
# types -> we need another model. A scispacy model (trained on medical data)
# could be used, but its NER types differ from ours.

# Retrieve NE
print("Found NE for Doc 1:")
doc1, ents_1 = retrieve_NE(random_entry_1["text"])
print("Found NE for Doc 2:")
doc2, ents_2 = retrieve_NE(random_entry_2["text"])

# Load the extended ground truth and score the predictions against it
ground_truth_1_ext = get_ground_truth("annotations_1_extended_full.json")
ground_truth_2_ext = get_ground_truth("annotations_2_extended_full.json")
docs_with_ground_truth = [
    Example.from_dict(predicted_doc, gold_annotations)
    for predicted_doc, gold_annotations in ((doc1, ground_truth_1_ext), (doc2, ground_truth_2_ext))
]
print("Scores with extended annotations:")
print_scores(docs_with_ground_truth)

Found NE for Doc 1:
"REASON FOR REFERRAL"->ORG, "58-year-old"->DATE, "African-American"->NORP, "16 years"->DATE, "X."->PERSON, "second"->ORDINAL, "Hospital Center"->ORG, "four years"->DATE, "September 2009"->DATE, "three"->CARDINAL, "90 days"->DATE, "State Services"->ORG, "these 90 days"->DATE, "the end of November"->DATE, "one"->CARDINAL, "any days"->DATE, "workdays"->DATE, "September 2008"->DATE, "two sick days"->DATE, "July of this year"->DATE, "Y"->PERSON, "Ph.D."->WORK_OF_ART, "Comprehensive Independent Medical Evaluation"->ORG, "08/27/2009"->DATE, "Henry Fein"->PERSON, "09/23/2009"->FAC, ",CURRENT FUNCTIONING"->PERSON, "daily"->DATE, "1991"->DATE, "approximately five"->CARDINAL, "the past year"->DATE, "X"->PERSON, "04/02/2009"->DATE, "C7"->PRODUCT, "C6-C7"->PRODUCT, "1976"->DATE, "1974"->DATE, "2007"->DATE, "approximately seven"->CARDINAL, ",CURRENT MEDICATIONS"->PERSON, "NovoLog"->GPE, "Topamax"->ORG, "Lortab"->ORG, "Naprosyn"->ORG, "two to four"->CARDINAL, "North Carolina"->GPE



Precision: 0.3017241379310345
Recall: 0.12773722627737227
F1-Score: 0.1794871794871795
Scores per Entity:
ORG -> p: 0.0 r: 0.0 f: 0.0 
DATE -> p: 0.48484848484848486 r: 0.7619047619047619 f: 0.5925925925925926 
NORP -> p: 0.0 r: 0.0 f: 0.0 
PERSON -> p: 0.12 r: 0.6 f: 0.19999999999999998 
ORDINAL -> p: 1.0 r: 0.26666666666666666 f: 0.4210526315789474 
CARDINAL -> p: 0.5833333333333334 r: 0.5384615384615384 f: 0.5599999999999999 
WORK_OF_ART -> p: 0.0 r: 0.0 f: 0.0 
FAC -> p: 0.0 r: 0.0 f: 0.0 
PRODUCT -> p: 0.0 r: 0.0 f: 0.0 
GPE -> p: 0.0 r: 0.0 f: 0.0 
QUANTITY -> p: 0.3333333333333333 r: 0.16666666666666666 f: 0.2222222222222222 
MEDICAL_CONDITION -> p: 0.0 r: 0.0 f: 0.0 
MEDICAL_PROCEDURE -> p: 0.0 r: 0.0 f: 0.0 
SYMPTOM -> p: 0.0 r: 0.0 f: 0.0 
ANATOMY -> p: 0.0 r: 0.0 f: 0.0 
TIME -> p: 0.0 r: 0.0 f: 0.0 
MONEY -> p: 0.0 r: 0.0 f: 0.0 


In [None]:
# Convert the extended ground-truth character offsets of document 2 into per-token tags for doc2.
# NOTE(review): presumably used to check whether the annotated offsets line up with
# spaCy's token boundaries - confirm.
spacy.training.offsets_to_biluo_tags(doc2, ground_truth_2_ext["entities"])


In [12]:
#playground vinz

In [13]:
# Look up the span covering characters 40-51 of doc1 (the offsets of the first ground-truth entity).
# NOTE(review): the label "PERS" looks like an arbitrary placeholder - confirm.
doc1.char_span(40, 51, label="PERS")

58-year-old

In [14]:
# Collect the predicted entities of doc1 as [start_char, end_char, label] triples,
# the same layout as the "entities" lists of the ground-truth annotations.
# The result is named `predicted_offsets` rather than `list`, so the built-in
# `list` type is not shadowed for the rest of the kernel session.
predicted_offsets = [[ent.start_char, ent.end_char, ent.label_] for ent in doc1.ents]


#spacy.training.offsets_to_biluo_tags(doc1, predicted_offsets)

In [15]:

# Display the ground-truth annotations of document 1 ("entities" holds [start_char, end_char, label] triples)
ground_truth_1

{'entities': [[40, 51, 'DATE'],
  [94, 102, 'DATE'],
  [176, 178, 'PERSON'],
  [203, 209, 'ORDINAL'],
  [1236, 1246, 'ORDINAL'],
  [1289, 1303, 'DATE'],
  [1504, 1509, 'CARDINAL'],
  [1578, 1585, 'DATE'],
  [1722, 1736, 'ORDINAL'],
  [1759, 1766, 'DATE'],
  [1785, 1800, 'ORDINAL'],
  [2065, 2068, 'CARDINAL'],
  [2116, 2124, 'DATE'],
  [2288, 2289, 'ORDINAL'],
  [2421, 2435, 'DATE'],
  [2450, 2463, 'DATE'],
  [2557, 2574, 'DATE'],
  [2888, 2889, 'PERSON'],
  [2961, 2971, 'ORDINAL'],
  [2991, 3001, 'DATE'],
  [4513, 4523, 'PERSON'],
  [4555, 4556, 'PERSON'],
  [4563, 4569, 'ORDINAL'],
  [4645, 4655, 'DATE'],
  [4694, 4699, 'CARDINAL'],
  [4739, 4755, 'QUANTITY'],
  [5023, 5033, 'DATE'],
  [6670, 6674, 'DATE'],
  [6832, 6842, 'CARDINAL'],
  [6843, 6863, 'ORDINAL'],
  [6995, 6998, 'PERSON'],
  [7358, 7361, 'CARDINAL'],
  [7828, 7838, 'DATE'],
  [8003, 8013, 'DATE'],
  [8204, 8208, 'ORDINAL'],
  [8467, 8471, 'DATE'],
  [8498, 8502, 'ORDINAL'],
  [8598, 8602, 'ORDINAL'],
  [8691, 8711, 'TIME

In [16]:
# Spot-check the raw text at characters 1722-1736 (slice end is exclusive), i.e. the
# ground-truth span starting at 1722 plus one trailing character.
print(random_entry_1["text"][1722:1737])

State Services.


In [17]:
# Print the doc1 span behind every ground-truth entity; only the first two
# fields (start and end character offsets) of each annotation are used.
for start_char, end_char, *_rest in ground_truth_1["entities"]:
    gold_span = doc1.char_span(start_char, end_char, label="PERS")
    print(gold_span)

58-year-old
16 years
X.
second
four years
September 2009
three
90 days
State Services
90 days
end of November
one
any days
.
September 2008
two sick days
July of this year
Y
08/14/2009
08/27/2009
Henry Fein
X
second
09/23/2009
20/30
one out of three
09/14/2009
1991
five times
within the past year
X's
one
04/02/2009
06/04/2009
1991
1976
1974
2007
seven to eight hours
NovoLog
insulin pump
metformin
metoprolol
amlodipine
Topamax
Lortab
tramadol
amitriptyline
calcium
vitamin D
fluoxetine
pantoprazole
Naprosyn
fluticasone propionate
two to four cups
North Carolina
sixth
nine
third
60
93 years
fourth
1979
two years
1980
two
43
30
30
New York
four years
2-1/2 years
Walter P. Carter Center
21 years
two
between 1991 and 1997
Prozac
Prozac
2nd Edition
Third
Fourth
2
7
Second
Second


In [18]:
# Probe which character ranges starting at offset 9103 yield a span in doc1.
print(doc1.char_span(9099, 9102, label="PERS"))

for length in range(20):
    candidate = doc1.char_span(9103, 9103 + length, label="PERS")
    print(f"{length}{candidate}")

and
0None
1None
2None
3None
4None
5None
6None
7vitamin
8None
9None
10None
11None
12None
13None
14None
15None
16None
17None
18None
19None


In [19]:

  # Scratch value: one annotation triple starting at the offset probed above.
  # The trailing comma makes this expression a one-element tuple.
  [9103, 9113, 'PRODUCT'],

([9103, 9113, 'PRODUCT'],)

In [20]:
# List every predicted entity of doc1 with its character offsets and label.
for span in doc1.ents:
    print(f"{span.text} {span.start_char} {span.end_char} {span.label_}")

58-year-old 40 51 DATE
African-American 52 68 NORP
16 years 94 102 DATE
X. 176 178 PERSON
second 203 209 ORDINAL
Hospital Center 1216 1231 FAC
four years 1236 1246 DATE
September 2009 1289 1303 DATE
hours 1456 1461 TIME
three 1504 1509 CARDINAL
90 days 1578 1585 DATE
State Services 1722 1736 ORG
these 90 days 1753 1766 DATE
around the end of November 1774 1800 DATE
one 2065 2068 CARDINAL
days 2120 2124 DATE
September 2008 2421 2435 DATE
two sick days 2450 2463 DATE
July of this year 2557 2574 DATE
Y 2888 2889 PERSON
Ph.D. 2891 2896 WORK_OF_ART
08/14/2009 2961 2971 DATE
Henry Fein 4513 4523 ORG
M.D. 4525 4529 GPE
second 4563 4569 ORDINAL
09/23/2009 4645 4655 LAW
20/30 4694 4699 CARDINAL
one 4739 4742 CARDINAL
three 4750 4755 CARDINAL
09/14/2009 5023 5033 DATE
daily 5732 5737 DATE
1991 6670 6674 DATE
approximately five 6818 6836 CARDINAL
the past year 6850 6863 DATE
C7 7890 7892 CARDINAL
C6-C7 7966 7971 PRODUCT
1991 8204 8208 DATE
GERD 8419 8423 ORG
1976 8467 8471 DATE
1974 8498 8502 DAT