In [1]:
import pandas as pd
from os import listdir
import re
from typing import Match

## 1. Load n2c2 data

In [2]:
# Compiled once when the cell runs instead of on every call.
_CONCEPT_PATTERN = re.compile(r'''c="(?P<string>.+)" (?P<start_line>\d+):(?P<start_cut>\d+) (?P<end_line>\d+):(?P<end_cut>\d+)\|\|t="(?P<type>.+)"''', re.IGNORECASE)


def concept_annots(text):
    """Parse one concept-annotation line into a dict.

    Expected line shape::

        c="<string>" <line>:<cut> <line>:<cut>||t="<type>"

    Parameters
    ----------
    text : str
        A single annotation line; leading/trailing whitespace is ignored.

    Returns
    -------
    dict
        Keys ``string``, ``type`` (str) and ``start_line``, ``start_cut``,
        ``end_line``, ``end_cut`` (int). Both line numbers are decremented by
        one relative to the file (presumably so they can index the list
        returned by ``readlines()`` on the note -- TODO confirm); the two
        ``*_cut`` offsets are kept exactly as written in the file.

    Raises
    ------
    ValueError
        If the line does not have the expected shape. Previously this
        surfaced as an ``AttributeError`` on ``None`` that did not say which
        line was at fault.
    """
    text = text.strip()
    match = _CONCEPT_PATTERN.search(text)
    if match is None:
        raise ValueError(f"Not a valid concept annotation line: {text!r}")
    return {
        "string": match.group('string'),
        "start_line": int(match.group('start_line')) - 1,
        "start_cut": int(match.group('start_cut')),
        "end_line": int(match.group('end_line')) - 1,
        "end_cut": int(match.group('end_cut')),
        "type": match.group('type'),
    }

In [3]:
# Compiled once when the cell runs instead of on every call.
_REL_PATTERN = re.compile(r'''c="(?P<c1>.+)" (?P<c1_start_line>\d+):(?P<c1_start_cut>\d+) (?P<c1_end_line>\d+):(?P<c1_end_cut>\d+)\|\|r="(?P<rel>.+)"\|\|c="(?P<c2>.+)" (?P<c2_start_line>\d+):(?P<c2_start_cut>\d+) (?P<c2_end_line>\d+):(?P<c2_end_cut>\d+)''', re.IGNORECASE)


def rel_annots(text):
    """Parse one relation-annotation line into a dict.

    Expected line shape::

        c="<c1>" <line>:<cut> <line>:<cut>||r="<rel>"||c="<c2>" <line>:<cut> <line>:<cut>

    Parameters
    ----------
    text : str
        A single annotation line; leading/trailing whitespace is ignored.

    Returns
    -------
    dict
        ``c1``, ``rel`` and ``c2`` as strings, plus for each of the two
        concepts the integer keys ``<c>_start_line``, ``<c>_start_cut``,
        ``<c>_end_line`` and ``<c>_end_cut``. Line numbers are decremented by
        one relative to the file (presumably to index the note's line list --
        TODO confirm); ``*_cut`` offsets are kept as written.

    Raises
    ------
    ValueError
        If the line does not have the expected shape. Previously this
        surfaced as an ``AttributeError`` on ``None`` that did not say which
        line was at fault.
    """
    text = text.strip()
    match = _REL_PATTERN.search(text)
    if match is None:
        raise ValueError(f"Not a valid relation annotation line: {text!r}")
    return {
        "c1": match.group('c1'),
        "c1_start_line": int(match.group('c1_start_line')) - 1,
        "c1_start_cut": int(match.group('c1_start_cut')),
        "c1_end_line": int(match.group('c1_end_line')) - 1,
        "c1_end_cut": int(match.group('c1_end_cut')),
        "rel": match.group('rel'),
        "c2": match.group('c2'),
        "c2_start_line": int(match.group('c2_start_line')) - 1,
        "c2_start_cut": int(match.group('c2_start_cut')),
        "c2_end_line": int(match.group('c2_end_line')) - 1,
        "c2_end_cut": int(match.group('c2_end_cut')),
    }

In [4]:
# Compiled once when the cell runs instead of on every call.
_AST_PATTERN = re.compile(r'''c="(?P<string>.+)" (?P<start_line>\d+):(?P<start_cut>\d+) (?P<end_line>\d+):(?P<end_cut>\d+)\|\|t="(?P<type>.+)"\|\|a="(?P<ast>.+)"''', re.IGNORECASE)


def ast_annots(text):
    """Parse one assertion-annotation line into a dict.

    Expected line shape::

        c="<string>" <line>:<cut> <line>:<cut>||t="<type>"||a="<ast>"

    Parameters
    ----------
    text : str
        A single annotation line; leading/trailing whitespace is ignored.

    Returns
    -------
    dict
        Keys ``string``, ``type``, ``ast`` (str) and ``start_line``,
        ``start_cut``, ``end_line``, ``end_cut`` (int). Line numbers are
        decremented by one relative to the file (presumably to index the
        note's line list -- TODO confirm); ``*_cut`` offsets are kept as
        written.

    Raises
    ------
    ValueError
        If the line does not have the expected shape. Previously this
        surfaced as an ``AttributeError`` on ``None`` that did not say which
        line was at fault.
    """
    text = text.strip()
    match = _AST_PATTERN.search(text)
    if match is None:
        raise ValueError(f"Not a valid assertion annotation line: {text!r}")
    return {
        "string": match.group('string'),
        "start_line": int(match.group('start_line')) - 1,
        "start_cut": int(match.group('start_cut')),
        "end_line": int(match.group('end_line')) - 1,
        "end_cut": int(match.group('end_cut')),
        "type": match.group('type'),
        "ast": match.group('ast'),
    }

In [5]:
# Placeholder only: the name is rebound to the result of load_data(paths) further down.
train_data = {}

In [6]:
# Root folders of the training data. load_data() reads the txt/, concept/,
# rel/ and ast/ sub-folders of every entry.
paths = ['data/n2c2/concept_assertion_relation_training_data/beth',
         'data/n2c2/concept_assertion_relation_training_data/partners'
        ]

In [7]:
def _annotation_name(text_name, ext):
    """Return the annotation file name that pairs with ``text_name``."""
    if text_name.endswith('.txt'):
        # Swap only the extension, so a stem that itself contains "txt"
        # (e.g. "txt-1.txt") is left intact.
        return text_name[:-len('txt')] + ext
    # No .txt suffix: keep the historical substitution behaviour.
    return text_name.replace('txt', ext)


def _read_lines(file_path):
    """Read all lines of ``file_path`` and close the handle afterwards."""
    with open(file_path, 'r') as handle:
        return handle.readlines()


def load_data(paths):
    """Load notes and their annotations from one or more corpus folders.

    Parameters
    ----------
    paths : iterable of str
        Corpus roots. Each must contain ``txt/``, ``concept/``, ``rel/`` and
        ``ast/`` sub-folders; for every file in ``txt/`` the matching
        ``.con``, ``.rel`` and ``.ast`` files are read.

    Returns
    -------
    dict
        Maps each text file name to a dict with keys ``text`` (the note's
        raw lines, newlines included), ``concept``, ``rel`` and ``ast``
        (lists of parsed annotation dicts). Results are keyed by bare file
        name, so a name that occurs under several roots keeps only the entry
        from the last root.

    Raises
    ------
    OSError
        If a folder or an expected annotation file is missing.
    """
    data = {}
    for path in paths:
        text_path = path + '/txt'
        concept_path = path + '/concept'
        rel_path = path + '/rel'
        ast_path = path + '/ast'
        for c in listdir(text_path):
            if c == '.DS_Store':
                # macOS Finder metadata, not a note.
                continue

            textf = _read_lines(text_path + '/' + c)

            # Blank lines (e.g. a trailing newline at end of file) carry no
            # annotation and would make the parsers fail, so skip them.
            conf = [concept_annots(line)
                    for line in _read_lines(concept_path + '/' + _annotation_name(c, 'con'))
                    if line.strip()]
            relf = [rel_annots(line)
                    for line in _read_lines(rel_path + '/' + _annotation_name(c, 'rel'))
                    if line.strip()]
            astf = [ast_annots(line)
                    for line in _read_lines(ast_path + '/' + _annotation_name(c, 'ast'))
                    if line.strip()]

            data[c] = {'text': textf, 'concept': conf, 'rel': relf, 'ast': astf}
    return data

In [8]:
# Parse every note under the folders in `paths` into one dict keyed by text file name.
train_data = load_data(paths)

In [9]:
len(train_data)

170

In [10]:
# Same loader, pointed at the single folder holding the test-set reference annotations.
test_data = load_data(['data/n2c2/reference_standard_for_test_data'])

In [11]:
len(test_data)

256

In [12]:
# train_data

## 2. Check all relations are from the same sentence

In [13]:
# Print the file name once for every training relation whose two concepts are
# not confined to one and the same line. All four line indices are compared:
# checking only c1_start_line against c2_end_line would miss a relation where
# c2 starts on an earlier line, or where c1 runs on past that line.
for k in train_data.keys():
    for rel in train_data[k]['rel']:
        rel_lines = {rel['c1_start_line'], rel['c1_end_line'],
                     rel['c2_start_line'], rel['c2_end_line']}
        if len(rel_lines) != 1:
            print(k)

In [14]:
# Print the file name once for every test-set relation whose two concepts are
# not confined to one and the same line. All four line indices are compared:
# checking only c1_start_line against c2_end_line would miss a relation where
# c2 starts on an earlier line, or where c1 runs on past that line.
for k in test_data.keys():
    for rel in test_data[k]['rel']:
        rel_lines = {rel['c1_start_line'], rel['c1_end_line'],
                     rel['c2_start_line'], rel['c2_end_line']}
        if len(rel_lines) != 1:
            print(k)

### NB: Every relation links two concepts within a single sentence, so results are evaluated at the sentence level.

## 3. Test NER granularities

In [15]:
### Here is (I think) a complete list of features:

#     modified tokenizer to better handle academic text
#     retrained dependency parser and pos tagger on GENIA treebank
#     PubMed word vectors (in medium and large models)
#     Retrained NER on MedMentions
#     Four bio specific NER models trained on BIONLP13CG, BC5CDR, JNLPBA, CRAFT.
#     Abbreviation detection pipe
#     Entity linking candidate generation pipe for linking entities to UMLS


In [16]:
import scispacy
import spacy
import seaborn as sns

In [17]:
from scispacy.linking import EntityLinker

In [18]:
# Active pipeline model. The commented-out lines below are alternative
# en_core_sci_* models that can be swapped in for comparison.
nlp = spacy.load("en_core_sci_scibert")
# nlp = spacy.load("en_core_sci_lg")
# nlp = spacy.load("en_core_sci_md")
# nlp = spacy.load("en_core_sci_sm")

In [19]:
# Add the "scispacy_linker" component to the pipeline, with abbreviation
# resolution switched on and the linker name set to "umls".
nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "name": "umls"})



<scispacy.linking.EntityLinker at 0x7fcaa5252690>

In [22]:
# Keep a handle on the linker component registered as "scispacy_linker"
# (the commented-out inspection cell uses it via linker.kb.cui_to_entity).
linker = nlp.get_pipe("scispacy_linker")

In [23]:
# Demo input: replace `text` with your own data, then run it through the pipeline.
text = '''
Ken Nansteel-Miller is the triplet #3 of a spontaneous triamniotic-trichorionic triplet pregnancy born to a 39-year-old G4 P1 spontaneous abortion 2 woman .
'''
doc = nlp(text)

In [30]:
# for ent in doc.ents:
#     print('---------------------------------')
# #     print(ent.sent)
#     print("Span: ", ent.start, ent.end, ent.start_char, ent.end_char)
#     print("String: ", ent)
# #     print(ent._.kb_ents)
#     for umls_ent in ent._.kb_ents[:1]:
#         print("Top entity per mention: ", umls_ent[0], " score: ", umls_ent[1])
# #         print(linker.kb.cui_to_entity[umls_ent[0]])

### Evaluation: discard non-entity words.

In [None]:
# https://github.com/davidsbatista/NER-Evaluation/blob/master/example-full-named-entity-evaluation.ipynb
true = [
    [{"label": "PER", "start": 2, "end": 4}],
    [{"label": "LOC", "start": 1, "end": 2},
     {"label": "LOC", "start": 3, "end": 4}]
]

In [333]:
# Build two parallel lists with one entry per line of every training note:
# `preds` holds the spans found by the scispaCy pipeline, `trues` the gold
# concept spans, both as {"label", "start", "end"} dicts.
preds = []
trues = []
for did in train_data.keys():
    record = train_data[did]

    for sentence in record['text']:
        # Fix: analyse the current line. The previous code passed the global
        # demo string `text`, so every line produced the same prediction.
        doc = nlp(sentence)
        labels = []
        for ent in doc.ents:
            start, end = ent.start, ent.end
            # Keep only the first linker candidate of each mention.
            for umls_ent in ent._.kb_ents[:1]:
                labels.append({"label": umls_ent[0], "start": start, "end": end})

        preds.append(labels)

    # Gold spans, bucketed by the line each concept starts on so that
    # trues[i] lines up with preds[i].
    # TODO(review): the gold offsets are copied verbatim from the annotation
    # files, while the predicted offsets are spaCy token indices and the
    # predicted labels are linker ids rather than concept types -- reconcile
    # both conventions before scoring.
    gold_by_line = [[] for _ in record['text']]
    for con in record['concept']:
        gold_by_line[con['start_line']].append(
            {"label": con['type'], "start": con['start_cut'], "end": con['end_cut']})
    trues.extend(gold_by_line)
        

Admission Date :

2011-09-23

Discharge Date :

2011-10-06

Date of Birth :

2011-09-23

Sex :

M

Service :

NEONATOLOGY

HISTORY OF PRESENT ILLNESS :

Ken Nansteel-Miller is the triplet #3 of a spontaneous triamniotic-trichorionic triplet pregnancy born to a 39-year-old G4 P1 spontaneous abortion 2 woman .

Prenatal screens :

Blood type A positive , antibody negative , rubella immune , RPR nonreactive , hepatitis B surface antigen negative , group beta Strep status unknown .

The mother was followed closely and received betamethasone 2011-08-22 .

She was admitted on 2011-08-25 for nausea , vomiting , and a shortened cervix .

She was treated with bed rest and prn terbutaline .

Other medications including Prozac .

The babies were born by elective cesarean section at 34 weeks gestation .

This triplet #3 emerged with spontaneous respirations , and required blow-by oxygen in the delivery room .

Apgars were seven at one minute and eight at five minutes .

He was admitted to the Neon


Waiting for a subsidized apartment .

No current relationship .

Family History :

Denies .

Physical Exam :

Vitals : 99.3   67 ( 60-70 )   99/53 ( 90-100 s)   96% on RA   3.7L + LOS

Gen : caucasian man lying in bed , NAD

HEENT : NCAT , PERRL , EOMI , no icterus , OP clear , MMM , no tongue fasciculations

Neck : supple , no LAD , no JVD

CV : RRR , nl s1 s2 , no m/g/r

Lungs : ? decreased BS over left middle lung , otherwise CTA

Abd : normal size , nd , no scars , nl bs , soft , nt , palp liver

Ext : no c/c/e , no edema

Neuro : no asterixis , CN II-XII intact , 5/5 strength throughout , sensation to LT intact throughout , gait deferred

CMED : A+Ox3 , mood " great ," affect approp , speech stuttering , linear TP , no SI or HI

Pertinent Results :

Labs on admit :

2012-06-07 09:55 PM BLOOD WBC - 8.9 RBC - 4.94 Hgb - 15.8 Hct - 45.3 MCV - 92 MCH - 31.9 MCHC - 34.8 RDW - 12.4 Plt Ct - 175

2012-06-07 09:55 PM BLOOD Neuts - 49.9 * Lymphs - 42.4 * Monos - 5.5 Eos - 1.6 Baso - 0.6



2018-10-25 11:15 AM BLOOD WBC - 18.3 *# RBC - 3.42 * Hgb - 10.9 * Hct - 31.6 * MCV - 92 MCH - 31.7 MCHC - 34.4 RDW - 13.3 Plt Ct - 134 *

2018-10-31 06:25 AM BLOOD WBC - 13.6 * RBC - 2.72 * Hgb - 8.6 * Hct - 24.6 * MCV - 91 MCH - 31.7 MCHC - 35.0 RDW - 14.0 Plt Ct - 314

2018-10-25 11:15 AM BLOOD PT - 13.3 * PTT - 30.0 INR(PT) - 1.2 *

2018-10-29 06:50 AM BLOOD PT - 11.9 INR(PT) - 1.0

2018-10-25 12:36 PM BLOOD UreaN - 17 Creat - 0.7 Cl - 111 * HCO3 - 23

2018-10-31 06:25 AM BLOOD Glucose - 91 UreaN - 19 Creat - 0.8 Na - 134 K - 4.0 Cl - 98 HCO3 - 26 AnGap - 14

Brief Hospital Course :

Mr. Kammerer was a same day admit and on 10-25 was brought to the operating room where he underwent a coronary artery bypass graft x 3 .

Please see operative report for surgical details .

He tolerated the procedure well and was transferred to the CSRU for invasive monitoring in stable condition .

Later on op day he was weaned from sedation , awoke neurologically intact , and 

extubated .

Beta block


3) CORD COMPRESSION .

4) STEROID INDUCED DIABETES MELLITUS .

5) PLASMACYTOMA .

6) POOR NUTRITIONAL STATUS .

7) THROMBOCYTOPENIA STATUS POST BONE MARROW TRANSPLANT .

8) MULTIPLE MYELOMA .

HISTORY OF PRESENT ILLNESS :

This is a 59-year-old white male .

The patient has had multiple myeloma since 1990 .

He underwent an autologous bone marrow transplant in 10/92 at Petersly Hospital And Medical Center .

At that time he received total body irradiation of 1400 cGy .

This bone marrow transplant did not produce remission and then he was treated with three cycles of VAD subsequently .

In 08/94 , the patient developed lower back pain .

A workup done in DE is not available at the time of this dictation .

According to her scanty records , an MRI of the spine showed diffuse spinal column involvement with lesions in T4 and T8 with no cord compression at that time .

He was then treated with radiation therapy from T3 to T9 receiving 2500 cGy from 08/94 to 09/94 .

He has been on Decadro


One drain was removed on postoperative day number two , and the second drain she will be discharged to home with .

DISPOSITION :

The patient is discharged to home in good condiiton .

Discharge medications include Percocet and Colace .

Folloowup is in one week with Dr. Ralekote .

DP337/1857 DETIASAIRE KAYSJARDUHE , M.D. PQ01

D :

08/08/92

Batch :

9736

Report :

J3725F1

T :

08/10/92

Dictated By :

LA JESCCOT , M.D. KC44

843566350 RWH

0025596

526208

0555812

1/12/2005 12:00:00 AM

ED Discharge Summary

Unsigned

DIS

Report Status :

Unsigned

ED DISCHARGE NOTIFICATION / SUMMARY

KOTEA , TORTHEOA

MRN :

0025596

Age :

47y

REGISTRATION DATE :

01/12/2005 06:07 AM

Provider :

THAA WAGES

PRELMINARY REPORT

Benefits Assigned :

N

Discharge Note

Date / Time :

01/12/2005 09:17

Discharge Status :

Discharged

Condition on Discharge :

Stable

Patient States Complaint :

MIGRAINES FOR 10 DAYS

Standardized Discharge Instructions :

The patient was given printed instructi


FHTs 140s w/ reactivity .

Ctx q 6 &apos; .

UTERINE SIZE IN WEEKS 38

HOSPITAL COURSE ( include complications if any ) :

This 27 year old Gravida 1 Para 0000 was admitted to the Life Valley Medical Center Obstetrical service on 10/31/2004 at 08:45 pm for the indication ( s ) :

premature rupture of membranes .

She delivered a 3235 gram male infant on 11/01/2004 at 09:31 am with apgar scores of 9 and 9 at one and five minutes respectively at 39.0 weeks gestation via spontaneous vertex vaginal delivery .

During her labor she encountered the following complication ( s ) :

none .

During her delivery she encountered the following complication ( s ) :

none .

Postpartum she encountered the following complication ( s ) :

none .

She was discharged on 11/03/2004 at 11:18 am in good condition .

DISCHARGE ORDERS ( medications instructions to patient , follow-up care ) :

DISCHARGE ACTIVITY :

No Restrictions

DISCHARGE DIET :

No Restrictions

POSTPARTUM DISPOSITION :

Home With Self-C

In [31]:
# doc = train_data['record-67.txt']
# for con in doc['concept']:
#     sentence = doc['text'][con['start_line']]
#     print(con, sentence)

In [299]:
# !pip install nervaluate

In [327]:
# Toy run of the evaluator: gold spans for two text units ...
true = [
    [{"label": "PER", "start": 2, "end": 4}],
    [{"label": "LOC", "start": 1, "end": 2},
     {"label": "LOC", "start": 3, "end": 4}]
]

# ... and predictions in which the LOC span 1-2 is filed under the first unit
# although the gold data has it in the second one.
pred = [
    [{"label": "PER", "start": 2, "end": 4},
    {"label": "LOC", "start": 1, "end": 2}],
    [
     {"label": "LOC", "start": 3, "end": 4},
     {"label": "LOC", "start": 5, "end": 7}]
]

from nervaluate import Evaluator

evaluator = Evaluator(true, pred, tags=['LOC', 'PER'])

# Returns overall metrics and metrics for each tag

results, results_per_tag = evaluator.evaluate()

print(results)

{'ent_type': {'correct': 2, 'incorrect': 0, 'partial': 0, 'missed': 1, 'spurious': 2, 'possible': 3, 'actual': 4, 'precision': 0.5, 'recall': 0.6666666666666666, 'f1': 0.5714285714285715}, 'partial': {'correct': 2, 'incorrect': 0, 'partial': 0, 'missed': 1, 'spurious': 2, 'possible': 3, 'actual': 4, 'precision': 0.5, 'recall': 0.6666666666666666, 'f1': 0.5714285714285715}, 'strict': {'correct': 2, 'incorrect': 0, 'partial': 0, 'missed': 1, 'spurious': 2, 'possible': 3, 'actual': 4, 'precision': 0.5, 'recall': 0.6666666666666666, 'f1': 0.5714285714285715}, 'exact': {'correct': 2, 'incorrect': 0, 'partial': 0, 'missed': 1, 'spurious': 2, 'possible': 3, 'actual': 4, 'precision': 0.5, 'recall': 0.6666666666666666, 'f1': 0.5714285714285715}}


In [328]:
# Toy run of the evaluator: gold spans for two text units ...
true = [
    [{"label": "PER", "start": 2, "end": 4}],
    [{"label": "LOC", "start": 1, "end": 2},
     {"label": "LOC", "start": 3, "end": 4}]
]

# ... and predictions that contain every gold span in its own unit, plus one
# extra LOC span (5-7) in the second unit.
pred = [
    [{"label": "PER", "start": 2, "end": 4}],
    [{"label": "LOC", "start": 1, "end": 2},
     {"label": "LOC", "start": 3, "end": 4},
     {"label": "LOC", "start": 5, "end": 7}]
]

from nervaluate import Evaluator

evaluator = Evaluator(true, pred, tags=['LOC', 'PER'])

# Returns overall metrics and metrics for each tag

results, results_per_tag = evaluator.evaluate()

print(results)

{'ent_type': {'correct': 3, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 1, 'possible': 3, 'actual': 4, 'precision': 0.75, 'recall': 1.0, 'f1': 0.8571428571428571}, 'partial': {'correct': 3, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 1, 'possible': 3, 'actual': 4, 'precision': 0.75, 'recall': 1.0, 'f1': 0.8571428571428571}, 'strict': {'correct': 3, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 1, 'possible': 3, 'actual': 4, 'precision': 0.75, 'recall': 1.0, 'f1': 0.8571428571428571}, 'exact': {'correct': 3, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 1, 'possible': 3, 'actual': 4, 'precision': 0.75, 'recall': 1.0, 'f1': 0.8571428571428571}}


In [329]:
# Compare two pasted evaluator result dicts as plain strings. The counts
# differ, and the right-hand literal also ends with a newline, so this
# evaluates to False.
'''{'ent_type': {'correct': 3, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 1, 'possible': 3, 'actual': 4, 'precision': 0.75, 'recall': 1.0, 'f1': 0.8571428571428571}, 'partial': {'correct': 3, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 1, 'possible': 3, 'actual': 4, 'precision': 0.75, 'recall': 1.0, 'f1': 0.8571428571428571}, 'strict': {'correct': 3, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 1, 'possible': 3, 'actual': 4, 'precision': 0.75, 'recall': 1.0, 'f1': 0.8571428571428571}, 'exact': {'correct': 3, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 1, 'possible': 3, 'actual': 4, 'precision': 0.75, 'recall': 1.0, 'f1': 0.8571428571428571}}''' == '''{'ent_type': {'correct': 2, 'incorrect': 0, 'partial': 0, 'missed': 1, 'spurious': 2, 'possible': 3, 'actual': 4, 'precision': 0.5, 'recall': 0.6666666666666666, 'f1': 0.5714285714285715}, 'partial': {'correct': 2, 'incorrect': 0, 'partial': 0, 'missed': 1, 'spurious': 2, 'possible': 3, 'actual': 4, 'precision': 0.5, 'recall': 0.6666666666666666, 'f1': 0.5714285714285715}, 'strict': {'correct': 2, 'incorrect': 0, 'partial': 0, 'missed': 1, 'spurious': 2, 'possible': 3, 'actual': 4, 'precision': 0.5, 'recall': 0.6666666666666666, 'f1': 0.5714285714285715}, 'exact': {'correct': 2, 'incorrect': 0, 'partial': 0, 'missed': 1, 'spurious': 2, 'possible': 3, 'actual': 4, 'precision': 0.5, 'recall': 0.6666666666666666, 'f1': 0.5714285714285715}}
'''

False