In [None]:
!pip install medspacy > /dev/null
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz > /dev/null


Let's apply some of the concepts from the spaCy 101 notebook to a real-world scenario: determining the diagnostic accuracy of our right upper quadrant ultrasound (RUQ US) exams for acute cholecystitis.

We'll look at some radiology report impressions and the corresponding pathology diagnoses and determine whether each report asserts the presence of 'cholecystitis' (i.e. mentions it in a *non-negated* context).

In [None]:
import pprint
import spacy
import medspacy
from medspacy.context import ConTextComponent
from medspacy.visualization import visualize_dep, visualize_ent
import json

# Toggle for the optional scispaCy UMLS entity linker (off by default).
incl_scispacy_umls_linker = False

# Linking to UMLS is not needed for NER and context annotations; enable it only
# if the CUIs are required.
# It adds significantly to load time and document processing time.
if incl_scispacy_umls_linker:
    !pip install scispacy
    from scispacy.linking import EntityLinker

# Suppress all Python warnings for the rest of the session to keep cell output clean.
import warnings
warnings.filterwarnings('ignore')

# Shared pretty-printer used later in the notebook to display result records.
pp = pprint.PrettyPrinter(indent=2)

In [None]:
# Load the paired radiology report impressions and corresponding pathology
# reports into a variable called 'data'.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying on
# the platform's default locale encoding.
with open('../resources/ruqus_acutechole.json', encoding='utf-8') as json_file:
    data = json.load(json_file)
 

In [None]:
# Let's peek at the first item in our data to see which fields each record carries
# (radiology and pathology identifiers plus the report text for each).
# NOTE(review): later cells read the 'rad_impression' and 'path_dx' keys, so the
# pathology text presumably lives under 'path_dx' rather than 'path_impression' —
# confirm against the output below.
data[0]

---

We'll initialize the same pipeline we used in spaCy_101:

In [None]:
# Initialize an nlp pipeline based on the small version of the
# English language core scientific model "en_core_sci_sm"
nlp = spacy.load("en_core_sci_sm")
if incl_scispacy_umls_linker:
    # Optional: add the scispaCy linker so entities are annotated with UMLS CUIs
    nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})

# Add the ConText pipeline component to our model.
# This will determine whether NEs are negated, hypothetical, uncertain.
# add_pipe() builds the component from its registered factory name and returns
# it, so `context` refers to the instance that actually runs inside the pipeline
# (constructing ConTextComponent(nlp) separately would create a second, detached
# instance that the pipeline never uses).
context = nlp.add_pipe("medspacy_context")
print(nlp.pipe_names)

In [None]:
def find_acute_chole(text: str) -> bool:
    """Report whether `text` asserts acute cholecystitis.

    The text is run through the module-level `nlp` pipeline. The function
    returns True as soon as it finds a named entity that:
      a) mentions "cholecystitis" (case-insensitive),
      b) does not mention "chronic", and
      c) is not flagged as negated by the ConText component.

    Args:
        text: Free-text report (e.g. a radiology impression or pathology diagnosis).

    Returns:
        True if at least one qualifying entity is found, otherwise False.
    """
    doc = nlp(text)
    for ent in doc.ents:
        # Normalise once; substring checks are unaffected by surrounding whitespace.
        ent_text = str(ent).lower()
        if "cholecystitis" not in ent_text or "chronic" in ent_text:
            continue
        if not ent._.is_negated:
            return True
    return False

In [None]:
# Tag every record (in place) with the classifier's verdict for its radiology
# impression and its pathology diagnosis, collecting the tagged records in `results`.
results = []
for record in data:
    for flag_key, text_key in (('rad_positive', 'rad_impression'),
                               ('path_positive', 'path_dx')):
        record[flag_key] = find_acute_chole(record[text_key])
    results.append(record)

In [None]:
# Sort each record into a confusion-matrix bucket, with the pathology result as
# the reference standard and the radiology result as the test being evaluated.
tp, tn, fp, fn = [], [], [], []
for r in results:
    rad_pos, path_pos = r['rad_positive'], r['path_positive']
    if rad_pos:
        (tp if path_pos else fp).append(r)
    else:
        (fn if path_pos else tn).append(r)

In [None]:
# Summarise the confusion matrix and the derived diagnostic metrics.
n_tp, n_fp, n_tn, n_fn = len(tp), len(fp), len(tn), len(fn)


def _rate(numerator, denominator):
    """Format numerator/denominator to two decimals, or 'n/a' when the denominator is zero."""
    # A zero denominator happens when, for example, no case is pathology-positive;
    # report the metric as undefined instead of raising ZeroDivisionError.
    return '{0:.2f}'.format(numerator / denominator) if denominator else 'n/a'


print(""" True Positive: %i
False Positive: %i
 True Negative: %i
False Negative: %i
   Sensitivity: %s
   Specificity: %s
      Accuracy: %s
""" % (
    n_tp,
    n_fp,
    n_tn,
    n_fn,
    _rate(n_tp, n_tp + n_fn),                      # sensitivity = TP / (TP + FN)
    _rate(n_tn, n_tn + n_fp),                      # specificity = TN / (TN + FP)
    _rate(n_tn + n_tp, n_tn + n_fp + n_tp + n_fn)  # accuracy = (TN + TP) / all cases
    ))

In [None]:
# Inspect the true positives: records where both the radiology and the pathology
# report were classified as positive.
print(pp.pformat(tp))