In [14]:
import scispacy
import spacy
from spacy.matcher import Matcher, PhraseMatcher
import os
from spacy.tokens import Span

In [3]:
par_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
bc5cdr_md = spacy.load("{}/models/en_ner_bc5cdr_md-0.1.0".format(par_dir))

In [4]:
import sys

In [5]:
sys.path.append("../code/app")

## Simple Matching

In [38]:
matcher = Matcher(bc5cdr_md.vocab)

In [39]:
pattern = [{"ENT_TYPE": "CHEMICAL"}]

In [40]:
matched = []
def add_matched(matcher, doc, i, matches):
    # Get the current match and create tuple of entity label, start and end.
    # Append entity to the doc's entity. (Don't overwrite doc.ents!)
    match_id, start, end = matches[i]
    matched.append(doc[start:end].text)

In [41]:
matcher.add("Dosage", add_matched, pattern)

In [42]:
doc = bc5cdr_md("Simvastatin 80 mg nightly. Patient took eighty mg of Simvastatin daily")

In [43]:
matches = matcher(doc)
for match_id, start, end in matches:
    string_id = bc5cdr_md.vocab.strings[match_id]  # Get string representation
    span = doc[start:end]  # The matched span
    print(string_id, start, end, span.text)

Dosage 0 1 Simvastatin
Dosage 10 11 Simvastatin


In [44]:
matched

['Simvastatin', 'Simvastatin']

## More Complex Rules-Based Matching

In [98]:
matcher = Matcher(bc5cdr_md.vocab)

In [99]:
has_method_pattern = [{"ENT_TYPE": "CHEMICAL"}, {"LIKE_NUM": True}, {"LOWER": "mg"}, {}]
matcher.add("HasMethod", None, has_method_pattern)

no_method_pattern = [{"ENT_TYPE": "CHEMICAL"}, {"LIKE_NUM": True}, {"LOWER": "mg"}]
matcher.add("NoMethod", None, no_method_pattern)

just_drug_pattern = [{"ENT_TYPE": "CHEMICAL"}]
matcher.add("JustDrug", None, just_drug_pattern)

In [100]:
doc = bc5cdr_md("Simvastatin 80 mg nightly. Hydralazine 80.5 mg. Metformin 100 mg nightly")

In [101]:
def parse_medication(span):
    if len(span) == 4:
        return {'name': span[0].text, 'amount': span[1].text,
                'unit': span[2].text, 'method': span[3].text}
    elif len(span) == 3:
        return {'name': span[0].text, 'amount': span[1].text,
                'unit': span[2].text, 'method': None}
    else:
        return {'name': span[0].text, 'amount': None,
                'unit': None, 'method': None}

In [102]:
last_start = None
last_end = None
matches = matcher(doc)
medications = []
for match_id, start, end in matches:
    string_id = bc5cdr_md.vocab.strings[match_id]
    span = doc[start:end]
    if start != last_start and last_start is not None:
        medication = parse_medication(doc[last_start:last_end])
        medications.append(medication)
    last_start = start
    last_end = end
medications.append(parse_medication(doc[start:end]))

In [103]:
medications

[{'name': 'Simvastatin', 'amount': '80', 'unit': 'mg', 'method': 'nightly'},
 {'name': 'Hydralazine', 'amount': '80.5', 'unit': 'mg', 'method': '.'},
 {'name': 'Metformin', 'amount': '100', 'unit': 'mg', 'method': 'nightly'}]