In [1]:
import pandas as pd
import spacy
from spacy.matcher import PhraseMatcher
from pathlib import Path
from collections import namedtuple

In [2]:
class TermItem(namedtuple("TermItem", ["code", "title", "subs"])):
    """Immutable record with three fields: ``code``, ``title`` and ``subs``.

    ``code`` and ``title`` default to the empty string. ``subs`` defaults to a
    *fresh* empty list for every instance: passing ``[]`` through
    ``namedtuple(..., defaults=...)`` would share one list object between all
    instances created without an explicit ``subs``, so appending to one would
    silently modify the others.
    """

    __slots__ = ()  # keep instances as lightweight as a plain namedtuple

    def __new__(cls, code="", title="", subs=None):
        # None is the sentinel for "not supplied"; build a new list each time.
        return super().__new__(cls, code, title, [] if subs is None else subs)

In [3]:
# Location of the morphology table, given relative to the notebook's working
# directory; it is read as a tab-separated file in the next cell.
morph_path = Path("../data/ICD-O-3_CSV-metadata/Morphenglish.txt")

In [4]:
# Read the tab-delimited morphology file into a DataFrame.
morph_df = pd.read_csv(morph_path, delimiter="\t")

In [14]:
# Preview the loaded table; the columns used further down are Code, Struct and Label.
morph_df

Unnamed: 0,Code,Struct,Label
0,8000/0,title,"Neoplasm, benign"
1,8000/0,sub,"Tumor, benign"
2,8000/0,sub,"Unclassified tumor, benign"
3,8000/1,title,"Neoplasm, uncertain whether benign or malignant"
4,8000/1,sub,"Neoplasm, NOS"
...,...,...,...
2266,9987/3,sub,"Therapy-related myelodysplastic syndrome, alky..."
2267,9987/3,sub,"Therapy-related myelodysplastic syndrome, epip..."
2268,9989/3,title,"Myelodysplastic syndrome, NOS"
2269,9989/3,sub,Preleukemia


In [5]:
# Collect every label per code (in file order) and remember which label the
# file marks as the code's title via the Struct column.
labels_by_code = {}
title_by_code = {}
rows = morph_df[["Code", "Struct", "Label"]].itertuples(index=False, name=None)
for code, struct, label in rows:
    labels_by_code.setdefault(code, []).append(label)
    if struct == "title":
        # Keep the first title row seen for a code.
        title_by_code.setdefault(code, label)

# One TermItem per distinct code, in order of first appearance. `subs` holds
# *all* labels for the code (title included) because every label is later
# registered as a match pattern. If a code has no row marked "title", fall
# back to its first label.
term_item_dict = {
    code: TermItem(code, title_by_code.get(code, labels[0]), labels)
    for code, labels in labels_by_code.items()
}
term_items = list(term_item_dict.values())

In [6]:
# Number of TermItems built above, i.e. one per distinct value in the Code column.
len(term_items)

1032

In [7]:
def gen_match_function(term_item):
    """Build a match callback bound to ``term_item``.

    The returned callable accepts four positional arguments
    ``(matcher, doc, i, matches)`` and prints ``term_item.code`` followed by
    the ``matches`` value it receives. It returns ``None``.
    """
    def report_match(matcher, doc, i, matches):
        # matcher, doc and i are accepted to satisfy the callback signature
        # but are not used; the whole `matches` argument is printed as-is.
        prefix = f"Matched: {term_item.code}"
        print(prefix, matches)

    return report_match

In [8]:
# Load the en_core_web_sm pipeline with its "ner" component disabled.
nlp = spacy.load("en_core_web_sm", disable=['ner'])

In [9]:
# Phrase matcher built on the pipeline's vocabulary. attr="LEMMA" selects the
# token attribute used for comparison (see spaCy's PhraseMatcher docs), so the
# patterns added below are produced with the same `nlp` pipeline.
matcher = PhraseMatcher(nlp.vocab, attr="LEMMA")

In [12]:
# Register every label of every code as a phrase pattern, keyed by the code,
# with a callback that reports which code matched.
for ti in term_items:
    # Parse each label exactly once and reuse the resulting Docs; running the
    # pipeline is the expensive part of this loop.
    patterns = [nlp(sub) for sub in ti.subs]
    matcher.add(ti.code, patterns, on_match=gen_match_function(ti))

In [13]:
# Sanity check: there is at least one term item and every code was registered
# as a key in the matcher (not only the first one).
assert term_items and all(ti.code in matcher for ti in term_items), \
    "term_items is empty or some codes are missing from the matcher"

In [15]:
# Smoke test: run the pipeline on a short sample string and apply the matcher.
# The on_match callbacks registered above fire during the matcher call, which
# is what produces the printed "Matched: ..." line.
doc = nlp("FINDINGS: Neoplasm, NOS")
matches = matcher(doc)

Matched: 8000/1 [(1660820348451189319, 2, 5)]
