In [1]:
import json
import spacy
import scispacy

from datasets import load_dataset
from nltk.tokenize import sent_tokenize
from scispacy.linking import EntityLinker

# Name of the source corpus; it is interpolated into the input data path below.
DATASET_NAME = "cochrane_full"

# spaCy pipeline loaded from the "en_core_web_lg" package.
ner_model_web = spacy.load("en_core_web_lg")

# Pipeline loaded from "en_core_sci_lg", extended with the scispaCy linker
# component configured to use the "umls" linker.
_umls_linker_config = {"resolve_abbreviations": True, "linker_name": "umls"}
ner_model_sci = spacy.load("en_core_sci_lg")
ner_model_sci.add_pipe("scispacy_linker", config=_umls_linker_config)
linker_sci = ner_model_sci.get_pipe("scispacy_linker")

# Parallel lists: linker_lst[i] is the linker that belongs to ner_model_lst[i]
# (None for the pipeline that has no linker component).
ner_model_lst = [ner_model_sci, ner_model_web]
linker_lst = [linker_sci, None]

Your CPU supports instructions that this binary was not compiled to use: SSE3 SSE4.1 SSE4.2 AVX AVX2
For maximum performance, you can install NMSLIB from sources 
pip install --no-binary :all: nmslib
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [2]:
def write_json(output_json, path):
    """Write ``output_json`` to ``path`` as JSON indented by four spaces.

    The object is serialized completely in memory before the file is opened,
    so a serialization error never truncates an existing file at ``path``.
    The file is opened with mode ``"w"``, replacing any previous content.

    Args:
        output_json: Any object that ``json.dumps`` can serialize.
        path: Destination file path.
    """
    serialized = json.dumps(output_json, indent=4)
    with open(path, "w") as handle:
        handle.write(serialized)

def augment_label_with_entities(text, inp_entities, sep_token="</s>"):
    """Append to ``text`` the entities it shares with the input document.

    Entities are extracted from ``text`` with ``get_entities`` (using the
    module-level ``ner_model_lst`` and ``linker_lst``). Both entity
    collections are lower-cased before being intersected, so matching is
    case-insensitive and the emitted entities are lower-case.

    Args:
        text: Label text to augment.
        inp_entities: Iterable of entity strings found in the input document.
        sep_token: Token placed between the text and the entity list.

    Returns:
        The string ``"{text} {sep_token} {entities}"`` where ``entities`` is
        the shared entities in sorted order, joined by ``" , "``. When no
        entity is shared the entity part is empty, so the result ends with
        ``sep_token`` followed by one space.
    """
    text_entities = get_entities(text, ner_model_lst, linker_lst)
    label_entities = {entity.lower() for entity in text_entities}
    common_entities = {entity.lower() for entity in inp_entities} & label_entities
    # A set of strings iterates in a hash-seed-dependent order, which made the
    # generated suffix differ between kernel restarts; sorting makes the
    # augmented dataset reproducible.
    entity_suffix = " , ".join(sorted(common_entities))
    return f"{text} {sep_token} {entity_suffix}"

# UMLS semantic type identifiers used as an allow-list: a linked entity is
# kept only when its top knowledge-base hit carries at least one of them.
_KEPT_SEMTYPES = frozenset({
    "T023", "T028", "T046", "T047", "T048",
    "T059", "T060", "T061", "T074", "T109",
    "T116", "T121", "T122", "T123", "T125",
    "T129", "T184", "T191", "T195",
})


def get_entities(input, ner_model_lst, linker_lst=None):
    """Collect the entity strings that one or more NER pipelines find in a text.

    For a pipeline whose ``pipe_names`` contains ``"scispacy_linker"``, an
    entity is kept only if it has at least one knowledge-base candidate and
    the top candidate's semantic types intersect ``_KEPT_SEMTYPES``. For any
    other pipeline every entity is kept.

    Args:
        input: Text to analyse. (The name shadows the builtin but is kept so
            existing keyword callers keep working.)
        ner_model_lst: A single pipeline, or a list/tuple of pipelines. Each
            is called on ``input`` and must expose ``.pipe_names``; the result
            must expose ``.ents``.
        linker_lst: Linker(s) parallel to ``ner_model_lst``. May be omitted,
            or contain ``None`` entries; a missing linker for a pipeline that
            has the ``"scispacy_linker"`` component is fetched from the
            pipeline itself with ``get_pipe``.

    Returns:
        A ``set`` of entity surface strings (``str(entity)``) across all
        pipelines.

    Raises:
        ValueError: If ``linker_lst`` is given but its length differs from
            ``ner_model_lst`` (previously the extra pipelines were silently
            skipped by ``zip``).
    """
    if not isinstance(ner_model_lst, (list, tuple)):
        ner_model_lst = [ner_model_lst]
        linker_lst = [linker_lst]
    elif linker_lst is None:
        # The documented default used to crash in zip(); treat it as
        # "no explicit linker for any pipeline".
        linker_lst = [None] * len(ner_model_lst)
    else:
        linker_lst = list(linker_lst)

    if len(linker_lst) != len(ner_model_lst):
        raise ValueError(
            f"linker_lst has {len(linker_lst)} entries but ner_model_lst has "
            f"{len(ner_model_lst)}; they must be parallel lists"
        )

    output_entities = set()

    for ner_model, linker in zip(ner_model_lst, linker_lst):
        entity_lst = ner_model(input).ents

        if "scispacy_linker" not in ner_model.pipe_names:
            output_entities.update(str(entity) for entity in entity_lst)
            continue

        if linker is None:
            linker = ner_model.get_pipe("scispacy_linker")
        cui_to_entity = linker.kb.cui_to_entity

        for entity in entity_lst:
            candidates = entity._.kb_ents
            if len(candidates) == 0:
                continue  # entity could not be linked to the knowledge base
            top_cui, _score = candidates[0]  # top hit only
            # Index 3 of the knowledge-base record is read as the semantic
            # type codes (presumably the `types` field of scispaCy's entity
            # tuple - verify against the installed scispaCy version).
            semantic_types = cui_to_entity[top_cui][3]
            if any(code in _KEPT_SEMTYPES for code in semantic_types):
                output_entities.add(str(entity))

    return output_entities

def clean_dataset(data):
    """Augment every label of every record with entities shared with its input.

    For each record the entities of ``record["input"]`` are extracted once
    with ``get_entities`` (module-level ``ner_model_lst`` / ``linker_lst``),
    then each string in ``record["labels"]`` is passed through
    ``augment_label_with_entities`` together with those entities.

    Args:
        data: Iterable of mappings with an ``"input"`` entry and a
            ``"labels"`` entry (an iterable of label strings).

    Returns:
        A pair ``(inputs, augmented_labels)`` of equally long lists:
        the untouched input texts and, per record, the list of augmented
        labels.
    """
    inputs = []
    augmented_labels = []
    for record in data:
        source_text, labels = record["input"], record["labels"]
        source_entities = get_entities(source_text, ner_model_lst, linker_lst)
        augmented = []
        for label in labels:
            augmented.append(augment_label_with_entities(label, source_entities))
        inputs.append(source_text)
        augmented_labels.append(augmented)
    return inputs, augmented_labels



In [3]:
# Both splits are read from the same JSON file, from its "train" and "test" fields.
_data_file = f"../data/{DATASET_NAME}_multiple.json"

dataset = load_dataset("json", data_files=_data_file, field="train")
# The second load also exposes its records under the "train" key, so that key
# is read back and the records are attached here as the "test" split.
dataset["test"] = load_dataset("json", data_files=_data_file, field="test")["train"]

inp_lst_train, lab_w_ents_train = clean_dataset(dataset["train"])
inp_lst_test, lab_w_ents_test = clean_dataset(dataset["test"])

# Re-assemble one {"input", "labels"} record per example for serialization.
train_w_ents_json = [
    {"input": text, "labels": labels}
    for text, labels in zip(inp_lst_train, lab_w_ents_train)
]
test_w_ents_json = [
    {"input": text, "labels": labels}
    for text, labels in zip(inp_lst_test, lab_w_ents_test)
]

Found cached dataset json (/home/lyf6/.cache/huggingface/datasets/json/default-9ba730f179d68666/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset json (/home/lyf6/.cache/huggingface/datasets/json/default-9a0418f804e277a7/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
# Output the file
# NOTE(review): both paths receive exactly the same payload - confirm that the
# file without the "_multiple" suffix is really meant to be a duplicate.
_augmented_splits = {"train": train_w_ents_json, "test": test_w_ents_json}
for _out_path in (
    "../data/cochrane_aug_ents.json",
    "../data/cochrane_aug_ents_multiple.json",
):
    write_json(_augmented_splits, _out_path)


In [5]:
# Preview only the first few augmented records; displaying the whole list
# floods the notebook output with the entire training set.
train_w_ents_json[:3]

[{'input': 'Two trials met the inclusion criteria. One compared 2% ketanserin ointment in polyethylene glycol (PEG) with PEG alone, used twice a day by 40 participants with arterial leg ulcers, for eight weeks or until healing, whichever was sooner. One compared topical application of blood-derived concentrated growth factor (CGF) with standard dressing (polyurethane film or foam); both applied weekly for six weeks by 61 participants with non-healing ulcers (venous, diabetic arterial, neuropathic, traumatic, or vasculitic). Both trials were small, reported results inadequately, and were of low methodological quality. Short follow-up times (six and eight weeks) meant it would be difficult to capture sufficient healing events to allow us to make comparisons between treatments. One trial demonstrated accelerated wound healing in the ketanserin group compared with the control group. In the trial that compared CGF with standard dressings, the number of participants with diabetic arterial ul