### Clean

In [1]:
import json
import re

import spacy
from datasets import load_dataset
from nltk.tokenize import sent_tokenize

# Prefix used when building the output file names written at the end of the notebook.
DATASET = "xsum"
# Base name of the input JSON files that are loaded for cleaning.
DATASET_NAME = "xsum_full"

In [2]:
import spacy
import scispacy
from scispacy.linking import EntityLinker
# Two spaCy pipelines are loaded; entities found by either one are pooled by get_entities.
ner_model_web = spacy.load("en_core_web_lg")
ner_model_sci = spacy.load("en_core_sci_lg")
# Attach the scispaCy entity linker (UMLS knowledge base) to the sci pipeline only.
# NOTE(review): "resolve_abbreviations" presumably needs an abbreviation-detector
# pipe to have any effect, and none is added here -- confirm.
ner_model_sci.add_pipe(
    "scispacy_linker",
    config={"resolve_abbreviations": True, "linker_name": "umls"},
)
# Keep a handle on the linker component: get_entities looks up linker.kb.cui_to_entity.
linker_sci = ner_model_sci.get_pipe("scispacy_linker")
# Parallel lists: linker_lst[i] is the linker paired with ner_lst[i]
# (None for the web pipeline, which has no linker).
ner_lst    = [ner_model_sci, ner_model_web]
linker_lst = [linker_sci, None]

Your CPU supports instructions that this binary was not compiled to use: SSE3 SSE4.1 SSE4.2 AVX AVX2
For maximum performance, you can install NMSLIB from sources 
pip install --no-binary :all: nmslib
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [3]:
def get_entities(input, ner_model_lst, linker_lst=None):
    """Collect the entity strings found in ``input`` by one or more NER pipelines.

    For a pipeline that has a "scispacy_linker" pipe, an entity is kept only
    if its top knowledge-base candidate has at least one semantic type listed
    in ``SEMTYPES``; entities without any candidate are dropped. For every
    other pipeline all detected entities are kept.

    Args:
        input: Text to run the pipelines on.
        ner_model_lst: A single pipeline, or a list/tuple of pipelines.
        linker_lst: The linker paired with each pipeline (same order as
            ``ner_model_lst``), or a single linker when a single pipeline is
            given. May be omitted, and individual entries may be ``None``; a
            missing linker for a "scispacy_linker" pipeline is fetched from
            the pipeline itself with ``get_pipe``.

    Returns:
        A set with the string form of every retained entity.
    """
    # Semantic type identifiers an entity must match to be kept.
    SEMTYPES = frozenset([
        "T023", "T028", "T046", "T047", "T048",
        "T059", "T060", "T061", "T074", "T109",
        "T116", "T121", "T122", "T123", "T125",
        "T129", "T184", "T191", "T195",
    ])

    output_entities = set()

    if not isinstance(ner_model_lst, (list, tuple)):
        ner_model_lst = [ner_model_lst]
        linker_lst = [linker_lst]
    elif linker_lst is None:
        # Without this, zip(models, None) below raises TypeError.
        linker_lst = [None] * len(ner_model_lst)

    for ner_model, linker in zip(ner_model_lst, linker_lst):
        entity_lst = ner_model(input).ents

        if "scispacy_linker" not in ner_model.pipe_names:
            output_entities.update(str(e) for e in entity_lst)
            continue

        if linker is None:
            # No linker was supplied for this pipeline: use its own component.
            linker = ner_model.get_pipe("scispacy_linker")

        for e in entity_lst:
            candidates = e._.kb_ents
            if len(candidates) == 0:
                continue
            umls_ent_id = candidates[0][0]  # identifier of the top-ranked candidate
            umls_ent = linker.kb.cui_to_entity[umls_ent_id]
            umls_semt = umls_ent[3]  # field 3 of the KB record: its semantic types
            if any(t in SEMTYPES for t in umls_semt):
                output_entities.add(str(e))

    return output_entities


In [4]:
def write_json(output_json, path):
    """Write ``output_json`` to ``path`` as JSON text indented by four spaces.

    The object is serialized before the file is opened, so a serialization
    error leaves any existing file at ``path`` untouched.
    """
    serialized = json.dumps(output_json, indent=4)
    with open(path, "w") as handle:
        handle.write(serialized)

def clean_label_unk(input, label, token="<unk>"):
    """Mask entities in ``label`` that do not occur in ``input``.

    Entities are extracted from the lower-cased label with the module-level
    ``ner_lst`` / ``linker_lst``. Every entity whose text is not a substring
    of the lower-cased input is replaced in the label by ``token``.

    Args:
        input: Source text the label is checked against.
        label: Text to clean.
        token: Placeholder substituted for each unsupported entity.

    Returns:
        The label with unsupported entities replaced by ``token``; all other
        text keeps its original casing.
    """
    inp = input.lower()
    entities = get_entities(label.lower(), ner_lst, linker_lst)
    bad_ents = [e for e in entities if e.lower() not in inp]

    # Longest first, so an entity contained in a longer one (e.g. "york" in
    # "new york") cannot break up the longer match; the secondary key makes
    # the result independent of set iteration order.
    for e in sorted(bad_ents, key=lambda s: (-len(s), s)):
        # The entity strings come from the lower-cased label, so the match
        # against the original-case label must ignore case; a plain
        # str.replace would miss every capitalised mention.
        # A function replacement keeps backslashes in ``token`` literal.
        label = re.sub(re.escape(e), lambda _match: token, label, flags=re.IGNORECASE)

    return label

def clean_label_drop(label, token="<unk>"):
    """Drop every sentence of ``label`` that contains ``token``.

    The label is split with ``sent_tokenize``; the surviving sentences are
    joined with single spaces. Returns ``None`` when nothing but whitespace
    is left.
    """
    kept = [sentence for sentence in sent_tokenize(label) if token not in sentence]
    joined = " ".join(kept)
    return joined if joined.strip() != "" else None

def clean_label_remv(label, token="<unk>"):
    """Return ``label`` unchanged, or ``None`` if it contains ``token``."""
    return None if token in label else label

def clean_item(item):
    """Clean all labels of one example in two ways.

    Each label in ``item["labels"]`` is first masked against ``item["input"]``
    with ``clean_label_unk``. The masked labels are then filtered twice:

    * sentence level -- ``clean_label_drop`` removes the sentences holding the
      mask token;
    * label level -- ``clean_label_remv`` discards a label entirely if it
      holds the mask token.

    Labels for which a filter returns ``None`` are left out of that result.

    Returns:
        ``(input_text, sentence_level_labels, label_level_labels)``.
    """
    source_text = item["input"]
    references = item["labels"]
    masked = [clean_label_unk(source_text, ref) for ref in references]

    # Sentence-level filtering.
    dropped = (clean_label_drop(text) for text in masked)
    label_unk_drop = [text for text in dropped if text is not None]

    # Whole-label filtering.
    removed = (clean_label_remv(text) for text in masked)
    label_unk_remv = [text for text in removed if text is not None]

    return source_text, label_unk_drop, label_unk_remv

def clean_dataset(dataset):
    """Run ``clean_item`` over ``dataset`` and build the two cleaned variants.

    Progress is printed every 100 examples. An example is added to a variant
    only if at least one of its labels survived that variant's filtering.

    Returns:
        ``(sentence_level_examples, label_level_examples)``; both are lists of
        ``{"input": ..., "labels": [...]}`` dicts.
    """
    by_sentence = []
    by_example = []

    for idx, item in enumerate(dataset):
        if not idx % 100:
            print(f"{idx}/{len(dataset)}")

        inp, kept_sentences, kept_labels = clean_item(item)

        if kept_sentences:
            by_sentence.append({"input": inp, "labels": kept_sentences})
        if kept_labels:
            by_example.append({"input": inp, "labels": kept_labels})

    return by_sentence, by_example

In [5]:
# Input files: the training data and the file that provides the test split.
train_path = f"/home/lyf6/simplification-project/data/{DATASET_NAME}.json"
test_path = f"/home/lyf6/simplification-project/data/{DATASET_NAME}_multiple.json"

dataset = load_dataset("json", data_files=train_path, field="train")

# The second load also exposes its records under the "train" key, so they are
# re-attached to `dataset` as the "test" split.
test_loaded = load_dataset("json", data_files=test_path, field="test")
dataset["test"] = test_loaded["train"]

# Only the training split is cleaned.
dataset_drop_sent_train, dataset_drop_example_train = clean_dataset(dataset["train"])

Found cached dataset json (/home/lyf6/.cache/huggingface/datasets/json/default-74c9ead1486862fa/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset json (/home/lyf6/.cache/huggingface/datasets/json/default-60557e6520441198/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

0/204045
100/204045
200/204045
300/204045
400/204045
500/204045
600/204045
700/204045
800/204045
900/204045
1000/204045
1100/204045
1200/204045
1300/204045
1400/204045
1500/204045
1600/204045
1700/204045
1800/204045
1900/204045
2000/204045
2100/204045
2200/204045
2300/204045
2400/204045
2500/204045
2600/204045
2700/204045
2800/204045
2900/204045
3000/204045
3100/204045
3200/204045
3300/204045
3400/204045
3500/204045
3600/204045
3700/204045
3800/204045
3900/204045
4000/204045
4100/204045
4200/204045
4300/204045
4400/204045
4500/204045
4600/204045
4700/204045
4800/204045
4900/204045
5000/204045
5100/204045
5200/204045
5300/204045
5400/204045
5500/204045
5600/204045
5700/204045
5800/204045
5900/204045
6000/204045
6100/204045
6200/204045
6300/204045
6400/204045
6500/204045
6600/204045
6700/204045
6800/204045
6900/204045
7000/204045
7100/204045
7200/204045
7300/204045
7400/204045
7500/204045
7600/204045
7700/204045
7800/204045
7900/204045
8000/204045
8100/204045
8200/204045
8300/204045
8400

In [None]:
# Output the files: each cleaned training variant is written twice, always
# together with the same (uncleaned) test records.
test_records = list(dataset["test"])

output_targets = [
    (dataset_drop_sent_train, f"/home/lyf6/simplification-project/data/{DATASET}_drop_sent.json"),
    (dataset_drop_sent_train, f"/home/lyf6/simplification-project/data/{DATASET}_drop_sent_multiple.json"),
    (dataset_drop_example_train, f"/home/lyf6/simplification-project/data/{DATASET}_drop_ex.json"),
    (dataset_drop_example_train, f"/home/lyf6/simplification-project/data/{DATASET}_drop_ex_multiple.json"),
]

for train_records, out_path in output_targets:
    write_json({"train": train_records, "test": test_records}, out_path)
