In [54]:
import csv

import diskcache as dc
import jsonlines
import requests
from deepl import Translator
from tqdm import tqdm

from data_loading import get_annotation_ids
from models import SampleCollection, ProdigySample, ProdigyNERLabel
from src.config import TLCPaths
from src.data_loading import annotations as tlc_annotations, load_tlc_samples
from src.models import Sample

In [55]:
import os

# API key sent as the `apiKey` query parameter on the UMLS REST requests below.
# SECURITY: credentials should not live in the notebook. The UMLS_API_KEY
# environment variable takes precedence; the literal is kept only so existing
# runs keep working and should be removed (and the key rotated) once the
# environment variable is set everywhere.
UMLS_KEY = os.environ.get("UMLS_API_KEY", "43f9234c-4977-45f6-a440-2dda1b43d919")

In [57]:
# Get rejected annotations, annotate them and create a new dataset.

# Build the sample collection from whatever load_tlc_samples() returns.
sample_collection = SampleCollection(load_tlc_samples())

# Parse every entry found in the TLC JSON directory into a Sample.
tlc_samples = [Sample.parse_file(path) for path in TLCPaths.json_dir.iterdir()]


def get_sample_for_annotation_id(annotation_id):
    """Return the first sample in ``tlc_samples`` owning the given annotation id.

    Args:
        annotation_id: Id compared against ``ann.id`` of each sample annotation.

    Returns:
        The first matching sample, or ``None`` when no sample has an
        annotation with that id.
    """
    for candidate in tlc_samples:
        if any(ann.id == annotation_id for ann in candidate.annotations):
            return candidate
    return None


def get_tlc_annotation(annotation_id, tlc_annotations):
    """Return the first annotation whose ``id`` equals ``annotation_id``.

    Args:
        annotation_id: Id to look for.
        tlc_annotations: Iterable of objects exposing an ``id`` attribute.

    Raises:
        StopIteration: If no annotation carries the requested id.
    """
    for candidate in tlc_annotations:
        if candidate.id == annotation_id:
            return candidate
    raise StopIteration


# Collect the first column of every row of the German UMLS CSV into a set
# (used later as the CUI whitelist). Every row must have exactly two fields.
german_umls_cuis = set()
cui_csv_path = TLCPaths.project_data_path.joinpath("german_umls_names_and_cuis.csv")
with open(cui_csv_path, newline='') as csvfile:
    for row in csv.reader(csvfile, delimiter=' ', quotechar='|'):
        assert len(row) == 2
        german_umls_cuis.add(row[0])

# Read the reviewed samples and split them by their "answer" field.
validated_path = TLCPaths.project_data_path.joinpath("TLC-UMLS-v4-reviewed-validated.jsonl")
with jsonlines.open(validated_path) as reader:
    validated_samples = list(reader)
accepted_samples = [entry for entry in validated_samples if entry["answer"] == "accept"]
rejected_samples = [entry for entry in validated_samples if entry["answer"] == "reject"]

In [58]:
# load manual annotations
import csv

# Each kept entry pairs a TLC annotation with the manually assigned CUI.
manual_annotations = []
manual_csv_path = TLCPaths.project_data_path.joinpath(
    "rejected_samples_for_manual_annotation_copy.csv")
with open(manual_csv_path, "r", newline="\n") as f:
    for row in csv.reader(f, delimiter='\t', quotechar='|', quoting=csv.QUOTE_MINIMAL):
        # Column 1 looks like "[id, id, ...]"; only the first id is used.
        annotation_id = int(row[1].strip("[]").partition(",")[0])
        annotation = get_tlc_annotation(annotation_id, tlc_annotations)
        # The last column holds the CUI; rows with an empty or unknown CUI are dropped.
        cui = row[-1]
        if cui and cui in german_umls_cuis:
            manual_annotations.append({"annotation": annotation, "cui": cui})

In [59]:
# Show the first kept manual annotation together with the number of kept entries.
manual_annotations[0], len(manual_annotations)

({'annotation': Annotation(tech_term=None, lay_term='Magen-Darm-Trakt', type=<TermType.LAY: 'LAY'>, span_start=257, span_end=273, synonyms=['GIT', 'GIT'], id=29),
  'cui': 'C0012240'},
 54)

# add descriptions

In [60]:
# diskcache cache rooted at caches/umls_cache; its memoize() decorator wraps the
# UMLS lookup functions defined further down.
umls_cache = dc.Cache("caches/umls_cache")


class CachedTranslator:
    """Translator wrapper that stores every translation result in a cache.

    Results are keyed by the ``(text, source_lang, target_lang)`` tuple, so a
    repeated request is answered from ``cache`` without calling the translator.
    """

    def __init__(self, cache, api_key):
        """Keep the mapping-like ``cache`` and build a ``Translator`` from ``api_key``."""
        self.cache = cache
        self.translator = Translator(api_key)

    def translate_text(self, text, source_lang, target_lang):
        """Return the translation result for ``text``, consulting the cache first.

        On a miss the wrapped translator is called, its result is stored under
        the request key, and that same result object is returned.
        """
        key = (text, source_lang, target_lang)
        if key not in self.cache:
            fresh = self.translator.translate_text(text, source_lang=source_lang,
                                                   target_lang=target_lang)
            self.cache[key] = fresh
            return fresh
        return self.cache[key]


import os

# diskcache cache rooted at caches/deepl_cache, used to store DeepL results.
deepl_cache = dc.Cache("caches/deepl_cache")
# SECURITY: credentials should not live in the notebook. The DEEPL_AUTH_KEY
# environment variable takes precedence; the literal is kept only so existing
# runs keep working and should be removed (and the key rotated) once the
# environment variable is set everywhere.
deepl_key = os.environ.get("DEEPL_AUTH_KEY", "0c25ea6d-b79f-288a-541f-ae25709c6312:fx")
translator = CachedTranslator(deepl_cache, deepl_key)


@umls_cache.memoize()
def get_cui_name(cui):
    """Look up a UMLS concept name and return it translated from EN to DE.

    Args:
        cui: Concept identifier inserted into the UMLS REST URL.

    Returns:
        The translated name, or ``None`` (after printing a notice) when the
        response JSON has no ``"result"`` key.

    NOTE(review): the function is wrapped in ``umls_cache.memoize()``, so a
    ``None`` caused by a transient API failure is presumably cached as well;
    confirm and evict such entries if needed.
    """
    query_url = f"https://uts-ws.nlm.nih.gov/rest/content/current/CUI/{cui}?apiKey={UMLS_KEY}"
    # A timeout keeps a stalled connection from hanging the notebook forever.
    # The body is parsed once and reused.
    payload = requests.get(query_url, timeout=30).json()
    if "result" not in payload:
        print("returning none for ", cui)
        return None
    eng_name = payload["result"]["name"]
    return translator.translate_text(text=eng_name, source_lang="EN", target_lang="DE").text


@umls_cache.memoize()
def get_description(cui):
    """Build a German "<name>: <definition>" description for a UMLS concept.

    Definition choice, in order:
      1. the first definition whose ``rootSource`` is in ``ger_sabs`` (used as is);
      2. the first definition whose ``rootSource`` is in ``eng_sabs`` (translated EN -> DE);
      3. the first definition of any source, translated to DE with
         ``source_lang=None`` (presumably source-language auto-detection).

    Args:
        cui: Concept identifier inserted into the UMLS REST URL.

    Returns:
        The description string. When no definition is available the bare name
        from ``get_cui_name`` is returned, which may be ``None``. When the name
        lookup failed but a definition exists, the definition is returned
        without a name prefix.
    """
    name = get_cui_name(cui=cui)
    # Avoid emitting the literal text "None: ..." when the name lookup failed.
    prefix = f"{name}: " if name is not None else ""

    ger_sabs = ["DMDICD10", "DMDUMD", "WHOGER", "ICPCGER", "LNC-DE-AT", "LNC-DE-DE", "MDRGER",
                "MSHGER"]
    eng_sabs = ["HPO", "MDR", "MSH", "SNOMEDCT_US"]

    query_url = f"https://uts-ws.nlm.nih.gov/rest/content/current/CUI/{cui}/definitions?apiKey={UMLS_KEY}"
    # A timeout keeps a stalled connection from hanging the notebook forever.
    # The body is parsed once and reused.
    payload = requests.get(query_url, timeout=30).json()
    definitions = payload["result"] if "result" in payload else None
    if not definitions:
        # Missing key or empty list: nothing to describe beyond the name.
        return name

    for definition in definitions:
        if definition["rootSource"] in ger_sabs:
            return prefix + definition["value"]
    for definition in definitions:
        if definition["rootSource"] in eng_sabs:
            return prefix + translator.translate_text(text=definition["value"],
                                                      source_lang="EN",
                                                      target_lang="DE").text
    return prefix + translator.translate_text(text=definitions[0]["value"],
                                              source_lang=None,
                                              target_lang="DE").text


In [61]:
# Alias, not a copy: both names refer to the same list, so entries changed via
# all_annotations are changed in manual_annotations too.
all_annotations = manual_annotations

In [62]:
# Attach a "description" entry to every annotation dict, looked up by its CUI.
for annotation in tqdm(all_annotations):
    annotation.update(description=get_description(cui=annotation["cui"]))

100%|██████████| 54/54 [00:00<00:00, 4028.61it/s]


In [63]:
# Print every annotation for which no description could be obtained.
for annotation in tqdm(all_annotations):
    if annotation["description"] is not None:
        continue
    print(annotation)

100%|██████████| 54/54 [00:00<00:00, 225814.97it/s]


In [64]:
# Split the annotations into those without and those with a description,
# then report how many are missing one.
no_desc_ann, all_annotations_with_desc = [], []
for ann in all_annotations:
    if ann["description"] is None:
        no_desc_ann.append(ann)
    else:
        all_annotations_with_desc.append(ann)
print(len(no_desc_ann))

0


In [None]:
# Turn every described annotation into a ProdigySample: the source text, the
# highlighted mention span, an HTML snippet ("<b>CUI</b>: description") and a
# link to the concept page in the UMLS browser.
prodigy_samples = []
for entry in tqdm(all_annotations_with_desc):
    annotation, cui, description = entry["annotation"], entry["cui"], entry["description"]
    mention_span = ProdigyNERLabel(start=annotation.span_start, end=annotation.span_end)
    source_text = sample_collection.get_sample_by_annotation_id(annotation.id).text
    # <a href="url">link text</a>

    html = f'<b>{cui}</b>: ' + description
    annotations_ids = get_annotation_ids(annotation.get_mention())
    concept_url = f"https://uts.nlm.nih.gov/uts/umls/concept/{cui}"
    prodigy_samples.append(
        ProdigySample(
            text=source_text,
            spans=[mention_span],
            id=annotation.id,
            cui=cui,
            html=html,
            annotation_ids=annotations_ids,
            meta={"url": concept_url},
        )
    )
#%%
import csv

import diskcache as dc
import jsonlines
import requests
from deepl import Translator
from tqdm import tqdm

from data_loading import get_annotation_ids
from models import SampleCollection, ProdigySample, ProdigyNERLabel
from src.config import TLCPaths
from src.data_loading import annotations as tlc_annotations, load_tlc_samples
from src.models import Sample

In [None]:
import os

# API key sent as the `apiKey` query parameter on the UMLS REST requests below.
# SECURITY: credentials should not live in the notebook. The UMLS_API_KEY
# environment variable takes precedence; the literal is kept only so existing
# runs keep working and should be removed (and the key rotated) once the
# environment variable is set everywhere.
UMLS_KEY = os.environ.get("UMLS_API_KEY", "43f9234c-4977-45f6-a440-2dda1b43d919")

In [None]:
# Get rejected annotations, annotate them and create a new dataset.

# Build the sample collection from whatever load_tlc_samples() returns.
sample_collection = SampleCollection(load_tlc_samples())

# Parse every entry found in the TLC JSON directory into a Sample.
tlc_samples = [Sample.parse_file(path) for path in TLCPaths.json_dir.iterdir()]


def get_sample_for_annotation_id(annotation_id):
    """Return the first sample in ``tlc_samples`` owning the given annotation id.

    Args:
        annotation_id: Id compared against ``ann.id`` of each sample annotation.

    Returns:
        The first matching sample, or ``None`` when no sample has an
        annotation with that id.
    """
    for candidate in tlc_samples:
        if any(ann.id == annotation_id for ann in candidate.annotations):
            return candidate
    return None


def get_tlc_annotation(annotation_id, tlc_annotations):
    """Return the first annotation whose ``id`` equals ``annotation_id``.

    Args:
        annotation_id: Id to look for.
        tlc_annotations: Iterable of objects exposing an ``id`` attribute.

    Raises:
        StopIteration: If no annotation carries the requested id.
    """
    for candidate in tlc_annotations:
        if candidate.id == annotation_id:
            return candidate
    raise StopIteration


# Collect the first column of every row of the German UMLS CSV into a set
# (used later as the CUI whitelist). Every row must have exactly two fields.
german_umls_cuis = set()
cui_csv_path = TLCPaths.project_data_path.joinpath("german_umls_names_and_cuis.csv")
with open(cui_csv_path, newline='') as csvfile:
    for row in csv.reader(csvfile, delimiter=' ', quotechar='|'):
        assert len(row) == 2
        german_umls_cuis.add(row[0])

# Read the reviewed samples and split them by their "answer" field.
validated_path = TLCPaths.project_data_path.joinpath("TLC-UMLS-v4-reviewed-validated.jsonl")
with jsonlines.open(validated_path) as reader:
    validated_samples = list(reader)
accepted_samples = [entry for entry in validated_samples if entry["answer"] == "accept"]
rejected_samples = [entry for entry in validated_samples if entry["answer"] == "reject"]

In [None]:
# load manual annotations
import csv

# Each kept entry pairs a TLC annotation with the manually assigned CUI.
manual_annotations = []
manual_csv_path = TLCPaths.project_data_path.joinpath(
    "rejected_samples_for_manual_annotation_copy.csv")
with open(manual_csv_path, "r", newline="\n") as f:
    for row in csv.reader(f, delimiter='\t', quotechar='|', quoting=csv.QUOTE_MINIMAL):
        # Column 1 looks like "[id, id, ...]"; only the first id is used.
        annotation_id = int(row[1].strip("[]").partition(",")[0])
        annotation = get_tlc_annotation(annotation_id, tlc_annotations)
        # The last column holds the CUI; rows with an empty or unknown CUI are dropped.
        cui = row[-1]
        if cui and cui in german_umls_cuis:
            manual_annotations.append({"annotation": annotation, "cui": cui})

In [None]:
# Show the first kept manual annotation together with the number of kept entries.
manual_annotations[0], len(manual_annotations)

# add descriptions

In [None]:
# diskcache cache rooted at caches/umls_cache; its memoize() decorator wraps the
# UMLS lookup functions defined further down.
umls_cache = dc.Cache("caches/umls_cache")


class CachedTranslator:
    """Translator wrapper that stores every translation result in a cache.

    Results are keyed by the ``(text, source_lang, target_lang)`` tuple, so a
    repeated request is answered from ``cache`` without calling the translator.
    """

    def __init__(self, cache, api_key):
        """Keep the mapping-like ``cache`` and build a ``Translator`` from ``api_key``."""
        self.cache = cache
        self.translator = Translator(api_key)

    def translate_text(self, text, source_lang, target_lang):
        """Return the translation result for ``text``, consulting the cache first.

        On a miss the wrapped translator is called, its result is stored under
        the request key, and that same result object is returned.
        """
        key = (text, source_lang, target_lang)
        if key not in self.cache:
            fresh = self.translator.translate_text(text, source_lang=source_lang,
                                                   target_lang=target_lang)
            self.cache[key] = fresh
            return fresh
        return self.cache[key]


import os

# diskcache cache rooted at caches/deepl_cache, used to store DeepL results.
deepl_cache = dc.Cache("caches/deepl_cache")
# SECURITY: credentials should not live in the notebook. The DEEPL_AUTH_KEY
# environment variable takes precedence; the literal is kept only so existing
# runs keep working and should be removed (and the key rotated) once the
# environment variable is set everywhere.
deepl_key = os.environ.get("DEEPL_AUTH_KEY", "0c25ea6d-b79f-288a-541f-ae25709c6312:fx")
translator = CachedTranslator(deepl_cache, deepl_key)


@umls_cache.memoize()
def get_cui_name(cui):
    """Look up a UMLS concept name and return it translated from EN to DE.

    Args:
        cui: Concept identifier inserted into the UMLS REST URL.

    Returns:
        The translated name, or ``None`` (after printing a notice) when the
        response JSON has no ``"result"`` key.

    NOTE(review): the function is wrapped in ``umls_cache.memoize()``, so a
    ``None`` caused by a transient API failure is presumably cached as well;
    confirm and evict such entries if needed.
    """
    query_url = f"https://uts-ws.nlm.nih.gov/rest/content/current/CUI/{cui}?apiKey={UMLS_KEY}"
    # A timeout keeps a stalled connection from hanging the notebook forever.
    # The body is parsed once and reused.
    payload = requests.get(query_url, timeout=30).json()
    if "result" not in payload:
        print("returning none for ", cui)
        return None
    eng_name = payload["result"]["name"]
    return translator.translate_text(text=eng_name, source_lang="EN", target_lang="DE").text


@umls_cache.memoize()
def get_description(cui):
    """Build a German "<name>: <definition>" description for a UMLS concept.

    Definition choice, in order:
      1. the first definition whose ``rootSource`` is in ``ger_sabs`` (used as is);
      2. the first definition whose ``rootSource`` is in ``eng_sabs`` (translated EN -> DE);
      3. the first definition of any source, translated to DE with
         ``source_lang=None`` (presumably source-language auto-detection).

    Args:
        cui: Concept identifier inserted into the UMLS REST URL.

    Returns:
        The description string. When no definition is available the bare name
        from ``get_cui_name`` is returned, which may be ``None``. When the name
        lookup failed but a definition exists, the definition is returned
        without a name prefix.
    """
    name = get_cui_name(cui=cui)
    # Avoid emitting the literal text "None: ..." when the name lookup failed.
    prefix = f"{name}: " if name is not None else ""

    ger_sabs = ["DMDICD10", "DMDUMD", "WHOGER", "ICPCGER", "LNC-DE-AT", "LNC-DE-DE", "MDRGER",
                "MSHGER"]
    eng_sabs = ["HPO", "MDR", "MSH", "SNOMEDCT_US"]

    query_url = f"https://uts-ws.nlm.nih.gov/rest/content/current/CUI/{cui}/definitions?apiKey={UMLS_KEY}"
    # A timeout keeps a stalled connection from hanging the notebook forever.
    # The body is parsed once and reused.
    payload = requests.get(query_url, timeout=30).json()
    definitions = payload["result"] if "result" in payload else None
    if not definitions:
        # Missing key or empty list: nothing to describe beyond the name.
        return name

    for definition in definitions:
        if definition["rootSource"] in ger_sabs:
            return prefix + definition["value"]
    for definition in definitions:
        if definition["rootSource"] in eng_sabs:
            return prefix + translator.translate_text(text=definition["value"],
                                                      source_lang="EN",
                                                      target_lang="DE").text
    return prefix + translator.translate_text(text=definitions[0]["value"],
                                              source_lang=None,
                                              target_lang="DE").text


In [None]:
# Alias, not a copy: both names refer to the same list, so entries changed via
# all_annotations are changed in manual_annotations too.
all_annotations = manual_annotations

In [None]:
# Attach a "description" entry to every annotation dict, looked up by its CUI.
for annotation in tqdm(all_annotations):
    annotation.update(description=get_description(cui=annotation["cui"]))

In [None]:
# Print every annotation for which no description could be obtained.
for annotation in tqdm(all_annotations):
    if annotation["description"] is not None:
        continue
    print(annotation)

In [None]:
# Split the annotations into those without and those with a description,
# then report how many are missing one.
no_desc_ann, all_annotations_with_desc = [], []
for ann in all_annotations:
    if ann["description"] is None:
        no_desc_ann.append(ann)
    else:
        all_annotations_with_desc.append(ann)
print(len(no_desc_ann))

In [None]:
# Turn every described annotation into a ProdigySample: the source text, the
# highlighted mention span, an HTML snippet ("<b>CUI</b>: description") and a
# link to the concept page in the UMLS browser.
prodigy_samples = []
for entry in tqdm(all_annotations_with_desc):
    annotation, cui, description = entry["annotation"], entry["cui"], entry["description"]
    mention_span = ProdigyNERLabel(start=annotation.span_start, end=annotation.span_end)
    source_text = sample_collection.get_sample_by_annotation_id(annotation.id).text
    # <a href="url">link text</a>

    html = f'<b>{cui}</b>: ' + description
    annotations_ids = get_annotation_ids(annotation.get_mention())
    concept_url = f"https://uts.nlm.nih.gov/uts/umls/concept/{cui}"
    prodigy_samples.append(
        ProdigySample(
            text=source_text,
            spans=[mention_span],
            id=annotation.id,
            cui=cui,
            html=html,
            annotation_ids=annotations_ids,
            meta={"url": concept_url},
        )
    )
    

In [None]:
# Write the Prodigy samples as JSON Lines (one JSON object per line).
# The encoding is pinned to UTF-8 because ensure_ascii=False emits raw
# non-ASCII characters, which the platform default encoding may not be able
# to represent.
with open(TLCPaths.project_data_path.joinpath("prodigy_samples_round_2_v1.jsonl"), "w",
          encoding="utf-8") as fp:
    # Keep the loop variable name: a later cell inspects `annotation` after this loop.
    for annotation in prodigy_samples:
        fp.write(annotation.json(ensure_ascii=False) + "\n")

In [None]:
# Inspect the JSON of one sample. `annotation` is the loop variable left over
# from an earlier cell (the last prodigy sample if the write loop ran last).
annotation.json()

In [67]:
# Write the Prodigy samples as JSON Lines (one JSON object per line).
# The encoding is pinned to UTF-8 because ensure_ascii=False emits raw
# non-ASCII characters, which the platform default encoding may not be able
# to represent.
with open(TLCPaths.project_data_path.joinpath("prodigy_samples_round_2_v1.jsonl"), "w",
          encoding="utf-8") as fp:
    # Keep the loop variable name: a later cell inspects `annotation` after this loop.
    for annotation in prodigy_samples:
        fp.write(annotation.json(ensure_ascii=False) + "\n")

In [68]:
# Inspect the JSON of one sample. `annotation` is the loop variable left over
# from an earlier cell (the last prodigy sample if the write loop ran last).
annotation.json()

'{"text": "Thread: [Frage zum GFR-Wert]\\nText: [whoopi63, hast du da noch mal schauen lassen. Mein Wert ist jetzt von damals 86 auf 82 gesunken. Aber die \\u00c4rzte haben dazu nix gesagt, nur dass bis auf B12 und Eisen alles ok w\\u00e4re. Bei dir noch mal gemacht worden?]\\n", "spans": [{"start": 188, "end": 191, "label": "Mention"}], "html": "<b>C0042845</b>: Vitamin B12: Eine kobalthaltige Koordinationsverbindung, die von Mikroorganismen im Darm gebildet wird und auch im Boden und im Wasser vorkommt. H\\u00f6here Pflanzen konzentrieren Vitamin B 12 nicht aus dem Boden und sind daher im Vergleich zu tierischen Geweben eine schlechte Quelle f\\u00fcr diese Substanz. Der INTRINSIC FACTOR ist wichtig f\\u00fcr die Assimilation von Vitamin B 12.", "annotation_ids": [2234], "cui": "C0042845", "meta": {"url": "https://uts.nlm.nih.gov/uts/umls/concept/C0042845"}}'