In [116]:
import csv
import json
import os
from pprint import pprint

import diskcache as dc
import jsonlines
import requests
from deepl import Translator
from pydantic import BaseModel
from tqdm import tqdm

from config import TLCPaths
from data_loading import load_tlc_samples, get_annotation_ids
from models import Match, SampleCollection, Annotation, ProdigySample, ProdigyNERLabel


In [117]:
# Load the TLC samples and wrap them in a SampleCollection; it is used further down to
# look up a sample's text by annotation id.
sample_collection = SampleCollection(load_tlc_samples())

In [118]:
def _load_matches(path):
    """Read a JSON file holding a list of serialized matches and parse each one into a Match."""
    with open(path, "r") as handle:
        return [Match.parse_raw(raw_match) for raw_match in json.load(handle)]


annotations_file_path = TLCPaths.project_data_path.joinpath("samples_to_annotate.json")
validations_file_path = TLCPaths.project_data_path.joinpath("samples_to_validate.json")

# Both files share the same layout, so they go through the same loader.
annotation_samples = _load_matches(annotations_file_path)
validation_samples = _load_matches(validations_file_path)

print(f"{len(annotation_samples)=}")
print(f"{len(validation_samples)=}")


len(annotation_samples)=2146
len(validation_samples)=5244


In [119]:
# Keep only the first validation sample seen for each distinct mention string.
first_sample_by_mention = {}
for sample in validation_samples:
    first_sample_by_mention.setdefault(sample.mention.annotation.get_mention(), sample)
unique_val_annotations = list(first_sample_by_mention.values())
print(len(unique_val_annotations))

1042


In [120]:
# Keep the annotation of the first sample seen for each distinct mention string.
first_annotation_by_mention = {}
for sample in annotation_samples:
    candidate = sample.mention.annotation
    first_annotation_by_mention.setdefault(candidate.get_mention(), candidate)
unique_annotations = list(first_annotation_by_mention.values())
print(len(unique_annotations))

691


In [5]:
# Inspect one annotation to see which fields are available.
pprint(unique_annotations[0])

Annotation(tech_term='Nierenbeckenentzündung', lay_term=None, type=<TermType.TECH: 'TECH'>, span_start=30, span_end=52, synonyms=['Pyelonephritis'], id=3633)


In [6]:
# List every technical-term annotation next to the synonyms recorded for it.
tech_annotations = (candidate for candidate in unique_annotations if candidate.type == "TECH")
for tech_annotation in tech_annotations:
    print(tech_annotation.get_mention(), tech_annotation.synonyms)

Nierenbeckenentzündung ['Pyelonephritis']
Gallensteine ['Cholelith']
nierenbeckenentzündung ['Pyelonephritis']
Nierenkolik ['wehenartige Nierenschmerzen']
nierenbeckenentzündungen ['Pyelonephritis']
Tenesmus ['schmerzhafter Stuhldrang']
Darmkoliken ['wehenartige Darmschmerzen']
Diabetes insipidus ['Wasserharnruhr']
Dialyse ['Nierenersatzverfahren']
Defiblirator ['Schockgeber']
Mikrohämaturie ['etwas Blut im Urin']
Nierenkoliken ['wehenartige Nierenschmerzen']
Gallenkoliken ['wehenartige Gallenschmerzen']
Nierenbeckenntzündung ['Pyelonephritis']
Erythrozyten ['Rote Blutkörperchen']
Nierenbecken ['pyelon  oder pelvis renalis']
Hyperphosphatämie ['Phosphatüberschuss']
Serumkreatinin ['Serumkrea']
Serumkreatintinwert ['Serumkrea']
Blutdruck ['RR']
Proteinurie ['Laienbegriff: Eiweiß im Urin']
krampflösende Mittel ['Spasmolytika']
entzündliche oder nichtentzündliche glomeruläre Erkrankung ['Glomerulonephritis']
Aneurysmen ['Gefäßaussackungen']
Adrenal Fatigue ['Nebennierenschwäche']
Prolaps 

In [7]:
# Plan:
# 1. Translate each mention and its synonym to English.
# 2. Use the mention and the synonym to look up a CUI. If both yield one, take the
#    mention's CUI for a technical term and the synonym's CUI for a lay term.
# 3. Query UMLS for a German description. If none is available, take the English one
#    and translate it back to German.

In [8]:
class TranslatedTerm(BaseModel):
    """A translated term together with the UMLS CUI looked up for it."""

    name: str
    # Empty string until a CUI has been looked up, or when the lookup found nothing.
    cui: str = ""


class TranslatedAnnotation(BaseModel):
    """An annotation paired with the translations of its mention and of its first synonym."""

    annotation: Annotation
    translated_mention: TranslatedTerm
    # Its name is "" when the annotation has no synonyms.
    translated_synonym: TranslatedTerm


class CachedTranslator:
    """DeepL translator wrapper that remembers earlier results in a mapping-like cache."""

    def __init__(self, cache, api_key):
        """Store the cache and create the underlying DeepL Translator from api_key."""
        self.cache = cache
        self.translator = Translator(api_key)

    def translate_text(self, text, source_lang, target_lang):
        """Return the translation of text, calling the translator only on a cache miss.

        The cache key is the (text, source_lang, target_lang) tuple; whatever the
        translator returns is stored under that key and handed back unchanged.
        """
        cache_key = (text, source_lang, target_lang)
        if cache_key in self.cache:
            return self.cache[cache_key]

        fresh_result = self.translator.translate_text(
            text, source_lang=source_lang, target_lang=target_lang
        )
        self.cache[cache_key] = fresh_result
        return fresh_result


# Cache of DeepL results, so re-running the notebook does not re-send identical requests.
deepl_cache = dc.Cache("caches/deepl_cache")
# The DeepL key is read from the environment instead of being stored in the notebook.
# A key that was ever committed in plain text should be treated as leaked and rotated.
# Raises KeyError if DEEPL_AUTH_KEY is not set.
deepl_key = os.environ["DEEPL_AUTH_KEY"]
translator = CachedTranslator(deepl_cache, deepl_key)

In [9]:
# Quick check that the translator works: German -> US English for a single word.
res = translator.translate_text("Krankenhausmauer", source_lang="DE", target_lang="EN-US")
res.text

'Hospital Wall'

In [10]:
def _de_to_en(text):
    """Translate the lower-cased German text to US English and return the plain string."""
    return translator.translate_text(text.lower(), source_lang="DE", target_lang="EN-US").text


translated_annotations = []
for annotation in tqdm(unique_annotations):
    mention_en = _de_to_en(annotation.get_mention())
    # Only the first synonym is translated; without synonyms the name stays empty.
    synonym_en = _de_to_en(annotation.synonyms[0]) if len(annotation.synonyms) > 0 else ""
    translated_annotations.append(
        TranslatedAnnotation(
            annotation=annotation,
            translated_mention=TranslatedTerm(name=mention_en),
            translated_synonym=TranslatedTerm(name=synonym_en),
        )
    )

100%|██████████| 691/691 [00:00<00:00, 4973.44it/s]


In [11]:
# Spot-check the first translated annotation (CUIs are still empty at this point).
translated_annotations[0]

TranslatedAnnotation(annotation=Annotation(tech_term='Nierenbeckenentzündung', lay_term=None, type=<TermType.TECH: 'TECH'>, span_start=30, span_end=52, synonyms=['Pyelonephritis'], id=3633), translated_mention=TranslatedTerm(name='renal pelvic inflammation', cui=''), translated_synonym=TranslatedTerm(name='pyelonephritis', cui=''))

In [12]:
# The UMLS API key is read from the environment instead of being stored in the notebook.
# A key that was ever committed in plain text should be treated as leaked and rotated.
# Raises KeyError if UMLS_API_KEY is not set.
UMLS_KEY = os.environ["UMLS_API_KEY"]

# Cache used to memoize the UMLS lookups below.
umls_cache = dc.Cache("caches/umls_cache")


@umls_cache.memoize()
def get_cui(name):
    """Look up the first UMLS search hit for a term and return its identifier.

    Args:
        name: Term to search for (searchType "normalizedString").

    Returns:
        The "ui" of the first search result, or "" when name is empty, the response
        carries no "result" key, or the result list is empty.

    NOTE(review): the return value is memoized, so a "" produced by a transient API
    error would also be remembered — confirm whether that is acceptable.
    """
    if not name:
        # Annotations without a synonym carry an empty name; there is nothing to search for.
        return ""

    response = requests.get(
        "https://uts-ws.nlm.nih.gov/rest/search/current",
        # Let requests URL-encode the values: terms may contain characters such as
        # "&", "#" or "/" that would otherwise corrupt a hand-built query string.
        params={"apiKey": UMLS_KEY, "string": name, "searchType": "normalizedString"},
    )
    payload = response.json()  # parse the body once and reuse it
    if "result" not in payload:
        print("name: ", name, payload)
        return ""

    results = payload["result"]["results"]
    return results[0]["ui"] if results else ""


# Quick check of the lookup with a known term.
cui = get_cui(name="Pyelonephritis")
print(cui)

C0034186


In [13]:
# Fill in the CUI of every translated mention and synonym that does not have one yet.
# The TranslatedTerm objects are updated in place, mention first, then synonym.
for translated_annotation in tqdm(translated_annotations):
    pending_terms = (
        translated_annotation.translated_mention,
        translated_annotation.translated_synonym,
    )
    for term in pending_terms:
        if term.cui == "":
            term.cui = get_cui(name=term.name)


100%|██████████| 691/691 [00:00<00:00, 9780.17it/s]


In [14]:
annotations_with_match = []  # entries: (annotation, translated_name, cui)
annotation_without_match = []  # entries: TranslatedAnnotation with no CUI at all
for translated in tqdm(translated_annotations):
    mention_term = translated.translated_mention
    synonym_term = translated.translated_synonym

    if not mention_term.cui and not synonym_term.cui:
        annotation_without_match.append(translated)
        continue

    if mention_term.cui and synonym_term.cui:
        # Both lookups succeeded: trust the mention for technical terms, the synonym otherwise.
        chosen = mention_term if translated.annotation.type == "TECH" else synonym_term
    elif mention_term.cui:
        chosen = mention_term
    else:
        chosen = synonym_term

    annotations_with_match.append((translated.annotation, chosen.name, chosen.cui))

100%|██████████| 691/691 [00:00<00:00, 917753.03it/s]


In [15]:
# Found CUIs for 543 annotations; 148 annotations have no CUI for either mention or synonym.

In [16]:
# Number of annotations with a CUI, plus the first two (annotation, translated_name, cui) entries.
print(len(annotations_with_match))
print(annotations_with_match[:2])

543
[(Annotation(tech_term='Nierenbeckenentzündung', lay_term=None, type=<TermType.TECH: 'TECH'>, span_start=30, span_end=52, synonyms=['Pyelonephritis'], id=3633), 'pyelonephritis', 'C0034186'), (Annotation(tech_term='Gallensteine', lay_term=None, type=<TermType.TECH: 'TECH'>, span_start=121, span_end=133, synonyms=['Cholelith'], id=6202), 'gallstones', 'C0008350')]


In [17]:
# Number of annotations for which neither the mention nor the synonym yielded a CUI.
print(len(annotation_without_match))  # no cui for mention or synonym found
print(annotation_without_match[0])

148
annotation=Annotation(tech_term=None, lay_term='Verkleinerte NIeren', type=<TermType.LAY: 'LAY'>, span_start=9, span_end=28, synonyms=['Schrumpfniere (Nephrozirrhose)'], id=2472) translated_mention=TranslatedTerm(name='reduced kidneys', cui='') translated_synonym=TranslatedTerm(name='shrinking kidney (nephrocirrhosis)', cui='')


In [18]:
# Print each unmatched mention (and its first synonym, if any) next to its translation.
for unmatched in annotation_without_match:
    original = unmatched.annotation
    print(original.get_mention(), unmatched.translated_mention)
    if original.synonyms:
        print(original.synonyms[0], unmatched.translated_synonym)
    print()

Verkleinerte NIeren name='reduced kidneys' cui=''
Schrumpfniere (Nephrozirrhose) name='shrinking kidney (nephrocirrhosis)' cui=''

muss seit 3 Tagen fast jede 10 Minuten auf Toilette name='must go to the toilet almost every 10 minutes for 3 days' cui=''
Pollakisurie (häufiges Wasserlassen in kleinen Mengen) name='pollakiuria (frequent urination in small amounts)' cui=''

Extreme Luft im Bauch die nicht entweicht name='extreme air in the abdomen that does not escape' cui=''
Flatulenz  - verstärkte Entwicklung von Gasen im Magen und/oder Darm name='flatulence - increased development of gases in the stomach and/or intestines' cui=''

brennen beim wasserlassen name='burning during urination' cui=''
Algurie name='alguria' cui=''

extreme Schmerzen während des Stuhlgangs name='extreme pain during defecation' cui=''
Tenesmen name='tenesmen' cui=''

Schmerzen während des Stuhlgangs name='pain during defecation' cui=''
Tenesmen name='tenesmen' cui=''

Defiblirator name='defiblirator' cui=''
Sch

In [19]:
# Export the unmatched annotations so that their CUIs can be filled in by hand.
# newline="" is what the csv module asks for on files it writes to; without it, platforms
# that translate "\n" to "\r\n" end up with an extra blank row after every record.
with open("to_annotate.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["mention", "synonym", "cui"])
    for annotation in annotation_without_match:
        # synonyms is a list and is written as its string representation; the cui
        # column is left empty for the manual annotator.
        writer.writerow([annotation.annotation.get_mention(), annotation.annotation.synonyms, ""])

In [91]:
# Drop the "Teufelskreis" annotation from the unmatched list, printing what is removed.
filtered_anns = []
for unmatched in annotation_without_match:
    if unmatched.annotation.get_mention() != "Teufelskreis":
        filtered_anns.append(unmatched)
    else:
        print(unmatched.annotation)

tech_term=None lay_term='Teufelskreis' type=<TermType.LAY: 'LAY'> span_start=753 span_end=765 synonyms=['Circulus vitiosus'] id=3898


In [92]:
# From here on, annotation_without_match refers to the filtered list (without "Teufelskreis").
annotation_without_match = filtered_anns

In [93]:
# Read the manually completed CUIs back in. The rows must line up one-to-one, in order,
# with annotation_without_match.
with open("to_annotate.copy.csv", "r", newline="") as f:
    reader = csv.reader(f)
    next(reader)  # skip the header row
    manual_rows = [row for row in reader if row]  # ignore completely blank lines

# zip() would silently stop at the shorter sequence and drop annotations, so a length
# mismatch is reported explicitly instead.
if len(manual_rows) != len(annotation_without_match):
    raise ValueError(
        f"to_annotate.copy.csv has {len(manual_rows)} data rows but "
        f"{len(annotation_without_match)} unmatched annotations were expected"
    )

manual_annotations = []  # entries: (TranslatedAnnotation, cui)
for annotation, row in zip(annotation_without_match, manual_rows):
    mention = row[0]
    cui = row[2]
    expected_mention = annotation.annotation.get_mention()
    assert expected_mention == mention, (
        f"CSV row out of sync: expected mention {expected_mention!r}, found {mention!r}"
    )
    manual_annotations.append((annotation, cui))

In [94]:
# Spot-check the first (TranslatedAnnotation, cui) pair read from the CSV.
manual_annotations[0]

(TranslatedAnnotation(annotation=Annotation(tech_term=None, lay_term='Verkleinerte NIeren', type=<TermType.LAY: 'LAY'>, span_start=9, span_end=28, synonyms=['Schrumpfniere (Nephrozirrhose)'], id=2472), translated_mention=TranslatedTerm(name='reduced kidneys', cui=''), translated_synonym=TranslatedTerm(name='shrinking kidney (nephrocirrhosis)', cui='')),
 'C0156247')

In [95]:
# Merge the three sources into one list of {"annotation": ..., "cui": ...} dicts:
# automatic UMLS matches, manually annotated CUIs, and the validation samples.
all_annotations = []
all_annotations.extend(
    dict(annotation=matched[0], cui=matched[2]) for matched in tqdm(annotations_with_match)
)
all_annotations.extend(
    dict(annotation=manual[0].annotation, cui=manual[1]) for manual in tqdm(manual_annotations)
)
all_annotations.extend(
    # For validation samples the first CUI stored on the match is used.
    dict(annotation=validated.mention.annotation, cui=validated.match["cui"][0])
    for validated in tqdm(unique_val_annotations)
)

100%|██████████| 543/543 [00:00<00:00, 894894.72it/s]
100%|██████████| 147/147 [00:00<00:00, 765916.38it/s]
100%|██████████| 1042/1042 [00:00<00:00, 650851.05it/s]


In [96]:
# Total number of annotation entries collected from the three sources.
len(all_annotations)

1732

# add descriptions

In [121]:
umls_cache = dc.Cache("caches/umls_cache")


@umls_cache.memoize()
def get_cui_name(cui):
    """Fetch the UMLS name of a concept and return it translated from English to German.

    Returns None (after printing a notice) when the response has no "result" key.
    """
    concept_url = f"https://uts-ws.nlm.nih.gov/rest/content/current/CUI/{cui}?apiKey={UMLS_KEY}"
    payload = requests.get(concept_url).json()
    if "result" not in payload:
        print("returning none for ", cui)
        return None

    english_name = payload["result"]["name"]
    german_name = translator.translate_text(
        text=english_name, source_lang="EN", target_lang="DE"
    ).text
    return german_name
    
@umls_cache.memoize()
def get_description(cui):
    """Build a German description for a CUI: the concept name plus a definition, if any.

    Definitions are picked in this order: one whose rootSource is in ger_sabs (used as
    is), one whose rootSource is in eng_sabs (translated EN -> DE), otherwise the first
    definition returned (translated to German with the source language left unset).

    Args:
        cui: UMLS concept identifier.

    Returns:
        "<name>: <definition>" when both are available, the definition alone when no
        name could be fetched, the name alone when there is no definition, and None
        when neither is available.
    """
    # Called by keyword exactly as before. NOTE(review): diskcache's memoize appears to
    # key on how arguments are passed, so a positional call could miss existing cache
    # entries — confirm before changing.
    name = get_cui_name(cui=cui)
    # Avoid descriptions that literally start with "None: " when the name lookup failed.
    prefix = f"{name}: " if name else ""

    ger_sabs = ["DMDICD10", "DMDUMD", "WHOGER", "ICPCGER", "LNC-DE-AT", "LNC-DE-DE", "MDRGER",
                "MSHGER"]
    eng_sabs = ["HPO", "MDR", "MSH", "SNOMEDCT_US"]

    query_url = f"https://uts-ws.nlm.nih.gov/rest/content/current/CUI/{cui}/definitions?apiKey={UMLS_KEY}"
    payload = requests.get(query_url).json()  # parse the body once and reuse it
    definitions = payload["result"] if "result" in payload else None
    if not definitions:
        # No "result" key, or an empty list: indexing [0] below would raise IndexError.
        return name

    for definition in definitions:
        if definition["rootSource"] in ger_sabs:
            return prefix + definition["value"]
    for definition in definitions:
        if definition["rootSource"] in eng_sabs:
            return prefix + translator.translate_text(text=definition["value"], source_lang="EN",
                                                      target_lang="DE").text
    # No preferred source matched: fall back to the first definition.
    return prefix + translator.translate_text(text=definitions[0]["value"],
                                              source_lang=None,
                                              target_lang="DE").text


In [122]:
# Attach a description to every collected entry; the dicts are updated in place.
for entry in tqdm(all_annotations):
    entry_cui = entry["cui"]
    entry["description"] = get_description(cui=entry_cui)

100%|██████████| 1732/1732 [07:02<00:00,  4.10it/s]


In [123]:
# Show every entry for which no description could be produced.
for entry in tqdm(all_annotations):
    if entry["description"] is not None:
        continue
    print(entry)

100%|██████████| 1732/1732 [00:00<00:00, 1558244.21it/s]


In [124]:
# Collect the entries without a description and report how many there are.
no_desc_ann = []
for entry in all_annotations:
    if entry["description"] is None:
        no_desc_ann.append(entry)
print(len(no_desc_ann))

0


In [125]:
# parse annotations to prodigy samples

In [126]:
# Turn every annotation entry into a ProdigySample with one span for the mention.
prodigy_samples = []
for entry in tqdm(all_annotations):
    annotation = entry["annotation"]
    mention_spans = [ProdigyNERLabel(start=annotation.span_start, end=annotation.span_end)]
    sample_text = sample_collection.get_sample_by_annotation_id(annotation.id).text
    related_annotation_ids = get_annotation_ids(annotation.get_mention())
    prodigy_samples.append(
        ProdigySample(
            text=sample_text,
            spans=mention_spans,
            id=annotation.id,
            cui=entry["cui"],
            # The description is passed as the sample's html field.
            html=entry["description"],
            annotation_ids=related_annotation_ids,
        )
    )
    

100%|██████████| 1732/1732 [00:00<00:00, 11030.35it/s]


In [127]:
# Write the Prodigy samples as JSON Lines, one sample per line.
prodigy_output_path = TLCPaths.project_data_path.joinpath("prodigy_samples.jsonl")
with jsonlines.open(prodigy_output_path, "w") as writer:
    for sample in prodigy_samples:
        writer.write(sample.dict())