In [2]:
import json
from pprint import pprint
from typing import Literal, List

import diskcache as dc
import requests
from deepl import Translator
from pydantic import BaseModel
from tqdm import tqdm
from cistem import stem
from collections import defaultdict

from config import TLCPaths
from data_loading import load_tlc_samples
from models import Match, SampleCollection, Annotation


In [3]:
annotations_file_path = TLCPaths.project_data_path.joinpath("samples_to_annotate.json")
validations_file_path = TLCPaths.project_data_path.joinpath("samples_to_validate.json")
# load the two json files 
with open(annotations_file_path, "r") as f:
    annotation_samples = [Match.parse_raw(x) for x in json.load(f)]
with open(validations_file_path, "r") as f:
    validation_samples = [Match.parse_raw(x) for x in json.load(f)]

print(f"{len(annotation_samples)=}")
print(f"{len(validation_samples)=}")


len(annotation_samples)=2146
len(validation_samples)=5244


In [4]:
print(len([x for x in annotation_samples if x.mention.annotation.type == "LAY"]))
print(len([x for x in annotation_samples if x.mention.annotation.type == "TECH"]))


1460
686


uniqueness based on mentions

In [5]:
# save json with unique terms for annotation
unique_annotations = defaultdict(list)
for term in annotation_samples:
    if term.mention.annotation.synonyms:
        unique_annotations[term.mention.annotation.get_mention()].append(term.mention.annotation.synonyms[0])


In [6]:
for mention, synonyms in unique_annotations.items():
    synonyms = set(synonyms)
    if len(synonyms) > 1:
        print(mention)
        print(synonyms)
        print()

Gallensteine
{'Cholelith', 'Cholelithiasis'}

Herzklopfen
{'Tachykardie', 'Palpitationen'}

Nierenkolik
{'wehenartige Nierenschmerzen', 'Wehenartige Nierenschmerzen'}

Magen Darm Grippe
{'Gastroenteritis', 'Norovirus-Gastroenteritis'}

niedriger Blutdruck
{'Hypotonie', 'Hypotonus'}

Nierenentzündung
{'Pyelonephritis', 'Nephritis'}

nierenentzündungen
{'Nephritiden', 'Nephritis'}

Blinddarmdurchbruch
{'Blinddarmperforation', 'Perforierte Appendizitis'}

Herzstolpern
{'Extrasystole', 'Extrasystolen', 'Herzrhythmusstörungen'}

Dialyse
{'Blutreinigungsverfahren', 'Nierenersatzverfahren'}

rote Urin
{'sichtbares Blut im Urin', 'Hämaturie', 'Makrohämaturie'}

Mikrohämaturie
{'Laienbegriff: etwas Blut im Urin', 'nicht sichtbares Blut im Urin', 'Nicht sichtbares Blut im Urin', 'Laien: ein wenig Blut im Urin', 'etwas Blut im Urin'}

Nierenkoliken
{'wehenartige Nierenschmerzen', 'Wehenartige Nierenschmerzen'}

Schwitzen
{'Hydrosis', 'Transpiration', 'Perspiratio sensibilis', 'transpirieren', 'Di

uniqueness based on synonyms


In [7]:
annotations = [search_term.mention.annotation for search_term in annotation_samples + validation_samples]


In [8]:
sum([len(ann.synonyms) for ann in annotations])/ len(annotations)

1.0115020297699595

In [9]:
splitters = [" und", "(", " oder", " - "]
lstrips = ["Laien:", "Laienbegriff:"]
cleaned_synonyms = []
origs = []
for ann in annotations:
    if not ann.synonyms:
        synonym = ann.get_mention()
    else:
        synonym = ann.synonyms[0]
    origs.append(synonym)
    for splitter in splitters:
        synonym = synonym.split(splitter)[0]
    for lstrip in lstrips:
        synonym = synonym.removeprefix(lstrip)
    ann.synonyms = [synonym]
    cleaned_synonyms.append(synonym.strip())

In [10]:
cleaned_synonyms[0]

'Pyelonephritis'

In [11]:
with open("synonyms.txt", "w") as fp:
    fp.write("\n".join(" <> ".join(x) for x in zip(cleaned_synonyms, origs)))

In [12]:
tech_vs_ann = ["tech mention | annotation"]
for ann in annotations:
    if ann.type == "TECH" and ann.synonyms:
        tech_vs_ann.append(ann.get_mention() + " | " + ann.synonyms[0])

In [13]:
with open("tech_syns.txt", "w") as fp:
    fp.write("\n".join(tech_vs_ann))

In [14]:
# if mention is tech, take the shortest of {mention, annotation term}
# for search it is not so important because we search for all terms but for
# uniqueness we need to decide on one because otherwise we get transitive sets 
# which are not correct in the end

In [15]:
search_terms_and_ids = defaultdict(list)
for annotation in annotations:
    if not annotation.synonyms:
        search_terms_and_ids[annotation.get_mention()].append(annotation.id)
    elif annotation.type == "LAY":
        search_terms_and_ids[annotation.synonyms[0]].append(annotation.id)
    elif annotation.type == "TECH":
        if len(annotation.get_mention()) > len(annotation.synonyms[0]):
            search_terms_and_ids[annotation.synonyms[0]].append(annotation.id)
        else:
            search_terms_and_ids[annotation.get_mention()].append(annotation.id)
    else:
        raise RuntimeError("check conditions")

In [16]:
print(len(annotations) ," -> ", len(search_terms_and_ids))

7390  ->  1107


In [17]:
# try stemming
stemmed_search_terms_and_ids = defaultdict(list)
for term, ids in search_terms_and_ids.items():
    stemmed_term = stem(term)
    stemmed_search_terms_and_ids[stemmed_term].extend(ids)

In [18]:
print(len(annotations) ," -> ", len(search_terms_and_ids), ' -> ', len(stemmed_search_terms_and_ids))


7390  ->  1107  ->  998


In [19]:
search_terms_and_ids

defaultdict(list,
            {'Pyelonephritis': [3633,
              221,
              718,
              2639,
              185,
              3311,
              3312,
              4582,
              671,
              3691,
              841,
              168,
              165,
              923,
              2771,
              3001,
              2743,
              344,
              170,
              171,
              4074,
              4098,
              2029,
              1415,
              1416,
              1417,
              3456,
              1969,
              1771,
              1772,
              1773,
              1774,
              1775,
              2331,
              1573,
              1574,
              1619,
              1620,
              2227,
              2228,
              2829,
              2197,
              2832,
              834,
              393,
              844,
              3232,
              958,
              4056,

In [20]:
stemmed_search_terms_and_ids

defaultdict(list,
            {'pyelonephriti': [3633,
              221,
              718,
              2639,
              185,
              3311,
              3312,
              4582,
              671,
              3691,
              841,
              168,
              165,
              923,
              2771,
              3001,
              2743,
              344,
              170,
              171,
              4074,
              4098,
              2029,
              1415,
              1416,
              1417,
              3456,
              1969,
              1771,
              1772,
              1773,
              1774,
              1775,
              2331,
              1573,
              1574,
              1619,
              1620,
              2227,
              2228,
              2829,
              2197,
              2832,
              834,
              393,
              844,
              3232,
              958,
              4056,


In [21]:
def get_annotation_by_id(id):
    for ann in annotations:
        if ann.id == id:
            return ann

In [22]:
get_annotation_by_id(3545)

Annotation(tech_term='FA', lay_term=None, type=<TermType.TECH: 'TECH'>, span_start=502, span_end=504, synonyms=['Facharzt'], id=3545)

In [23]:
for term, id in search_terms_and_ids.items():
    print(term)
    print(get_annotation_by_id(id[0]))
    print()

Pyelonephritis
tech_term='Nierenbeckenentzündung' lay_term=None type=<TermType.TECH: 'TECH'> span_start=30 span_end=52 synonyms=['Pyelonephritis'] id=3633

Schrumpfniere 
tech_term=None lay_term='Verkleinerte NIeren' type=<TermType.LAY: 'LAY'> span_start=9 span_end=28 synonyms=['Schrumpfniere '] id=2472

Cholelith
tech_term='Gallensteine' lay_term=None type=<TermType.TECH: 'TECH'> span_start=121 span_end=133 synonyms=['Cholelith'] id=6202

Pollakisurie 
tech_term=None lay_term='muss seit 3 Tagen fast jede 10 Minuten auf Toilette' type=<TermType.LAY: 'LAY'> span_start=69 span_end=120 synonyms=['Pollakisurie '] id=1307

Palpitationen
tech_term=None lay_term='Herzklopfen' type=<TermType.LAY: 'LAY'> span_start=79 span_end=90 synonyms=['Palpitationen'] id=4604

Flatulenz 
tech_term=None lay_term='Extreme Luft im Bauch die nicht entweicht' type=<TermType.LAY: 'LAY'> span_start=9 span_end=50 synonyms=['Flatulenz '] id=5928

Algurie
tech_term=None lay_term='brennen beim wasserlassen' type=<Ter

In [24]:
with open('search_terms_single_and_ids.json', 'w') as fp:
    json.dump(stemmed_search_terms_and_ids, fp)