In [3]:
import json
from pprint import pprint
from typing import Literal, List

import diskcache as dc
import requests
from deepl import Translator
from pydantic import BaseModel
from tqdm import tqdm
from cistem import stem
from collections import defaultdict

from config import TLCPaths
from data_loading import load_tlc_samples
from models import Match, SampleCollection, Annotation


In [3]:
annotations_file_path = TLCPaths.project_data_path.joinpath("samples_to_annotate.json")
validations_file_path = TLCPaths.project_data_path.joinpath("samples_to_validate.json")
# load the two json files 
with open(annotations_file_path, "r") as f:
    annotation_samples = [Match.parse_raw(x) for x in json.load(f)]
with open(validations_file_path, "r") as f:
    validation_samples = [Match.parse_raw(x) for x in json.load(f)]

print(f"{len(annotation_samples)=}")
print(f"{len(validation_samples)=}")


len(annotation_samples)=2146
len(validation_samples)=5244


In [4]:
print(len([x for x in annotation_samples if x.mention.annotation.type == "LAY"]))
print(len([x for x in annotation_samples if x.mention.annotation.type == "TECH"]))


1460
686


uniqueness based on mentions

In [5]:
# save json with unique terms for annotation
unique_annotations = defaultdict(list)
for term in annotation_samples:
    if term.mention.annotation.synonyms:
        unique_annotations[term.mention.annotation.get_mention()].append(term.mention.annotation.synonyms[0])


In [6]:
for mention, synonyms in unique_annotations.items():
    synonyms = set(synonyms)
    if len(synonyms) > 1:
        print(mention)
        print(synonyms)
        print()

Gallensteine
{'Cholelith', 'Cholelithiasis'}

Herzklopfen
{'Tachykardie', 'Palpitationen'}

Nierenkolik
{'wehenartige Nierenschmerzen', 'Wehenartige Nierenschmerzen'}

Magen Darm Grippe
{'Gastroenteritis', 'Norovirus-Gastroenteritis'}

niedriger Blutdruck
{'Hypotonie', 'Hypotonus'}

Nierenentzündung
{'Pyelonephritis', 'Nephritis'}

nierenentzündungen
{'Nephritiden', 'Nephritis'}

Blinddarmdurchbruch
{'Blinddarmperforation', 'Perforierte Appendizitis'}

Herzstolpern
{'Extrasystole', 'Extrasystolen', 'Herzrhythmusstörungen'}

Dialyse
{'Blutreinigungsverfahren', 'Nierenersatzverfahren'}

rote Urin
{'sichtbares Blut im Urin', 'Hämaturie', 'Makrohämaturie'}

Mikrohämaturie
{'Laienbegriff: etwas Blut im Urin', 'nicht sichtbares Blut im Urin', 'Nicht sichtbares Blut im Urin', 'Laien: ein wenig Blut im Urin', 'etwas Blut im Urin'}

Nierenkoliken
{'wehenartige Nierenschmerzen', 'Wehenartige Nierenschmerzen'}

Schwitzen
{'Hydrosis', 'Transpiration', 'Perspiratio sensibilis', 'transpirieren', 'Di

uniqueness based on synonyms


In [4]:
samples = load_tlc_samples()

In [5]:
annotations = [annotation for sample in samples for annotation in sample.annotations]


In [6]:
sum([len(ann.synonyms) for ann in annotations])/ len(annotations)

2.023004059539919

In [7]:
splitters = [" und", "(", " oder", " - "]
lstrips = ["Laien:", "Laienbegriff:"]
cleaned_synonyms = []
origs = []
for ann in annotations:
    if not ann.synonyms:
        synonym = ann.get_mention()
    else:
        synonym = ann.synonyms[0]
    origs.append(synonym)
    for splitter in splitters:
        synonym = synonym.split(splitter)[0]
    for lstrip in lstrips:
        synonym = synonym.removeprefix(lstrip)
    ann.synonyms = [synonym]
    cleaned_synonyms.append(synonym.strip())

In [8]:
cleaned_synonyms[0]

'Flatulenzen'

In [9]:
with open("synonyms.txt", "w") as fp:
    fp.write("\n".join(" <> ".join(x) for x in zip(cleaned_synonyms, origs)))

In [10]:
tech_vs_ann = ["tech mention | annotation"]
for ann in annotations:
    if ann.type == "TECH" and ann.synonyms:
        tech_vs_ann.append(ann.get_mention() + " | " + ann.synonyms[0])

In [11]:
with open("tech_syns.txt", "w") as fp:
    fp.write("\n".join(tech_vs_ann))

In [12]:
# if mention is tech, take the shortest of {mention, annotation term}
# for search it is not so important because we search for all terms but for
# uniqueness we need to decide on one because otherwise we get transitive sets 
# which are not correct in the end

In [13]:
search_terms_and_ids = defaultdict(list)
for annotation in annotations:
    if not annotation.synonyms:
        search_terms_and_ids[annotation.get_mention()].append(annotation.id)
    elif annotation.type == "LAY":
        search_terms_and_ids[annotation.synonyms[0]].append(annotation.id)
    elif annotation.type == "TECH":
        if len(annotation.get_mention()) > len(annotation.synonyms[0]):
            search_terms_and_ids[annotation.synonyms[0]].append(annotation.id)
        else:
            search_terms_and_ids[annotation.get_mention()].append(annotation.id)
    else:
        raise RuntimeError("check conditions")

In [14]:
print(len(annotations) ," -> ", len(search_terms_and_ids))

7390  ->  1107


In [15]:
# try stemming
stemmed_search_terms_and_ids = defaultdict(list)
for term, ids in search_terms_and_ids.items():
    stemmed_term = stem(term)
    stemmed_search_terms_and_ids[stemmed_term].extend(ids)

In [16]:
print(len(annotations) ," -> ", len(search_terms_and_ids), ' -> ', len(stemmed_search_terms_and_ids))


7390  ->  1107  ->  997


In [17]:
search_terms_and_ids

defaultdict(list,
            {'Flatulenzen': [6992,
              6250,
              7046,
              6316,
              5453,
              6328,
              6569,
              6959,
              5926,
              4901,
              5166,
              6076,
              4911,
              7389,
              6383,
              6121,
              1308,
              6537,
              4654,
              4836,
              5055,
              5391,
              6028,
              7176,
              5651,
              4754,
              5770,
              6962,
              6964,
              6358,
              5291,
              5296,
              4850,
              4994,
              5812,
              4738,
              6190,
              6955,
              6957,
              4758,
              7097,
              5630,
              5691,
              5692,
              5693,
              7309,
              5117,
              6932,
       

In [18]:
stemmed_search_terms_and_ids

defaultdict(list,
            {'flatulenz': [6992,
              6250,
              7046,
              6316,
              5453,
              6328,
              6569,
              6959,
              5926,
              4901,
              5166,
              6076,
              4911,
              7389,
              6383,
              6121,
              1308,
              6537,
              4654,
              4836,
              5055,
              5391,
              6028,
              7176,
              5651,
              4754,
              5770,
              6962,
              6964,
              6358,
              5291,
              5296,
              4850,
              4994,
              5812,
              4738,
              6190,
              6955,
              6957,
              4758,
              7097,
              5630,
              5691,
              5692,
              5693,
              7309,
              5117,
              6932,
         

In [19]:
def get_annotation_by_id(id):
    for ann in annotations:
        if ann.id == id:
            return ann

In [20]:
get_annotation_by_id(3545)

Annotation(tech_term='FA', lay_term=None, type=<TermType.TECH: 'TECH'>, span_start=502, span_end=504, synonyms=['Facharzt'], id=3545)

In [21]:
for term, id in search_terms_and_ids.items():
    print(term)
    print(get_annotation_by_id(id[0]))
    print()

Flatulenzen
tech_term=None lay_term='Blähungen' type=<TermType.LAY: 'LAY'> span_start=86 span_end=95 synonyms=['Flatulenzen'] id=6992

Pankreas
tech_term=None lay_term='bauchspeicheldrüse' type=<TermType.LAY: 'LAY'> span_start=194 span_end=212 synonyms=['Pankreas'] id=6993

Pyelonephritis
tech_term='Nierenbeckenentzündung' lay_term=None type=<TermType.TECH: 'TECH'> span_start=30 span_end=52 synonyms=['Pyelonephritis'] id=3633

Schrumpfniere 
tech_term=None lay_term='Verkleinerte NIeren' type=<TermType.LAY: 'LAY'> span_start=9 span_end=28 synonyms=['Schrumpfniere '] id=2472

Cholelith
tech_term='Gallensteine' lay_term=None type=<TermType.TECH: 'TECH'> span_start=121 span_end=133 synonyms=['Cholelith'] id=6202

Choledocholithiasis
tech_term=None lay_term='Gallensteine in den Gallengängen' type=<TermType.LAY: 'LAY'> span_start=288 span_end=320 synonyms=['Choledocholithiasis'] id=6203

Nierensteine
tech_term='Nierensteine' lay_term=None type=<TermType.TECH: 'TECH'> span_start=9 span_end=21

In [22]:
with open('search_terms_single_and_ids.json', 'w') as fp:
    json.dump(stemmed_search_terms_and_ids, fp)