In [1]:
from collections import defaultdict
from pathlib import Path

import pandas as pd

In [2]:
dataset_path = Path("../data/TLC_v01")
kidney_path = dataset_path.joinpath("Kidney")
stomach_path = dataset_path.joinpath("StomachIntestines")
dataset_file = list(kidney_path.iterdir()) + list(stomach_path.iterdir())
annotation_files = [file for file in dataset_file if file.suffix == ".ann"]
sample_files = [file for file in dataset_file if file.suffix == ".txt"]


In [22]:
# counts only tech and lay terms if annotator note with translation is present

lay_terms = defaultdict(set)
tech_terms = defaultdict(set)
count_dict = {"tech": 0, "lay": 0}
no_mapping_found_dict = {"tech": 0, "lay": 0}
for file in annotation_files:
    with open(file) as fp:
        content = fp.read()
    line_contents = [line.split("\t") for line in content.split("\n")]
    for i in range(len(line_contents)):
        line_content = line_contents[i]
        if not line_content[0]:  # skip empty lines
            continue
        if line_content[1].startswith("Laienbegriff"):
            source_dict = lay_terms
            target_dict = tech_terms
            counter_key = "lay"
        elif line_content[1].startswith("Fachterm"):
            source_dict = tech_terms
            target_dict = lay_terms
            counter_key = "tech"
        else:
            continue

        source_term = line_content[2]
        source_tag = line_content[0]
        target_term = None
        not_found = False
        for j in range(i + 1, i + 20):
            try:
                next_line_content = line_contents[j]
                if (not source_tag in next_line_content[1]) or (not "AnnotatorNotes" in next_line_content[
                    1]):
                    continue
                else:
                    target_term = next_line_content[2]
                    break
            except IndexError:
                not_found = True
                continue
        if not_found or j == i + 4: # noqa
            no_mapping_found_dict[counter_key] += 1
        if target_term is None:
            continue  # skip if no target term is found
        count_dict[counter_key] += 1
        source_dict[source_term].add(target_term)
        target_dict[target_term].add(source_term)


print(f"Mentions of lay terms: {count_dict['lay']}")
print(f"Number of unique lay terms: {len(lay_terms)}")
print(f"Mentions of tech terms: {count_dict['tech']}")
print(f"Number of unique tech terms: {len(tech_terms)}")
print(f"No mapping found for {no_mapping_found_dict['tech']} tech and {no_mapping_found_dict['lay']} lay terms found.")


Mentions of lay terms: 5368
Number of unique lay terms: 1698
Mentions of tech terms: 2024
Number of unique tech terms: 1226
No mapping found for 1 tech and 1 lay terms found.


In [4]:
Mentions of lay terms: 5369
Number of unique lay terms: 1698
Mentions of tech terms: 2025
Number of unique tech terms: 1228
No mapping found for 12 tech and 741 lay terms found.

1698

In [5]:
lens = []
num_words = []
for label in lay_terms:
    lens.append(len(label))
    num_words.append(len(label.split(" ")))

df = pd.DataFrame.from_records(zip(lens, num_words), columns=["length", "word_count"])
df.describe()

Unnamed: 0,length,word_count
count,1698.0,1698.0
mean,18.788575,2.246172
std,12.652863,1.997509
min,2.0,1.0
25%,11.0,1.0
50%,16.0,1.0
75%,23.0,3.0
max,187.0,28.0


In [6]:
lens = []
num_words = []
for label in tech_terms:
    lens.append(len(label))
    num_words.append(len(label.split(" ")))

df = pd.DataFrame.from_records(zip(lens, num_words), columns=["length", "word_count"])
df.describe()


Unnamed: 0,length,word_count
count,1228.0,1228.0
mean,14.903909,1.500814
std,12.326007,1.467107
min,2.0,1.0
25%,9.0,1.0
50%,12.0,1.0
75%,17.0,1.0
max,122.0,18.0


In [7]:
lay_terms


defaultdict(set,
            {'Glomeruläre Filtrationsrate': {'GFR', 'eGFR'},
             'Urea': {'Harnstoff', 'harnstoff'},
             'Brustkrebs': {'Mamma-Karzinom', 'Mammakarzinom'},
             'Nierenarzt': {'Nephrolgen',
              'Nephrologe',
              'Nephrologen',
              'nephrologe',
              'nephrologen'},
             'Nierenschwäche': {'Niereninsuffizenz',
              'Niereninsuffiziens',
              'Niereninsuffizienz',
              'niereninsuffizien',
              'niereninsuffizienz'},
             'Nierenersatzverfahren': {'Dialyse', 'dialyse', 'dyalise'},
             'normaler Blutdruck': {'Normotension', 'Normotonie'},
             'Nierenkrank': {'niereninsuffizient'},
             'Maschine': {'Dialyse'},
             'Blutreinigungsverfahren': {'Dialyse',
              'Hämodialyse',
              'dialyse',
              'dyalise'},
             'Nierenentzündung': {'Nephritis',
              'Nephritis (von gr.)',
         

In [8]:
tech_terms


defaultdict(set,
            {'GFR': {'Glomeruläre Filtrationsrate'},
             'Harnstoff': {'Urea'},
             'Mammakarzinom': {'Brustkrebs'},
             'Nephrologe': {'Nierenarzt',
              'Nierenfacharzt',
              'Nierenspezialisten',
              'Spezialist für Nierenerkrankungen'},
             'Niereninsuffizienz': {'Nieren kaputt',
              'Nieren schlecht funktionieren',
              'Nierenfunktion leicht eingeschränkt',
              'Nierenfunktionseinschränkung',
              'Nierenfunktionseinschränkungen',
              'Nierenschwäche',
              'Nierenschäche',
              'Nierenschädigungen',
              'Verschlechterung der Nierenfunktion',
              'eine eingeschränkte Nierenfunktion',
              'eingeschränkte Nierenfunktion',
              'eingeschränkter Nierenfunktion',
              'erniedrigte Nierenfunktion'},
             'Dialyse': {'Blutreinigungsverfahren',
              'Leben an der Maschine',
    

In [11]:
content

'T1\tAbkuerzung 1663 1666\tMRT\n#1\tAnnotatorNotes T1\tMagnetresonanztomographie\n'