In [1]:
import jsonlines
from src.data_loading import annotations as tlc_annotations, load_tlc_samples
from src.models import Sample
from config import TLCPaths
from typing import Tuple
import json
import csv
from dataclasses import dataclass, asdict

from spacy.lang.de import German # updated
from src.preprocessing import cistem

In [2]:
# Load the search-term index: a dict mapping each search term to the list of
# annotation ids associated with it.
# JSON text is UTF-8 by specification, so the encoding is pinned explicitly
# instead of relying on the platform's locale default.
search_terms_path = TLCPaths.project_data_path.joinpath("search_terms_single_and_ids.json")
with open(search_terms_path, encoding="utf-8") as fp:
    search_terms = json.load(fp)
search_terms

{'flatulenz': [6992,
  7046,
  6316,
  5453,
  6328,
  6569,
  6959,
  5926,
  4901,
  5166,
  6076,
  4911,
  7389,
  6383,
  6121,
  1308,
  6537,
  4654,
  4836,
  5055,
  5391,
  6028,
  7176,
  5651,
  4754,
  5770,
  6962,
  6964,
  6358,
  5291,
  5296,
  4850,
  4994,
  5812,
  4738,
  6190,
  6955,
  6957,
  4758,
  7097,
  5630,
  5691,
  5692,
  5693,
  7309,
  5117,
  6932,
  4838,
  4840,
  6584,
  5121,
  5904,
  4887,
  6747,
  6749,
  4606,
  4931,
  6504,
  4650,
  4651,
  7026,
  7064,
  5837,
  5967,
  5030,
  7245,
  7333,
  6034,
  7149,
  6484,
  6712,
  5218,
  5849],
 'pankrea': [6993,
  4638,
  7357,
  7388,
  4929,
  6097,
  1491,
  5767,
  4613,
  3031,
  6819,
  5287,
  5046,
  5047,
  6566,
  584,
  6432,
  313,
  314,
  4950,
  4755,
  4793,
  4794,
  4798,
  5081,
  4857,
  7376,
  5101,
  4728,
  5979,
  6813,
  5048,
  5052,
  6774,
  4748],
 'pyelonephriti': [3633,
  221,
  718,
  2639,
  185,
  3311,
  3312,
  4582,
  671,
  3691,
  841,
  168,
  165,

In [3]:
# all_annotation_ids = []
# for annotation_ids in search_terms.values():
#     all_annotation_ids.extend(annotation_ids)
# len(all_annotation_ids), len(set(all_annotation_ids))

In [4]:
# with jsonlines.open(TLCPaths.project_data_path.joinpath("TLC-UMLS-v4-reviewed-validated.jsonl")) as reader:
#     validated_samples = [obj for obj in reader]
# accepted_samples = [sample for sample in validated_samples if sample["answer"] == "accept"]
# rejected_samples = [sample for sample in validated_samples if sample["answer"] == "reject"]
# ignored_samples = [sample for sample in validated_samples if sample["answer"] == "ignore"]
# 
# Collect the first column of every row of the German UMLS export into a set
# for fast membership tests. Each row must have exactly two fields.
german_umls_csv_path = TLCPaths.project_data_path.joinpath("german_umls_names_and_cuis.csv")
german_umls_cuis = set()
with open(german_umls_csv_path, newline='') as csvfile:
    for row in csv.reader(csvfile, delimiter=' ', quotechar='|'):
        assert len(row) == 2
        german_umls_cuis.add(row[0])

# Parse every file in the TLC JSON directory into a Sample model.
tlc_samples = [Sample.parse_file(file) for file in TLCPaths.json_dir.iterdir()]

def get_sample_for_annotation_id(annotation_id, samples=None):
    """Return the first sample that contains an annotation with the given id.

    Args:
        annotation_id: Value compared against ``ann.id`` of each annotation.
        samples: Optional iterable of objects exposing an ``annotations``
            collection. Defaults to the notebook-level ``tlc_samples`` list,
            which keeps existing single-argument calls working.

    Returns:
        The first matching sample, or ``None`` if no sample has such an
        annotation.
    """
    if samples is None:
        samples = tlc_samples
    for sample in samples:
        # Stop at the first matching annotation instead of building the full
        # id list of every sample before testing membership.
        if any(ann.id == annotation_id for ann in sample.annotations):
            return sample
    return None

def substring_indices(main_string, substring):
    """Find all non-overlapping occurrences of ``substring`` in ``main_string``.

    Args:
        main_string: The text to search in.
        substring: The text to search for.

    Returns:
        A list of ``(start, end)`` index pairs in left-to-right order, where
        ``main_string[start:end] == substring``. The list is empty when there
        is no occurrence or when ``substring`` is empty.
    """
    if not substring:
        # An empty needle matches at every position without advancing the
        # cursor, which previously made the search loop forever.
        return []
    width = len(substring)
    indices = []
    start = main_string.find(substring)
    while start != -1:
        end = start + width
        indices.append((start, end))
        # Resume after the match so occurrences never overlap.
        start = main_string.find(substring, end)
    return indices

def get_tlc_annotation(annotation_id, tlc_annotations):
    """Return the first annotation in ``tlc_annotations`` with the given id.

    Args:
        annotation_id: Value compared against each annotation's ``id``.
        tlc_annotations: Iterable of annotation objects exposing ``id``.

    Raises:
        StopIteration: If no annotation has the requested id.
    """
    for candidate in tlc_annotations:
        if candidate.id == annotation_id:
            return candidate
    raise StopIteration

In [6]:
# Use the third-round annotations instead.
# JSON text is UTF-8 by specification, so the encoding is pinned explicitly
# instead of relying on the platform's locale default.
with open(TLCPaths.project_data_path / "third_round_finished_annotations.json", "r", encoding="utf-8") as fp:
    third_round_annotations = json.load(fp)

In [7]:
# Display the loaded third-round annotations for inspection. The cell that
# builds the dataset reads the "text", "spans", "annotation_ids" and "cui"
# keys of each record.
third_round_annotations

{'1401678599': {'text': 'Thread: [Unerklärliche Bauchschmerzen]\nText: [Hallo!\nAuch ich dachte laut der Beschwerden von einigen von euch gleich an Gallensteine.\nBeim Ultraschall muss man komplett nüchtern sein, d.h. nicht mal Wasser trinken vorher, sonst kann man bei der Gallenblase nichts erkennen. Es gibt auch Gallensteine in den Gallengängen, wobei leider eine Ultraschalluntersuchungen meistens nicht reicht um diese zu erkennen. Da kann man eine ERCP (Sp\\u00edegelung) machen.\nObwohl ich aber glaube, dass sie eure Ärzte die einen Oberbauchultraschall bei euch gemacht haben, sich die Gallenblasen sicher genau angesehen haben, da die Beschwerden ja auch dafür in Frage kommen.]\n',
  'spans': [{'start': 121, 'end': 133, 'label': 'Mention'}],
  'html': '<b>C0008350</b>: Cholelithiasis: Vorhandensein oder Bildung von Gallensteinen im Gallentrakt, meist in der Gallenblase (CHOLECYSTOLITHIASIS) oder im Hauptgallengang (CHOLEDOCHOLITHIASIS).',
  'annotation_ids': [6202, 6147, 4618],
  'c

In [8]:
# Resolve every reviewed record back to the original TLC annotations it links
# to, together with the sample containing each annotation. The three result
# lists are filled in lockstep: index i of `original_annotations`, `cuis` and
# `samples` describes the same dataset entry.
original_annotations = []
cuis = []
samples = []
existing_samples_and_spans = set()
not_german_cui_count = 0
for accepted_sample in third_round_annotations.values():
    cui = accepted_sample["cui"]
    linked_annotation_ids = accepted_sample["annotation_ids"]

    if cui not in german_umls_cuis:
        # Drop annotations whose CUI is not in the German UMLS. This depends
        # only on the record, so it is decided before the comparatively
        # expensive sample/annotation lookups. A rejected CUI can never be in
        # `existing_samples_and_spans`, so counting all linked ids here gives
        # the same total as counting them one by one.
        not_german_cui_count += len(linked_annotation_ids)
        continue

    for annotation_id in linked_annotation_ids:
        sample = get_sample_for_annotation_id(annotation_id)
        ann = get_tlc_annotation(annotation_id, tlc_annotations)
        span_key = (sample.id, ann.span_start, ann.span_end, cui)
        if span_key in existing_samples_and_spans:
            # Skip annotations with the same sample, span and CUI as an
            # entry that was already collected.
            continue
        existing_samples_and_spans.add(span_key)
        samples.append(sample)
        original_annotations.append(ann)
        cuis.append(cui)

In [9]:
print("not in german umls:", not_german_cui_count)
# The three lists are appended to together, so their lengths should agree.
tuple(len(collected) for collected in (original_annotations, cuis, samples))


not in german umls: 200


(3428, 3428, 3428)

In [10]:
@dataclass
class DatasetEntry:
    """One mention-level record of the exported dataset.

    Field order matters: it determines the positional constructor signature
    and the key order produced by ``dataclasses.asdict`` on export.
    """
    # Id of the source annotation (duplicated entries get an offset added later).
    annotation_id: int
    # Id of the sample that contains the annotation.
    sample_id: int
    # Concept identifier assigned to the mention.
    cui: str
    # Mention text, i.e. the sample text sliced by `mention_spans`.
    mention: str
    # Full text of the sample.
    text: str
    # (start, end) offsets of the mention within `text`.
    mention_spans: Tuple[int, int]
    # Text of the sentence containing the mention.
    mention_sentence: str
    # (start, end) offsets of the mention within `mention_sentence`.
    mention_sentence_spans: Tuple[int, int]
    
# Blank German pipeline with only the sentencizer registered for sentence
# splitting.
nlp = German()
nlp.add_pipe('sentencizer')
mention_marker = "<mention>"

# For every collected annotation, locate the sentence around the mention. A
# marker is inserted in front of the mention before sentence splitting, so
# the sentence can be identified by the marker and the marker's position
# gives the mention offset within that sentence.
dataset = []
for ann, cui, sample in zip(original_annotations, cuis, samples):
    mention_start, mention_end = ann.span_start, ann.span_end
    full_text = sample.text
    mention = full_text[mention_start:mention_end]

    marked_text = "".join(
        (full_text[:mention_start], mention_marker, mention, full_text[mention_end:])
    )
    marked_sentence = next(
        sent.text for sent in nlp(marked_text).sents if mention_marker in sent.text
    )

    # The marker sits directly before the mention, so its index is the
    # mention's start once the marker is stripped from the sentence.
    sentence_mention_start = marked_sentence.index(mention_marker)
    sentence_mention_end = sentence_mention_start + (mention_end - mention_start)

    dataset.append(
        DatasetEntry(
            annotation_id=ann.id,
            sample_id=sample.id,
            cui=cui,
            mention=mention,
            text=full_text,
            mention_spans=(mention_start, mention_end),
            mention_sentence=marked_sentence.replace(mention_marker, ""),
            mention_sentence_spans=(sentence_mention_start, sentence_mention_end),
        )
    )
    

In [11]:
# Spot-check a slice of generated entries against their source annotations.
preview_entries = dataset[90:100]
for entry in preview_entries:
    print("mention: ",entry.mention)
    print(entry.annotation_id, entry.sample_id)
    source_annotation = get_tlc_annotation(entry.annotation_id, tlc_annotations)
    print(source_annotation)
    print("mention sentence: ",entry.mention_sentence)
    # print("text: ", entry.text)
    print()

mention:  gutartiger Tumor
5982 3006
tech_term=None lay_term='gutartiger Tumor' type=<TermType.LAY: 'LAY'> span_start=180 span_end=196 synonyms=['benigner Tumor', 'benigner Tumor'] id=5982
mention sentence:  Thread: [Ständig Heißhunger, übelkeit, Lipase erhöht]
Text: [Dachte ich mir bereits, bei jungen, großen und schlanken Männern kann in äußerst extrem seltenen Fällen ein dann meist gutartiger Tumor in der BSD sein, der Insulin produziert.

mention:  Schmerzen im Unterleib
1996 861
tech_term=None lay_term='Schmerzen im Unterleib' type=<TermType.LAY: 'LAY'> span_start=487 span_end=509 synonyms=['Unterleibschmerzen', 'Unterleibschmerzen'] id=1996
mention sentence:  
Die Schmerzen im Unterleib gehen nun aber die Niere tut mir weh und der Urin ist immer noch rot, sollte ich lieber nochmal kontrollieren lassen?

mention:  Kontrastmittel
2191 947
tech_term='Kontrastmittel' lay_term=None type=<TermType.TECH: 'TECH'> span_start=183 span_end=197 synonyms=['KM', 'KM'] id=2191
mention sentence:

In [12]:
from dataclasses import replace

# Custom replacements.
# 1. Merge "ultrasonic procedure" and "ultrasonography" into a single CUI.
CUI_MERGES = {"C0080351": "C0041618"}
for entry in dataset:
    if entry.cui in CUI_MERGES:
        entry.cui = CUI_MERGES[entry.cui]

# Duplicate a leading slice of the dataset; the copies get shifted
# annotation and sample ids.
# NOTE(review): the reason for the duplication and the choice of slice size
# and id offset are not stated here - confirm.
DUPLICATED_ENTRY_COUNT = 2800
DUPLICATE_ID_OFFSET = 10000
added_dataset = [
    replace(
        entry,
        annotation_id=entry.annotation_id + DUPLICATE_ID_OFFSET,
        sample_id=entry.sample_id + DUPLICATE_ID_OFFSET,
    )
    for entry in dataset[:DUPLICATED_ENTRY_COUNT]
]
final_dataset = dataset + added_dataset
        

In [13]:
# Export the final dataset as a JSON array of plain dicts (one per entry).
dataset_output_path = TLCPaths.project_data_path.joinpath("TLC_UMLS.json")
with open(dataset_output_path, "w") as f:
    json.dump(list(map(asdict, final_dataset)), f)

In [14]:
# Number of distinct sample ids in the exported dataset.
len({entry.sample_id for entry in final_dataset})

2988