In [20]:
import pandas as pd
from preprocess.setup import Preprocess
from transformers import AutoTokenizer

In [21]:
from preprocess.dataset import DatasetManager
import os

# Collect every regular file in the annotated-data folder.
# os.listdir() returns entries in arbitrary order, and later cells treat the
# document index as meaningful (dates are assigned per index), so the listing
# is sorted to make the document order reproducible across machines.
folder_path = "../data/annotated/"
files = sorted(
    os.path.join(folder_path, f)
    for f in os.listdir(folder_path)
    if os.path.isfile(os.path.join(folder_path, f))
)
manager = DatasetManager(files)

In [22]:
# Reverse the file listing in place, then display it.
# NOTE(review): `manager` was constructed from this same list object before the
# reversal; whether it observes the new order depends on whether DatasetManager
# copies its input — confirm.
# NOTE(review): the in-place reversal is not idempotent; re-running this cell
# flips the order back.
files.reverse()
files

['../data/annotated/journal.tsv', '../data/annotated/journal-2.tsv']

In [23]:
from structure.enum import Dataset

In [24]:
# Fetch the three annotation views (MER, TRE, ERE) from the manager, in that order.
mer, tre, ere = [
    manager.get(kind) for kind in (Dataset.MER, Dataset.TRE, Dataset.ERE)
]

In [25]:
# Number of items in `mer` (presumably one per annotated file — confirm).
len(mer)

2

In [26]:
# Preview the first 20 rows of the first MER item.
mer[0].head(20)

Unnamed: 0,id,sentence_id,Text,Medical Entity
0,1-1,1,Pasienten,O
1,1-2,1,",",O
2,1-3,1,57,O
3,1-4,1,år,O
4,1-5,1,gammel,O
5,1-6,1,",",O
6,1-7,1,kom,O
7,1-8,1,tilbake,O
8,1-9,1,for,O
9,1-10,1,oppfølging,O


In [27]:
# Show only the rows of the first TRE frame that carry a temporal-relation label.
tre[0].loc[tre[0]['Temporal Relation'].notna()]

Unnamed: 0,id,Text,Temporal Relation,fk_id
22,1-23,belastningstest,XDURINGY,1-21
30,2-5,iskemi,XBEFOREY,1-23
31,2-5,iskemi,XBEFOREY,1-21
41,2-15,koronarsykdom .,XBEFOREY,2-5
119,6-20,koronar angiografi,XBEFOREY,4-9
132,7-7,acetylsalisylsyre,XBEFOREY,4-9
133,7-7,acetylsalisylsyre,XBEFOREY,2-5
138,7-12,simvastatin,XBEFOREY,4-9
139,7-12,simvastatin,XBEFOREY,2-5


In [28]:
# Temporal relations of document `i` whose row id is '7-12'.
i = 0
tre_doc = tre[i].loc[tre[i]['Temporal Relation'].notna()]
tre_doc.loc[tre_doc['id'] == '7-12']

Unnamed: 0,id,Text,Temporal Relation,fk_id
138,7-12,simvastatin,XBEFOREY,4-9
139,7-12,simvastatin,XBEFOREY,2-5


In [29]:
# Entity relations of document `i` whose row id is '7-12'.
i = 0
ere_doc = ere[i].loc[ere[i]['Entity Relation'].notna()]
ere_doc.loc[ere_doc['id'] == '7-12']

Unnamed: 0,id,Text,Entity Relation,fk_id
138,7-12,simvastatin,SYMPTOMTOEVENT,4-9
139,7-12,simvastatin,SYMPTOMTOEVENT,2-5


In [30]:
import datetime
from structure.node import Node
from structure.relation import Relation

# Build one list of Node objects per document and attach a Relation to the
# node on the `fk_id` side of every labelled temporal-relation row.
docs = []

for i, doc in enumerate(mer):
    # Node ids are the row id prefixed with the document index, so ids stay
    # distinct once entities from several documents are pooled.
    id_prefix = str(i)

    # Hard-coded document dates: the first document is 2020-05-17, every
    # other document 2020-05-15. Loop-invariant per document, so built once.
    if i == 0:
        date = datetime.datetime(2020, 5, 17)
    else:
        date = datetime.datetime(2020, 5, 15)

    # itertuples() rows are (Index, id, sentence_id, Text, Medical Entity).
    entities = [
        Node(row[3], row[4], '', date, [], id_prefix + row[1])
        for row in doc.itertuples()
    ]

    # id -> node lookup; setdefault keeps the first node for an id, matching
    # the original "scan and break on first match" behaviour.
    nodes_by_id = {}
    for node in entities:
        nodes_by_id.setdefault(node.id, node)

    # Only rows with a temporal-relation label matter; filter once per document.
    tre_doc = tre[i][tre[i]['Temporal Relation'].notna()]

    for entity in entities:
        # Strip exactly the document prefix. A fixed `[1:]` slice would break
        # as soon as the document index has more than one digit.
        local_id = entity.id[len(id_prefix):]
        tr_relations = tre_doc[tre_doc['fk_id'] == local_id]

        # Relation rows are (Index, id, Text, Temporal Relation, fk_id).
        for rel in tr_relations.itertuples():
            print(rel)  # debug trace of every relation row being linked
            y_ent = nodes_by_id.get(id_prefix + rel[1])
            if y_ent is not None:
                # The entity-relation label is read from the ERE frame at the
                # same index label as the TRE row.
                entity.relations.append(
                    Relation(entity, y_ent, rel[3], ere[i].loc[rel[0]]['Entity Relation'])
                )
    docs.append(entities)

# Keep only the nodes whose type is set.
rel_entities = [
    [entity for entity in doc if entity.type is not None]
    for doc in docs
]

Pandas(Index=22, id='1-23', Text='belastningstest', _3='XDURINGY', fk_id='1-21')
Pandas(Index=31, id='2-5', Text='iskemi', _3='XBEFOREY', fk_id='1-21')
Pandas(Index=30, id='2-5', Text='iskemi', _3='XBEFOREY', fk_id='1-23')
Pandas(Index=41, id='2-15', Text='koronarsykdom .', _3='XBEFOREY', fk_id='2-5')
Pandas(Index=133, id='7-7', Text='acetylsalisylsyre', _3='XBEFOREY', fk_id='2-5')
Pandas(Index=139, id='7-12', Text='simvastatin', _3='XBEFOREY', fk_id='2-5')
Pandas(Index=119, id='6-20', Text='koronar angiografi', _3='XBEFOREY', fk_id='4-9')
Pandas(Index=132, id='7-7', Text='acetylsalisylsyre', _3='XBEFOREY', fk_id='4-9')
Pandas(Index=138, id='7-12', Text='simvastatin', _3='XBEFOREY', fk_id='4-9')
Pandas(Index=71, id='5-6', Text='tungpust', _3='XDURINGY', fk_id='3-8')
Pandas(Index=164, id='10-8', Text='EKG', _3='XBEFOREY', fk_id='3-8')
Pandas(Index=167, id='10-10', Text='belastningstest', _3='XBEFOREY', fk_id='3-8')
Pandas(Index=165, id='10-8', Text='EKG', _3='XBEFOREY', fk_id='5-6')
Pan

In [105]:
from structure.enum import TR, ER

def find_duplicates(rel_entities, across=False):
    """Find entities that share both ``value`` and ``type``.

    Args:
        rel_entities: list of entity objects exposing ``value``, ``type``,
            ``id`` and a ``relations`` list.
        across: when True, nothing is merged; instead an 'XDURINGY'/'EQUAL'
            Relation from the earlier entity to each later equal entity is
            appended to the earlier entity's relations. When False (default),
            each later equal entity is merged into the first one: it takes
            over the first entity's id, and its relations are re-pointed
            (``rel.x``) to the first entity and appended to that entity's
            relations.

    Returns:
        Indices (into ``rel_entities``) of the entities merged away. Always
        empty when ``across`` is True. The list itself is not modified here;
        pass the indices to ``remove_duplicates`` to drop them.
    """
    duplicates = []
    # Indices already merged into an earlier entity. They must not act as a
    # merge target themselves, otherwise relations of later duplicates get
    # re-pointed at an entity that is about to be removed.
    merged = set()
    for i, ent_i in enumerate(rel_entities):
        if i in merged:
            continue
        for j in range(i + 1, len(rel_entities)):  # Avoid redundant comparisons
            if j in merged:
                continue
            ent_j = rel_entities[j]
            is_same = ent_i.value == ent_j.value and ent_i.type == ent_j.type
            if not is_same:
                continue
            if across:
                ent_i.relations.append(Relation(ent_i, ent_j, 'XDURINGY', 'EQUAL'))
            else:
                merged.add(j)
                duplicates.append(j)
                # Relations elsewhere that still reference ent_j resolve to
                # the surviving entity's id.
                ent_j.id = ent_i.id
                for rel in ent_j.relations:
                    rel.x = ent_i
                    ent_i.relations.append(rel)
    return duplicates

def remove_duplicates(rel_entities, duplicates):
    """Delete the entries at the given indices from ``rel_entities`` in place.

    Repeated indices are collapsed first. The same (mutated) list object is
    returned; the ``duplicates`` argument itself is left untouched.
    """
    # Walk from the highest index down so that a deletion never shifts the
    # position of an entry that still has to be removed.
    for position in sorted(set(duplicates), reverse=True):
        del rel_entities[position]
    return rel_entities

### Remove local duplicates (within each document)
for i, doc in enumerate(rel_entities):
    duplicates = find_duplicates(doc)
    rel_entities[i] = remove_duplicates(doc, duplicates)

### Merge duplicates between neighbouring documents
# find_duplicates is called with across=False, so equal entities are merged
# into their first occurrence rather than linked by an EQUAL edge.
# Only adjacent document pairs (i, i + 1) are compared.
for i in range(len(rel_entities) - 1):
    first, second = rel_entities[i], rel_entities[i + 1]
    # Capture the boundary BEFORE anything is deleted: remove_duplicates
    # mutates its list in place, so len(first) may shrink afterwards and
    # would shift the offsets used for the second document.
    split = len(first)
    duplicates = find_duplicates(first + second, False)
    rel_entities[i] = remove_duplicates(first, [j for j in duplicates if j < split])
    rel_entities[i + 1] = remove_duplicates(second, [j - split for j in duplicates if j >= split])
    
    

In [32]:
# Flatten the per-document entity lists into one list, keeping document order.
all_entities = []
for doc in rel_entities:
    all_entities.extend(doc)

In [107]:
## Expected document ordered by date
from pyvis.network import Network
from structure.enum import ME, TR

net = Network(
    notebook=True, 
    height="500px", 
    width="100%", 
    bgcolor="#222222", 
    font_color="white", 
    directed=True, 
    neighborhood_highlight=True, 
    filter_menu=True,
    layout=True
)

for entity in all_entities:
    
    match entity.type:
        case ME.CONDITION:
            color = '#F05D5E'
        case ME.EVENT:
            color = '#8390FA'
        case ME.SYMPTOM:
            color = '#FAC748'
        case _:
            color = 'grey'
        
    net.add_node(entity.id, entity.value, color=color, title=entity.type.name)
    i+=1
    
for entity in all_entities:
    for rel in entity.relations:
        if rel.tr != TR.XDURINGY:
            net.add_edge(entity.id, rel.y.id, title=rel.er.name if rel.er is not None else '')
        else:
            if rel.er == ER.EQUAL:
                color='grey'
                net.add_edge(entity.id, rel.y.id, color=color, title=rel.er.name if rel.er is not None else '')

# net.show_buttons(filter_=['renderer', 'layout'])
# Enable physics
net.toggle_physics(True)

# Show the graph and embed it in the notebook
html_file = "simple_network.html"
net.show(html_file)

simple_network.html


In [108]:
from pypdf import PdfReader

In [109]:
# Open the journal PDF with pypdf; its pages are read from `reader.pages` in a later cell.
reader = PdfReader('../data/journal.pdf')

Ignoring wrong pointing object 6 0 (offset 0)


In [None]:
# Extract the text of every page and concatenate it, in page order, with no separator.
page_texts = []
for page in reader.pages:
    page_texts.append(page.extract_text())
text = ''.join(page_texts)

Pasienten, en 57 år gammel mann, møtte til konsultasjon på fastlegekontoret den 12. november 2024. Han oppsøkte lege på grunn av vedvarende brystsmerter som har vart i omtrent to uker, spesielt ved fysisk aktivitet som trappegang og lett jogging. Smertene beskrives som trykkende og stråler ut til venstre arm, men de gir seg ved hvile etter noen minutter. Pasienten har også opplevd økt tungpust de siste månedene, noe som har begrenset aktivitetsnivået hans.  Pasienten har kjent hypertensjon og står på medikamentell behandling med enalapril 10 mg daglig. Han har ingen kjent hjertesykdom i familiehistorien, men han er tidligere storrøyker og sluttet for cirka fem år siden. Han har en BMI på 29,5, og blodtrykket ved konsultasjonen var målt til 148/92 mmHg.  Ved undersøkelsen ble det påvist regelmessig hjerterytme uten bilyder, men det er indikasjon på mulig redusert fysisk toleranse. Det ble avtalt å henvise pasienten til EKG og belastningstest for nærmere vurdering av eventuell angina pec