In [184]:
import xmltodict, json
import os
import pandas as pd
import networkx as nx

In [185]:
# Show every column when a DataFrame is displayed instead of truncating wide frames.
pd.set_option("display.max_columns", None)

### Load data

In [186]:
# Root folder of the data set; edit it to match the local copy
# (e.g. '/home/wt/Downloads/n2c2 2012/'). Keep the trailing slash: the
# sub-folder names below are appended to it verbatim.
path = 'data/i2b2/'
training_data_path = f'{path}merge_training'
test_data_path = f'{path}ground_truth/merged_xml'

In [187]:
def data_loader(data_path):
    """Parse every ``.xml`` file found directly in ``data_path``.

    Args:
        data_path: Directory that contains the annotated XML documents.

    Returns:
        dict mapping each file name (e.g. ``'36.xml'``) to the nested dict
        produced by ``xmltodict.parse`` (attribute keys carry no prefix).
        Entries are inserted in sorted file-name order.
    """
    data = {}
    # Sorted so the result does not depend on the file system's listing order.
    for filename in sorted(os.listdir(data_path)):
        if not filename.endswith(".xml"):
            continue
        f = os.path.join(data_path, filename)
        # Read raw bytes and decode explicitly; the context manager guarantees
        # the file handle is closed even if decoding fails.
        with open(f, "rb") as handle:
            fb = handle.read().decode(encoding="utf-8")
        # A bare '&' is an invalid XML character:
        # https://github.com/martinblech/xmltodict/issues/277
        fb = fb.replace('&', '&amp;')
        dic = xmltodict.parse(fb, attr_prefix='')
        # Restore the original character "&" in the note text. A non-string
        # TEXT (e.g. an empty element) is left untouched instead of crashing.
        root = dic['ClinicalNarrativeTemporalAnnotation']
        note_text = root['TEXT']
        if isinstance(note_text, str):
            root['TEXT'] = note_text.replace('&amp;', '&')
        data[filename] = dic
    return data

In [188]:
# Parse both corpus splits into {file name: parsed document} dictionaries.
train_data = data_loader(data_path=training_data_path)
test_data = data_loader(data_path=test_data_path)

In [191]:
# Number of documents in the training and test splits, in that order.
print(*map(len, (train_data, test_data)))

190 120


The EVENTs in clinical history need to be related to the Admission date, and the
EVENTs in hospital course need to be related to the Discharge date.

To avoid conflict, only one TLINK is allowed between
the same pair of entities, e.g. EVENT A cannot be BEFORE and DURING EVENT B at the
same time

Clinical events: each one is assigned to one of three categories (test, problem, or treatment).

sec_time_rel attribute records whether the EVENT happens before, after or
whether it overlaps with the section creation time

### Event attributes:
#### Type
six EVENT types to categorize different kinds of EVENTs: TEST, PROBLEM, TREATMENT, CLINICAL_DEPT, EVIDENTIAL and OCCURRENCE

#### Polarity
POS: Most of the EVENTs have POS polarity value, that is, the EVENT is not negated. 

NEG: If an EVENT is negated by words such as “not”, “deny”, and so on, its polarity is NEG.

#### Modality
The modality attribute is used to describe whether an EVENT actually occurred or not.

FACTUAL: This value is assigned to facts, i.e. EVENTs that actually happened (are happening, or will
happen). For EVENTs with NEG polarity, this value corresponds to the situation where something did not happen (is not happening, or will not happen). The default value for the modality attribute is FACTUAL.

CONDITIONAL: This value is assigned to EVENTs that are hypothesized to happen under certain conditions.

POSSIBLE: This value is assigned to EVENTs that are hypothesized to have occurred.

PROPOSED: The “PROPOSED” modality type is assigned to EVENTs that are proposed or suggested but may or
may not actually happen. 

### Temporal Expressions TIMEX3:
The type attribute has four possible values: DATE, TIME, DURATION and FREQUENCY.

#### Mod
1. “NA”: the default value, no relevant modifier is present;
2. “MORE”, means “more than”, e.g. over 2 days (val = P2D, mod = MORE);
3. “LESS”, means “less than”, e.g. almost 2 months (val = P2M, mod=LESS);
4. “APPROX”, means “approximate”, e.g. nearly a week (val = P1W, mod=APPROX);
5. “START”, describes the beginning of a period of time, e.g. Christmas morning,
2005 (val= 2005-12-25, mod= START).
6. “END”, describes the end of a period of time, e.g. late last year, (val = 2010, mod
= END)
7. “MIDDLE”, describes the middle of a period of time, e.g. mid-September 2001 (val
=2001-09, mod= MIDDLE)

### TLINK:
explicit: The explicit attribute specifies whether the temporal relation is explicitly stated in the text, or is an inference made by the annotator.

type: what kind of temporal relation exists between the two entities. In our annotation, we will use eight types of temporal relations: before, after, simultaneous, overlap, begun_by, ended_by, during, and before_overlap.

SIMULTANEOUS and OVERLAP TLINK types will be merged because of the difficulty for annotators to distinguish them. 

BEFORE, ENDED_BY, and BEFORE_OVERLAP  --> BEFORE

BEGUN_BY and AFTER --> AFTER

SIMULTANEOUS, OVERLAP, and DURING --> OVERLAP


Note that OVERLAP links can appear in the reversed direction (A OVERLAP B or B OVERLAP A).

Duplicate relations do not need to be marked.


## 1. Data distribution

In [192]:
# times.head()

In [214]:
# Preview the EVENT annotations of one training document. `events` is built
# here so the cell runs on a fresh kernel instead of relying on a variable
# that is only assigned further down in the notebook.
sample_doc_id = sorted(train_data)[0]
events = pd.DataFrame(train_data[sample_doc_id]['ClinicalNarrativeTemporalAnnotation']['TAGS']['EVENT'])
events.head()

Unnamed: 0,id,start,end,text,modality,polarity,type
0,E0,1,10,ADMISSION,FACTUAL,POS,OCCURRENCE
1,E16,1005,1016,hematemesis,FACTUAL,NEG,PROBLEM
2,E17,1020,1026,melena,FACTUAL,NEG,PROBLEM
3,E45,1063,1071,admitted,FACTUAL,POS,OCCURRENCE
4,E46,1075,1087,the Hospital,FACTUAL,POS,CLINICAL_DEPT


In [194]:
# links.head()

In [195]:
# links = links.merge(events[['id', 'start', 'end']], left_on='fromID', right_on='id', suffixes=('_link', '_event'))
# links = links.merge(times[['id', 'start', 'end']], left_on='fromID', right_on='id', suffixes=('_link', '_time'))

In [196]:
# links

In [None]:
def count_sentence_link_span():
    """Placeholder for a statistic that has not been written yet.

    Judging by the name, this is meant to count how many sentences each TLINK
    spans -- TODO confirm the intent and implement it. The explicit body keeps
    the cell syntactically valid so the notebook can be run top to bottom.

    Raises:
        NotImplementedError: always, until the function is implemented.
    """
    raise NotImplementedError("count_sentence_link_span is not implemented yet")

In [215]:
# Collect the TLINK table of every training document (SECTIME links excluded);
# the per-document frames are gathered in `link_df`.
data = train_data
link_df = []
for doc_id, doc in data.items():
    annotation = doc['ClinicalNarrativeTemporalAnnotation']
    text = annotation['TEXT']
    tags = annotation['TAGS']

    links = pd.DataFrame(tags['TLINK'])
    times = pd.DataFrame(tags['TIMEX3'])
    events = pd.DataFrame(tags['EVENT'])

    # id -> start / end offset for events and time expressions together
    # (built here but not used further in this cell).
    starts = {**dict(zip(events['id'], events['start'])), **dict(zip(times['id'], times['start']))}
    ends = {**dict(zip(events['id'], events['end'])), **dict(zip(times['id'], times['end']))}

    # Keep every link whose id does not mention "sectime" (case-insensitive).
    is_sectime = links['id'].str.lower().str.contains('sectime')
    links = links.loc[is_sectime == False]

    link_df.append(links)

In [216]:
# Stack the per-document frames into one table. The guard makes the cell safe to
# re-run: once `link_df` is already a DataFrame, pd.concat would raise TypeError.
if not isinstance(link_df, pd.DataFrame):
    link_df = pd.concat(link_df, axis=0)

In [211]:
# (number of links, number of columns) of the combined TLINK table.
link_df.shape

(17716, 6)

In [212]:
# link_df.head()

In [213]:
# Distribution of TLINK types. NOTE: the recorded output below also lists 80
# links whose type is an empty string.
link_df['type'].value_counts()

type
OVERLAP    11479
BEFORE      3568
AFTER       2589
              80
Name: count, dtype: int64

**NOTE: TODO combine BEFORE and AFTER to reduce data imbalance**

## 2. Timeline network

In [126]:
# mask = links['type'] == 'AFTER'
# links[mask].head()
# links.loc[mask, ['fromID', 'fromText', 'toID', 'toText']] = links.loc[mask, ['toID', 'toText', 'fromID', 'fromText']].values
# links.loc[mask, 'type'] = 'BEFORE'
# links[mask].head()

In [128]:
# links

In [218]:
# Build a directed TLINK graph (`G`) for a single training document.
data = train_data
doc_id = '36.xml'
annotation = data[doc_id]['ClinicalNarrativeTemporalAnnotation']
text = annotation['TEXT']
tags = annotation['TAGS']

events = pd.DataFrame(tags['EVENT'])
# FILTER 1: only use events related to medical concepts
events = events.loc[events['type'].isin(['PROBLEM', 'TEST', 'TREATMENT'])]
event_types = dict(zip(events['id'], events['type']))

# TODO: remove duplicated admission and discharge times, e.g. via
# adm_dis = pd.DataFrame(tags['SECTIME'])
times = pd.DataFrame(tags['TIMEX3'])
time_types = dict(zip(times['id'], times['type']))

# Only consumed by the (currently disabled) FILTER 3 below.
nodes_keep = list(events['id']) + list(times['id'])

links = pd.DataFrame(tags['TLINK'])
# FILTER 2: exclude sectime links. na=True mirrors the previous `== False`
# comparison, which also dropped links without an id. The explicit .copy()
# makes the in-place edits below operate on an independent frame rather than
# on a slice of the original (avoids SettingWithCopyWarning).
is_sectime = links['id'].str.lower().str.contains('sectime', regex=False, na=True)
links = links.loc[~is_sectime].copy()

# Normalize AFTER and BEFORE relations: "A AFTER B" is rewritten as "B BEFORE A"
# by swapping the from/to columns, then exact duplicates are dropped.
mask = links['type'] == 'AFTER'
links.loc[mask, ['fromID', 'fromText', 'toID', 'toText']] = links.loc[mask, ['toID', 'toText', 'fromID', 'fromText']].values
links.loc[mask, 'type'] = 'BEFORE'
links = links.drop_duplicates(subset=['fromID', 'fromText', 'toID', 'toText', 'type'], keep='last')

# One directed edge per link, with the relation stored in the 'type' edge attribute.
G = nx.from_pandas_edgelist(links[['fromID', 'toID', 'type']], source='fromID', target='toID', edge_attr=True, create_using=nx.DiGraph())
source_nodes = dict(zip(links['fromID'], links['fromText']))
target_nodes = dict(zip(links['toID'], links['toText']))
nx.set_node_attributes(G, source_nodes|target_nodes, 'text')
# Only ids present in event_types / time_types get a node 'type'; events removed
# by FILTER 1 are not in those maps.
nx.set_node_attributes(G, event_types|time_types, 'type')
# FILTER 3 (disabled): only keep the nodes of interest
# G = G.subgraph(nodes_keep).copy()

# TODO: clear reverse links and reduce redundant nodes;
# there are not many duplicated links.

In [219]:
# links
# events
# times

In [220]:
# Report the size of the graph built above.
print(f"Graph with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")

Graph with 189 nodes and 349 edges


In [149]:
# Export the current graph to GraphML. The node/edge counts in the comments are
# notes from earlier runs with different filter settings; only one export is active.
# NOTE(review): the graph-building cell currently reports 189 nodes / 349 edges
# (FILTER 3 is commented out), which matches the first variant rather than the
# "clinical_event" file name written below -- confirm which export is intended.

# Graph with 189 nodes and 349 edges
# nx.write_graphml(G, "temp_graph.graphml")

# Graph with 149 nodes and 235 edges
nx.write_graphml(G, "temp_graph_clinical_event.graphml")

# Graph with 148 nodes and 104 edges
# nx.write_graphml(G, "temp_graph_clinical_event_tlink_only.graphml")

In [88]:
# Example:
# She will be discharged on Lisinopril with further titration as an outpatient .
# <TLINK id="TL121" fromID="E142" fromText="Lisinopril" toID="E141" toText="discharged" type="BEFORE" />
# <TLINK id="TL122" fromID="E142" fromText="Lisinopril" toID="E143" toText="further titration" type="BEFORE" />

### When some non-clinical nodes are removed, some time nodes become isolated.

How to align: 

Path: inducible ischemia -> subsequent dobutamine MIBI -> a poor study -> 02/08/2002

Edges: BEFORE -> AFTER -> BEFORE

In [223]:
# print("All paths in the graph with labels:")
# for source in G.nodes:
#     for target in G.nodes:
#         if source != target:
#             # Get all simple paths
#             paths = list(nx.all_simple_paths(G, source=source, target=target))
#             for path in paths:
#                 # Convert node IDs to their labels
#                 node_labels = [G.nodes[node]["text"] +' '+ G.nodes[node]["type"] for node in path]

#                 # Find edges along the path and get their labels
#                 edge_labels = [
#                     G.edges[path[i], path[i + 1]]["type"]
#                     for i in range(len(path) - 1)
#                 ]

#                 # Print the path with labels
#                 print(f"Path: {' -> '.join(node_labels)}")
#                 print(f"Edges: {' -> '.join(edge_labels)}")
#                 print()

In [None]:
# Minimum-cardinality set of edges whose removal disconnects G.
# NOTE(review): G is a DiGraph; networkx appears to raise NetworkXError here when
# the graph is not (weakly) connected -- confirm connectivity before running this cell.
cut_edges = nx.minimum_edge_cut(G)

## LLM generation

In [155]:
# Spot-check one span of the note: the slice bounds are the (1041, 1049)
# offsets shifted down by one.
span_start, span_end = 1041, 1049
text[span_start - 1:span_end - 1]

'sweating'

In [61]:
from openai import OpenAI
from lmformatenforcer import JsonSchemaParser
from pydantic import BaseModel
import re, json, os
from typing import Optional, Type, TypeVar

In [62]:
# Point the OpenAI client at vLLM's OpenAI-compatible API server instead of
# the hosted OpenAI service.
openai_api_key, openai_api_base = "EMPTY", "http://host.docker.internal:8000/v1"
client = OpenAI(base_url=openai_api_base, api_key=openai_api_key)

In [65]:
# Ask the served model to extract clinical events and their estimated times
# from the note currently held in `text`, then print the reply.
instruction = "Extract clinical 'PROBLEM', 'TEST', 'TREATMENT' from the text and estimate the time of each event happened."
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": instruction + text},
]
chat_response = client.chat.completions.create(
    model='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
    messages=messages,
)
reply = chat_response.choices[0].message.content
print("Chat response:", reply)

Chat response: <think>
Alright, so I need to extract the PROBLEM, TEST, and TREATMENT along with their time estimates from the provided medical text. Let's break down the text step by step.

First, I'll look for the Admission Date, which is given as 02/01/2002. That will mark the start of the patient's hospital stay.

Next, I'll scan through the text to find the problems. The patient is a 77-year-old woman with a history of obesity and hypertension. Her main symptoms include:

1. Increased shortness of breath for about 5 days.
2. Progressive dyspnea over the past 3 years.
3. Dry cough but no fever or chills.
4. Orthopnea, especially when getting up.
5. Lower extremity edema for several years, leading to cellulitis episodes.
6. slept in a chair right for 2.5 years due to osteoarthritis.
7. Lower extremity non-invasive studies showed a small right common femoral clot.
8. Pulmonary hypertension found on echocardiogram.

Each of these is a PROBLEM. They likely happened over the course of h