In [3]:
import json
# Parse the JSON-lines corpus: every line of the file is one JSON document,
# and `data` collects the decoded records in file order.
with open("../corpus/timebank/nt_format/tbd_a1.jsonl") as corpus_file:
    data = [json.loads(raw_line) for raw_line in corpus_file]

In [7]:
# Tag names for the integer event type codes; the list index is the code:
#   0: "[B]", 1: "[C]", 2: "{U}", 3: "[U}", 4: "{U]", 5: "[R>", 6: "<R]", 7: "[I]"
type2name = ["[B]", "[C]", "{U}", "[U}", "{U]", "[R>", "<R]", "[I]"]


def document_to_xml(d):
    """Render one corpus record as text with inline ``<event>`` tags.

    Args:
        d: A record with the keys ``"id"``, ``"text"`` and ``"event_order"``.
            ``"event_order"`` maps an event id to a dict with the keys
            ``"span"``, ``"type"``, ``"time"``, ``"relto"`` and
            ``"factuality"``.

    Returns:
        The document as a single string: the text is split on single spaces,
        an opening tag is placed before the first token of each event and a
        closing tag after its last token, newlines inside the text are
        replaced by spaces, every opening event tag starts a new line, and the
        string ends with two newlines.
    """
    tokens = d["text"].split(" ")

    # Each entry is (token position, tie-break rank, tag text).
    insertions = []
    for event_id, order_info in d["event_order"].items():
        _start, _end = order_info["span"]

        type_ = type2name[order_info["type"]]
        # NOTE(review): every attribute except `id` is written unquoted, and
        # type names such as "<R]" contain angle brackets, so the result is
        # not well-formed XML. The tag text is left unchanged because the
        # consumer of this file is not visible here -- confirm what it expects.
        opening = f"<event id=\"{event_id}\" type={type_} tml={order_info['time']} relto={order_info['relto']} speech={order_info['factuality']}>"
        insertions.append((_start, 1, opening))
        # Spans are inclusive token indices (the span-inspection cell of this
        # notebook slices `[start:end + 1]`), so the closing tag belongs
        # *after* token `_end`. Inserting at `_end` left the last token
        # outside the element and made single-token events empty.
        insertions.append((_end + 1, 0, "</event>"))

    # At one position, closing tags (rank 0) go before opening tags (rank 1),
    # so an event that ends where the next one begins is closed first no
    # matter in which order the events were listed.
    insertions.sort(key=lambda item: item[:2])

    # Tags are processed left to right, so the i-th tag is shifted right by
    # the i tags already inserted before it.
    for shift, (pos, _, tag) in enumerate(insertions):
        tokens.insert(pos + shift, tag)

    xml = f"<DOCUMENT id={d['id']}>" + " ".join(tokens) + "</DOCUMENT>"
    xml = xml.replace("\n", " ") + "\n"
    xml = xml.replace("<event id=", "\n<event id=")
    return xml + "\n"


data_xml = [document_to_xml(d) for d in data]

with open("../corpus/timebank/tbd_a1.xml", "w") as f:
    f.writelines(data_xml)


In [5]:
# List the events of type 3 ("[U}") or 4 ("{U]") whose time value contains a
# ":" but is more than a bare ":". The document id is printed once, before the
# first matching event of that document.
for full_document in data:
    document = full_document["event_order"]
    # Tokenise once per document instead of once per matching event.
    tokens = full_document["text"].split(" ")
    had_printed_document_id = False
    for event_id, event in document.items():
        # The time value lives under the "time" key (see the printed event
        # dicts and the conversion cell); there is no "tml" key, so reading
        # event["tml"] raised KeyError.
        if event["type"] in [3, 4] and ":" in event["time"] and event["time"] != ":":
            if not had_printed_document_id:
                print(full_document["id"])
                had_printed_document_id = True
            # Span bounds are inclusive token indices, hence `end + 1`.
            start, end = event["span"]
            span_text = " ".join(tokens[start:end + 1])
            print("event index:", event_id, "event:", event, "event text:", span_text.replace("\n", " "))


AP900816-0139
event index: 37 event: {'span': [356, 357], 'type': 3, 'time': '1:17', 'relto': '', 'factuality': 'm-'} event text: the safety
event index: 38 event: {'span': [359, 365], 'type': 3, 'time': '1:17', 'relto': '', 'factuality': ''} event text: Americans and other Westerners trapped in Kuwait.
event index: 52 event: {'span': [458, 491], 'type': 3, 'time': '1:17', 'relto': '', 'factuality': ''} event text: A total of about 3,000 Americans, 3,000 Britons and more than 450 Japanese are in Iraq and Kuwait . Overall, more than 2 million foreigners are in both countries. Iraq has called them ``restrictees.'' 
event index: 57 event: {'span': [543, 552], 'type': 3, 'time': '1:15', 'relto': '', 'factuality': ''} event text: Iraq has continued to increase its armed forces in Kuwait
event index: 73 event: {'span': [664, 669], 'type': 3, 'time': '2:17', 'relto': '', 'factuality': ''} event text: U.S. soldiers massing in Saudi Arabia
event index: 74 event: {'span': [670, 676], 'type': 3, 

# Check random TimeML relations

In [6]:
from bs4 import BeautifulSoup

# Parse one converted .tml document with BeautifulSoup's "xml" parser; the
# resulting tree is kept in `soup` for the cells below.
with open("test/converted/APW19980213.1320.tml") as tml_file:
    soup = BeautifulSoup(tml_file, "xml")


In [8]:
# Collect every TLINK tag found in the parsed document.
tlinks = soup.find_all("TLINK")

In [10]:
# Display the first TLINK to see which attributes it carries.
tlinks[0]

<TLINK eventInstanceID="ei89" lid="l0" relType="BEFORE" relatedToEventInstance="ei88"/>

In [13]:
# Keep the first TLINK in `t` and display it.
t = tlinks[0]
t

<TLINK eventInstanceID="ei89" lid="l0" relType="BEFORE" relatedToEventInstance="ei88"/>

In [32]:

def show_event(i, links=None, document=None):
    """Print the two events joined by the ``i``-th TLINK and their relation.

    Args:
        i: Index of the TLINK inside ``links``.
        links: Sequence of TLINK tags. Defaults to the notebook-level
            ``tlinks``.
        document: Parsed document that is searched for the MAKEINSTANCE and
            EVENT tags. Defaults to the notebook-level ``soup``.

    Returns:
        None. One line is printed: either
        ``<left text> (eid=..., eiid=...) <relType> <right text> (eid=..., eiid=...)``
        or a short "... skipping" message when the link has a
        ``relatedToTime`` attribute, lacks ``eventInstanceID`` or
        ``relatedToEventInstance``, or refers to an instance/event that is
        not found in ``document``.
    """
    if links is None:
        links = tlinks
    if document is None:
        document = soup

    t = links[i]

    if "relatedToTime" in t.attrs:
        print("Timex, skipping")
        return

    if "eventInstanceID" not in t.attrs:
        print("No eventInstanceID, skipping")
        return

    if "relatedToEventInstance" not in t.attrs:
        print("No relatedToEventInstance, skipping")
        return

    left = t["eventInstanceID"]
    right = t["relatedToEventInstance"]

    relation = t["relType"]

    def resolve(eiid):
        """Return (event id, event text) for an instance id, or None if a lookup fails."""
        # find() returns None when nothing matches; check before subscripting.
        instance = document.find("MAKEINSTANCE", {"eiid": eiid})
        if instance is None:
            return None
        event_id = instance["eventID"]
        event = document.find("EVENT", {"eid": event_id})
        if event is None:
            return None
        return event_id, event.text

    left_resolved = resolve(left)
    right_resolved = resolve(right)
    if left_resolved is None or right_resolved is None:
        print(f"No MAKEINSTANCE/EVENT found for {left} or {right}, skipping")
        return

    left_event_id, left_event = left_resolved
    right_event_id, right_event = right_resolved

    print(f"{left_event} (eid={left_event_id}, eiid={left}) {relation} {right_event} (eid={right_event_id}, eiid={right})")
    

In [33]:
import random

# Spot-check ten randomly chosen TLINKs.
for _ in range(10):
    # randrange() excludes its upper bound, so every draw is a valid index.
    # randint(0, len(tlinks)) includes it and could return len(tlinks),
    # which raises IndexError inside show_event.
    random_event = random.randrange(len(tlinks))
    show_event(random_event)


services (eid=6, eiid=ei94) BEFORE awareness (eid=29, eiid=ei117)
Timex, skipping
ties (eid=14, eiid=ei102) VAGUE flights (eid=9, eiid=ei97)
Timex, skipping
talk (eid=22, eiid=ei110) VAGUE ties (eid=14, eiid=ei102)
No eventInstanceID, skipping
flights (eid=12, eiid=ei100) VAGUE suspended (eid=5, eiid=ei93)
No eventInstanceID, skipping
No eventInstanceID, skipping
services (eid=6, eiid=ei94) VAGUE untouched (eid=2, eiid=ei90)
