In [15]:
import os
import json

import numpy as np
import krippendorff
from sklearn import metrics

from bs4 import BeautifulSoup
import matplotlib.pyplot as plt

from tqdm.auto import tqdm

In [16]:
# Integer code for each TLINK relType label. A label's position in the tuple
# below is the code that make_graph stores in the relation matrix.
REL_TO_ID = {
    label: code
    for code, label in enumerate(
        (
            "BEFORE",
            "AFTER",
            "INCLUDES",
            "IS_INCLUDED",
            "SIMULTANEOUS",
            "OVERLAP",
            "VAGUE",
        )
    )
}


def make_event_vocab(soup):
    """Make a vocabulary of events/timexes.

    Collects every ``eiid`` attribute (from MAKEINSTANCE elements) and every
    ``tid`` attribute (from TIMEX3 elements) and assigns each distinct id an
    index.

    Ids are indexed in sorted (lexicographic) order. Enumerating a set of
    strings directly would make the id -> index mapping depend on hash
    randomisation and insertion order, so two documents containing the same ids
    could receive different indices, and results could change between runs.

    Args:
        soup: a parsed TML document. Only ``soup.find_all(tag_name)`` is used,
            and each returned element is indexed by attribute name; a missing
            attribute propagates the element's lookup error.

    Returns:
        event_vocab: a dictionary mapping event/timex ids to indices
            ``0 .. n - 1`` in the graph, in sorted id order
    """
    # extract all eiid and tid values (duplicates collapse in the sets)
    event_ids = {elem["eiid"] for elem in soup.find_all("MAKEINSTANCE")}
    timex_ids = {elem["tid"] for elem in soup.find_all("TIMEX3")}

    # sort before numbering so the vocabulary is reproducible
    return {
        identifier: index
        for index, identifier in enumerate(sorted(event_ids | timex_ids))
    }


def make_graph(soup, event_vocab=None):
    """Make a relation graph from a TML soup object.

    Builds (or reuses) a vocabulary of event/timex ids, then reads every TLINK
    element and stores the code of its relType at
    ``graph[index_of_source, index_of_target]``.

    Args:
        soup: a soup object of a TML file
        event_vocab: (optional) a dictionary mapping event/timex ids to indices
            in the graph. When omitted it is built with ``make_event_vocab``.
            Pass the vocabulary of another document to index both graphs the
            same way.

    Returns:
        graph: an int8 numpy array of shape (n_events, n_events) holding
            ``REL_TO_ID`` codes; cells no TLINK wrote to keep the value -1
        event_vocab: a dictionary mapping event/timex ids to indices in the graph
        error: None, or the string "Some relations are missing" when at least
            one off-diagonal cell was left unset

    Raises:
        RuntimeError: if the number of TLINKs is not n_events ** 2 - n_events,
            if a TLINK has no source or no target id, or if two TLINKs describe
            the same ordered (source, target) pair.
        KeyError: if a TLINK refers to an id missing from the vocabulary or
            carries a relType that is not in ``REL_TO_ID``.
    """
    if event_vocab is None:
        event_vocab = make_event_vocab(soup)

    # a full graph has one TLINK per ordered pair of distinct events/timexes
    n_events = len(event_vocab)
    tlinks = soup.find_all("TLINK")
    if len(tlinks) != n_events ** 2 - n_events:
        raise RuntimeError(f"Number of TLINKs ({len(tlinks)}) does not make a full graph out of {n_events} events ({n_events ** 2 - n_events})")

    # -1 marks "no relation recorded yet"
    graph = np.full((n_events, n_events), -1, dtype=np.int8)
    for tlink in tlinks:
        # the source is either an event instance or a timex
        left = tlink.get("eventInstanceID")
        if left is None:
            left = tlink.get("timeID")

        if left is None:
            raise RuntimeError(tlink)

        # the target is either an event instance or a timex
        right = tlink.get("relatedToEventInstance")
        if right is None:
            right = tlink.get("relatedToTime")

        if right is None:
            raise RuntimeError(tlink)

        row, col = event_vocab[left], event_vocab[right]
        # Explicit check instead of `assert`: assertions are removed under
        # `python -O`, which would let a duplicate silently overwrite a cell.
        if graph[row, col] != -1:
            raise RuntimeError(f"Duplicate TLINK for pair ({left}, {right}): {tlink}")
        graph[row, col] = REL_TO_ID[tlink["relType"]]

    # this approach to error is better for debugging
    error = None
    # only off-diagonal cells are expected to carry a relation
    off_diagonal = ~np.eye(n_events, dtype=bool)
    if np.any(graph[off_diagonal] == -1):
        error = "Some relations are missing"

    return graph, event_vocab, error


# Example usage for a single file:

In [17]:
# Read the same document from the two annotation directories (a1 and a2;
# presumably one directory per annotator) and parse each as XML.
with open("../corpus/timeml_converted/a1/ABC19980108.1830.0711.tml", "r") as f:
    tml_a1 = f.read()
soup1 = BeautifulSoup(tml_a1, "xml")

with open("../corpus/timeml_converted/a2/ABC19980108.1830.0711.tml", "r") as f:
    tml_a2 = f.read()
soup2 = BeautifulSoup(tml_a2, "xml")

# The vocabulary derived from the first document is reused for the second so
# that a given row/column refers to the same event/timex in both matrices.
graph1, vocab, err = make_graph(soup1, event_vocab=None)
assert err is None
graph2, _, err = make_graph(soup2, event_vocab=vocab)
assert err is None

In [18]:
# Turn each relation matrix into a 1-D sequence of labels, one per cell.
flat_graph1, flat_graph2 = graph1.flatten(), graph2.flatten()

# compute agreement between the two label sequences
# NOTE(review): the flattened arrays still contain the -1 diagonal
# placeholders, which are identical in both graphs and so always count as
# agreement -- confirm whether they should be excluded from the scores.
accuracy = metrics.accuracy_score(flat_graph1, flat_graph2)
kappa = metrics.cohen_kappa_score(flat_graph1, flat_graph2)
alpha = krippendorff.alpha([flat_graph1, flat_graph2], level_of_measurement="nominal")

print("Accuracy: ", accuracy)
print("Kappa   : ", kappa)
print("Krippendorff alpha: ", alpha)

Accuracy:  0.92138671875
Kappa   :  0.8992956947449614
Krippendorff alpha:  0.8991251373618178


In [19]:
# Rebuild both graphs, this time letting each document derive its own
# vocabulary, so that the two vocabularies can be compared afterwards.
graph1, vocab1, err = make_graph(soup1, event_vocab=None)
assert err is None

graph2, vocab2, err = make_graph(soup2, event_vocab=None)
assert err is None

In [20]:
# Ids that appear in one document's vocabulary but not in the other's;
# two empty sets mean both documents contain exactly the same ids.
only_in_vocab1 = set(vocab1) - set(vocab2)
only_in_vocab2 = set(vocab2) - set(vocab1)

print("Vocab1 - Vocab2: ", only_in_vocab1)
print("Vocab2 - Vocab1: ", only_in_vocab2)

Vocab1 - Vocab2:  set()
Vocab2 - Vocab1:  set()


In [22]:
from glob import glob

# Per-document agreement scores, keyed by the .tml file name.
doc2metrics = {}

a1_docs = sorted(glob("../corpus/timeml_converted/a1/*.tml"))
a2_docs = sorted(glob("../corpus/timeml_converted/a2/*.tml"))

# Fail early with a readable message; otherwise np.concatenate below would
# raise an obscure error on an empty list.
if not a1_docs:
    raise RuntimeError("No .tml files found in ../corpus/timeml_converted/a1")

assert len(a1_docs) == len(a2_docs)

# Documents are paired by position, so equal counts are not enough: the two
# directories must contain the same file names or the wrong files get compared.
a1_names = [os.path.basename(path) for path in a1_docs]
a2_names = [os.path.basename(path) for path in a2_docs]
if a1_names != a2_names:
    raise RuntimeError(
        "a1 and a2 do not contain the same documents\n"
        f"Only in a1: {sorted(set(a1_names) - set(a2_names))}\n"
        f"Only in a2: {sorted(set(a2_names) - set(a1_names))}"
    )

# Flattened relation matrices of every document, one array per document.
relations_a1_per_doc = []
relations_a2_per_doc = []

for a1_doc, a2_doc in tqdm(zip(a1_docs, a2_docs), total=len(a1_docs)):
    print(a1_doc, a2_doc)
    with open(a1_doc, "r") as f:
        soup1 = BeautifulSoup(f.read(), "xml")
    with open(a2_doc, "r") as f:
        soup2 = BeautifulSoup(f.read(), "xml")

    vocab1 = make_event_vocab(soup1)
    vocab2 = make_event_vocab(soup2)
    # Compare the id sets only. Comparing the dicts would also compare the
    # assigned indices, which can differ even when both documents contain the
    # same ids, and would then raise with two empty differences in the message.
    if vocab1.keys() != vocab2.keys():
        raise RuntimeError(
            f"Error when processing {a1_doc} and {a2_doc}\n"
            f"Vocab1 - Vocab2: {set(vocab1.keys()) - set(vocab2.keys())}\n"
            f"Vocab2 - Vocab1: {set(vocab2.keys()) - set(vocab1.keys())}"
        )

    # Both graphs are indexed with the first document's vocabulary so that a
    # given cell refers to the same pair of events/timexes in both.
    graph1, vocab, err = make_graph(soup1, vocab1)
    if err:
        raise RuntimeError(a1_doc)

    graph2, _, err = make_graph(soup2, vocab)
    if err:
        raise RuntimeError(a2_doc)

    # NOTE(review): the flattened arrays still contain the -1 diagonal
    # placeholders, which always match between the two graphs -- confirm
    # whether they should be excluded from the agreement scores.
    flat_graph1 = graph1.flatten()
    flat_graph2 = graph2.flatten()

    relations_a1_per_doc.append(flat_graph1)
    relations_a2_per_doc.append(flat_graph2)

    doc_id = os.path.basename(a1_doc)
    doc2metrics[doc_id] = {
        "accuracy": metrics.accuracy_score(flat_graph1, flat_graph2),
        "kappa": metrics.cohen_kappa_score(flat_graph1, flat_graph2),
        "krippendorff": krippendorff.alpha([flat_graph1, flat_graph2], level_of_measurement="nominal"),
    }

# Corpus-level scores: every cell of every document counts as one item.
all_relations_a1 = np.concatenate(relations_a1_per_doc)
all_relations_a2 = np.concatenate(relations_a2_per_doc)

print("Accuracy: ", metrics.accuracy_score(all_relations_a1, all_relations_a2))
print("Kappa   : ", metrics.cohen_kappa_score(all_relations_a1, all_relations_a2))
print("Krippendorff alpha: ", krippendorff.alpha([all_relations_a1, all_relations_a2], level_of_measurement="nominal"))

  0%|          | 0/36 [00:00<?, ?it/s]

../corpus/timeml_converted/a1/ABC19980108.1830.0711.tml ../corpus/timeml_converted/a2/ABC19980108.1830.0711.tml
../corpus/timeml_converted/a1/ABC19980114.1830.0611.tml ../corpus/timeml_converted/a2/ABC19980114.1830.0611.tml
../corpus/timeml_converted/a1/ABC19980120.1830.0957.tml ../corpus/timeml_converted/a2/ABC19980120.1830.0957.tml
../corpus/timeml_converted/a1/ABC19980304.1830.1636.tml ../corpus/timeml_converted/a2/ABC19980304.1830.1636.tml
../corpus/timeml_converted/a1/AP900815-0044.tml ../corpus/timeml_converted/a2/AP900815-0044.tml
../corpus/timeml_converted/a1/AP900816-0139.tml ../corpus/timeml_converted/a2/AP900816-0139.tml
../corpus/timeml_converted/a1/APW19980213.1310.tml ../corpus/timeml_converted/a2/APW19980213.1310.tml
../corpus/timeml_converted/a1/APW19980213.1320.tml ../corpus/timeml_converted/a2/APW19980213.1320.tml
../corpus/timeml_converted/a1/APW19980213.1380.tml ../corpus/timeml_converted/a2/APW19980213.1380.tml
../corpus/timeml_converted/a1/APW19980219.0476.tml ../

In [23]:
# The five documents with the lowest Krippendorff's alpha, lowest first.
ranked_by_alpha = sorted(doc2metrics, key=lambda name: doc2metrics[name]["krippendorff"])
least_agreement = [(name, doc2metrics[name]) for name in ranked_by_alpha[:5]]

for doc_id, scores in least_agreement:
    # show every metric with 4 decimal places; the stored values are untouched
    rounded = {metric: round(value, 4) for metric, value in scores.items()}
    print(doc_id, rounded)


NYT19980206.0466.tml {'accuracy': 0.4823, 'kappa': 0.3621, 'krippendorff': 0.3495}
ed980111.1130.0089.tml {'accuracy': 0.7202, 'kappa': 0.4397, 'krippendorff': 0.4319}
NYT19980402.0453.tml {'accuracy': 0.5469, 'kappa': 0.4482, 'krippendorff': 0.4401}
NYT19980206.0460.tml {'accuracy': 0.633, 'kappa': 0.5216, 'krippendorff': 0.521}
ea980120.1830.0071.tml {'accuracy': 0.6388, 'kappa': 0.5651, 'krippendorff': 0.5604}


In [24]:
# Persist the per-document metrics as JSON indented by 4 spaces.
with open("doc2metrics_order.json", "w") as metrics_file:
    json.dump(doc2metrics, metrics_file, indent=4)