In [None]:
# here we want to take the aligned corpus (NarNE) and convert it
# to a jsonlines format conforming to word-level coreference:
# https://github.com/vdobrovolskii/wl-coref/

In [119]:
import conllu
import os

path = "../output/aligned/no-narc_bokmaal/narc_bokmaal_dev.conllu"

# Read through a context manager so the file handle is closed right away, and
# pin the encoding: the corpus contains non-ASCII text and the platform default
# encoding is not guaranteed to be UTF-8.
with open(path, encoding="utf-8") as conllu_file:
    data = conllu.parse(conllu_file.read())

In [120]:
# Peek at the first parsed sentence; its repr lists the tokens and the metadata.
sample = data[0]
sample

TokenList<Pinefull, død, for, den, irske, pub, ?, metadata={newdoc id: "ap~20050210-762251", global.Entity: "eid-etype-head-other", sent_id: "016056", text: "Pinefull død for den irske pub?"}>

# -> Jsonlines

For each document, we want...
```
    document_id:    str,
    cased_words:    [str, ...]                # words
    sent_id:        [int, ...]                # word id to sent id
    part_id:        int,                      # document part id
    speaker:        [str, ...]                # word id to speaker
    pos:            [str, ...]                # word id to POS
    deprel:         [str, ...]                # word id to dep. relation
    head:           [int, ...]                # word id to head, None for root
    clusters:       [[[int, int], ...], ...]  # list of clusters, where each
                                                cluster is
                                                a list of spans of words
```


In [121]:
"newdoc id" in data[10].metadata

False

In [122]:
def is_new_doc(sample):
    """Tell whether a sentence opens a new document.

    A new document always starts with "newdoc id" in the metadata field, so
    the check is simply whether that key is present in ``sample.metadata``.
    """
    sentence_metadata = sample.metadata
    return "newdoc id" in sentence_metadata
# Group sentences by document id: a sentence that has "newdoc id" in its
# metadata switches the current document, and every sentence is filed under
# whichever id is current at that point.
from collections import defaultdict
grouped_docs = defaultdict(list)
current_doc = None  # remains None until the first "newdoc id" is seen
for sent in data:
    if is_new_doc(sent):
        current_doc = sent.metadata["newdoc id"]
    grouped_docs[current_doc].append(sent)

In [123]:
# Show the document ids that the grouping produced.
grouped_docs.keys()

dict_keys(['ap~20050210-762251', 'ap~20081210-1411542', 'ap~20081210-1546270', 'ap~20081210-1564010', 'ap~20081210-1775472', 'ap~20090401-3010501', 'ap~20090803-3199497', 'ap~20090805-3202217', 'ap~20090825-3233467', 'ap~20090905-3252356', 'ap~20090911-3262518', 'ap~20091022-3333021', 'bt~BT-20120916-2765289a', 'db~20081117-3745306', 'db~20081118-3754590', 'db~20081118-3758669', 'db~20081118-3759012', 'db~20081128-3858534a', 'db~20081202-3901555', 'db~20081206-3954583', 'kk~20110826-59215', 'kk~20110827-59218', 'spbm~20050822-508220301', 'spbm~20050822-508220303', 'spbm~20050822-508220304', 'spbm~20050822-508220308', 'spbm~20050822-508220309', 'spbm~20050822-508220311', 'spbm~20050822-508220313', 'vg~VG-20121202-10056280', 'vg~VG-20121211-10071599'])

In [124]:
# Inspect the first token of the first sentence of one document to see its fields.
grouped_docs['ap~20050210-762251'][0][0]

{'id': 1,
 'form': 'Pinefull',
 'lemma': 'pinefull',
 'upos': 'ADJ',
 'xpos': None,
 'feats': {'Definite': 'Ind', 'Degree': 'Pos', 'Number': 'Sing'},
 'head': 2,
 'deprel': 'amod',
 'deps': None,
 'misc': {'name': 'O', 'Entity': '(1'}}

In [125]:
# Scratch cell: try a regex with two capture groups against a sample entity
# string and print each tuple of captured groups that re.findall returns.
import re
string = "(vg~vg_20111003_10039641__2401--1-(vg~vg_20111003_10039641__81112--1-)"
regex = r"[a-zA-Z]+__(\d+)\-\(\S+?__(\d+)"
matches = re.findall(regex, string)

for match in matches:
    print(match)

In [140]:
import re

CONLL_MENTION_PATTERN = re.compile(
    r'(?:\((?P<mono>\d+)\)|\((?P<start>\d+)|(?P<end>\d+)\))')

def compute_mentions(columns):
    """Extract coreference mentions from a sequence of bracketed annotations.

    Each element of ``columns`` is scanned for ``(N)`` (a one-token mention),
    ``(N`` (a mention of chain N opens here) and ``N)`` (the most recently
    opened mention of chain N closes here).

    Returns:
        A list of ``((start, stop), chain_id)`` tuples in the order the
        mentions are completed; ``stop`` is exclusive. Mentions that are
        opened but never closed are not reported.

    Raises:
        IndexError: if a closing bracket has no matching open mention.
    """
    open_starts = defaultdict(list)  # chain id -> stack of open start indices
    found = []

    for index, cell in enumerate(columns):
        for hit in CONLL_MENTION_PATTERN.finditer(cell):
            kind = hit.lastgroup
            chain_id = int(hit.group(kind))
            if kind == 'mono':
                found.append(((index, index + 1), chain_id))
            elif kind == 'start':
                open_starts[chain_id].append(index)
            elif kind == 'end':
                # Nested mentions of the same chain close innermost-first.
                begin = open_starts[chain_id].pop()
                found.append(((begin, index + 1), chain_id))
            else:
                assert False

    return found

def compute_chains(columns):
    """Group the mentions found in ``columns`` by chain id.

    Mentions come from ``compute_mentions`` with an exclusive stop index; here
    each one is stored as ``(start, end)`` with ``end`` inclusive.

    Returns:
        A list with one entry per chain (in order of first appearance), each
        entry being the list of that chain's ``(start, end)`` spans.
    """
    spans_by_chain = {}
    for (first, after_last), chain_id in compute_mentions(columns):
        spans_by_chain.setdefault(chain_id, []).append((first, after_last - 1))
    return list(spans_by_chain.values())

def get_coref_clusters_from_doc(doc):
    """Build coreference clusters from the ``Entity`` annotations of a document.

    Args:
        doc: sequence of sentences; each sentence is an iterable of tokens
            offering a dict-style ``get``. The annotation is read from
            ``token["misc"]["Entity"]``.

    Returns:
        The result of ``compute_chains`` on the flattened, document-level
        annotation column: one list of spans per coreference chain.
    """
    entity_column = []
    for sent in doc:
        for token in sent:
            # The MISC field may be missing or None (an empty "_" column is
            # parsed to None, like xpos/deps), so fall back to an empty dict
            # instead of failing on None.get(...).
            token_misc = token.get("misc") or {}
            # "*" marks a token without any mention boundary.
            entity_column.append(token_misc.get("Entity") or "*")

    return compute_chains(entity_column)

# Try the cluster extraction on a single document.
ents = get_coref_clusters_from_doc(grouped_docs['ap~20050210-762251'])

# convert all the entity data in the misc column

In [141]:


def parse_doc(doc, part_id=None):
    """Convert the sentences of one document into a jsonlines-style record.

    Args:
        doc: non-empty list of parsed sentences that belong to a single
            document id. The first sentence must carry "newdoc id" in its
            metadata; each token must provide "id", "form", "upos", "deprel"
            and "head".
        part_id: value stored as-is under "part_id" (e.g. a document counter).

    Returns:
        dict with the keys document_id, cased_words, sent_id, part_id,
        speaker, pos, deprel, head and clusters. All per-word lists have one
        entry per token of the document.

    Raises:
        KeyError: if the first sentence has no "newdoc id" metadata.
    """
    doc_id = doc[0].metadata["newdoc id"]

    cased_words = []
    sent_id = []  # word index -> sentence index, starting from 0
    pos = []
    deprel = []
    head = []

    offset = 0  # document-level index of the first token of the current sentence
    for sent_index, sent in enumerate(doc):
        # Map the sentence-local token ids to document-level word indices.
        local_to_global = {
            token["id"]: offset + position for position, token in enumerate(sent)
        }
        for token in sent:
            cased_words.append(token["form"])
            sent_id.append(sent_index)
            pos.append(token["upos"])
            deprel.append(token["deprel"])
            # The root's head (0) and a missing head are not token ids, so the
            # lookup yields None for them, as the target format requires.
            head.append(local_to_global.get(token["head"]))
        offset += len(sent)

    # No speaker information is available: use one constant placeholder per word.
    speaker = [0] * len(cased_words)

    # Group the coreference annotations of the whole document into clusters.
    clusters = get_coref_clusters_from_doc(doc)

    record = {
        "document_id":      doc_id,
        "cased_words":      cased_words,
        "sent_id":          sent_id,
        "part_id":          part_id,
        "speaker":          speaker,
        "pos":              pos,
        "deprel":           deprel,
        "head":             head,
        "clusters":         clusters
    }
    return record

# Run the conversion on one document (part_id is left at its default).
doc_id = 'ap~20050210-762251'
doc = grouped_docs[doc_id]

parsed = parse_doc(doc)

In [146]:
tokens = parsed["cased_words"]
clusters = parsed["clusters"]

# Print the span and the words of every mention, one block per cluster.
for cluster in clusters:
    if len(cluster) == 1:
        continue  # skip clusters that have a single mention
    for m1, m2 in cluster:
        print(m1, m2)
        # m2 is an inclusive end index (compute_chains stores stop - 1), hence the +1.
        print(tokens[m1: m2 + 1])

    print("__")

13 14
['Micheál', 'Martin']
20 20
['Helseministeren']
238 240
['Denne', 'Michea´l', 'Martin']
253 253
['Han']
693 694
['Michea´l', 'Martin']
__
22 23
['totalt', 'røykeforbud']
220 220
['Røykeforbudet']
236 236
['det']
320 323
['sitt', 'forbud', 'mot', 'røyking']
329 331
['den', 'nye', 'loven']
487 489
['den', 'nye', 'røykeloven']
512 512
['Dette']
533 534
['denne', 'loven']
571 572
['dette', 'forbudet']
627 627
['Røykeforbudet']
639 639
['det']
668 668
['Forbudet']
713 715
['den', 'nye', 'røykeloven']
750 750
['forbudet']
798 798
['dette']
905 905
['forbudet']
__
25 25
['pubene']
314 314
['pubene']
581 583
['pubene', 'i', 'Irland']
853 853
['pubene']
__
35 35
['Vi']
72 72
['oss']
74 74
['vi']
172 172
['vi']
180 180
['vi']
190 190
['oss']
211 211
['vårt']
412 412
['Vi']
585 585
['Vi']
601 601
['vi']
623 623
['oss']
__
40 41
["Mulligan's", 'pub']
47 47
["Mulligan's"]
113 113
['puben']
168 168
["Mulligan's"]
176 177
['dette', 'stedet']
185 185
["Mulligan's"]
__
38 41
['døren', 'til', "Mul

In [38]:
# now, we need to convert the data to the jsonlines format:
def make_json(docs):
    """Return an empty record template with the jsonlines field names.

    ``docs`` is accepted but not read: the result is always a fresh dict whose
    "document_id" is None and whose remaining fields are empty lists.
    """
    list_fields = (
        "cased_words",
        "sent_id",
        "part_id",
        "speaker",
        "pos",
        "deprel",
        "head",
        "clusters",
    )
    template = {"document_id": None}
    for field in list_fields:
        template[field] = []  # a separate list object per field
    return template