In [1]:
# JSON-lines file used as the single-document example in the cells below.
sample_path = "../output/narc/annotations_jsonlines_bokmaal/aftenposten_02.jsonl"


In [2]:
import os
import jsonlines

In [3]:
# Read the sample file and keep the annotations of its first record only.
# If the file holds no records, `m` and `c` are never assigned.
with jsonlines.open(sample_path) as reader:
    for obj in reader:
        m, c = obj["markables"], obj["clusters"]
        break  # later records, if any, are ignored

# Report how many markables and clusters the first record contains.
print(f"Found {len(m)} markables in {sample_path}")
print(f"Found {len(c)} coreference clusters in {sample_path}")

Found 132 markables in ../output/narc/annotations_jsonlines_bokmaal/aftenposten_02.jsonl
Found 26 coreference clusters in ../output/narc/annotations_jsonlines_bokmaal/aftenposten_02.jsonl


In [4]:
from collections import defaultdict
def reverse_markables(markables):
    """Index markable keys by the boundary positions of their first span.

    Args:
        markables: Mapping from markable key to a sequence whose first item
            is a two-element ``(start, end)`` pair. Only that first item is
            read.

    Returns:
        A ``defaultdict(list)`` mapping each start or end value to the keys
        of the markables that begin or finish there, in iteration order.
        A markable whose start equals its end is listed once.
    """
    by_position = defaultdict(list)
    for markable_id, spans in markables.items():
        first, last = spans[0]
        # Register the end boundary separately only when it differs from
        # the start, so single-position markables are not listed twice.
        boundaries = (first, last) if first != last else (first,)
        for position in boundaries:
            by_position[position].append(markable_id)
    return by_position

    
# Index of the sample's markable keys, keyed by start/end position.
_m = reverse_markables(m)

def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one list.

    Only one level of nesting is removed; the order of items is preserved.
    """
    merged = []
    for sublist in l:
        merged.extend(sublist)
    return merged

# Flatten the position -> markable-key index into a single list.
# A markable whose start and end differ is stored under both positions,
# so it appears twice in this list.
flat_vals = flatten(_m.values())
len(flat_vals)

207

In [5]:
# markables whose first span starts and ends at the same index (start == end):
singletons = [k for k, v in m.items() if v[0][0] == v[0][1]]
len(singletons)

57

In [6]:
from collections import defaultdict

def update_stats(stats, data):
    """Add one document's annotation counts to the running totals in ``stats``.

    Args:
        stats: Mapping from counter name to int. It is updated in place with
            ``+=``, so missing counters must default to 0 (the callers in
            this notebook pass a ``defaultdict(int)``).
        data: One parsed document. ``data["markables"]`` is a mapping in the
            form accepted by ``reverse_markables``; ``data["references"]`` is
            a mapping from relation name to a sequence of two-item links.

    Returns:
        The same ``stats`` object that was passed in.

    Raises:
        KeyError: If ``data`` has no "markables" or "references" entry.
    """
    markables = data["markables"]
    # NOTE(review): this is the number of distinct start/end positions
    # returned by reverse_markables, not a count of coreference clusters --
    # confirm that this is the intended meaning of "Entities".
    stats["MiscStats-Entities"] += len(reverse_markables(markables))
    stats["Stats-Mentions"] += len(markables)

    refs = data["references"]
    # "Coref" links feed no counter, so they are not read here. Only the
    # optional relation types below are tallied, each as the number of
    # distinct first elements among its links.
    optional_relations = (
        ("Split_antecedent", "MiscStats-SplitAnte"),
        ("Bridging", "MiscStats-Bridge"),
    )
    for relation, counter in optional_relations:
        if relation in refs:
            stats[counter] += len({first for first, _second in refs[relation]})

    return stats
    

def get_stats():
    """Compute annotation statistics for the first record of ``sample_path``.

    NOTE(review): a later cell defines ``get_stats(folder)`` under the same
    name, which replaces this zero-argument version once that cell has run.

    Returns:
        A ``defaultdict(int)`` of counters as filled in by ``update_stats``.
        It is empty when the file contains no records.
    """
    stats = defaultdict(int)
    with jsonlines.open(sample_path) as reader:
        # Stop after the first record instead of loading the whole file.
        for data in reader:
            stats = update_stats(stats, data)
            break

    return stats



In [7]:
# load sample:
# iterate and add markables, corefs, bridges, splits
from collections import defaultdict

def get_stats(folder):
    """Aggregate annotation statistics over the JSON-lines files in ``folder``.

    Args:
        folder: Path of a directory. Every entry whose name contains
            "jsonl" is opened with ``jsonlines``.

    Returns:
        A ``defaultdict(int)`` holding the counters maintained by
        ``update_stats``, summed over the first record of each matching
        file. Files without any record contribute nothing.
    """
    stats = defaultdict(int)
    # os.listdir returns entries in arbitrary order; sorting makes the order
    # in which files are visited (and counters first created) reproducible.
    for filename in sorted(os.listdir(folder)):
        if "jsonl" not in filename:
            continue
        with jsonlines.open(os.path.join(folder, filename)) as reader:
            # Only the first record of each file is counted; stop reading
            # as soon as it has been processed.
            for data in reader:
                stats = update_stats(stats, data)
                break
    return stats

# Aggregate statistics over the Bokmaal JSON-lines annotations.
get_stats("../output/narc/annotations_jsonlines_bokmaal/")

defaultdict(int,
            {'MiscStats-Entities': 92950,
             'Stats-Mentions': 77575,
             'MiscStats-Bridge': 1060,
             'MiscStats-SplitAnte': 157})

In [8]:
# Aggregate statistics over the Nynorsk JSON-lines annotations.
get_stats("../output/narc/annotations_jsonlines_nynorsk/")

defaultdict(int,
            {'MiscStats-Entities': 76298,
             'Stats-Mentions': 63152,
             'MiscStats-Bridge': 868,
             'MiscStats-SplitAnte': 88})

# sentence counter

In [9]:
from conllu import parse
import os

# Per language, total the parsed sentences and their lengths across the
# NARC CoNLL exports and print one summary line.
for lang in ["bokmaal", "nynorsk"]:
    path = "../output/narc/annotations_conll_" + lang
    sents = 0
    tokens = 0
    for conll in os.listdir(path):
        # Only directory entries whose name contains "conll" are parsed.
        if "conll" in conll:
            with open(os.path.join(path, conll), encoding="utf-8") as f:
                data = f.read()
            parsed = parse(data)
            # len(parsed) is added to the sentence total, len(s) of each
            # parsed item to the token total.
            sents += len(parsed)
            tokens += sum(len(s) for s in parsed)
    print(lang, sents, tokens)

bokmaal 16461 257646
nynorsk 12762 213222


In [10]:
from conllu import parse
import os

# Same sentence/token totals as above, computed over the UD Norwegian
# treebank directories for comparison.
for lang in ["bokmaal", "nynorsk"]:
    path = "../data/UD/UD_Norwegian-" + lang.capitalize()
    sents = 0
    tokens = 0
    for conll in os.listdir(path):
        # Only directory entries whose name contains "conll" are parsed.
        if "conll" in conll:
            with open(os.path.join(path, conll), encoding="utf-8") as f:
                data = f.read()
            parsed = parse(data)
            # len(parsed) is added to the sentence total, len(s) of each
            # parsed item to the token total.
            sents += len(parsed)
            tokens += sum(len(s) for s in parsed)
    print(lang, sents, tokens)

bokmaal 20044 310221
nynorsk 17575 301353
