In [1]:
from conllu import parse

# From manual checks:
NARC v1.0:

- 357 bokmål files
- 401 nynorsk files

In [2]:
import os

In [3]:
e1 = '(vg~vg_20111003_10039641__2401--1-(vg~vg_20111003_10039641__81112--1-)'
e2 = 'vg~vg_20111003_10039641__T204)vg~vg_20111003_10039641__T203)'

import re
def parse_ent_str(estr):
    cluster_pattern = re.compile(r'__\d+')
    markable_pattern = re.compile(r'__T\d+')
    return cluster_pattern.findall(estr) + markable_pattern.findall(estr)

parse_ent_str(e2)

['__T204', '__T203']

In [4]:
from collections import defaultdict

def get_lang_stats(path):
    # path = f"narc-merged/annotations_conll_{lang}"
    total_sents = 0
    total_toks = 0
    total_references = 0
    entities = []
    # keep track of all entities along with their count (occurrence)
    entity_counts = defaultdict(int)

    norne_entities = []
    for doc_count, conll in enumerate(os.listdir(path)):
        if not conll.endswith(".conllu"):
            continue
        parsed = parse(open(os.path.join(path, conll), encoding="utf-8").read())
        for sentence in parsed:
            total_sents += 1
            for token in sentence:
                total_toks += 1
                if token["misc"] is not None:
                    if "Entity" in token["misc"]:
                        # count entities by counting "__" as each entity
                        actual_ents = token["misc"]["Entity"]
                        parsed = list(parse_ent_str(actual_ents))
                        for ent in parsed:
                            entity_counts[f"{ent}_{doc_count}"] += 1

                        entities.extend(parsed)
                    if "name" in token["misc"]:
                        # filter out all names that are not "O"
                        named_ents = token["misc"]["name"]
                        named_ents = [n for n in named_ents if n != "O"]
                        norne_entities.extend(named_ents)
        
    references = len(entities)
    unique_ents = set(entities)

    norne_unique = set(norne_entities)

    # find singletons: entities that only occur once
    _singletons = [k for k, v in entity_counts.items() if v == 1]
    _corefs = [k for k, v in entity_counts.items() if v > 1]
    

    print(f"""
    {path}
    {total_sents} sentences, 
    {total_toks} tokens, 
    {total_toks/total_sents} tokens/sent, 
    {references} references, 
    {len(_singletons)} singletons,
    {len(_corefs)} coreferences,
    {len(_singletons) + len(_corefs)} total entities (coref+singleton),
    {len(norne_entities)} norne entities,
    {len(norne_unique)} unique norne entities,
    {len(unique_ents)} unique entities
    """)


# NARC Original results (v1.0)

In [5]:
for lang in ["bokmaal", "nynorsk"]:
    path = f"../output/narc/annotations_conll_{lang}"
    # path = f"../../output/narc/{lang}"
    # path = f"narc-merged/OUTPUT/no-narc_{lang}"
    get_lang_stats(path)


    ../output/narc/annotations_conll_bokmaal
    16461 sentences, 
    257646 tokens, 
    15.651904501549117 tokens/sent, 
    116288 references, 
    18327 singletons,
    36898 coreferences,
    55225 total entities (coref+singleton),
    0 norne entities,
    0 unique norne entities,
    5073 unique entities
    

    ../output/narc/annotations_conll_nynorsk
    12762 sentences, 
    213222 tokens, 
    16.707569346497415 tokens/sent, 
    95671 references, 
    15228 singletons,
    30690 coreferences,
    45918 total entities (coref+singleton),
    0 norne entities,
    0 unique norne entities,
    3529 unique entities
    


# NARC aligned results (information loss!)

In [6]:
for lang in ["bokmaal", "nynorsk"]:
    path = f"../output/aligned/no-narc_{lang}"
    get_lang_stats(path)


    ../output/aligned/no-narc_bokmaal
    15672 sentences, 
    244136 tokens, 
    15.57784583971414 tokens/sent, 
    111091 references, 
    278 singletons,
    6214 coreferences,
    6492 total entities (coref+singleton),
    84410 norne entities,
    15 unique norne entities,
    4934 unique entities
    

    ../output/aligned/no-narc_nynorsk
    12481 sentences, 
    206660 tokens, 
    16.557968111529526 tokens/sent, 
    93346 references, 
    60 singletons,
    4859 coreferences,
    4919 total entities (coref+singleton),
    80243 norne entities,
    15 unique norne entities,
    3472 unique entities
    


In [7]:
for lang in ["bokmaal", "nynorsk"]:
    _ud = f"UD_Norwegian-{lang.capitalize()}"
    ud_path = f"../data/UD/{_ud}"
    get_lang_stats(ud_path)


    ../data/UD/UD_Norwegian-Bokmaal
    20044 sentences, 
    310221 tokens, 
    15.477000598682897 tokens/sent, 
    0 references, 
    0 singletons,
    0 coreferences,
    0 total entities (coref+singleton),
    0 norne entities,
    0 unique norne entities,
    0 unique entities
    

    ../data/UD/UD_Norwegian-Nynorsk
    17575 sentences, 
    301353 tokens, 
    17.14668563300142 tokens/sent, 
    0 references, 
    0 singletons,
    0 coreferences,
    0 total entities (coref+singleton),
    0 norne entities,
    0 unique norne entities,
    0 unique entities
    


# NorNE stats

In [9]:
for lang in ["bokmaal", "nynorsk"]:
    _id = "nno" if lang == "nynorsk" else "nob"
    path = f"../data/norne/ud/{_id}"
    get_lang_stats(path)


    ../data/norne/ud/nob
    20045 sentences, 
    310222 tokens, 
    15.476278373659266 tokens/sent, 
    0 references, 
    0 singletons,
    0 coreferences,
    0 total entities (coref+singleton),
    105162 norne entities,
    15 unique norne entities,
    0 unique entities
    

    ../data/norne/ud/nno
    17575 sentences, 
    301353 tokens, 
    17.14668563300142 tokens/sent, 
    0 references, 
    0 singletons,
    0 coreferences,
    0 total entities (coref+singleton),
    103829 norne entities,
    15 unique norne entities,
    0 unique entities
    
