In [1]:
from conllu import parse

# From manual checks:
NARC v0.6:

- 357 bokmål files
- 401 nynorsk files

In [40]:
import os

In [41]:
e1 = '(vg~vg_20111003_10039641__2401--1-(vg~vg_20111003_10039641__81112--1-)'
e2 = 'vg~vg_20111003_10039641__T204)vg~vg_20111003_10039641__T203)'

import re
def parse_ent_str(estr):
    cluster_pattern = re.compile(r'__\d+')
    markable_pattern = re.compile(r'__T\d+')
    return cluster_pattern.findall(estr) + markable_pattern.findall(estr)

parse_ent_str(e2)

['__T204', '__T203']

In [58]:
def get_lang_stats(path):
    # path = f"narc-merged/annotations_conll_{lang}"
    total_sents = 0
    total_toks = 0
    total_references = 0
    entities = []
    norne_entities = []
    for conll in os.listdir(path):
        if not conll.endswith(".conllu"):
            continue
        parsed = parse(open(os.path.join(path, conll)).read())
        for sentence in parsed:
            total_sents += 1
            for token in sentence:
                total_toks += 1
                if token["misc"] is not None:
                    if "Entity" in token["misc"]:
                        # count entities by counting "__" as each entity
                        actual_ents = token["misc"]["Entity"]
                        parsed = parse_ent_str(actual_ents)
                        entities.extend(parsed)
                    if "name" in token["misc"]:
                        # filter out all names that are not "O"
                        named_ents = token["misc"]["name"]
                        named_ents = [n for n in named_ents if n != "O"]
                        norne_entities.extend(named_ents)
        
    references = len(entities)
    unique_ents = set(entities)

    norne_unique = set(norne_entities)

    print(f"""
    {path}
    {total_sents} sentences, 
    {total_toks} tokens, 
    {total_toks/total_sents} tokens/sent, 
    {references} references, 
    {len(norne_entities)} norne entities,
    {len(norne_unique)} unique norne entities,
    {len(unique_ents)} unique entities
    """)


In [55]:
for lang in ["bokmaal", "nynorsk"]:
    path = f"narc-merged/OUTPUT/no-narc_{lang}"
    get_lang_stats(path)


    bokmaal: 
    15742 sentences, 
    245515 tokens, 
    15.596175835344937 tokens/sent, 
    111663 references, 
    84826 norne entities,
    15 unique norne entities,
    4961 unique entities
    

    nynorsk: 
    12481 sentences, 
    206660 tokens, 
    16.557968111529526 tokens/sent, 
    93346 references, 
    80243 norne entities,
    15 unique norne entities,
    3472 unique entities
    


In [56]:
for lang in ["bokmaal", "nynorsk"]:
    path = f"narc-merged/annotations_conll_{lang}"
    get_lang_stats(path)


    bokmaal: 
    16531 sentences, 
    259026 tokens, 
    15.669106527130845 tokens/sent, 
    116861 references, 
    0 norne entities,
    0 unique norne entities,
    5098 unique entities
    

    nynorsk: 
    12762 sentences, 
    213222 tokens, 
    16.707569346497415 tokens/sent, 
    95671 references, 
    0 norne entities,
    0 unique norne entities,
    3529 unique entities
    


In [59]:
for lang in ["bokmaal", "nynorsk"]:
    _ud = f"UD_Norwegian-{lang.capitalize()}"
    ud_path = f"../UD/{_ud}"
    get_lang_stats(ud_path)


    ../UD/UD_Norwegian-Bokmaal
    20044 sentences, 
    310221 tokens, 
    15.477000598682897 tokens/sent, 
    0 references, 
    0 norne entities,
    0 unique norne entities,
    0 unique entities
    

    ../UD/UD_Norwegian-Nynorsk
    17575 sentences, 
    301353 tokens, 
    17.14668563300142 tokens/sent, 
    0 references, 
    0 norne entities,
    0 unique norne entities,
    0 unique entities
    


In [60]:
for lang in ["bokmaal", "nynorsk"]:
    _id = "nno" if lang == "nynorsk" else "nob"
    path = f"../NorNE/ud/{_id}"
    get_lang_stats(path)


    ../NorNE/ud/nob
    20045 sentences, 
    310222 tokens, 
    15.476278373659266 tokens/sent, 
    0 references, 
    105162 norne entities,
    15 unique norne entities,
    0 unique entities
    

    ../NorNE/ud/nno
    17575 sentences, 
    301353 tokens, 
    17.14668563300142 tokens/sent, 
    0 references, 
    103829 norne entities,
    15 unique norne entities,
    0 unique entities
    


In [12]:
def stat_loss_from_original(orig: int, new: int) -> float:
    val = (orig - new) / orig
    # as perc:
    return val * 100

stat_loss_from_original(16531, 15742) #bm sent
stat_loss_from_original(259026, 245515) #bm tok

stat_loss_from_original(12762, 12481) #nn sent
stat_loss_from_original(213222, 206660) #nn tok

3.0775435930626296

In [14]:
before_sents = 16531 + 12762
after_sents = 15742 + 12481
loss = before_sents - after_sents
print(loss)


1070


In [15]:
before = 259026 + 213222
after = 245515 + 206660
loss = before - after
print(loss)


20073
