In [1]:
import os
import sys
import subprocess

In [82]:
# udtools_path = "C:/Users/tollef/Documents/Git/UDTOOLS/"
udtools_path = "/Users/tollef/Downloads/git/PHD/COREF/UniversalDepTools/"
validate_path = os.path.join(udtools_path, "validate.py")
# ud_narc_path = "NARC/UD_NARC_MERGED_bokmaal"
ud_narc_path = "NARC/UD_NARC_MERGED_nynorsk"

In [83]:
from tqdm import tqdm
from collections import defaultdict

all_results = {}

for split_folder in os.listdir(ud_narc_path):
    doc_results = {}

    split_path = os.path.join(ud_narc_path, split_folder)
    print(f"On split {split_folder}...")
    for file in tqdm(os.listdir(split_path)):
        if ".conllu" not in file:
            continue
        narc_file = os.path.join(split_path, file)
        doc = file.split(".")[0]

        # run the validate script and get the outputs!
        output = subprocess.run(["python", validate_path, "--lang", "no", "--coref", "--level", "2", narc_file], capture_output=True)
        # get stderr and stdout
        output = output.stderr.decode("utf-8")  # everything is in stderr
        stdout = output.split("\n")
        doc_results[doc] = stdout

    all_results[split_folder] = doc_results

On split test...


100%|██████████| 31/31 [00:03<00:00,  7.85it/s]


On split train...


100%|██████████| 342/342 [00:42<00:00,  8.10it/s]


On split dev...


100%|██████████| 25/25 [00:03<00:00,  7.53it/s]


In [84]:
err_docs = []
errors = []
for split, doc_results in all_results.items():
    coref_errs = defaultdict(list)
    warnings = defaultdict(list)
    passed = []

    print(f"Stats for {split} split:")
    for doc, results in doc_results.items():
        for line in results:
            if "PASSED" in line:
                passed.append(doc)
            elif "Coref" in line:
                coref_errs[doc].append(line)
            elif "Warning" in line:
                warnings[doc].append(line)
    print(f"{len(coref_errs)} documents with coref errors")
    print(f"{len(warnings)} documents with warnings")
    print(f"{len(passed)} documents passed")

    print(f"Should remove the following documents from {split_folder} split:")
    print(coref_errs.keys())
    errors.append(coref_errs)
    err_docs.extend(coref_errs.keys())
    print("_"*40)

Stats for test split:
4 documents with coref errors
27 documents passed
Should remove the following documents from dev split:
dict_keys(['vtbnn~20090625-4277', 'kknn~20050406-2630', 'firdann~20110118-5455276', 'kknn~20060803-38315'])
________________________________________
Stats for train split:
20 documents with coref errors
322 documents passed
Should remove the following documents from dev split:
dict_keys(['vtbnn~20030930-1531', 'vtbnn~20020413-809', 'kknn~20050628-5798', 'kknn~20100628-57640', 'firdann~20110902-5720916', 'vtbnn~20040401-1780', 'dot~20110930-2119', 'dot~20060901-1348', 'dot~20111021-2134', 'kknn~20050201-844', 'dot~20110923-2116', 'mom~mom_002', 'dot~20051126-525', 'dot~20110916-2111', 'vtbnn~20030902-1488', 'firdann~20100415-5072393', 'kknn~20110825-59206', 'dot~20111021-2135', 'dot~20060901-845', 'firdann~20100305-5006815'])
________________________________________
Stats for dev split:
2 documents with coref errors
23 documents passed
Should remove the following

In [85]:
import re
entity_pattern = re.compile(r"__T\d+|__\d+")
between_apos = re.compile(r"'(.*?)'")
def entity_filter(ent):
    ent = ent.replace("(", "").replace(")", "")
    ent = ent.replace("--1-", "")
    # remove anything between brackets: []
    ent = re.sub(r"\[.*?\]", "", ent)
    return ent

filtered_errors = []
for error in errors:
    for doc, coreferr in error.items():
        for _err in coreferr:
            candidates = between_apos.findall(_err)
            candidates = [entity_filter(c) for c in candidates if "__" in c]
            if len(candidates) > 0:
                filtered_errors.extend(candidates)

filtered_errors = list(set(filtered_errors))
with_splits = []
for fe in filtered_errors:
    if "<" in fe:
        all_splits = fe.split(",")
        for _split in all_splits:
            start, end = _split.split("<")
            with_splits.append(start)
            with_splits.append(end)
    else:
        with_splits.append(fe)
with_splits

['kknn~20060803_38315__T365',
 'kknn~20050201_844__T91',
 'dot~20110923_2116__T18',
 'kknn~20100628_57640__T269',
 'kknn~20050628_5798__T35',
 'vtbnn~20090625_4277__T7',
 'vtbnn~20040401_1780__3256',
 'firdann~20110118_5455276__T9',
 'firdann~20100305_5006815__269',
 'vtbnn~20070403_3234__T254',
 'firdann~20110902_5720916__T111',
 'dot~20110930_2119__T111',
 'vtbnn~20030930_1531__T315',
 'vtbnn~20020413_809__T146',
 'dot~20060901_1348__T278',
 'dot~20110916_2111__T11',
 'vtbnn~20030902_1488__91488',
 'vtbnn~20030902_1488__3719',
 'firdann~20100305_5006815__13802',
 'dot~20111021_2135__217',
 'kknn~20050406_2630__T22',
 'mom~mom_002__T58',
 'firdann~20100415_5072393__T41',
 'dot~20060901_845__T171',
 'vtbnn~20090625_4277__T8',
 'vtbnn~20090625_4277__T10',
 'vtbnn~20030902_1488__91488',
 'vtbnn~20030902_1488__3719',
 'vtbnn~20030902_1488__91488',
 'vtbnn~20030902_1488__3719',
 'dot~20051126_525__14941',
 'kknn~20110825_59206__T182',
 'kknn~20110825_59206__T52',
 'vtbnn~20090623_4272__T25

In [86]:
# write these to a file: nynorsk_invalid_entities.txt
with open("invalid_entities.txt", "a") as f:
    # newline separated list above:
    f.writelines("\n".join(sorted(with_splits)))
    f.writelines("\n")

In [50]:
'vtbnn~20030902_1488__91488<vtbnn~20030902_1488__3719'.split("<")

['vtbnn~20030902_1488__91488', 'vtbnn~20030902_1488__3719']

# results before change 02.02.23:
 2 (test), 10 (train), 1 (dev)
 
 0, 3, 0

In [9]:
sorted(err_docs)

['dot~20051126-525',
 'dot~20110916-2111',
 'dot~20111021-2135',
 'dot~20111104-2146',
 'vtbnn~20030902-1488',
 'vtbnn~20040401-1780']

In [10]:
# if bokmaal:
# with open("invalid_docs.txt", "w") as f:
#     f.writelines("\n".join(sorted(err_docs)))

with open("invalid_docs.txt", "a") as f:
    f.write("\n")
    f.writelines("\n".join(sorted(err_docs)))