In [None]:
import json
import csv
import os
import sys


# Some tsv inputs were too big for the csv module's default per-field size limit, so the
# limit is raised to sys.maxsize. Note that this only raises the maximum field length the
# parser accepts; it does not pre-allocate any memory.
csv.field_size_limit(sys.maxsize)

In [None]:
# Root folder of the input data. Its entries are the ELTeC corpus folders; the conllu files
# of a corpus are read from that corpus' `level1` sub-folder.
INPUT_TXT_FOLDER = "/veld/input/"
# Folder into which one `<corpus id>.json` statistics file per corpus is written.
OUTPUT_JSON_FOLDER = "/veld/output/"

In [None]:
# Set to True for a quick trial run: processing is then limited to a few corpora and a few
# files per corpus, the per-file statistics are printed, and no JSON output is written.
IS_TEST_RUN = False

In [None]:
def process_conllu_data(conllu_file_path, pos_set_not_existing, feat_set_not_existing):
    """Compute token, lemma, part-of-speech and feature statistics for one CoNLL-U file.

    Every row that has exactly 10 tab-separated columns and whose first column does not
    start with "#" is counted as one token. Column 3 (LEMMA) feeds the set of distinct
    lemmas, column 4 (UPOS) the part-of-speech counters and column 6 (FEATS) the feature
    counters. A value of "_" in UPOS or FEATS means "unspecified" and is not counted.

    Args:
        conllu_file_path: path of the CoNLL-U file to read (decoded as UTF-8).
        pos_set_not_existing: set collecting UPOS values that are not keys of
            "count_pos"; it is updated in place.
        feat_set_not_existing: set collecting feature names that are not keys of
            "count_feat"; it is updated in place.

    Returns:
        A tuple `(result_dict, pos_set_not_existing, feat_set_not_existing)`, where
        `result_dict` has the structure defined below and the two sets are the very
        objects that were passed in.
    """

    # returned dictionary, defined here for communication of its structure
    # data points from universal dependencies are defined here: https://universaldependencies.org/format.html
    result_dict = {
        "count_token": 0,
        "count_lemma_total": 0,
        "count_lemma_normalized_by_token": 0,
        # part of speech, as defined here: https://universaldependencies.org/u/pos/index.html
        "count_pos": {
            "ADJ": 0,
            "ADP": 0,
            "ADV": 0,
            "AUX": 0,
            "CCONJ": 0,
            "DET": 0,
            "INTJ": 0,
            "NOUN": 0,
            "NUM": 0,
            "PART": 0,
            "PRON": 0,
            "PROPN": 0,
            "PUNCT": 0,
            "SCONJ": 0,
            "SYM": 0,
            "VERB": 0,
            "X": 0,
        },
        # universal features, as defined here: https://universaldependencies.org/u/feat/index.html
        "count_feat": {
            # Lexical features
            "PronType": 0,
            "NumType": 0,
            "Poss": 0,
            "Reflex": 0,
            "Foreign": 0,
            "Abbr": 0,
            "Typo": 0,
            # Inflectional features, Nominal
            "Gender": 0,
            "Animacy": 0,
            "NounClass": 0,
            "Number": 0,
            "Case": 0,
            "Definite": 0,
            "Deixis": 0,
            "DeixisRef": 0,
            "Degree": 0,
            # Inflectional features, Verbal
            "VerbForm": 0,
            "Mood": 0,
            "Tense": 0,
            "Aspect": 0,
            "Voice": 0,
            "Evident": 0,
            "Polarity": 0,
            "Person": 0,
            "Polite": 0,
            "Clusivity": 0,
        },
    }

    count_pos = result_dict["count_pos"]
    count_feat = result_dict["count_feat"]
    lemma_set = set()

    # CoNLL-U is UTF-8 by specification, so do not depend on the locale encoding.
    # newline="" is what the csv module asks for when it is handed a file object.
    with open(conllu_file_path, "r", encoding="utf-8", newline="") as f:
        # CoNLL-U has no quoting mechanism: a `"` is an ordinary character (typically a
        # punctuation token). With the csv default quoting, a field starting with `"` would
        # swallow the following tabs / line breaks up to the next quote, so such rows would
        # be malformed and silently dropped. QUOTE_NONE turns that behaviour off.
        reader = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE)

        # main loop over rows, create statistics
        for row in reader:
            # skip blank lines, comment lines and anything that is not a 10-column row
            # NOTE(review): multiword-token range rows (ID like "1-2") and empty-node rows
            # (ID like "1.1") also have 10 columns and are therefore counted as tokens,
            # exactly as before -- confirm that this is intended.
            if len(row) != 10 or row[0].startswith("#"):
                continue

            # token
            result_dict["count_token"] += 1

            # lemma
            lemma_set.add(row[2])

            # part of speech
            pos = row[3]
            if pos != "_":
                if pos in count_pos:
                    count_pos[pos] += 1
                else:
                    pos_set_not_existing.add(pos)

            # universal features, encoded as "Name=Value|Name=Value|..."
            feat_all = row[5]
            if feat_all != "_":
                for feat_pair in feat_all.split("|"):
                    feat = feat_pair.split("=")[0]
                    if feat in count_feat:
                        count_feat[feat] += 1
                    else:
                        feat_set_not_existing.add(feat)

    # count lemmas from set and normalize
    result_dict["count_lemma_total"] = len(lemma_set)
    if result_dict["count_token"] != 0:
        result_dict["count_lemma_normalized_by_token"] = round(
            result_dict["count_lemma_total"] / result_dict["count_token"], 4
        )
    else:
        # no tokens: avoid a division by zero and keep the (then also zero) lemma count
        result_dict["count_lemma_normalized_by_token"] = result_dict["count_lemma_total"]

    return result_dict, pos_set_not_existing, feat_set_not_existing

In [None]:
# In a test run only a small sample is processed (at most 2 corpora with at most 3 files
# each). The limits are used as slice bounds below, where `None` means "no limit".
if IS_TEST_RUN:
    i_limit = 2
    j_limit = 3
else:
    i_limit = None
    j_limit = None

# Corpus folders in a deterministic (sorted) order. The `.gitkeep` placeholder is removed
# before the limit is applied, so that it does not use up one of the test-run slots.
corpus_sub_folders = [
    name for name in sorted(os.listdir(INPUT_TXT_FOLDER)) if name != ".gitkeep"
]

# main loop over all eltec corpora
for sub_folder in corpus_sub_folders[:i_limit]:

    # create data on corpus
    eltec_corpus_id = sub_folder
    input_conllu_sub_folder = os.path.join(INPUT_TXT_FOLDER, sub_folder, "level1")

    # stray files or corpora without a `level1` folder are reported and skipped instead of
    # aborting the whole run with an exception from os.listdir
    if not os.path.isdir(input_conllu_sub_folder):
        print(f"skipping {sub_folder}: not a folder: {input_conllu_sub_folder}")
        continue

    print(f"processing eltec folder: {input_conllu_sub_folder}")
    output_conllu_json = []
    # tags / features met in this corpus that the statistics do not have a counter for
    pos_set_not_existing = set()
    feat_set_not_existing = set()

    # loop over files of corpus
    for sub_file in sorted(os.listdir(input_conllu_sub_folder))[:j_limit]:

        # create data on file
        eltec_resource_id = sub_file.replace(".conllu", "")
        input_conllu_file = os.path.join(input_conllu_sub_folder, sub_file)

        # create stats
        conllu_stats, pos_set_not_existing, feat_set_not_existing = process_conllu_data(
            input_conllu_file, pos_set_not_existing, feat_set_not_existing)
        if IS_TEST_RUN:
            print(f"processing eltec conllu file : {input_conllu_file}")
            print(conllu_stats)

        # append results to json data carrier
        output_conllu_json.append({
            "resource_uri": f"https://raw.githubusercontent.com/COST-ELTeC/{eltec_corpus_id}/master/level1/{eltec_resource_id}.xml",
            "conllu_stats": conllu_stats,
        })

    # show missing / wrong data handling
    print(f"non-handled part-of-speech tags: {pos_set_not_existing}")
    print(f"non-handled features: {feat_set_not_existing}")

    if not IS_TEST_RUN:
        # write data of corpus into json file (creating the output folder if needed)
        os.makedirs(OUTPUT_JSON_FOLDER, exist_ok=True)
        output_json_file = os.path.join(OUTPUT_JSON_FOLDER, eltec_corpus_id + ".json")
        print(f"writing results to {output_json_file}")
        with open(output_json_file, "w") as f:
            json.dump(output_conllu_json, f, indent=4)

print("done")