In [None]:
import json
import csv
import os
import sys


# Some input files contain fields longer than the csv module's default maximum
# field size, so raise that limit as far as the platform allows.
# sys.maxsize does not fit into a C long on every platform (e.g. 64-bit
# Windows); csv.field_size_limit then raises OverflowError, so halve the value
# until it is accepted.
_csv_field_size_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_csv_field_size_limit)
        break
    except OverflowError:
        _csv_field_size_limit //= 2

In [None]:
def process_conllu_data(conllu_file_path):
    """Count the token lines of a single CoNLL-U file.

    Every non-empty row whose first tab-separated field does not start with
    "#" is counted; blank sentence separators and "#" comment lines are
    skipped.

    NOTE(review): multiword-token range lines (ID like "1-2") and empty nodes
    (ID like "1.1") are counted as well if the file contains them -- confirm
    that this is the intended definition of "token".

    Args:
        conllu_file_path: path of the CoNLL-U file to read.

    Returns:
        dict with the single key "token_count" mapping to the number of
        counted rows (int).
    """
    token_count = 0

    # CoNLL-U is UTF-8 text without any quoting mechanism: a double quote is
    # an ordinary character and frequently a token of its own. With the csv
    # default quotechar such a token opens a "quoted field" that swallows the
    # following tabs and line breaks up to the next double quote, which merges
    # several token lines into one row and undercounts. QUOTE_NONE disables
    # that; newline="" leaves line-ending handling to the csv reader.
    with open(conllu_file_path, "r", encoding="utf-8", newline="") as f:
        for row in csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE):
            if row and not row[0].startswith("#"):
                token_count += 1

    return {"token_count": token_count}

In [None]:
input_root_folder = "/veld/input/data/"
output_root_folder = "/veld/output/data/"
conllu_suffix = ".conllu"

# make sure the output folder exists before the first corpus result is written
os.makedirs(output_root_folder, exist_ok=True)

# main loop over all eltec corpora; sorted so the processing order is reproducible
for sub_folder in sorted(os.listdir(input_root_folder)):

    # create data on corpus
    eltec_corpus_id = sub_folder
    input_conllu_sub_folder = os.path.join(input_root_folder, sub_folder, "level1")

    # stray files or folders without a level1 directory would make os.listdir
    # below raise; report and skip them instead of aborting the whole run
    if not os.path.isdir(input_conllu_sub_folder):
        print(f"skipping {sub_folder}: no folder {input_conllu_sub_folder}")
        continue

    print(f"processing eltec folder: {input_conllu_sub_folder}")
    output_conllu_json = []

    # loop over files of corpus
    for sub_file in sorted(os.listdir(input_conllu_sub_folder)):

        # anything that is not a conllu file would yield meaningless counts
        # and a resource uri that does not exist
        if not sub_file.endswith(conllu_suffix):
            continue

        # create data on file; strip only the trailing suffix, not every
        # occurrence of ".conllu" inside the file name
        eltec_resource_id = sub_file[: -len(conllu_suffix)]
        input_conllu_file = os.path.join(input_conllu_sub_folder, sub_file)

        # create stats
        conllu_stats = process_conllu_data(input_conllu_file)

        # append results to json data carrier
        output_conllu_json.append({
            "resource_uri": f"https://raw.githubusercontent.com/COST-ELTeC/{eltec_corpus_id}/master/level1/{eltec_resource_id}.xml",
            "conllu_stats": conllu_stats,
        })

    # write data of corpus into json file (one file per corpus, even if the
    # corpus contained no conllu files)
    output_json_file = os.path.join(output_root_folder, f"{eltec_corpus_id}.json")
    print(f"writing results to {output_json_file}")
    with open(output_json_file, "w", encoding="utf-8") as f:
        json.dump(output_conllu_json, f, indent=4)