In [1]:
import os
import random
import sys
from collections import Counter

from tqdm import tqdm
# import functions as f

In [2]:
# Select language
# NOTE: load_corpus only defines corpus paths for "english" and "persian";
# the other options need corpus paths added there before they can be used.
language = "english"
# language = "slovenian"
# language = "romanian"
# language = "persian"

# Select number of sentences for each corpus
# (one entry per partition size; partitions are written for every size listed)
# num_sents = [100, 500, 1000, 2000]
num_sents = [125, 250]

In [3]:
def load_corpus(language, split="dev"):
    """
    Resolves the CoNLL-U file paths of corpus A and corpus B for a language.

    Despite its name, this function does not read any file: it only builds
    the paths, which are opened later by the partitioning code.

    Parameters
    ----------
    language : str
        Either "english" or "persian".
    split : str, optional
        Which split to return, "dev" (default) or "train".

    Returns
    -------
    tuple of (str, str)
        Path of corpus A and path of corpus B for the requested split.

    Raises
    ------
    ValueError
        If `language` or `split` is not supported. (Previously an unknown
        language printed a hint and then crashed with UnboundLocalError.)
    """
    # For each language: (directory, file-name template) of corpus A and
    # corpus B. "{}" is replaced by the split name.
    corpora = {
        "english": (
            # GUM (Georgetown University Multilayer) corpus from UD website
            ("data/ud-en-gum/", "en_gum-ud-{}.conllu"),
            # wsj corpus with different conventions
            # converted from Stanford dependencies (?)
            ("data/wsj-DIFF-CONVENTIONS/", "{}.conllu"),
        ),
        "persian": (
            # PerDT corpus
            ("data/PerDT/", "fa_perdt-ud-{}-removeLines.conllu"),
            # original seraji
            ("data/original_seraji/", "fa_seraji-ud-{}-removeLines.conllu"),
            # A third Persian corpus (new seraji) lives at
            # "data/new_seraji/fa_newseraji-ud-{}-removeLines.conllu".
            # It is intentionally not returned so that callers keep
            # receiving a 2-tuple.
        ),
    }
    splits = ("train", "dev")

    if language not in corpora:
        raise ValueError(
            f"Please select a language! Got {language!r}, "
            f"expected one of {sorted(corpora)}."
        )
    if split not in splits:
        raise ValueError(f"Unknown split {split!r}, expected one of {list(splits)}.")

    print(f"Loading {language} corpora...")
    path_A, path_B = (
        directory + template.format(split)
        for directory, template in corpora[language]
    )
    print("Corpora loaded!")

    return path_A, path_B

In [4]:
# Resolve the dev-set file paths of corpus A and corpus B for the selected language
corpus_A, corpus_B = load_corpus(language)

Loading english corpora...
Corpora loaded!


In [5]:
def create_training_partitions(ud_file, ab, num_sents, lang=None,
                               out_dir="dev_data", num_partitions=5):
    """
    Processes UD data into randomly sampled partitions of a fixed size.

    The CoNLL-U file is split into sentences (lines starting with "#" are
    dropped, blank lines separate sentences). For every size in `num_sents`,
    `num_partitions` partitions are written to
    `<out_dir>/<lang>_dev_corpus=<ab>_sents=<size>_seed=<seed>.txt`,
    with `seed` running from 1 to `num_partitions`.

    Each partition is drawn with its own `random.Random(seed)`, so the file
    content really is determined by the seed in its name and re-running the
    cell reproduces the same files. Because the seed depends only on the
    repetition index, the partition of a smaller size is a prefix of the
    larger partition written with the same seed.

    Parameters
    ----------
    ud_file : str
        Path of the CoNLL-U file to sample from.
    ab : str
        Corpus label used in the log message and the file name (e.g. "A").
    num_sents : iterable of int
        Partition sizes. If the corpus holds fewer sentences than a size,
        the partition contains all available sentences.
    lang : str, optional
        Language name used in the log message and the file name. Defaults
        to the notebook-level `language` variable.
    out_dir : str, optional
        Directory the partitions are written to; created if missing.
    num_partitions : int, optional
        Number of partitions (seeds) written per size.
    """
    if lang is None:
        # Fall back to the selection made in the notebook's configuration cell.
        lang = language

    # get list of sentences
    sentences = []
    sentence = []
    # UD treebanks are UTF-8; do not rely on the platform default encoding.
    with open(ud_file, encoding="utf-8") as infile:
        for raw_line in infile:
            line = raw_line.rstrip("\n")
            if line.startswith("#"):
                continue
            if not line.strip():
                # Skip repeated blank lines instead of storing empty sentences.
                if sentence:
                    sentences.append(sentence)
                    sentence = []
            else:
                sentence.append(line.split("\t"))
    # Keep the last sentence when the file does not end with a blank line.
    if sentence:
        sentences.append(sentence)

    os.makedirs(out_dir, exist_ok=True)

    for size in num_sents:
        for seed in range(1, num_partitions + 1):
            print(f"Creating dev partition {seed} of {size} sentences for {lang} corpus {ab}.")
            seed_filename = os.path.join(
                out_dir, f"{lang}_dev_corpus={ab}_sents={size}_seed={seed}.txt"
            )
            # Shuffle a copy with a dedicated, seeded generator: the result
            # neither depends on earlier shuffles nor touches global RNG state.
            shuffled = list(sentences)
            random.Random(seed).shuffle(shuffled)
            with open(seed_filename, "w", encoding="utf-8") as outfile:
                for sampled_sentence in shuffled[:size]:
                    for fields in sampled_sentence:
                        outfile.write("\t".join(fields) + "\n")
                    # blank line terminates the sentence
                    outfile.write("\n")

In [6]:
# Write the sampled partitions for corpus A and corpus B
# (one file per requested size and repetition).
create_training_partitions(corpus_A, "A", num_sents)
create_training_partitions(corpus_B, "B", num_sents)
# A third corpus ("C") is not returned by load_corpus, so this call stays disabled.
# create_training_partitions(corpus_C, "C", num_sents)

Creating dev partition 1 of 125 sentences for english corpus A.
Creating dev partition 2 of 125 sentences for english corpus A.
Creating dev partition 3 of 125 sentences for english corpus A.
Creating dev partition 4 of 125 sentences for english corpus A.
Creating dev partition 5 of 125 sentences for english corpus A.
Creating dev partition 1 of 250 sentences for english corpus A.
Creating dev partition 2 of 250 sentences for english corpus A.
Creating dev partition 3 of 250 sentences for english corpus A.
Creating dev partition 4 of 250 sentences for english corpus A.
Creating dev partition 5 of 250 sentences for english corpus A.
Creating dev partition 1 of 125 sentences for english corpus B.
Creating dev partition 2 of 125 sentences for english corpus B.
Creating dev partition 3 of 125 sentences for english corpus B.
Creating dev partition 4 of 125 sentences for english corpus B.
Creating dev partition 5 of 125 sentences for english corpus B.
Creating dev partition 1 of 250 sentence