In [1]:
import sys
from tqdm import tqdm
from collections import Counter
import functions as f

# Generate converted train/dev corpora for the selected noise settings.
#
# Noise settings processed by this cell. Alternative settings previously
# listed here: "10_types".."50_types" and "10_tokens".."50_tokens".
noise_type = ["none", "freq"]
# NOTE(review): top_n and threshold are not referenced anywhere in this cell;
# they are kept only in case other code reads them - confirm before removing.
top_n = 20
threshold = 0.0

# Only the vector file *paths* are assigned here; nothing is loaded in this
# cell. The same bilingual vector file is passed for both languages.
# (presumably the file is opened inside f.get_conversions_bilingual_vectors
# - TODO confirm)
print("loading vectors...")
vectors_gd = "bilingual_vectors_scannell.magnitude"
vectors_ga = "bilingual_vectors_scannell.magnitude"

# (split name used in file paths and summaries, label used in the banner)
splits = (("train", "training"), ("dev", "dev"))

# Tag inserted into the output file name, just before the "-ud-<split>" part.
# NOTE(review): the later cells of this notebook (types/tokens noise) write
# "_conversion=bilingual_vectors" without the "_scannell" part; confirm which
# naming the downstream consumers expect.
conversion_tag = "_conversion=bilingual_vectors_scannell"

for noise in noise_type:
    # The train and dev passes are identical apart from the split name, so
    # they share one loop body instead of two copy-pasted sections.
    for split, label in splits:
        print(f"\nGenerating {label} data based on noise_type: {noise}\n")

        # Build every path from one base + suffix so the converted file name
        # no longer depends on hard-coded slice lengths.
        suffix = f"-ud-{split}.conllu"
        base_gd = f"{split}_data/gd/gd_noise={noise}"
        corpus_gd = base_gd + suffix
        # The "ga" corpus is always the noise-free one.
        corpus_ga = f"{split}_data/ga/ga_noise=none{suffix}"

        list_gd, sentences_gd = f.process_training_data(corpus_gd)
        list_ga, sentences_ga = f.process_training_data(corpus_ga)

        # vector conversion only
        conversion_gd = f.get_conversions_bilingual_vectors(
            sentences_gd, list_gd, list_ga, vectors_gd, vectors_ga
        )
        converted_corpus_gd = base_gd + conversion_tag + suffix

        # j: number of entries in the conversion table;
        # i: how many of those entries have a non-empty value.
        j = len(conversion_gd)
        i = sum(
            1 for item in conversion_gd
            if len(list(conversion_gd[item])) != 0
        )

        print(f"Total word pairs:\t{j}")
        print(f"Word pairs w/ other relations in Irish ({split}):\t{i}")

        # Write the converted corpus and report how many lines were changed.
        total_lines, changed = f.apply_conversions_bilingual(
            corpus_gd, converted_corpus_gd, conversion_gd
        )
        print(f"Total lines:\t{total_lines}")
        print(f"Lines changed:\t{changed}")


loading vectors...

Generating training data based on noise_type: none

2,613 sentences processed
51,610 tokens processed
Average sentence length:	19.751243781094526
29,476 head-dependent:relation pairs
30,808 head-dependent-relation:sentence triples
4,005 sentences processed
95,881 tokens processed
Average sentence length:	23.940324594257177
63,769 head-dependent:relation pairs
64,946 head-dependent-relation:sentence triples


100%|███████████████████████████████████| 29476/29476 [4:45:52<00:00,  1.72it/s]


Original word pairs:	29476
New word pairs:	13514
Total word pairs:	29476
Word pairs w/ other relations in Irish (train):	6773


100%|████████████████████████████████████| 2613/2613 [00:00<00:00, 15506.52it/s]


Total lines:	51610
Number of lines changed:	8132
Total lines:	51610
Lines changed:	8132

Generating dev data based on noise_type: none

646 sentences processed
10,409 tokens processed
Average sentence length:	16.113003095975234
7,410 head-dependent:relation pairs
7,590 head-dependent-relation:sentence triples
451 sentences processed
10,000 tokens processed
Average sentence length:	22.172949002217294
8,685 head-dependent:relation pairs
8,760 head-dependent-relation:sentence triples


100%|███████████████████████████████████████| 7410/7410 [07:23<00:00, 16.72it/s]


Original word pairs:	7410
New word pairs:	842
Total word pairs:	7410
Word pairs w/ other relations in Irish (dev):	641


100%|██████████████████████████████████████| 646/646 [00:00<00:00, 13998.28it/s]


Total lines:	10409
Number of lines changed:	653
Total lines:	10409
Lines changed:	653

Generating training data based on noise_type: freq

2,613 sentences processed
51,610 tokens processed
Average sentence length:	19.751243781094526
29,476 head-dependent:relation pairs
29,476 head-dependent-relation:sentence triples
4,005 sentences processed
95,881 tokens processed
Average sentence length:	23.940324594257177
63,769 head-dependent:relation pairs
64,946 head-dependent-relation:sentence triples


100%|███████████████████████████████████| 29476/29476 [5:07:36<00:00,  1.60it/s]


Original word pairs:	29476
New word pairs:	13514
Total word pairs:	29476
Word pairs w/ other relations in Irish (train):	6773


100%|████████████████████████████████████| 2613/2613 [00:00<00:00, 14679.95it/s]


Total lines:	51610
Number of lines changed:	9681
Total lines:	51610
Lines changed:	9681

Generating dev data based on noise_type: freq

646 sentences processed
10,409 tokens processed
Average sentence length:	16.113003095975234
7,410 head-dependent:relation pairs
7,410 head-dependent-relation:sentence triples
451 sentences processed
10,000 tokens processed
Average sentence length:	22.172949002217294
8,685 head-dependent:relation pairs
8,760 head-dependent-relation:sentence triples


100%|███████████████████████████████████████| 7410/7410 [07:41<00:00, 16.07it/s]


Original word pairs:	7410
New word pairs:	842
Total word pairs:	7410
Word pairs w/ other relations in Irish (dev):	641


100%|██████████████████████████████████████| 646/646 [00:00<00:00, 13947.69it/s]

Total lines:	10409
Number of lines changed:	680
Total lines:	10409
Lines changed:	680





In [2]:
# Generate converted train/dev corpora for the type-based noise settings.
#
# Noise settings processed by this cell. Alternative settings previously
# listed here: "none", "freq" and "10_tokens".."50_tokens".
noise_type = ["10_types", "20_types", "30_types", "40_types", "50_types"]
# NOTE(review): top_n and threshold are not referenced anywhere in this cell;
# they are kept only in case other code reads them - confirm before removing.
top_n = 20
threshold = 0.0

# Only the vector file *paths* are assigned here; nothing is loaded in this
# cell. The same bilingual vector file is passed for both languages.
# (presumably the file is opened inside f.get_conversions_bilingual_vectors
# - TODO confirm)
print("loading vectors...")
vectors_gd = "bilingual_vectors_scannell.magnitude"
vectors_ga = "bilingual_vectors_scannell.magnitude"

# (split name used in file paths and summaries, label used in the banner)
splits = (("train", "training"), ("dev", "dev"))

# Tag inserted into the output file name, just before the "-ud-<split>" part.
# NOTE(review): the first cell of this notebook (none/freq noise) tags its
# outputs "_conversion=bilingual_vectors_scannell"; this cell omits
# "_scannell" - confirm which naming the downstream consumers expect.
conversion_tag = "_conversion=bilingual_vectors"

for noise in noise_type:
    # The train and dev passes are identical apart from the split name, so
    # they share one loop body instead of two copy-pasted sections.
    for split, label in splits:
        print(f"\nGenerating {label} data based on noise_type: {noise}\n")

        # Build every path from one base + suffix so the converted file name
        # no longer depends on hard-coded slice lengths.
        suffix = f"-ud-{split}.conllu"
        base_gd = f"{split}_data/gd/gd_noise={noise}"
        corpus_gd = base_gd + suffix
        # The "ga" corpus is always the noise-free one.
        corpus_ga = f"{split}_data/ga/ga_noise=none{suffix}"

        list_gd, sentences_gd = f.process_training_data(corpus_gd)
        list_ga, sentences_ga = f.process_training_data(corpus_ga)

        # vector conversion only
        conversion_gd = f.get_conversions_bilingual_vectors(
            sentences_gd, list_gd, list_ga, vectors_gd, vectors_ga
        )
        converted_corpus_gd = base_gd + conversion_tag + suffix

        # j: number of entries in the conversion table;
        # i: how many of those entries have a non-empty value.
        j = len(conversion_gd)
        i = sum(
            1 for item in conversion_gd
            if len(list(conversion_gd[item])) != 0
        )

        print(f"Total word pairs:\t{j}")
        print(f"Word pairs w/ other relations in Irish ({split}):\t{i}")

        # Write the converted corpus and report how many lines were changed.
        total_lines, changed = f.apply_conversions_bilingual(
            corpus_gd, converted_corpus_gd, conversion_gd
        )
        print(f"Total lines:\t{total_lines}")
        print(f"Lines changed:\t{changed}")


loading vectors...

Generating training data based on noise_type: 10_types

2,613 sentences processed
51,610 tokens processed
Average sentence length:	19.751243781094526
29,476 head-dependent:relation pairs
33,156 head-dependent-relation:sentence triples
4,005 sentences processed
95,881 tokens processed
Average sentence length:	23.940324594257177
63,769 head-dependent:relation pairs
64,946 head-dependent-relation:sentence triples


100%|███████████████████████████████████| 29476/29476 [5:26:38<00:00,  1.50it/s]


Original word pairs:	29476
New word pairs:	13514
Total word pairs:	29476
Word pairs w/ other relations in Irish (train):	6773


100%|████████████████████████████████████| 2613/2613 [00:00<00:00, 13040.96it/s]


Total lines:	51610
Number of lines changed:	8729
Total lines:	51610
Lines changed:	8729

Generating dev data based on noise_type: 10_types

646 sentences processed
10,409 tokens processed
Average sentence length:	16.113003095975234
7,410 head-dependent:relation pairs
7,959 head-dependent-relation:sentence triples
451 sentences processed
10,000 tokens processed
Average sentence length:	22.172949002217294
8,685 head-dependent:relation pairs
8,760 head-dependent-relation:sentence triples


100%|███████████████████████████████████████| 7410/7410 [07:50<00:00, 15.76it/s]


Original word pairs:	7410
New word pairs:	842
Total word pairs:	7410
Word pairs w/ other relations in Irish (dev):	641


100%|██████████████████████████████████████| 646/646 [00:00<00:00, 13946.90it/s]


Total lines:	10409
Number of lines changed:	692
Total lines:	10409
Lines changed:	692

Generating training data based on noise_type: 20_types

2,613 sentences processed
51,610 tokens processed
Average sentence length:	19.751243781094526
29,476 head-dependent:relation pairs
35,353 head-dependent-relation:sentence triples
4,005 sentences processed
95,881 tokens processed
Average sentence length:	23.940324594257177
63,769 head-dependent:relation pairs
64,946 head-dependent-relation:sentence triples


100%|███████████████████████████████████| 29476/29476 [5:30:38<00:00,  1.49it/s]


Original word pairs:	29476
New word pairs:	13514
Total word pairs:	29476
Word pairs w/ other relations in Irish (train):	6773


100%|████████████████████████████████████| 2613/2613 [00:00<00:00, 11311.83it/s]


Total lines:	51610
Number of lines changed:	9386
Total lines:	51610
Lines changed:	9386

Generating dev data based on noise_type: 20_types

646 sentences processed
10,409 tokens processed
Average sentence length:	16.113003095975234
7,410 head-dependent:relation pairs
8,316 head-dependent-relation:sentence triples
451 sentences processed
10,000 tokens processed
Average sentence length:	22.172949002217294
8,685 head-dependent:relation pairs
8,760 head-dependent-relation:sentence triples


100%|███████████████████████████████████████| 7410/7410 [07:51<00:00, 15.73it/s]


Original word pairs:	7410
New word pairs:	842
Total word pairs:	7410
Word pairs w/ other relations in Irish (dev):	641


100%|██████████████████████████████████████| 646/646 [00:00<00:00, 12382.30it/s]


Total lines:	10409
Number of lines changed:	734
Total lines:	10409
Lines changed:	734

Generating training data based on noise_type: 30_types

2,613 sentences processed
51,610 tokens processed
Average sentence length:	19.751243781094526
29,476 head-dependent:relation pairs
37,456 head-dependent-relation:sentence triples
4,005 sentences processed
95,881 tokens processed
Average sentence length:	23.940324594257177
63,769 head-dependent:relation pairs
64,946 head-dependent-relation:sentence triples


100%|███████████████████████████████████| 29476/29476 [5:32:36<00:00,  1.48it/s]


Original word pairs:	29476
New word pairs:	13514
Total word pairs:	29476
Word pairs w/ other relations in Irish (train):	6773


100%|████████████████████████████████████| 2613/2613 [00:00<00:00, 13097.63it/s]


Total lines:	51610
Number of lines changed:	9978
Total lines:	51610
Lines changed:	9978

Generating dev data based on noise_type: 30_types

646 sentences processed
10,409 tokens processed
Average sentence length:	16.113003095975234
7,410 head-dependent:relation pairs
8,659 head-dependent-relation:sentence triples
451 sentences processed
10,000 tokens processed
Average sentence length:	22.172949002217294
8,685 head-dependent:relation pairs
8,760 head-dependent-relation:sentence triples


100%|███████████████████████████████████████| 7410/7410 [07:51<00:00, 15.70it/s]


Original word pairs:	7410
New word pairs:	842
Total word pairs:	7410
Word pairs w/ other relations in Irish (dev):	641


100%|██████████████████████████████████████| 646/646 [00:00<00:00, 13414.73it/s]


Total lines:	10409
Number of lines changed:	768
Total lines:	10409
Lines changed:	768

Generating training data based on noise_type: 40_types

2,613 sentences processed
51,610 tokens processed
Average sentence length:	19.751243781094526
29,476 head-dependent:relation pairs
39,206 head-dependent-relation:sentence triples
4,005 sentences processed
95,881 tokens processed
Average sentence length:	23.940324594257177
63,769 head-dependent:relation pairs
64,946 head-dependent-relation:sentence triples


100%|███████████████████████████████████| 29476/29476 [5:30:32<00:00,  1.49it/s]


Original word pairs:	29476
New word pairs:	13514
Total word pairs:	29476
Word pairs w/ other relations in Irish (train):	6773


100%|████████████████████████████████████| 2613/2613 [00:00<00:00, 10753.69it/s]


Total lines:	51610
Number of lines changed:	10615
Total lines:	51610
Lines changed:	10615

Generating dev data based on noise_type: 40_types

646 sentences processed
10,409 tokens processed
Average sentence length:	16.113003095975234
7,410 head-dependent:relation pairs
8,954 head-dependent-relation:sentence triples
451 sentences processed
10,000 tokens processed
Average sentence length:	22.172949002217294
8,685 head-dependent:relation pairs
8,760 head-dependent-relation:sentence triples


100%|███████████████████████████████████████| 7410/7410 [08:02<00:00, 15.36it/s]


Original word pairs:	7410
New word pairs:	842
Total word pairs:	7410
Word pairs w/ other relations in Irish (dev):	641


100%|██████████████████████████████████████| 646/646 [00:00<00:00, 14157.14it/s]


Total lines:	10409
Number of lines changed:	799
Total lines:	10409
Lines changed:	799

Generating training data based on noise_type: 50_types

2,613 sentences processed
51,610 tokens processed
Average sentence length:	19.751243781094526
29,476 head-dependent:relation pairs
40,888 head-dependent-relation:sentence triples
4,005 sentences processed
95,881 tokens processed
Average sentence length:	23.940324594257177
63,769 head-dependent:relation pairs
64,946 head-dependent-relation:sentence triples


100%|███████████████████████████████████| 29476/29476 [5:28:10<00:00,  1.50it/s]


Original word pairs:	29476
New word pairs:	13514
Total word pairs:	29476
Word pairs w/ other relations in Irish (train):	6773


100%|████████████████████████████████████| 2613/2613 [00:00<00:00, 13717.50it/s]


Total lines:	51610
Number of lines changed:	11139
Total lines:	51610
Lines changed:	11139

Generating dev data based on noise_type: 50_types

646 sentences processed
10,409 tokens processed
Average sentence length:	16.113003095975234
7,410 head-dependent:relation pairs
9,142 head-dependent-relation:sentence triples
451 sentences processed
10,000 tokens processed
Average sentence length:	22.172949002217294
8,685 head-dependent:relation pairs
8,760 head-dependent-relation:sentence triples


100%|███████████████████████████████████████| 7410/7410 [07:53<00:00, 15.63it/s]


Original word pairs:	7410
New word pairs:	842
Total word pairs:	7410
Word pairs w/ other relations in Irish (dev):	641


100%|██████████████████████████████████████| 646/646 [00:00<00:00, 14190.21it/s]

Total lines:	10409
Number of lines changed:	849
Total lines:	10409
Lines changed:	849





In [3]:
# Generate converted train/dev corpora for the token-based noise settings.
#
# Noise settings processed by this cell. Alternative settings previously
# listed here: "none", "freq" and "10_types".."50_types".
noise_type = ["10_tokens", "20_tokens", "30_tokens", "40_tokens", "50_tokens"]
# NOTE(review): top_n and threshold are not referenced anywhere in this cell;
# they are kept only in case other code reads them - confirm before removing.
top_n = 20
threshold = 0.0

# Only the vector file *paths* are assigned here; nothing is loaded in this
# cell. The same bilingual vector file is passed for both languages.
# (presumably the file is opened inside f.get_conversions_bilingual_vectors
# - TODO confirm)
print("loading vectors...")
vectors_gd = "bilingual_vectors_scannell.magnitude"
vectors_ga = "bilingual_vectors_scannell.magnitude"

# (split name used in file paths and summaries, label used in the banner)
splits = (("train", "training"), ("dev", "dev"))

# Tag inserted into the output file name, just before the "-ud-<split>" part.
# NOTE(review): the first cell of this notebook (none/freq noise) tags its
# outputs "_conversion=bilingual_vectors_scannell"; this cell omits
# "_scannell" - confirm which naming the downstream consumers expect.
conversion_tag = "_conversion=bilingual_vectors"

for noise in noise_type:
    # The train and dev passes are identical apart from the split name, so
    # they share one loop body instead of two copy-pasted sections.
    for split, label in splits:
        print(f"\nGenerating {label} data based on noise_type: {noise}\n")

        # Build every path from one base + suffix so the converted file name
        # no longer depends on hard-coded slice lengths.
        suffix = f"-ud-{split}.conllu"
        base_gd = f"{split}_data/gd/gd_noise={noise}"
        corpus_gd = base_gd + suffix
        # The "ga" corpus is always the noise-free one.
        corpus_ga = f"{split}_data/ga/ga_noise=none{suffix}"

        list_gd, sentences_gd = f.process_training_data(corpus_gd)
        list_ga, sentences_ga = f.process_training_data(corpus_ga)

        # vector conversion only
        conversion_gd = f.get_conversions_bilingual_vectors(
            sentences_gd, list_gd, list_ga, vectors_gd, vectors_ga
        )
        converted_corpus_gd = base_gd + conversion_tag + suffix

        # j: number of entries in the conversion table;
        # i: how many of those entries have a non-empty value.
        j = len(conversion_gd)
        i = sum(
            1 for item in conversion_gd
            if len(list(conversion_gd[item])) != 0
        )

        print(f"Total word pairs:\t{j}")
        print(f"Word pairs w/ other relations in Irish ({split}):\t{i}")

        # Write the converted corpus and report how many lines were changed.
        total_lines, changed = f.apply_conversions_bilingual(
            corpus_gd, converted_corpus_gd, conversion_gd
        )
        print(f"Total lines:\t{total_lines}")
        print(f"Lines changed:\t{changed}")


loading vectors...

Generating training data based on noise_type: 10_tokens

2,613 sentences processed
51,610 tokens processed
Average sentence length:	19.751243781094526
29,476 head-dependent:relation pairs
33,152 head-dependent-relation:sentence triples
4,005 sentences processed
95,881 tokens processed
Average sentence length:	23.940324594257177
63,769 head-dependent:relation pairs
64,946 head-dependent-relation:sentence triples


100%|███████████████████████████████████| 29476/29476 [5:56:09<00:00,  1.38it/s]


Original word pairs:	29476
New word pairs:	13514
Total word pairs:	29476
Word pairs w/ other relations in Irish (train):	6773


100%|████████████████████████████████████| 2613/2613 [00:00<00:00, 12424.46it/s]


Total lines:	51610
Number of lines changed:	8696
Total lines:	51610
Lines changed:	8696

Generating dev data based on noise_type: 10_tokens

646 sentences processed
10,409 tokens processed
Average sentence length:	16.113003095975234
7,410 head-dependent:relation pairs
7,937 head-dependent-relation:sentence triples
451 sentences processed
10,000 tokens processed
Average sentence length:	22.172949002217294
8,685 head-dependent:relation pairs
8,760 head-dependent-relation:sentence triples


100%|███████████████████████████████████████| 7410/7410 [08:23<00:00, 14.71it/s]


Original word pairs:	7410
New word pairs:	842
Total word pairs:	7410
Word pairs w/ other relations in Irish (dev):	641


100%|██████████████████████████████████████| 646/646 [00:00<00:00, 12746.07it/s]


Total lines:	10409
Number of lines changed:	689
Total lines:	10409
Lines changed:	689

Generating training data based on noise_type: 20_tokens

2,613 sentences processed
51,610 tokens processed
Average sentence length:	19.751243781094526
29,476 head-dependent:relation pairs
35,045 head-dependent-relation:sentence triples
4,005 sentences processed
95,881 tokens processed
Average sentence length:	23.940324594257177
63,769 head-dependent:relation pairs
64,946 head-dependent-relation:sentence triples


100%|███████████████████████████████████| 29476/29476 [6:32:31<00:00,  1.25it/s]


Original word pairs:	29476
New word pairs:	13514
Total word pairs:	29476
Word pairs w/ other relations in Irish (train):	6773


100%|████████████████████████████████████| 2613/2613 [00:00<00:00, 12966.24it/s]


Total lines:	51610
Number of lines changed:	9311
Total lines:	51610
Lines changed:	9311

Generating dev data based on noise_type: 20_tokens

646 sentences processed
10,409 tokens processed
Average sentence length:	16.113003095975234
7,410 head-dependent:relation pairs
8,305 head-dependent-relation:sentence triples
451 sentences processed
10,000 tokens processed
Average sentence length:	22.172949002217294
8,685 head-dependent:relation pairs
8,760 head-dependent-relation:sentence triples


100%|███████████████████████████████████████| 7410/7410 [07:49<00:00, 15.78it/s]


Original word pairs:	7410
New word pairs:	842
Total word pairs:	7410
Word pairs w/ other relations in Irish (dev):	641


100%|██████████████████████████████████████| 646/646 [00:00<00:00, 12522.39it/s]


Total lines:	10409
Number of lines changed:	721
Total lines:	10409
Lines changed:	721

Generating training data based on noise_type: 30_tokens

2,613 sentences processed
51,610 tokens processed
Average sentence length:	19.751243781094526
29,476 head-dependent:relation pairs
36,850 head-dependent-relation:sentence triples
4,005 sentences processed
95,881 tokens processed
Average sentence length:	23.940324594257177
63,769 head-dependent:relation pairs
64,946 head-dependent-relation:sentence triples


100%|███████████████████████████████████| 29476/29476 [5:31:14<00:00,  1.48it/s]


Original word pairs:	29476
New word pairs:	13514
Total word pairs:	29476
Word pairs w/ other relations in Irish (train):	6773


100%|████████████████████████████████████| 2613/2613 [00:00<00:00, 12040.91it/s]


Total lines:	51610
Number of lines changed:	9847
Total lines:	51610
Lines changed:	9847

Generating dev data based on noise_type: 30_tokens

646 sentences processed
10,409 tokens processed
Average sentence length:	16.113003095975234
7,410 head-dependent:relation pairs
8,597 head-dependent-relation:sentence triples
451 sentences processed
10,000 tokens processed
Average sentence length:	22.172949002217294
8,685 head-dependent:relation pairs
8,760 head-dependent-relation:sentence triples


100%|███████████████████████████████████████| 7410/7410 [07:50<00:00, 15.76it/s]


Original word pairs:	7410
New word pairs:	842
Total word pairs:	7410
Word pairs w/ other relations in Irish (dev):	641


100%|██████████████████████████████████████| 646/646 [00:00<00:00, 13100.17it/s]


Total lines:	10409
Number of lines changed:	767
Total lines:	10409
Lines changed:	767

Generating training data based on noise_type: 40_tokens

2,613 sentences processed
51,610 tokens processed
Average sentence length:	19.751243781094526
29,476 head-dependent:relation pairs
38,469 head-dependent-relation:sentence triples
4,005 sentences processed
95,881 tokens processed
Average sentence length:	23.940324594257177
63,769 head-dependent:relation pairs
64,946 head-dependent-relation:sentence triples


100%|███████████████████████████████████| 29476/29476 [5:31:05<00:00,  1.48it/s]


Original word pairs:	29476
New word pairs:	13514
Total word pairs:	29476
Word pairs w/ other relations in Irish (train):	6773


100%|████████████████████████████████████| 2613/2613 [00:00<00:00, 12900.13it/s]


Total lines:	51610
Number of lines changed:	10433
Total lines:	51610
Lines changed:	10433

Generating dev data based on noise_type: 40_tokens

646 sentences processed
10,409 tokens processed
Average sentence length:	16.113003095975234
7,410 head-dependent:relation pairs
8,868 head-dependent-relation:sentence triples
451 sentences processed
10,000 tokens processed
Average sentence length:	22.172949002217294
8,685 head-dependent:relation pairs
8,760 head-dependent-relation:sentence triples


100%|███████████████████████████████████████| 7410/7410 [07:58<00:00, 15.47it/s]


Original word pairs:	7410
New word pairs:	842
Total word pairs:	7410
Word pairs w/ other relations in Irish (dev):	641


100%|██████████████████████████████████████| 646/646 [00:00<00:00, 13532.18it/s]


Total lines:	10409
Number of lines changed:	802
Total lines:	10409
Lines changed:	802

Generating training data based on noise_type: 50_tokens

2,613 sentences processed
51,610 tokens processed
Average sentence length:	19.751243781094526
29,476 head-dependent:relation pairs
39,729 head-dependent-relation:sentence triples
4,005 sentences processed
95,881 tokens processed
Average sentence length:	23.940324594257177
63,769 head-dependent:relation pairs
64,946 head-dependent-relation:sentence triples


100%|███████████████████████████████████| 29476/29476 [5:36:28<00:00,  1.46it/s]


Original word pairs:	29476
New word pairs:	13514
Total word pairs:	29476
Word pairs w/ other relations in Irish (train):	6773


100%|████████████████████████████████████| 2613/2613 [00:00<00:00, 13285.66it/s]


Total lines:	51610
Number of lines changed:	11032
Total lines:	51610
Lines changed:	11032

Generating dev data based on noise_type: 50_tokens

646 sentences processed
10,409 tokens processed
Average sentence length:	16.113003095975234
7,410 head-dependent:relation pairs
9,110 head-dependent-relation:sentence triples
451 sentences processed
10,000 tokens processed
Average sentence length:	22.172949002217294
8,685 head-dependent:relation pairs
8,760 head-dependent-relation:sentence triples


100%|███████████████████████████████████████| 7410/7410 [07:57<00:00, 15.52it/s]


Original word pairs:	7410
New word pairs:	842
Total word pairs:	7410
Word pairs w/ other relations in Irish (dev):	641


100%|██████████████████████████████████████| 646/646 [00:00<00:00, 13560.28it/s]

Total lines:	10409
Number of lines changed:	820
Total lines:	10409
Lines changed:	820



