In [2]:
'''Create synthetic parallel corpora to evaluate pipeline by injecting real parallel sentences into different monolingual corpora'''

import bucc_style_dataset as bsd

def _read_tsv_column(path, column=1):
    """Read one tab-separated column from a UTF-8 text file.

    Each line has its trailing newline removed and every zero-width joiner
    (U+200D) stripped before it is split on tabs.

    Args:
        path: File to read.
        column: Zero-based index of the tab-separated field to keep.
            Defaults to 1, i.e. the second field (presumably the first
            field is a sentence ID -- TODO confirm against the data files).

    Returns:
        A list with the selected field of every non-blank line, in file order.

    Raises:
        ValueError: If a non-blank line has too few tab-separated fields.
    """
    values = []
    with open(path, "r", encoding="utf-8") as handle:
        for line_no, raw in enumerate(handle, start=1):
            text = raw.removesuffix("\n").replace("\u200d", "")
            fields = text.split("\t")
            if len(fields) <= column:
                # A blank (e.g. trailing) line carries no sentence; skip it
                # instead of failing with a bare IndexError.
                if not text.strip():
                    continue
                raise ValueError(
                    f"{path}:{line_no}: expected at least {column + 1} "
                    f"tab-separated fields, got {len(fields)}"
                )
            values.append(fields[column])
    return values


# Monolingual sentence pools that the parallel sentences are injected into.
si_mono_lines = _read_tsv_column("data/en-si/sin_wikipedia_2021_30K-sentences.txt")
eng_mono_lines = _read_tsv_column("data/en-si/eng_news_2024_30K-sentences.txt")

def _read_normalized_lines(path):
    """Return every line of a UTF-8 file, minus its newline and any U+200D.

    Blank lines are kept on purpose: the two parallel files are paired by
    line index, so dropping a line from one side would shift the alignment.
    """
    with open(path, "r", encoding="utf-8") as handle:
        return [raw.removesuffix("\n").replace("\u200d", "") for raw in handle]


# Real parallel sentences; line i of one file is paired with line i of the other.
si_par_lines = _read_normalized_lines("data/en-si/sin_Sinh.dev")
eng_par_lines = _read_normalized_lines("data/en-si/eng_Latn.dev")

# The lists are consumed pairwise (e.g. zipped into the gold file), and zip()
# silently truncates to the shorter input, so fail loudly on a mismatch.
if len(eng_par_lines) != len(si_par_lines):
    raise ValueError(
        "Parallel files are not line-aligned: "
        f"{len(eng_par_lines)} English lines vs {len(si_par_lines)} Sinhala lines"
    )

# Mix the parallel sentences into the monolingual pools.
# NOTE(review): the exact layout of the returned values is defined in
# bucc_style_dataset; the code below only relies on train_list[0] and
# train_list[1] being newline-joined strings of tab-separated records.
train_list, test_list = bsd.split_shuffle_create_corpus(
    eng_mono_lines,
    si_mono_lines,
    eng_par_lines,
    si_par_lines,
)

en_list = train_list[0].split("\n")
si_list = train_list[1].split("\n")

# Keep only the second tab-separated field of each record (the sentence text;
# presumably the first field is an ID -- TODO confirm).
en_corpus_lines = [record.split("\t")[1] for record in en_list]
si_corpus_lines = [record.split("\t")[1] for record in si_list]

In [13]:
'''Convert synthetic corpora to Word2Vec format to perform sentence mining'''

import filtering
def _to_word2vec_lines(sentences, embeddings):
    """Render sentences and their vectors as word2vec text-format rows.

    The first row is the header "<rows> <cols>" taken from ``embeddings.shape``.
    Every following row is the sentence with spaces replaced by underscores
    (so it forms a single space-free token), followed by the vector components
    formatted with four decimals.
    """
    rows = [f"{embeddings.shape[0]} {embeddings.shape[1]}"]
    for sentence, vector in zip(sentences, embeddings):
        token = sentence.replace(" ", "_")
        components = " ".join([f"{value:.4f}" for value in vector])
        rows.append(f"{token} {components}")
    return rows


def _write_lines(path, lines):
    """Write each item of ``lines`` to ``path`` followed by a newline."""
    with open(path, "w", encoding="utf-8") as handle:
        handle.writelines(f"{line}\n" for line in lines)


# Sentence embeddings for both sides of the synthetic corpus.
# NOTE(review): assumes the returned objects are 2-D and iterate row by row
# (only .shape[0], .shape[1] and iteration are used here) -- confirm in filtering.
e1 = filtering.to_multilingual_embedding("english", en_corpus_lines, "labse")
e2 = filtering.to_multilingual_embedding("sinhala", si_corpus_lines, "labse")

source_lines = _to_word2vec_lines(en_corpus_lines, e1)
target_lines = _to_word2vec_lines(si_corpus_lines, e2)

_write_lines("evaluation/en.source.vec", source_lines)
_write_lines("evaluation/si.target.vec", target_lines)
        
import bilingual_nearest_neighbor as bnn

# Mine candidate sentence pairs from the two text-format (binary=0) embedding
# files written above; the result goes to evaluation/en-si.mined.txt.
bnn.main(
    source_embeddings="evaluation/en.source.vec",
    target_embeddings="evaluation/si.target.vec",
    output="evaluation/en-si.mined.txt",
    binary=0,
)

  from .autonotebook import tqdm as notebook_tqdm


2025-05-20 15:37:01,664 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
2025-05-20 15:37:01,905 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-05-20 15:37:01,961 | INFO | laser_encoders.download_models |  - laser2.pt already downloaded
2025-05-20 15:37:01,962 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-05-20 15:37:01,963 | INFO | laser_encoders.download_models |  - laser2.cvocab already downloaded
2025-05-20 15:37:02,507 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-05-20 15:37:02,570 | INFO | laser_encoders.download_models |  - laser3-sin_Sinh.v1.pt already downloaded
2025-05-20 15:37:02,571 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-05-20 15:37:02,572 | INFO | laser_encoders.download_models |  - laser2.cvocab already downloaded


Batches: 100%|██████████| 243/243 [04:18<00:00,  1.07s/it]
Batches: 100%|██████████| 243/243 [05:13<00:00,  1.29s/it]


2025-05-20 15:46:40,242 | INFO | faiss.loader | Loading faiss with AVX2 support.
2025-05-20 15:46:40,284 | INFO | faiss.loader | Successfully loaded faiss with AVX2 support.
2025-05-20 15:46:40,288 | INFO | gensim.models.keyedvectors | loading projection weights from evaluation/en.source.vec
2025-05-20 15:46:41,712 | INFO | gensim.utils | KeyedVectors lifecycle event {'msg': 'loaded (7749, 768) matrix of type float32 from evaluation/en.source.vec', 'binary': 0, 'encoding': 'utf-8', 'datetime': '2025-05-20T15:46:41.712880', 'gensim': '4.3.3', 'python': '3.9.21 (main, Dec 11 2024, 16:24:11) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.167.4-microsoft-standard-WSL2-x86_64-with-glibc2.39', 'event': 'load_word2vec_format'}
2025-05-20 15:46:41,713 | INFO | gensim.models.keyedvectors | loading projection weights from evaluation/si.target.vec
2025-05-20 15:46:43,116 | INFO | gensim.utils | KeyedVectors lifecycle event {'msg': 'loaded (7749, 768) matrix of type float32 from evaluation/si.target.vec

In [18]:
'''Apply pipeline to the mined text'''

def _build_token_lookup(sentences):
    """Map each underscore-joined token back to the sentence it was built from.

    The embedding files store a sentence as ``sentence.replace(" ", "_")``.
    That encoding is lossy for sentences that already contain underscores,
    so the original text is recovered by lookup rather than by blindly
    turning every underscore into a space. If two sentences collapse to the
    same token, the first one wins.
    """
    lookup = {}
    for sentence in sentences:
        lookup.setdefault(sentence.replace(" ", "_"), sentence)
    return lookup


def _restore_sentence(token, lookup):
    """Turn a mined token back into sentence text.

    Strips U+200D and a trailing newline, then returns the original sentence
    from ``lookup``. Tokens that are not in the lookup fall back to replacing
    every underscore with a space.
    """
    token = token.replace("\u200d", "").removesuffix("\n")
    return lookup.get(token, token.replace("_", " "))


# Relies on en_corpus_lines / si_corpus_lines from the corpus-construction cell.
en_lookup = _build_token_lookup(en_corpus_lines)
si_lookup = _build_token_lookup(si_corpus_lines)

en_mined = []
si_mined = []
with open("evaluation/en-si.mined.txt", "r", encoding="utf-8") as f:
    for line in f:
        # Column 0 is the English token, column 1 the Sinhala token.
        split = line.split("\t")
        en_mined.append(_restore_sentence(split[0], en_lookup))
        si_mined.append(_restore_sentence(split[1], si_lookup))

# One sentence per line; line i of both files forms a mined pair.
with open("evaluation/en.mined.txt", "w", encoding="utf-8") as f1, open("evaluation/si.mined.txt", "w", encoding="utf-8") as f2:
    f1.writelines(f"{l1}\n" for l1 in en_mined)
    f2.writelines(f"{l2}\n" for l2 in si_mined)

# Run the filtering pipeline over the mined pairs.
filtering.main(files=["evaluation/en.mined.txt", "evaluation/si.mined.txt"], langs=["english", "sinhala"], model="labse", output="evaluation/en-si.filtered.txt")

Batches: 100%|██████████| 16/16 [00:14<00:00,  1.10it/s]
Batches: 100%|██████████| 16/16 [00:23<00:00,  1.47s/it]
Batches: 100%|██████████| 16/16 [00:15<00:00,  1.04it/s]
Batches: 100%|██████████| 16/16 [00:27<00:00,  1.69s/it]
Batches: 100%|██████████| 16/16 [00:15<00:00,  1.03it/s]
Batches: 100%|██████████| 16/16 [00:24<00:00,  1.52s/it]
Batches: 100%|██████████| 16/16 [00:14<00:00,  1.09it/s]
Batches: 100%|██████████| 16/16 [00:24<00:00,  1.54s/it]
Batches: 100%|██████████| 16/16 [00:17<00:00,  1.12s/it]
Batches: 100%|██████████| 16/16 [00:23<00:00,  1.49s/it]
Batches: 100%|██████████| 16/16 [00:17<00:00,  1.12s/it]
Batches: 100%|██████████| 16/16 [00:26<00:00,  1.65s/it]
Batches: 100%|██████████| 16/16 [00:16<00:00,  1.05s/it]
Batches: 100%|██████████| 16/16 [00:27<00:00,  1.70s/it]
Batches: 100%|██████████| 16/16 [00:17<00:00,  1.07s/it]
Batches: 100%|██████████| 16/16 [00:26<00:00,  1.65s/it]
Batches: 100%|██████████| 16/16 [00:16<00:00,  1.05s/it]
Batches: 100%|██████████| 16/16

2025-05-20 16:03:12,185 | INFO | simalign.simalign | Initialized the EmbeddingLoader with model: xlm-roberta-base


100%|██████████| 2784/2784 [05:46<00:00,  8.03it/s]
2025-05-20 16:09:02,881 - simalign.simalign - INFO - Initialized the EmbeddingLoader with model: xlm-roberta-base


2025-05-20 16:09:02,881 | INFO | simalign.simalign | Initialized the EmbeddingLoader with model: xlm-roberta-base


100%|██████████| 2784/2784 [00:00<00:00, 4552.13it/s]
100%|██████████| 2784/2784 [00:00<00:00, 2015002.99it/s]

Raw corpus: 7749

After dropping duplicates: 7749

After removing length based outliers: 6029

After performing language identification: 6008

After filtering based on similarity scores: 2784

After filtering based on word alignment: 862






In [3]:
'''Generate gold file and test file from the filtered data and evaluate using the bucc_f-score script'''

# Gold standard: every injected parallel pair, as "english<TAB>sinhala".
gold_dict = dict(zip(eng_par_lines, si_par_lines))
with open("evaluation/en-si.gold.txt", "w", encoding="utf-8") as gold_file:
    gold_file.writelines(
        f"{source}\t{target}\n"
        for source, target in zip(eng_par_lines, si_par_lines)
    )

# Pull the sentence columns out of the filtered output.
# NOTE(review): columns 1 and 2 are read as English and Sinhala; column 0 is
# presumably a row index -- confirm against what filtering.main writes.
en_lines = []
si_lines = []
with open("evaluation/en-si.filtered.txt", "r", encoding="utf-8") as filtered_file:
    for raw in filtered_file:
        columns = raw.replace("\u200d", "").removesuffix("\n").split("\t")
        en_lines.append(columns[1])
        si_lines.append(columns[2])

# The first row is skipped (presumably a header row -- TODO confirm).
with open("evaluation/en-si.test.txt", "w", encoding="utf-8") as test_file:
    test_file.writelines(
        f"{en_sentence}\t{si_sentence}\n"
        for en_sentence, si_sentence in zip(en_lines[1:], si_lines[1:])
    )