In [None]:
#Convert Moses format files into a pandas DataFrame
def moses_to_df(file1, file2, lang1, lang2):
    lang1_lines = []
    lang2_lines = []
    with open(file1, "r", encoding="utf-8") as f1:
        for line in f1: 
            line = line.removesuffix("\n")
            lang1_lines.append(line)
    with open(file2, "r", encoding="utf-8") as f2:
        for line in f2: 
            line = line.removesuffix("\n")
            lang2_lines.append(line)
    import pandas as pd
    df = pd.DataFrame({lang1: lang1_lines, lang2: lang2_lines})
    return df

#Convert a list of sentences into their multilingual embeddings according to the given model
def to_multilingual_embedding(language, sentences, model):
    if model.lower() == "labse":
        from sentence_transformers import SentenceTransformer
        encoder = SentenceTransformer('sentence-transformers/LaBSE')
        embedding = encoder.encode(sentences)
    if model.lower() == "laser":
        import torch 
        import argparse
        torch.serialization.add_safe_globals([argparse.Namespace])
        from laser_encoders import LaserEncoderPipeline
        encoder = LaserEncoderPipeline(lang=language)
        embedding = encoder.encode_sentences(sentences)
    return embedding

#Find similarity scores for sentence pairs using cosine similarity
def find_similarity_score(embeddings1, embeddings2): 
    from sklearn.metrics.pairwise import cosine_similarity
    similarities = cosine_similarity(embeddings1, embeddings2)
    similarity_scores = [float(vector.sum()) for vector in similarities]
    return similarity_scores

#Given a pandas DataFrame, filter best x percent of sentence pairs and store the results in a .tsv file
def filter_top_percentile(df, percentile, tsv_path): 
    df.sort_values("Similarity score", ascending=False, inplace=True)
    df[df['Similarity score'] >= df["Similarity score"].quantile(percentile)].to_csv(sep="\t", path_or_buf=tsv_path)


# Sinhala-English Filtering

In [2]:
# import os
# english_files=[]
# sinhala_files = []
# for filename in os.listdir("data/en-si"):
#     if filename.endswith(".en"):
#         english_files.append(filename)
#     if filename.endswith(".si"): 
#         sinhala_files.append(filename)
# with open("data/en-si/sentences.en", "w", encoding="utf-8") as f:
#     for file in english_files:
#         for line in open(f"data/en-si/{file}", "r", encoding="utf-8"):
#             f.write(line)
# with open("data/en-si/sentences.si", "w", encoding="utf-8") as f:
#     for file in sinhala_files:
#         for line in open(f"data/en-si/{file}", "r", encoding="utf-8"):
#             f.write(line)
 

In [12]:
df_labse = moses_to_df("data/en-si/GNOME.en-si.en", "data/en-si/GNOME.en-si.si", "English sentence", "Sinhala sentence")
en_embedding_labse = to_multilingual_embedding("english", df_labse["English sentence"], "labse")
si_embedding_labse = to_multilingual_embedding("sinhala", df_labse["Sinhala sentence"], "labse")

2025-04-29 09:00:38,541 | INFO | sentence_transformers.SentenceTransformer | Use pytorch device_name: cpu
2025-04-29 09:00:38,542 | INFO | sentence_transformers.SentenceTransformer | Load pretrained SentenceTransformer: sentence-transformers/LaBSE


Batches: 100%|██████████| 417/417 [02:31<00:00,  2.76it/s]


2025-04-29 09:03:13,215 | INFO | sentence_transformers.SentenceTransformer | Use pytorch device_name: cpu
2025-04-29 09:03:13,217 | INFO | sentence_transformers.SentenceTransformer | Load pretrained SentenceTransformer: sentence-transformers/LaBSE


Batches: 100%|██████████| 417/417 [02:47<00:00,  2.49it/s]


In [13]:
similiarity_scores = find_similarity_score(en_embedding_labse, si_embedding_labse)
df_labse["Similarity score"] = similiarity_scores

In [None]:
filter_top_percentile(df_labse, 0.9, "outputs\\GNOME.en-si.top10_labse.tsv")

In [7]:
df_laser = moses_to_df("data/en-si/GNOME.en-si.en", "data/en-si/GNOME.en-si.si", "English sentence", "Sinhala sentence")
en_embedding_laser = to_multilingual_embedding("english", df_laser["English sentence"], "laser")
si_embedding_laser = to_multilingual_embedding("sinhala", df_laser["Sinhala sentence"], "laser")

2025-04-29 07:59:41,599 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
2025-04-29 07:59:42,547 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-04-29 07:59:42,669 | INFO | laser_encoders.download_models |  - laser2.pt already downloaded
2025-04-29 07:59:42,669 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-04-29 07:59:42,669 | INFO | laser_encoders.download_models |  - laser2.cvocab already downloaded
2025-04-29 08:10:16,358 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-04-29 08:10:16,535 | INFO | laser_encoders.download_models |  - laser3-sin_Sinh.v1.pt already downloaded
2025-04-29 08:10:16,537 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-04-29 08:10:16,540 | INFO | laser_encoders.download_models |  - laser2.cvocab already downloaded


  x = torch._nested_tensor_from_mask(


In [8]:
similiarity_scores = find_similarity_score(en_embedding_laser, si_embedding_laser)
df_laser["Similarity score"] = similiarity_scores

In [None]:
filter_top_percentile(df_laser, 0.9, "outputs\\GNOME.en-si.top10_laser.tsv")

In [11]:
df_laser

Unnamed: 0,English sentence,Sinhala sentence,Similarity score
384,Shift,Shift,7700.271973
157,Shift,Shift,7700.271973
214,Shift,Shift,7700.271973
327,Shift,Shift,7700.271973
441,Shift,Shift,7700.271973
...,...,...,...
1264,Remove the seven of hearts.,හාර්ත හත,2880.132568
10122,These are the plugins selected by you when you...,"අන්ජුටා, සුදුසු ප්ලගින සමූහයක් අතුරින් එකක් තෝ...",2872.951172
6268,These are the plugins selected by you when you...,"අන්ජුටා, සුදුසු ප්ලගින සමූහයක් අතුරින් එකක් තෝ...",2872.951172
6923,These are the plugins selected by you when you...,"අන්ජුටා, සුදුසු ප්ලගින සමූහයක් අතුරින් එකක් තෝ...",2872.951172
