In [1]:
import os
os.environ["LD_LIBRARY_PATH"] = os.environ["CONDA_PREFIX"] + "/lib:" + os.environ.get("LD_LIBRARY_PATH", "")

In [2]:
# pip install fasttext
# pip install huggingface_hub

import fasttext
from huggingface_hub import hf_hub_download

# download model and get the model path
# cache_dir is the path to the folder where the downloaded model will be stored/cached.
model_path = hf_hub_download(repo_id="cis-lmu/glotlid", filename="model.bin", cache_dir=None)
print("model path:", model_path)

# load the model
model = fasttext.load_model(model_path)

  from .autonotebook import tqdm as notebook_tqdm


model path: /home/tanmay/.cache/huggingface/hub/models--cis-lmu--glotlid/snapshots/74cb50b709c9eefe0f790030c6c95c461b4e3b77/model.bin


In [None]:
model.predict("තරගයක් තෝරන්න")
#test

(('__label__sin_Sinh',), array([1.00000179]))

In [1]:
#Convert Moses format files into a pandas DataFrame
def moses_to_df(file1, file2, lang1, lang2):
    lang1_lines = []
    lang2_lines = []
    with open(file1, "r", encoding="utf-8") as f1:
        for line in f1: 
            line = line.removesuffix("\n")
            lang1_lines.append(line)
    with open(file2, "r", encoding="utf-8") as f2:
        for line in f2: 
            line = line.removesuffix("\n")
            lang2_lines.append(line)
    import pandas as pd
    df = pd.DataFrame({lang1: lang1_lines, lang2: lang2_lines})
    df.drop_duplicates(inplace=True, ignore_index=True)
    return df

#Convert a list of sentences into their multilingual embeddings according to the given model
def to_multilingual_embedding(language, sentences, model):
    if model.lower() == "labse":
        from sentence_transformers import SentenceTransformer
        encoder = SentenceTransformer('sentence-transformers/LaBSE')
        embedding = encoder.encode(sentences)
    if model.lower() == "laser":
        import torch 
        import argparse
        torch.serialization.add_safe_globals([argparse.Namespace])
        from laser_encoders import LaserEncoderPipeline
        encoder = LaserEncoderPipeline(lang=language)
        embedding = encoder.encode_sentences(sentences)
    return embedding

#Find similarity scores for sentence pairs using cosine similarity
def find_similarity_score(embeddings1, embeddings2): 
    import statistics
    from sklearn.metrics.pairwise import cosine_similarity
    similarities = cosine_similarity(embeddings1, embeddings2)
    similarity_scores = [statistics.fmean(vector) for vector in similarities]
    # similarity_scores = [float(vector.sum()) for vector in similarities]
    return similarity_scores

#Given a pandas DataFrame, filter best x percent of sentence pairs and store the results in a .tsv file
def filter_top_percentile(df, percentile, tsv_path): 
    df.sort_values("Similarity score", ascending=False, inplace=True)
    df[df['Similarity score'] >= df["Similarity score"].quantile(percentile)].to_csv(sep="\t", path_or_buf=tsv_path)

# Sinhala-English Filtering

In [2]:
# import os
# english_files=[]
# sinhala_files = []
# for filename in os.listdir("data/en-si"):
#     if filename.endswith(".en"):
#         english_files.append(filename)
#     if filename.endswith(".si"): 
#         sinhala_files.append(filename)
# with open("data/en-si/sentences.en", "w", encoding="utf-8") as f:
#     for file in english_files:
#         for line in open(f"data/en-si/{file}", "r", encoding="utf-8"):
#             f.write(line)
# with open("data/en-si/sentences.si", "w", encoding="utf-8") as f:
#     for file in sinhala_files:
#         for line in open(f"data/en-si/{file}", "r", encoding="utf-8"):
#             f.write(line)
 

In [20]:
df_labse = moses_to_df("data/en-si/GNOME.en-si.en", "data/en-si/GNOME.en-si.si", "English sentence", "Sinhala sentence")
en_embedding_labse = to_multilingual_embedding("english", df_labse["English sentence"], "labse")
si_embedding_labse = to_multilingual_embedding("sinhala", df_labse["Sinhala sentence"], "labse")

In [21]:
similiarity_scores = find_similarity_score(en_embedding_labse, si_embedding_labse)
df_labse["Similarity score"] = similiarity_scores

In [22]:
filter_top_percentile(df_labse, 0.9, "outputs\\GNOME.en-si.top10_labse.tsv")

In [23]:
df_labse

Unnamed: 0,English sentence,Sinhala sentence,Similarity score
735,C_ut,කපන්න (_u),0.314207
1826,New _Tab,නව ටැබය (_T),0.312588
1788,New _Window,නව කවුළුව (_N),0.312385
110,_Select,තරගයක් තෝරන්න,0.312249
429,_Select,තරගයක් තෝරන්නslot type,0.312249
...,...,...,...
375,Remove the king of clubs.,කලාබර කිංග්,-0.011855
359,Move ~a onto the queen of diamonds.,රුවිත ක්වීන්,-0.012506
348,Move ~a onto the ace of diamonds.,රුවිත ආසියා,-0.016199
320,Move ~a onto the king of clubs.,කලාබර කිංග්,-0.016575


In [24]:
df_laser = moses_to_df("data/en-si/GNOME.en-si.en", "data/en-si/GNOME.en-si.si", "English sentence", "Sinhala sentence")
en_embedding_laser = to_multilingual_embedding("english", df_laser["English sentence"], "laser")
si_embedding_laser = to_multilingual_embedding("sinhala", df_laser["Sinhala sentence"], "laser")

2025-05-01 10:28:06,334 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
2025-05-01 10:28:06,696 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-05-01 10:28:06,768 | INFO | laser_encoders.download_models |  - laser2.pt already downloaded
2025-05-01 10:28:06,768 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-05-01 10:28:06,768 | INFO | laser_encoders.download_models |  - laser2.cvocab already downloaded
2025-05-01 10:29:23,214 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-05-01 10:29:23,264 | INFO | laser_encoders.download_models |  - laser3-sin_Sinh.v1.pt already downloaded
2025-05-01 10:29:23,265 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-05-01 10:29:23,266 | INFO | laser_encoders.download_models |  - laser2.cvocab already downloaded


  x = torch._nested_tensor_from_mask(


In [25]:
similiarity_scores = find_similarity_score(en_embedding_laser, si_embedding_laser)
df_laser["Similarity score"] = similiarity_scores

In [26]:
filter_top_percentile(df_laser, 0.9, "outputs\\GNOME.en-si.top10_laser.tsv")

In [27]:
df_laser

Unnamed: 0,English sentence,Sinhala sentence,Similarity score
860,_Settings,සැකසුම් (_S),0.562136
42,Shift,Shift,0.560907
1555,Device,මෙවලම්,0.560882
1637,Keyboard,යතුරු පුවරුව,0.560587
856,Paste,අලවන්න,0.560271
...,...,...,...
121,You should have received a copy of the GNU Gen...,slot type,0.239138
398,Remove the nine of hearts.,හාර්ත නවය,0.238071
1567,Please enter a name and try again.,කරුණාකර නම නැවත ඇතුලත් කර නැවත උත්සාහ කරන්න.,0.233735
1387,These are the plugins selected by you when you...,"අන්ජුටා, සුදුසු ප්ලගින සමූහයක් අතුරින් එකක් තෝ...",0.228415


In [28]:
df_KDE4_labse = moses_to_df("data/en-si/KDE4.en-si.en", "data/en-si/KDE4.en-si.si", "English sentence", "Sinhala sentence")
en_embedding_labse = to_multilingual_embedding("english", df_KDE4_labse["English sentence"], "labse")
si_embedding_labse = to_multilingual_embedding("sinhala", df_KDE4_labse["Sinhala sentence"], "labse")
df_KDE4_labse["Similarity score"] = find_similarity_score(en_embedding_labse, si_embedding_labse)
filter_top_percentile(df_KDE4_labse, 0.9, "outputs/KDE4.en-si.top10_labse.tsv")

2025-05-01 10:32:29,529 | INFO | sentence_transformers.SentenceTransformer | Use pytorch device_name: cpu
2025-05-01 10:32:29,530 | INFO | sentence_transformers.SentenceTransformer | Load pretrained SentenceTransformer: sentence-transformers/LaBSE


Batches: 100%|██████████| 791/791 [06:02<00:00,  2.18it/s]


2025-05-01 10:38:35,472 | INFO | sentence_transformers.SentenceTransformer | Use pytorch device_name: cpu
2025-05-01 10:38:35,474 | INFO | sentence_transformers.SentenceTransformer | Load pretrained SentenceTransformer: sentence-transformers/LaBSE


Batches: 100%|██████████| 791/791 [08:07<00:00,  1.62it/s]


In [29]:
df_KDE4_laser = moses_to_df("data/en-si/KDE4.en-si.en", "data/en-si/KDE4.en-si.si", "English sentence", "Sinhala sentence")
en_embedding_laser = to_multilingual_embedding("english", df_KDE4_laser["English sentence"], "laser")
si_embedding_laser = to_multilingual_embedding("sinhala", df_KDE4_laser["Sinhala sentence"], "laser")
df_KDE4_laser["Similarity score"] = find_similarity_score(en_embedding_laser, si_embedding_laser)
filter_top_percentile(df_KDE4_laser, 0.9, "outputs/KDE4.en-si.top10_laser.tsv")

2025-05-01 10:47:58,888 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-05-01 10:47:58,944 | INFO | laser_encoders.download_models |  - laser2.pt already downloaded
2025-05-01 10:47:58,946 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-05-01 10:47:58,946 | INFO | laser_encoders.download_models |  - laser2.cvocab already downloaded
2025-05-01 11:01:05,573 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-05-01 11:01:05,631 | INFO | laser_encoders.download_models |  - laser3-sin_Sinh.v1.pt already downloaded
2025-05-01 11:01:05,632 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-05-01 11:01:05,633 | INFO | laser_encoders.download_models |  - laser2.cvocab already downloaded
