In [1]:
import fasttext
from huggingface_hub import hf_hub_download
model_path = hf_hub_download(repo_id="cis-lmu/glotlid", filename="model.bin", cache_dir="./models")
model = fasttext.load_model(model_path)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#Return true if the sentence belongs to the specified language
def check_language(sentence, language_code):
    x = model.predict(sentence)
    if x[0][0] == language_code and x[1][0] >= 0.5:
        return True 
    else:
        return False
 
#Convert Moses format files into a pandas DataFrame
def moses_to_df(file1, file2, lang1, lang2):
    lang1_lines = []
    lang2_lines = []
    with open(file1, "r", encoding="utf-8") as f1, open(file2, "r", encoding="utf-8") as f2:
        for line1, line2 in zip(f1, f2): 
            line1 = line1.removesuffix("\n")
            line2= line2.removesuffix("\n")
            if (check_language(line1, lang1) and check_language(line2, lang2)):
                lang1_lines.append(line1)
                lang2_lines.append(line2)
    import pandas as pd
    df = pd.DataFrame({lang1: lang1_lines, lang2: lang2_lines})
    df.drop_duplicates(inplace=True, ignore_index=True)
    return df

#Convert a list of sentences into their multilingual embeddings according to the given model
def to_multilingual_embedding(language, sentences, model):
    if model.lower() == "labse":
        from sentence_transformers import SentenceTransformer
        encoder = SentenceTransformer('sentence-transformers/LaBSE')
        embedding = encoder.encode(sentences)
    if model.lower() == "laser":
        import torch 
        import argparse
        torch.serialization.add_safe_globals([argparse.Namespace])
        from laser_encoders import LaserEncoderPipeline
        encoder = LaserEncoderPipeline(lang=language)
        embedding = encoder.encode_sentences(sentences)
    return embedding

#Find similarity scores for sentence pairs using cosine similarity
def find_similarity_score(embeddings1, embeddings2): 
    import statistics
    from sklearn.metrics.pairwise import cosine_similarity
    similarities = cosine_similarity(embeddings1, embeddings2)
    similarity_scores = [statistics.fmean(vector) for vector in similarities]
    # similarity_scores = [float(vector.sum()) for vector in similarities]
    return similarity_scores

#Given a pandas DataFrame, filter best x percent of sentence pairs and store the results in a .tsv file
def filter_top_percentile(df, percentile, tsv_path): 
    df.sort_values("Similarity score", ascending=False, inplace=True)
    df[df['Similarity score'] >= df["Similarity score"].quantile(percentile)].to_csv(sep="\t", path_or_buf=tsv_path)

# Sinhala-English Filtering

In [9]:
df_labse = moses_to_df("data/en-si/GNOME.en-si.en", "data/en-si/GNOME.en-si.si", "__label__eng_Latn", "__label__sin_Sinh")
en_embedding_labse = to_multilingual_embedding("english", df_labse["__label__eng_Latn"], "labse")
si_embedding_labse = to_multilingual_embedding("sinhala", df_labse["__label__sin_Sinh"], "labse")

In [10]:
similiarity_scores = find_similarity_score(en_embedding_labse, si_embedding_labse)
df_labse["Similarity score"] = similiarity_scores

In [11]:
filter_top_percentile(df_labse, 0.9, "outputs/GNOME.en-si.top10_labse.tsv")

In [12]:
df_labse

Unnamed: 0,__label__eng_Latn,__label__sin_Sinh,Similarity score
470,Select file,සියල්ල තෝරන්න (_A),0.289072
714,Select File,ගොනුව තෝරන්න,0.283206
416,Remove File,ඉවත් කරන්න,0.282978
1067,Example: username,උදාහරණය: පරිශීලක නාමය,0.278915
392,File not found,%s ආයිත්තම හමු නොවි,0.274207
...,...,...,...
151,Move ~a onto the queen of diamonds.,රුවිත ක්වීන්,-0.002639
152,Move ~a onto the king of diamonds.,රුවිත කිංග්,-0.003756
141,Move ~a onto the ace of diamonds.,රුවිත ආසියා,-0.006234
116,Move ~a onto the queen of clubs.,කලාබර ක්වීන්,-0.007917


In [3]:
df_laser = moses_to_df("data/en-si/GNOME.en-si.en", "data/en-si/GNOME.en-si.si", "__label__eng_Latn", "__label__sin_Sinh")
en_embedding_laser = to_multilingual_embedding("english", df_laser["__label__eng_Latn"], "laser")
si_embedding_laser = to_multilingual_embedding("sinhala", df_laser["__label__sin_Sinh"], "laser")

2025-05-02 10:11:52,493 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
2025-05-02 10:11:53,432 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-05-02 10:11:53,492 | INFO | laser_encoders.download_models |  - laser2.pt already downloaded
2025-05-02 10:11:53,492 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-05-02 10:11:53,493 | INFO | laser_encoders.download_models |  - laser2.cvocab already downloaded
2025-05-02 10:12:15,094 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-05-02 10:12:15,158 | INFO | laser_encoders.download_models |  - laser3-sin_Sinh.v1.pt already downloaded
2025-05-02 10:12:15,159 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-05-02 10:12:15,160 | INFO | laser_encoders.download_models |  - laser2.cvocab already downloaded


  x = torch._nested_tensor_from_mask(


In [4]:
similiarity_scores = find_similarity_score(en_embedding_laser, si_embedding_laser)
df_laser["Similarity score"] = similiarity_scores

In [6]:
filter_top_percentile(df_laser, 0.9, "outputs/GNOME.en-si.top10_laser.tsv")

In [7]:
df_laser

Unnamed: 0,__label__eng_Latn,__label__sin_Sinh,Similarity score
704,Switch device,උපකරණය සිරුමාරුව,0.523275
711,Keyboard Type,යතුරු පුවරු වර්ගය,0.511623
582,Mailbox files,සියළු ගොනු,0.508577
450,Copy the selection,මේ තෝරාගැනීම මතක තබා ගන්න,0.507063
482,Highlight syntax,උද්දීපන ආකාරය (_H),0.506887
...,...,...,...
677,Please enter a name and try again.,කරුණාකර නම නැවත ඇතුලත් කර නැවත උත්සාහ කරන්න.,0.269569
715,Please Wait...,කරුණාකර රැදී සිටින්න...,0.265065
728,Please Wait…,කරුණාකර රැදී සිටින්න...,0.265065
175,Remove the seven of hearts.,හාර්ත හත,0.259336


In [21]:
df_KDE4_labse = moses_to_df("data/en-si/KDE4.en-si.en", "data/en-si/KDE4.en-si.si", "__label__eng_Latn", "__label__sin_Sinh")
en_embedding_labse = to_multilingual_embedding("english", df_KDE4_labse["__label__eng_Latn"], "labse")
si_embedding_labse = to_multilingual_embedding("sinhala", df_KDE4_labse["__label__sin_Sinh"], "labse")

In [22]:
df_KDE4_labse

Unnamed: 0,__label__eng_Latn,__label__sin_Sinh
0,Unknown,නොදන්නා
1,Downloading remote playlist,වාදන ලැයිස්තුව සෙවුම
2,Podcasts on %1,පොඩ්කාස්ට්
3,& Read Device,උපාංගය එක් කරන්න...
4,Could not save playlist.,වාදන ලැයිස්තුව සෙවුම
...,...,...
4884,Get help...,උදව් ගන්න...
4885,Do you really want to reset all toolbars of th...,ඔබට ඇත්ත වශයෙන්ම මෙම වැඩසටහනේ සියළු මෙවලම් තීර...
4886,Available actions:,තිබෙන ක්‍රීයා (v):
4887,Current actions:,පවත්නා ක්‍රියා (e):


In [24]:
similiarity_scores = find_similarity_score(en_embedding_labse, si_embedding_labse)
df_KDE4_labse["Similarity score"] = similiarity_scores
filter_top_percentile(df_KDE4_labse, 0.9, "outputs/KDE4.en-si.top10_labse.tsv")

In [26]:
df_KDE4_laser = moses_to_df("data/en-si/KDE4.en-si.en", "data/en-si/KDE4.en-si.si", "__label__eng_Latn", "__label__sin_Sinh")
en_embedding_laser = to_multilingual_embedding("english", df_KDE4_laser["__label__eng_Latn"], "laser")
si_embedding_laser = to_multilingual_embedding("sinhala", df_KDE4_laser["__label__sin_Sinh"], "laser")
df_KDE4_laser["Similarity score"] = find_similarity_score(en_embedding_laser, si_embedding_laser)
filter_top_percentile(df_KDE4_laser, 0.9, "outputs/KDE4.en-si.top10_laser.tsv")

2025-05-01 15:56:54,225 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-05-01 15:56:54,290 | INFO | laser_encoders.download_models |  - laser2.pt already downloaded
2025-05-01 15:56:54,291 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-05-01 15:56:54,292 | INFO | laser_encoders.download_models |  - laser2.cvocab already downloaded
2025-05-01 15:59:07,320 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-05-01 15:59:07,389 | INFO | laser_encoders.download_models |  - Downloading laser3-sin_Sinh.v1.pt


100%|██████████| 608M/608M [01:00<00:00, 10.0MB/s]   

2025-05-01 16:00:08,937 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-05-01 16:00:08,938 | INFO | laser_encoders.download_models |  - laser2.cvocab already downloaded



  x = torch._nested_tensor_from_mask(


In [8]:
df_Ubuntu_labse = moses_to_df("data/en-si/Ubuntu.en-si.en", "data/en-si/Ubuntu.en-si.si", "__label__eng_Latn", "__label__sin_Sinh")
en_embedding_laser = to_multilingual_embedding("english", df_Ubuntu_labse["__label__eng_Latn"], "labse")
si_embedding_laser = to_multilingual_embedding("sinhala", df_Ubuntu_labse["__label__sin_Sinh"], "labse")
df_Ubuntu_labse["Similarity score"] = find_similarity_score(en_embedding_laser, si_embedding_laser)
filter_top_percentile(df_Ubuntu_labse, 0.9, "outputs/Ubuntu.en-si.top10_labse.tsv")

2025-05-02 10:19:16,213 | INFO | sentence_transformers.SentenceTransformer | Use pytorch device_name: cpu
2025-05-02 10:19:16,214 | INFO | sentence_transformers.SentenceTransformer | Load pretrained SentenceTransformer: sentence-transformers/LaBSE


Batches: 100%|██████████| 44/44 [00:16<00:00,  2.64it/s]

2025-05-02 10:19:35,721 | INFO | sentence_transformers.SentenceTransformer | Use pytorch device_name: cpu
2025-05-02 10:19:35,722 | INFO | sentence_transformers.SentenceTransformer | Load pretrained SentenceTransformer: sentence-transformers/LaBSE



Batches: 100%|██████████| 44/44 [00:24<00:00,  1.76it/s]


In [9]:
df_Ubuntu_laser = moses_to_df("data/en-si/Ubuntu.en-si.en", "data/en-si/Ubuntu.en-si.si", "__label__eng_Latn", "__label__sin_Sinh")
en_embedding_laser = to_multilingual_embedding("english", df_Ubuntu_laser["__label__eng_Latn"], "laser")
si_embedding_laser = to_multilingual_embedding("sinhala", df_Ubuntu_laser["__label__sin_Sinh"], "laser")
df_Ubuntu_laser["Similarity score"] = find_similarity_score(en_embedding_laser, si_embedding_laser)
filter_top_percentile(df_Ubuntu_laser, 0.9, "outputs/Ubuntu.en-si.top10_laser.tsv")

2025-05-02 10:20:05,827 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-05-02 10:20:05,879 | INFO | laser_encoders.download_models |  - laser2.pt already downloaded
2025-05-02 10:20:05,880 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-05-02 10:20:05,881 | INFO | laser_encoders.download_models |  - laser2.cvocab already downloaded
2025-05-02 10:20:31,056 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-05-02 10:20:31,137 | INFO | laser_encoders.download_models |  - laser3-sin_Sinh.v1.pt already downloaded
2025-05-02 10:20:31,139 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-05-02 10:20:31,140 | INFO | laser_encoders.download_models |  - laser2.cvocab already downloaded
