In [3]:
import fasttext
from huggingface_hub import hf_hub_download
model_path = hf_hub_download(repo_id="cis-lmu/glotlid", filename="model.bin", cache_dir="./models")
model = fasttext.load_model(model_path)

In [None]:
#Return true if the sentence belongs to the specified language
def check_language(sentence, language_code):
    x = model.predict(sentence)
    if x[0][0] == language_code and x[1][0] >= 0.5:
        return True 
    else:
        return False
 
#Convert Moses format files into a pandas DataFrame
def moses_to_df(file1, file2, lang1, lang2):
    lang1_lines = []
    lang2_lines = []
    with open(file1, "r", encoding="utf-8") as f1, open(file2, "r", encoding="utf-8") as f2:
        for line1, line2 in zip(f1, f2): 
            line1 = line1.removesuffix("\n")
            line2= line2.removesuffix("\n")
            if (check_language(line1, lang1) and check_language(line2, lang2)):
                lang1_lines.append(line1)
                lang2_lines.append(line2)
    import pandas as pd
    df = pd.DataFrame({lang1: lang1_lines, lang2: lang2_lines})
    df.drop_duplicates(inplace=True, ignore_index=True)
    return df

#Convert a list of sentences into their multilingual embeddings according to the given model
def to_multilingual_embedding(language, sentences, model):
    if model.lower() == "labse":
        from sentence_transformers import SentenceTransformer
        encoder = SentenceTransformer('sentence-transformers/LaBSE')
        embedding = encoder.encode(sentences)
    if model.lower() == "laser":
        import torch 
        import argparse
        torch.serialization.add_safe_globals([argparse.Namespace])
        from laser_encoders import LaserEncoderPipeline
        encoder = LaserEncoderPipeline(lang=language)
        embedding = encoder.encode_sentences(sentences)
    return embedding

#Find similarity scores for sentence pairs using cosine similarity
def find_similarity_score(embeddings1, embeddings2): 
    import statistics
    from sklearn.metrics.pairwise import cosine_similarity
    similarities = cosine_similarity(embeddings1, embeddings2)
    similarity_scores = [statistics.fmean(vector) for vector in similarities]
    # similarity_scores = [float(vector.sum()) for vector in similarities]
    return similarity_scores

#Given a pandas DataFrame, filter best x percent of sentence pairs and store the results in a .tsv file
def filter_top_percentile(df, percentile, tsv_path): 
    df.sort_values("Similarity score", ascending=False, inplace=True)
    df[df['Similarity score'] >= df["Similarity score"].quantile(percentile)].to_csv(sep="\t", path_or_buf=tsv_path)

# Sinhala-English Filtering

In [9]:
df_labse = moses_to_df("data/en-si/GNOME.en-si.en", "data/en-si/GNOME.en-si.si", "__label__eng_Latn", "__label__sin_Sinh")
en_embedding_labse = to_multilingual_embedding("english", df_labse["__label__eng_Latn"], "labse")
si_embedding_labse = to_multilingual_embedding("sinhala", df_labse["__label__sin_Sinh"], "labse")

In [10]:
similiarity_scores = find_similarity_score(en_embedding_labse, si_embedding_labse)
df_labse["Similarity score"] = similiarity_scores

In [11]:
filter_top_percentile(df_labse, 0.9, "outputs/GNOME.en-si.top10_labse.tsv")

In [12]:
df_labse

Unnamed: 0,__label__eng_Latn,__label__sin_Sinh,Similarity score
470,Select file,සියල්ල තෝරන්න (_A),0.289072
714,Select File,ගොනුව තෝරන්න,0.283206
416,Remove File,ඉවත් කරන්න,0.282978
1067,Example: username,උදාහරණය: පරිශීලක නාමය,0.278915
392,File not found,%s ආයිත්තම හමු නොවි,0.274207
...,...,...,...
151,Move ~a onto the queen of diamonds.,රුවිත ක්වීන්,-0.002639
152,Move ~a onto the king of diamonds.,රුවිත කිංග්,-0.003756
141,Move ~a onto the ace of diamonds.,රුවිත ආසියා,-0.006234
116,Move ~a onto the queen of clubs.,කලාබර ක්වීන්,-0.007917


In [24]:
df_laser = moses_to_df("data/en-si/GNOME.en-si.en", "data/en-si/GNOME.en-si.si", "English sentence", "Sinhala sentence")
en_embedding_laser = to_multilingual_embedding("english", df_laser["English sentence"], "laser")
si_embedding_laser = to_multilingual_embedding("sinhala", df_laser["Sinhala sentence"], "laser")

2025-05-01 10:28:06,334 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
2025-05-01 10:28:06,696 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-05-01 10:28:06,768 | INFO | laser_encoders.download_models |  - laser2.pt already downloaded
2025-05-01 10:28:06,768 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-05-01 10:28:06,768 | INFO | laser_encoders.download_models |  - laser2.cvocab already downloaded
2025-05-01 10:29:23,214 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-05-01 10:29:23,264 | INFO | laser_encoders.download_models |  - laser3-sin_Sinh.v1.pt already downloaded
2025-05-01 10:29:23,265 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-05-01 10:29:23,266 | INFO | laser_encoders.download_models |  - laser2.cvocab already downloaded


  x = torch._nested_tensor_from_mask(


In [25]:
similiarity_scores = find_similarity_score(en_embedding_laser, si_embedding_laser)
df_laser["Similarity score"] = similiarity_scores

In [26]:
filter_top_percentile(df_laser, 0.9, "outputs\\GNOME.en-si.top10_laser.tsv")

In [27]:
df_laser

Unnamed: 0,English sentence,Sinhala sentence,Similarity score
860,_Settings,සැකසුම් (_S),0.562136
42,Shift,Shift,0.560907
1555,Device,මෙවලම්,0.560882
1637,Keyboard,යතුරු පුවරුව,0.560587
856,Paste,අලවන්න,0.560271
...,...,...,...
121,You should have received a copy of the GNU Gen...,slot type,0.239138
398,Remove the nine of hearts.,හාර්ත නවය,0.238071
1567,Please enter a name and try again.,කරුණාකර නම නැවත ඇතුලත් කර නැවත උත්සාහ කරන්න.,0.233735
1387,These are the plugins selected by you when you...,"අන්ජුටා, සුදුසු ප්ලගින සමූහයක් අතුරින් එකක් තෝ...",0.228415


In [21]:
df_KDE4_labse = moses_to_df("data/en-si/KDE4.en-si.en", "data/en-si/KDE4.en-si.si", "__label__eng_Latn", "__label__sin_Sinh")
en_embedding_labse = to_multilingual_embedding("english", df_KDE4_labse["__label__eng_Latn"], "labse")
si_embedding_labse = to_multilingual_embedding("sinhala", df_KDE4_labse["__label__sin_Sinh"], "labse")

In [22]:
df_KDE4_labse

Unnamed: 0,__label__eng_Latn,__label__sin_Sinh
0,Unknown,නොදන්නා
1,Downloading remote playlist,වාදන ලැයිස්තුව සෙවුම
2,Podcasts on %1,පොඩ්කාස්ට්
3,& Read Device,උපාංගය එක් කරන්න...
4,Could not save playlist.,වාදන ලැයිස්තුව සෙවුම
...,...,...
4884,Get help...,උදව් ගන්න...
4885,Do you really want to reset all toolbars of th...,ඔබට ඇත්ත වශයෙන්ම මෙම වැඩසටහනේ සියළු මෙවලම් තීර...
4886,Available actions:,තිබෙන ක්‍රීයා (v):
4887,Current actions:,පවත්නා ක්‍රියා (e):


In [24]:
similiarity_scores = find_similarity_score(en_embedding_labse, si_embedding_labse)
df_KDE4_labse["Similarity score"] = similiarity_scores
filter_top_percentile(df_KDE4_labse, 0.9, "outputs/KDE4.en-si.top10_labse.tsv")

In [29]:
df_KDE4_laser = moses_to_df("data/en-si/KDE4.en-si.en", "data/en-si/KDE4.en-si.si", "English sentence", "Sinhala sentence")
en_embedding_laser = to_multilingual_embedding("english", df_KDE4_laser["English sentence"], "laser")
si_embedding_laser = to_multilingual_embedding("sinhala", df_KDE4_laser["Sinhala sentence"], "laser")
df_KDE4_laser["Similarity score"] = find_similarity_score(en_embedding_laser, si_embedding_laser)
filter_top_percentile(df_KDE4_laser, 0.9, "outputs/KDE4.en-si.top10_laser.tsv")

2025-05-01 10:47:58,888 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-05-01 10:47:58,944 | INFO | laser_encoders.download_models |  - laser2.pt already downloaded
2025-05-01 10:47:58,946 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-05-01 10:47:58,946 | INFO | laser_encoders.download_models |  - laser2.cvocab already downloaded
2025-05-01 11:01:05,573 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-05-01 11:01:05,631 | INFO | laser_encoders.download_models |  - laser3-sin_Sinh.v1.pt already downloaded
2025-05-01 11:01:05,632 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-05-01 11:01:05,633 | INFO | laser_encoders.download_models |  - laser2.cvocab already downloaded
