In [None]:
# Language-pair configuration: exactly one of the groups below should be active.
# Download locations of the corpora (gzip-compressed TSV):
# AR-EN: https://data.statmt.org/news-commentary/v18/training/news-commentary-v18.ar-en.tsv.gz
# CS-DE: https://data.statmt.org/news-commentary/v18/training/news-commentary-v18.cs-de.tsv.gz
# NOTE(review): file_path points at a ".tsv" file, so the archive is presumably
# decompressed into data/ beforehand - confirm.

# Czech-German
# file_path: TSV file read by the loading cell.
file_path = "data/news-commentary-v18.cs-de.tsv"
# Two-letter codes: compared with the detected language labels, used to choose
# the embedding model, and embedded in the Hub repository name.
src_lang = "cs"
tgt_lang = "de"
# Full language names: become the column names of the published dataset.
full_src_lang = "czech"
full_tgt_lang = "german"

# Arabic-English
# file_path = "data/news-commentary-v18.ar-en.tsv"
# src_lang = "ar"
# tgt_lang = "en"
# full_src_lang = "arabic"
# full_tgt_lang = "english"

In [None]:
import polars as pl

# Show up to 100 characters of each string value when frames are displayed,
# so that whole sentences are readable in the previews below.
pl.Config.set_fmt_str_lengths(100)

In [None]:

# Load the tab-separated parallel corpus. The file has no header row; the two
# columns are named "src" and "tgt".
tsv_read_options = {
    "separator": "\t",
    "has_header": False,
    "new_columns": ["src", "tgt"],
    # Sentences contain unescaped quotes, so quote parsing is switched off.
    "quote_char": None,
    # Fields that are exactly one of these strings are read as null.
    "null_values": [" ", "null"],
}
df = pl.read_csv(file_path, **tsv_read_options)

df.head(10)

In [None]:
# Processing options (consumed by process_dataset_df and the later filtering cells)

# Names of the source and target text columns in the loaded frame.
src_col = "src"
tgt_col = "tgt"

# Rows with more than this many space-separated tokens on either side are dropped.
too_long = 200
# A side may be at most this many times longer (in characters) than the other side.
len_ratio = 1.5
# Both sides must be strictly longer than this; the unit is set by too_short_type.
too_short = 5
# "word" counts space-separated tokens; any other value counts characters.
too_short_type = "char" # "word" or "char"

In [None]:
# Processing function

def process_dataset_df(df, src_col, tgt_col, len_ratio, too_long, too_short, too_short_type, remove_html=True):
    """Apply rule-based cleaning filters to a parallel-text Polars DataFrame.

    The steps run in this order, printing the remaining row count after each:
    exact-duplicate removal, null removal, optional markup clean-up, removal of
    rows containing unwanted character sequences, maximum word count, length
    ratio between the two sides, and minimum length.

    Args:
        df: Polars DataFrame holding at least ``src_col`` and ``tgt_col``.
            Duplicate and null removal consider every column of the frame.
        src_col: Name of the source-text column.
        tgt_col: Name of the target-text column.
        len_ratio: Each side may have at most ``len_ratio`` times as many
            characters as the other side.
        too_long: Maximum number of space-separated tokens allowed per side.
        too_short: Both sides must be strictly longer than this value.
        too_short_type: ``"word"`` measures ``too_short`` in space-separated
            tokens; any other value measures it in characters.
        remove_html: When true, replace tags, a few HTML entities and ``{}``
            with a space, collapse whitespace runs and trim both columns.

    Returns:
        The filtered DataFrame, with rows in their original relative order.
    """

    def _clean_markup(name):
        # Same clean-up for both sides: strip markup, squeeze whitespace, trim.
        return (
            pl.col(name)
            .str.replace_all(r'<.*?>|&lt;.*?&gt;|&?(amp|nbsp|quot);|\{\}', ' ')
            .str.replace_all(r'\s+', ' ')
            .str.strip_chars(" \t\n")
        )

    def _word_count(name):
        # Token count obtained by splitting on a single space.
        return pl.col(name).str.split(' ').list.len()

    def _char_count(name):
        return pl.col(name).str.len_chars()

    text_cols = (src_col, tgt_col)

    print("Dataset original size: \t-->", df.shape[0])

    # maintain_order=True keeps the input row order; without it the order of the
    # surviving rows (and of the final dataset) can differ from run to run.
    df = df.unique(maintain_order=True)
    print("Duplicates removed \t-->", df.shape[0])

    df = df.drop_nulls()
    print("Nulls removed \t\t-->", df.shape[0])

    if remove_html:
        df = df.with_columns(_clean_markup(src_col), _clean_markup(tgt_col))
        print("HTML removed \t\t-->", df.shape[0])

    # Character sequences whose presence on either side disqualifies the row.
    no_list = ['�', '؟؟؟']
    for name in text_cols:
        df = df.filter(~pl.col(name).str.contains_any(no_list))
    print("Removed no-list \t-->", df.shape[0])

    for name in text_cols:
        df = df.filter(_word_count(name) <= too_long)
    print("Too long removed \t-->", df.shape[0])

    # The ratio is checked in both directions, on character counts.
    df = df.filter(_char_count(src_col) <= _char_count(tgt_col) * len_ratio)
    df = df.filter(_char_count(tgt_col) <= _char_count(src_col) * len_ratio)
    print("Too long ratio removed \t-->", df.shape[0])

    measure = _word_count if too_short_type == "word" else _char_count
    for name in text_cols:
        df = df.filter(measure(name) > too_short)
    print("Too short removed \t-->", df.shape[0])

    return df

In [None]:
# Rule-based cleaning with the options configured above; markup clean-up is
# switched off for this run.
df_filtered = process_dataset_df(
    df,
    src_col=src_col,
    tgt_col=tgt_col,
    len_ratio=len_ratio,
    too_long=too_long,
    too_short=too_short,
    too_short_type=too_short_type,
    remove_html=False,
)
df_filtered.head(10)

# Language Detection

In [None]:
# Language detection with fasttext
# The model file must be downloaded beforehand, e.g.:
# curl  https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin -o lid.176.bin

import fasttext

# The path is relative, so the file is looked up in the current working directory.
# NOTE: a later cell rebinds the name `model` to the sentence-embedding model;
# re-run this cell before re-running the language-detection cell.
model = fasttext.load_model("lid.176.bin")

def detect_language(model, lines, batch_size):
    """Predict the top-1 language label and its score for every text in ``lines``.

    Args:
        model: Object whose ``predict(list_of_texts, k=1)`` returns a
            ``(labels, scores)`` pair holding one inner sequence per text.
        lines: Sequence of texts to classify.
        batch_size: Number of texts passed to ``model.predict`` per call.
            Must be at least 1 unless ``lines`` is empty.

    Returns:
        A ``(codes, scores)`` tuple of two lists aligned with ``lines``:
        ``codes`` holds the predicted labels with the ``__label__`` prefix
        removed, ``scores`` the matching top-1 scores.

    Raises:
        ValueError: If ``lines`` is non-empty and ``batch_size`` is below 1.
    """
    # An empty input needs no prediction. Returning early also covers callers
    # that derive batch_size from the (empty) input length and pass 0.
    if len(lines) == 0:
        return [], []

    # A zero step makes range() fail and a negative step would silently skip
    # every text, so reject both explicitly.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    results = []
    scores = []

    # Process in batches
    for start in range(0, len(lines), batch_size):
        labels, probabilities = model.predict(lines[start:start + batch_size], k=1)
        results.extend(label[0].replace("__label__", "") for label in labels)
        scores.extend(probability[0] for probability in probabilities)

    return results, scores

In [None]:
row_count = df_filtered.shape[0]
print("Current dataset size: ", row_count)

# Run language identification on each side; the whole column goes in one batch.
src_codes, src_scores = detect_language(model, df_filtered.get_column(src_col).to_list(), batch_size=row_count)
tgt_codes, tgt_scores = detect_language(model, df_filtered.get_column(tgt_col).to_list(), batch_size=row_count)

# Attach the predictions as extra columns next to the text.
lang_columns = [
    pl.Series("src_lang", src_codes),
    pl.Series("tgt_lang", tgt_codes),
    pl.Series("src_score", src_scores),
    pl.Series("tgt_score", tgt_scores),
]
df_filtered = df_filtered.with_columns(*lang_columns)

# Keep rows where both sides are in the expected language with a score above 0.9.
langs_match = (pl.col("src_lang") == src_lang) & (pl.col("tgt_lang") == tgt_lang)
scores_high = (pl.col("src_score") > 0.9) & (pl.col("tgt_score") > 0.9)
df_filtered = df_filtered.filter(langs_match & scores_high)

print("Language detection completed.\nCurrent dataset size: ", df_filtered.shape[0])
df_filtered.head(10)

# Semantic filtering

In [None]:
from sentence_transformers import SentenceTransformer
import torch

# Language codes accepted for each candidate embedding model.
muse_langs = ['ar', 'de', 'en', 'es', 'fr', 'it', 'ko', 'nl', 'pt', 'ru', 'tr', 'zh']
para_langs = ["ar", "bg", "ca", "cs", "da", "de", "en", "el", "es", "et", "fa", "fi", "fr", "gl", "gu", "he", "hi", "hr", "hu", "hy", "id", "it", "ja", "ka", "ko", "ku", "lt", "lv", "mk", "mn", "mr", "ms", "my", "nb", "nl", "pl", "pt", "ro", "ru", "sk", "sl", "sq", "sr", "sv", "th", "tr", "uk", "ur", "vi", "zh"]

lang_pair = (src_lang, tgt_lang)

# Codes longer than two characters are rejected before any list lookup.
if any(len(code) > 2 for code in lang_pair):
    raise SystemExit("Please use an ISO 639‑1 language code, e.g. 'en'!")

# Prefer the first model when both languages are in muse_langs; otherwise fall
# back to the second model if both are in para_langs.
if all(code in muse_langs for code in lang_pair):
    model_name = "distiluse-base-multilingual-cased-v1"
elif all(code in para_langs for code in lang_pair):
    model_name = "paraphrase-multilingual-MiniLM-L12-v2"
else:
    raise SystemExit("Language pair is not supported!")

# Device preference: CUDA, then MPS, then CPU.
if torch.cuda.is_available():
    device_name = "cuda"
elif torch.backends.mps.is_available():
    device_name = "mps"
else:
    device_name = "cpu"
device = torch.device(device_name)
print("Using device:", device)

# From here on `model` refers to the sentence-embedding model.
model = SentenceTransformer(model_name, device=device)
print("Model loaded:", model_name)

In [None]:
def semantic_filter(df, src_col, tgt_col, threshold=0.6, batch_size=32, device="cpu"):
    """Keep only rows whose source and target sentence embeddings are similar.

    Both text columns are encoded with the notebook-level ``model`` (read from
    the global namespace, not passed in). A row is kept when the cosine
    similarity between its two embeddings is strictly greater than
    ``threshold``. Progress messages and row counts are printed along the way.

    Args:
        df: Polars DataFrame containing ``src_col`` and ``tgt_col``.
        src_col: Name of the source-text column.
        tgt_col: Name of the target-text column.
        threshold: Similarity a row must exceed to be kept.
        batch_size: Batch size forwarded to ``model.encode``.
        device: Accepted for interface compatibility but not used by this
            function; encoding runs wherever ``model`` was placed when loaded.

    Returns:
        The rows of ``df`` that pass the similarity threshold. An empty ``df``
        is returned unchanged.
    """
    print("Current dataset size: ", df.shape[0])

    # Short-circuit on an empty frame: there is nothing to encode, and the
    # row-wise similarity below needs one embedding row per sentence.
    if df.shape[0] == 0:
        print("Semantic filtering completed.\nFiltered dataset size: ", 0)
        return df

    print("Semantic filtering started...")

    print("Encoding source texts...")
    src_embeddings = model.encode(
        df[src_col].to_list(),
        convert_to_tensor=True,
        batch_size=batch_size,
        show_progress_bar=True,
    )

    print("Encoding target texts...")
    tgt_embeddings = model.encode(
        df[tgt_col].to_list(),
        convert_to_tensor=True,
        batch_size=batch_size,
        show_progress_bar=True,
    )

    # One similarity value per row (source embedding i against target embedding i).
    cosine_scores = torch.nn.functional.cosine_similarity(src_embeddings, tgt_embeddings)
    # Move to host memory and convert to a boolean NumPy mask Polars can filter with.
    keep_mask = cosine_scores.cpu().numpy() > threshold
    kept = df.filter(pl.Series(keep_mask))

    print("Semantic filtering completed.\nFiltered dataset size: ", kept.shape[0])
    return kept

In [None]:
# Keep only sentence pairs whose embedding similarity exceeds 0.7.
df_semantic_filtered = semantic_filter(
    df_filtered,
    src_col=src_col,
    tgt_col=tgt_col,
    threshold=0.7,
    batch_size=32,
)

In [None]:
# Keep only the two text columns and rename them to the full language names.
# The columns are addressed through src_col/tgt_col, like every other stage, so
# this cell stays in sync with the processing options.
# NOTE: this overwrites df_semantic_filtered with the renamed frame, so running
# the cell a second time fails because the original column names are gone.
df_semantic_filtered = df_semantic_filtered.select([src_col, tgt_col]).rename(
    {src_col: full_src_lang, tgt_col: full_tgt_lang}
)

In [None]:
# # Only when we need to use a backup ndjson file

# # Save the filtered dataset to ndjson
# df_semantic_filtered.write_ndjson("filtered_dataset.ndjson")

# # Load the filtered dataset from ndjson
# import polars as pl

# df_semantic_filtered = pl.read_ndjson("filtered_dataset.ndjson")

# print(df_semantic_filtered.shape)
# df_semantic_filtered.head(10)

# Create Dataset

In [None]:
from datasets import Dataset

# Turn the Polars frame into a plain {column name: list of values} mapping.
dataset_dict = {
    column.name: column.to_list()
    for column in df_semantic_filtered.get_columns()
}

# Build the Hugging Face dataset from that mapping and show its summary.
dataset = Dataset.from_dict(dataset_dict)
print(dataset)

In [None]:
# Repository id on the Hugging Face Hub, suffixed with the language pair.
lang_pair_suffix = f"{src_lang}-{tgt_lang}"
output_dataset = f"ymoslem/news-commentary-{lang_pair_suffix}"
print(f"Pushing dataset to Hugging Face Hub: {output_dataset}")

# Upload the dataset to that repository.
dataset.push_to_hub(output_dataset)