In [6]:
# !pip install -q datasets pandas unidecode

from datasets import load_dataset, DatasetDict, load_from_disk
from pathlib import Path
import re, unicodedata
from unidecode import unidecode
import pandas as pd

# -------------------------
# CONFIG – edit these
# -------------------------
# Language-pair configs to process. Each entry is passed to load_dataset as the
# config name and is split on "-" into source and target language codes.
LANG_PAIRS = ["et-en", "si-en"]   # choose what you want

# Root folder for all outputs; each pair is saved in its own sub-folder
# SAVE_ROOT / <pair>. This is a machine-specific absolute Windows path.
SAVE_ROOT = Path(r"C:\Users\gilbe\OneDrive\MT_Project\data\mlqe")  # <- adjust if needed

# When true, clean_text lowercases the cleaned text.
LOWERCASE = True
# When true, clean_text passes the text through unidecode (e.g. á→a, ü→u).
ASCII_FOLD = False   # True if you want á→a, ü→u; optional

# -------------------------
# Cleaning helpers
# -------------------------
# Match only real URLs: an http/https scheme followed by "://", or a token that
# starts with "www.". The word boundary and the explicit "://" keep ordinary
# words such as "https" or "httpd" from being deleted.
_url_re = re.compile(r"\bhttps?://\S+|\bwww\.\S+", flags=re.IGNORECASE)
# Any run of whitespace (spaces, tabs, newlines, ...) collapses to one space.
_multi_ws = re.compile(r"\s+")

def clean_text(s: str, *, lowercase=None, ascii_fold=None) -> str:
    """Return a normalized copy of *s* for use as a cleaned text column.

    Steps, in order: strip URLs, apply Unicode NFKC normalization, optionally
    fold to ASCII with ``unidecode``, collapse whitespace runs to a single
    space and trim the ends, optionally lowercase.

    Args:
        s: Text to clean. ``None`` is returned unchanged.
        lowercase: Lowercase the result. ``None`` (default) uses the
            module-level ``LOWERCASE`` setting.
        ascii_fold: Transliterate to ASCII. ``None`` (default) uses the
            module-level ``ASCII_FOLD`` setting.

    Returns:
        The cleaned string, or ``None`` when *s* is ``None``.
    """
    if s is None:
        return s
    # Resolve the config at call time so edits to the globals take effect.
    if lowercase is None:
        lowercase = LOWERCASE
    if ascii_fold is None:
        ascii_fold = ASCII_FOLD

    s = _url_re.sub("", s)
    s = unicodedata.normalize("NFKC", s)
    if ascii_fold:
        s = unidecode(s)
    # Collapse after URL removal / folding so the gaps they leave are closed.
    s = _multi_ws.sub(" ", s).strip()
    if lowercase:
        s = s.lower()
    return s

def make_src_mt(example, src_lg, tgt_lg):
    """Add flat ``src``, ``mt`` and ``score`` fields to *example*.

    ``example["translation"]`` is indexed by language code; the entry for
    *src_lg* becomes ``src`` and the entry for *tgt_lg* becomes ``mt``.
    ``score`` is a copy of ``example["z_mean"]``. The example is modified in
    place and also returned.
    """
    pair = example["translation"]
    # Same assignment order as the column order we want: src, mt, score.
    for column, lang in (("src", src_lg), ("mt", tgt_lg)):
        example[column] = pair[lang]
    example["score"] = example["z_mean"]
    return example

def clean_example(ex):
    """Add ``src_clean`` and ``mt_clean`` to *ex* by running clean_text on
    ``src`` and ``mt``; the example is modified in place and returned."""
    for raw_key, clean_key in (("src", "src_clean"), ("mt", "mt_clean")):
        ex[clean_key] = clean_text(ex[raw_key])
    return ex

# -------------------------
# Main loop over language pairs
# -------------------------
# Make sure the output root exists before any pair is written.
SAVE_ROOT.mkdir(parents=True, exist_ok=True)

for lp in LANG_PAIRS:
    print("\n==============================")
    print(f"Processing language pair: {lp}")
    print("==============================")

    # Config names have the form "<src>-<tgt>", e.g. "et-en" -> "et", "en".
    src_lg, tgt_lg = lp.split("-")

    # 1) Load this pair's config (all of its splits)
    ds_cfg = load_dataset("wmt/wmt20_mlqe_task1", lp)

    # 2) Add src, mt, score (and lp) columns to every split
    ds_std = {}
    for split in ds_cfg:
        d = ds_cfg[split]

        print(f"{lp} - {split} columns before:", d.column_names)

        # Pass the language codes through fn_kwargs rather than a lambda
        # closing over the loop variables.
        d = d.map(make_src_mt, fn_kwargs={"src_lg": src_lg, "tgt_lg": tgt_lg})

        # Tag each row with its language pair unless the column already exists.
        if "lp" not in d.column_names:
            d = d.add_column("lp", [lp] * len(d))

        ds_std[split] = d

    ds_std = DatasetDict(ds_std)

    # 3) Add the cleaned text columns
    ds_clean = DatasetDict({
        split: ds_std[split].map(clean_example)
        for split in ds_std
    })

    # 4) Save to disk in a folder named after the pair, e.g. .../mlqe/et-en
    out_dir = SAVE_ROOT / lp
    out_dir.mkdir(parents=True, exist_ok=True)
    ds_clean.save_to_disk(str(out_dir))
    print(f"✅ Saved cleaned dataset for {lp} to: {out_dir}")

    # 5) Optional CSV preview (first 1000 rows of each split)
    for split in ds_clean:
        preview_path = out_dir / f"{split}_preview.csv"
        df = pd.DataFrame(ds_clean[split][:1000])
        df.to_csv(preview_path, index=False, encoding="utf-8")
        print(f"CSV preview ({lp}, {split}) → {preview_path}")

print("\n✨ All language pairs processed.")



Processing language pair: et-en


et-en/train-00000-of-00001.parquet:   0%|          | 0.00/2.39M [00:00<?, ?B/s]

et-en/test-00000-of-00001.parquet:   0%|          | 0.00/355k [00:00<?, ?B/s]

et-en/validation-00000-of-00001.parquet:   0%|          | 0.00/360k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

et-en - train columns before: ['segid', 'translation', 'scores', 'mean', 'z_scores', 'z_mean', 'model_score', 'doc_id', 'nmt_output', 'word_probas']


Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

et-en - test columns before: ['segid', 'translation', 'scores', 'mean', 'z_scores', 'z_mean', 'model_score', 'doc_id', 'nmt_output', 'word_probas']


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

et-en - validation columns before: ['segid', 'translation', 'scores', 'mean', 'z_scores', 'z_mean', 'model_score', 'doc_id', 'nmt_output', 'word_probas']


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

✅ Saved cleaned dataset for et-en to: C:\Users\gilbe\OneDrive\MT_Project\data\mlqe\et-en
CSV preview (et-en, train) → C:\Users\gilbe\OneDrive\MT_Project\data\mlqe\et-en\train_preview.csv
CSV preview (et-en, test) → C:\Users\gilbe\OneDrive\MT_Project\data\mlqe\et-en\test_preview.csv
CSV preview (et-en, validation) → C:\Users\gilbe\OneDrive\MT_Project\data\mlqe\et-en\validation_preview.csv

Processing language pair: si-en


si-en/train-00000-of-00001.parquet:   0%|          | 0.00/2.83M [00:00<?, ?B/s]

si-en/test-00000-of-00001.parquet:   0%|          | 0.00/418k [00:00<?, ?B/s]

si-en/validation-00000-of-00001.parquet:   0%|          | 0.00/417k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

si-en - train columns before: ['segid', 'translation', 'scores', 'mean', 'z_scores', 'z_mean', 'model_score', 'doc_id', 'nmt_output', 'word_probas']


Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

si-en - test columns before: ['segid', 'translation', 'scores', 'mean', 'z_scores', 'z_mean', 'model_score', 'doc_id', 'nmt_output', 'word_probas']


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

si-en - validation columns before: ['segid', 'translation', 'scores', 'mean', 'z_scores', 'z_mean', 'model_score', 'doc_id', 'nmt_output', 'word_probas']


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

✅ Saved cleaned dataset for si-en to: C:\Users\gilbe\OneDrive\MT_Project\data\mlqe\si-en
CSV preview (si-en, train) → C:\Users\gilbe\OneDrive\MT_Project\data\mlqe\si-en\train_preview.csv
CSV preview (si-en, test) → C:\Users\gilbe\OneDrive\MT_Project\data\mlqe\si-en\test_preview.csv
CSV preview (si-en, validation) → C:\Users\gilbe\OneDrive\MT_Project\data\mlqe\si-en\validation_preview.csv

✨ All language pairs processed.
