In [None]:
import os
import re
import json
import unicodedata

In [None]:
def normalize_text(text):
    """Normalize one sentence before it is added to the parallel corpus.

    The steps are applied in this order: Unicode NFKC normalization,
    lowercasing, removal of HTML character references (``&amp;``, ``&#39;``,
    ``&#x27;`` ...), removal of a fixed set of noisy symbols, and finally
    whitespace collapsing and trimming.

    Args:
        text: The raw sentence as a ``str``.

    Returns:
        The cleaned, lowercased sentence with single spaces between tokens and
        no leading or trailing whitespace.
    """
    text = unicodedata.normalize("NFKC", text)
    text = text.lower()
    # Match only well-formed character references. A looser pattern such as
    # "&[^;]+;" deletes everything between a literal "&" and the next ";",
    # which destroys real sentence content ("Tom & Jerry ran; ...").
    # The text is already lowercased, so lowercase classes are sufficient.
    text = re.sub(r"&(?:#[0-9]+|#x[0-9a-f]+|[a-z][a-z0-9]*);", "", text)
    text = re.sub(r"[<>\\\[\]{}~^|]", "", text)  # Remove noisy chars
    # Collapse whitespace last: the removals above can leave double spaces or
    # expose leading/trailing blanks that an earlier strip would miss.
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [None]:
def is_valid_pair(en, vi, min_len=3, max_len=100):
    """Decide whether a sentence pair is worth keeping.

    A pair is accepted only when both sides are non-empty and the number of
    whitespace-separated tokens on each side lies within
    ``[min_len, max_len]`` (inclusive).

    Args:
        en: English sentence.
        vi: Vietnamese sentence.
        min_len: Minimum number of tokens allowed per side.
        max_len: Maximum number of tokens allowed per side.

    Returns:
        ``True`` if the pair passes every check, ``False`` otherwise.
    """
    if not (en and vi):
        return False
    # all() stops at the first side whose token count is out of bounds.
    return all(min_len <= len(side.split()) <= max_len for side in (en, vi))

In [None]:
def load_split(folder, split_name):
    """Load, clean and filter one split of the parallel corpus.

    Reads ``<folder>/<split_name>.en`` and ``<folder>/<split_name>.vi`` as
    UTF-8 text, pairs them line by line, normalizes both sides with
    ``normalize_text`` and keeps only the pairs accepted by ``is_valid_pair``.

    Args:
        folder: Directory containing the two files of the split.
        split_name: Base name of the files (without the language extension).

    Returns:
        A list of dicts shaped like
        ``{"translation": {"en": <english>, "vi": <vietnamese>}}``.

    Raises:
        AssertionError: If the two files do not have the same number of lines.
        OSError: If either file cannot be opened.
    """
    en_path = os.path.join(folder, f"{split_name}.en")
    vi_path = os.path.join(folder, f"{split_name}.vi")

    with open(en_path, encoding="utf-8") as f:
        en_lines = [line.strip() for line in f]
    with open(vi_path, encoding="utf-8") as f:
        vi_lines = [line.strip() for line in f]

    # Explicit check instead of an `assert` statement: asserts are removed when
    # Python runs with -O, and zip() below would then silently truncate the
    # longer file and misreport the corpus. AssertionError is kept so the
    # exception type seen by existing callers does not change.
    if len(en_lines) != len(vi_lines):
        raise AssertionError(f"❌ Mismatch in line count for {split_name}")

    data = []
    for en, vi in zip(en_lines, vi_lines):
        en_clean = normalize_text(en)
        vi_clean = normalize_text(vi)
        if is_valid_pair(en_clean, vi_clean):
            data.append({
                "translation": {
                    "en": en_clean,
                    "vi": vi_clean
                }
            })
    return data

In [None]:
def prepare_phomt(root_folder, output_path="phomt_cleaned.json", combine_all=True):
    """Clean the train/dev/test splits of PhoMT and write them out as JSON.

    Each split is read from ``<root_folder>/detokenization/<split>/`` through
    ``load_split`` and progress is reported with ``print``.

    Args:
        root_folder: Directory that contains the ``detokenization`` folder.
        output_path: Destination of the combined JSON file. Only used when
            ``combine_all`` is true.
        combine_all: When true, every split is merged into a single list that
            is written to ``output_path``. Otherwise each split is written to
            its own ``<split>.json`` file (a relative path, so it lands in the
            current working directory).

    Returns:
        None. The results are written to disk.
    """
    def write_json(records, path):
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        with open(path, "w", encoding="utf-8") as f:
            json.dump(records, f, ensure_ascii=False, indent=2)

    print(f"📁 Loading PhoMT from: {root_folder}")
    merged_pairs = []

    for split in ("train", "dev", "test"):
        split_dir = os.path.join(root_folder, "detokenization", split)
        print(f"🔹 Processing split: {split}")
        cleaned_pairs = load_split(split_dir, split)
        print(f"✅ {split}: {len(cleaned_pairs)} valid pairs")

        if combine_all:
            merged_pairs += cleaned_pairs
            continue

        # Per-split mode: one file per split, named after the split itself.
        out_file = f"{split}.json"
        write_json(cleaned_pairs, out_file)
        print(f"💾 Saved: {out_file}")

    if not combine_all:
        return

    print(f"\n📦 Total cleaned pairs: {len(merged_pairs)}")
    write_json(merged_pairs, output_path)
    print(f"✅ Saved combined file to {output_path}")

In [None]:
# Entry point: clean every PhoMT split and write the result to a single JSON file.
# NOTE(review): root_folder is a machine-specific absolute path; adjust it to
# wherever the corpus lives before running.
if __name__ == "__main__":
    prepare_phomt(
        root_folder=r"D:\PhoMT",  # root path that contains the "detokenization" folder
        output_path="phomt_cleaned.json",
        combine_all=True  # set to False to write one JSON file per split instead
    )