In [11]:
import json
import os
import warnings
from itertools import zip_longest

In [15]:
# Folder paths.
# The project root defaults to the location used when this notebook was written;
# set the FINAL_PROJECT_DIR environment variable to run it on another machine
# without editing this cell.
BASE_DIR = os.environ.get(
    "FINAL_PROJECT_DIR",
    r"C:\Users\silap\Desktop\spring24\cs7650\final_project",
)

# Input data: the two FLoRes-200 split folders and the WMT21 ru-en TSV file.
FLORES_DEV = os.path.join(BASE_DIR, "flores", "flores200_dataset", "dev")
FLORES_DEVTEST = os.path.join(BASE_DIR, "flores", "flores200_dataset", "devtest")
WMT21_PATH = os.path.join(BASE_DIR, "wmt21", "wikititles-v3.ru-en.tsv")

# Converted JSONL files are written here; the folder is created up front
# (a no-op when it already exists).
OUTPUT_DIR = os.path.join(BASE_DIR, "processed")
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [16]:
# === LANGUAGE CODES ===
# Short code (used in output file names) -> FLoRes-200 code (used in the
# dataset's own file names).
LANG_MAP = {
    "ne": "npi_Deva",  # Nepali, Devanagari script
    "si": "sin_Sinh",  # Sinhala, Sinhala script
    "en": "eng_Latn",  # English, Latin script
    "ru": "rus_Cyrl",  # Russian, Cyrillic script
}

# Language pairs to export, given as short codes; process_flores() converts
# every pair in both translation directions.
FLORES_PAIRS = [("en", "ne"), ("en", "si"), ("en", "ru"), ("ne", "si")]

In [17]:
def convert_to_jsonl(src_path, tgt_path, output_path):
    """Merge two line-aligned text files into one JSONL file of sentence pairs.

    Line i of ``src_path`` is paired with line i of ``tgt_path``. Both sides are
    whitespace-stripped, and a pair is skipped when either side is empty. Each
    surviving pair is written to ``output_path`` as one JSON object per line,
    ``{"src": ..., "tgt": ...}``, with non-ASCII characters kept as-is.

    Args:
        src_path: Path to the UTF-8 source-language file.
        tgt_path: Path to the UTF-8 target-language file.
        output_path: Path of the JSONL file to create (overwritten if present).

    Returns:
        The number of records written.

    Warns:
        UserWarning: If the two input files have different line counts. The
            pairs read up to the end of the shorter file are still written,
            but the mismatch is reported instead of being silently ignored.
    """
    written = 0
    with open(src_path, "r", encoding="utf-8") as src_f, \
         open(tgt_path, "r", encoding="utf-8") as tgt_f, \
         open(output_path, "w", encoding="utf-8") as out_f:
        # zip_longest (rather than zip) lets us notice when one file runs out
        # first; lines read from a file are never None, so None is a safe
        # end-of-file sentinel.
        for line_no, (src_line, tgt_line) in enumerate(zip_longest(src_f, tgt_f), start=1):
            if src_line is None or tgt_line is None:
                shorter = src_path if src_line is None else tgt_path
                warnings.warn(
                    f"Line count mismatch between {src_path} and {tgt_path}: "
                    f"{shorter} ended after {line_no - 1} line(s); "
                    "remaining lines of the longer file were ignored.",
                    stacklevel=2,
                )
                break

            src_text = src_line.strip()
            tgt_text = tgt_line.strip()
            if src_text and tgt_text:
                out_f.write(json.dumps({"src": src_text, "tgt": tgt_text}, ensure_ascii=False) + "\n")
                written += 1
    return written

def convert_tsv_to_jsonl(tsv_path, output_path):
    """Turn a two-column ``ru<TAB>en`` TSV file into en → ru JSONL records.

    Every input line is stripped of surrounding whitespace and split on tabs.
    Rows that do not have exactly two columns, or that contain an empty
    column, are dropped. Each remaining row is written to ``output_path`` as
    one JSON object per line, ``{"src": <en>, "tgt": <ru>}``, with non-ASCII
    characters kept as-is.

    Args:
        tsv_path: Path to the UTF-8 TSV file (first column ru, second en).
        output_path: Path of the JSONL file to create (overwritten if present).
    """
    with open(tsv_path, "r", encoding="utf-8") as reader, \
         open(output_path, "w", encoding="utf-8") as writer:
        for raw in reader:
            columns = raw.strip().split("\t")
            # Keep only well-formed rows: exactly two non-empty columns.
            if len(columns) == 2 and all(columns):
                russian, english = columns
                record = {"src": english, "tgt": russian}  # en → ru
                writer.write(json.dumps(record, ensure_ascii=False) + "\n")

In [18]:
def process_flores():
    """Convert every configured FLoRes pair to JSONL, in both directions and for both splits.

    For each (a, b) in FLORES_PAIRS, the directions a→b and b→a are exported
    for the "dev" and "devtest" splits. Input files are looked up as
    ``<flores_code>.<split>`` inside the matching split folder, and the result
    is written to ``OUTPUT_DIR/<src>_<tgt>_<split>.jsonl`` using the short
    language codes. When either input file is missing, a message is printed
    and that combination is skipped.

    Raises:
        KeyError: If a code in FLORES_PAIRS has no entry in LANG_MAP.
    """
    # Each split lives in its own folder and uses its name as the file extension.
    split_dirs = (("dev", FLORES_DEV), ("devtest", FLORES_DEVTEST))

    for first, second in FLORES_PAIRS:
        # Iterate over the short codes directly so the output name never needs
        # a reverse lookup through LANG_MAP.
        for short_src, short_tgt in ((first, second), (second, first)):
            src = LANG_MAP[short_src]
            tgt = LANG_MAP[short_tgt]

            for split, split_dir in split_dirs:
                src_path = os.path.join(split_dir, f"{src}.{split}")
                tgt_path = os.path.join(split_dir, f"{tgt}.{split}")

                out_name = f"{short_src}_{short_tgt}_{split}.jsonl"
                output_path = os.path.join(OUTPUT_DIR, out_name)

                if os.path.exists(src_path) and os.path.exists(tgt_path):
                    print(f"FLoRes: {src} ↔ {tgt} → {out_name}")
                    convert_to_jsonl(src_path, tgt_path, output_path)
                else:
                    print(f"Missing FLoRes files: {src_path} or {tgt_path}")

In [19]:
def process_wmt21():
    """Convert the WMT21 ru-en TSV file to ``en_ru_wmt21.jsonl`` in OUTPUT_DIR.

    Prints a notice and does nothing else when the TSV file at WMT21_PATH
    does not exist.
    """
    # Bail out early when the input is absent.
    if not os.path.exists(WMT21_PATH):
        print(f"⚠️ Missing WMT21 file: {WMT21_PATH}")
        return

    print(f"🔄 WMT21: Converting {WMT21_PATH} → en_ru_wmt21.jsonl")
    destination = os.path.join(OUTPUT_DIR, "en_ru_wmt21.jsonl")
    convert_tsv_to_jsonl(WMT21_PATH, destination)


In [20]:
def main():
    """Run both conversions (FLoRes-200, then WMT21) and report the output folder."""
    print("Processing FLoRes-200...")
    process_flores()
    # The configured WMT21 input is the wikititles-v3 file, i.e. the Wiki Titles
    # corpus, so the progress message names that dataset.
    print("\nProcessing WMT21 Wiki Titles...")
    process_wmt21()
    print("\n✅ All datasets processed into:", OUTPUT_DIR)

if __name__ == "__main__":
    main()

Processing FLoRes-200...
FLoRes: eng_Latn ↔ npi_Deva → en_ne_dev.jsonl
FLoRes: eng_Latn ↔ npi_Deva → en_ne_devtest.jsonl
FLoRes: npi_Deva ↔ eng_Latn → ne_en_dev.jsonl
FLoRes: npi_Deva ↔ eng_Latn → ne_en_devtest.jsonl
FLoRes: eng_Latn ↔ sin_Sinh → en_si_dev.jsonl
FLoRes: eng_Latn ↔ sin_Sinh → en_si_devtest.jsonl
FLoRes: sin_Sinh ↔ eng_Latn → si_en_dev.jsonl
FLoRes: sin_Sinh ↔ eng_Latn → si_en_devtest.jsonl
FLoRes: eng_Latn ↔ rus_Cyrl → en_ru_dev.jsonl
FLoRes: eng_Latn ↔ rus_Cyrl → en_ru_devtest.jsonl
FLoRes: rus_Cyrl ↔ eng_Latn → ru_en_dev.jsonl
FLoRes: rus_Cyrl ↔ eng_Latn → ru_en_devtest.jsonl
FLoRes: npi_Deva ↔ sin_Sinh → ne_si_dev.jsonl
FLoRes: npi_Deva ↔ sin_Sinh → ne_si_devtest.jsonl
FLoRes: sin_Sinh ↔ npi_Deva → si_ne_dev.jsonl
FLoRes: sin_Sinh ↔ npi_Deva → si_ne_devtest.jsonl

Processing WMT21 News Commentary...
🔄 WMT21: Converting C:\Users\silap\Desktop\spring24\cs7650\final_project\wmt21\wikititles-v3.ru-en.tsv → en_ru_wmt21.jsonl

✅ All datasets processed into: C:\Users\silap\