In [8]:
!pip -q install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121


ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\RoG\\anaconda3\\envs\\pythonRL\\Lib\\site-packages\\torch\\lib\\c10_cuda.dll'
Consider using the `--user` option or check the permissions.



In [1]:
import os
from pathlib import Path

# Every project path is derived from the directory the notebook is started in.
PROJECT_ROOT = Path(".").resolve()
DATA_RAW, DATA_PROCESSED, OUTPUT_DIR = (
    PROJECT_ROOT.joinpath("data", leaf) for leaf in ("raw", "processed", "outputs")
)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)  # prediction CSVs are written here

# Hub id of the base translation model.
MODEL_NAME = "Helsinki-NLP/opus-mt-hu-en"

print("\n".join(
    f"{label} {path}"
    for label, path in (
        ("PROJECT_ROOT:", PROJECT_ROOT),
        ("DATA_RAW:", DATA_RAW),
        ("DATA_PROCESSED:", DATA_PROCESSED),
        ("OUTPUT_DIR:", OUTPUT_DIR),
    )
))

PROJECT_ROOT: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal
DATA_RAW: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\data\raw
DATA_PROCESSED: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\data\processed
OUTPUT_DIR: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\data\outputs


In [3]:
import torch
# Report the installed PyTorch build and whether it can see a CUDA device.
print("\n".join(
    f"{label} {value}"
    for label, value in (
        ("Torch version:", torch.__version__),
        ("CUDA available:", torch.cuda.is_available()),
        ("CUDA version (torch):", torch.version.cuda),
    )
))


Torch version: 2.5.1+cu121
CUDA available: True
CUDA version (torch): 12.1


In [5]:
import numpy as np

# HU diacritics leak proxy (how often output still looks Hungarian)
hu_diacritics = set("áéíóöőúüűÁÉÍÓÖŐÚÜŰ")

def has_hu_diacritics(s):
    """Return True when `s` contains at least one character from `hu_diacritics`."""
    return not hu_diacritics.isdisjoint(s)


# Very small starter glossary for institutions/terms (expand later).
# Each Hungarian source term maps to the Romanian forms accepted in the output.
glossary = {
    "Bizottság": ["Comisia"],
    "Tanács": ["Consiliul"],
    "Közösség": ["Comunitatea", "Comunității"],
}

def glossary_hit(src, hyp, glossary):
    """Check the glossary terms that occur in `src` against the hypothesis `hyp`.

    Returns one boolean per glossary term found in `src` (in glossary order):
    True when at least one of its accepted forms is a substring of `hyp`.
    The list is empty when no glossary term occurs in `src`.
    """
    return [
        any(form in hyp for form in accepted)
        for term, accepted in glossary.items()
        if term in src
    ]




In [6]:
import os
import torch
import numpy as np
import pandas as pd
import sacrebleu

from tqdm import tqdm
from pathlib import Path

from huggingface_hub import snapshot_download
from safetensors.torch import save_file
from transformers import AutoTokenizer, AutoConfig, AutoModelForSeq2SeqLM
from peft import PeftModel

# -------------------------
# 0) Paths + find last checkpoint
# -------------------------
# Run directory that is searched for checkpoint-* sub-folders (adjust if needed).
RUN_DIR = Path(PROJECT_ROOT).joinpath("checkpoints", "opus_hu_ro_legal")

def get_latest_checkpoint(run_dir: Path) -> Path:
    """Return the `checkpoint-*` sub-directory of `run_dir` with the highest step number.

    The step number is the text after the last "-" in the directory name;
    directories whose suffix is not numeric sort before all numbered ones.
    When `run_dir` contains no `checkpoint-*` directory, `run_dir` itself is returned.
    """
    def step_of(path: Path) -> int:
        suffix = path.name.split("-")[-1]
        return int(suffix) if suffix.isdigit() else -1

    candidates = [entry for entry in run_dir.glob("checkpoint-*") if entry.is_dir()]
    if not candidates:
        return run_dir
    candidates.sort(key=step_of)
    return candidates[-1]

# Resolve the checkpoint to evaluate.
CKPT_DIR = get_latest_checkpoint(run_dir=RUN_DIR)
print("Using checkpoint:", CKPT_DIR)

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print("DEVICE:", DEVICE)
print("Torch:", torch.__version__)

# -------------------------
# 1) Convert BASE model to safetensors locally (if needed)
# -------------------------
def ensure_base_safetensors(model_id: str, out_root: Path) -> Path:
    """
    Create (or reuse) a local folder with config + model.safetensors for the base model.

    The .bin weights are read with torch.load and copied into a model built from
    the config, so transformers.from_pretrained is never called on the .bin file
    (avoids the torch>=2.6 gate).

    Args:
        model_id: Hub repo id of the base model.
        out_root: Parent folder; the model is stored in out_root / model_id with
            "/" replaced by "__".

    Returns:
        The folder that holds model.safetensors and config.json.

    Raises:
        FileNotFoundError: The snapshot has neither pytorch_model.bin nor shards.
        RuntimeError: The snapshot only contains sharded weights.
    """
    out_dir = out_root / model_id.replace("/", "__")
    out_dir.mkdir(parents=True, exist_ok=True)

    # If already converted, reuse
    if (out_dir / "model.safetensors").exists() and (out_dir / "config.json").exists():
        print("Base safetensors already exists:", out_dir)
        return out_dir

    print("Downloading base model snapshot:", model_id)
    snap_dir = Path(snapshot_download(repo_id=model_id))

    # Locate weights; only the single-file layout is supported by this converter.
    bin_path = snap_dir / "pytorch_model.bin"
    if not bin_path.exists():
        shards = sorted(snap_dir.glob("pytorch_model-*.bin"))
        if not shards:
            raise FileNotFoundError("Could not find pytorch_model.bin or shards in snapshot.")
        raise RuntimeError(
            f"Found sharded weights ({len(shards)} files). "
            "This quick converter handles non-sharded pytorch_model.bin. "
            "Load every shard listed in pytorch_model.bin.index.json into the model before saving."
        )

    # Build an uninitialised model from the config, then fill it from the state dict.
    config = AutoConfig.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_config(config)

    # SECURITY: a .bin file is a pickle. weights_only=True limits unpickling to
    # tensors and plain containers; still convert only checkpoints you trust.
    print("Loading base .bin weights with torch.load:", bin_path)
    state = torch.load(bin_path, map_location="cpu", weights_only=True)
    missing, unexpected = model.load_state_dict(state, strict=False)
    print(f"Loaded base weights. missing={len(missing)} unexpected={len(unexpected)}")

    # Save safetensors + config
    print("Saving base model as safetensors to:", out_dir)
    model.save_pretrained(out_dir, safe_serialization=True)
    config.save_pretrained(out_dir)

    return out_dir

# Folder that collects the safetensors copies of base models.
BASE_SAFE_ROOT = Path(PROJECT_ROOT).joinpath("safetensors_bases")
BASE_SAFE_DIR = ensure_base_safetensors(model_id=MODEL_NAME, out_root=BASE_SAFE_ROOT)

# The tokenizer is loaded from the original model id: ensure_base_safetensors
# only saves the model weights and the config into BASE_SAFE_DIR.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# -------------------------
# 2) Convert LoRA adapter_model.bin -> adapter_model.safetensors (if needed)
# -------------------------
def ensure_adapter_safetensors(ckpt_dir: Path):
    """Make sure `ckpt_dir` holds adapter_model.safetensors, converting adapter_model.bin if needed.

    Nothing is converted when the safetensors file already exists.

    Raises:
        FileNotFoundError: Neither the safetensors file nor adapter_model.bin exists.
    """
    source_bin = ckpt_dir / "adapter_model.bin"
    target_st = ckpt_dir / "adapter_model.safetensors"

    if not target_st.exists():
        if not source_bin.exists():
            raise FileNotFoundError(f"adapter_model.bin not found in {ckpt_dir}")
        print("Converting adapter bin -> safetensors:", source_bin)
        save_file(torch.load(source_bin, map_location="cpu"), str(target_st))
        print("Wrote:", target_st)
    else:
        print("Adapter safetensors already exists:", target_st)

ensure_adapter_safetensors(CKPT_DIR)

# -------------------------
# 3) Load base model from local safetensors + attach LoRA
# -------------------------
# Half precision on the GPU, library default dtype on the CPU.
base_model = AutoModelForSeq2SeqLM.from_pretrained(
    BASE_SAFE_DIR,
    torch_dtype=(torch.float16 if DEVICE == "cuda" else None),
    use_safetensors=True,
)
base_model = base_model.to(DEVICE)

model = PeftModel.from_pretrained(base_model, CKPT_DIR)
model = model.to(DEVICE)
model.eval()

print("Model device:", next(model.parameters()).device)

# -------------------------
# 4) Load test data
# -------------------------
# "hu" holds the source sentences, "ro" the reference translations.
test_df = pd.read_csv(DATA_PROCESSED.joinpath("test.csv"))
src_sentences, refs = (test_df[column].tolist() for column in ("hu", "ro"))

# -------------------------
# 5) Translate
# -------------------------
def translate_batch(model, tokenizer, sentences, batch_size=16, max_input_len=256, max_new_tokens=96, num_beams=1):
    """Translate `sentences` in fixed-size batches and return the decoded hypotheses.

    Args:
        model: Model exposing `generate` and `parameters`.
        tokenizer: Tokenizer used both to encode the batch and to decode the output.
        sentences: List of source strings.
        batch_size: Number of sentences per `generate` call.
        max_input_len: Inputs are truncated to this many tokens.
        max_new_tokens: Generation length limit.
        num_beams: 1 for greedy decoding, >1 for beam search.

    Returns:
        One decoded string per input sentence, in input order.
    """
    # Use the device the model actually lives on rather than a notebook-level
    # global, so the function keeps working when cells are re-run out of order.
    device = next(model.parameters()).device

    generate_kwargs = dict(num_beams=num_beams, do_sample=False, max_new_tokens=max_new_tokens)
    if num_beams > 1:
        # early_stopping is a beam-search option; only pass it when beam search is requested.
        generate_kwargs["early_stopping"] = True

    hyps = []
    for i in tqdm(range(0, len(sentences), batch_size)):
        batch = sentences[i:i+batch_size]
        inputs = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_input_len
        ).to(device)

        with torch.inference_mode():
            out = model.generate(**inputs, **generate_kwargs)
        hyps.extend(tokenizer.batch_decode(out, skip_special_tokens=True))
    return hyps

# Greedy for speed (set num_beams=4 for final run)
finetuned_hyps = translate_batch(
    model,
    tokenizer,
    src_sentences,
    batch_size=(16 if DEVICE == "cuda" else 4),
    max_new_tokens=96,
    num_beams=1,
)

# -------------------------
# 6) Metrics
# -------------------------
bleu_ft = sacrebleu.corpus_bleu(finetuned_hyps, [refs]).score
print("Finetuned BLEU:", round(bleu_ft, 2))

# Share of hypotheses that still contain a Hungarian diacritic.
leak_rate_ft = np.mean(list(map(has_hu_diacritics, finetuned_hyps)))
print("HU diacritics leak rate (FT):", round(leak_rate_ft*100, 2), "%")

# One list of glossary checks per sentence, then flattened for the overall rate.
all_hits_ft = [
    glossary_hit(source, hypothesis, glossary)
    for source, hypothesis in zip(src_sentences, finetuned_hyps)
]
flat_ft = [hit for sentence_hits in all_hits_ft for hit in sentence_hits]
if not flat_ft:
    print("No glossary terms found in sample.")
else:
    print("Glossary accuracy (FT):", round(np.mean(flat_ft)*100, 2), "%")

# -------------------------
# 7) Save outputs
# -------------------------
out_path = Path(OUTPUT_DIR).joinpath(f"finetuned_predictions_lora_{CKPT_DIR.name}.csv")
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
pd.DataFrame(
    {"source": src_sentences, "reference": refs, "hypothesis": finetuned_hyps}
).to_csv(out_path, index=False)
print("Saved:", out_path)


Using checkpoint: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\checkpoints\opus_hu_ro_legal\checkpoint-18750
DEVICE: cuda
Torch: 2.5.1+cu121
Base safetensors already exists: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\safetensors_bases\Helsinki-NLP__opus-mt-hu-en




Adapter safetensors already exists: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\checkpoints\opus_hu_ro_legal\checkpoint-18750\adapter_model.safetensors
Model device: cuda:0


100%|██████████| 1898/1898 [30:52<00:00,  1.02it/s]


Finetuned BLEU: 10.01
HU diacritics leak rate (FT): 0.81 %
Glossary accuracy (FT): 36.38 %
Saved: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\data\outputs\finetuned_predictions_lora_checkpoint-18750.csv


In [7]:
# FULL IMPROVED EVAL + PREDICTION CODE (Option C: safetensors; no torch>=2.6 needed)
# - Loads latest LoRA checkpoint
# - Uses safetensors base + adapter (converts if needed; handles sharded base weights)
# - Fast greedy decode + optional beam search sweep for best BLEU
# - Computes BLEU, HU-diacritics leak, English-leak proxy, normalized glossary accuracy
# - Saves predictions CSVs

import os, re, json, unicodedata
import torch
import numpy as np
import pandas as pd
import sacrebleu

from tqdm import tqdm
from pathlib import Path

from huggingface_hub import snapshot_download
from safetensors.torch import save_file
from transformers import AutoTokenizer, AutoConfig, AutoModelForSeq2SeqLM
from peft import PeftModel

# -------------------------
# 0) CONFIG (SET THESE)
# -------------------------
# Base model you used during fine-tuning:
MODEL_NAME = "Helsinki-NLP/opus-mt-hu-en"   # <-- keep consistent with training

# Checkpoint run directory, prediction output folder, safetensors cache folder.
RUN_DIR = Path(PROJECT_ROOT).joinpath("checkpoints", "opus_hu_ro_legal")
OUT_DIR = Path(OUTPUT_DIR)
BASE_SAFE_ROOT = Path(PROJECT_ROOT).joinpath("safetensors_bases")

# decode defaults
MAX_INPUT_LEN, MAX_NEW_TOKENS = 256, 96

if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print("DEVICE:", DEVICE)
print("Torch:", torch.__version__)

# Disable TF32 for both matmul and cuDNN kernels.
torch.backends.cuda.matmul.allow_tf32 = torch.backends.cudnn.allow_tf32 = False
# NOTE(review): this allocator setting is presumably only read when the CUDA
# allocator initialises; if an earlier cell already used the GPU it may have
# no effect -- confirm.
os.environ.update(PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True")

# -------------------------
# 1) Find latest checkpoint
# -------------------------
def get_latest_checkpoint(run_dir: Path) -> Path:
    """Pick the newest `checkpoint-*` directory inside `run_dir`.

    "Newest" means the largest integer after the last "-" in the directory name;
    a non-numeric suffix counts as -1. Falls back to `run_dir` itself when there
    is no `checkpoint-*` directory.
    """
    def numeric_suffix(ckpt: Path) -> int:
        tail = ckpt.name.rsplit("-", 1)[-1]
        if tail.isdigit():
            return int(tail)
        return -1

    ordered = sorted(filter(Path.is_dir, run_dir.glob("checkpoint-*")), key=numeric_suffix)
    return ordered[-1] if ordered else run_dir

# Checkpoint whose LoRA adapter is evaluated below.
CKPT_DIR = get_latest_checkpoint(run_dir=RUN_DIR)
print("Using checkpoint:", CKPT_DIR)

# -------------------------
# 2) Option C: ensure BASE safetensors exists (supports sharded weights)
# -------------------------
def ensure_base_safetensors(model_id: str, out_root: Path) -> Path:
    """Create (or reuse) a local safetensors copy of the base model.

    The weights are read from the Hub snapshot with torch.load -- either a single
    pytorch_model.bin or the shards listed in pytorch_model.bin.index.json --
    copied into a model built from the config, and saved with
    safe_serialization=True.

    Args:
        model_id: Hub repo id of the base model.
        out_root: Parent folder; the model is stored in out_root / model_id with
            "/" replaced by "__".

    Returns:
        The folder that holds model.safetensors and config.json.

    Raises:
        FileNotFoundError: No weight file / index was found, or a shard listed in
            the index is missing.
    """
    out_dir = out_root / model_id.replace("/", "__")
    out_dir.mkdir(parents=True, exist_ok=True)

    # Already converted on a previous run: reuse it.
    if (out_dir / "model.safetensors").exists() and (out_dir / "config.json").exists():
        print("Base safetensors already exists:", out_dir)
        return out_dir

    print("Downloading base model snapshot:", model_id)
    snap_dir = Path(snapshot_download(
        repo_id=model_id,
        allow_patterns=[
            "config.json",
            "tokenizer.json",
            "tokenizer_config.json",
            "source.spm",
            "vocab.json", "merges.txt",
            "pytorch_model.bin",
            "pytorch_model.bin.index.json",
            "pytorch_model-*.bin",
        ],
        ignore_patterns=["*.msgpack", "*.h5", "*.ot", "*.tflite", "*.onnx"]
    ))

    config = AutoConfig.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_config(config)

    # Case 1: single weight file
    bin_path = snap_dir / "pytorch_model.bin"
    if bin_path.exists():
        print("Found single pytorch_model.bin; loading with torch.load:", bin_path)
        weight_files = [bin_path]
    else:
        # Case 2: sharded weights (index.json + shard bins)
        index_path = snap_dir / "pytorch_model.bin.index.json"
        if not index_path.exists():
            shards = sorted(snap_dir.glob("pytorch_model-*.bin"))
            raise FileNotFoundError(
                f"Could not find pytorch_model.bin or pytorch_model.bin.index.json.\nFound {len(shards)} shard files."
            )

        with open(index_path, "r", encoding="utf-8") as f:
            index = json.load(f)
        shard_files = sorted(set(index["weight_map"].values()))
        print(f"Found sharded weights: {len(shard_files)} shards")
        weight_files = [snap_dir / sf for sf in shard_files]

    # strict=False is required because a shard only holds part of the weights,
    # so the keys are tracked across all files and summarised afterwards instead
    # of being dropped silently.
    loaded_keys = set()
    unexpected_keys = []
    for weight_file in weight_files:
        if not weight_file.exists():
            raise FileNotFoundError(f"Missing shard file: {weight_file}")
        # SECURITY: a .bin file is a pickle. weights_only=True limits unpickling
        # to tensors and plain containers; still convert only checkpoints you trust.
        state = torch.load(weight_file, map_location="cpu", weights_only=True)
        result = model.load_state_dict(state, strict=False)
        loaded_keys.update(state.keys())
        unexpected_keys.extend(result.unexpected_keys)
        del state

    missing_keys = [key for key in model.state_dict() if key not in loaded_keys]
    print(f"Loaded base weights. missing={len(missing_keys)} unexpected={len(unexpected_keys)}")

    print("Saving base model as safetensors to:", out_dir)
    model.save_pretrained(out_dir, safe_serialization=True)
    config.save_pretrained(out_dir)
    return out_dir

# Local folder with the safetensors copy of the base model (converted on first use).
BASE_SAFE_DIR = ensure_base_safetensors(model_id=MODEL_NAME, out_root=BASE_SAFE_ROOT)

# -------------------------
# 3) Ensure adapter safetensors exists
# -------------------------
def ensure_adapter_safetensors(ckpt_dir: Path):
    """Guarantee that `ckpt_dir` contains adapter_model.safetensors.

    An existing safetensors file is left alone; otherwise adapter_model.bin is
    read with torch.load and re-saved as safetensors next to it.

    Raises:
        FileNotFoundError: Neither file exists in `ckpt_dir`.
    """
    st_file = ckpt_dir.joinpath("adapter_model.safetensors")
    if st_file.exists():
        print("Adapter safetensors already exists:", st_file)
        return None

    bin_file = ckpt_dir.joinpath("adapter_model.bin")
    if not bin_file.exists():
        raise FileNotFoundError(f"adapter_model.bin not found in {ckpt_dir}")

    print("Converting adapter bin -> safetensors:", bin_file)
    tensors = torch.load(bin_file, map_location="cpu")
    save_file(tensors, str(st_file))
    del tensors  # release the CPU copy once it is written
    print("Wrote:", st_file)

ensure_adapter_safetensors(CKPT_DIR)

# -------------------------
# 4) Load tokenizer + base (safetensors) + attach LoRA
# -------------------------
# Tokenizer from HF id (safe)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Half precision on the GPU, library default dtype on the CPU.
base_model = AutoModelForSeq2SeqLM.from_pretrained(
    BASE_SAFE_DIR,
    torch_dtype=(torch.float16 if DEVICE == "cuda" else None),
    use_safetensors=True,
)
base_model = base_model.to(DEVICE)

model = PeftModel.from_pretrained(base_model, CKPT_DIR)
model = model.to(DEVICE)
model.eval()
print("Model device:", next(model.parameters()).device)

# -------------------------
# 5) Load test data
# -------------------------
# "hu" holds the source sentences, "ro" the reference translations.
test_df = pd.read_csv(Path(DATA_PROCESSED).joinpath("test.csv"))
src_sentences, refs = (test_df[column].tolist() for column in ("hu", "ro"))
print("Test size:", len(src_sentences))

# -------------------------
# 6) Metrics helpers
# -------------------------
# Accented letters used as a cheap "output still looks Hungarian" signal.
hu_diacritics = set("áéíóöőúüűÁÉÍÓÖŐÚÜŰ")
def has_hu_diacritics(s: str) -> bool:
    """Return True as soon as a character of `s` is found in `hu_diacritics`."""
    for ch in s:
        if ch in hu_diacritics:
            return True
    return False

def en_leak(s: str) -> bool:
    """Proxy for English leaking into the output.

    The text is lower-cased, whitespace runs are collapsed to one space and the
    result is padded with spaces; True is returned when it contains one of a few
    common English function words as a space-delimited token.
    """
    collapsed = re.sub(r"\s+", " ", s.lower())
    padded = " " + collapsed + " "
    for marker in (" the ", " and ", " of ", " to ", " for ", " with ", " on "):
        if marker in padded:
            return True
    return False

def norm_ro(s: str) -> str:
    """Normalise Romanian text for case- and diacritic-variant-insensitive matching.

    Steps: lower-case, Unicode NFKC normalisation, then map the cedilla letters
    (ţ, ş) to the comma-below letters (ț, ș).

    The replacement runs after NFKC on purpose: NFKC composes "t"/"s" followed by
    a combining cedilla into ţ/ş, so replacing first would leave those
    decomposed inputs un-unified.
    """
    s = unicodedata.normalize("NFKC", s.lower())
    return s.replace("ţ", "ț").replace("ş", "ș")  # unify diacritics variants

# The glossary must already exist as: glossary = { "Bizottság": ["Comisia", ...], ... }
# Keys and accepted forms are passed through norm_ro for fairer matching.
glossary_norm = {
    norm_ro(term): list(map(norm_ro, forms))
    for term, forms in glossary.items()
}

def glossary_hit(src: str, hyp: str, glos_norm: dict) -> list:
    """Check every glossary term present in `src` against the hypothesis `hyp`.

    Both texts are normalised with norm_ro first. The result has one boolean per
    term of `glos_norm` found in the source: True when one of its accepted forms
    is a substring of the normalised hypothesis.
    """
    src_n, hyp_n = norm_ro(src), norm_ro(hyp)
    return [
        any(form in hyp_n for form in ro_forms)
        for src_term, ro_forms in glos_norm.items()
        if src_term in src_n
    ]

# -------------------------
# 7) Fast translation + OOM-safe batch fallback
# -------------------------
def translate_batch(
    model, tokenizer, sentences,
    batch_size=16, max_input_len=256, max_new_tokens=96,
    num_beams=1, length_penalty=1.0
):
    """Translate `sentences` batch by batch, halving the batch size on CUDA OOM.

    Args:
        model: Model exposing `generate` and `parameters`.
        tokenizer: Tokenizer used both to encode the batch and to decode the output.
        sentences: List of source strings.
        batch_size: Initial number of sentences per `generate` call.
        max_input_len: Inputs are truncated to this many tokens.
        max_new_tokens: Generation length limit.
        num_beams: 1 for greedy decoding, >1 for beam search.
        length_penalty: Forwarded to `generate`.

    Returns:
        One decoded string per input sentence, in input order.

    Raises:
        ValueError: `batch_size` is smaller than 1 (the loop could never advance).
        torch.cuda.OutOfMemoryError: Out of memory even with a batch size of 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Use the device the model actually lives on rather than a notebook-level
    # global, so the function keeps working when cells are re-run out of order.
    device = next(model.parameters()).device

    generate_kwargs = dict(
        num_beams=num_beams,
        length_penalty=length_penalty,
        do_sample=False,
        max_new_tokens=max_new_tokens,
    )
    if num_beams > 1:
        # early_stopping is a beam-search option; only pass it when beam search is requested.
        generate_kwargs["early_stopping"] = True

    hyps = []
    i = 0
    while i < len(sentences):
        bs = min(batch_size, len(sentences) - i)
        batch = sentences[i:i+bs]
        try:
            inputs = tokenizer(
                batch,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max_input_len
            ).to(device)

            with torch.inference_mode():
                out = model.generate(**inputs, **generate_kwargs)

            hyps.extend(tokenizer.batch_decode(out, skip_special_tokens=True))
            i += bs

        except torch.cuda.OutOfMemoryError:
            torch.cuda.empty_cache()
            if batch_size <= 1:
                raise  # cannot shrink any further
            # `i` is unchanged, so the same sentences are retried with a smaller batch.
            batch_size = max(1, batch_size // 2)
            print(f"OOM -> reducing batch_size to {batch_size} and retrying...")

    return hyps

# -------------------------
# 8) Greedy decode (fast) + metrics
# -------------------------
def _quality_stats(hyps):
    """Compute the statistics shared by the greedy and the beam report.

    Returns a tuple: (HU-diacritics leak %, English-leak proxy %, per-sentence
    glossary checks, flattened glossary checks, glossary accuracy % -- nan when
    no glossary term occurred in the sources).
    """
    leak_hu = np.mean([has_hu_diacritics(h) for h in hyps]) * 100
    leak_en = np.mean([en_leak(h) for h in hyps]) * 100
    hits = [glossary_hit(s, h, glossary_norm) for s, h in zip(src_sentences, hyps)]
    flat = [x for row in hits for x in row]
    glos = (np.mean(flat) * 100) if flat else float("nan")
    return leak_hu, leak_en, hits, flat, glos


def _print_metrics(bleu, leak_hu, leak_en, glos):
    """Print the four metric lines of a report."""
    print("BLEU:", round(bleu, 2))
    print("HU diacritics leak %:", round(leak_hu, 2))
    print("English leak proxy %:", round(leak_en, 2))
    print("Glossary accuracy %:", round(glos, 2) if not np.isnan(glos) else "(no glossary terms found)")


def _save_predictions(hyps, path):
    """Write source / reference / hypothesis triples to `path` as CSV."""
    pd.DataFrame({"source": src_sentences, "reference": refs, "hypothesis": hyps}).to_csv(path, index=False)
    print("Saved:", path)


# -------------------------
# 8) Greedy decode (fast) + metrics
# -------------------------
greedy_bs = 16 if DEVICE == "cuda" else 4
finetuned_hyps = translate_batch(
    model, tokenizer, src_sentences,
    batch_size=greedy_bs,
    max_input_len=MAX_INPUT_LEN,
    max_new_tokens=MAX_NEW_TOKENS,
    num_beams=1,
    length_penalty=1.0
)

bleu_g = sacrebleu.corpus_bleu(finetuned_hyps, [refs]).score
leak_hu_g, leak_en_g, hits_g, flat_g, glos_g = _quality_stats(finetuned_hyps)

print("\n=== GREEDY RESULTS ===")
_print_metrics(bleu_g, leak_hu_g, leak_en_g, glos_g)

OUT_DIR.mkdir(parents=True, exist_ok=True)
out_path_g = OUT_DIR / f"finetuned_predictions_lora_{CKPT_DIR.name}_greedy.csv"
_save_predictions(finetuned_hyps, out_path_g)

# -------------------------
# 9) Optional: Beam search sweep for best BLEU (useful for final reporting)
# -------------------------
# The sweep decodes the whole test set once per length penalty; set this to
# False to stop after the greedy report.
RUN_BEAM_SWEEP = True

if RUN_BEAM_SWEEP:
    # Try a few length penalties; keep best BLEU.
    beam_bs = 8 if DEVICE == "cuda" else 2
    cands = []
    for lp in [0.8, 1.0, 1.2]:
        hyps_b = translate_batch(
            model, tokenizer, src_sentences,
            batch_size=beam_bs,
            max_input_len=MAX_INPUT_LEN,
            max_new_tokens=128,
            num_beams=4,
            length_penalty=lp
        )
        bleu_b = sacrebleu.corpus_bleu(hyps_b, [refs]).score
        cands.append((bleu_b, lp, hyps_b))
        print(f"Beam4 lp={lp} BLEU={bleu_b:.2f}")

    best_bleu, best_lp, best_hyps = max(cands, key=lambda x: x[0])
    leak_hu_b, leak_en_b, hits_b, flat_b, glos_b = _quality_stats(best_hyps)

    print("\n=== BEST BEAM RESULTS ===")
    print("Best lp:", best_lp)
    _print_metrics(best_bleu, leak_hu_b, leak_en_b, glos_b)

    out_path_b = OUT_DIR / f"finetuned_predictions_lora_{CKPT_DIR.name}_beam4_lp{best_lp}.csv"
    _save_predictions(best_hyps, out_path_b)


'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: 2183fc6f-a243-4f41-99b8-f34aac5e98f7)')' thrown while requesting HEAD https://huggingface.co/Helsinki-NLP/opus-mt-hu-en/resolve/main/tokenizer_config.json
Retrying in 1s [Retry 1/5].


DEVICE: cuda
Torch: 2.5.1+cu121
Using checkpoint: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\checkpoints\opus_hu_ro_legal\checkpoint-18750
Base safetensors already exists: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\safetensors_bases\Helsinki-NLP__opus-mt-hu-en
Adapter safetensors already exists: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\checkpoints\opus_hu_ro_legal\checkpoint-18750\adapter_model.safetensors




Model device: cuda:0
Test size: 30366

=== GREEDY RESULTS ===
BLEU: 10.01
HU diacritics leak %: 0.81
English leak proxy %: 6.45
Glossary accuracy %: 40.57
Saved: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\data\outputs\finetuned_predictions_lora_checkpoint-18750_greedy.csv


KeyboardInterrupt: 

In [8]:
import os, gc, math, re
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
from peft import LoraConfig, get_peft_model, TaskType

# -------------------------
# 0) Pick best available HU<->RO model (prefer HU->RO)
# -------------------------
# Candidate base models in order of preference (HU->RO first).
CANDIDATES = [
    "Helsinki-NLP/opus-mt-hu-ro",
    "Helsinki-NLP/opus-mt-ro-hu",
]

# A candidate counts as available when its tokenizer can be loaded.
chosen = None
for m in CANDIDATES:
    try:
        AutoTokenizer.from_pretrained(m)
    except Exception as e:
        # Print the first line of the message as well as the type: a dropped
        # connection and a repo that does not exist can raise the same exception
        # type, and only the message tells them apart.
        print(
            "Not available:", m, "|", type(e).__name__, "|",
            (str(e).strip().splitlines() or ["(no message)"])[0],
        )
    else:
        chosen = m
        break

if chosen is None:
    raise RuntimeError("No HU<->RO OPUS model available. You must use a pivot or different base.")

MODEL_NAME = chosen
print("Chosen model:", MODEL_NAME)

# True when the selected base model translates RO->HU, i.e. against the HU->RO task.
REVERSE = (MODEL_NAME.endswith("ro-hu"))
print("REVERSE (means base is RO->HU):", REVERSE)

# -------------------------
# 1) Environment / memory settings (Windows + 6GB friendly)
# -------------------------
# Pick the device and configure the CUDA allocator.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
os.environ.update(PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True")

# Release Python garbage first, then cached GPU memory left over from earlier cells.
gc.collect()
if DEVICE == "cuda":
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

# -------------------------
# 2) Load data
# -------------------------
# Full training and validation tables from the processed-data folder.
train_path, val_path = (DATA_PROCESSED / name for name in ("train.csv", "val.csv"))
train_full, val_full = map(pd.read_csv, (train_path, val_path))

# -------------------------
# 3) Curriculum: oversample "legal-heavy" lines early
#    (helps terminology + reduces hallucinated institutions)
# -------------------------
# Substrings that mark a sentence as "legal-heavy", per language.
LEGAL_MARKERS_RO = [
    "Regulamentul", "Directiva", "articol", "alineat", "considerent",
    "Comisia", "Consiliul", "Parlamentul", "Uniunii", "statele membre"
]
LEGAL_MARKERS_HU = [
    "rendelet", "irányelv", "cikk", "bekezdés",
    "Bizottság", "Tanács", "Parlament", "Unió", "tagállam"
]

def is_legalish(row):
    """Return True when the row's "hu" or "ro" text contains a legal marker.

    Both fields are converted with str() first, so non-string values are
    compared through their string form. Matching is case-sensitive.
    """
    texts_and_markers = (
        (str(row["hu"]), LEGAL_MARKERS_HU),
        (str(row["ro"]), LEGAL_MARKERS_RO),
    )
    return any(
        marker in text
        for text, markers in texts_and_markers
        for marker in markers
    )

# pick sizes (adjust)
train_n = min(50000, len(train_full))
val_n   = min(2000, len(val_full))

train_sample = train_full.sample(train_n, random_state=42)

# Curriculum chunk: every legal-looking row plus a random slice of the sample.
# De-duplicated so a row that is both legal-looking and randomly drawn counts once.
legal_part = train_sample[train_sample.apply(is_legalish, axis=1)]
rand_part  = train_sample.sample(min(len(train_sample), max(5000, train_n // 5)), random_state=43)
cur_df = pd.concat([legal_part, rand_part], ignore_index=True).drop_duplicates()

# Final train set = curriculum chunk + the whole sample. Duplicates are kept on
# purpose: every row of cur_df also occurs in train_sample, so dropping
# duplicates here would collapse the result back to train_sample and undo the
# oversampling of legal-heavy rows.
train_df = pd.concat([cur_df, train_sample], ignore_index=True)
val_df   = val_full.head(val_n)

print("Train size:", len(train_df), "| curriculum chunk:", len(cur_df), "| Val size:", len(val_df))

train_ds = Dataset.from_pandas(train_df, preserve_index=False)
val_ds   = Dataset.from_pandas(val_df, preserve_index=False)

# -------------------------
# 4) Tokenizer + preprocessing (correct direction)
# -------------------------
# Source and target sequences are both truncated to this many tokens.
max_src_len = max_tgt_len = 256

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def preprocess(examples):
    """Tokenize one batch of examples for HU->RO training.

    The source is always the "hu" column and the target always the "ro" column,
    whichever direction the base model was pretrained for; no columns are
    swapped here.

    Returns the tokenized source with the target token ids stored under "labels".
    """
    model_inputs = tokenizer(examples["hu"], truncation=True, max_length=max_src_len)
    labels = tokenizer(text_target=examples["ro"], truncation=True, max_length=max_tgt_len)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize both splits; the raw text columns are dropped afterwards.
train_tok = train_ds.map(preprocess, batched=True, remove_columns=train_ds.column_names)
val_tok   = val_ds.map(preprocess, batched=True, remove_columns=val_ds.column_names)

# -------------------------
# 5) Load model + training stability tweaks
# -------------------------
config = AutoConfig.from_pretrained(MODEL_NAME)
# Keep the weights in the default (fp32) dtype for training. Mixed precision is
# requested through fp16=... in the training arguments; the gradient scaler used
# for fp16 training cannot unscale gradients of fp16 parameters ("Attempting to
# unscale FP16 gradients"), which is what any trainable fp16 weight would
# produce -- in particular in the full fine-tune fallback.
base_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(DEVICE)

# Memory saver + speed tradeoff
base_model.config.use_cache = False
if hasattr(base_model, "gradient_checkpointing_enable"):
    base_model.gradient_checkpointing_enable()

# -------------------------
# 6) LoRA tuned for Marian (safe targeting)
#    Marian usually has q_proj/v_proj in attention modules in newer versions,
#    but if your build doesn't, we gracefully fall back to full fine-tune.
# -------------------------
# LoRA adapters on the attention query/value projections.
lora_config = LoraConfig(
    target_modules=["q_proj", "v_proj"],
    task_type=TaskType.SEQ_2_SEQ_LM,
    lora_dropout=0.05,
    lora_alpha=16,
    r=8,
)

# Default to full fine-tuning; switch to LoRA only when wrapping succeeds.
model, using_lora = base_model, False
try:
    model = get_peft_model(base_model, lora_config)
    if hasattr(model, "enable_input_require_grads"):
        model.enable_input_require_grads()
except Exception as e:
    print("LoRA not compatible here; training full model instead:", type(e).__name__, e)
    model = base_model
else:
    using_lora = True
    print("Using LoRA.")

data_collator = DataCollatorForSeq2Seq(model=model, tokenizer=tokenizer)

# -------------------------
# 7) Training arguments tuned for legal MT on small GPU
# -------------------------
# 6GB-safe defaults:
# 6GB-safe defaults; per-device batch * accumulation steps gives an effective batch of ~16.
if DEVICE == "cuda":
    batch_size, grad_accum = 4, 4
else:
    batch_size, grad_accum = 2, 8
num_train_epochs = 2  # start 2; go to 3 if still improving

args = Seq2SeqTrainingArguments(
    output_dir=str(PROJECT_ROOT.joinpath("checkpoints", "opus_hu_ro_legal_direct")),
    # batching
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=grad_accum,
    dataloader_num_workers=0,  # Windows-safe
    # optimisation (LoRA can use a higher LR and runs without weight decay)
    learning_rate=(3e-4 if using_lora else 5e-5),
    weight_decay=(0.0 if using_lora else 0.01),
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    num_train_epochs=num_train_epochs,
    fp16=(DEVICE == "cuda"),
    # MT quality knob: helps robustness + reduces overconfident weird tokens
    label_smoothing_factor=0.1,
    # evaluation / checkpointing / logging
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    save_total_limit=2,
    logging_steps=100,
    report_to="none",
    # speed/memory
    predict_with_generate=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_tok,
    eval_dataset=val_tok,
)

trainer.train()
print("Done. Saved to:", args.output_dir)


'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: ecb16652-d4dc-423d-be78-350d7120b966)')' thrown while requesting HEAD https://huggingface.co/Helsinki-NLP/opus-mt-hu-ro/resolve/main/tokenizer_config.json
Retrying in 1s [Retry 1/5].


Not available: Helsinki-NLP/opus-mt-hu-ro | OSError
Not available: Helsinki-NLP/opus-mt-ro-hu | OSError


RuntimeError: No HU<->RO OPUS model available. You must use a pivot or different base.

In [12]:
import os, gc, json
import torch
import pandas as pd
from tqdm import tqdm
from pathlib import Path

from huggingface_hub import snapshot_download
from transformers import AutoTokenizer, AutoConfig, AutoModelForSeq2SeqLM

# -------------------------
# Settings
# -------------------------
# Device selection and allocator configuration.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
os.environ.update(PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True")
print("DEVICE:", DEVICE)
print("Torch:", torch.__version__)

# Release Python garbage first, then cached GPU memory left over from earlier cells.
gc.collect()
if DEVICE == "cuda":
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

HU_EN_BASE = "Helsinki-NLP/opus-mt-hu-en"  # strong + small enough

# Folders for the converted base model and for the generated pseudo data.
BASE_SAFE_ROOT = Path(PROJECT_ROOT).joinpath("safetensors_bases")
BASE_SAFE_ROOT.mkdir(parents=True, exist_ok=True)

OUT_PSEUDO_DIR = Path(PROJECT_ROOT).joinpath("data", "pseudo")
OUT_PSEUDO_DIR.mkdir(parents=True, exist_ok=True)

# Use a subset first if needed (speed)
TRAIN_N, VAL_N = 50000, 2000

# -------------------------
# Option C helper: ensure base safetensors exists (handles sharded)
# -------------------------
def ensure_base_safetensors(model_id: str, out_root: Path) -> Path:
    """Create (or reuse) a local safetensors copy of the base model.

    The weights are read from the Hub snapshot with torch.load -- either a single
    pytorch_model.bin or the shards listed in pytorch_model.bin.index.json --
    copied into a model built from the config, and saved with
    safe_serialization=True.

    Args:
        model_id: Hub repo id of the base model.
        out_root: Parent folder; the model is stored in out_root / model_id with
            "/" replaced by "__".

    Returns:
        The folder that holds model.safetensors and config.json.

    Raises:
        FileNotFoundError: No weight file / index was found, or a shard listed in
            the index is missing.
    """
    out_dir = out_root / model_id.replace("/", "__")
    out_dir.mkdir(parents=True, exist_ok=True)

    # Already converted on a previous run: reuse it.
    if (out_dir / "model.safetensors").exists() and (out_dir / "config.json").exists():
        print("Base safetensors exists:", out_dir)
        return out_dir

    print(f"\n[Convert] Downloading snapshot for: {model_id}")
    snap_dir = Path(snapshot_download(
        repo_id=model_id,
        allow_patterns=[
            "config.json",
            "tokenizer.json",
            "tokenizer_config.json",
            "source.spm",
            "vocab.json", "merges.txt",
            "pytorch_model.bin",
            "pytorch_model.bin.index.json",
            "pytorch_model-*.bin",
        ],
        ignore_patterns=["*.msgpack", "*.h5", "*.ot", "*.tflite", "*.onnx"]
    ))

    config = AutoConfig.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_config(config)

    # strict=False is required because a shard only holds part of the weights,
    # so the keys are tracked across all files and summarised afterwards instead
    # of being dropped silently.
    loaded_keys = set()
    unexpected_keys = []

    def load_weights(path):
        # SECURITY: a .bin file is a pickle. weights_only=True limits unpickling
        # to tensors and plain containers; still convert only checkpoints you trust.
        state = torch.load(path, map_location="cpu", weights_only=True)
        result = model.load_state_dict(state, strict=False)
        loaded_keys.update(state.keys())
        unexpected_keys.extend(result.unexpected_keys)

    bin_path = snap_dir / "pytorch_model.bin"
    if bin_path.exists():
        print("[Convert] Found single pytorch_model.bin -> torch.load")
        load_weights(bin_path)
    else:
        index_path = snap_dir / "pytorch_model.bin.index.json"
        if not index_path.exists():
            raise FileNotFoundError("Could not find pytorch_model.bin or pytorch_model.bin.index.json")

        with open(index_path, "r", encoding="utf-8") as f:
            index = json.load(f)
        shard_files = sorted(set(index["weight_map"].values()))
        print(f"[Convert] Found sharded weights: {len(shard_files)} shards")

        for sf in tqdm(shard_files, desc=f"[Convert] Loading shards {model_id.split('/')[-1]}"):
            sp = snap_dir / sf
            if not sp.exists():
                raise FileNotFoundError(f"Missing shard file: {sp}")
            load_weights(sp)

    missing_keys = [key for key in model.state_dict() if key not in loaded_keys]
    print(f"[Convert] Loaded base weights. missing={len(missing_keys)} unexpected={len(unexpected_keys)}")

    print("[Convert] Saving safetensors ->", out_dir)
    model.save_pretrained(out_dir, safe_serialization=True)
    config.save_pretrained(out_dir)
    return out_dir

# Local folder with the safetensors copy of the HU->EN base model (converted on first use).
HU_EN_SAFE = ensure_base_safetensors(model_id=HU_EN_BASE, out_root=BASE_SAFE_ROOT)

# -------------------------
# Translate with progress + OOM-safe batch shrink
# -------------------------
def translate(model, tok, texts, bs=16, max_new=128, num_beams=1):
    """Translate ``texts`` in batches, shrinking the batch on CUDA out-of-memory.

    Args:
        model: seq2seq model exposing ``generate``; it is put in eval mode here.
        tok: tokenizer matching ``model``.
        texts: list of source sentences.
        bs: initial batch size; reduced after every out-of-memory error and
            kept at the reduced value for the remaining batches.
        max_new: value passed to ``generate`` as ``max_new_tokens``.
        num_beams: beam width passed to ``generate``.

    Returns:
        List of decoded translations, one per input sentence, in input order.

    Raises:
        torch.cuda.OutOfMemoryError: when a batch of a single sentence still
            does not fit in memory, since retrying it could never succeed.
    """
    model.eval()
    outs = []
    i = 0
    # Context manager so the progress bar is closed even if an error escapes.
    with tqdm(total=len(texts), desc="HU→EN pseudo", unit="sent") as pbar:
        while i < len(texts):
            cur_bs = min(bs, len(texts) - i)
            batch = texts[i:i+cur_bs]
            try:
                inp = tok(batch, return_tensors="pt", padding=True, truncation=True, max_length=256).to(DEVICE)
                with torch.inference_mode():
                    gen = model.generate(**inp, num_beams=num_beams, max_new_tokens=max_new, do_sample=False)
                outs.extend(tok.batch_decode(gen, skip_special_tokens=True))
            except torch.cuda.OutOfMemoryError:
                torch.cuda.empty_cache()
                if cur_bs == 1:
                    # Nothing left to shrink: re-raise instead of retrying the
                    # same sentence forever.
                    raise
                # Halve the size that actually failed so every retry is
                # strictly smaller than the previous attempt.
                bs = max(1, cur_bs // 2)
                print(f"⚠ OOM -> reducing batch size to {bs}")
                continue
            i += cur_bs
            pbar.update(cur_bs)
    return outs

# -------------------------
# Load data
# -------------------------
# Training subset: a reproducible random sample of TRAIN_N rows (seed 42).
# Validation subset: simply the first VAL_N rows of val.csv.
# NOTE(review): .sample(TRAIN_N) raises ValueError if train.csv holds fewer
# than TRAIN_N rows — confirm the dataset is always large enough.
train_df = pd.read_csv(DATA_PROCESSED / "train.csv").sample(TRAIN_N, random_state=42)
val_df   = pd.read_csv(DATA_PROCESSED / "val.csv").head(VAL_N)

print("Train size:", len(train_df), "Val size:", len(val_df))

# -------------------------
# Load tokenizer + model from safetensors folder
# -------------------------
# The tokenizer is resolved through HU_EN_BASE, not the HU_EN_SAFE folder;
# only the model weights come from the local safetensors conversion.
tok_hu_en = AutoTokenizer.from_pretrained(HU_EN_BASE, use_fast=True)
# Half precision on GPU; library default dtype on CPU.
model_hu_en = AutoModelForSeq2SeqLM.from_pretrained(
    HU_EN_SAFE,
    use_safetensors=True,
    torch_dtype=torch.float16 if DEVICE == "cuda" else None
).to(DEVICE)

# -------------------------
# Generate pseudo EN
# -------------------------
# Single-beam decoding of the "hu" column for both splits.
train_en = translate(model_hu_en, tok_hu_en, train_df["hu"].tolist(), bs=16, num_beams=1)
val_en   = translate(model_hu_en, tok_hu_en, val_df["hu"].tolist(), bs=16, num_beams=1)

# Work on copies so train_df / val_df stay unmodified, then attach the
# machine-generated English as an extra "en_pseudo" column.
train_p1 = train_df.copy()
val_p1   = val_df.copy()
train_p1["en_pseudo"] = train_en
val_p1["en_pseudo"]   = val_en

# -------------------------
# Save pseudo data
# -------------------------
# File names embed the subset sizes; the training cell rebuilds the same
# names from its own TRAIN_N / VAL_N values.
train_p1_path = OUT_PSEUDO_DIR / f"train_hu_ro_enpseudo_{TRAIN_N}.csv"
val_p1_path   = OUT_PSEUDO_DIR / f"val_hu_ro_enpseudo_{VAL_N}.csv"

train_p1.to_csv(train_p1_path, index=False)
val_p1.to_csv(val_p1_path, index=False)

print("Saved:", train_p1_path)
print("Saved:", val_p1_path)


DEVICE: cuda
Torch: 2.5.1+cu121
Base safetensors exists: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\safetensors_bases\Helsinki-NLP__opus-mt-hu-en
Train size: 50000 Val size: 2000


HU→EN pseudo: 100%|██████████| 50000/50000 [29:52<00:00, 27.89sent/s]
HU→EN pseudo: 100%|██████████| 2000/2000 [01:12<00:00, 27.61sent/s]


Saved: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\data\pseudo\train_hu_ro_enpseudo_50000.csv
Saved: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\data\pseudo\val_hu_ro_enpseudo_2000.csv


In [13]:
import os, gc, json
import torch
import pandas as pd
from datasets import Dataset
from tqdm import tqdm
from pathlib import Path

from huggingface_hub import snapshot_download
from safetensors.torch import save_file
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
from peft import LoraConfig, get_peft_model, TaskType

# -------------------------
# Settings
# -------------------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): this is set after `import torch` and after a CUDA query;
# presumably the allocator only honours it if CUDA memory has not been used
# yet in this kernel — confirm it takes effect, or set it before importing torch.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
print("DEVICE:", DEVICE)
print("Torch:", torch.__version__)

# Best-effort cleanup of memory left over from earlier cells. Tensors that
# are still referenced by live notebook variables are not released by this.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

# Folder holding the safetensors conversions of the base models.
BASE_SAFE_ROOT = Path(PROJECT_ROOT) / "safetensors_bases"
BASE_SAFE_ROOT.mkdir(parents=True, exist_ok=True)

PSEUDO_DIR = Path(PROJECT_ROOT) / "data" / "pseudo"
# Match the filenames saved in Cell B (the sizes are part of the file name,
# so these two values must equal the ones used when the files were written):
TRAIN_N = 50000
VAL_N   = 2000
train_p1_path = PSEUDO_DIR / f"train_hu_ro_enpseudo_{TRAIN_N}.csv"
val_p1_path   = PSEUDO_DIR / f"val_hu_ro_enpseudo_{VAL_N}.csv"

# Hub ids of the two pivot legs: Hungarian→English and English→Romanian.
HU_EN_BASE = "Helsinki-NLP/opus-mt-hu-en"
EN_RO_BASE = "Helsinki-NLP/opus-mt-en-ro"

# Training knobs (6GB-friendly)
# BATCH_SIZE is the per-device batch and GRAD_ACCUM the number of accumulation
# steps passed to the trainer, i.e. 4 * 4 = 16 sequences per optimizer step.
MAX_LEN     = 256
BATCH_SIZE  = 4
GRAD_ACCUM  = 4
EPOCHS      = 2

# -------------------------
# Option C helper: ensure base safetensors exists (handles sharded)
# -------------------------
def ensure_base_safetensors(model_id: str, out_root: Path) -> Path:
    """Return a local folder containing ``model_id`` as safetensors, converting once.

    If ``<out_root>/<model_id with '/' replaced by '__'>`` already holds both
    ``model.safetensors`` and ``config.json`` it is returned as is. Otherwise
    the PyTorch ``.bin`` weights (single file or shards) are downloaded from
    the hub, loaded into a freshly constructed model and re-saved with
    ``safe_serialization=True``.

    Args:
        model_id: hub repository id, e.g. ``"org/name"``.
        out_root: parent directory for the converted folders.

    Returns:
        Path of the folder holding the safetensors weights and config.

    Raises:
        FileNotFoundError: if the snapshot has neither ``pytorch_model.bin``
            nor ``pytorch_model.bin.index.json``.
    """
    out_dir = out_root / model_id.replace("/", "__")
    out_dir.mkdir(parents=True, exist_ok=True)

    if (out_dir / "model.safetensors").exists() and (out_dir / "config.json").exists():
        print("✅ Base safetensors exists:", out_dir)
        return out_dir

    print(f"\n[Convert] Downloading snapshot for: {model_id}")
    snap_dir = Path(snapshot_download(
        repo_id=model_id,
        allow_patterns=[
            "config.json",
            "tokenizer.json",
            "tokenizer_config.json",
            "source.spm",
            "vocab.json", "merges.txt",
            "pytorch_model.bin",
            "pytorch_model.bin.index.json",
            "pytorch_model-*.bin",
        ],
        ignore_patterns=["*.msgpack", "*.h5", "*.ot", "*.tflite", "*.onnx"]
    ))

    # The model is built from config only, so every parameter starts at its
    # initial value until a checkpoint file overwrites it.
    config = AutoConfig.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_config(config)

    bin_path = snap_dir / "pytorch_model.bin"
    if bin_path.exists():
        print("[Convert] Found single pytorch_model.bin -> torch.load")
        weight_files = [bin_path]
    else:
        index_path = snap_dir / "pytorch_model.bin.index.json"
        if not index_path.exists():
            raise FileNotFoundError("Could not find pytorch_model.bin or pytorch_model.bin.index.json")

        with open(index_path, "r", encoding="utf-8") as f:
            index = json.load(f)
        shard_files = sorted(set(index["weight_map"].values()))
        print(f"[Convert] Found sharded weights: {len(shard_files)} shards")
        weight_files = tqdm(
            [snap_dir / sf for sf in shard_files],
            desc=f"[Convert] Loading shards {model_id.split('/')[-1]}",
        )

    loaded_keys = set()
    for weight_file in weight_files:
        # weights_only=True restricts unpickling to tensors and plain
        # containers, so a downloaded file cannot run arbitrary code.
        state = torch.load(weight_file, map_location="cpu", weights_only=True)
        # strict=False is required because a shard only holds part of the
        # parameters; coverage is verified once all files are loaded.
        model.load_state_dict(state, strict=False)
        loaded_keys.update(state.keys())
        del state

    # Surface what strict=False would otherwise hide: parameters that no
    # checkpoint file supplied and that therefore keep their initial values.
    never_loaded = sorted(set(model.state_dict().keys()) - loaded_keys)
    if never_loaded:
        print(
            f"[Convert] ⚠ {len(never_loaded)} parameter(s) not found in the checkpoint; "
            f"kept at their initial (or tied) values: {never_loaded[:5]}"
        )

    print("[Convert] Saving safetensors ->", out_dir)
    model.save_pretrained(out_dir, safe_serialization=True)
    config.save_pretrained(out_dir)
    return out_dir

# Local safetensors folders for both pivot legs; each is converted on first
# use and reused on later runs.
HU_EN_SAFE = ensure_base_safetensors(HU_EN_BASE, BASE_SAFE_ROOT)
EN_RO_SAFE = ensure_base_safetensors(EN_RO_BASE, BASE_SAFE_ROOT)

# -------------------------
# LoRA fine-tune (uses `eval_strategy`, the newer transformers name for `evaluation_strategy`)
# -------------------------
def finetune_lora(
    base_name: str,
    base_safe_dir: Path,
    src_col: str, tgt_col: str,
    train_df: pd.DataFrame, val_df: pd.DataFrame,
    out_dir: str,
    lr_lora: float = 3e-4,
    epochs: int = 2,
    max_len: int = 256,
    bs: int = 4,
    grad_accum: int = 4,
):
    """Fine-tune one translation direction with LoRA adapters.

    Falls back to full fine-tuning if the adapters cannot be attached.

    Args:
        base_name: hub id used to load the tokenizer.
        base_safe_dir: local folder with the base weights in safetensors format.
        src_col: name of the source-text column in both frames.
        tgt_col: name of the target-text column in both frames.
        train_df: training pairs.
        val_df: validation pairs, evaluated every 1000 steps.
        out_dir: directory where the trainer writes its checkpoints.
        lr_lora: learning rate used when LoRA is active (5e-5 is used for the
            full fine-tune fallback).
        epochs: number of training epochs.
        max_len: truncation length for both source and target token sequences.
        bs: per-device batch size for training and evaluation.
        grad_accum: gradient accumulation steps.

    Returns:
        ``(model, tok)``: the trained model (PEFT-wrapped when LoRA was used)
        and its tokenizer.
    """
    print(f"\n[Train] {base_name} | {src_col} → {tgt_col}")

    tok = AutoTokenizer.from_pretrained(base_name, use_fast=True)

    def usable_rows(df: pd.DataFrame, split: str) -> pd.DataFrame:
        # An empty translation is written to CSV as a blank field and read
        # back by pandas as NaN, which the tokenizer cannot encode.
        kept = df.dropna(subset=[src_col, tgt_col])
        if len(kept) != len(df):
            print(f"⚠ Dropped {len(df) - len(kept)} {split} row(s) with missing {src_col}/{tgt_col}")
        return kept

    train_ds = Dataset.from_pandas(usable_rows(train_df, "train"), preserve_index=False)
    val_ds   = Dataset.from_pandas(usable_rows(val_df, "val"), preserve_index=False)

    def prep(ex):
        # Batched map: tokenise sources as inputs and targets as labels.
        x = tok(ex[src_col], truncation=True, max_length=max_len)
        y = tok(text_target=ex[tgt_col], truncation=True, max_length=max_len)
        x["labels"] = y["input_ids"]
        return x

    train_tok = train_ds.map(prep, batched=True, remove_columns=train_ds.column_names)
    val_tok   = val_ds.map(prep, batched=True, remove_columns=val_ds.column_names)

    base = AutoModelForSeq2SeqLM.from_pretrained(
        base_safe_dir,
        use_safetensors=True,
        torch_dtype=torch.float16 if DEVICE == "cuda" else None
    ).to(DEVICE)

    # The generation cache is disabled before gradient checkpointing is turned on.
    base.config.use_cache = False
    if hasattr(base, "gradient_checkpointing_enable"):
        base.gradient_checkpointing_enable()

    lora_cfg = LoraConfig(
        task_type=TaskType.SEQ_2_SEQ_LM,
        r=8, lora_alpha=16, lora_dropout=0.05,
        target_modules=["q_proj", "v_proj"],
    )

    using_lora = False
    try:
        model = get_peft_model(base, lora_cfg)
        if hasattr(model, "enable_input_require_grads"):
            model.enable_input_require_grads()
        using_lora = True
        print("✔ Using LoRA")
    except Exception as e:
        print("⚠ LoRA failed, full fine-tune:", type(e).__name__, e)
        # With fp16=True the trainer's gradient scaler refuses to unscale
        # gradients of fp16 parameters, so the fully trainable fallback model
        # must hold fp32 weights.
        model = base.float()

    collator = DataCollatorForSeq2Seq(tok, model=model)

    args = Seq2SeqTrainingArguments(
        output_dir=out_dir,
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs,
        gradient_accumulation_steps=grad_accum,
        learning_rate=(lr_lora if using_lora else 5e-5),
        num_train_epochs=epochs,
        fp16=(DEVICE == "cuda"),

        # `eval_strategy` is the current argument name; older transformers
        # releases only accept `evaluation_strategy`.
        eval_strategy="steps",
        eval_steps=1000,

        save_steps=1000,
        save_total_limit=2,
        logging_steps=100,
        report_to="none",
        dataloader_num_workers=0,

        label_smoothing_factor=0.1,
        lr_scheduler_type="cosine",
        warmup_ratio=0.03,
        predict_with_generate=False,
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=args,
        train_dataset=train_tok,
        eval_dataset=val_tok,
        tokenizer=tok,
        data_collator=collator,
    )

    trainer.train()
    return model, tok

# -------------------------
# Load pseudo-parallel data created in Cell B
# -------------------------
# Fail early with an actionable message if the pseudo-parallel CSVs that the
# generation cell writes are not on disk.
if not train_p1_path.exists() or not val_p1_path.exists():
    raise FileNotFoundError(
        "Pseudo data not found. Run Cell B first.\n"
        f"Missing: {train_p1_path} or {val_p1_path}"
    )

train_p1 = pd.read_csv(train_p1_path)
val_p1   = pd.read_csv(val_p1_path)

print("Loaded pseudo data:", len(train_p1), "train |", len(val_p1), "val")

# -------------------------
# C1) Fine-tune HU→EN on (HU, EN_pseudo)
# -------------------------
# NOTE(review): en_pseudo was generated by this same HU→EN base model, so this
# leg is trained on its own output — confirm that is the intended setup.
hu_en_ft, hu_en_ft_tok = finetune_lora(
    HU_EN_BASE, HU_EN_SAFE,
    src_col="hu", tgt_col="en_pseudo",
    train_df=train_p1, val_df=val_p1,
    out_dir=str(PROJECT_ROOT / "checkpoints" / "hu_en_legal_lora"),
    epochs=EPOCHS,
    max_len=MAX_LEN,
    bs=BATCH_SIZE,
    grad_accum=GRAD_ACCUM,
)

# -------------------------
# C2) Fine-tune EN→RO on (EN_pseudo, RO)
# -------------------------
# Source side is the machine-generated English; target side is the "ro" column.
# NOTE(review): hu_en_ft is still referenced (and on DEVICE) while this second
# run trains — consider releasing it first if GPU memory is tight.
en_ro_ft, en_ro_ft_tok = finetune_lora(
    EN_RO_BASE, EN_RO_SAFE,
    src_col="en_pseudo", tgt_col="ro",
    train_df=train_p1, val_df=val_p1,
    out_dir=str(PROJECT_ROOT / "checkpoints" / "en_ro_legal_lora"),
    epochs=EPOCHS,
    max_len=MAX_LEN,
    bs=BATCH_SIZE,
    grad_accum=GRAD_ACCUM,
)

print("\n✅ Done training pivot adapters.")


DEVICE: cuda
Torch: 2.5.1+cu121
✅ Base safetensors exists: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\safetensors_bases\Helsinki-NLP__opus-mt-hu-en

[Convert] Downloading snapshot for: Helsinki-NLP/opus-mt-en-ro
[Convert] Found single pytorch_model.bin -> torch.load


  state = torch.load(bin_path, map_location="cpu")


[Convert] Saving safetensors -> D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\safetensors_bases\Helsinki-NLP__opus-mt-en-ro




Loaded pseudo data: 50000 train | 2000 val

[Train] Helsinki-NLP/opus-mt-hu-en | hu → en_pseudo


Map: 100%|██████████| 50000/50000 [00:19<00:00, 2565.35 examples/s]
Map: 100%|██████████| 2000/2000 [00:00<00:00, 2560.82 examples/s]
  trainer = Seq2SeqTrainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


✔ Using LoRA


Step,Training Loss,Validation Loss
1000,1.7318,1.656345
2000,1.7305,1.652001
3000,1.7244,1.650608
4000,1.7232,1.648507
5000,1.7163,1.647224
6000,1.7175,1.647107


'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: 43da0b4e-4bfd-4589-b5f7-f362117eecbf)')' thrown while requesting HEAD https://huggingface.co/Helsinki-NLP/opus-mt-en-ro/resolve/main/tokenizer_config.json
Retrying in 1s [Retry 1/5].



[Train] Helsinki-NLP/opus-mt-en-ro | en_pseudo → ro


Map: 100%|██████████| 50000/50000 [00:20<00:00, 2427.07 examples/s]
Map: 100%|██████████| 2000/2000 [00:00<00:00, 2427.18 examples/s]
  trainer = Seq2SeqTrainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


✔ Using LoRA


Step,Training Loss,Validation Loss
1000,2.7221,2.654602
2000,2.7281,2.641886
3000,2.7173,2.640958
4000,2.7045,2.638611
5000,2.7056,2.63642
6000,2.6819,2.635623



✅ Done training pivot adapters.


In [14]:
import os, gc, json, re, unicodedata
import numpy as np
import pandas as pd
import torch
import sacrebleu
from tqdm import tqdm
from pathlib import Path

from huggingface_hub import snapshot_download
from transformers import AutoTokenizer, AutoConfig, AutoModelForSeq2SeqLM
from peft import PeftModel

# -------------------------
# 0) Settings
# -------------------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): set after `import torch`; presumably only effective if CUDA
# memory has not been used yet in this kernel — confirm.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
print("DEVICE:", DEVICE)
print("Torch:", torch.__version__)

# Best-effort cleanup before loading two models; memory held by variables
# that are still alive from earlier cells is not released by this.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

# Bases
HU_EN_BASE = "Helsinki-NLP/opus-mt-hu-en"
EN_RO_BASE = "Helsinki-NLP/opus-mt-en-ro"

# Run dirs (where Cell C saved checkpoints)
HU_EN_RUN = Path(PROJECT_ROOT) / "checkpoints" / "hu_en_legal_lora"
EN_RO_RUN = Path(PROJECT_ROOT) / "checkpoints" / "en_ro_legal_lora"

# Folder with the safetensors conversions of the base models.
BASE_SAFE_ROOT = Path(PROJECT_ROOT) / "safetensors_bases"
BASE_SAFE_ROOT.mkdir(parents=True, exist_ok=True)

# Predictions are written under the project-wide OUTPUT_DIR.
OUT_DIR = Path(OUTPUT_DIR)
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Decode knobs
MAX_INPUT_LEN  = 256   # tokenizer truncation length for inputs
MAX_NEW_TOKENS = 128   # generation budget per sentence
BS = 16 if DEVICE == "cuda" else 4
NUM_BEAMS = 1   # use 1 for speed; set 4 for final run
# NOTE(review): length_penalty presumably only matters for beam search, so it
# is inert while NUM_BEAMS == 1 — confirm against the generate() docs.
LENGTH_PENALTY = 1.0

# -------------------------
# 1) Helpers
# -------------------------
def get_latest_checkpoint(run_dir: Path) -> Path:
    """Pick the ``checkpoint-<step>`` sub-directory with the highest step.

    Directories whose suffix is not a number rank below every numbered one.
    When ``run_dir`` contains no ``checkpoint-*`` directory at all, ``run_dir``
    itself is returned.
    """
    def step_of(path: Path) -> int:
        suffix = path.name.split("-")[-1]
        if suffix.isdigit():
            return int(suffix)
        return -1

    candidates = []
    for entry in run_dir.glob("checkpoint-*"):
        if entry.is_dir():
            candidates.append(entry)

    if not candidates:
        return run_dir

    # Stable sort, then take the last element (ties resolve to the later entry).
    candidates.sort(key=step_of)
    return candidates[-1]

def ensure_base_safetensors(model_id: str, out_root: Path) -> Path:
    """Return a local folder containing ``model_id`` as safetensors, converting once.

    If ``<out_root>/<model_id with '/' replaced by '__'>`` already holds both
    ``model.safetensors`` and ``config.json`` it is returned as is. Otherwise
    the PyTorch ``.bin`` weights (single file or shards) are downloaded from
    the hub, loaded into a freshly constructed model and re-saved with
    ``safe_serialization=True``.

    Args:
        model_id: hub repository id, e.g. ``"org/name"``.
        out_root: parent directory for the converted folders.

    Returns:
        Path of the folder holding the safetensors weights and config.

    Raises:
        FileNotFoundError: if the snapshot has neither ``pytorch_model.bin``
            nor ``pytorch_model.bin.index.json``.
    """
    out_dir = out_root / model_id.replace("/", "__")
    out_dir.mkdir(parents=True, exist_ok=True)

    if (out_dir / "model.safetensors").exists() and (out_dir / "config.json").exists():
        return out_dir

    print(f"[Convert] Creating base safetensors for {model_id}")
    snap_dir = Path(snapshot_download(
        repo_id=model_id,
        allow_patterns=[
            "config.json",
            "tokenizer.json",
            "tokenizer_config.json",
            "source.spm",
            "vocab.json", "merges.txt",
            "pytorch_model.bin",
            "pytorch_model.bin.index.json",
            "pytorch_model-*.bin",
        ],
    ))

    # The model is built from config only, so every parameter starts at its
    # initial value until a checkpoint file overwrites it.
    config = AutoConfig.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_config(config)

    bin_path = snap_dir / "pytorch_model.bin"
    if bin_path.exists():
        weight_files = [bin_path]
    else:
        index_path = snap_dir / "pytorch_model.bin.index.json"
        if not index_path.exists():
            # Explicit message instead of the bare error open() would raise.
            raise FileNotFoundError("Could not find pytorch_model.bin or pytorch_model.bin.index.json")
        with open(index_path, "r", encoding="utf-8") as f:
            index = json.load(f)
        shard_files = sorted(set(index["weight_map"].values()))
        weight_files = tqdm(
            [snap_dir / sf for sf in shard_files],
            desc=f"[Convert] shards {model_id.split('/')[-1]}",
        )

    loaded_keys = set()
    for weight_file in weight_files:
        # weights_only=True restricts unpickling to tensors and plain
        # containers, so a downloaded file cannot run arbitrary code.
        state = torch.load(weight_file, map_location="cpu", weights_only=True)
        # strict=False is required because a shard only holds part of the
        # parameters; coverage is verified once all files are loaded.
        model.load_state_dict(state, strict=False)
        loaded_keys.update(state.keys())
        del state

    # Surface what strict=False would otherwise hide: parameters that no
    # checkpoint file supplied and that therefore keep their initial values.
    never_loaded = sorted(set(model.state_dict().keys()) - loaded_keys)
    if never_loaded:
        print(
            f"[Convert] ⚠ {len(never_loaded)} parameter(s) not found in the checkpoint; "
            f"kept at their initial (or tied) values: {never_loaded[:5]}"
        )

    model.save_pretrained(out_dir, safe_serialization=True)
    config.save_pretrained(out_dir)
    return out_dir

def load_lora_model(base_id: str, run_dir: Path):
    """Load a base model plus the newest LoRA adapter found under ``run_dir``.

    The base weights come from the local safetensors conversion of ``base_id``
    (half precision on CUDA), the tokenizer is loaded through ``base_id``, and
    the adapter is read from the path chosen by ``get_latest_checkpoint``.

    Returns:
        ``(model, tok, ckpt)``: the adapter-wrapped model in eval mode on
        ``DEVICE``, its tokenizer, and the checkpoint path that was loaded.
    """
    weights_dir = ensure_base_safetensors(base_id, BASE_SAFE_ROOT)
    ckpt = get_latest_checkpoint(run_dir)
    print(f"Loading {base_id} + LoRA from:", ckpt)

    tok = AutoTokenizer.from_pretrained(base_id, use_fast=True)

    dtype = torch.float16 if DEVICE == "cuda" else None
    backbone = AutoModelForSeq2SeqLM.from_pretrained(
        weights_dir,
        use_safetensors=True,
        torch_dtype=dtype,
    )
    backbone = backbone.to(DEVICE)

    # Wrap the base model with the trained adapter weights.
    model = PeftModel.from_pretrained(backbone, ckpt)
    model = model.to(DEVICE)
    model.eval()
    return model, tok, ckpt

def batched_generate(model, tok, texts, bs=16, max_input_len=256, max_new=128, num_beams=1, length_penalty=1.0, desc="Gen"):
    """Generate translations for ``texts`` in batches, shrinking on CUDA OOM.

    Args:
        model: seq2seq model exposing ``generate``.
        tok: tokenizer matching ``model``.
        texts: list of source sentences.
        bs: initial batch size; reduced after every out-of-memory error and
            kept at the reduced value for the remaining batches.
        max_input_len: tokenizer truncation length for the inputs.
        max_new: value passed to ``generate`` as ``max_new_tokens``.
        num_beams: beam width passed to ``generate``.
        length_penalty: length penalty passed to ``generate``.
        desc: progress-bar label.

    Returns:
        List of decoded outputs, one per input sentence, in input order.

    Raises:
        torch.cuda.OutOfMemoryError: when a batch of a single sentence still
            does not fit in memory, since retrying it could never succeed.
    """
    outs = []
    i = 0
    # Context manager so the progress bar is closed even if an error escapes.
    with tqdm(total=len(texts), desc=desc, unit="sent") as pbar:
        while i < len(texts):
            cur_bs = min(bs, len(texts) - i)
            batch = texts[i:i+cur_bs]
            try:
                inp = tok(batch, return_tensors="pt", padding=True, truncation=True, max_length=max_input_len).to(DEVICE)
                with torch.inference_mode():
                    gen = model.generate(
                        **inp,
                        num_beams=num_beams,
                        length_penalty=length_penalty,
                        max_new_tokens=max_new,
                        do_sample=False,
                    )
                outs.extend(tok.batch_decode(gen, skip_special_tokens=True))
            except torch.cuda.OutOfMemoryError:
                torch.cuda.empty_cache()
                if cur_bs == 1:
                    # Nothing left to shrink: re-raise instead of retrying the
                    # same sentence forever.
                    raise
                # Halve the size that actually failed so every retry is
                # strictly smaller than the previous attempt.
                bs = max(1, cur_bs // 2)
                print(f"⚠ OOM -> reducing batch size to {bs}")
                continue
            i += cur_bs
            pbar.update(cur_bs)
    return outs

# -------------------------
# 2) Load the two fine-tuned LoRA models
# -------------------------
# Each call returns (model, tokenizer, checkpoint path); the checkpoint paths
# are reused further down to name the predictions file.
hu_en_model, hu_en_tok, hu_en_ckpt = load_lora_model(HU_EN_BASE, HU_EN_RUN)
en_ro_model, en_ro_tok, en_ro_ckpt = load_lora_model(EN_RO_BASE, EN_RO_RUN)

# -------------------------
# 3) Load test data
# -------------------------
# "hu" holds the sources and "ro" the references; the full test file is used.
test_df = pd.read_csv(Path(DATA_PROCESSED) / "test.csv")
src_sentences = test_df["hu"].tolist()
refs = test_df["ro"].tolist()
print("Test size:", len(src_sentences))

# -------------------------
# 4) Pivot translation: HU -> EN -> RO
# -------------------------
# First leg: Hungarian sources to intermediate English.
en_mid = batched_generate(
    hu_en_model, hu_en_tok, src_sentences,
    bs=BS, max_input_len=MAX_INPUT_LEN, max_new=MAX_NEW_TOKENS,
    num_beams=NUM_BEAMS, length_penalty=LENGTH_PENALTY,
    desc="HU→EN (legal)"
)

# Second leg: the intermediate English (not the Hungarian) is the input here.
hyps = batched_generate(
    en_ro_model, en_ro_tok, en_mid,
    bs=BS, max_input_len=MAX_INPUT_LEN, max_new=MAX_NEW_TOKENS,
    num_beams=NUM_BEAMS, length_penalty=LENGTH_PENALTY,
    desc="EN→RO (legal)"
)

# -------------------------
# 5) Metrics
# -------------------------
# Corpus-level BLEU of the final Romanian output against a single reference
# per sentence (hence the one-element list of reference streams).
bleu = sacrebleu.corpus_bleu(hyps, [refs]).score
print("\nPivot BLEU:", round(bleu, 2))

# Accented letters used as the "output still looks Hungarian" signal.
hu_diacritics = set("áéíóöőúüűÁÉÍÓÖŐÚÜŰ")

def has_hu_diacritics(s: str) -> bool:
    """Return True when ``s`` contains at least one character of ``hu_diacritics``."""
    # "Shares a character with the set" is the negation of "is disjoint from it".
    return not hu_diacritics.isdisjoint(s)

def en_leak(s: str) -> bool:
    """Heuristic check for leftover English: True if a common English function
    word occurs as a whole space-delimited token in ``s`` (case-insensitive)."""
    # Lower-case, collapse every whitespace run to one space, and pad both
    # ends so words at the start or end of the text are matched as well.
    padded = " " + re.sub(r"\s+", " ", s.lower()) + " "
    for marker in (" the ", " and ", " of ", " to ", " for ", " with ", " on "):
        if marker in padded:
            return True
    return False

# Percentage of hypotheses flagged by each heuristic (the mean of the boolean
# flags, times 100). Both are proxies based on characters / function words,
# not language identification.
hu_leak = np.mean([has_hu_diacritics(h) for h in hyps]) * 100
en_leak_rate = np.mean([en_leak(h) for h in hyps]) * 100
print("HU diacritics leak %:", round(hu_leak, 2))
print("English leak proxy %:", round(en_leak_rate, 2))

# ---- glossary metric (normalized) ----
def norm_ro(s):
    """Normalise text for glossary matching.

    Lower-cases ``s``, applies Unicode NFKC normalisation, and maps the legacy
    cedilla letters (U+0163, U+015F) to the comma-below letters (U+021B,
    U+0219), so both spellings of Romanian t/s with a diacritic compare equal.

    Args:
        s: input string.

    Returns:
        The normalised string.
    """
    # Compose first: a decomposed "t" + U+0327 only becomes the single
    # cedilla code point after NFKC, so the mapping has to run afterwards or
    # such input would keep its cedilla.
    s = unicodedata.normalize("NFKC", s.lower())
    return s.replace("\u0163", "\u021b").replace("\u015f", "\u0219")

# Glossary with source terms and accepted target forms passed through norm_ro.
glossary_norm = {
    norm_ro(term): list(map(norm_ro, forms))
    for term, forms in glossary.items()
}

def glossary_hit(src: str, hyp: str) -> list:
    """Check glossary terms for one sentence pair.

    Returns one boolean per glossary term that occurs (as a substring) in the
    normalised source: True if any accepted target form occurs in the
    normalised hypothesis. The list is empty when no term is present.
    """
    src_n = norm_ro(src)
    hyp_n = norm_ro(hyp)
    return [
        any(form in hyp_n for form in ro_forms)
        for src_term, ro_forms in glossary_norm.items()
        if src_term in src_n
    ]

# One list of booleans per sentence, flattened so the accuracy is computed
# over glossary-term occurrences rather than over sentences.
hits = [glossary_hit(s, h) for s, h in zip(src_sentences, hyps)]
flat = [x for row in hits for x in row]
if flat:
    print("Glossary accuracy %:", round(np.mean(flat) * 100, 2))
else:
    # No source sentence contained a glossary term, so there is nothing to average.
    print("Glossary accuracy: (no glossary terms found in test)")

# -------------------------
# 6) Save outputs
# -------------------------
# The file name records both checkpoint folder names and the beam width.
out_path = OUT_DIR / f"pivot_predictions_{hu_en_ckpt.name}__{en_ro_ckpt.name}_beam{NUM_BEAMS}.csv"
# One row per test sentence: source, intermediate English, reference, output.
pd.DataFrame({
    "source_hu": src_sentences,
    "pivot_en": en_mid,
    "reference_ro": refs,
    "hypothesis_ro": hyps
}).to_csv(out_path, index=False)
print("Saved:", out_path)

# -------------------------
# 7) Optional COMET on first N (fast sanity check)
# -------------------------
# If you want COMET, run:
#   pip install -q unbabel-comet
# Then uncomment below.

# try:
#     from comet import download_model, load_from_checkpoint
#     N = 1000
#     comet_path = download_model("Unbabel/wmt22-comet-da")
#     comet_model = load_from_checkpoint(comet_path)
#     data = [{"src": s, "mt": m, "ref": r} for s,m,r in zip(src_sentences[:N], hyps[:N], refs[:N])]
#     out = comet_model.predict(data, batch_size=8, gpus=1 if DEVICE=="cuda" else 0)
#     print("COMET (first 1000):", round(float(np.mean(out.scores)), 4))
# except Exception as e:
#     print("COMET skipped:", type(e).__name__, e)


DEVICE: cuda
Torch: 2.5.1+cu121
Loading Helsinki-NLP/opus-mt-hu-en + LoRA from: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\checkpoints\hu_en_legal_lora\checkpoint-6250




Loading Helsinki-NLP/opus-mt-en-ro + LoRA from: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\checkpoints\en_ro_legal_lora\checkpoint-6250
Test size: 30366


HU→EN (legal): 100%|██████████| 30366/30366 [26:22<00:00, 19.19sent/s]
EN→RO (legal): 100%|██████████| 30366/30366 [30:01<00:00, 16.86sent/s]



Pivot BLEU: 38.64
HU diacritics leak %: 0.51
English leak proxy %: 0.07
Glossary accuracy %: 46.56
Saved: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\data\outputs\pivot_predictions_checkpoint-6250__checkpoint-6250_beam1.csv
