In [8]:
!pip -q install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121


ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\RoG\\anaconda3\\envs\\pythonRL\\Lib\\site-packages\\torch\\lib\\c10_cuda.dll'
Consider using the `--user` option or check the permissions.



In [4]:
import os
from pathlib import Path

PROJECT_ROOT = Path(".").resolve()
DATA_RAW = PROJECT_ROOT / "data" / "raw"
DATA_PROCESSED = PROJECT_ROOT / "data" / "processed"
OUTPUT_DIR = PROJECT_ROOT / "data" / "outputs"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
MODEL_NAME = "Helsinki-NLP/opus-mt-hu-en"

print("PROJECT_ROOT:", PROJECT_ROOT)
print("DATA_RAW:", DATA_RAW)
print("DATA_PROCESSED:", DATA_PROCESSED)
print("OUTPUT_DIR:", OUTPUT_DIR)

PROJECT_ROOT: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal
DATA_RAW: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\data\raw
DATA_PROCESSED: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\data\processed
OUTPUT_DIR: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\data\outputs


In [3]:
import torch
print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("CUDA version (torch):", torch.version.cuda)


Torch version: 2.5.1+cu121
CUDA available: True
CUDA version (torch): 12.1


In [None]:
import numpy as np

hu_diacritics = set("áéíóöőúüűÁÉÍÓÖŐÚÜŰ")
def has_hu_diacritics(s): 
    return any(c in hu_diacritics for c in s)


glossary = {
    "Bizottság": ["Comisia"],
    "Tanács": ["Consiliul"],
    "Közösség": ["Comunitatea", "Comunității"],
}

def glossary_hit(src, hyp, glossary):
    hits = [] 
    for k, vals in glossary.items():
        if k in src:
            ok = any(v in hyp for v in vals) 
            hits.append(ok)
    return hits




In [None]:
import os
import torch
import numpy as np
import pandas as pd
import sacrebleu

from tqdm import tqdm
from pathlib import Path

from huggingface_hub import snapshot_download
from safetensors.torch import save_file
from transformers import AutoTokenizer, AutoConfig, AutoModelForSeq2SeqLM
from peft import PeftModel


RUN_DIR = Path(PROJECT_ROOT) / "checkpoints" / "opus_hu_ro_legal"   


def get_latest_checkpoint(run_dir: Path) -> Path:
    ckpts = sorted(
        [p for p in run_dir.glob("checkpoint-*") if p.is_dir()],
        key=lambda p: int(p.name.split("-")[-1]) if p.name.split("-")[-1].isdigit() else -1
    )
    return ckpts[-1] if ckpts else run_dir

CKPT_DIR = get_latest_checkpoint(RUN_DIR)
print("Using checkpoint:", CKPT_DIR)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("DEVICE:", DEVICE)
print("Torch:", torch.__version__)



def ensure_base_safetensors(model_id: str, out_root: Path) -> Path:

    out_dir = out_root / model_id.replace("/", "__")
    out_dir.mkdir(parents=True, exist_ok=True)


    if (out_dir / "model.safetensors").exists() and (out_dir / "config.json").exists():
        print("Base safetensors already exists:", out_dir)
        return out_dir

    print("Downloading base model snapshot:", model_id)
    snap_dir = Path(snapshot_download(repo_id=model_id))

    
    bin_path = snap_dir / "pytorch_model.bin"
    if not bin_path.exists():
        
        shards = sorted(snap_dir.glob("pytorch_model-*.bin"))
        if not shards:
            raise FileNotFoundError("Could not find pytorch_model.bin or shards in snapshot.")
        
        raise RuntimeError(
            f"Found sharded weights ({len(shards)} files). "
            "This quick converter handles non-sharded pytorch_model.bin. "
            "Tell me and I’ll give you the sharded merge converter."
        )

 
    config = AutoConfig.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_config(config)

   
    print("Loading base .bin weights with torch.load:", bin_path)
    state = torch.load(bin_path, map_location="cpu")
    missing, unexpected = model.load_state_dict(state, strict=False)
    print(f"Loaded base weights. missing={len(missing)} unexpected={len(unexpected)}")

    
    print("Saving base model as safetensors to:", out_dir)
    model.save_pretrained(out_dir, safe_serialization=True)
    config.save_pretrained(out_dir)

    return out_dir

BASE_SAFE_ROOT = Path(PROJECT_ROOT) / "safetensors_bases"
BASE_SAFE_DIR = ensure_base_safetensors(MODEL_NAME, BASE_SAFE_ROOT)


tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)


def ensure_adapter_safetensors(ckpt_dir: Path):
    bin_path = ckpt_dir / "adapter_model.bin"
    st_path  = ckpt_dir / "adapter_model.safetensors"

    if st_path.exists():
        print("Adapter safetensors already exists:", st_path)
        return

    if not bin_path.exists():
        raise FileNotFoundError(f"adapter_model.bin not found in {ckpt_dir}")

    print("Converting adapter bin -> safetensors:", bin_path)
    adapter_state = torch.load(bin_path, map_location="cpu")
    save_file(adapter_state, str(st_path))
    print("Wrote:", st_path)

ensure_adapter_safetensors(CKPT_DIR)


base_model = AutoModelForSeq2SeqLM.from_pretrained(
    BASE_SAFE_DIR,
    use_safetensors=True,
    torch_dtype=torch.float16 if DEVICE == "cuda" else None,
).to(DEVICE)

model = PeftModel.from_pretrained(base_model, CKPT_DIR).to(DEVICE)
model.eval()

print("Model device:", next(model.parameters()).device)


test_df = pd.read_csv(DATA_PROCESSED / "test.csv")
src_sentences = test_df["hu"].tolist()
refs = test_df["ro"].tolist()


def translate_batch(model, tokenizer, sentences, batch_size=16, max_input_len=256, max_new_tokens=96, num_beams=1):
    hyps = []
    for i in tqdm(range(0, len(sentences), batch_size)):
        batch = sentences[i:i+batch_size]
        inputs = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_input_len
        ).to(DEVICE)

        with torch.inference_mode():
            out = model.generate(
                **inputs,
                num_beams=num_beams,
                do_sample=False,
                max_new_tokens=max_new_tokens,
                early_stopping=True,
            )
        hyps.extend(tokenizer.batch_decode(out, skip_special_tokens=True))
    return hyps


finetuned_hyps = translate_batch(
    model, tokenizer, src_sentences,
    batch_size=16 if DEVICE=="cuda" else 4,
    num_beams=1,
    max_new_tokens=96
)


bleu_ft = sacrebleu.corpus_bleu(finetuned_hyps, [refs]).score
print("Finetuned BLEU:", round(bleu_ft, 2))

leak_rate_ft = np.mean([has_hu_diacritics(h) for h in finetuned_hyps])
print("HU diacritics leak rate (FT):", round(leak_rate_ft*100, 2), "%")

all_hits_ft = [glossary_hit(s, h, glossary) for s, h in zip(src_sentences, finetuned_hyps)]
flat_ft = [x for row in all_hits_ft for x in row]
if flat_ft:
    print("Glossary accuracy (FT):", round(np.mean(flat_ft)*100, 2), "%")
else:
    print("No glossary terms found in sample.")


out_path = Path(OUTPUT_DIR) / f"finetuned_predictions_lora_{CKPT_DIR.name}.csv"
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
pd.DataFrame({"source": src_sentences, "reference": refs, "hypothesis": finetuned_hyps}).to_csv(out_path, index=False)
print("Saved:", out_path)


Using checkpoint: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\checkpoints\opus_hu_ro_legal\checkpoint-18750
DEVICE: cuda
Torch: 2.5.1+cu121
Base safetensors already exists: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\safetensors_bases\Helsinki-NLP__opus-mt-hu-en




Adapter safetensors already exists: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\checkpoints\opus_hu_ro_legal\checkpoint-18750\adapter_model.safetensors
Model device: cuda:0


100%|██████████| 1898/1898 [30:52<00:00,  1.02it/s]


Finetuned BLEU: 10.01
HU diacritics leak rate (FT): 0.81 %
Glossary accuracy (FT): 36.38 %
Saved: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\data\outputs\finetuned_predictions_lora_checkpoint-18750.csv


In [None]:


import os, re, json, unicodedata
import torch
import numpy as np
import pandas as pd
import sacrebleu

from tqdm import tqdm
from pathlib import Path

from huggingface_hub import snapshot_download
from safetensors.torch import save_file
from transformers import AutoTokenizer, AutoConfig, AutoModelForSeq2SeqLM
from peft import PeftModel


MODEL_NAME = "Helsinki-NLP/opus-mt-hu-en"   

RUN_DIR    = Path(PROJECT_ROOT) / "checkpoints" / "opus_hu_ro_legal"
OUT_DIR    = Path(OUTPUT_DIR)
BASE_SAFE_ROOT = Path(PROJECT_ROOT) / "safetensors_bases"


MAX_INPUT_LEN  = 256
MAX_NEW_TOKENS = 96

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("DEVICE:", DEVICE)
print("Torch:", torch.__version__)
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"


def get_latest_checkpoint(run_dir: Path) -> Path:
    ckpts = sorted(
        [p for p in run_dir.glob("checkpoint-*") if p.is_dir()],
        key=lambda p: int(p.name.split("-")[-1]) if p.name.split("-")[-1].isdigit() else -1
    )
    return ckpts[-1] if ckpts else run_dir

CKPT_DIR = get_latest_checkpoint(RUN_DIR)
print("Using checkpoint:", CKPT_DIR)


def ensure_base_safetensors(model_id: str, out_root: Path) -> Path:
    out_dir = out_root / model_id.replace("/", "__")
    out_dir.mkdir(parents=True, exist_ok=True)

    if (out_dir / "model.safetensors").exists() and (out_dir / "config.json").exists():
        print("Base safetensors already exists:", out_dir)
        return out_dir

    print("Downloading base model snapshot:", model_id)
    snap_dir = Path(snapshot_download(
        repo_id=model_id,
        allow_patterns=[
            "config.json",
            "tokenizer.json",
            "tokenizer_config.json",
            "source.spm",
            "vocab.json", "merges.txt",
            "pytorch_model.bin",
            "pytorch_model.bin.index.json",
            "pytorch_model-*.bin",
        ],
        ignore_patterns=["*.msgpack", "*.h5", "*.ot", "*.tflite", "*.onnx"]
    ))

    config = AutoConfig.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_config(config)

    
    bin_path = snap_dir / "pytorch_model.bin"
    if bin_path.exists():
        print("Found single pytorch_model.bin; loading with torch.load:", bin_path)
        state = torch.load(bin_path, map_location="cpu")
        model.load_state_dict(state, strict=False)
        del state
    else:
        
        index_path = snap_dir / "pytorch_model.bin.index.json"
        if not index_path.exists():
            shards = sorted(snap_dir.glob("pytorch_model-*.bin"))
            raise FileNotFoundError(
                f"Could not find pytorch_model.bin or pytorch_model.bin.index.json.\nFound {len(shards)} shard files."
            )

        with open(index_path, "r", encoding="utf-8") as f:
            index = json.load(f)
        shard_files = sorted(set(index["weight_map"].values()))
        print(f"Found sharded weights: {len(shard_files)} shards")

        for sf in shard_files:
            sp = snap_dir / sf
            if not sp.exists():
                raise FileNotFoundError(f"Missing shard file: {sp}")
            shard_state = torch.load(sp, map_location="cpu")
            model.load_state_dict(shard_state, strict=False)
            del shard_state

    print("Saving base model as safetensors to:", out_dir)
    model.save_pretrained(out_dir, safe_serialization=True)
    config.save_pretrained(out_dir)
    return out_dir

BASE_SAFE_DIR = ensure_base_safetensors(MODEL_NAME, BASE_SAFE_ROOT)


def ensure_adapter_safetensors(ckpt_dir: Path):
    bin_path = ckpt_dir / "adapter_model.bin"
    st_path  = ckpt_dir / "adapter_model.safetensors"

    if st_path.exists():
        print("Adapter safetensors already exists:", st_path)
        return

    if not bin_path.exists():
        raise FileNotFoundError(f"adapter_model.bin not found in {ckpt_dir}")

    print("Converting adapter bin -> safetensors:", bin_path)
    adapter_state = torch.load(bin_path, map_location="cpu")
    save_file(adapter_state, str(st_path))
    del adapter_state
    print("Wrote:", st_path)

ensure_adapter_safetensors(CKPT_DIR)


tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

base_model = AutoModelForSeq2SeqLM.from_pretrained(
    BASE_SAFE_DIR,
    use_safetensors=True,
    torch_dtype=torch.float16 if DEVICE == "cuda" else None,
).to(DEVICE)

model = PeftModel.from_pretrained(base_model, CKPT_DIR).to(DEVICE)
model.eval()
print("Model device:", next(model.parameters()).device)


test_df = pd.read_csv(Path(DATA_PROCESSED) / "test.csv")
src_sentences = test_df["hu"].tolist()
refs = test_df["ro"].tolist()
print("Test size:", len(src_sentences))


hu_diacritics = set("áéíóöőúüűÁÉÍÓÖŐÚÜŰ")
def has_hu_diacritics(s: str) -> bool:
    return any(c in hu_diacritics for c in s)

def en_leak(s: str) -> bool:
    s = " " + re.sub(r"\s+", " ", s.lower()) + " "
    common = [" the ", " and ", " of ", " to ", " for ", " with ", " on "]
    return any(w in s for w in common)

def norm_ro(s: str) -> str:
    s = s.lower()
    s = s.replace("ţ", "ț").replace("ş", "ș")
    return unicodedata.normalize("NFKC", s)


glossary_norm = {norm_ro(k): [norm_ro(v) for v in vs] for k, vs in glossary.items()}

def glossary_hit(src: str, hyp: str, glos_norm: dict) -> list:
    src_n, hyp_n = norm_ro(src), norm_ro(hyp)
    checks = []
    for src_term, ro_forms in glos_norm.items():
        if src_term in src_n:
            checks.append(any(f in hyp_n for f in ro_forms))
    return checks


def translate_batch(
    model, tokenizer, sentences,
    batch_size=16, max_input_len=256, max_new_tokens=96,
    num_beams=1, length_penalty=1.0
):
    hyps = []
    i = 0
    while i < len(sentences):
        bs = min(batch_size, len(sentences) - i)
        batch = sentences[i:i+bs]
        try:
            inputs = tokenizer(
                batch,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max_input_len
            ).to(DEVICE)

            with torch.inference_mode():
                out = model.generate(
                    **inputs,
                    num_beams=num_beams,
                    length_penalty=length_penalty,
                    do_sample=False,
                    max_new_tokens=max_new_tokens,
                    early_stopping=True,
                )

            hyps.extend(tokenizer.batch_decode(out, skip_special_tokens=True))
            i += bs

        except torch.cuda.OutOfMemoryError:
            torch.cuda.empty_cache()
            if batch_size <= 1:
                raise
            batch_size = max(1, batch_size // 2)
            print(f"OOM -> reducing batch_size to {batch_size} and retrying...")

    return hyps


greedy_bs = 16 if DEVICE == "cuda" else 4
finetuned_hyps = translate_batch(
    model, tokenizer, src_sentences,
    batch_size=greedy_bs,
    max_input_len=MAX_INPUT_LEN,
    max_new_tokens=MAX_NEW_TOKENS,
    num_beams=1,
    length_penalty=1.0
)

bleu_g = sacrebleu.corpus_bleu(finetuned_hyps, [refs]).score
leak_hu_g = np.mean([has_hu_diacritics(h) for h in finetuned_hyps]) * 100
leak_en_g = np.mean([en_leak(h) for h in finetuned_hyps]) * 100
hits_g = [glossary_hit(s, h, glossary_norm) for s, h in zip(src_sentences, finetuned_hyps)]
flat_g = [x for row in hits_g for x in row]
glos_g = (np.mean(flat_g) * 100) if flat_g else float("nan")

print("\n=== GREEDY RESULTS ===")
print("BLEU:", round(bleu_g, 2))
print("HU diacritics leak %:", round(leak_hu_g, 2))
print("English leak proxy %:", round(leak_en_g, 2))
print("Glossary accuracy %:", round(glos_g, 2) if not np.isnan(glos_g) else "(no glossary terms found)")

OUT_DIR.mkdir(parents=True, exist_ok=True)
out_path_g = OUT_DIR / f"finetuned_predictions_lora_{CKPT_DIR.name}_greedy.csv"
pd.DataFrame({"source": src_sentences, "reference": refs, "hypothesis": finetuned_hyps}).to_csv(out_path_g, index=False)
print("Saved:", out_path_g)


beam_bs = 8 if DEVICE == "cuda" else 2
cands = []
for lp in [0.8, 1.0, 1.2]:
    hyps_b = translate_batch(
        model, tokenizer, src_sentences,
        batch_size=beam_bs,
        max_input_len=MAX_INPUT_LEN,
        max_new_tokens=128,
        num_beams=4,
        length_penalty=lp
    )
    bleu_b = sacrebleu.corpus_bleu(hyps_b, [refs]).score
    cands.append((bleu_b, lp, hyps_b))
    print(f"Beam4 lp={lp} BLEU={bleu_b:.2f}")

best_bleu, best_lp, best_hyps = max(cands, key=lambda x: x[0])

leak_hu_b = np.mean([has_hu_diacritics(h) for h in best_hyps]) * 100
leak_en_b = np.mean([en_leak(h) for h in best_hyps]) * 100
hits_b = [glossary_hit(s, h, glossary_norm) for s, h in zip(src_sentences, best_hyps)]
flat_b = [x for row in hits_b for x in row]
glos_b = (np.mean(flat_b) * 100) if flat_b else float("nan")

print("\n=== BEST BEAM RESULTS ===")
print("Best lp:", best_lp)
print("BLEU:", round(best_bleu, 2))
print("HU diacritics leak %:", round(leak_hu_b, 2))
print("English leak proxy %:", round(leak_en_b, 2))
print("Glossary accuracy %:", round(glos_b, 2) if not np.isnan(glos_b) else "(no glossary terms found)")

out_path_b = OUT_DIR / f"finetuned_predictions_lora_{CKPT_DIR.name}_beam4_lp{best_lp}.csv"
pd.DataFrame({"source": src_sentences, "reference": refs, "hypothesis": best_hyps}).to_csv(out_path_b, index=False)
print("Saved:", out_path_b)


'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: 2183fc6f-a243-4f41-99b8-f34aac5e98f7)')' thrown while requesting HEAD https://huggingface.co/Helsinki-NLP/opus-mt-hu-en/resolve/main/tokenizer_config.json
Retrying in 1s [Retry 1/5].


DEVICE: cuda
Torch: 2.5.1+cu121
Using checkpoint: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\checkpoints\opus_hu_ro_legal\checkpoint-18750
Base safetensors already exists: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\safetensors_bases\Helsinki-NLP__opus-mt-hu-en
Adapter safetensors already exists: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\checkpoints\opus_hu_ro_legal\checkpoint-18750\adapter_model.safetensors




Model device: cuda:0
Test size: 30366

=== GREEDY RESULTS ===
BLEU: 10.01
HU diacritics leak %: 0.81
English leak proxy %: 6.45
Glossary accuracy %: 40.57
Saved: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\data\outputs\finetuned_predictions_lora_checkpoint-18750_greedy.csv


KeyboardInterrupt: 

In [None]:
import os, gc, math, re
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
from peft import LoraConfig, get_peft_model, TaskType


CANDIDATES = [
    "Helsinki-NLP/opus-mt-hu-ro",
    "Helsinki-NLP/opus-mt-ro-hu",
]

chosen = None
for m in CANDIDATES:
    try:
        _ = AutoTokenizer.from_pretrained(m)
        chosen = m
        break
    except Exception as e:
        print("Not available:", m, "|", type(e).__name__)

if chosen is None:
    raise RuntimeError("No HU<->RO OPUS model available. You must use a pivot or different base.")

MODEL_NAME = chosen
print("Chosen model:", MODEL_NAME)


REVERSE = (MODEL_NAME.endswith("ro-hu"))
print("REVERSE (means base is RO->HU):", REVERSE)


DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()


train_path = DATA_PROCESSED / "train.csv"
val_path   = DATA_PROCESSED / "val.csv"

train_full = pd.read_csv(train_path)
val_full   = pd.read_csv(val_path)


LEGAL_MARKERS_RO = [
    "Regulamentul", "Directiva", "articol", "alineat", "considerent",
    "Comisia", "Consiliul", "Parlamentul", "Uniunii", "statele membre"
]
LEGAL_MARKERS_HU = [
    "rendelet", "irányelv", "cikk", "bekezdés",
    "Bizottság", "Tanács", "Parlament", "Unió", "tagállam"
]

def is_legalish(row):
    hu = str(row["hu"])
    ro = str(row["ro"])
    hu_hit = any(m in hu for m in LEGAL_MARKERS_HU)
    ro_hit = any(m in ro for m in LEGAL_MARKERS_RO)
    return hu_hit or ro_hit


train_n = min(50000, len(train_full))
val_n   = min(2000, len(val_full))

train_sample = train_full.sample(train_n, random_state=42)

legal_part = train_sample[train_sample.apply(is_legalish, axis=1)]
rand_part  = train_sample.sample(min(len(train_sample), max(5000, train_n // 5)), random_state=43)
cur_df = pd.concat([legal_part, rand_part], ignore_index=True).drop_duplicates()

train_df = pd.concat([cur_df, train_sample], ignore_index=True).drop_duplicates()
val_df   = val_full.head(val_n)

print("Train size:", len(train_df), "| curriculum chunk:", len(cur_df), "| Val size:", len(val_df))

train_ds = Dataset.from_pandas(train_df, preserve_index=False)
val_ds   = Dataset.from_pandas(val_df, preserve_index=False)


tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

max_src_len = 256
max_tgt_len = 256

def preprocess(examples):

    src_texts = examples["hu"]
    tgt_texts = examples["ro"]

    if REVERSE:
        pass

    model_inputs = tokenizer(src_texts, truncation=True, max_length=max_src_len)

    labels = tokenizer(text_target=tgt_texts, truncation=True, max_length=max_tgt_len)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_tok = train_ds.map(preprocess, batched=True, remove_columns=train_ds.column_names)
val_tok   = val_ds.map(preprocess, batched=True, remove_columns=val_ds.column_names)


config = AutoConfig.from_pretrained(MODEL_NAME)
base_model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16 if DEVICE == "cuda" else None
).to(DEVICE)

base_model.config.use_cache = False
if hasattr(base_model, "gradient_checkpointing_enable"):
    base_model.gradient_checkpointing_enable()


lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
)

model = base_model
using_lora = False
try:
    model = get_peft_model(base_model, lora_config)
    if hasattr(model, "enable_input_require_grads"):
        model.enable_input_require_grads()
    using_lora = True
    print("Using LoRA.")
except Exception as e:
    print("LoRA not compatible here; training full model instead:", type(e).__name__, e)
    model = base_model

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


batch_size = 4 if DEVICE == "cuda" else 2
grad_accum = 4 if DEVICE == "cuda" else 8  

args = Seq2SeqTrainingArguments(
    output_dir=str(PROJECT_ROOT / "checkpoints" / "opus_hu_ro_legal_direct"),
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=grad_accum,
    learning_rate=3e-4 if using_lora else 5e-5,   
    num_train_epochs=num_train_epochs,
    fp16=(DEVICE == "cuda"),
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    save_total_limit=2,
    logging_steps=100,
    report_to="none",
    dataloader_num_workers=0,  
    
    label_smoothing_factor=0.1, 
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    weight_decay=0.01 if not using_lora else 0.0,
    
    predict_with_generate=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()
print("Done. Saved to:", args.output_dir)


'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: ecb16652-d4dc-423d-be78-350d7120b966)')' thrown while requesting HEAD https://huggingface.co/Helsinki-NLP/opus-mt-hu-ro/resolve/main/tokenizer_config.json
Retrying in 1s [Retry 1/5].


Not available: Helsinki-NLP/opus-mt-hu-ro | OSError
Not available: Helsinki-NLP/opus-mt-ro-hu | OSError


RuntimeError: No HU<->RO OPUS model available. You must use a pivot or different base.

In [None]:
import os, gc, json
import torch
import pandas as pd
from tqdm import tqdm
from pathlib import Path

from huggingface_hub import snapshot_download
from transformers import AutoTokenizer, AutoConfig, AutoModelForSeq2SeqLM


DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
print("DEVICE:", DEVICE)
print("Torch:", torch.__version__)

gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

HU_EN_BASE = "Helsinki-NLP/opus-mt-hu-en"  
BASE_SAFE_ROOT = Path(PROJECT_ROOT) / "safetensors_bases"
BASE_SAFE_ROOT.mkdir(parents=True, exist_ok=True)

OUT_PSEUDO_DIR = Path(PROJECT_ROOT) / "data" / "pseudo"
OUT_PSEUDO_DIR.mkdir(parents=True, exist_ok=True)


TRAIN_N = 50000
VAL_N   = 2000

def ensure_base_safetensors(model_id: str, out_root: Path) -> Path:
    out_dir = out_root / model_id.replace("/", "__")
    out_dir.mkdir(parents=True, exist_ok=True)

    if (out_dir / "model.safetensors").exists() and (out_dir / "config.json").exists():
        print("Base safetensors exists:", out_dir)
        return out_dir

    print(f"\n[Convert] Downloading snapshot for: {model_id}")
    snap_dir = Path(snapshot_download(
        repo_id=model_id,
        allow_patterns=[
            "config.json",
            "tokenizer.json",
            "tokenizer_config.json",
            "source.spm",
            "vocab.json", "merges.txt",
            "pytorch_model.bin",
            "pytorch_model.bin.index.json",
            "pytorch_model-*.bin",
        ],
        ignore_patterns=["*.msgpack", "*.h5", "*.ot", "*.tflite", "*.onnx"]
    ))

    config = AutoConfig.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_config(config)

    bin_path = snap_dir / "pytorch_model.bin"
    if bin_path.exists():
        print("[Convert] Found single pytorch_model.bin -> torch.load")
        state = torch.load(bin_path, map_location="cpu")
        model.load_state_dict(state, strict=False)
        del state
    else:
        index_path = snap_dir / "pytorch_model.bin.index.json"
        if not index_path.exists():
            raise FileNotFoundError("Could not find pytorch_model.bin or pytorch_model.bin.index.json")

        with open(index_path, "r", encoding="utf-8") as f:
            index = json.load(f)
        shard_files = sorted(set(index["weight_map"].values()))
        print(f"[Convert] Found sharded weights: {len(shard_files)} shards")

        for sf in tqdm(shard_files, desc=f"[Convert] Loading shards {model_id.split('/')[-1]}"):
            sp = snap_dir / sf
            shard_state = torch.load(sp, map_location="cpu")
            model.load_state_dict(shard_state, strict=False)
            del shard_state

    print("[Convert] Saving safetensors ->", out_dir)
    model.save_pretrained(out_dir, safe_serialization=True)
    config.save_pretrained(out_dir)
    return out_dir

HU_EN_SAFE = ensure_base_safetensors(HU_EN_BASE, BASE_SAFE_ROOT)


def translate(model, tok, texts, bs=16, max_new=128, num_beams=1):
    model.eval()
    outs = []
    i = 0
    pbar = tqdm(total=len(texts), desc="HU→EN pseudo", unit="sent")
    while i < len(texts):
        cur_bs = min(bs, len(texts) - i)
        batch = texts[i:i+cur_bs]
        try:
            inp = tok(batch, return_tensors="pt", padding=True, truncation=True, max_length=256).to(DEVICE)
            with torch.inference_mode():
                gen = model.generate(**inp, num_beams=num_beams, max_new_tokens=max_new, do_sample=False)
            outs.extend(tok.batch_decode(gen, skip_special_tokens=True))
            i += cur_bs
            pbar.update(cur_bs)
        except torch.cuda.OutOfMemoryError:
            torch.cuda.empty_cache()
            bs = max(1, bs // 2)
            print(f"⚠ OOM -> reducing batch size to {bs}")
            if bs == 1:
                continue
    pbar.close()
    return outs


train_df = pd.read_csv(DATA_PROCESSED / "train.csv").sample(TRAIN_N, random_state=42)
val_df   = pd.read_csv(DATA_PROCESSED / "val.csv").head(VAL_N)

print("Train size:", len(train_df), "Val size:", len(val_df))


tok_hu_en = AutoTokenizer.from_pretrained(HU_EN_BASE, use_fast=True)
model_hu_en = AutoModelForSeq2SeqLM.from_pretrained(
    HU_EN_SAFE,
    use_safetensors=True,
    torch_dtype=torch.float16 if DEVICE == "cuda" else None
).to(DEVICE)

train_en = translate(model_hu_en, tok_hu_en, train_df["hu"].tolist(), bs=16, num_beams=1)
val_en   = translate(model_hu_en, tok_hu_en, val_df["hu"].tolist(), bs=16, num_beams=1)

train_p1 = train_df.copy()
val_p1   = val_df.copy()
train_p1["en_pseudo"] = train_en
val_p1["en_pseudo"]   = val_en


train_p1_path = OUT_PSEUDO_DIR / f"train_hu_ro_enpseudo_{TRAIN_N}.csv"
val_p1_path   = OUT_PSEUDO_DIR / f"val_hu_ro_enpseudo_{VAL_N}.csv"

train_p1.to_csv(train_p1_path, index=False)
val_p1.to_csv(val_p1_path, index=False)

print("Saved:", train_p1_path)
print("Saved:", val_p1_path)


DEVICE: cuda
Torch: 2.5.1+cu121
Base safetensors exists: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\safetensors_bases\Helsinki-NLP__opus-mt-hu-en
Train size: 50000 Val size: 2000


HU→EN pseudo: 100%|██████████| 50000/50000 [29:52<00:00, 27.89sent/s]
HU→EN pseudo: 100%|██████████| 2000/2000 [01:12<00:00, 27.61sent/s]


Saved: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\data\pseudo\train_hu_ro_enpseudo_50000.csv
Saved: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\data\pseudo\val_hu_ro_enpseudo_2000.csv


In [None]:
import os, gc, json
import torch
import pandas as pd
from datasets import Dataset
from tqdm import tqdm
from pathlib import Path

from huggingface_hub import snapshot_download
from safetensors.torch import save_file
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
from peft import LoraConfig, get_peft_model, TaskType


DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
print("DEVICE:", DEVICE)
print("Torch:", torch.__version__)

gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

BASE_SAFE_ROOT = Path(PROJECT_ROOT) / "safetensors_bases"
BASE_SAFE_ROOT.mkdir(parents=True, exist_ok=True)

PSEUDO_DIR = Path(PROJECT_ROOT) / "data" / "pseudo"

TRAIN_N = 50000
VAL_N   = 2000
train_p1_path = PSEUDO_DIR / f"train_hu_ro_enpseudo_{TRAIN_N}.csv"
val_p1_path   = PSEUDO_DIR / f"val_hu_ro_enpseudo_{VAL_N}.csv"

HU_EN_BASE = "Helsinki-NLP/opus-mt-hu-en"
EN_RO_BASE = "Helsinki-NLP/opus-mt-en-ro"


MAX_LEN     = 256
BATCH_SIZE  = 4
GRAD_ACCUM  = 4
EPOCHS      = 2


def ensure_base_safetensors(model_id: str, out_root: Path) -> Path:
    out_dir = out_root / model_id.replace("/", "__")
    out_dir.mkdir(parents=True, exist_ok=True)

    if (out_dir / "model.safetensors").exists() and (out_dir / "config.json").exists():
        print("✅ Base safetensors exists:", out_dir)
        return out_dir

    print(f"\n[Convert] Downloading snapshot for: {model_id}")
    snap_dir = Path(snapshot_download(
        repo_id=model_id,
        allow_patterns=[
            "config.json",
            "tokenizer.json",
            "tokenizer_config.json",
            "source.spm",
            "vocab.json", "merges.txt",
            "pytorch_model.bin",
            "pytorch_model.bin.index.json",
            "pytorch_model-*.bin",
        ],
        ignore_patterns=["*.msgpack", "*.h5", "*.ot", "*.tflite", "*.onnx"]
    ))

    config = AutoConfig.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_config(config)

    bin_path = snap_dir / "pytorch_model.bin"
    if bin_path.exists():
        print("[Convert] Found single pytorch_model.bin -> torch.load")
        state = torch.load(bin_path, map_location="cpu")
        model.load_state_dict(state, strict=False)
        del state
    else:
        index_path = snap_dir / "pytorch_model.bin.index.json"
        if not index_path.exists():
            raise FileNotFoundError("Could not find pytorch_model.bin or pytorch_model.bin.index.json")

        with open(index_path, "r", encoding="utf-8") as f:
            index = json.load(f)
        shard_files = sorted(set(index["weight_map"].values()))
        print(f"[Convert] Found sharded weights: {len(shard_files)} shards")

        for sf in tqdm(shard_files, desc=f"[Convert] Loading shards {model_id.split('/')[-1]}"):
            sp = snap_dir / sf
            shard_state = torch.load(sp, map_location="cpu")
            model.load_state_dict(shard_state, strict=False)
            del shard_state

    print("[Convert] Saving safetensors ->", out_dir)
    model.save_pretrained(out_dir, safe_serialization=True)
    config.save_pretrained(out_dir)
    return out_dir

HU_EN_SAFE = ensure_base_safetensors(HU_EN_BASE, BASE_SAFE_ROOT)
EN_RO_SAFE = ensure_base_safetensors(EN_RO_BASE, BASE_SAFE_ROOT)


def finetune_lora(
    base_name: str,
    base_safe_dir: Path,
    src_col: str, tgt_col: str,
    train_df: pd.DataFrame, val_df: pd.DataFrame,
    out_dir: str,
    lr_lora: float = 3e-4,
    epochs: int = 2,
    max_len: int = 256,
    bs: int = 4,
    grad_accum: int = 4,
):
    print(f"\n[Train] {base_name} | {src_col} → {tgt_col}")

    tok = AutoTokenizer.from_pretrained(base_name, use_fast=True)

    train_ds = Dataset.from_pandas(train_df, preserve_index=False)
    val_ds   = Dataset.from_pandas(val_df, preserve_index=False)

    def prep(ex):
        x = tok(ex[src_col], truncation=True, max_length=max_len)
        y = tok(text_target=ex[tgt_col], truncation=True, max_length=max_len)
        x["labels"] = y["input_ids"]
        return x

    train_tok = train_ds.map(prep, batched=True, remove_columns=train_ds.column_names)
    val_tok   = val_ds.map(prep, batched=True, remove_columns=val_ds.column_names)

    base = AutoModelForSeq2SeqLM.from_pretrained(
        base_safe_dir,
        use_safetensors=True,
        torch_dtype=torch.float16 if DEVICE == "cuda" else None
    ).to(DEVICE)

    base.config.use_cache = False
    if hasattr(base, "gradient_checkpointing_enable"):
        base.gradient_checkpointing_enable()

    lora_cfg = LoraConfig(
        task_type=TaskType.SEQ_2_SEQ_LM,
        r=8, lora_alpha=16, lora_dropout=0.05,
        target_modules=["q_proj", "v_proj"],
    )

    using_lora = False
    try:
        model = get_peft_model(base, lora_cfg)
        if hasattr(model, "enable_input_require_grads"):
            model.enable_input_require_grads()
        using_lora = True
        print("✔ Using LoRA")
    except Exception as e:
        print("⚠ LoRA failed, full fine-tune:", type(e).__name__, e)
        model = base

    collator = DataCollatorForSeq2Seq(tok, model=model)

    args = Seq2SeqTrainingArguments(
        output_dir=out_dir,
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs,
        gradient_accumulation_steps=grad_accum,
        learning_rate=(lr_lora if using_lora else 5e-5),
        num_train_epochs=epochs,
        fp16=(DEVICE == "cuda"),

        # older transformers compat:
        eval_strategy="steps",
        eval_steps=1000,

        save_steps=1000,
        save_total_limit=2,
        logging_steps=100,
        report_to="none",
        dataloader_num_workers=0,

        label_smoothing_factor=0.1,
        lr_scheduler_type="cosine",
        warmup_ratio=0.03,
        predict_with_generate=False,
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=args,
        train_dataset=train_tok,
        eval_dataset=val_tok,
        tokenizer=tok,
        data_collator=collator,
    )

    trainer.train()
    return model, tok

if not train_p1_path.exists() or not val_p1_path.exists():
    raise FileNotFoundError(
        "Pseudo data not found. Run Cell B first.\n"
        f"Missing: {train_p1_path} or {val_p1_path}"
    )

train_p1 = pd.read_csv(train_p1_path)
val_p1   = pd.read_csv(val_p1_path)

print("Loaded pseudo data:", len(train_p1), "train |", len(val_p1), "val")


hu_en_ft, hu_en_ft_tok = finetune_lora(
    HU_EN_BASE, HU_EN_SAFE,
    src_col="hu", tgt_col="en_pseudo",
    train_df=train_p1, val_df=val_p1,
    out_dir=str(PROJECT_ROOT / "checkpoints" / "hu_en_legal_lora"),
    epochs=EPOCHS,
    max_len=MAX_LEN,
    bs=BATCH_SIZE,
    grad_accum=GRAD_ACCUM,
)


en_ro_ft, en_ro_ft_tok = finetune_lora(
    EN_RO_BASE, EN_RO_SAFE,
    src_col="en_pseudo", tgt_col="ro",
    train_df=train_p1, val_df=val_p1,
    out_dir=str(PROJECT_ROOT / "checkpoints" / "en_ro_legal_lora"),
    epochs=EPOCHS,
    max_len=MAX_LEN,
    bs=BATCH_SIZE,
    grad_accum=GRAD_ACCUM,
)

print("\n✅ Done training pivot adapters.")


DEVICE: cuda
Torch: 2.5.1+cu121
✅ Base safetensors exists: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\safetensors_bases\Helsinki-NLP__opus-mt-hu-en

[Convert] Downloading snapshot for: Helsinki-NLP/opus-mt-en-ro
[Convert] Found single pytorch_model.bin -> torch.load


  state = torch.load(bin_path, map_location="cpu")


[Convert] Saving safetensors -> D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\safetensors_bases\Helsinki-NLP__opus-mt-en-ro




Loaded pseudo data: 50000 train | 2000 val

[Train] Helsinki-NLP/opus-mt-hu-en | hu → en_pseudo


Map: 100%|██████████| 50000/50000 [00:19<00:00, 2565.35 examples/s]
Map: 100%|██████████| 2000/2000 [00:00<00:00, 2560.82 examples/s]
  trainer = Seq2SeqTrainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


✔ Using LoRA


Step,Training Loss,Validation Loss
1000,1.7318,1.656345
2000,1.7305,1.652001
3000,1.7244,1.650608
4000,1.7232,1.648507
5000,1.7163,1.647224
6000,1.7175,1.647107


'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: 43da0b4e-4bfd-4589-b5f7-f362117eecbf)')' thrown while requesting HEAD https://huggingface.co/Helsinki-NLP/opus-mt-en-ro/resolve/main/tokenizer_config.json
Retrying in 1s [Retry 1/5].



[Train] Helsinki-NLP/opus-mt-en-ro | en_pseudo → ro


Map: 100%|██████████| 50000/50000 [00:20<00:00, 2427.07 examples/s]
Map: 100%|██████████| 2000/2000 [00:00<00:00, 2427.18 examples/s]
  trainer = Seq2SeqTrainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


✔ Using LoRA


Step,Training Loss,Validation Loss
1000,2.7221,2.654602
2000,2.7281,2.641886
3000,2.7173,2.640958
4000,2.7045,2.638611
5000,2.7056,2.63642
6000,2.6819,2.635623



✅ Done training pivot adapters.


In [None]:
import os, gc, json, re, unicodedata
import numpy as np
import pandas as pd
import torch
import sacrebleu
from tqdm import tqdm
from pathlib import Path

from huggingface_hub import snapshot_download
from transformers import AutoTokenizer, AutoConfig, AutoModelForSeq2SeqLM
from peft import PeftModel


DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
print("DEVICE:", DEVICE)
print("Torch:", torch.__version__)

gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

# Bases
HU_EN_BASE = "Helsinki-NLP/opus-mt-hu-en"
EN_RO_BASE = "Helsinki-NLP/opus-mt-en-ro"

HU_EN_RUN = Path(PROJECT_ROOT) / "checkpoints" / "hu_en_legal_lora"
EN_RO_RUN = Path(PROJECT_ROOT) / "checkpoints" / "en_ro_legal_lora"

BASE_SAFE_ROOT = Path(PROJECT_ROOT) / "safetensors_bases"
BASE_SAFE_ROOT.mkdir(parents=True, exist_ok=True)

OUT_DIR = Path(OUTPUT_DIR)
OUT_DIR.mkdir(parents=True, exist_ok=True)

MAX_INPUT_LEN  = 256
MAX_NEW_TOKENS = 128
BS = 16 if DEVICE == "cuda" else 4
NUM_BEAMS = 1  
LENGTH_PENALTY = 1.0


def get_latest_checkpoint(run_dir: Path) -> Path:
    ckpts = sorted(
        [p for p in run_dir.glob("checkpoint-*") if p.is_dir()],
        key=lambda p: int(p.name.split("-")[-1]) if p.name.split("-")[-1].isdigit() else -1
    )
    return ckpts[-1] if ckpts else run_dir

def ensure_base_safetensors(model_id: str, out_root: Path) -> Path:
    out_dir = out_root / model_id.replace("/", "__")
    out_dir.mkdir(parents=True, exist_ok=True)

    if (out_dir / "model.safetensors").exists() and (out_dir / "config.json").exists():
        return out_dir

    print(f"[Convert] Creating base safetensors for {model_id}")
    snap_dir = Path(snapshot_download(
        repo_id=model_id,
        allow_patterns=[
            "config.json",
            "tokenizer.json",
            "tokenizer_config.json",
            "source.spm",
            "vocab.json", "merges.txt",
            "pytorch_model.bin",
            "pytorch_model.bin.index.json",
            "pytorch_model-*.bin",
        ],
    ))

    config = AutoConfig.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_config(config)

    bin_path = snap_dir / "pytorch_model.bin"
    if bin_path.exists():
        state = torch.load(bin_path, map_location="cpu")
        model.load_state_dict(state, strict=False)
        del state
    else:
        index_path = snap_dir / "pytorch_model.bin.index.json"
        with open(index_path, "r", encoding="utf-8") as f:
            index = json.load(f)
        shard_files = sorted(set(index["weight_map"].values()))
        for sf in tqdm(shard_files, desc=f"[Convert] shards {model_id.split('/')[-1]}"):
            shard_state = torch.load(snap_dir / sf, map_location="cpu")
            model.load_state_dict(shard_state, strict=False)
            del shard_state

    model.save_pretrained(out_dir, safe_serialization=True)
    config.save_pretrained(out_dir)
    return out_dir

def load_lora_model(base_id: str, run_dir: Path):
    base_safe = ensure_base_safetensors(base_id, BASE_SAFE_ROOT)
    ckpt = get_latest_checkpoint(run_dir)
    print(f"Loading {base_id} + LoRA from:", ckpt)

    tok = AutoTokenizer.from_pretrained(base_id, use_fast=True)
    base = AutoModelForSeq2SeqLM.from_pretrained(
        base_safe,
        use_safetensors=True,
        torch_dtype=torch.float16 if DEVICE == "cuda" else None
    ).to(DEVICE)

    model = PeftModel.from_pretrained(base, ckpt).to(DEVICE)
    model.eval()
    return model, tok, ckpt

def batched_generate(model, tok, texts, bs=16, max_input_len=256, max_new=128, num_beams=1, length_penalty=1.0, desc="Gen"):
    outs = []
    i = 0
    pbar = tqdm(total=len(texts), desc=desc, unit="sent")
    while i < len(texts):
        cur_bs = min(bs, len(texts) - i)
        batch = texts[i:i+cur_bs]
        try:
            inp = tok(batch, return_tensors="pt", padding=True, truncation=True, max_length=max_input_len).to(DEVICE)
            with torch.inference_mode():
                gen = model.generate(
                    **inp,
                    num_beams=num_beams,
                    length_penalty=length_penalty,
                    max_new_tokens=max_new,
                    do_sample=False,
                )
            outs.extend(tok.batch_decode(gen, skip_special_tokens=True))
            i += cur_bs
            pbar.update(cur_bs)
        except torch.cuda.OutOfMemoryError:
            torch.cuda.empty_cache()
            bs = max(1, bs // 2)
            print(f"⚠ OOM -> reducing batch size to {bs}")
    pbar.close()
    return outs

hu_en_model, hu_en_tok, hu_en_ckpt = load_lora_model(HU_EN_BASE, HU_EN_RUN)
en_ro_model, en_ro_tok, en_ro_ckpt = load_lora_model(EN_RO_BASE, EN_RO_RUN)


test_df = pd.read_csv(Path(DATA_PROCESSED) / "test.csv")
src_sentences = test_df["hu"].tolist()
refs = test_df["ro"].tolist()
print("Test size:", len(src_sentences))


en_mid = batched_generate(
    hu_en_model, hu_en_tok, src_sentences,
    bs=BS, max_input_len=MAX_INPUT_LEN, max_new=MAX_NEW_TOKENS,
    num_beams=NUM_BEAMS, length_penalty=LENGTH_PENALTY,
    desc="HU→EN (legal)"
)

hyps = batched_generate(
    en_ro_model, en_ro_tok, en_mid,
    bs=BS, max_input_len=MAX_INPUT_LEN, max_new=MAX_NEW_TOKENS,
    num_beams=NUM_BEAMS, length_penalty=LENGTH_PENALTY,
    desc="EN→RO (legal)"
)


bleu = sacrebleu.corpus_bleu(hyps, [refs]).score
print("\nPivot BLEU:", round(bleu, 2))

hu_diacritics = set("áéíóöőúüűÁÉÍÓÖŐÚÜŰ")
def has_hu_diacritics(s: str) -> bool:
    return any(c in hu_diacritics for c in s)

def en_leak(s: str) -> bool:
    s = " " + re.sub(r"\s+", " ", s.lower()) + " "
    return any(w in s for w in [" the ", " and ", " of ", " to ", " for ", " with ", " on "])

hu_leak = np.mean([has_hu_diacritics(h) for h in hyps]) * 100
en_leak_rate = np.mean([en_leak(h) for h in hyps]) * 100
print("HU diacritics leak %:", round(hu_leak, 2))
print("English leak proxy %:", round(en_leak_rate, 2))


def norm_ro(s):
    s = s.lower()
    s = s.replace("ţ","ț").replace("ş","ș")
    return unicodedata.normalize("NFKC", s)

glossary_norm = {norm_ro(k): [norm_ro(v) for v in vs] for k, vs in glossary.items()}

def glossary_hit(src: str, hyp: str) -> list:
    src_n, hyp_n = norm_ro(src), norm_ro(hyp)
    checks = []
    for src_term, ro_forms in glossary_norm.items():
        if src_term in src_n:
            checks.append(any(f in hyp_n for f in ro_forms))
    return checks

hits = [glossary_hit(s, h) for s, h in zip(src_sentences, hyps)]
flat = [x for row in hits for x in row]
if flat:
    print("Glossary accuracy %:", round(np.mean(flat) * 100, 2))
else:
    print("Glossary accuracy: (no glossary terms found in test)")


out_path = OUT_DIR / f"pivot_predictions_{hu_en_ckpt.name}__{en_ro_ckpt.name}_beam{NUM_BEAMS}.csv"
pd.DataFrame({
    "source_hu": src_sentences,
    "pivot_en": en_mid,
    "reference_ro": refs,
    "hypothesis_ro": hyps
}).to_csv(out_path, index=False)
print("Saved:", out_path)


# try:
#     from comet import download_model, load_from_checkpoint
#     N = 1000
#     comet_path = download_model("Unbabel/wmt22-comet-da")
#     comet_model = load_from_checkpoint(comet_path)
#     data = [{"src": s, "mt": m, "ref": r} for s,m,r in zip(src_sentences[:N], hyps[:N], refs[:N])]
#     out = comet_model.predict(data, batch_size=8, gpus=1 if DEVICE=="cuda" else 0)
#     print("COMET (first 1000):", round(float(np.mean(out.scores)), 4))
# except Exception as e:
#     print("COMET skipped:", type(e).__name__, e)


DEVICE: cuda
Torch: 2.5.1+cu121
Loading Helsinki-NLP/opus-mt-hu-en + LoRA from: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\checkpoints\hu_en_legal_lora\checkpoint-6250




Loading Helsinki-NLP/opus-mt-en-ro + LoRA from: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\checkpoints\en_ro_legal_lora\checkpoint-6250
Test size: 30366


HU→EN (legal): 100%|██████████| 30366/30366 [26:22<00:00, 19.19sent/s]
EN→RO (legal): 100%|██████████| 30366/30366 [30:01<00:00, 16.86sent/s]



Pivot BLEU: 38.64
HU diacritics leak %: 0.51
English leak proxy %: 0.07
Glossary accuracy %: 46.56
Saved: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\data\outputs\pivot_predictions_checkpoint-6250__checkpoint-6250_beam1.csv


## 4 beams

In [None]:
import os, gc, json, re, unicodedata
import numpy as np
import pandas as pd
import torch
import sacrebleu
from tqdm import tqdm
from pathlib import Path

from huggingface_hub import snapshot_download
from transformers import AutoTokenizer, AutoConfig, AutoModelForSeq2SeqLM
from peft import PeftModel


DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
print("DEVICE:", DEVICE)
print("Torch:", torch.__version__)

gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

HU_EN_BASE = "Helsinki-NLP/opus-mt-hu-en"
EN_RO_BASE = "Helsinki-NLP/opus-mt-en-ro"

HU_EN_RUN = Path(PROJECT_ROOT) / "checkpoints" / "hu_en_legal_lora"
EN_RO_RUN = Path(PROJECT_ROOT) / "checkpoints" / "en_ro_legal_lora"

BASE_SAFE_ROOT = Path(PROJECT_ROOT) / "safetensors_bases"
BASE_SAFE_ROOT.mkdir(parents=True, exist_ok=True)

OUT_DIR = Path(OUTPUT_DIR)
OUT_DIR.mkdir(parents=True, exist_ok=True)

# FINAL decode settings (beam=4 + anti-repeat)
NUM_BEAMS = 4
LENGTH_PENALTY = 1.0
NO_REPEAT_NGRAM = 3
REPETITION_PENALTY = 1.10

MAX_INPUT_LEN  = 256
MAX_NEW_TOKENS_NORMAL = 128


MAX_NEW_TOKENS_TABLE = 32
NUM_BEAMS_TABLE = 1

BS = 12 if DEVICE == "cuda" else 4  


USE_GLOSSARY_CONSTRAINTS = True
MAX_FORCED_TERMS_PER_SENT = 3


def get_latest_checkpoint(run_dir: Path) -> Path:
    ckpts = sorted(
        [p for p in run_dir.glob("checkpoint-*") if p.is_dir()],
        key=lambda p: int(p.name.split("-")[-1]) if p.name.split("-")[-1].isdigit() else -1
    )
    return ckpts[-1] if ckpts else run_dir

def ensure_base_safetensors(model_id: str, out_root: Path) -> Path:
    out_dir = out_root / model_id.replace("/", "__")
    out_dir.mkdir(parents=True, exist_ok=True)
    if (out_dir / "model.safetensors").exists() and (out_dir / "config.json").exists():
        return out_dir

    print(f"[Convert] Creating base safetensors for {model_id}")
    snap_dir = Path(snapshot_download(
        repo_id=model_id,
        allow_patterns=[
            "config.json",
            "tokenizer.json",
            "tokenizer_config.json",
            "source.spm",
            "vocab.json", "merges.txt",
            "pytorch_model.bin",
            "pytorch_model.bin.index.json",
            "pytorch_model-*.bin",
        ],
    ))

    config = AutoConfig.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_config(config)

    bin_path = snap_dir / "pytorch_model.bin"
    if bin_path.exists():
        state = torch.load(bin_path, map_location="cpu")
        model.load_state_dict(state, strict=False)
        del state
    else:
        index_path = snap_dir / "pytorch_model.bin.index.json"
        with open(index_path, "r", encoding="utf-8") as f:
            index = json.load(f)
        shard_files = sorted(set(index["weight_map"].values()))
        for sf in tqdm(shard_files, desc=f"[Convert] shards {model_id.split('/')[-1]}"):
            shard_state = torch.load(snap_dir / sf, map_location="cpu")
            model.load_state_dict(shard_state, strict=False)
            del shard_state

    model.save_pretrained(out_dir, safe_serialization=True)
    config.save_pretrained(out_dir)
    return out_dir

def load_lora_model(base_id: str, run_dir: Path):
    base_safe = ensure_base_safetensors(base_id, BASE_SAFE_ROOT)
    ckpt = get_latest_checkpoint(run_dir)
    print(f"Loading {base_id} + LoRA from:", ckpt)

    tok = AutoTokenizer.from_pretrained(base_id, use_fast=True)
    base = AutoModelForSeq2SeqLM.from_pretrained(
        base_safe,
        use_safetensors=True,
        torch_dtype=torch.float16 if DEVICE == "cuda" else None
    ).to(DEVICE)

    model = PeftModel.from_pretrained(base, ckpt).to(DEVICE)
    model.eval()
    return model, tok, ckpt


_punct = set(list("0123456789.,;:-/()[]{}<>|_+*=—–%°\"' \t"))
def is_table_like(s: str) -> bool:
    if s is None:
        return True
    t = str(s).strip()
    if len(t) == 0:
        return True
    if "_BAR_" in t or t.count("|") >= 2:
        return True
   
    good = sum(ch in _punct for ch in t)
    if good / max(1, len(t)) > 0.80:
        return True
    # Many long dash runs
    if re.search(r"[-—–]{6,}", t):
        return True
    return False


def norm_ro(s):
    s = s.lower()
    s = s.replace("ţ","ț").replace("ş","ș")
    return unicodedata.normalize("NFKC", s)

def build_force_words_ids_for_sentence(en_ro_tok, hu_src: str):
  
    
    if not USE_GLOSSARY_CONSTRAINTS:
        return None
    if "glossary" not in globals() or glossary is None:
        return None

    forced = []
    hu_s = str(hu_src)
    for hu_term, ro_forms in glossary.items():
        if hu_term in hu_s and ro_forms:
            ro_form = ro_forms[0]
            ids = en_ro_tok(ro_form, add_special_tokens=False).input_ids
            if ids:
                forced.append(ids)
        if len(forced) >= MAX_FORCED_TERMS_PER_SENT:
            break

    return forced if forced else None

def batched_generate_adaptive(
    model, tok,
    texts,
    bs: int,
    desc: str,
    max_input_len: int,
   
    num_beams: int,
    max_new_tokens: int,
    length_penalty: float,
    no_repeat_ngram_size: int,
    repetition_penalty: float,
   
    num_beams_table: int,
    max_new_tokens_table: int,
    per_sentence_force_words_ids=None
):
    outs = [None] * len(texts)

    idx_table = [i for i, s in enumerate(texts) if is_table_like(s)]
    idx_norm  = [i for i in range(len(texts)) if i not in set(idx_table)]

    def _run(indices, beams, max_new, tag):
        if not indices:
            return
        i = 0
        pbar = tqdm(total=len(indices), desc=f"{desc} [{tag}]", unit="sent")
        cur_bs = bs
        while i < len(indices):
            take = min(cur_bs, len(indices) - i)
            batch_idx = indices[i:i+take]
            batch_txt = [texts[j] for j in batch_idx]

            try:
                inp = tok(
                    batch_txt,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=max_input_len
                ).to(DEVICE)

               
                gen_kwargs = dict(
                    num_beams=beams,
                    length_penalty=length_penalty if tag == "normal" else 1.0,
                    max_new_tokens=max_new,
                    do_sample=False,
                    no_repeat_ngram_size=no_repeat_ngram_size if tag == "normal" else 0,
                    repetition_penalty=repetition_penalty if tag == "normal" else 1.0,
                )

             
                if per_sentence_force_words_ids is not None and tag == "normal":
                    
                    any_forced = any(per_sentence_force_words_ids[j] for j in batch_idx)
                    if any_forced:
                        
                        decoded = []
                        for j, txt in zip(batch_idx, batch_txt):
                            one_inp = tok(txt, return_tensors="pt", truncation=True, max_length=max_input_len).to(DEVICE)
                            one_kwargs = dict(gen_kwargs)
                            fw = per_sentence_force_words_ids[j]
                            if fw:
                                one_kwargs["force_words_ids"] = fw
                            with torch.inference_mode():
                                one_gen = model.generate(**one_inp, **one_kwargs)
                            decoded.append(tok.batch_decode(one_gen, skip_special_tokens=True)[0])
                        for j, d in zip(batch_idx, decoded):
                            outs[j] = d
                        i += take
                        pbar.update(take)
                        continue

                with torch.inference_mode():
                    gen = model.generate(**inp, **gen_kwargs)

                decoded = tok.batch_decode(gen, skip_special_tokens=True)
                for j, d in zip(batch_idx, decoded):
                    outs[j] = d

                i += take
                pbar.update(take)

            except torch.cuda.OutOfMemoryError:
                torch.cuda.empty_cache()
                cur_bs = max(1, cur_bs // 2)
                print(f"⚠ OOM in {desc}/{tag} -> reducing batch size to {cur_bs}")

        pbar.close()

    _run(idx_norm,  num_beams,       max_new_tokens,       "normal")
    _run(idx_table, num_beams_table, max_new_tokens_table, "table")

    assert all(o is not None for o in outs), "Some generations failed unexpectedly."
    return outs


hu_en_model, hu_en_tok, hu_en_ckpt = load_lora_model(HU_EN_BASE, HU_EN_RUN)
en_ro_model, en_ro_tok, en_ro_ckpt = load_lora_model(EN_RO_BASE, EN_RO_RUN)


test_df = pd.read_csv(Path(DATA_PROCESSED) / "test.csv")
src_sentences = test_df["hu"].tolist()
refs = test_df["ro"].tolist()
print("Test size:", len(src_sentences))

en_mid = batched_generate_adaptive(
    hu_en_model, hu_en_tok,
    src_sentences,
    bs=BS,
    desc="HU→EN (legal)",
    max_input_len=MAX_INPUT_LEN,
    num_beams=NUM_BEAMS,
    max_new_tokens=MAX_NEW_TOKENS_NORMAL,
    length_penalty=LENGTH_PENALTY,
    no_repeat_ngram_size=NO_REPEAT_NGRAM,
    repetition_penalty=REPETITION_PENALTY,
    num_beams_table=NUM_BEAMS_TABLE,
    max_new_tokens_table=MAX_NEW_TOKENS_TABLE,
    per_sentence_force_words_ids=None
)


force_ids = None
if USE_GLOSSARY_CONSTRAINTS:
    print("Building per-sentence glossary constraints (conservative)...")
    force_ids = [build_force_words_ids_for_sentence(en_ro_tok, hu) for hu in tqdm(src_sentences, desc="Glossary constraints")]

hyps = batched_generate_adaptive(
    en_ro_model, en_ro_tok,
    en_mid,
    bs=BS,
    desc="EN→RO (legal)",
    max_input_len=MAX_INPUT_LEN,
    num_beams=NUM_BEAMS,
    max_new_tokens=MAX_NEW_TOKENS_NORMAL,
    length_penalty=LENGTH_PENALTY,
    no_repeat_ngram_size=NO_REPEAT_NGRAM,
    repetition_penalty=REPETITION_PENALTY,
    num_beams_table=NUM_BEAMS_TABLE,
    max_new_tokens_table=MAX_NEW_TOKENS_TABLE,
    per_sentence_force_words_ids=force_ids
)


bleu = sacrebleu.corpus_bleu(hyps, [refs]).score
print("\nPivot BLEU:", round(bleu, 2))

hu_diacritics = set("áéíóöőúüűÁÉÍÓÖŐÚÜŰ")
def has_hu_diacritics(s: str) -> bool:
    return any(c in hu_diacritics for c in s)

def en_leak(s: str) -> bool:
    s = " " + re.sub(r"\s+", " ", str(s).lower()) + " "
    return any(w in s for w in [" the ", " and ", " of ", " to ", " for ", " with ", " on "])

hu_leak = np.mean([has_hu_diacritics(h) for h in hyps]) * 100
en_leak_rate = np.mean([en_leak(h) for h in hyps]) * 100
print("HU diacritics leak %:", round(hu_leak, 2))
print("English leak proxy %:", round(en_leak_rate, 2))


def norm_ro(s):
    s = str(s).lower()
    s = s.replace("ţ","ț").replace("ş","ș")
    return unicodedata.normalize("NFKC", s)

glossary_norm = {k: [norm_ro(v) for v in vs] for k, vs in glossary.items()} if "glossary" in globals() else {}

def glossary_hit(src: str, hyp: str) -> list:
    hyp_n = norm_ro(hyp)
    checks = []
    for hu_term, ro_forms in glossary_norm.items():
        if hu_term in str(src):
            checks.append(any(f in hyp_n for f in ro_forms))
    return checks

if glossary_norm:
    hits = [glossary_hit(s, h) for s, h in zip(src_sentences, hyps)]
    flat = [x for row in hits for x in row]
    if flat:
        print("Glossary accuracy %:", round(np.mean(flat) * 100, 2))
    else:
        print("Glossary accuracy: (no glossary terms found in test)")
else:
    print("Glossary accuracy: skipped (no glossary loaded)")


out_path = OUT_DIR / f"pivot_predictions_{hu_en_ckpt.name}__{en_ro_ckpt.name}_beam{NUM_BEAMS}_antiRepeat.csv"
pd.DataFrame({
    "source_hu": src_sentences,
    "pivot_en": en_mid,
    "reference_ro": refs,
    "hypothesis_ro": hyps
}).to_csv(out_path, index=False)

print("Saved:", out_path)
print("Done.")


DEVICE: cuda
Torch: 2.5.1+cu121
Loading Helsinki-NLP/opus-mt-hu-en + LoRA from: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\checkpoints\hu_en_legal_lora\checkpoint-6250


`torch_dtype` is deprecated! Use `dtype` instead!
  return t.to(


Loading Helsinki-NLP/opus-mt-en-ro + LoRA from: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\checkpoints\en_ro_legal_lora\checkpoint-6250
Test size: 30366


HU→EN (legal) [normal]: 100%|██████████| 29820/29820 [1:20:20<00:00,  6.19sent/s]
HU→EN (legal) [table]: 100%|██████████| 546/546 [00:12<00:00, 44.88sent/s]


Building per-sentence glossary constraints (conservative)...


Glossary constraints: 100%|██████████| 30366/30366 [00:00<00:00, 2169378.90it/s]
EN→RO (legal) [normal]: 100%|██████████| 30200/30200 [1:28:12<00:00,  5.71sent/s]
EN→RO (legal) [table]: 100%|██████████| 166/166 [00:02<00:00, 63.02sent/s]



Pivot BLEU: 38.13
HU diacritics leak %: 0.53
English leak proxy %: 0.11
Glossary accuracy: skipped (no glossary loaded)
Saved: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\data\outputs\pivot_predictions_checkpoint-6250__checkpoint-6250_beam4_antiRepeat.csv
Done.


In [None]:
import re
import pandas as pd
from pathlib import Path
from collections import Counter, defaultdict

TRAIN_PATH = Path(DATA_PROCESSED) / "train.csv"
OUT_PATH = Path(PROJECT_ROOT) / "data" / "glossary.csv"
OUT_PATH.parent.mkdir(parents=True, exist_ok=True)

df = pd.read_csv(TRAIN_PATH)

hu = df["hu"].astype(str).tolist()
ro = df["ro"].astype(str).tolist()


LEGAL_HINTS_HU = [
    "bizottság", "tanács", "parlament", "rendelet", "irányelv", "határozat",
    "cikk", "bekezdés", "melléklet", "fejezet", "szakasz",
    "közösség", "unió", "szerződés",
    "európai", "bíróság", "biztos", "tagállam"
]


cap_phrase = re.compile(r"\b([A-ZÁÉÍÓÖŐÚÜŰ][\wÁÉÍÓÖŐÚÜŰáéíóöőúüű\-]+(?:\s+[A-ZÁÉÍÓÖŐÚÜŰ][\wÁÉÍÓÖŐÚÜŰáéíóöőúüű\-]+){0,4})\b")
word = re.compile(r"\b[\wÁÉÍÓÖŐÚÜŰáéíóöőúüű\-]{4,}\b")

def extract_candidates(sent: str):
    sent_l = sent.lower()
    out = set()

    for m in cap_phrase.finditer(sent):
        t = m.group(1).strip()
        if len(t) >= 4:
            out.add(t)

    
    for w_ in word.findall(sent):
        wl = w_.lower()
        if any(h in wl for h in LEGAL_HINTS_HU):
            out.add(w_)

    return out


cap_phrase_ro = re.compile(r"\b([A-ZĂÂÎȘȚ][\wĂÂÎȘȚăâîșț\-]+(?:\s+[A-ZĂÂÎȘȚ][\wĂÂÎȘȚăâîșț\-]+){0,4})\b")
legal_ro_words = re.compile(r"\b(comisia|consiliul|parlamentul|regulamentul|directiva|decizia|articolul|alineatul|anexa|capitolul|secțiunea|uniunea|comunitatea|curtea|statul)\b", re.IGNORECASE)

def ro_signals(sent: str):
    s = sent.strip()
    outs = []

    outs += [m.group(1).strip() for m in cap_phrase_ro.finditer(s)]

    
    outs += [m.group(0).strip() for m in legal_ro_words.finditer(s)]

    
    seen = set()
    out2 = []
    for x in outs:
        xl = x.lower()
        if xl not in seen:
            seen.add(xl)
            out2.append(x)
    return out2

pair_counts = defaultdict(Counter)


N = min(200000, len(hu))
for s_hu, s_ro in zip(hu[:N], ro[:N]):
    cands = extract_candidates(s_hu)
    ro_sigs = ro_signals(s_ro)
    if not ro_sigs:
        continue
    for term in cands:
        for r_ in ro_sigs:
            pair_counts[term][r_] += 1


rows = []
for term, ctr in pair_counts.items():
    total = sum(ctr.values())
    if total < 30:  
        continue
    top = ctr.most_common(5)
    ro_forms = [r for r, c in top if c >= max(5, 0.15 * top[0][1])]
    if ro_forms:
        rows.append({"hu": term, "ro": "|".join(ro_forms), "count": top[0][1], "total": total})

gloss_df = pd.DataFrame(rows).sort_values(["count", "total"], ascending=False)

gloss_df = gloss_df.head(500)

gloss_df[["hu", "ro"]].to_csv(OUT_PATH, index=False)
print("✅ Wrote glossary:", OUT_PATH)
print("Entries:", len(gloss_df))
gloss_df.head(20)


✅ Wrote glossary: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\data\glossary.csv
Entries: 500


Unnamed: 0,hu,ro,count,total
6,rendelet,Regulamentul|CEE|CE|Regulamentului|anexa,10427,58937
18,Bizottság,Comisia|Comisiei|Comisie|CE|În,7088,42319
25,irányelv,Directiva|CEE|CE|anexa|Directivei,6457,40351
0,tagállamok,Statele|Comisiei|Comisia|CEE|În,6332,33337
15,tanácsi,CEE|Regulamentul|CE|Directiva|Regulamentul Con...,5597,39791
9,cikke,CEE|Regulamentul|CE|Directiva|Regulamentul Con...,3982,30115
35,mellékletben,anexa|II,2711,10752
2,rendelettel,Regulamentul|CEE|CE|Regulamentul Consiliului|R...,2650,12772
34,tagállam,statul|În|Comisia|Dacă|Comisiei,2538,16640
30,Közösség,Comunităţii|Comunitate|Comunitatea|CEE|În,2416,18043


In [None]:
import pandas as pd
import numpy as np
import re
import unicodedata
from pathlib import Path
from collections import Counter


GLOSS_PATH = Path(PROJECT_ROOT) / "data" / "glossary.csv"
TEST_PATH  = Path(DATA_PROCESSED) / "test.csv"

assert GLOSS_PATH.exists(), f"Glossary not found at {GLOSS_PATH}. Run Option A first."
assert TEST_PATH.exists(), f"Test not found at {TEST_PATH}."

gloss_df = pd.read_csv(GLOSS_PATH)
test_df  = pd.read_csv(TEST_PATH)

assert "hu" in gloss_df.columns and "ro" in gloss_df.columns, "glossary.csv must have columns: hu, ro"
assert "hu" in test_df.columns and "ro" in test_df.columns, "test.csv must have columns: hu, ro"

src = test_df["hu"].astype(str).tolist()
ref = test_df["ro"].astype(str).tolist()


def norm_ro(s: str) -> str:
    s = str(s).lower()
    s = s.replace("ţ", "ț").replace("ş", "ș")
    return unicodedata.normalize("NFKC", s)

glossary = {}
for _, row in gloss_df.iterrows():
    hu_term = str(row["hu"])
    ro_forms = [norm_ro(x.strip()) for x in str(row["ro"]).split("|") if x.strip()]
    if ro_forms:
        glossary[hu_term] = ro_forms

print("Loaded glossary entries:", len(glossary))

hits_per_term = Counter()
misses_per_term = Counter()
term_occurrences = Counter()

for s_hu, s_ro in zip(src, ref):
    s_ro_n = norm_ro(s_ro)
    for hu_term, ro_forms in glossary.items():
        if hu_term in s_hu:
            term_occurrences[hu_term] += 1
            ok = any(f in s_ro_n for f in ro_forms)
            if ok:
                hits_per_term[hu_term] += 1
            else:
                misses_per_term[hu_term] += 1

total_mentions = sum(term_occurrences.values())
total_hits = sum(hits_per_term.values())
coverage_terms = sum(1 for t, c in term_occurrences.items() if c > 0)
coverage_examples = sum(1 for s_hu in src if any(t in s_hu for t in glossary.keys()))

print("\n=== Glossary coverage on TEST ===")
print("Test sentences:", len(src))
print("Sentences containing ≥1 glossary term:", coverage_examples)
print("Unique glossary terms that appear in test:", coverage_terms, f" / {len(glossary)}")
print("Total term mentions (all occurrences):", total_mentions)
if total_mentions > 0:
    print("Reference-side hit rate (upper bound realism):", round(100 * total_hits / total_mentions, 2), "%")
else:
    print("No glossary terms found in test set (increase glossary size or adjust extraction thresholds).")


def show_top(counter: Counter, title: str, n=20):
    print("\n" + title)
    for term, c in counter.most_common(n):
        occ = term_occurrences.get(term, 0)
        hit = hits_per_term.get(term, 0)
        miss = misses_per_term.get(term, 0)
        print(f"{term:40s}  occ={occ:4d}  hit={hit:4d}  miss={miss:4d}")

show_top(term_occurrences, "Top glossary terms in TEST (by occurrences)", n=25)
show_top(misses_per_term,  "Top MISSED glossary terms in TEST (ref doesn't contain expected RO forms)", n=25)


rows = []
for term, occ in term_occurrences.items():
    if occ < 10:
        continue
    hit = hits_per_term.get(term, 0)
    rate = hit / occ if occ else 0.0
    if rate < 0.6: 
        rows.append((term, occ, hit, 100*rate, gloss_df.loc[gloss_df["hu"] == term, "ro"].iloc[0] if (gloss_df["hu"] == term).any() else ""))

if rows:
    hard_df = pd.DataFrame(rows, columns=["hu_term", "occurrences", "hits", "hit_rate_%", "current_ro_forms"])
    hard_df = hard_df.sort_values(["occurrences", "hit_rate_%"], ascending=[False, True])
    out_hard = Path(PROJECT_ROOT) / "data" / "glossary_hard_terms.csv"
    hard_df.to_csv(out_hard, index=False)
    print("\n✅ Wrote:", out_hard)
    print("These are frequent terms where your RO variants likely need cleanup/expansion.")
    display(hard_df.head(30))
else:
    print("\nNo frequent low-hit glossary terms found (good sign).")


top_missed = [t for t, _ in misses_per_term.most_common(5)]
if top_missed:
    print("\n=== Example misses (first 3 for each of top 5 missed terms) ===")
    for term in top_missed:
        print("\nTERM:", term)
        shown = 0
        ro_forms = glossary[term]
        for s_hu, s_ro in zip(src, ref):
            if term in s_hu and not any(f in norm_ro(s_ro) for f in ro_forms):
                print("HU:", s_hu[:200])
                print("RO:", s_ro[:200])
                print("Expected one of:", ro_forms)
                print("---")
                shown += 1
                if shown >= 3:
                    break
else:
    print("\nNo misses to sample.")


Loaded glossary entries: 500

=== Glossary coverage on TEST ===
Test sentences: 30366
Sentences containing ≥1 glossary term: 21210
Unique glossary terms that appear in test: 500  / 500
Total term mentions (all occurrences): 83464
Reference-side hit rate (upper bound realism): 90.59 %

Top glossary terms in TEST (by occurrences)
cikk                                      occ=6560  hit=5737  miss= 823
tagállam                                  occ=5154  hit=4641  miss= 513
bekezdés                                  occ=4557  hit=3961  miss= 596
rendelet                                  occ=4248  hit=3761  miss= 487
Bizottság                                 occ=3270  hit=3245  miss=  25
tagállamok                                occ=3226  hit=3115  miss= 111
irányelv                                  occ=3183  hit=2839  miss= 344
melléklet                                 occ=2077  hit=1898  miss= 179
Közösség                                  occ=2024  hit=1964  miss=  60
biztosít              

Unnamed: 0,hu_term,occurrences,hits,hit_rate_%,current_ro_forms
0,Európai,1033,584,56.534366,Comunităţilor Europene|Jurnalul Oficial|Tratat...
1,Hivatal,353,153,43.342776,Biroul|Oficiului|Biroului|Oficiul|Dacă
6,Gazdasági,192,66,34.375,Comitetului Economic|Social|Comisia|Consiliulu...
3,Ezek,163,96,58.895706,Aceste
2,Azok,101,44,43.564356,Statele|Comisia|CEE|Regulamentul|Comisiei
7,Alap,64,34,53.125,Fondului|Fond|Comisia|Fondul|Comisiei
11,FEJEZET,34,15,44.117647,CAPITOLUL
9,Az Európai Közösség,29,13,44.827586,Comunitatea|Comunitatea Europeană|Acordul|Comu...
13,tagállamonként,29,17,58.62069,CEE|Comunităţii|Directivei Consiliului|Directi...
8,határozathoz,26,13,50.0,Textul|Textele



=== Example misses (first 3 for each of top 5 missed terms) ===

TERM: cikk
HU: b) a 6. cikkben felsorolt csatlakozni kívánó országokban lévő projekt a következők valamelyikére vonatkozik:
RO: (b) proiecte pe teritoriul ţărilor candidate la aderare unde este aplicabil art. 6, care prevede:
Expected one of: ['articolul', 'articolul', 'în', 'alineatul', 'comisia']
---
HU: a) - nem állítják ki többé a 70/156/EGK irányelv 10. cikke (1) bekezdésének utolsó francia bekezdésében említett bizonyítvány másolatát valamely járműtípus tekintetében,
RO: (a) - nu mai eliberează copii ale certificatului prevăzut la art. 10 alin. (1) ultima liniuţă din Directiva 70/156/CEE cu privire la un tip de vehicul,
Expected one of: ['articolul', 'articolul', 'în', 'alineatul', 'comisia']
---
HU: A véleményt a Szerződés 148. cikkének (2) bekezdésében arra az esetre megállapított többséggel kell meghozni, amikor a határozatokat a tanácsnak a Bizottság javaslata alapján kell elfogadnia.
RO: 148 alin. (2) din Trat

In [None]:
import os, gc, json, re, unicodedata
import numpy as np
import pandas as pd
import torch
import sacrebleu
from tqdm import tqdm
from pathlib import Path

from huggingface_hub import snapshot_download
from transformers import AutoTokenizer, AutoConfig, AutoModelForSeq2SeqLM
from peft import PeftModel


DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
print("DEVICE:", DEVICE)
print("Torch:", torch.__version__)

gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

HU_EN_BASE = "Helsinki-NLP/opus-mt-hu-en"
EN_RO_BASE = "Helsinki-NLP/opus-mt-en-ro"

HU_EN_RUN = Path(PROJECT_ROOT) / "checkpoints" / "hu_en_legal_lora"
EN_RO_RUN = Path(PROJECT_ROOT) / "checkpoints" / "en_ro_legal_lora"

BASE_SAFE_ROOT = Path(PROJECT_ROOT) / "safetensors_bases"
BASE_SAFE_ROOT.mkdir(parents=True, exist_ok=True)

OUT_DIR = Path(OUTPUT_DIR)
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Decode settings
NUM_BEAMS = 4          
MAX_INPUT_LEN = 256
MAX_NEW_TOKENS_NORMAL = 128
BS = 12 if DEVICE == "cuda" else 4

# Table-like safe mode
MAX_NEW_TOKENS_TABLE = 32
NUM_BEAMS_TABLE = 1

# Anti-repeat 
USE_ANTI_REPEAT = True
NO_REPEAT_NGRAM = 3
REPETITION_PENALTY = 1.10
LENGTH_PENALTY = 1.0

# Glossary forcing
USE_GLOSSARY_CONSTRAINTS = True
MAX_FORCED_TERMS_PER_SENT = 3


# Load glossary.csv  (HU -> RO variants)

GLOSS_PATH = Path(PROJECT_ROOT) / "data" / "glossary.csv"
assert GLOSS_PATH.exists(), f"Missing glossary at {GLOSS_PATH}. Create it with Option A first."

def norm_ro(s: str) -> str:
    s = str(s).lower()
    s = s.replace("ţ","ț").replace("ş","ș")
    return unicodedata.normalize("NFKC", s)

gloss_df = pd.read_csv(GLOSS_PATH)
assert "hu" in gloss_df.columns and "ro" in gloss_df.columns, "glossary.csv must have columns: hu, ro"

glossary = {}
for _, row in gloss_df.iterrows():
    hu_term = str(row["hu"])
    ro_forms = [x.strip() for x in str(row["ro"]).split("|") if x.strip()]
    if ro_forms:
        glossary[hu_term] = ro_forms

print("Loaded glossary entries:", len(glossary))

# Helpers: safetensors base + checkpoint loader
def get_latest_checkpoint(run_dir: Path) -> Path:
    ckpts = sorted(
        [p for p in run_dir.glob("checkpoint-*") if p.is_dir()],
        key=lambda p: int(p.name.split("-")[-1]) if p.name.split("-")[-1].isdigit() else -1
    )
    return ckpts[-1] if ckpts else run_dir

def ensure_base_safetensors(model_id: str, out_root: Path) -> Path:
    out_dir = out_root / model_id.replace("/", "__")
    out_dir.mkdir(parents=True, exist_ok=True)
    if (out_dir / "model.safetensors").exists() and (out_dir / "config.json").exists():
        return out_dir

    print(f"[Convert] Creating base safetensors for {model_id}")
    snap_dir = Path(snapshot_download(
        repo_id=model_id,
        allow_patterns=[
            "config.json",
            "tokenizer.json",
            "tokenizer_config.json",
            "source.spm",
            "vocab.json", "merges.txt",
            "pytorch_model.bin",
            "pytorch_model.bin.index.json",
            "pytorch_model-*.bin",
        ],
    ))

    config = AutoConfig.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_config(config)

    bin_path = snap_dir / "pytorch_model.bin"
    if bin_path.exists():
        state = torch.load(bin_path, map_location="cpu")
        model.load_state_dict(state, strict=False)
        del state
    else:
        index_path = snap_dir / "pytorch_model.bin.index.json"
        with open(index_path, "r", encoding="utf-8") as f:
            index = json.load(f)
        shard_files = sorted(set(index["weight_map"].values()))
        for sf in tqdm(shard_files, desc=f"[Convert] shards {model_id.split('/')[-1]}"):
            shard_state = torch.load(snap_dir / sf, map_location="cpu")
            model.load_state_dict(shard_state, strict=False)
            del shard_state

    model.save_pretrained(out_dir, safe_serialization=True)
    config.save_pretrained(out_dir)
    return out_dir

def load_lora_model(base_id: str, run_dir: Path):
    base_safe = ensure_base_safetensors(base_id, BASE_SAFE_ROOT)
    ckpt = get_latest_checkpoint(run_dir)
    print(f"Loading {base_id} + LoRA from:", ckpt)

    tok = AutoTokenizer.from_pretrained(base_id, use_fast=True)
    base = AutoModelForSeq2SeqLM.from_pretrained(
        base_safe,
        use_safetensors=True,
        torch_dtype=torch.float16 if DEVICE == "cuda" else None
    ).to(DEVICE)

    model = PeftModel.from_pretrained(base, ckpt).to(DEVICE)
    model.eval()
    return model, tok, ckpt

# Generation helpers (table detection + glossary forcing)
_punct = set(list("0123456789.,;:-/()[]{}<>|_+*=—–%°\"' \t"))
def is_table_like(s: str) -> bool:
    if s is None:
        return True
    t = str(s).strip()
    if len(t) == 0:
        return True
    if "_BAR_" in t or t.count("|") >= 2:
        return True
    good = sum(ch in _punct for ch in t)
    if good / max(1, len(t)) > 0.80:
        return True
    if re.search(r"[-—–]{6,}", t):
        return True
    return False

def build_force_words_ids_for_sentence(tok, hu_src: str):
    
    if not USE_GLOSSARY_CONSTRAINTS:
        return None
    hu_s = str(hu_src)
    forced = []
    for hu_term, ro_forms in glossary.items():
        if hu_term in hu_s and ro_forms:
            ro_form = ro_forms[0]
            ids = tok(ro_form, add_special_tokens=False).input_ids
            if ids:
                forced.append(ids)
        if len(forced) >= MAX_FORCED_TERMS_PER_SENT:
            break
    return forced if forced else None

def batched_generate_adaptive(
    model, tok,
    inputs,                     
    desc: str,
    bs: int,
    max_input_len: int,
    num_beams: int,
    max_new_tokens: int,
    num_beams_table: int,
    max_new_tokens_table: int,
    anti_repeat: bool = False,
    per_sentence_force_words_ids=None,   
):
    outs = [None] * len(inputs)
    idx_table = [i for i, s in enumerate(inputs) if is_table_like(s)]
    idx_norm  = [i for i in range(len(inputs)) if i not in set(idx_table)]

    nr = NO_REPEAT_NGRAM if anti_repeat else 0
    rp = REPETITION_PENALTY if anti_repeat else 1.0

    def _run(indices, beams, max_new, tag):
        if not indices:
            return
        i = 0
        cur_bs = bs
        pbar = tqdm(total=len(indices), desc=f"{desc} [{tag}]", unit="sent")
        while i < len(indices):
            take = min(cur_bs, len(indices) - i)
            batch_idx = indices[i:i+take]
            batch_txt = [inputs[j] for j in batch_idx]
            try:

                if per_sentence_force_words_ids is not None and tag == "normal":
                    any_forced = any(per_sentence_force_words_ids[j] for j in batch_idx)
                    if any_forced:
                        decoded = []
                        for j, txt in zip(batch_idx, batch_txt):
                            one_inp = tok(txt, return_tensors="pt", truncation=True, max_length=max_input_len).to(DEVICE)
                            kwargs = dict(
                                num_beams=beams,
                                max_new_tokens=max_new,
                                do_sample=False,
                                length_penalty=LENGTH_PENALTY,
                                no_repeat_ngram_size=nr,
                                repetition_penalty=rp,
                            )
                            fw = per_sentence_force_words_ids[j]
                            if fw:
                                kwargs["force_words_ids"] = fw
                            with torch.inference_mode():
                                one_gen = model.generate(**one_inp, **kwargs)
                            decoded.append(tok.batch_decode(one_gen, skip_special_tokens=True)[0])
                        for j, d in zip(batch_idx, decoded):
                            outs[j] = d
                        i += take
                        pbar.update(take)
                        continue

                inp = tok(batch_txt, return_tensors="pt", padding=True, truncation=True, max_length=max_input_len).to(DEVICE)
                with torch.inference_mode():
                    gen = model.generate(
                        **inp,
                        num_beams=beams,
                        max_new_tokens=max_new,
                        do_sample=False,
                        length_penalty=LENGTH_PENALTY if tag == "normal" else 1.0,
                        no_repeat_ngram_size=nr if tag == "normal" else 0,
                        repetition_penalty=rp if tag == "normal" else 1.0,
                    )
                dec = tok.batch_decode(gen, skip_special_tokens=True)
                for j, d in zip(batch_idx, dec):
                    outs[j] = d
                i += take
                pbar.update(take)

            except torch.cuda.OutOfMemoryError:
                torch.cuda.empty_cache()
                cur_bs = max(1, cur_bs // 2)
                print(f"⚠ OOM -> reducing batch size to {cur_bs}")

        pbar.close()

    _run(idx_norm,  num_beams,       max_new_tokens,       "normal")
    _run(idx_table, num_beams_table, max_new_tokens_table, "table")

    assert all(o is not None for o in outs)
    return outs

# Load both LoRA models

hu_en_model, hu_en_tok, hu_en_ckpt = load_lora_model(HU_EN_BASE, HU_EN_RUN)
en_ro_model, en_ro_tok, en_ro_ckpt = load_lora_model(EN_RO_BASE, EN_RO_RUN)


# Load test data

test_df = pd.read_csv(Path(DATA_PROCESSED) / "test.csv")
src_hu = test_df["hu"].astype(str).tolist()
ref_ro = test_df["ro"].astype(str).tolist()
print("Test size:", len(src_hu))


# Run pivot prediction with glossary forcing in EN→RO

# HU→EN stage: no glossary forcing (not needed)
en_mid = batched_generate_adaptive(
    hu_en_model, hu_en_tok,
    inputs=src_hu,
    desc=f"HU→EN beam{NUM_BEAMS}",
    bs=BS,
    max_input_len=MAX_INPUT_LEN,
    num_beams=NUM_BEAMS,
    max_new_tokens=MAX_NEW_TOKENS_NORMAL,
    num_beams_table=NUM_BEAMS_TABLE,
    max_new_tokens_table=MAX_NEW_TOKENS_TABLE,
    anti_repeat=USE_ANTI_REPEAT,
    per_sentence_force_words_ids=None
)

# Build constraints from HU source, but apply them during EN→RO
force_ids = None
if USE_GLOSSARY_CONSTRAINTS:
    print("Building per-sentence force_words_ids from glossary (conservative)...")
    force_ids = [build_force_words_ids_for_sentence(en_ro_tok, hu) for hu in tqdm(src_hu, desc="force_ids")]

hyps = batched_generate_adaptive(
    en_ro_model, en_ro_tok,
    inputs=en_mid,
    desc=f"EN→RO beam{NUM_BEAMS} + glossary",
    bs=BS,
    max_input_len=MAX_INPUT_LEN,
    num_beams=NUM_BEAMS,
    max_new_tokens=MAX_NEW_TOKENS_NORMAL,
    num_beams_table=NUM_BEAMS_TABLE,
    max_new_tokens_table=MAX_NEW_TOKENS_TABLE,
    anti_repeat=USE_ANTI_REPEAT,
    per_sentence_force_words_ids=force_ids
)

# Metrics (BLEU + leaks + glossary accuracy)

bleu = sacrebleu.corpus_bleu(hyps, [ref_ro]).score
print("\nPivot BLEU:", round(bleu, 2))

hu_diacritics = set("áéíóöőúüűÁÉÍÓÖŐÚÜŰ")
def has_hu_diacritics(s: str) -> bool:
    return any(c in hu_diacritics for c in str(s))

def en_leak(s: str) -> bool:
    s = " " + re.sub(r"\s+", " ", str(s).lower()) + " "
    return any(w in s for w in [" the ", " and ", " of ", " to ", " for ", " with ", " on "])

hu_leak = np.mean([has_hu_diacritics(h) for h in hyps]) * 100
en_leak_rate = np.mean([en_leak(h) for h in hyps]) * 100
print("HU diacritics leak %:", round(hu_leak, 2))
print("English leak proxy %:", round(en_leak_rate, 2))

# Glossary accuracy: among occurrences where HU term appears, did output contain any RO variant?
gloss_norm = {k: [norm_ro(v) for v in vs] for k, vs in glossary.items()}

def glossary_hit(src: str, hyp: str):
    hyp_n = norm_ro(hyp)
    checks = []
    for hu_term, ro_forms in gloss_norm.items():
        if hu_term in str(src):
            checks.append(any(f in hyp_n for f in ro_forms))
    return checks

all_hits = [glossary_hit(s, h) for s, h in zip(src_hu, hyps)]
flat = [x for row in all_hits for x in row]
if flat:
    print("Glossary accuracy %:", round(np.mean(flat) * 100, 2))
else:
    print("Glossary accuracy: no glossary terms found in test.")


# Save outputs

out_path = OUT_DIR / f"pivot_glossary_beam{NUM_BEAMS}_{hu_en_ckpt.name}__{en_ro_ckpt.name}.csv"
pd.DataFrame({
    "source_hu": src_hu,
    "pivot_en": en_mid,
    "reference_ro": ref_ro,
    "hypothesis_ro": hyps
}).to_csv(out_path, index=False)

print("Saved:", out_path)
print("Done.")


'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: f59407a4-4ac4-4afe-98d8-767cbbaba5c5)')' thrown while requesting HEAD https://huggingface.co/Helsinki-NLP/opus-mt-hu-en/resolve/main/tokenizer_config.json
Retrying in 1s [Retry 1/5].


DEVICE: cuda
Torch: 2.5.1+cu121
Loaded glossary entries: 500
Loading Helsinki-NLP/opus-mt-hu-en + LoRA from: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\checkpoints\hu_en_legal_lora\checkpoint-6250




Loading Helsinki-NLP/opus-mt-en-ro + LoRA from: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\checkpoints\en_ro_legal_lora\checkpoint-6250
Test size: 30366


HU→EN beam4 [normal]: 100%|██████████| 29820/29820 [1:20:21<00:00,  6.18sent/s]
HU→EN beam4 [table]: 100%|██████████| 546/546 [00:12<00:00, 45.03sent/s]


Building per-sentence force_words_ids from glossary (conservative)...


force_ids: 100%|██████████| 30366/30366 [00:05<00:00, 5524.10it/s]
EN→RO beam4 + glossary [normal]:   0%|          | 0/30200 [00:00<?, ?sent/s]Constrained Beam Search was moved to a `custom_generate` repo: https://hf.co/transformers-community/constrained-beam-search. To prevent loss of backward compatibility, add `custom_generate='transformers-community/constrained-beam-search'` to your `generate` call before v4.62.0.


ValueError: Constrained Beam Search requires `trust_remote_code=True` in your `generate` call, since it loads https://hf.co/transformers-community/constrained-beam-search.

In [None]:


import os, gc, json, re, unicodedata
import numpy as np
import pandas as pd
import torch
import sacrebleu
from tqdm import tqdm
from pathlib import Path

from huggingface_hub import snapshot_download
from transformers import AutoTokenizer, AutoConfig, AutoModelForSeq2SeqLM
from peft import PeftModel


DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
print("DEVICE:", DEVICE)
print("Torch:", torch.__version__)

gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()


HU_EN_BASE = "Helsinki-NLP/opus-mt-hu-en"
EN_RO_BASE = "Helsinki-NLP/opus-mt-en-ro"

HU_EN_RUN = Path(PROJECT_ROOT) / "checkpoints" / "hu_en_legal_lora"
EN_RO_RUN = Path(PROJECT_ROOT) / "checkpoints" / "en_ro_legal_lora"


BASE_SAFE_ROOT = Path(PROJECT_ROOT) / "safetensors_bases"
BASE_SAFE_ROOT.mkdir(parents=True, exist_ok=True)

OUT_DIR = Path(OUTPUT_DIR)
OUT_DIR.mkdir(parents=True, exist_ok=True)


NUM_BEAMS = 4
MAX_INPUT_LEN = 256
MAX_NEW = 128
BS = 8 if DEVICE == "cuda" else 4

# table-ish safe mode
MAX_NEW_TABLE = 32
BEAMS_TABLE = 1

# anti-repeat
USE_ANTI_REPEAT = True
NO_REPEAT_NGRAM = 3
REPETITION_PENALTY = 1.10
LENGTH_PENALTY = 1.0

# glossary forcing
USE_GLOSSARY = True
MAX_FORCED_TERMS_PER_SENT = 2  # conservative; constrained decoding can be slower

# resumable files (beam-specific)
RESUME_PATH = OUT_DIR / f"pivot_glossary_RESUMABLE_beam{NUM_BEAMS}.csv"
PIVOT_CACHE_PATH = None  # will set after we know checkpoints


GLOSS_PATH = Path(PROJECT_ROOT) / "data" / "glossary.csv"
assert GLOSS_PATH.exists(), f"Missing glossary at {GLOSS_PATH}. Run Option A first."
gdf = pd.read_csv(GLOSS_PATH)
assert {"hu","ro"}.issubset(gdf.columns), "glossary.csv must have columns hu, ro"

glossary = {}
for _, row in gdf.iterrows():
    hu_term = str(row["hu"])
    ro_forms = [x.strip() for x in str(row["ro"]).split("|") if x.strip()]
    if ro_forms:
        glossary[hu_term] = ro_forms

print("Glossary entries:", len(glossary))


def load_tokenizer_local_first(model_id: str):
    try:
        return AutoTokenizer.from_pretrained(model_id, use_fast=True, local_files_only=True)
    except Exception:
        return AutoTokenizer.from_pretrained(model_id, use_fast=True)


def get_latest_checkpoint(run_dir: Path) -> Path:
    ckpts = sorted(
        [p for p in run_dir.glob("checkpoint-*") if p.is_dir()],
        key=lambda p: int(p.name.split("-")[-1]) if p.name.split("-")[-1].isdigit() else -1
    )
    return ckpts[-1] if ckpts else run_dir

def ensure_base_safetensors(model_id: str, out_root: Path) -> Path:
    out_dir = out_root / model_id.replace("/", "__")
    out_dir.mkdir(parents=True, exist_ok=True)
    if (out_dir / "model.safetensors").exists() and (out_dir / "config.json").exists():
        return out_dir

    print(f"[Convert] Base -> safetensors for {model_id}")
    snap_dir = Path(snapshot_download(
        repo_id=model_id,
        allow_patterns=[
            "config.json","tokenizer.json","tokenizer_config.json","source.spm",
            "vocab.json","merges.txt",
            "pytorch_model.bin","pytorch_model.bin.index.json","pytorch_model-*.bin",
        ],
    ))

    config = AutoConfig.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_config(config)

    bin_path = snap_dir / "pytorch_model.bin"
    if bin_path.exists():
        state = torch.load(bin_path, map_location="cpu")
        model.load_state_dict(state, strict=False)
        del state
    else:
        with open(snap_dir / "pytorch_model.bin.index.json", "r", encoding="utf-8") as f:
            index = json.load(f)
        shard_files = sorted(set(index["weight_map"].values()))
        for sf in tqdm(shard_files, desc=f"[Convert] shards {model_id.split('/')[-1]}"):
            shard_state = torch.load(snap_dir / sf, map_location="cpu")
            model.load_state_dict(shard_state, strict=False)
            del shard_state

    model.save_pretrained(out_dir, safe_serialization=True)
    config.save_pretrained(out_dir)
    return out_dir

def load_lora_model(base_id: str, run_dir: Path):
    base_safe = ensure_base_safetensors(base_id, BASE_SAFE_ROOT)
    ckpt = get_latest_checkpoint(run_dir)
    print(f"Loading {base_id} + LoRA from:", ckpt)

    tok = load_tokenizer_local_first(base_id)
    base = AutoModelForSeq2SeqLM.from_pretrained(
        base_safe,
        use_safetensors=True,
        torch_dtype=torch.float16 if DEVICE=="cuda" else None,
    ).to(DEVICE)

    model = PeftModel.from_pretrained(base, ckpt).to(DEVICE)
    model.eval()
    return model, tok, ckpt

hu_en_model, hu_en_tok, hu_en_ckpt = load_lora_model(HU_EN_BASE, HU_EN_RUN)
en_ro_model, en_ro_tok, en_ro_ckpt = load_lora_model(EN_RO_BASE, EN_RO_RUN)


PIVOT_CACHE_PATH = OUT_DIR / f"pivot_en_cache_{hu_en_ckpt.name}_beam{NUM_BEAMS}.csv"


test_df = pd.read_csv(Path(DATA_PROCESSED) / "test.csv")
src_hu = test_df["hu"].astype(str).tolist()
ref_ro = test_df["ro"].astype(str).tolist()
N = len(src_hu)
print("Test size:", N)


_punct = set(list("0123456789.,;:-/()[]{}<>|_+*=—–%°\"' \t"))
def is_table_like(s: str) -> bool:
    t = str(s).strip()
    if not t:
        return True
    if "_BAR_" in t or t.count("|") >= 2:
        return True
    good = sum(ch in _punct for ch in t)
    if good / max(1, len(t)) > 0.80:
        return True
    if re.search(r"[-—–]{6,}", t):
        return True
    return False


def build_force_words_ids(tok, hu_src: str):
    if not USE_GLOSSARY:
        return None
    hu_s = str(hu_src)
    forced = []
    for hu_term, ro_forms in glossary.items():
        if hu_term in hu_s and ro_forms:
            ids = tok(ro_forms[0], add_special_tokens=False).input_ids
            if ids:
                forced.append(ids)
        if len(forced) >= MAX_FORCED_TERMS_PER_SENT:
            break
    return forced if forced else None


def gen_batch(model, tok, batch_txt, beams, max_new, anti_repeat=False, force_words_ids=None):
    nr = NO_REPEAT_NGRAM if anti_repeat else 0
    rp = REPETITION_PENALTY if anti_repeat else 1.0

    inp = tok(
        batch_txt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=MAX_INPUT_LEN
    ).to(DEVICE)

    kwargs = dict(
        num_beams=beams,
        max_new_tokens=max_new,
        do_sample=False,
        length_penalty=LENGTH_PENALTY,
        no_repeat_ngram_size=nr,
        repetition_penalty=rp,
    )

    
    if force_words_ids is not None:
        kwargs["force_words_ids"] = force_words_ids
        kwargs["trust_remote_code"] = True
        kwargs["custom_generate"] = "transformers-community/constrained-beam-search"

    with torch.inference_mode():
        out = model.generate(**inp, **kwargs)
    return tok.batch_decode(out, skip_special_tokens=True)

def append_rows(path: Path, rows: list):
    df = pd.DataFrame(rows)
    header = not path.exists()
    df.to_csv(path, mode="a", index=False, header=header)


if RESUME_PATH.exists():
    done_df = pd.read_csv(RESUME_PATH)
    done = set(done_df["idx"].astype(int).tolist())
    print(f"Resuming EN→RO: found {len(done)} completed rows in {RESUME_PATH}")
else:
    done = set()
    print("Starting fresh EN→RO (no resume file).")

pivot_en = [None] * N

if PIVOT_CACHE_PATH.exists():
    piv = pd.read_csv(PIVOT_CACHE_PATH)
    for _, r in piv.iterrows():
        pivot_en[int(r["idx"])] = r["pivot_en"]
    missing = [i for i, v in enumerate(pivot_en) if v is None]
    print(f"Pivot cache found: {len(piv)} rows. Missing pivot:", len(missing))
else:
    missing = list(range(N))
    print("No pivot cache; will compute all pivot EN.")

if missing:
    for start in tqdm(range(0, len(missing), BS), desc="HU→EN cache", unit="batch"):
        batch_idx = missing[start:start+BS]
        batch_hu = [src_hu[i] for i in batch_idx]

        outs = []
        for x in batch_hu:
            if is_table_like(x):
                outs.append(gen_batch(hu_en_model, hu_en_tok, [x], BEAMS_TABLE, MAX_NEW_TABLE, anti_repeat=False)[0])
            else:
                outs.append(gen_batch(hu_en_model, hu_en_tok, [x], NUM_BEAMS, MAX_NEW, anti_repeat=USE_ANTI_REPEAT)[0])

        for i, out in zip(batch_idx, outs):
            pivot_en[i] = out

        append_rows(PIVOT_CACHE_PATH, [{"idx": i, "pivot_en": pivot_en[i]} for i in batch_idx])

print("Pivot EN ready. Cache:", PIVOT_CACHE_PATH)


for start in tqdm(range(0, N, BS), desc="EN→RO resumable", unit="batch"):
    batch_idx = list(range(start, min(N, start+BS)))

    if all(i in done for i in batch_idx):
        continue

    rows_to_write = []
    for i in batch_idx:
        if i in done:
            continue

        hu_in = src_hu[i]
        en_in = pivot_en[i]

        
        if is_table_like(hu_in) or is_table_like(en_in):
            beams = BEAMS_TABLE
            max_new = MAX_NEW_TABLE
            anti = False
            fw = None
        else:
            beams = NUM_BEAMS
            max_new = MAX_NEW
            anti = USE_ANTI_REPEAT
            fw = build_force_words_ids(en_ro_tok, hu_in)

        
        try:
            ro_out = gen_batch(
                en_ro_model, en_ro_tok,
                [en_in],
                beams=beams,
                max_new=max_new,
                anti_repeat=anti,
                force_words_ids=fw
            )[0]
        except torch.cuda.OutOfMemoryError:
            torch.cuda.empty_cache()
            ro_out = gen_batch(en_ro_model, en_ro_tok, [en_in], beams=1, max_new=MAX_NEW_TABLE, anti_repeat=False, force_words_ids=None)[0]

        rows_to_write.append({
            "idx": i,
            "source_hu": hu_in,
            "pivot_en": en_in,
            "reference_ro": ref_ro[i],
            "hypothesis_ro": ro_out
        })

    if rows_to_write:
        append_rows(RESUME_PATH, rows_to_write)
        for r in rows_to_write:
            done.add(int(r["idx"]))

print("Finished EN→RO. Final file:", RESUME_PATH)


final_df = pd.read_csv(RESUME_PATH).sort_values("idx")
hyps = final_df["hypothesis_ro"].astype(str).tolist()
refs = final_df["reference_ro"].astype(str).tolist()
srcs = final_df["source_hu"].astype(str).tolist()

bleu = sacrebleu.corpus_bleu(hyps, [refs]).score
print("\nFinal BLEU:", round(bleu, 2))

hu_diac = set("áéíóöőúüűÁÉÍÓÖŐÚÜŰ")
def has_hu_diacritics(s: str) -> bool:
    return any(c in hu_diac for c in str(s))

def en_leak(s: str) -> bool:
    s = " " + re.sub(r"\s+", " ", str(s).lower()) + " "
    return any(w in s for w in [" the ", " and ", " of ", " to ", " for ", " with ", " on "])

hu_leak = np.mean([has_hu_diacritics(h) for h in hyps]) * 100
en_leak_rate = np.mean([en_leak(h) for h in hyps]) * 100
print("HU diacritics leak %:", round(hu_leak, 2))
print("English leak proxy %:", round(en_leak_rate, 2))

def norm_ro(s: str) -> str:
    s = str(s).lower().replace("ţ","ț").replace("ş","ș")
    return unicodedata.normalize("NFKC", s)

gloss_norm = {k: [norm_ro(v) for v in vs] for k, vs in glossary.items()}
checks = []
for s_hu, hyp in zip(srcs, hyps):
    hyp_n = norm_ro(hyp)
    for hu_term, ro_forms in gloss_norm.items():
        if hu_term in s_hu:
            checks.append(any(f in hyp_n for f in ro_forms))

if checks:
    print("Glossary accuracy %:", round(np.mean(checks) * 100, 2))
else:
    print("Glossary accuracy: no glossary terms found in test.")


EN→RO beam4 + glossary [normal]:   0%|          | 0/30200 [24:30<?, ?sent/s]

DEVICE: cuda
Torch: 2.5.1+cu121
Glossary entries: 500
Loading Helsinki-NLP/opus-mt-hu-en + LoRA from: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\checkpoints\hu_en_legal_lora\checkpoint-6250





Loading Helsinki-NLP/opus-mt-en-ro + LoRA from: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\checkpoints\en_ro_legal_lora\checkpoint-6250
Test size: 30366
Starting fresh EN→RO (no resume file).
Pivot cache found: 168 rows. Missing pivot: 30198


HU→EN cache: 100%|██████████| 3775/3775 [3:38:34<00:00,  3.47s/batch]  


Pivot EN ready. Cache: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\data\outputs\pivot_en_cache_checkpoint-6250_beam4.csv


EN→RO resumable:   0%|          | 0/3796 [00:00<?, ?batch/s]'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: 70763103-6cdc-4b62-bb12-13ca1b1fcc60)')' thrown while requesting HEAD https://huggingface.co/transformers-community/constrained-beam-search/resolve/main/custom_generate/generate.py
Retrying in 1s [Retry 1/5].
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
EN→RO resumable:   0%|          | 0/3796 [00:02<?, ?batch/s]


OSError: `transformers-community/constrained-beam-search` does not contain a `custom_generate` subdirectory with a `generate.py` file, can't load the custom generate function.

In [2]:
!pip install -U unbabel-comet


Collecting unbabel-comet
  Downloading unbabel_comet-2.2.7-py3-none-any.whl.metadata (19 kB)
Collecting entmax<2.0,>=1.1 (from unbabel-comet)
  Downloading entmax-1.3-py3-none-any.whl.metadata (348 bytes)
Collecting jsonargparse==3.13.1 (from unbabel-comet)
  Downloading jsonargparse-3.13.1-py3-none-any.whl.metadata (55 kB)
     ---------------------------------------- 0.0/55.5 kB ? eta -:--:--
     ---------------------------------------- 55.5/55.5 kB 1.4 MB/s eta 0:00:00
Collecting protobuf<5.0.0,>=4.24.4 (from unbabel-comet)
  Downloading protobuf-4.25.8-cp310-abi3-win_amd64.whl.metadata (541 bytes)
Collecting pytorch-lightning<3.0.0,>=2.0.0 (from unbabel-comet)
  Downloading pytorch_lightning-2.6.0-py3-none-any.whl.metadata (21 kB)
Collecting scipy<2.0.0,>=1.5.4 (from unbabel-comet)
  Downloading scipy-1.15.3-cp310-cp310-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.8 kB ? eta -:--:--
     ---------------------------------------- 60.8/60.8 kB 3

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
onnxruntime 1.16.2 requires flatbuffers, which is not installed.
rembg 2.0.52 requires scikit-image, which is not installed.


In [5]:
# ============================================================
# Compute COMET score for a saved prediction CSV
# ============================================================

import pandas as pd
import torch
from comet import download_model, load_from_checkpoint
from tqdm import tqdm

# -------------------------
# 1) Path to result CSV
# -------------------------
CSV_PATH = OUTPUT_DIR / "pivot_predictions_checkpoint-6250__checkpoint-6250_beam1.csv"
# change filename if needed

df = pd.read_csv(CSV_PATH)
print("Loaded:", CSV_PATH)
print("Rows:", len(df))

# -------------------------
# 2) Detect columns robustly
# -------------------------
# Expected columns (any of these naming variants)
SRC_COLS = ["source", "source_hu", "hu"]
REF_COLS = ["reference", "reference_ro", "ro"]
HYP_COLS = ["hypothesis", "hypothesis_ro"]

def find_col(candidates):
    for c in candidates:
        if c in df.columns:
            return c
    raise ValueError(f"None of {candidates} found in CSV columns: {df.columns.tolist()}")

src_col = find_col(SRC_COLS)
ref_col = find_col(REF_COLS)
hyp_col = find_col(HYP_COLS)

print(f"Using columns → src: {src_col}, ref: {ref_col}, hyp: {hyp_col}")

srcs = df[src_col].astype(str).tolist()
refs = df[ref_col].astype(str).tolist()
hyps = df[hyp_col].astype(str).tolist()

# -------------------------
# 3) Load COMET model
# -------------------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("DEVICE:", DEVICE)

comet_model_path = download_model("Unbabel/wmt22-comet-da")
comet_model = load_from_checkpoint(comet_model_path)
comet_model.to(DEVICE)
comet_model.eval()

# -------------------------
# 4) Prepare data
# -------------------------
data = [
    {"src": s, "mt": h, "ref": r}
    for s, h, r in zip(srcs, hyps, refs)
]

# -------------------------
# 5) Compute COMET
# -------------------------
with torch.no_grad():
    scores = comet_model.predict(
        data,
        batch_size=8 if DEVICE == "cuda" else 2,
        gpus=1 if DEVICE == "cuda" else 0,
        progress_bar=True
    )

comet_score = scores["system_score"]
print("\nCOMET score:", round(comet_score, 4))


Loaded: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\data\outputs\pivot_predictions_checkpoint-6250__checkpoint-6250_beam1.csv
Rows: 30366
Using columns → src: source_hu, ref: reference_ro, hyp: hypothesis_ro
DEVICE: cuda


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 15.71it/s]
c:\Users\RoG\anaconda3\envs\pythonRL\lib\site-packages\lightning_fabric\utilities\cloud_io.py:73: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explic


COMET score: 0.8797
