In [14]:
!pip install -U bitsandbytes



In [1]:
# Runs the project's download script in a subprocess; per the log below it extracts the hu-ro archive into data/raw.
!python scripts/download_data.py

2026-01-12 21:41:46,849 - INFO - Zip file already exists.
2026-01-12 21:41:46,850 - INFO - Extracting...
2026-01-12 21:41:47,280 - INFO - Data successfully ready in D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\data\raw
2026-01-12 21:41:47,281 - INFO - Files in data folder: ['hu-ro.txt.zip', 'JRC-Acquis.hu-ro.hu', 'JRC-Acquis.hu-ro.ro', 'JRC-Acquis.hu-ro.xml', 'LICENSE', 'README']


In [2]:
# Runs the project's split script in a subprocess; per the log below it cleans the corpus and writes the splits to data/processed.
!python scripts/prepare_splits.py

2026-01-12 21:42:55,001 - INFO - Loading JRC-Acquis.hu-ro.hu and JRC-Acquis.hu-ro.ro...
2026-01-12 21:42:56,767 - INFO - Original: 417178 | Cleaned: 310067 | Removed: 107111
2026-01-12 21:42:56,802 - INFO - Split Sizes -> Train: 248053, Val: 31007, Test: 31007
2026-01-12 21:42:59,194 - INFO - SUCCESS! Splits saved to D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\data\processed


In [9]:
# Runs the project's baseline evaluation script in a subprocess; per the log below it reads test.csv and loads facebook/nllb-200-distilled-600M.
!python scripts/evaluate_baseline.py

2026-01-12 21:47:32.958678: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-01-12 21:47:36.732384: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
Loading test data from D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\data\processed\test.csv...
Loading model facebook/nllb-200-distilled-600M on cuda...
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/e

## Downloading JRC-Acquis

In [None]:
import os
from pathlib import Path

# Every data location is derived from the directory the kernel is currently running in.
PROJECT_ROOT = Path(".").resolve()
_data_root = PROJECT_ROOT / "data"
DATA_RAW, DATA_PROCESSED, OUTPUT_DIR = (
    _data_root / _sub for _sub in ("raw", "processed", "outputs")
)

# Only the outputs folder is created here; the raw/processed folders are created by the
# cells that fill them.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

for _label, _location in (
    ("PROJECT_ROOT", PROJECT_ROOT),
    ("DATA_RAW", DATA_RAW),
    ("DATA_PROCESSED", DATA_PROCESSED),
    ("OUTPUT_DIR", OUTPUT_DIR),
):
    print(f"{_label}:", _location)


PROJECT_ROOT: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal
DATA_RAW: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\data\raw
DATA_PROCESSED: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\data\processed
OUTPUT_DIR: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\data\outputs


In [None]:
import requests, zipfile, logging
logging.basicConfig(level=logging.INFO)

OPUS_DIRECT_URL = "https://object.pouta.csc.fi/OPUS-JRC-Acquis/v3.0/moses/hu-ro.txt.zip"
zip_path = DATA_RAW / "hu-ro.txt.zip"
DATA_RAW.mkdir(parents=True, exist_ok=True)

# Download the zip if not already present
if not zip_path.exists():
    logging.info(f"Downloading {OPUS_DIRECT_URL}...")
    # Stream into a ".part" file and rename it only once the transfer has finished, so an
    # interrupted download is never mistaken for a cached archive on the next run.
    part_path = zip_path.with_name(zip_path.name + ".part")
    with requests.get(
        OPUS_DIRECT_URL,
        stream=True,
        headers={"User-Agent": "Mozilla/5.0"},
        timeout=(10, 60),  # (connect, read) seconds; without it a stalled server hangs the cell
    ) as r:
        r.raise_for_status()
        with open(part_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    part_path.replace(zip_path)
    logging.info("Download complete.")
else:
    logging.info("Zip already exists.")

# Explicit check rather than `assert`, which is stripped when Python runs with -O.
if not zipfile.is_zipfile(zip_path):
    raise zipfile.BadZipFile("Downloaded file is not a valid zip.")

with zipfile.ZipFile(zip_path, 'r') as z:
    z.extractall(DATA_RAW)

print("Extracted files:", [p.name for p in DATA_RAW.glob("*")][:10])


INFO:root:Zip already exists.


Extracted files: ['hu-ro.txt.zip', 'JRC-Acquis.hu-ro.hu', 'JRC-Acquis.hu-ro.ro', 'JRC-Acquis.hu-ro.xml', 'LICENSE', 'README']


## Preparing splits

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re

# Load raw parallel data
def load_parallel_from_raw(raw_dir: Path):
    """Load the line-aligned Hungarian/Romanian corpus stored in ``raw_dir``.

    The first ``*.hu`` and the first ``*.ro`` file (in sorted name order) are read as
    UTF-8, one segment per line, with surrounding whitespace stripped from every line.

    Args:
        raw_dir: Directory holding the ``*.hu`` and ``*.ro`` files.

    Returns:
        A ``pandas.DataFrame`` with columns ``"hu"`` and ``"ro"``; row *i* holds line *i*
        of both files.

    Raises:
        FileNotFoundError: If ``raw_dir`` has no ``*.hu`` or no ``*.ro`` file.
        ValueError: If the two files have a different number of lines.
    """
    def _read_side(pattern):
        # sorted() makes the pick deterministic if several files match the pattern.
        matches = sorted(raw_dir.glob(pattern))
        if not matches:
            raise FileNotFoundError(f"No '{pattern}' file found in {raw_dir}")
        with open(matches[0], "r", encoding="utf-8") as f:
            return [line.strip() for line in f]

    hu = _read_side("*.hu")
    ro = _read_side("*.ro")
    if len(hu) != len(ro):
        raise ValueError(
            f"Parallel files are misaligned: {len(hu)} hu lines vs {len(ro)} ro lines"
        )
    return pd.DataFrame({"hu": hu, "ro": ro})

df = load_parallel_from_raw(DATA_RAW)
print("Raw pairs:", len(df))

# Basic cleaning: drop exact duplicate pairs, then drop every pair in which either side
# is empty once whitespace has been stripped.
df = df.drop_duplicates()
_both_sides_present = df["hu"].str.strip().astype(bool) & df["ro"].str.strip().astype(bool)
df = df[_both_sides_present]

# Length ratio filter 
def len_ratio_ok(s, t, min_ratio=0.5, max_ratio=2.0):
    """Check that the character-length ratio ``len(s) / len(t)`` is within bounds.

    A length of 0 is counted as 1 so the ratio is always defined; both bounds are
    inclusive.
    """
    src_len = len(s) or 1
    tgt_len = len(t) or 1
    return min_ratio <= src_len / tgt_len <= max_ratio

# Keep only the pairs whose length ratio passes len_ratio_ok with its default bounds.
mask = [len_ratio_ok(hu_text, ro_text) for hu_text, ro_text in zip(df["hu"], df["ro"])]
df = df[mask]

print("After cleaning:", len(df))

# Split 80/10/10: hold out 20% first, then cut that hold-out in half.
train_df, test_val_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(test_val_df, test_size=0.5, random_state=42)

DATA_PROCESSED.mkdir(parents=True, exist_ok=True)
_split_paths = {name: DATA_PROCESSED / f"{name}.csv" for name in ("train", "val", "test")}
train_df.to_csv(_split_paths["train"], index=False)
val_df.to_csv(_split_paths["val"], index=False)
test_df.to_csv(_split_paths["test"], index=False)

print("Saved:", *_split_paths.values())
print("Sizes:", len(train_df), len(val_df), len(test_df))


Raw pairs: 417178
After cleaning: 303659
Saved: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\data\processed\train.csv D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\data\processed\val.csv D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\data\processed\test.csv
Sizes: 242927 30366 30366


## Re-running baseline evaluation

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import sacrebleu
from tqdm import tqdm

OSError: [WinError 1114] A dynamic link library (DLL) initialization routine failed. Error loading "c:\Users\RoG\anaconda3\envs\licenta\lib\site-packages\torch\lib\c10.dll" or one of its dependencies.

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import sacrebleu
from tqdm import tqdm

MODEL_NAME = "facebook/nllb-200-distilled-600M"
SRC_LANG = "hun_Latn"
TGT_LANG = "ron_Latn"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load test data (only the first 200 rows are used for the baseline).
# keep_default_na=False keeps segments such as "NA" or "null" as strings; with the default
# NA parsing they become float NaN and would reach the tokenizer as non-text values.
test_df = pd.read_csv(DATA_PROCESSED / "test.csv", keep_default_na=False).head(200)
src_sentences = test_df["hu"].tolist()
refs = test_df["ro"].tolist()

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) # load tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(DEVICE) # load model
tokenizer.src_lang = SRC_LANG # set the source language used when encoding the inputs

# forcing the target language token; tokenizers without `lang_code_to_id` fall back to a
# plain token lookup
if hasattr(tokenizer, "lang_code_to_id"):
    forced_id = tokenizer.lang_code_to_id[TGT_LANG]
else:
    forced_id = tokenizer.convert_tokens_to_ids(TGT_LANG)

# Translation function
def translate_batch(sentences, batch_size=8, max_len=512, num_beams=4):
    """Translate ``sentences`` with the module-level ``model`` and ``tokenizer``.

    Sentences are processed in chunks of ``batch_size``; inputs are truncated to
    ``max_len`` tokens, generation is capped at ``max_len`` and forced to start with the
    module-level ``forced_id`` token. Returns the decoded hypotheses in input order.
    """
    translations = []
    generation_kwargs = dict(
        forced_bos_token_id=forced_id,
        max_length=max_len,
        num_beams=num_beams,
        early_stopping=True,
    )
    for start in tqdm(range(0, len(sentences), batch_size)):
        chunk = sentences[start:start + batch_size]
        encoded = tokenizer(
            chunk,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_len,
        )
        encoded = encoded.to(DEVICE)
        with torch.no_grad():
            generated = model.generate(**encoded, **generation_kwargs)
        translations += tokenizer.batch_decode(generated, skip_special_tokens=True)
    return translations

# Baseline evaluation
baseline_hyps = translate_batch(src_sentences, num_beams=4)
bleu = sacrebleu.corpus_bleu(baseline_hyps, [refs]).score
print("Baseline BLEU:", round(bleu, 2))

# Keep source, reference and hypothesis side by side for later inspection.
_baseline_out = OUTPUT_DIR / "baseline_predictions_notebook.csv"
_baseline_table = pd.DataFrame(
    {"source": src_sentences, "reference": refs, "hypothesis": baseline_hyps}
)
_baseline_table.to_csv(_baseline_out, index=False)
print("Saved:", _baseline_out)


  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|██████████| 25/25 [23:22<00:00, 56.09s/it]


Baseline BLEU: 23.61
Saved: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\data\outputs\baseline_predictions_notebook.csv


In [None]:
from transformers import AutoTokenizer

CANDIDATES = [
    # Trying direct HU<->RO if available
    "Helsinki-NLP/opus-mt-hu-ro",
    "Helsinki-NLP/opus-mt-ro-hu",
    # NOTE(review): the two fallbacks below are HU->EN and EN->RO models, not HU->RO.
    # Fine-tuning one of them on HU->RO pairs starts from a model and vocabulary built
    # for a different language pair - confirm this is intended.
    "Helsinki-NLP/opus-mt-hu-en",
    "Helsinki-NLP/opus-mt-en-ro",
]

# Probe the candidates in order; the first one whose tokenizer loads is used.
chosen = None
for m in CANDIDATES:
    try:
        _ = AutoTokenizer.from_pretrained(m)
        chosen = m
        break
    except Exception as e:
        print("Not available:", m, "|", type(e).__name__)

# Fail here with a clear message instead of passing None on to later `from_pretrained` calls.
if chosen is None:
    raise RuntimeError("None of the candidate models could be loaded: " + ", ".join(CANDIDATES))

print("Chosen model:", chosen)

import os, gc, torch, pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
from peft import LoraConfig, get_peft_model, TaskType

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): this is set after torch was imported; if an earlier cell already used the
# GPU in this kernel the setting presumably has no effect - confirm.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Release whatever earlier cells left behind before loading another model.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

MODEL_NAME = chosen

max_len = 256
batch_size = 8
grad_accum = 1
train_n = 50000
val_n = 2000
num_train_epochs = 3

# Parse train.csv once (it was previously read twice, only to obtain its length).
# keep_default_na=False keeps segments such as "NA" or "null" as strings instead of NaN.
_train_all = pd.read_csv(DATA_PROCESSED / "train.csv", keep_default_na=False)
train_df = _train_all.sample(min(train_n, len(_train_all)), random_state=42)
val_df   = pd.read_csv(DATA_PROCESSED / "val.csv", keep_default_na=False).head(val_n)

train_ds = Dataset.from_pandas(train_df, preserve_index=False)
val_ds   = Dataset.from_pandas(val_df, preserve_index=False)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess(examples):
    """Tokenize one batch: ``examples["hu"]`` as inputs, ``examples["ro"]`` as labels.

    Both sides are truncated to the module-level ``max_len``. Returns the tokenizer
    output for the inputs with the target token ids stored under ``"labels"``.
    """
    encoded = tokenizer(examples["hu"], truncation=True, max_length=max_len)
    target_encoding = tokenizer(text_target=examples["ro"], truncation=True, max_length=max_len)
    encoded["labels"] = target_encoding["input_ids"]
    return encoded

train_tok = train_ds.map(preprocess, batched=True, remove_columns=train_ds.column_names)
val_tok   = val_ds.map(preprocess, batched=True, remove_columns=val_ds.column_names)

# On GPU the base weights are loaded in half precision.
base_model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16 if DEVICE == "cuda" else None
).to(DEVICE)

# Optional LoRA
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
)

try:
    model = get_peft_model(base_model, lora_config)
    if hasattr(model, "enable_input_require_grads"):
        model.enable_input_require_grads()
    print("Using LoRA.")
except Exception as e:
    print("LoRA failed, training full model instead:", type(e).__name__, e)
    model = base_model

# fp16 mixed precision (fp16=True below) needs the *trainable* weights in float32; the
# gradient scaler cannot unscale fp16 gradients. This matters most for the full-model
# fallback, where every fp16-loaded weight is trainable. Frozen weights stay untouched.
for _param in model.parameters():
    if _param.requires_grad and _param.dtype == torch.float16:
        _param.data = _param.data.float()

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

args = Seq2SeqTrainingArguments(
    output_dir=str(PROJECT_ROOT / "checkpoints" / "opus_hu_ro_legal"),
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=grad_accum,
    learning_rate=5e-5,
    num_train_epochs=num_train_epochs,
    fp16=(DEVICE == "cuda"),
    evaluation_strategy="steps",
    eval_steps=2000,
    save_steps=2000,
    save_total_limit=2,
    predict_with_generate=False,  # evaluation reports loss only, no generated text
    logging_steps=100,
    report_to="none",
    dataloader_num_workers=0,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()
# Write the final weights to output_dir itself so the message below is true even when
# no periodic checkpoint coincides with the last step.
trainer.save_model()
print("Done. Saved to:", args.output_dir)


Not available: Helsinki-NLP/opus-mt-hu-ro | OSError
Not available: Helsinki-NLP/opus-mt-ro-hu | OSError
Chosen model: Helsinki-NLP/opus-mt-hu-en


Map: 100%|██████████| 50000/50000 [00:24<00:00, 2020.46 examples/s]
Map: 100%|██████████| 2000/2000 [00:00<00:00, 2026.35 examples/s]


Using LoRA.


  trainer = Seq2SeqTrainer(


Step,Training Loss,Validation Loss
2000,5.1241,4.837155
4000,4.8079,4.472488
6000,4.5679,4.242502
8000,4.453,4.096498
10000,4.3484,3.986247
12000,4.2055,3.911134
14000,4.2407,3.863305
16000,4.1872,3.833374
18000,4.1255,3.817755




Done. Saved to: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\checkpoints\opus_hu_ro_legal


In [None]:
import torch
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel

# Prefer the GPU whenever torch can see one.
DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"

# Output folder of the fine-tuning run; its checkpoint-* sub-folders are searched below.
CKPT_DIR = Path(PROJECT_ROOT, "checkpoints", "opus_hu_ro_legal")

def get_latest_checkpoint(folder: Path):
    """Return the ``checkpoint-<step>`` sub-directory of ``folder`` with the largest step.

    Entries whose suffix after the last ``-`` is not numeric rank below every numbered
    checkpoint. If ``folder`` has no ``checkpoint-*`` directory, ``folder`` itself is
    returned.
    """
    def _step_of(path):
        suffix = path.name.split("-")[-1]
        return int(suffix) if suffix.isdigit() else -1

    candidates = [entry for entry in folder.glob("checkpoint-*") if entry.is_dir()]
    candidates.sort(key=_step_of)
    if not candidates:
        return folder
    return candidates[-1]

LOAD_DIR = get_latest_checkpoint(CKPT_DIR)
print("Loading from:", LOAD_DIR)

tokenizer = AutoTokenizer.from_pretrained(LOAD_DIR, use_fast=True)

# First attempt: let transformers load the checkpoint folder directly.
# NOTE(review): in the recorded run this branch succeeded on a LoRA checkpoint (the printed
# module tree contains lora.Linear layers), so "full model" in the message does not mean
# merged weights.
try:
    model = AutoModelForSeq2SeqLM.from_pretrained(LOAD_DIR)
    model = model.to(DEVICE)
    print("Loaded as full model.")
except Exception as e:
    print("Full-model load failed, trying LoRA adapter load:", type(e).__name__, e)

    # Second attempt: rebuild the base model, then attach the adapter stored in the checkpoint.
    # MODEL_NAME comes from the training cell.
    base = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
    base = base.to(DEVICE)
    model = PeftModel.from_pretrained(base, LOAD_DIR).to(DEVICE)
    print("Loaded as LoRA adapter on base model.")

model.eval()


Loading from: D:\UniNou\Master\Anul 2\Machine Translation\Proiect 2\MT-project-ro-hu-legal\checkpoints\opus_hu_ro_legal\checkpoint-18750




Loaded as full model.


MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(62522, 512, padding_idx=62521)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(62522, 512, padding_idx=62521)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): lora.Linear(
              (base_layer): Linear(in_features=512, out_features=512, bias=True)
              (lora_dropout): ModuleDict(
                (default): Dropout(p=0.05, inplace=False)
              )
              (lora_A): ModuleDict(
                (default): Linear(in_features=512, out_features=8, bias=False)
              )
              (lora_B): ModuleDict(
                (default): Linear(in_features=8, out_features=512, bias=False)
              )
              (lora_embedding_A): ParameterDict()
              (l

In [None]:
import pandas as pd
from pathlib import Path
import torch
from tqdm import tqdm

# keep_default_na=False keeps segments such as "NA" or "null" as strings; with the default
# NA parsing they become float NaN and would reach the tokenizer as non-text values.
test_df = pd.read_csv(DATA_PROCESSED / "test.csv", keep_default_na=False)
src_sentences = test_df["hu"].tolist()
refs = test_df["ro"].tolist()

assert torch.cuda.is_available(), "CUDA not available in this kernel."

model = model.to("cuda")
model.eval()
print("Model device:", next(model.parameters()).device)

# Disable TF32 for matmul and cuDNN.
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False

import os
# NOTE(review): the model is already on the GPU at this point, so this allocator setting
# presumably has no effect in this kernel - confirm.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

def translate_batch_safe(
    model, tokenizer, sentences,
    batch_size=32,
    max_input_len=256,
    max_new_tokens=96,
    num_beams=1
):
    """Translate ``sentences`` in batches, shrinking the workload when CUDA fails.

    Args:
        model: Model exposing ``generate`` and ``parameters``.
        tokenizer: Callable tokenizer exposing ``batch_decode``.
        sentences: Source sentences, translated in order.
        batch_size: Initial number of sentences per ``generate`` call.
        max_input_len: Inputs are truncated to this many tokens.
        max_new_tokens: Cap on generated tokens per sentence.
        num_beams: Beam width passed to ``generate`` (sampling is disabled).

    Returns:
        The decoded hypotheses, one per input sentence, in input order.

    Raises:
        torch.cuda.OutOfMemoryError: If memory runs out even at batch size 1.
        RuntimeError: For any non-CUDA runtime error, or a CUDA error that persists at
            batch size 1 with ``max_input_len <= 128`` and ``max_new_tokens <= 64``.
    """
    # Follow the model's device instead of assuming "cuda", so the helper also works for
    # a model that lives on the CPU. A parameter-less model keeps the old "cuda" default.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else "cuda"

    hyps = []
    i = 0

    while i < len(sentences):
        bs = min(batch_size, len(sentences) - i)
        batch = sentences[i:i+bs]

        try:
            inputs = tokenizer(
                batch,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max_input_len,
            ).to(device)

            with torch.inference_mode():
                out = model.generate(
                    **inputs,
                    num_beams=num_beams,
                    do_sample=False,
                    max_new_tokens=max_new_tokens,
                    early_stopping=True,
                )

            hyps.extend(tokenizer.batch_decode(out, skip_special_tokens=True))
            # Advance only after a successful batch; on failure the same offset is retried.
            i += bs

        except torch.cuda.OutOfMemoryError:
            torch.cuda.empty_cache()
            if batch_size <= 1:
                raise
            # Halve the batch and retry from the same offset.
            batch_size = max(1, batch_size // 2)
            print(f"OOM -> reducing batch_size to {batch_size} and retrying...")

        except RuntimeError as e:
            msg = str(e)
            if "CUBLAS_STATUS_INTERNAL_ERROR" in msg or "CUDA error" in msg:
                torch.cuda.empty_cache()
                if batch_size > 1:
                    batch_size = max(1, batch_size // 2)
                    print(f"CUDA kernel error -> reducing batch_size to {batch_size} and retrying...")
                else:
                    # Already at batch size 1: shrink the sequence budgets, one knob per retry.
                    if max_input_len > 128:
                        max_input_len = 128
                        print("CUDA kernel error at batch_size=1 -> reducing max_input_len to 128 and retrying...")
                    elif max_new_tokens > 64:
                        max_new_tokens = 64
                        print("CUDA kernel error at batch_size=1 -> reducing max_new_tokens to 64 and retrying...")
                    else:
                        raise
            else:
                raise

    return hyps

# Greedy decoding (num_beams=1) over the whole test set.
_greedy_settings = dict(batch_size=32, max_input_len=256, max_new_tokens=96, num_beams=1)
hyps = translate_batch_safe(model, tokenizer, src_sentences, **_greedy_settings)

pred_path = Path(PROJECT_ROOT) / "data" / "outputs" / "opus_finetuned_predictions_greedy.csv"
pred_path.parent.mkdir(parents=True, exist_ok=True)

_greedy_table = pd.DataFrame({"source": src_sentences, "reference": refs, "hypothesis": hyps})
_greedy_table.to_csv(pred_path, index=False)
print("Saved predictions to:", pred_path)


Model device: cuda:0




KeyboardInterrupt: 

In [17]:
from comet import download_model, load_from_checkpoint

# Resolve the checkpoint path for the named COMET model, then load it.
_COMET_MODEL_ID = "Unbabel/wmt22-comet-da"
comet_model_path = download_model(_COMET_MODEL_ID)
comet_model = load_from_checkpoint(comet_model_path)


INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.6.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\RoG\.cache\huggingface\hub\models--Unbabel--wmt22-comet-da\snapshots\2760a223ac957f30acfb18c8aa649b01cf1d75f2\checkpoints\model.ckpt`
INFO:comet.models.base:Encoder model frozen.
c:\Users\RoG\anaconda3\envs\licenta\lib\site-packages\pytorch_lightning\core\saving.py:197: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


In [16]:
def compute_comet(src, hyp, ref, batch_size=8):
    """Return the mean segment-level score of the module-level ``comet_model``.

    Args:
        src: Source sentences.
        hyp: Machine translations.
        ref: Reference translations.
        batch_size: Batch size passed to ``comet_model.predict``.

    Returns:
        The arithmetic mean of the per-segment scores.

    Raises:
        ValueError: If there is nothing to score.

    The three sequences are zipped, so only the first ``min(len(...))`` triples are
    scored when their lengths differ.
    """
    data = [
        {"src": s, "mt": h, "ref": r}
        for s, h, r in zip(src, hyp, ref)
    ]
    # Fail early with a clear message instead of a ZeroDivisionError (or an opaque error
    # inside the scorer) when no triple is left to score.
    if not data:
        raise ValueError("compute_comet needs at least one (src, hyp, ref) triple.")
    scores = comet_model.predict(data, batch_size=batch_size, gpus=1 if torch.cuda.is_available() else 0)
    return sum(scores.scores) / len(scores.scores)

# Score the baseline hypotheses against the sources and references currently in the kernel.
comet_score = compute_comet(src=src_sentences, hyp=baseline_hyps, ref=refs)
print("Baseline COMET:", round(comet_score, 4))


NameError: name 'comet_model' is not defined

In [None]:
import torch
import numpy as np
import pandas as pd
import sacrebleu

from tqdm import tqdm
from pathlib import Path

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel


# Output folder of the fine-tuning run whose checkpoints are evaluated in this cell.
RUN_DIR = Path(PROJECT_ROOT, "checkpoints", "opus_hu_ro_legal")

def get_latest_checkpoint(run_dir: Path) -> Path:
    """Pick the ``checkpoint-<step>`` directory with the highest step inside ``run_dir``.

    Directories whose suffix after the last ``-`` is not numeric sort below all numbered
    ones. ``run_dir`` itself is returned when it has no ``checkpoint-*`` directory.
    """
    def _step_number(path):
        tail = path.name.split("-")[-1]
        if tail.isdigit():
            return int(tail)
        return -1

    found = [child for child in run_dir.glob("checkpoint-*") if child.is_dir()]
    found.sort(key=_step_number)
    return found[-1] if found else run_dir

CKPT_DIR = get_latest_checkpoint(RUN_DIR)
print("Using checkpoint:", CKPT_DIR)

# NOTE(review): MODEL_NAME is taken from whichever cell set it last in this kernel; it must
# be the base model the adapter in CKPT_DIR was trained on - confirm before running.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("DEVICE:", DEVICE)

# The base model is loaded in half precision on GPU.
base_model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16 if DEVICE == "cuda" else None
).to(DEVICE)

# Attach the adapter stored in the checkpoint to the base model.
model = PeftModel.from_pretrained(base_model, CKPT_DIR).to(DEVICE)
model.eval()

print("Model device:", next(model.parameters()).device)

# keep_default_na=False keeps segments such as "NA" or "null" as strings; with the default
# NA parsing they become float NaN and would reach the tokenizer as non-text values.
test_df = pd.read_csv(DATA_PROCESSED / "test.csv", keep_default_na=False)
src_sentences = test_df["hu"].tolist()
refs = test_df["ro"].tolist()


def translate_batch(model, tokenizer, sentences, batch_size=32, max_input_len=256, max_new_tokens=96, num_beams=1):
    """Translate ``sentences`` in fixed-size batches and return the decoded hypotheses.

    Inputs are padded, truncated to ``max_input_len`` tokens and moved to the
    module-level ``DEVICE``; generation is deterministic (``do_sample=False``) and capped
    at ``max_new_tokens`` new tokens.
    """
    generation_kwargs = dict(
        num_beams=num_beams,
        do_sample=False,
        max_new_tokens=max_new_tokens,
        early_stopping=True,
    )
    translations = []
    for start in tqdm(range(0, len(sentences), batch_size)):
        chunk = sentences[start:start + batch_size]
        encoded = tokenizer(
            chunk,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_input_len,
        )
        encoded = encoded.to(DEVICE)

        with torch.inference_mode():
            generated = model.generate(**encoded, **generation_kwargs)

        translations += tokenizer.batch_decode(generated, skip_special_tokens=True)
    return translations


_ft_batch_size = 16 if DEVICE=="cuda" else 4
finetuned_hyps = translate_batch(
    model, tokenizer, src_sentences,
    batch_size=_ft_batch_size,
    num_beams=1,
    max_new_tokens=96,
)

bleu_ft = sacrebleu.corpus_bleu(finetuned_hyps, [refs]).score
print("Finetuned BLEU:", round(bleu_ft, 2))

# NOTE(review): has_hu_diacritics, glossary_hit and glossary are defined in a cell further
# down the notebook; on a fresh kernel that cell has to run before this one.
_ft_leak_flags = [has_hu_diacritics(h) for h in finetuned_hyps]
leak_rate_ft = np.mean(_ft_leak_flags)
print("HU diacritics leak rate (FT):", round(leak_rate_ft*100, 2), "%")

all_hits_ft = [glossary_hit(s, h, glossary) for s, h in zip(src_sentences, finetuned_hyps)]
flat_ft = [hit for sentence_hits in all_hits_ft for hit in sentence_hits]
if not flat_ft:
    print("No glossary terms found in sample.")
else:
    print("Glossary accuracy (FT):", round(np.mean(flat_ft)*100, 2), "%")

# The checkpoint name is part of the file name, so runs from different checkpoints do not
# overwrite each other.
out_path = Path(OUTPUT_DIR) / f"finetuned_predictions_lora_{CKPT_DIR.name}.csv"
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
_ft_table = pd.DataFrame({"source": src_sentences, "reference": refs, "hypothesis": finetuned_hyps})
_ft_table.to_csv(out_path, index=False)
print("Saved:", out_path)


OSError: [WinError 1114] A dynamic link library (DLL) initialization routine failed. Error loading "c:\Users\RoG\anaconda3\envs\licenta\lib\site-packages\torch\lib\c10.dll" or one of its dependencies.

In [15]:
import numpy as np

# HU diacritics leak proxy (how often output still looks Hungarian)
# The accented letters of the Hungarian alphabet, lower and upper case. The previous literal
# was mis-encoded (UTF-8 bytes shown as Mac Roman), so it could never match real Hungarian text.
hu_diacritics = set("áéíóöőúüűÁÉÍÓÖŐÚÜŰ")
def has_hu_diacritics(s):
    """Return True if ``s`` contains at least one character from ``hu_diacritics``."""
    return any(c in hu_diacritics for c in s)

# Share of baseline hypotheses that contain at least one character from hu_diacritics.
_baseline_leak_flags = [has_hu_diacritics(h) for h in baseline_hyps]
leak_rate = np.mean(_baseline_leak_flags)
print("HU diacritics leak rate:", round(leak_rate*100, 2), "%")

# Very small starter glossary for institutions/terms (expand later).
# Maps a Hungarian source term to the Romanian forms accepted in the hypothesis. The
# previous literals were mis-encoded (UTF-8 bytes shown as Mac Roman).
glossary = {
    "Bizottság": ["Comisia"],
    "Tanács": ["Consiliul"],
    "Közösség": ["Comunitatea", "Comunității"],
}

def glossary_hit(src, hyp, glossary):
    """Check the glossary terms that occur in ``src`` against ``hyp``.

    For every glossary key found as a substring of ``src`` (in glossary order) the result
    holds one boolean: True when at least one of that key's accepted translations is a
    substring of ``hyp``. Keys that do not occur in ``src`` add nothing.
    """
    return [
        any(translation in hyp for translation in translations)
        for term, translations in glossary.items()
        if term in src
    ]

# One list of hit flags per sentence pair, flattened into a single list for the mean.
all_hits = [glossary_hit(s, h, glossary) for s, h in zip(src_sentences, baseline_hyps)]
flat = [hit for sentence_hits in all_hits for hit in sentence_hits]
if not flat:
    print("No glossary terms found in sample.")
else:
    print("Glossary accuracy (starter terms):", round(np.mean(flat)*100, 2), "%")


HU diacritics leak rate: 0.0 %
Glossary accuracy (starter terms): 45.45 %


In [None]:
import os, gc, math, re
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
from peft import LoraConfig, get_peft_model, TaskType


# Pick best available HU<->RO model (prefer HU->RO)
CANDIDATES = [
    "Helsinki-NLP/opus-mt-hu-ro",
    "Helsinki-NLP/opus-mt-ro-hu",
]

# Probe the candidates in order and keep the first one whose tokenizer loads; the `else`
# branch of the loop runs only when no candidate was accepted.
chosen = None
for m in CANDIDATES:
    try:
        _ = AutoTokenizer.from_pretrained(m)
    except Exception as e:
        print("Not available:", m, "|", type(e).__name__)
        continue
    chosen = m
    break
else:
    raise RuntimeError("No HU<->RO OPUS model available. You must use a pivot or different base.")

MODEL_NAME = chosen
print("Chosen model:", MODEL_NAME)


# True when the selected base model translates RO->HU rather than HU->RO.
REVERSE = (MODEL_NAME.endswith("ro-hu"))
print("REVERSE (means base is RO->HU):", REVERSE)


DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): set after torch was imported; if the GPU was already used in this kernel
# the setting presumably has no effect - confirm.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()


# Load data
train_path = DATA_PROCESSED / "train.csv"
val_path   = DATA_PROCESSED / "val.csv"

# keep_default_na=False keeps segments such as "NA" or "null" as strings; with the default
# NA parsing they become float NaN and would reach the tokenizer as non-text values.
train_full = pd.read_csv(train_path, keep_default_na=False)
val_full   = pd.read_csv(val_path, keep_default_na=False)

# Curriculum: oversample "legal-heavy" lines early

# Substrings that mark a segment as legal-domain text. The Hungarian literals were
# previously mis-encoded (UTF-8 bytes shown as Mac Roman) and could not match real text.
LEGAL_MARKERS_RO = [
    "Regulamentul", "Directiva", "articol", "alineat", "considerent",
    "Comisia", "Consiliul", "Parlamentul", "Uniunii", "statele membre"
]
LEGAL_MARKERS_HU = [
    "rendelet", "irányelv", "cikk", "bekezdés",
    "Bizottság", "Tanács", "Parlament", "Unió", "tagállam"
]

def is_legalish(row):
    """Return True if either side of ``row`` contains one of the legal markers.

    ``row`` is indexed by ``"hu"`` and ``"ro"``; both values are converted with ``str()``
    first, so non-string cells do not raise. Matching is case-sensitive.
    """
    hu = str(row["hu"])
    ro = str(row["ro"])
    hu_hit = any(m in hu for m in LEGAL_MARKERS_HU)
    ro_hit = any(m in ro for m in LEGAL_MARKERS_RO)
    return hu_hit or ro_hit

# pick sizes 
train_n = min(50000, len(train_full))
val_n   = min(2000, len(val_full))

train_sample = train_full.sample(train_n, random_state=42)

# make a curriculum subset: legalish + some random
_legal_mask = train_sample.apply(is_legalish, axis=1)
legal_part = train_sample[_legal_mask]
_rand_n = min(len(train_sample), max(5000, train_n // 5))
rand_part  = train_sample.sample(_rand_n, random_state=43)
cur_df = pd.concat([legal_part, rand_part], ignore_index=True).drop_duplicates()

# final train is curriculum first then rest
# NOTE(review): drop_duplicates() removes the second copy of every curriculum row, so
# train_df holds each distinct pair once (curriculum rows first) and nothing is actually
# oversampled. Presumably the trainer's default shuffling also discards this row order -
# confirm whether a real curriculum was intended.
train_df = pd.concat([cur_df, train_sample], ignore_index=True).drop_duplicates()
val_df   = val_full.head(val_n)

print("Train size:", len(train_df), "| curriculum chunk:", len(cur_df), "| Val size:", len(val_df))

train_ds = Dataset.from_pandas(train_df, preserve_index=False)
val_ds   = Dataset.from_pandas(val_df, preserve_index=False)

# Tokenizer + preprocessing
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Truncation limits (in tokens) for the source and target side.
max_src_len = max_tgt_len = 256

def preprocess(examples):
    """Tokenize one batch: ``examples["hu"]`` as inputs, ``examples["ro"]`` as labels.

    Sources are truncated to ``max_src_len`` tokens and targets to ``max_tgt_len``.
    Returns the tokenizer output for the sources with the target token ids stored under
    ``"labels"``.

    The training direction is always HU->RO. The former ``if REVERSE: pass`` branch did
    nothing and was removed, so a RO->HU base model is fine-tuned in the HU->RO direction
    as well.
    """
    src_texts = examples["hu"]
    tgt_texts = examples["ro"]

    model_inputs = tokenizer(src_texts, truncation=True, max_length=max_src_len)

    labels = tokenizer(text_target=tgt_texts, truncation=True, max_length=max_tgt_len)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_tok = train_ds.map(preprocess, batched=True, remove_columns=train_ds.column_names)
val_tok   = val_ds.map(preprocess, batched=True, remove_columns=val_ds.column_names)


config = AutoConfig.from_pretrained(MODEL_NAME)
# On GPU the base weights are loaded in half precision.
base_model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16 if DEVICE == "cuda" else None
).to(DEVICE)


# The generation cache is switched off before gradient checkpointing is enabled.
base_model.config.use_cache = False
if hasattr(base_model, "gradient_checkpointing_enable"):
    base_model.gradient_checkpointing_enable()


lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
)

model = base_model
using_lora = False
try:
    model = get_peft_model(base_model, lora_config)
    if hasattr(model, "enable_input_require_grads"):
        model.enable_input_require_grads()
    using_lora = True
    print("Using LoRA.")
except Exception as e:
    print("LoRA not compatible here; training full model instead:", type(e).__name__, e)
    model = base_model

# fp16 mixed precision (fp16=True below) needs the *trainable* weights in float32; the
# gradient scaler cannot unscale fp16 gradients. This matters most for the full-model
# fallback, where every fp16-loaded weight is trainable. Frozen weights stay untouched.
for _param in model.parameters():
    if _param.requires_grad and _param.dtype == torch.float16:
        _param.data = _param.data.float()

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


# Effective batch size is batch_size * grad_accum = 16 on either device.
batch_size = 4 if DEVICE == "cuda" else 2
grad_accum = 4 if DEVICE == "cuda" else 8
num_train_epochs = 2

args = Seq2SeqTrainingArguments(
    output_dir=str(PROJECT_ROOT / "checkpoints" / "opus_hu_ro_legal_direct"),
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=grad_accum,
    learning_rate=3e-4 if using_lora else 5e-5,  # higher rate when only the adapter is trained
    num_train_epochs=num_train_epochs,
    fp16=(DEVICE == "cuda"),
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    save_total_limit=2,
    logging_steps=100,
    report_to="none",
    dataloader_num_workers=0,
    label_smoothing_factor=0.1,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    weight_decay=0.01 if not using_lora else 0.0,
    predict_with_generate=False,  # evaluation reports loss only, no generated text
)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()
# Write the final weights to output_dir itself so the message below is true even when
# no periodic checkpoint coincides with the last step.
trainer.save_model()
print("Done. Saved to:", args.output_dir)


OSError: [WinError 1114] A dynamic link library (DLL) initialization routine failed. Error loading "c:\Users\RoG\anaconda3\envs\licenta\lib\site-packages\torch\lib\c10.dll" or one of its dependencies.