In [1]:
%pip install -q "transformers>=4.44.0" "datasets>=2.20.0" "accelerate>=0.33.0" \
"peft>=0.12.0" "evaluate>=0.4.2" "rouge-score>=0.1.2" "sacrebleu>=2.4.2" torch ipywidgets


Note: you may need to restart the kernel to use updated packages.


In [2]:
import os, math, json, random, time
from dataclasses import dataclass
import torch

# MPS allocator setting.
# NOTE(review): per the PyTorch MPS environment-variable notes, a ratio of 0.0 disables the
# allocator's high-watermark limit rather than recycling memory — confirm this is the intent.
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"

# paths
DATA_DIR = "data"
TRAIN_PATH = f"{DATA_DIR}/train.jsonl"
VAL_PATH   = f"{DATA_DIR}/val.jsonl"

# base seq2seq model used for text generation
MODEL_NAME = "google/flan-t5-base"  # if memory runs short, switch to "google/flan-t5-small"

# maximum token lengths for inputs and targets (tunable)
MAX_INPUT_LEN  = 128
MAX_TARGET_LEN = 224   # may be raised to 256 if training is stable

# hyperparameters (chosen for an M3 Pro, with LoRA)
LR        = 3e-4
EPOCHS    = 3
BATCH     = 2          # small so it fits on MPS
GRAD_ACC  = 8          # gradient accumulation: BATCH * GRAD_ACC = effective batch of 16
WEIGHT_DECAY = 0.01
LR_SCHED     = "cosine"
SEED         = 42
EVAL_STEPS   = 500     # not referenced by the training cell (evaluation is disabled there)
SAVE_STEPS   = 500     # not referenced by the training cell (it hard-codes save_steps=200)

# Apply the seed: previously SEED was declared but never used, so anything random that ran
# before the Trainer was created (e.g. adapter initialisation) was not reproducible.
random.seed(SEED)
torch.manual_seed(SEED)

# device selection: CUDA first, then Apple MPS, then CPU
device = "cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu")
print("Device:", device)

# fail fast if the data files produced by the data-prep notebook are missing
assert os.path.exists(TRAIN_PATH), f"faltou {TRAIN_PATH} (gere no 01_data_prep.ipynb)"
assert os.path.exists(VAL_PATH),   f"faltou {VAL_PATH} (gere no 01_data_prep.ipynb)"


Device: mps


In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Read the train/validation splits straight from the JSONL files into a DatasetDict.
raw = load_dataset(
    "json",
    data_files=dict(train=TRAIN_PATH, val=VAL_PATH),
)
print(raw)

# Tokenizer that matches the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


DatasetDict({
    train: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 48500
    })
    val: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 1500
    })
})


In [4]:
from transformers import DataCollatorForSeq2Seq

def preprocess(batch):
    """Tokenize a batch of examples into encoder inputs and decoder labels.

    Args:
        batch: mapping with ``"input_text"`` and ``"target_text"`` entries, as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the inputs, with an extra ``"labels"`` entry holding the
        token ids of the targets. Nothing is padded here; padding is left to the collator.
    """
    # encoder side
    model_inputs = tokenizer(
        batch["input_text"],
        max_length=MAX_INPUT_LEN,
        truncation=True,
        padding=False,
    )
    # decoder side: `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager and tokenizes the strings as targets in a single call
    labels = tokenizer(
        text_target=batch["target_text"],
        max_length=MAX_TARGET_LEN,
        truncation=True,
        padding=False,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize every split in batches and drop the original text columns, so only the fields
# returned by `preprocess` remain.
tokenized = raw.map(preprocess, batched=True, remove_columns=raw["train"].column_names)
# Built without a model here; the training cell re-creates the collator with the model attached.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=None)  # the model is supplied later
print(tokenized)


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 48500
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1500
    })
})


In [5]:
import torch
from transformers import AutoModelForSeq2SeqLM
from peft import LoraConfig, TaskType, get_peft_model

# load the base model and move it to the selected device
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
model.to(device)

# settings intended to keep training stable on MPS
model.config.use_cache = False               # turn the generation cache off during training
model.gradient_checkpointing_enable()        # lower memory use, somewhat slower

# LoRA configuration (lightweight)
lora_cfg = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=8,                # rank
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q", "v"]  # modules named "q" and "v": the query and value projections of T5 attention
)

# wrap the base model with the adapters; `model` now refers to the PEFT-wrapped model
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()  # sanity check


trainable params: 884,736 || all params: 248,462,592 || trainable%: 0.3561


In [6]:
import evaluate
import numpy as np
from transformers import TrainerCallback

# Load the ROUGE and sacreBLEU metric implementations through `evaluate`.
rouge = evaluate.load("rouge")
bleu  = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from every prediction and every label.

    Args:
        preds: iterable of predicted strings.
        labels: iterable of reference strings.

    Returns:
        A ``(preds, labels)`` tuple of two new lists holding the stripped strings.
    """
    def _strip_all(items):
        return [item.strip() for item in items]

    return _strip_all(preds), _strip_all(labels)

def compute_metrics(eval_preds):
    """Decode predictions and labels and score them with ROUGE-L and BLEU.

    Args:
        eval_preds: a ``(predictions, labels)`` pair of token-id arrays; ``predictions`` may
            itself be a tuple, in which case its first element is used.

    Returns:
        ``{"rougeL": ..., "bleu": ...}`` with the aggregated ROUGE-L value and the sacreBLEU score.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored/padded positions and is not a valid token id. Mask it in the
    # predictions as well as in the labels, otherwise decoding can fail on padded batches.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    preds, labels = postprocess_text(preds, labels)
    rouge_res = rouge.compute(predictions=preds, references=labels, use_aggregator=True)
    # sacreBLEU expects a list of references per prediction
    bleu_res  = bleu.compute(predictions=preds, references=[[l] for l in labels])

    # rougeL is the key metric
    out = {
        "rougeL": rouge_res["rougeL"],
        "bleu": bleu_res["score"]
    }
    return out

class EarlyStopper(TrainerCallback):
    """Stop training once a monitored metric has not improved for `patience` evaluations.

    Higher metric values count as better. The metric is looked up under `metric_name` and,
    if that key is missing, under ``"eval_" + metric_name``.
    """

    def __init__(self, metric_name="rougeL", patience=2):
        """
        Args:
            metric_name: key of the metric to monitor (with or without the ``eval_`` prefix).
            patience: number of consecutive non-improving evaluations tolerated before stopping.
        """
        self.metric_name = metric_name
        self.patience = patience
        self.best = None   # best value seen so far
        self.bad = 0       # consecutive evaluations without improvement

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Update the best value and request a stop when patience is exhausted.

        Returns `control` when the metric was found, otherwise ``None``.
        """
        if metrics is None:
            return
        cur = metrics.get(self.metric_name)
        if cur is None:
            # Evaluation metrics are reported with a prefix (e.g. "eval_rougeL"); without this
            # fallback the bare name is never found and the callback never triggers.
            cur = metrics.get(f"eval_{self.metric_name}")
        if cur is None:
            return
        if (self.best is None) or (cur > self.best):
            self.best = cur
            self.bad = 0
        else:
            self.bad += 1
            if self.bad >= self.patience:
                control.should_training_stop = True
        return control


In [7]:
# === reconfig "no-eval" ===
import inspect
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForSeq2Seq

OUTPUT_DIR = "outputs/t5_lora_mps"

# Training arguments with evaluation disabled; checkpoints are written every 200 steps and
# only the three most recent are kept.
args_noeval = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH,
    gradient_accumulation_steps=GRAD_ACC,
    learning_rate=LR,
    num_train_epochs=EPOCHS,
    weight_decay=WEIGHT_DECAY,
    lr_scheduler_type=LR_SCHED,
    seed=SEED,                   # use the notebook-level seed instead of relying on the library default
    save_strategy="steps",
    save_steps=200,              # save early and often (later cells load checkpoints by step number)
    save_total_limit=3,
    logging_steps=50,
    fp16=(device == "cuda"),     # half precision only on CUDA
    bf16=False,
    # evaluation off; the argument name differs between library versions, so pick the one
    # this installation's signature accepts
    **({"eval_strategy": "no"} if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
       else {"evaluation_strategy": "no"})
)

# re-create the collator now that the model exists
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Trainer(
    model=model,                      # reuse the already-loaded PEFT model
    args=args_noeval,
    train_dataset=tokenized["train"], # can be reduced later if desired
    data_collator=data_collator,
    # the tokenizer argument name also differs between library versions
    **(
        {"processing_class": tokenizer}
        if "processing_class" in inspect.signature(Trainer.__init__).parameters
        else {"tokenizer": tokenizer}
    ),
)

train_result = trainer.train()
train_result




Step,Training Loss,Validation Loss
50,3.6537,
100,3.6708,
150,3.7136,
200,3.7374,
250,3.7912,
300,3.764,
350,3.7769,
400,3.7933,
450,3.862,
500,3.8829,


'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: d92dbf9f-b378-45d8-bde4-702e8947e541)')' thrown while requesting HEAD https://huggingface.co/google/flan-t5-base/resolve/main/config.json
Retrying in 1s [Retry 1/5].
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: e4630313-e08e-4c53-9e19-5fd2dfa39509)')' thrown while requesting HEAD https://huggingface.co/google/flan-t5-base/resolve/main/config.json
Retrying in 1s [Retry 1/5].
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 0928e0b1-594c-4ea8-b4c0-344b45d606f7)')' thrown while requesting HEAD https://huggingface.co/google/flan-t5-base/resolve/main/config.json
Retrying in 1s [Retry 1/5].


KeyboardInterrupt: 

In [8]:
from transformers import AutoModelForSeq2SeqLM
from peft import PeftModel
import torch, os, json

# Checkpoint directory written by the training run (step 600).
ckpt_path = "outputs/t5_lora_mps/checkpoint-600"
print("Usando checkpoint:", ckpt_path)

# Fresh copy of the base model to attach the saved adapter to.
base = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
base.to(device)

# NOTE(review): `is_trainable` is not passed; if the library default loads the adapter for
# inference only, this model should not be trained further — confirm before resuming training.
model_ckpt = PeftModel.from_pretrained(base, ckpt_path)
model_ckpt.to(device)

trainer.model = model_ckpt  # replaces the model held by the existing trainer

# lean evaluation settings (to avoid running out of memory); each is applied only if the
# attribute exists on this version's arguments object
if hasattr(trainer.args, "per_device_eval_batch_size"): trainer.args.per_device_eval_batch_size = 1
if hasattr(trainer.args, "include_logits_in_eval"):     trainer.args.include_logits_in_eval = False
if hasattr(trainer.args, "eval_accumulation_steps"):    trainer.args.eval_accumulation_steps = 1

# NOTE(review): `trainer` was built as a plain `Trainer`; presumably only generation-aware
# trainers read `_gen_kwargs` (the evaluations that follow report only `eval_loss`) — TODO confirm.
setattr(trainer, "_gen_kwargs", {"max_new_tokens": MAX_TARGET_LEN, "num_beams": 1})


Usando checkpoint: outputs/t5_lora_mps/checkpoint-600


In [9]:
# Evaluate on at most the first 1000 validation rows.
val_split = tokenized["val"]
small_val = val_split.select(range(min(1000, len(val_split))))
print("Tamanho do val sample:", len(small_val))

eval_metrics_600 = trainer.evaluate(eval_dataset=small_val)
print("Métricas checkpoint-600:", eval_metrics_600)

# Persist a snapshot of the metrics as indented JSON.
os.makedirs("outputs", exist_ok=True)
with open("outputs/eval_ckpt600.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(eval_metrics_600, ensure_ascii=False, indent=2))


Tamanho do val sample: 1000




Métricas checkpoint-600: {'eval_loss': 3.4426136016845703}


In [10]:
# Evaluate on the whole validation split and store the metrics as indented JSON.
full_val = tokenized["val"]
eval_metrics_full = trainer.evaluate(eval_dataset=full_val)
print("Avaliação completa:", eval_metrics_full)

with open("outputs/eval_ckpt600_full.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(eval_metrics_full, ensure_ascii=False, indent=2))


Avaliação completa: {'eval_loss': 3.4511327743530273}


In [None]:
import os
# Save whatever model the trainer currently holds, plus the tokenizer, into one directory.
os.makedirs("artifacts/t5_lora", exist_ok=True)
trainer.save_model("artifacts/t5_lora")
tokenizer.save_pretrained("artifacts/t5_lora")
print("Modelo final salvo em artifacts/t5_lora")


'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: cf456722-8784-4f2d-b369-4dff93818314)')' thrown while requesting HEAD https://huggingface.co/google/flan-t5-base/resolve/main/config.json
Retrying in 1s [Retry 1/5].


✔ Modelo final salvo em artifacts/t5_lora


In [12]:
import os, json, random
from transformers import AutoModelForSeq2SeqLM

# Output directory for the comparison files, and the optional file holding a fixed sample
# of inputs/references to reuse.
os.makedirs("outputs", exist_ok=True)
baseline_path = "outputs/baseline_val200.jsonl"

# 0) util de geração
def gen_batch(model, texts, tok, device, max_in=128, max_out=256, bs=4):
    """Generate text for `texts` in chunks of `bs` and return the decoded strings.

    Args:
        model: object exposing ``generate(**encoded, max_new_tokens=..., num_beams=...)``.
        texts: sliceable sequence of input strings.
        tok: tokenizer used both to encode the inputs and to decode the generated ids.
        device: device the encoded batch is moved to.
        max_in: truncation length for the inputs.
        max_out: maximum number of new tokens to generate.
        bs: number of texts per generation call.

    Returns:
        A list with the decoded outputs, in input order.
    """
    decoded = []
    start = 0
    total = len(texts)
    while start < total:
        batch_texts = texts[start:start + bs]
        encoded = tok(
            batch_texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_in,
        ).to(device)
        # greedy decoding, no gradient tracking
        with torch.no_grad():
            generated = model.generate(**encoded, max_new_tokens=max_out, num_beams=1)
        decoded.extend(tok.batch_decode(generated, skip_special_tokens=True))
        start += bs
    return decoded

# 1) inputs: reuse the saved baseline sample when it exists; otherwise take the first 200 validation rows
if os.path.exists(baseline_path):
    # context manager so the file handle is closed; blank lines are skipped
    with open(baseline_path, "r", encoding="utf-8") as f:
        rows = [json.loads(l) for l in f if l.strip()]
    inputs = [r["input"] for r in rows]
    refs   = [r["ref"]   for r in rows]
else:
    # The text columns live in `raw`: `tokenized` had them removed by `map(..., remove_columns=...)`,
    # so reading "input_text" from it raises a KeyError.
    N = min(200, len(raw["val"]))
    sample = raw["val"].select(range(N))
    inputs = list(sample["input_text"])
    refs   = list(sample["target_text"])

# 2) baseline = the untouched base model
base_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)
preds_base = gen_batch(base_model, inputs, tokenizer, device, MAX_INPUT_LEN, MAX_TARGET_LEN)

# 3) fine-tuned = the checkpoint previously loaded as `model_ckpt`
preds_ft = gen_batch(model_ckpt, inputs, tokenizer, device, MAX_INPUT_LEN, MAX_TARGET_LEN)

# 4) save the side-by-side comparison, one JSON object per line
out_path = "outputs/compare_baseline_vs_finetuned.jsonl"
with open(out_path, "w", encoding="utf-8") as f:
    for x, pb, pf, r in zip(inputs, preds_base, preds_ft, refs):
        f.write(json.dumps({"input": x, "baseline_pred": pb, "finetuned_pred": pf, "ref": r}, ensure_ascii=False) + "\n")

print("Comparação salva em:", out_path)
# show one example, truncated to 600 characters
print(json.dumps({"input": inputs[0], "baseline_pred": preds_base[0], "finetuned_pred": preds_ft[0], "ref": refs[0]}, ensure_ascii=False)[:600], "...")


'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: cd9520d2-bd3e-4bb9-aa98-4f870f92a7f1)')' thrown while requesting HEAD https://huggingface.co/google/flan-t5-base/resolve/main/config.json
Retrying in 1s [Retry 1/5].


Comparação salva em: outputs/compare_baseline_vs_finetuned.jsonl
{"input": "Given a product title, generate its product description.\nTitle: Islandoffer Color Pearl Nail Art Stone Small Wheel Rhinestones Beads\nDescription:", "baseline_pred": "Islandoffer Color Pearl Nail Art Stone Small Wheel Rhinestones Beads", "finetuned_pred": "Islandoffer Color Pearl Nail Art Stone Small Wheel Rhinestones Beads", "ref": "Make your nails look elegance and special,Suitable to use on top of nail polish, UV builder gel, acrylic, etc,Also can be used to decorate your home and furniture, cell phone and mp3 cases, glasses, cards, body art, bookmarks."} ...


In [13]:
import evaluate, json

# Comparison file written by the generation cell.
cmp_path = "outputs/compare_baseline_vs_finetuned.jsonl"
assert os.path.exists(cmp_path), "rode o bloco de comparação primeiro"

# Read the JSONL rows through a context manager so the file handle is closed.
with open(cmp_path, "r", encoding="utf-8") as f:
    rows = [json.loads(l) for l in f]
preds_ft = [r["finetuned_pred"] for r in rows]
refs     = [r["ref"]           for r in rows]
preds_bl = [r["baseline_pred"] for r in rows]

rouge = evaluate.load("rouge")
bleu = evaluate.load("sacrebleu")

def metrics_of(preds, refs):
    """Score predictions against references with ROUGE-L and sacreBLEU.

    Args:
        preds: list of predicted strings.
        refs: list of reference strings, aligned with `preds`.

    Returns:
        ``{"rougeL": float, "bleu": float}``. Values are converted to built-in floats so that
        printing shows plain numbers instead of numpy scalar reprs.
    """
    r = rouge.compute(predictions=preds, references=refs, use_aggregator=True)
    # sacreBLEU expects a list of references per prediction
    b = bleu.compute(predictions=preds, references=[[x] for x in refs])
    return {"rougeL": float(r["rougeL"]), "bleu": float(b["score"])}

# Score the fine-tuned and baseline predictions against the same references.
m_ft = metrics_of(preds_ft, refs)
m_bl = metrics_of(preds_bl, refs)

# Print both results rounded to four decimals.
print("Baseline →", {k: round(v, 4) for k,v in m_bl.items()})
print("Fine-tuned →", {k: round(v, 4) for k,v in m_ft.items()})


Baseline → {'rougeL': np.float64(0.1223), 'bleu': 0.0003}
Fine-tuned → {'rougeL': np.float64(0.1266), 'bleu': 1.2264}


In [15]:
# An earlier cell pointed `trainer.model` at `model_ckpt`, an adapter loaded with
# `PeftModel.from_pretrained(...)` for evaluation. That call was made without `is_trainable`,
# so its adapter weights are presumably frozen (NOTE(review): confirm against the PEFT docs).
# Point the trainer back at the trainable PEFT model before resuming, so the optimizer and the
# forward pass use the same trainable parameters.
trainer.model = model
trainer.model_wrapped = model

train_result = trainer.train(resume_from_checkpoint=True)
train_result



Step,Training Loss
650,3.9226
700,3.9364
750,3.987
800,3.9977
850,4.0133
900,4.0037
950,3.993
1000,4.0796
1050,4.0538
1100,4.0218


'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 948295b0-0680-46c9-8e7b-ccedc1c8eac3)')' thrown while requesting HEAD https://huggingface.co/google/flan-t5-base/resolve/main/config.json
Retrying in 1s [Retry 1/5].
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 56b1e65a-4aa0-445d-bf04-d954c8b97360)')' thrown while requesting HEAD https://huggingface.co/google/flan-t5-base/resolve/main/config.json
Retrying in 1s [Retry 1/5].
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 91a55f59-36fa-4129-843b-0a082b9c38e8)')' thrown while requesting HEAD https://huggingface.co/google/flan-t5-base/resolve/main/config.json
Retrying in 1s [Retry 1/5].
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 80

TrainOutput(global_step=9095, training_loss=4.510415011883568, metrics={'train_runtime': 19978.2591, 'train_samples_per_second': 7.283, 'train_steps_per_second': 0.455, 'total_flos': 7691658521057280.0, 'train_loss': 4.510415011883568, 'epoch': 3.0})

In [None]:
# === Avaliar múltiplos checkpoints (ROUGE-L e BLEU) ===
import glob, os, json, gc
from tqdm import tqdm
from transformers import AutoModelForSeq2SeqLM
from peft import PeftModel
import torch, evaluate
from datasets import load_dataset

# 1) make sure the raw dataset is available (it holds the input_text / target_text columns)
if "raw" not in locals():
    raw = load_dataset("json", data_files={"train": "data/train.jsonl", "val": "data/val.jsonl"})

# 2) choose the checkpoints to evaluate: sort the directories by step number, then keep the
#    steps listed in the tuple (edit it to test others)
ckpt_dirs = sorted(glob.glob("outputs/t5_lora_mps/checkpoint-*"), key=lambda x: int(x.split("-")[-1]))
candidates = [
    (int(d.split("-")[-1]), d)
    for d in ckpt_dirs
    if int(d.split("-")[-1]) in (600, 3000, 6000, 9000)
]
print("Vou avaliar:", [step_no for step_no, _ in candidates])

# 3) validation slice taken from the raw dataset: the first 1000 rows at most
N = min(1000, len(raw["val"]))
val_raw_sample = raw["val"].select(range(N))
texts, refs = (val_raw_sample[col] for col in ("input_text", "target_text"))
len(texts), len(refs)

# 4) util de geração leve
def gen_preds(model, texts, tok, device, max_in=MAX_INPUT_LEN, max_out=MAX_TARGET_LEN, bs=1):
    """Generate predictions for `texts` in chunks of `bs`, with a progress bar.

    The model is switched to eval mode and generation runs under ``torch.inference_mode()``.
    After each chunk the encoded inputs and generated ids are deleted and, on MPS, the
    device cache is emptied.

    Args:
        model: model exposing ``eval()`` and ``generate(...)``.
        texts: sliceable sequence of input strings.
        tok: tokenizer used to encode inputs and decode outputs.
        device: target device; the string ``"mps"`` enables the cache clearing.
        max_in: truncation length for the inputs.
        max_out: maximum number of new tokens to generate.
        bs: number of texts per generation call.

    Returns:
        List of decoded predictions in input order.
    """
    model.eval()
    on_mps = device == "mps"
    collected = []
    with torch.inference_mode():
        for start in tqdm(range(0, len(texts), bs)):
            enc = tok(
                texts[start:start + bs],
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max_in,
            ).to(device)
            ids = model.generate(**enc, max_new_tokens=max_out, num_beams=1)
            collected.extend(tok.batch_decode(ids, skip_special_tokens=True))
            # drop the references before asking the backend to release its cache
            del enc, ids
            if on_mps:
                torch.mps.empty_cache()
    return collected

# 5) set up the metrics and the list that collects one result dict per checkpoint
rouge = evaluate.load("rouge")
bleu  = evaluate.load("sacrebleu")
results = []

# 6) evaluate each checkpoint: load a fresh base model, attach the checkpoint's adapter,
#    generate predictions for `texts`, and score them against `refs`
for step, ckpt in candidates:
    print(f"\n=== Avaliando checkpoint {step} ===")
    base = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)
    model_ckpt = PeftModel.from_pretrained(base, ckpt).to(device)

    preds = gen_preds(model_ckpt, texts, tokenizer, device, bs=1)
    r = rouge.compute(predictions=preds, references=refs, use_aggregator=True)
    # sacreBLEU expects a list of references per prediction
    b = bleu.compute(predictions=preds, references=[[x] for x in refs])

    results.append({"step": step, "ckpt": ckpt, "rougeL": r["rougeL"], "bleu": b["score"]})
    print(f"[{step}] ROUGE-L={r['rougeL']:.4f} | BLEU={b['score']:.2f}")

    # free memory before the next checkpoint is loaded (this also removes the global `model_ckpt`)
    del base, model_ckpt, preds
    gc.collect()
    if device == "mps":
        torch.mps.empty_cache()

# 7) save a snapshot of all results and pick the checkpoint with the highest ROUGE-L
os.makedirs("outputs", exist_ok=True)
with open("outputs/eval_multi_ckpts.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

if results:
    best = max(results, key=lambda x: x["rougeL"])
    print("\n Resultados finais:")
    for r in results:
        print(f" step {r['step']:>5} → ROUGE-L={r['rougeL']:.4f} | BLEU={r['bleu']:.2f}")
    print("\n Melhor checkpoint:", best)
else:
    # No checkpoint matched the selected steps: `max()` on an empty list would raise
    # ValueError, so report the situation instead.
    best = None
    print("\n Nenhum checkpoint foi avaliado; confira a lista de steps.")


Vou avaliar: [9000]

=== Avaliando checkpoint 9000 ===


'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 05ff6113-b7d6-46a3-996c-01ec568b0f0d)')' thrown while requesting HEAD https://huggingface.co/google/flan-t5-base/resolve/main/config.json
Retrying in 1s [Retry 1/5].
100%|██████████| 1000/1000 [41:20<00:00,  2.48s/it] 


[9000] ROUGE-L=0.1255 | BLEU=1.19

📈 Resultados finais:
 step  9000 → ROUGE-L=0.1255 | BLEU=1.19

🏆 Melhor checkpoint: {'step': 9000, 'ckpt': 'outputs/t5_lora_mps/checkpoint-9000', 'rougeL': np.float64(0.1254880707734567), 'bleu': 1.185081614929426}


In [18]:
from transformers import AutoModelForSeq2SeqLM
from peft import PeftModel
import os

# Checkpoint to export. NOTE(review): the path is hard-coded rather than taken from the
# `best` result computed by the multi-checkpoint evaluation — confirm they agree.
best_ckpt = "outputs/t5_lora_mps/checkpoint-9000"
base = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)
model_best = PeftModel.from_pretrained(base, best_ckpt).to(device)

# Save the adapter and the tokenizer side by side.
os.makedirs("artifacts/t5_lora_best", exist_ok=True)
model_best.save_pretrained("artifacts/t5_lora_best")
tokenizer.save_pretrained("artifacts/t5_lora_best")
print("✔ Melhor modelo salvo em artifacts/t5_lora_best")


'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 0b429cb7-b412-4417-93c8-29340479b47a)')' thrown while requesting HEAD https://huggingface.co/google/flan-t5-base/resolve/main/config.json
Retrying in 1s [Retry 1/5].


✔ Melhor modelo salvo em artifacts/t5_lora_best


In [19]:
# usa os mesmos 200 inputs do baseline
import json, torch, gc
from transformers import AutoModelForSeq2SeqLM

# Read the fixed baseline sample through a context manager so the file handle is closed.
with open("outputs/baseline_val200.jsonl", "r", encoding="utf-8") as f:
    rows = [json.loads(l) for l in f]
inputs = [r["input"] for r in rows]
refs = [r["ref"] for r in rows]

def gen_batch(model, texts, bs=2):
    """Generate text for `texts` in chunks of `bs` using the notebook-level tokenizer and device.

    Note: this redefines the earlier `gen_batch`, which took the tokenizer, device and
    lengths as parameters; here they come from `tokenizer`, `device`, `MAX_INPUT_LEN` and
    `MAX_TARGET_LEN`.

    Args:
        model: model exposing ``generate(...)``.
        texts: sliceable sequence of input strings.
        bs: number of texts per generation call.

    Returns:
        List of decoded outputs in input order.
    """
    decoded = []
    with torch.inference_mode():
        for start in range(0, len(texts), bs):
            batch_texts = texts[start:start + bs]
            encoded = tokenizer(
                batch_texts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=MAX_INPUT_LEN,
            ).to(device)
            generated = model.generate(**encoded, max_new_tokens=MAX_TARGET_LEN, num_beams=1)
            decoded.extend(tokenizer.batch_decode(generated, skip_special_tokens=True))
            # release cached device memory after every chunk on MPS
            if device == "mps":
                torch.mps.empty_cache()
    return decoded

# Baseline predictions from the untouched base model; the model is dropped right after use.
base_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)
preds_bl = gen_batch(base_model, inputs)
del base_model
gc.collect()

# Predictions from the exported best adapter.
preds_ft = gen_batch(model_best, inputs)

# Overwrite the comparison file, one JSON object per line.
with open("outputs/compare_baseline_vs_finetuned.jsonl", "w", encoding="utf-8") as f:
    for x, pb, pf, r in zip(inputs, preds_bl, preds_ft, refs):
        record = {"input": x, "baseline_pred": pb, "finetuned_pred": pf, "ref": r}
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

print("✔ compare atualizado: outputs/compare_baseline_vs_finetuned.jsonl")


✔ compare atualizado: outputs/compare_baseline_vs_finetuned.jsonl
