In [1]:
%pip install -q "transformers>=4.44.0" "datasets>=2.20.0" "accelerate>=0.33.0" \
"peft>=0.12.0" "evaluate>=0.4.2" "rouge-score>=0.1.2" "sacrebleu>=2.4.2" torch ipywidgets


Note: you may need to restart the kernel to use updated packages.


In [2]:
import os, math, json, random, time
from dataclasses import dataclass
import torch

# ✔ dica MPS: diminui chance de OOM ao reciclar memória
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"

# caminhos
DATA_DIR = "data"
TRAIN_PATH = f"{DATA_DIR}/train.jsonl"
VAL_PATH   = f"{DATA_DIR}/val.jsonl"

# modelo base (seq2seq) — ótimo pra gerar texto
MODEL_NAME = "google/flan-t5-base"  # se faltar memória, troque depois pra "google/flan-t5-small"

# comprimentos (pode ajustar depois)
MAX_INPUT_LEN  = 128
MAX_TARGET_LEN = 224   # pode subir p/ 256 se estiver estável

# hiperparâmetros (pensados p/ M3 Pro, com LoRA)
LR        = 3e-4
EPOCHS    = 3
BATCH     = 2          # pequeno p/ caber em MPS
GRAD_ACC  = 8          # acumula gradiente p/ simular batch efetivo 16
WEIGHT_DECAY = 0.01
LR_SCHED     = "cosine"
SEED         = 42
EVAL_STEPS   = 500
SAVE_STEPS   = 500

# device
device = "cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu")
print("Device:", device)

# conferências rápidas dos arquivos
assert os.path.exists(TRAIN_PATH), f"faltou {TRAIN_PATH} (gere no 01_data_prep.ipynb)"
assert os.path.exists(VAL_PATH),   f"faltou {VAL_PATH} (gere no 01_data_prep.ipynb)"


Device: mps


In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer

# carrega diretamente dos JSONL
raw = load_dataset("json", data_files={"train": TRAIN_PATH, "val": VAL_PATH})
print(raw)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 48500
    })
    val: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 1500
    })
})


In [4]:
from transformers import DataCollatorForSeq2Seq

def preprocess(batch):
    # inputs
    model_inputs = tokenizer(
        batch["input_text"],
        max_length=MAX_INPUT_LEN,
        truncation=True,
        padding=False
    )
    # targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            batch["target_text"],
            max_length=MAX_TARGET_LEN,
            truncation=True,
            padding=False
        )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized = raw.map(preprocess, batched=True, remove_columns=raw["train"].column_names)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=None)  # definiremos o modelo já-já
print(tokenized)


Map:   0%|          | 0/48500 [00:00<?, ? examples/s]



Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 48500
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1500
    })
})


In [5]:
import torch
from transformers import AutoModelForSeq2SeqLM
from peft import LoraConfig, TaskType, get_peft_model

# carrega modelo base
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
model.to(device)

# dicas para treino estável em MPS
model.config.use_cache = False               # desliga cache em treino
model.gradient_checkpointing_enable()        # menor memória, um pouco mais lento

# LoRA config (leve e efetivo)
lora_cfg = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=8,                # rank
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q", "v"]  # projecções chaves/valores nos attn blocks do T5
)

model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()  # sanity check


trainable params: 884,736 || all params: 248,462,592 || trainable%: 0.3561


In [6]:
import evaluate
import numpy as np
from transformers import TrainerCallback

rouge = evaluate.load("rouge")
bleu  = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [p.strip() for p in preds]
    labels = [l.strip() for l in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # decodifica
    if isinstance(preds, tuple):
        preds = preds[0]
    preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    preds, labels = postprocess_text(preds, labels)
    rouge_res = rouge.compute(predictions=preds, references=labels, use_aggregator=True)
    bleu_res  = bleu.compute(predictions=preds, references=[[l] for l in labels])

    # foco em rougeL como métrica-chave
    out = {
        "rougeL": rouge_res["rougeL"],
        "bleu": bleu_res["score"]
    }
    return out

class EarlyStopper(TrainerCallback):
    def __init__(self, metric_name="rougeL", patience=2):
        self.metric_name = metric_name
        self.patience = patience
        self.best = None
        self.bad = 0

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        if metrics is None: 
            return
        cur = metrics.get(self.metric_name)
        if cur is None:
            return
        if (self.best is None) or (cur > self.best):
            self.best = cur
            self.bad = 0
        else:
            self.bad += 1
            if self.bad >= self.patience:
                control.should_training_stop = True
        return control


In [10]:
import inspect
from transformers import Trainer, TrainingArguments

OUTPUT_DIR = "outputs/t5_lora_mps"

# base comum (sem args problemáticos)
common_kwargs = dict(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH,
    per_device_eval_batch_size=BATCH,
    gradient_accumulation_steps=GRAD_ACC,
    learning_rate=LR,
    num_train_epochs=EPOCHS,
    weight_decay=WEIGHT_DECAY,
    lr_scheduler_type=LR_SCHED,
    eval_steps=EVAL_STEPS,
    save_strategy="steps",
    save_steps=SAVE_STEPS,
    save_total_limit=2,
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="rougeL",
    greater_is_better=True,
    fp16=(device == "cuda"),   # no MPS, manter False
    bf16=False,
)

# 1) tentar usar Seq2SeqTrainingArguments (tem predict_with_generate em algumas versões)
try:
    from transformers import Seq2SeqTrainingArguments as _TA
except Exception:
    _TA = TrainingArguments

# 2) montar kwargs de forma dinâmica, conforme assinatura da classe
sig = inspect.signature(_TA.__init__)
allowed = set(sig.parameters.keys())

dyn_kwargs = dict(common_kwargs)

# evaluation strategy mudou de nome em algumas versões
if "eval_strategy" in allowed:
    dyn_kwargs["eval_strategy"] = "steps"
elif "evaluation_strategy" in allowed:
    dyn_kwargs["evaluation_strategy"] = "steps"
# se nenhum dos dois existir, seguimos sem estratégia explícita (fazemos eval manual se precisar)

# predict_with_generate existe em Seq2SeqTrainingArguments (algumas versões)
if "predict_with_generate" in allowed:
    dyn_kwargs["predict_with_generate"] = True

# geração: em v5, passe via gen_kwargs no Trainer; guardamos aqui para injetar depois
gen_kwargs = {"max_new_tokens": MAX_TARGET_LEN}

args = _TA(**dyn_kwargs)

# reconstroi o data_collator agora com o modelo definido
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["val"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStopper(metric_name="rougeL", patience=2)]
)

# hack compatível: injeta gen_kwargs para avaliação/predição
setattr(trainer, "_gen_kwargs", gen_kwargs)

trainer


  trainer = Trainer(


<transformers.trainer.Trainer at 0x32ed40ad0>

In [11]:
train_result = trainer.train()
train_result




Step,Training Loss,Validation Loss


RuntimeError: Invalid buffer size: 9.01 GiB

In [12]:
import os, glob, json, pprint

OUTDIR = "outputs/t5_lora_mps"

ckpts = sorted(
    glob.glob(f"{OUTDIR}/checkpoint-*"),
    key=lambda p: int(p.split("-")[-1])
)
print("Checkpoints encontrados:", [os.path.basename(p) for p in ckpts][-5:])
last_ckpt = ckpts[-1] if ckpts else None
print("Último checkpoint:", last_ckpt)


Checkpoints encontrados: []
Último checkpoint: None


In [13]:
STATE = f"{OUTDIR}/trainer_state.json"
if os.path.exists(STATE):
    with open(STATE, "r", encoding="utf-8") as f:
        st = json.load(f)
    print("global_step:", st.get("global_step"))
    print("epoch:", st.get("epoch"))
    print("best_metric (rougeL):", st.get("best_metric"))
    print("best_model_checkpoint:", st.get("best_model_checkpoint"))
    print("\nÚltimos logs:")
    for e in st.get("log_history", [])[-5:]:
        pprint.pprint(e)
else:
    print("trainer_state.json não encontrado")


trainer_state.json não encontrado
