In [11]:
%pip install -q transformers datasets accelerate peft evaluate rouge-score sacrebleu ipywidgets torch

import os, torch
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"
device = "cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu")
print("Device:", device)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.
Device: mps


In [12]:
# --- Model and sequence lengths (in tokens) ---
MODEL_NAME = "google/flan-t5-base"
MAX_INPUT_LEN = 128
MAX_TARGET_LEN = 224

# --- Optimiser / schedule ---
LR = 3e-4
WEIGHT_DECAY = 0.01
LR_SCHED = "cosine"
EPOCHS = 2

# --- Batching and checkpointing ---
BATCH = 1           # per-device batch size; kept at 1 for MPS on a MacBook
GRAD_ACC = 16       # gradient accumulation steps -> effective batch size of 16
SAVE_STEPS = 200    # write a checkpoint every 200 steps so training can be resumed if needed

In [13]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Tokenizer that matches the base checkpoint named in MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# JSON-lines splits; paths are relative to the notebook's working directory.
raw = load_dataset(
    "json",
    data_files={
        "train": "../data/train.jsonl",
        "val": "../data/val.jsonl",
    },
)

def preprocess(batch):
    """Tokenize one batch of examples for seq2seq training.

    Args:
        batch: Mapping with "input_text" and "target_text" entries, as handed
            over by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the inputs (truncated to MAX_INPUT_LEN) with
        an extra "labels" entry holding the target token ids (truncated to
        MAX_TARGET_LEN). No padding is applied here.
    """
    model_inputs = tokenizer(batch["input_text"], max_length=MAX_INPUT_LEN, truncation=True, padding=False)
    # `text_target=` is the supported replacement for the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=batch["target_text"], max_length=MAX_TARGET_LEN, truncation=True, padding=False)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize every split in batches; the original columns (taken from the train
# split) are dropped so only the tokenizer output remains.
original_columns = raw["train"].column_names
tokenized = raw.map(
    preprocess,
    batched=True,
    remove_columns=original_columns,
)
# Displayed as the cell result: (train size, val size).
tuple(len(tokenized[split]) for split in ("train", "val"))


(48500, 1500)

In [14]:
from transformers import AutoModelForSeq2SeqLM
from peft import LoraConfig, TaskType, get_peft_model

# Load the base seq2seq checkpoint and move it to the selected device.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
model = model.to(device)

# The generation cache is switched off and gradient checkpointing switched on
# before the LoRA wrapper is applied.
# NOTE(review): presumably done to reduce memory use on MPS — confirm.
model.config.use_cache = False
model.gradient_checkpointing_enable()

# LoRA adapters on the modules named "q" and "v" only.
lora_cfg = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q", "v"],
)
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()

trainable params: 884,736 || all params: 248,462,592 || trainable%: 0.3561


In [15]:
import inspect
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq

def _accepts(func, param_name):
    """Return True when ``func``'s signature declares a parameter ``param_name``."""
    return param_name in inspect.signature(func).parameters


# Keyword names differ between transformers releases, so each one is probed on
# the installed version instead of being hard-coded.
if _accepts(TrainingArguments.__init__, "eval_strategy"):
    eval_kwargs = {"eval_strategy": "no"}
else:
    eval_kwargs = {"evaluation_strategy": "no"}

if _accepts(Trainer.__init__, "processing_class"):
    tokenizer_kwargs = {"processing_class": tokenizer}
else:
    tokenizer_kwargs = {"tokenizer": tokenizer}

args = TrainingArguments(
    output_dir="../outputs/t5_lora_mps",
    # batching
    per_device_train_batch_size=BATCH,
    gradient_accumulation_steps=GRAD_ACC,
    # optimisation
    learning_rate=LR,
    num_train_epochs=EPOCHS,
    weight_decay=WEIGHT_DECAY,
    lr_scheduler_type=LR_SCHED,
    # checkpointing / logging
    save_strategy="steps",
    save_steps=SAVE_STEPS,
    save_total_limit=3,
    logging_steps=50,
    # mixed precision only when running on CUDA
    fp16=(device == "cuda"),
    bf16=False,
    **eval_kwargs,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    data_collator=data_collator,
    **tokenizer_kwargs,
)
trainer

<transformers.trainer.Trainer at 0x15f606210>

In [17]:
# Set to True to continue from the most recent checkpoint in args.output_dir
# instead of starting over; False (the default here) trains from scratch,
# exactly like calling trainer.train() with no arguments.
RESUME_FROM_CHECKPOINT = False

train_result = trainer.train(resume_from_checkpoint=RESUME_FROM_CHECKPOINT)
train_result




Step,Training Loss
50,3.6519
100,3.6605
150,3.6769
200,3.7224
250,3.7759
300,3.7325
350,3.7425
400,3.7409
450,3.8077
500,3.815


'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 7b84f4bf-98da-499d-8545-ea0d2c921d51)')' thrown while requesting HEAD https://huggingface.co/google/flan-t5-base/resolve/main/config.json
Retrying in 1s [Retry 1/5].
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: eb2d9f6a-de8b-420f-8689-2a04cd45508e)')' thrown while requesting HEAD https://huggingface.co/google/flan-t5-base/resolve/main/config.json
Retrying in 1s [Retry 1/5].
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 4c83466c-119f-499f-8fc1-09230ce39ea1)')' thrown while requesting HEAD https://huggingface.co/google/flan-t5-base/resolve/main/config.json
Retrying in 1s [Retry 1/5].
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: a1

TrainOutput(global_step=6064, training_loss=4.258969286815472, metrics={'train_runtime': 14330.8855, 'train_samples_per_second': 6.769, 'train_steps_per_second': 0.423, 'total_flos': 4365024810854400.0, 'train_loss': 4.258969286815472, 'epoch': 2.0})

In [18]:
from transformers import AutoModelForSeq2SeqLM
from peft import PeftModel
import evaluate, json

# Raw validation examples (plain strings); only the first N are scored.
N = min(1000, len(raw["val"]))
val_subset = raw["val"].select(range(N))
texts = val_subset["input_text"]
refs  = val_subset["target_text"]

# Load the champion checkpoint. The path is kept in one place so the snapshot
# written below always names the checkpoint that was actually evaluated.
CKPT_DIR = "../outputs/t5_lora_mps/checkpoint-6064"
base = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)
model_ckpt = PeftModel.from_pretrained(base, CKPT_DIR).to(device)

# Lightweight generation: one example at a time, greedy decoding (num_beams=1).
preds = []
with torch.inference_mode():
    for x in texts:
        enc = tokenizer(x, return_tensors="pt", truncation=True, max_length=MAX_INPUT_LEN).to(device)
        out = model_ckpt.generate(**enc, max_new_tokens=MAX_TARGET_LEN, num_beams=1)
        preds.append(tokenizer.decode(out[0], skip_special_tokens=True))
        # Release cached MPS memory after every example.
        if device=="mps": torch.mps.empty_cache()

rouge = evaluate.load("rouge"); bleu = evaluate.load("sacrebleu")
r = rouge.compute(predictions=preds, references=refs, use_aggregator=True)
# sacrebleu expects a list of reference lists, hence the one-element wrapping.
b = bleu.compute(predictions=preds, references=[[y] for y in refs])
print("Val(1k) - ROUGE-L:", round(float(r["rougeL"]),4), "| BLEU:", round(float(b["score"]),2))

# Snapshot of the scores, tagged with the evaluated checkpoint's directory name
# (previously a hard-coded "checkpoint-9000" that did not match the loaded model).
import os; os.makedirs("../outputs", exist_ok=True)
with open("../outputs/eval_snapshot.json","w",encoding="utf-8") as f:
    json.dump({"checkpoint":os.path.basename(CKPT_DIR),"rougeL":float(r["rougeL"]),"bleu":float(b["score"])}, f, indent=2)


'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 7978fec8-c984-4ddc-bc1f-eef1082a66ce)')' thrown while requesting HEAD https://huggingface.co/google/flan-t5-base/resolve/main/config.json
Retrying in 1s [Retry 1/5].


Val(1k) - ROUGE-L: 0.1006 | BLEU: 0.66


In [19]:
import json, gc
from transformers import AutoModelForSeq2SeqLM

# Read the JSON-lines baseline file. The context manager closes the handle
# (the previous bare open() never did) and blank lines are skipped so a
# trailing newline cannot break json.loads.
with open("../outputs/baseline_val200.jsonl","r",encoding="utf-8") as f:
    rows = [json.loads(line) for line in f if line.strip()]
# Each row provides an "input" prompt and its "ref" reference text.
inputs = [r["input"] for r in rows]
refs200 = [r["ref"] for r in rows]

def gen_batch(mdl, texts, bs=2):
    """Generate one decoded string per entry of ``texts`` using ``mdl``.

    Args:
        mdl: Object exposing ``generate(**encoded, max_new_tokens=..., num_beams=...)``.
        texts: Sequence of input strings.
        bs: Number of inputs tokenized and generated together per step.

    Returns:
        List of decoded outputs (special tokens skipped), in input order.
    """
    decoded = []
    with torch.inference_mode():
        for start in range(0, len(texts), bs):
            chunk = texts[start:start + bs]
            # Pad within the chunk and truncate to the configured input length.
            encoded = tokenizer(
                chunk,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=MAX_INPUT_LEN,
            )
            encoded = encoded.to(device)
            generated = mdl.generate(**encoded, max_new_tokens=MAX_TARGET_LEN, num_beams=1)
            decoded.extend(tokenizer.batch_decode(generated, skip_special_tokens=True))
            # Release cached MPS memory after every chunk.
            if device == "mps":
                torch.mps.empty_cache()
    return decoded

# Untouched base model: generate on the shared inputs, then drop the model so
# its memory can be reclaimed before the fine-tuned pass.
base_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
base_model = base_model.to(device)
preds_bl = gen_batch(base_model, inputs)
del base_model
gc.collect()

# Fine-tuned model: reuses `model_ckpt`, which must already be loaded in the kernel.
preds_ft = gen_batch(model_ckpt, inputs)

# One JSON object per line: input, both predictions and the reference.
with open("../outputs/compare_baseline_vs_finetuned.jsonl", "w", encoding="utf-8") as f:
    for x, pb, pf, y in zip(inputs, preds_bl, preds_ft, refs200):
        record = {"input": x, "baseline_pred": pb, "finetuned_pred": pf, "ref": y}
        f.write(json.dumps(record, ensure_ascii=False) + "\n")
print("compare salvo em outputs/compare_baseline_vs_finetuned.jsonl")

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: ff5d32f7-2a4a-48fb-8ef3-ddad52da210e)')' thrown while requesting HEAD https://huggingface.co/google/flan-t5-base/resolve/main/config.json
Retrying in 1s [Retry 1/5].


compare salvo em outputs/compare_baseline_vs_finetuned.jsonl


In [20]:
import evaluate, json
# Reload the comparison file. The context manager closes the handle (the
# previous bare open() never did) and blank lines are skipped so a trailing
# newline cannot break json.loads.
with open("../outputs/compare_baseline_vs_finetuned.jsonl","r",encoding="utf-8") as f:
    rows = [json.loads(line) for line in f if line.strip()]
preds_bl = [r["baseline_pred"]  for r in rows]
preds_ft = [r["finetuned_pred"] for r in rows]
refs200  = [r["ref"]            for r in rows]

# Metric objects used by metric() below.
rouge = evaluate.load("rouge"); bleu = evaluate.load("sacrebleu")
def metric(preds, refs):
    """Score ``preds`` against ``refs`` with the loaded ROUGE and BLEU metrics.

    Args:
        preds: Predicted strings.
        refs: Reference strings, aligned with ``preds``.

    Returns:
        Dict with "rougeL" and "bleu" as plain floats.
    """
    rouge_scores = rouge.compute(predictions=preds, references=refs, use_aggregator=True)
    # The BLEU metric is given one single-element reference list per prediction.
    wrapped_refs = [[ref] for ref in refs]
    bleu_scores = bleu.compute(predictions=preds, references=wrapped_refs)
    return dict(
        rougeL=float(rouge_scores["rougeL"]),
        bleu=float(bleu_scores["score"]),
    )

# Print both systems' scores rounded to four decimals, baseline first.
for _label, _system_preds in (("Baseline:", preds_bl), ("Fine-tuned:", preds_ft)):
    _scores = metric(_system_preds, refs200)
    print(_label, {name: round(value, 4) for name, value in _scores.items()})

Baseline: {'rougeL': 0.1223, 'bleu': 0.0003}
Fine-tuned: {'rougeL': 0.1123, 'bleu': 0.9039}


In [21]:
import os
# Export the evaluated PEFT model and the tokenizer into the same folder so
# the artifact directory holds both.
best_dir = "../artifacts/t5_lora_best"
os.makedirs(best_dir, exist_ok=True)
for artifact in (model_ckpt, tokenizer):
    artifact.save_pretrained(best_dir)
print("Modelo final salvo em artifacts/t5_lora_best")

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: d016142a-8321-41aa-a798-fb5e172274ad)')' thrown while requesting HEAD https://huggingface.co/google/flan-t5-base/resolve/main/config.json
Retrying in 1s [Retry 1/5].


Modelo final salvo em artifacts/t5_lora_best
