In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import os
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from google.colab import drive
from torch.nn import CrossEntropyLoss

drive.mount('/content/drive')

model_id = "Emilio407/guarani-jopara-gemma-2-2b-it-v1"

# Lê JSON e divide em treino/validação
df = pd.read_json("/content/drive/MyDrive/Doutorado Unesp/assistente-guarani/data/dados_treinamento_guarani.json")

# Divide os dados em treino e validação (80/20)
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Converte para Dataset Hugging Face
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Carrega tokenizer e modelo
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_id)  # <- AQUI ESTÁ SEU MODEL


Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/945 [00:00<?, ?B/s]

pytorch_model.bin.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/241M [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/209 [00:00<?, ?B/s]

In [2]:
# Função para tokenização
def tokenize(example):
    prompt = example["instruction"] + "\n" + example["input"] + "\n"
    target = example["output"]
    full_text = prompt + target

    tokenized = tokenizer(
        full_text,
        truncation=True,
        padding="max_length",
        max_length=256
    )

    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

# Aplica tokenização
tokenized_train = train_dataset.map(tokenize, remove_columns=train_dataset.column_names)
tokenized_val = val_dataset.map(tokenize, remove_columns=val_dataset.column_names)

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

Map:   0%|          | 0/224 [00:00<?, ? examples/s]

Map:   0%|          | 0/57 [00:00<?, ? examples/s]

In [3]:
# Função para calcular métricas
def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    # Calcula perplexity
    shift_logits = predictions[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous()

    # Flatten para cálculo da loss
    shift_logits = shift_logits.view(-1, shift_logits.size(-1))
    shift_labels = shift_labels.view(-1)

    # Remove tokens de padding (-100)
    mask = shift_labels != -100
    shift_logits = shift_logits[mask]
    shift_labels = shift_labels[mask]

    # Calcula cross entropy loss
    loss_fct = CrossEntropyLoss()
    loss = loss_fct(shift_logits, shift_labels)

    # Perplexity = exp(loss)
    perplexity = torch.exp(loss)

    return {
        "eval_loss": loss.item(),
        "perplexity": perplexity.item()
    }

# Argumentos de treino com avaliação
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Doutorado Unesp/assistente-guarani/data/modelo_finetunado",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    eval_strategy="steps",  # Avalia a cada X steps
    eval_steps=50,          # Avalia a cada 50 steps
    save_steps=50,
    save_total_limit=2,
    fp16=False,
    logging_dir="logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to=[]
)


In [4]:
# Trainer com dataset de validação
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,  # Adiciona dataset de validação
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # Adiciona função de métricas
)

# Treina o modelo
trainer.train()

# Salva o modelo
output_dir = "/content/drive/MyDrive/Doutorado Unesp/assistente-guarani/data/modelo_finetunado"
trainer.save_model(output_dir)

print("Treinamento concluído!")

  trainer = Trainer(
It is strongly recommended to train Gemma2 models with the `eager` attention implementation instead of `sdpa`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.


OutOfMemoryError: CUDA out of memory. Tried to allocate 82.00 MiB. GPU 0 has a total capacity of 39.56 GiB of which 56.88 MiB is free. Process 25644 has 39.49 GiB memory in use. Of the allocated memory 38.52 GiB is allocated by PyTorch, and 488.27 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# ============= AVALIAÇÃO QUALITATIVA =============

# Carrega o modelo treinado
model_path = "/content/drive/MyDrive/Doutorado Unesp/assistente-guarani/data/modelo_finetunado"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)
model.eval()

# Função para testar múltiplas perguntas
def avaliar_modelo(perguntas_teste):
    resultados = []

    for pergunta in perguntas_teste:
        inputs = tokenizer(pergunta, return_tensors="pt")

        with torch.no_grad():
            outputs = model.generate(
                inputs['input_ids'],
                max_length=100,
                do_sample=True,
                temperature=0.7,
                num_return_sequences=1,
                pad_token_id=tokenizer.eos_token_id
            )

        resposta = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Remove a pergunta da resposta para mostrar só a geração
        resposta_limpa = resposta[len(pergunta):].strip()

        resultados.append({
            'pergunta': pergunta,
            'resposta': resposta_limpa
        })

        print(f"Pergunta: {pergunta}")
        print(f"Resposta: {resposta_limpa}")
        print("-" * 50)

    return resultados

# Perguntas de teste em Guarani
perguntas_teste = [
    "Como se diz 'gavião' em Guarani?",
    "Como se diz 'água' em Guarani?",
    "Como se diz 'casa' em Guarani?",
    "Como se diz 'sol' em Guarani?",
    "Como se diz 'lua' em Guarani?"
]

print("=== AVALIAÇÃO DO MODELO ===")
resultados = avaliar_modelo(perguntas_teste)

# ============= MÉTRICAS QUANTITATIVAS =============

def calcular_perplexity_dataset(model, tokenizer, dataset, max_samples=100):
    """Calcula perplexity em um conjunto de dados"""
    model.eval()
    total_loss = 0
    total_tokens = 0

    # Pega apenas uma amostra se o dataset for muito grande
    if len(dataset) > max_samples:
        indices = np.random.choice(len(dataset), max_samples, replace=False)
        sample_dataset = dataset.select(indices)
    else:
        sample_dataset = dataset

    with torch.no_grad():
        for example in sample_dataset:
            input_ids = torch.tensor([example['input_ids']])
            labels = torch.tensor([example['labels']])

            outputs = model(input_ids, labels=labels)
            loss = outputs.loss

            # Conta apenas tokens que não são padding
            mask = labels != -100
            num_tokens = mask.sum().item()

            total_loss += loss.item() * num_tokens
            total_tokens += num_tokens

    avg_loss = total_loss / total_tokens
    perplexity = np.exp(avg_loss)

    return perplexity

# Calcula perplexity no conjunto de validação
perplexity_val = calcular_perplexity_dataset(model, tokenizer, tokenized_val)
print(f"\nPerplexity no conjunto de validação: {perplexity_val:.2f}")

# ============= COMPARAÇÃO COM MODELO BASE =============

print("\n=== COMPARAÇÃO COM MODELO BASE ===")

# Carrega modelo base para comparação
model_base = AutoModelForCausalLM.from_pretrained(model_id)
model_base.eval()

perplexity_base = calcular_perplexity_dataset(model_base, tokenizer, tokenized_val)
print(f"Perplexity modelo base: {perplexity_base:.2f}")
print(f"Perplexity modelo fine-tuned: {perplexity_val:.2f}")

if perplexity_val < perplexity_base:
    print("✅ O fine-tuning MELHOROU o modelo!")
    print(f"Redução na perplexity: {perplexity_base - perplexity_val:.2f}")
else:
    print("⚠️ O fine-tuning pode não ter melhorado o modelo.")
    print("Considere ajustar os hiperparâmetros.")

# Testa ambos os modelos na mesma pergunta
pergunta_teste = "Como se diz 'gavião' em Guarani?"
print(f"\n=== TESTE COMPARATIVO: '{pergunta_teste}' ===")

# Modelo base
inputs = tokenizer(pergunta_teste, return_tensors="pt")
with torch.no_grad():
    outputs_base = model_base.generate(
        inputs['input_ids'],
        max_length=100,
        do_sample=False,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id
    )

resposta_base = tokenizer.decode(outputs_base[0], skip_special_tokens=True)
resposta_base_limpa = resposta_base[len(pergunta_teste):].strip()

# Modelo fine-tuned
with torch.no_grad():
    outputs_ft = model.generate(
        inputs['input_ids'],
        max_length=100,
        do_sample=False,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id
    )

resposta_ft = tokenizer.decode(outputs_ft[0], skip_special_tokens=True)
resposta_ft_limpa = resposta_ft[len(pergunta_teste):].strip()

print(f"Modelo Base: {resposta_base_limpa}")
print(f"Modelo Fine-tuned: {resposta_ft_limpa}")