In [1]:
# 🧩 Instala pacotes necessários
!pip install -U torch torchvision torchaudio transformers datasets

# ✅ Importações
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    Trainer, TrainingArguments,
    DataCollatorForLanguageModeling
)
import torch
from datasets import load_dataset
from google.colab import drive

# ✅ Monta o Google Drive
drive.mount('/content/drive')

# ✅ Define modelo base (confirme o nome e acesso ao repositório)
model_id = "rubuntu/Phi-3.5-mini-instruct-Jopara-V3"

# ✅ Carrega o dataset do Hugging Face
dataset = load_dataset("thinkPy/dataset-cultura-guarani_mistral-dpo")["train"]

# ✅ Formata os dados no estilo Instruct (usa prompt_used + base_answer)
def format_example(example):
    # Aqui, junta prompt e resposta no texto para o modelo aprender
    # Adiciona um espaço no meio para separar bem os textos
    return {"text": example["prompt_used"].strip() + "\n" + example["base_answer"].strip()}

formatted_dataset = dataset.map(format_example, remove_columns=dataset.column_names)

# ✅ Carrega tokenizer e modelo
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,   # Use float16 (fp16) para Colab comum, não bf16
    device_map="auto"
)

# ✅ Tokenização com padding e truncation
tokenized_dataset = formatted_dataset.map(
    lambda x: tokenizer(
        x["text"],
        padding="max_length",
        truncation=True,
        max_length=512
    ),
    batched=True,
    remove_columns=["text"]
)

# ✅ Data Collator para LM causal
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# ✅ Argumentos de treino
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Doutorado Unesp/assistente-guarani/data/modelo_finetunado",
    per_device_train_batch_size=1,
    num_train_epochs=2,
    max_steps=750,
    gradient_accumulation_steps=4,
    warmup_steps=20,
    weight_decay=0.01,
    logging_steps=10,
    save_steps=700,
    save_total_limit=2,
    bf16=False,   # Desativado, pois Colab não suporta bf16
    fp16=False,    # Ativado para acelerar se sua GPU for compatível (ex: T4)
    report_to="none",
    overwrite_output_dir=True
)

# ✅ Inicializa o Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)

# ✅ Inicia o treino
trainer.train()

# ✅ Salva o modelo e tokenizer no Drive
output_path = "/content/drive/MyDrive/Doutorado Unesp/assistente-guarani/data/modelo_finetunado"
model.save_pretrained(output_path)
tokenizer.save_pretrained(output_path)
print(f"✅ Modelo salvo em: {output_path}")


Collecting torch
  Downloading torch-2.7.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting torchvision
  Downloading torchvision-0.22.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading torchaudio-2.7.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.6 kB)
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.6.77 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.6.77 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.6.77-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.6.80 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.6.80-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/575 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/386k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1373 [00:00<?, ? examples/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.65G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/164 [00:00<?, ?B/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Step,Training Loss
10,551.8617
20,0.0
30,0.0
40,0.0
50,0.0
60,0.0
70,0.0
80,0.0
90,0.0
100,0.0


KeyboardInterrupt: 

In [1]:
# 🔍 DIAGNÓSTICO COMPLETO DO PROBLEMA

# Primeiro, vamos parar o treinamento atual se estiver rodando
# trainer.save_model()  # Salva o progresso atual

# ✅ Importações para diagnóstico
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import numpy as np

# ✅ Recarrega modelo e tokenizer
model_id = "rubuntu/Phi-3.5-mini-instruct-Jopara-V3"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto"
)

# ✅ DIAGNÓSTICO 1: Verifica configuração do tokenizer
print("🔍 DIAGNÓSTICO 1: Tokenizer")
print(f"- Vocab size: {tokenizer.vocab_size}")
print(f"- Pad token: {tokenizer.pad_token}")
print(f"- Pad token ID: {tokenizer.pad_token_id}")
print(f"- EOS token: {tokenizer.eos_token}")
print(f"- EOS token ID: {tokenizer.eos_token_id}")

# 🔧 CORREÇÃO: Configura pad_token corretamente
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
    print("✅ pad_token configurado como eos_token")

# ✅ DIAGNÓSTICO 2: Verifica o dataset
dataset = load_dataset("thinkPy/dataset-cultura-guarani_mistral-dpo")["train"]
print(f"\n🔍 DIAGNÓSTICO 2: Dataset")
print(f"- Número de exemplos: {len(dataset)}")
print(f"- Colunas disponíveis: {dataset.column_names}")

# Mostra exemplo do dataset
exemplo = dataset[0]
print(f"\n📝 Exemplo do dataset:")
for key, value in exemplo.items():
    print(f"- {key}: {str(value)[:100]}...")

# ✅ DIAGNÓSTICO 3: Testa tokenização
print(f"\n🔍 DIAGNÓSTICO 3: Tokenização")

# Formato CORRETO para treinamento
def format_example_correto(example):
    """Formato correto para treinamento instruction-following"""
    prompt = example["prompt_used"].strip()
    answer = example["base_answer"].strip()

    # Formato mais limpo
    return {"text": f"### Instrução:\n{prompt}\n\n### Resposta:\n{answer}"}

# Testa com um exemplo
exemplo_formatado = format_example_correto(dataset[0])
print(f"Texto formatado: {exemplo_formatado['text'][:200]}...")

# Tokeniza o exemplo
tokens = tokenizer(
    exemplo_formatado["text"],
    padding="max_length",
    truncation=True,
    max_length=512,
    return_tensors="pt"
)

print(f"- Shape dos tokens: {tokens['input_ids'].shape}")
print(f"- Tokens únicos: {len(torch.unique(tokens['input_ids']))}")
print(f"- Primeiros tokens: {tokens['input_ids'][0][:10]}")
print(f"- Último token: {tokens['input_ids'][0][-1]}")

# ✅ DIAGNÓSTICO 4: Testa o modelo
print(f"\n🔍 DIAGNÓSTICO 4: Modelo")

# Coloca modelo em modo de treino
model.train()

# Testa forward pass
with torch.no_grad():
    outputs = model(**tokens, labels=tokens["input_ids"])

print(f"- Loss: {outputs.loss}")
print(f"- Loss é NaN: {torch.isnan(outputs.loss)}")
print(f"- Loss é infinito: {torch.isinf(outputs.loss)}")
print(f"- Logits shape: {outputs.logits.shape}")

# ✅ CÓDIGO CORRIGIDO PARA TREINAMENTO
print(f"\n🛠️ PREPARANDO TREINAMENTO CORRIGIDO...")

# Reprocessa dataset com formato correto
formatted_dataset = dataset.map(format_example_correto, remove_columns=dataset.column_names)

# Função de tokenização corrigida
def tokenize_function_corrigida(examples):
    # Tokeniza com configurações corretas
    result = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors=None  # Importante: None para datasets
    )

    # Cria labels igual aos input_ids para language modeling
    result["labels"] = result["input_ids"].copy()

    return result

# Tokeniza dataset
tokenized_dataset = formatted_dataset.map(
    tokenize_function_corrigida,
    batched=True,
    remove_columns=["text"]
)

# Divide em treino e validação
train_test_split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

print(f"✅ Dataset processado:")
print(f"- Treino: {len(train_dataset)} exemplos")
print(f"- Validação: {len(eval_dataset)} exemplos")

# ✅ NOVA CONFIGURAÇÃO DE TREINAMENTO
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Data collator corrigido
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # Não é masked language modeling
    pad_to_multiple_of=8  # Otimização para GPU
)

# Argumentos corrigidos
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Doutorado Unesp/assistente-guarani/data/modelo_finetunado",
    per_device_train_batch_size=1,  # Menor para evitar problemas
    per_device_eval_batch_size=1,
    num_train_epochs=2,
    max_steps=500,  # Reduzido para teste
    gradient_accumulation_steps=8,  # Aumentado para compensar batch menor
    warmup_steps=50,
    weight_decay=0.01,
    logging_steps=5,  # Log mais frequente
    eval_steps=50,
    save_steps=250,
    save_total_limit=2,
    bf16=False,
    fp16=True,
    report_to="none",
    overwrite_output_dir=True,
    evaluation_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    learning_rate=5e-5,  # Learning rate maior
    lr_scheduler_type="linear",
    dataloader_num_workers=0,  # Sem workers paralelos
    dataloader_pin_memory=False,
    gradient_checkpointing=True,  # Economiza memória
    remove_unused_columns=False,
)

# Trainer corrigido
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

print(f"\n🚀 PRONTO PARA TREINAR!")
print(f"Execute: trainer.train()")

# ✅ TESTE RÁPIDO antes do treinamento
print(f"\n🧪 TESTE RÁPIDO:")
sample_batch = next(iter(trainer.get_train_dataloader()))
print(f"- Batch shape: {sample_batch['input_ids'].shape}")
print(f"- Labels shape: {sample_batch['labels'].shape}")

# Teste forward pass
model.eval()
with torch.no_grad():
    outputs = model(**sample_batch)
    print(f"- Loss do batch: {outputs.loss}")

if outputs.loss > 0:
    print("✅ Loss está correto! Pode treinar.")
else:
    print("❌ Loss ainda está em 0. Verifique os dados.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.65G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/164 [00:00<?, ?B/s]

🔍 DIAGNÓSTICO 1: Tokenizer
- Vocab size: 32000
- Pad token: <|placeholder6|>
- Pad token ID: 32009
- EOS token: <|endoftext|>
- EOS token ID: 32000


Downloading readme:   0%|          | 0.00/575 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]


Downloading data:   0%|          | 0.00/386k [00:00<?, ?B/s][A
Downloading data: 100%|██████████| 386k/386k [00:00<00:00, 688kB/s]


Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/1373 [00:00<?, ? examples/s]

NotImplementedError: Loading a dataset cached in a LocalFileSystem is not supported.