In [7]:
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertModel
import torch
import pandas as pd

# Load the pretrained encoder and its matching tokenizer.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Read the recipes table; only the "Instructions" column is used below.
recipes_raw = pd.read_csv("../data/13k-recipes.csv")  # replace with the path to your file

# Drop missing values on the instruction column only. A bare DataFrame.dropna()
# would also discard rows whose instructions are fine but that have a missing
# value in some unrelated column. Selecting the column first also keeps `data`
# from being rebound to three different kinds of object.
data = recipes_raw["Instructions"].dropna().tolist()

# Train on the first 5000 instruction texts.
sentences = data[:5000]

# Build duplicated (positive) pairs: both views of an item come from the same text.
class ContrastiveDataset(torch.utils.data.Dataset):
    """Dataset yielding two identical tokenized views of each sentence.

    Args:
        sentences: Indexable, sized collection of raw text strings.
        text_tokenizer: Optional callable used to encode one text. It is called
            as ``text_tokenizer(text, return_tensors="pt", padding="max_length",
            max_length=..., truncation=True)`` and must return a mapping of
            tensors. When omitted, the notebook-level ``tokenizer`` is looked up
            each time an item is accessed.
        max_length: Length every encoding is padded/truncated to (default 128).
    """

    def __init__(self, sentences, text_tokenizer=None, max_length=128):
        self.sentences = sentences
        self.text_tokenizer = text_tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of sentences (one positive pair per sentence)."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return ``(inputs_a, inputs_b)``: two equal encodings of sentence ``idx``.

        ``inputs_a`` is whatever the tokenizer returned; ``inputs_b`` is a plain
        dict holding independent copies of the same tensors.
        """
        text = self.sentences[idx]
        # Resolve the fallback lazily so a tokenizer rebound later in the
        # notebook is still picked up, as with the original global lookup.
        encode = self.text_tokenizer if self.text_tokenizer is not None else tokenizer
        # Both views are the same text encoded with the same arguments, so
        # tokenize once instead of paying for the identical call twice.
        inputs_a = encode(
            text,
            return_tensors="pt",
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
        )
        # Clone so the two views never alias the same tensor storage.
        inputs_b = {name: tensor.clone() for name, tensor in inputs_a.items()}
        return inputs_a, inputs_b

# Wrap the selected sentences as positive pairs and serve them in
# shuffled mini-batches of 16 pairs.
train_dataset = ContrastiveDataset(sentences)
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=16,
    shuffle=True,
)

In [8]:
# Display the first instruction string selected for training.
sentences[0]

'Pat chicken dry with paper towels, season all over with 2 tsp. salt, and tie legs together with kitchen twine. Let sit at room temperature 1 hour.\nMeanwhile, halve squash and scoop out seeds. Run a vegetable peeler along ridges of squash halves to remove skin. Cut each half into ½"-thick wedges; arrange on a rimmed baking sheet.\nCombine sage, rosemary, and 6 Tbsp. melted butter in a large bowl; pour half of mixture over squash on baking sheet. Sprinkle squash with allspice, red pepper flakes, and ½ tsp. salt and season with black pepper; toss to coat.\nAdd bread, apples, oil, and ¼ tsp. salt to remaining herb butter in bowl; season with black pepper and toss to combine. Set aside.\nPlace onion and vinegar in a small bowl; season with salt and toss to coat. Let sit, tossing occasionally, until ready to serve.\nPlace a rack in middle and lower third of oven; preheat to 425°F. Mix miso and 3 Tbsp. room-temperature butter in a small bowl until smooth. Pat chicken dry with paper towels, 

In [9]:
import torch.nn.functional as F

def contrastive_loss(embeddings_a, embeddings_b, similarity_target=1.0):
    """Mean squared gap between row-wise cosine similarity and a target value.

    Args:
        embeddings_a: Tensor of embeddings for the first view.
        embeddings_b: Tensor of embeddings for the second view, compared with
            ``embeddings_a`` along dimension 1.
        similarity_target: Value each cosine similarity is regressed toward
            (default 1.0, i.e. "these two embeddings should point the same way").

    Returns:
        Scalar tensor: the mean of the squared differences between each
        cosine similarity and ``similarity_target``.
    """
    pair_similarity = F.cosine_similarity(embeddings_a, embeddings_b, dim=1)
    # Target tensor with the same shape/dtype/device as the similarities.
    desired = similarity_target * torch.ones_like(pair_similarity)
    return F.mse_loss(pair_similarity, desired, reduction="mean")

In [None]:
# transformers.AdamW is deprecated (and absent from recent transformers
# releases); torch.optim.AdamW is the maintained replacement.
from torch.optim import AdamW
from tqdm import tqdm
import time


def _to_model_inputs(encoded, device):
    """Drop the per-example singleton axis and move every tensor to `device`.

    Each dataset item is tokenized with return_tensors="pt", so after batching
    the tensors carry an extra axis of size 1 at position 1. Only that axis is
    squeezed: a bare .squeeze() would also remove the batch axis whenever a
    batch contains a single example.
    """
    return {name: tensor.squeeze(1).to(device) for name, tensor in encoded.items()}


def _masked_mean_pool(last_hidden_state, attention_mask):
    """Average token embeddings over non-padding positions only.

    Inputs are padded to a fixed length, so an unmasked mean over the sequence
    axis would mix padding-token states into the sentence embedding.
    """
    mask = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)
    token_sum = (last_hidden_state * mask).sum(dim=1)
    # Guard against division by zero for a row with no real tokens.
    token_count = mask.sum(dim=1).clamp(min=1.0)
    return token_sum / token_count


# eps / weight_decay are set explicitly to mirror what I believe were the
# defaults of the deprecated transformers implementation (eps=1e-6, no weight
# decay); torch's own defaults differ. TODO confirm against the version used.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
num_epochs = 3

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
model.train()

# NOTE(review): both views of every pair are tokenized from the same text, so
# outputs_a and outputs_b can differ only through dropout, and there are no
# negative pairs. Pulling cosine similarity toward 1 is therefore satisfied
# almost from the start (the logged average loss is 0.0000 in every epoch),
# which means this objective gives very little training signal. Consider a
# loss with in-batch negatives (InfoNCE / SimCSE style).
for epoch in range(num_epochs):
    # One progress bar per epoch.
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}", leave=False)
    epoch_loss = 0.0
    for batch_idx, batch in enumerate(progress_bar):
        start_time = time.perf_counter()
        inputs_a, inputs_b = batch
        inputs_a = _to_model_inputs(inputs_a, device)
        inputs_b = _to_model_inputs(inputs_b, device)

        # Sentence embeddings: mask-aware mean over the token axis.
        # Assumes the tokenizer output includes "attention_mask" (the
        # BertTokenizer default) - verify if the tokenizer is swapped.
        outputs_a = _masked_mean_pool(
            model(**inputs_a).last_hidden_state, inputs_a["attention_mask"]
        )
        outputs_b = _masked_mean_pool(
            model(**inputs_b).last_hidden_state, inputs_b["attention_mask"]
        )

        # Compute the loss; read the Python float once to avoid a second
        # device synchronisation for the progress bar.
        loss = contrastive_loss(outputs_a, outputs_b)
        batch_loss = loss.item()
        epoch_loss += batch_loss

        # Clear gradients before backward so a re-run after an interrupted
        # cell never accumulates stale gradients, then update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Show the batch loss, the running mean loss and the time per batch.
        progress_bar.set_postfix({
            "batch_loss": batch_loss,
            "avg_loss": epoch_loss / (batch_idx + 1),
            "batch_time": f"{time.perf_counter() - start_time:.2f}s"
        })

    # Report the mean loss for the epoch (guarding against an empty loader).
    print(f"Epoch {epoch + 1} completed. Average Loss: {epoch_loss / max(len(train_dataloader), 1):.4f}")

Epoch 1/3:   0%|          | 0/313 [00:00<?, ?it/s]

                                                                                                                        

Epoch 1 completed. Average Loss: 0.0000


                                                                                                                        

Epoch 2 completed. Average Loss: 0.0000


                                                                                                                       

Epoch 3 completed. Average Loss: 0.0000




In [11]:
# Persist the fine-tuned model under ../models/finetuned-bert.
model.save_pretrained("../models/finetuned-bert")
# Persist the tokenizer files in a separate directory. As the cell's last
# expression, the returned tuple of written file paths is what gets displayed.
tokenizer.save_pretrained("../models/finetuned-tokenizer")

('../models/finetuned-tokenizer\\tokenizer_config.json',
 '../models/finetuned-tokenizer\\special_tokens_map.json',
 '../models/finetuned-tokenizer\\vocab.txt',
 '../models/finetuned-tokenizer\\added_tokens.json')