In [1]:
# Import necessary libraries
import pandas as pd
import torch
from transformers import AlbertTokenizer, AlbertForSequenceClassification

# Load the Yelp dataset from a CSV file
df = pd.read_csv("balanced_dataset.csv")

# Rows with a missing review or rating are unusable: the tokenizer rejects
# non-string input such as NaN, and a missing rating cannot become a label.
# Resetting the index keeps row positions at 0..N-1 so that text_data[i] and
# df["rating"][i] always describe the same review.
df = df.dropna(subset=["text", "rating"]).reset_index(drop=True)

# Load the pre-trained Albert tokenizer
tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")

# One 1-D tensor of token ids per review, in the same order as the rows of df
text_data = []

# Iterate through each review in the dataset. astype(str) guards against
# cells that pandas parsed as a non-string type.
for review in df["text"].astype(str):
    tokens = tokenizer.encode_plus(
        review,
        add_special_tokens=True,
        max_length=512,  # longer reviews are cut to 512 tokens
        truncation=True,
        # Only input_ids are kept below, so the attention mask is not requested.
        return_attention_mask=False,
        return_tensors='pt'
    )
    # return_tensors='pt' yields a (1, seq_len) tensor; flatten it to (seq_len,)
    text_data.append(tokens['input_ids'].flatten())

# Create a custom dataset class for Yelp reviews
class YelpDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized reviews with their labels.

    Args:
        text_data: Sequence of token-id sequences (1-D tensors or lists of
            ints), one per review.
        labels: Sequence of integer class labels, one per review. A pandas
            Series is accepted; its values are taken in positional order, so
            its index does not need to be 0..N-1.

    Raises:
        ValueError: If ``text_data`` and ``labels`` differ in length.
    """

    def __init__(self, text_data, labels):
        if len(text_data) != len(labels):
            raise ValueError(
                f"text_data has {len(text_data)} items but labels has {len(labels)}"
            )
        self.text_data = text_data
        # Materialize as a list so __getitem__ indexes by position. Indexing a
        # pandas Series with [] is label-based and raises KeyError as soon as
        # the index has gaps (e.g. after filtering rows).
        self.labels = list(labels)

    def __len__(self):
        """Return the number of reviews."""
        return len(self.text_data)

    def __getitem__(self, idx):
        """Return the ``idx``-th example as a dict of tensors.

        Keys: ``input_ids`` (token ids), ``attention_mask`` (ones, same length
        as ``input_ids``) and ``labels`` (scalar long tensor).
        """
        # as_tensor reuses an existing tensor instead of copying it, which
        # also avoids the UserWarning torch.tensor() emits for tensor input.
        input_ids = torch.as_tensor(self.text_data[idx], dtype=torch.long)
        return {
            'input_ids': input_ids,
            # Every stored token is a real token, so the whole mask is 1.
            'attention_mask': torch.ones_like(input_ids),
            'labels': torch.as_tensor(self.labels[idx], dtype=torch.long)
        }

# Create an instance of the custom dataset class
from torch.nn.utils.rnn import pad_sequence

# The classification head is built with num_labels=5, so class ids must lie in
# 0..4. Shifting by the smallest rating maps a 1-based scale onto 0..4 and
# leaves an already 0-based scale unchanged.
# NOTE(review): assumes ratings are consecutive integers (presumably Yelp
# stars 1-5) - confirm against the CSV.
ratings = df["rating"]
class_ids = (ratings - ratings.min()).astype(int).reset_index(drop=True)
dataset = YelpDataset(text_data, class_ids)


def pad_batch(batch):
    """Collate dataset items into one batch, padding to the longest review.

    Reviews are tokenized to different lengths, so the default collate
    function cannot stack them. input_ids are padded with the tokenizer's pad
    id and attention_mask with 0, so padded positions are masked out.
    """
    return {
        'input_ids': pad_sequence(
            [item['input_ids'] for item in batch],
            batch_first=True,
            padding_value=tokenizer.pad_token_id,
        ),
        'attention_mask': pad_sequence(
            [item['attention_mask'] for item in batch],
            batch_first=True,
            padding_value=0,
        ),
        'labels': torch.stack([item['labels'] for item in batch]),
    }


# Create data loaders for training and evaluation
# NOTE(review): both loaders iterate the same examples, so anything measured
# with eval_loader is not a held-out estimate; split the dataset before
# relying on evaluation numbers.
train_loader = torch.utils.data.DataLoader(
    dataset, batch_size=32, shuffle=True, collate_fn=pad_batch
)
eval_loader = torch.utils.data.DataLoader(
    dataset, batch_size=32, shuffle=False, collate_fn=pad_batch
)

# Import necessary libraries for training
import torch.nn as nn
import torch.optim as optim

# Choose where training runs: CUDA when PyTorch reports it, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pre-trained Albert with a sequence-classification head sized for 5 labels
model = AlbertForSequenceClassification.from_pretrained(
    "albert-base-v2",
    num_labels=5,
)

# Place the model's parameters on the chosen device
model.to(device)

# Adam over every model parameter with a learning rate of 1e-5
optimizer = optim.Adam(model.parameters(), lr=1e-5)

# Cross-entropy loss object; the training loop visible in this file reads the
# loss from the model outputs and does not call this object
criterion = nn.CrossEntropyLoss()

# Fine-tune the Albert model for a fixed number of passes over train_loader
for epoch in range(5):  # Raise this count to train for more epochs
    # Switch the model to training mode and reset the running loss
    model.train()
    total_loss = 0

    for batch in train_loader:
        # Pull the three tensors out of the batch and move each to the device
        input_ids, attention_mask, labels = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        )

        # Clear gradients accumulated by the previous step
        optimizer.zero_grad()

        # Forward pass; the loss is read from the model outputs
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backpropagate and apply the parameter update
        loss.backward()
        optimizer.step()

        # Add this batch's scalar loss to the epoch total
        total_loss += loss.item()

    # Report the mean per-batch loss for the epoch
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")

print("Training completed!")


KeyError: 'review'