### Library import and data loading

In [None]:
! pip install transformers datasets

In [3]:
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizerFast, BertModel
from datasets import load_dataset
from sklearn.model_selection import train_test_split

In [4]:
dataset = load_dataset("yelp_polarity")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/256M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [5]:
# Check if CUDA is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')

Using device: cuda


### Tokenizer and Embedding

In [6]:
# Initialize the BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [7]:
# Tokenize the dataset
def encode_reviews(tokenizer, reviews, max_length=256):
    return tokenizer(reviews, truncation=True, padding='max_length', max_length=max_length, return_tensors='pt')

In [8]:
# Split the training data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(dataset['train']['text'], dataset['train']['label'], test_size=0.1)

In [9]:
# Encode the split datasets
train_encodings = encode_reviews(tokenizer, train_texts)
val_encodings = encode_reviews(tokenizer, val_texts)
test_encodings = encode_reviews(tokenizer, dataset['test']['text'])

### Create PyTorch dataset and dataloaders

In [10]:
# Create the PyTorch dataset
class YelpPolarityDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(int(self.labels[idx]))
        return item

    def __len__(self):
        return len(self.labels)

In [11]:
# Create the dataset objects
train_dataset = YelpPolarityDataset(train_encodings, train_labels)
val_dataset = YelpPolarityDataset(val_encodings, val_labels)
test_dataset = YelpPolarityDataset(test_encodings, dataset['test']['label'])

In [12]:
# Create DataLoader instances
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [13]:
# Define the model
class SentimentClassifier(nn.Module):
    def __init__(self):
        super(SentimentClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.bert.config.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        logits = self.classifier(dropout_output)
        return self.sigmoid(logits)

In [14]:
# Initialize the model and move it to the GPU
model = SentimentClassifier().to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [15]:
# Define loss function and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)

### Data training and validation

In [17]:
# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    train_correct = 0
    train_total = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device).float().unsqueeze(1)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

        # Calculate accuracy
        predictions = torch.round(outputs)
        train_correct += (predictions == labels).sum().item()
        train_total += labels.size(0)

    # Calculate average training loss and accuracy over the epoch
    train_loss /= len(train_loader)
    train_accuracy = train_correct / train_total

    # Validation loop
    model.eval()
    val_loss = 0
    val_total = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device).float().unsqueeze(1)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            predictions = torch.round(outputs)
            val_total += labels.size(0)

    # Calculate average validation loss over the epoch
    val_loss /= len(val_loader)

    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Validation Loss: {val_loss:.4f}')


Epoch 1/10, Train Loss: 0.0780, Train Accuracy: 0.9714, Validation Loss: 0.0884
Epoch 2/10, Train Loss: 0.0518, Train Accuracy: 0.9815, Validation Loss: 0.0889
Epoch 3/10, Train Loss: 0.0364, Train Accuracy: 0.9874, Validation Loss: 0.1066
Epoch 4/10, Train Loss: 0.0277, Train Accuracy: 0.9905, Validation Loss: 0.1237
Epoch 5/10, Train Loss: 0.0229, Train Accuracy: 0.9921, Validation Loss: 0.1269
Epoch 6/10, Train Loss: 0.0201, Train Accuracy: 0.9932, Validation Loss: 0.1200
Epoch 7/10, Train Loss: 0.0186, Train Accuracy: 0.9936, Validation Loss: 0.1363
Epoch 8/10, Train Loss: 0.0174, Train Accuracy: 0.9941, Validation Loss: 0.1367
Epoch 9/10, Train Loss: 0.0166, Train Accuracy: 0.9943, Validation Loss: 0.1351
Epoch 10/10, Train Loss: 0.0153, Train Accuracy: 0.9949, Validation Loss: 0.1512


### Testing

In [18]:
# Testing loop
model.eval()
test_loss = 0
test_correct = 0
test_total = 0
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device).float().unsqueeze(1)

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        test_loss += loss.item()
        predictions = torch.round(outputs)
        test_correct += (predictions == labels).sum().item()
        test_total += labels.size(0)

# Calculate average test loss and accuracy
test_loss /= len(test_loader)
test_accuracy = test_correct / test_total

print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}')

Test Loss: 0.1431, Test Accuracy: 0.9685
