Loading Preprocessed Train, Validation, and Test Data

In [101]:
import pandas as pd

# Read the three preprocessed splits in one pass; the paths are listed in
# train / validation / test order and unpacked into matching names.
train, val, test = map(
    pd.read_csv,
    (
        'preprocessed/train.csv',
        'preprocessed/val.csv',
        'preprocessed/test.csv',
    ),
)

In [102]:
# Bare expression: the notebook renders the training frame as this cell's output
train

Unnamed: 0,review,sentiment
0,work librari expect like movi came 5 year ago ...,1
1,eagl wing pleasant surpris movi keep viewer in...,1
2,new york love collect work eleven short film s...,1
3,saw movi yesterday night one best made tv film...,1
4,playwright sidney bruhl wonder overthetop mich...,1
...,...,...
34702,love movi tv program record come nov 2nd reall...,1
34703,big jim carey fan took seat cinema optim fun d...,0
34704,even 6000 buck cast parttim actor christoph no...,1
34705,one best movi ive ever seen good act hank newm...,1


Converting a Dataset into a PyTorch Dataset for Sentiment Analysis using DistilBERT

In [67]:

import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Load the tokenizer of the "distilbert-base-uncased" checkpoint.
# It is kept as a module-level name because SentimentDataset.__getitem__
# looks it up globally when encoding each review.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Define a custom PyTorch dataset for sentiment analysis
class SentimentDataset(Dataset):
    """Map-style dataset that turns (review, label) pairs into DistilBERT inputs.

    Each item is a dict with:
      * ``"input_ids"``      -- long tensor of shape ``(1, max_length)``
      * ``"attention_mask"`` -- long tensor of shape ``(1, max_length)``;
        1 for real tokens, 0 for padding
      * ``"labels"``         -- 0-dim long tensor holding the class id

    The leading singleton dimension is kept on purpose: the training loop
    removes it again with ``.squeeze(1)`` after batching.

    Args:
        reviews: Iterable of review strings (list, pandas Series, ...).
        labels: Iterable of integer class ids, same length as ``reviews``.
        max_length: Fixed sequence length every review is truncated/padded to.
        tokenizer: Object exposing ``encode(text, add_special_tokens=...,
            truncation=..., max_length=...)``. When ``None`` (default) the
            module-level ``tokenizer`` is looked up at item-access time.

    Raises:
        ValueError: If ``reviews`` and ``labels`` differ in length or
            ``max_length`` is smaller than 1.
    """

    def __init__(self, reviews, labels, max_length=128, tokenizer=None):
        # Materialise as plain lists so that ``[idx]`` is always positional.
        # Indexing a pandas Series with an int is label-based and raises
        # KeyError as soon as the index is not exactly 0..n-1.
        reviews = list(reviews)
        labels = list(labels)
        if len(reviews) != len(labels):
            raise ValueError(
                "reviews and labels must have the same length "
                "(got {} and {})".format(len(reviews), len(labels))
            )
        if max_length < 1:
            raise ValueError("max_length must be >= 1 (got {})".format(max_length))

        self.reviews = reviews
        self.labels = labels
        self.max_length = max_length
        self.tokenizer = tokenizer

    def __len__(self):
        # Number of (review, label) pairs
        return len(self.reviews)

    def __getitem__(self, idx):
        review = self.reviews[idx]
        label = self.labels[idx]

        # Fall back to the module-level tokenizer when none was injected.
        tok = self.tokenizer if self.tokenizer is not None else tokenizer

        # Ask the tokenizer itself to truncate so that the closing special
        # token survives. Without truncation an over-long review produced a
        # negative pad length, and F.pad then silently cropped the tail.
        token_ids = tok.encode(
            review,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_length,
        )
        # Safety net in case the tokenizer did not honour ``max_length``.
        token_ids = list(token_ids)[: self.max_length]

        n_real = len(token_ids)
        n_pad = self.max_length - n_real

        pad_id = getattr(tok, "pad_token_id", None)
        if pad_id is None:
            pad_id = 0

        input_ids = torch.tensor(token_ids + [pad_id] * n_pad, dtype=torch.long)
        # Build the mask from the real token count rather than from
        # ``input_ids != 0`` so it does not depend on the pad id being 0.
        attention_mask = torch.tensor([1] * n_real + [0] * n_pad, dtype=torch.long)

        return {
            "input_ids": input_ids.unsqueeze(0),
            "attention_mask": attention_mask.unsqueeze(0),
            "labels": torch.tensor(label, dtype=torch.long),
        }

In [86]:

# Wrap each split (train, validation, test -- in that order) in a
# SentimentDataset built from its "review" and "sentiment" columns.
train_dataset, val_dataset, test_dataset = (
    SentimentDataset(split["review"], split["sentiment"])
    for split in (train, val, test)
)

In [69]:
# Number of passes over the training data; read by both the learning-rate
# scheduler setup and the training loop.
epochs = 5

Loading the Pre-trained DistilBERT Model and Defining the Loss Function, Optimizer, and Learning Rate Scheduler


In [70]:

# Load the pre-trained DistilBERT weights; the classification head on top is
# newly initialised (see the warning printed below) and is learned here.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

# Loss applied to the raw logits; the training loop feeds it one-hot float targets
loss_fn = torch.nn.BCEWithLogitsLoss()

# Define the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

# The scheduler is stepped once per *batch*, so its horizon must be counted in
# optimizer steps, not in samples. Using len(train_dataset) * epochs made the
# schedule 16x too long, so training only ever saw the start of the cycle.
# NOTE: re-create the scheduler (re-run this cell) before restarting training;
# OneCycleLR raises once it is stepped more than total_steps times.
train_batch_size = 16  # keep in sync with the batch_size given to the train DataLoader
steps_per_epoch = (len(train_dataset) + train_batch_size - 1) // train_batch_size  # ceil division
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=2e-5,
    total_steps=steps_per_epoch * epochs,
)

# Extra criterion kept for backward compatibility; the training loop does not
# use it. NOTE(review): BCELoss expects probabilities in [0, 1], so it must not
# be applied to raw logits.
criterion = torch.nn.BCELoss()


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classi

Defining the Train and Validation Data Loaders and Specifying the Device for Training

In [71]:
# Batch the training split, 16 samples at a time, with shuffling enabled
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=16)

# Batch the validation split, 16 samples at a time, in its stored order
val_loader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=16)

# Use CUDA when PyTorch reports an available GPU; otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Place the model on the selected device
model = model.to(device)

Training the DistilBERT Model for Sentiment Analysis

In [103]:
import numpy as np
import torch.nn.functional as F
import time

# Loop over the number of epochs
for epoch in range(epochs):
    # Print the current epoch number
    print("Epoch: {}/{}".format(epoch+1, epochs))

    # Running totals for this epoch. Losses are accumulated as *sums over
    # samples* (batch mean * batch size) so that dividing by the sample count
    # at the end yields a true per-sample average.
    train_loss, val_loss = 0.0, 0.0
    train_acc, val_acc = 0, 0
    train_seen, val_seen = 0, 0

    # Record the start time of the epoch (used for the ETA in the progress line)
    start_time = time.time()

    # ---- Training pass ----
    model.train()
    for i, batch in enumerate(train_loader):
        # The dataset returns (1, seq_len) tensors, so batching gives
        # (batch, 1, seq_len); drop the singleton dimension before the model.
        input_ids = batch["input_ids"].squeeze(1).to(device)
        attention_mask = batch["attention_mask"].squeeze(1).to(device)
        labels = batch["labels"].to(device)
        n_in_batch = labels.size(0)  # the last batch may be smaller than 16

        # BCEWithLogitsLoss needs float targets shaped like the logits
        labels_one_hot = F.one_hot(labels, num_classes=2).float()

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass: the first element of the model output holds the raw logits
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs[0]

        # Compute loss using raw logits and one-hot encoded labels
        loss = loss_fn(logits, labels_one_hot)

        # Backward pass, parameter update, then advance the per-batch LR schedule
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Accumulate loss (weighted by batch size) and correct predictions
        train_loss += loss.item() * n_in_batch
        pred = torch.argmax(logits, dim=1)
        train_acc += (pred == labels).sum().item()
        train_seen += n_in_batch

        # Print progress information
        if (i+1) % 10 == 0:
            elapsed_time = time.time() - start_time
            remaining_time = elapsed_time / (i+1) * (len(train_loader) - (i+1))
            # Divide by the samples actually seen instead of assuming 16 per batch
            current_acc = train_acc / train_seen
            print("Batch: {}/{} - Elapsed Time: {:.2f}s - Remaining Time: {:.2f}s - Train Acc: {:.4f}".format(i+1, len(train_loader), elapsed_time, remaining_time, current_acc))

    # ---- Validation pass ----
    model.eval()
    with torch.no_grad():
        for val_batch in val_loader:
            # Read from the *validation* batch (the previous code reused the
            # last training batch here) and move it to the model's device.
            input_ids = val_batch["input_ids"].squeeze(1).to(device)
            attention_mask = val_batch["attention_mask"].squeeze(1).to(device)
            labels = val_batch["labels"].to(device)
            n_in_batch = labels.size(0)
            labels_one_hot = F.one_hot(labels, num_classes=2).float()

            # Forward pass only: no gradients are tracked inside no_grad()
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs[0]

            loss = loss_fn(logits, labels_one_hot)
            val_loss += loss.item() * n_in_batch

            # Calculate the accuracy
            pred = torch.argmax(logits, dim=1)
            val_acc += (pred == labels).sum().item()
            val_seen += n_in_batch

    # Convert the running sums into per-sample averages; max(..., 1) guards
    # against an empty split.
    train_loss = train_loss / max(train_seen, 1)
    val_loss = val_loss / max(val_seen, 1)
    train_acc = train_acc / max(train_seen, 1)
    val_acc = val_acc / max(val_seen, 1)

    # Print the losses and accuracies
    print("Train Loss: {:.4f}, Val Loss: {:.4f}".format(train_loss, val_loss))
    print("Train Acc: {:.4f}, Val Acc: {:.4f}".format(train_acc, val_acc))

Epoch: 1/5


KeyboardInterrupt: 