This project performs sentiment analysis on the IMDB movie reviews dataset using a pre-trained BERT-family model (DistilBERT). The goal is to classify each review as positive or negative based on its content, and to demonstrate how to fine-tune such a model for this task.


In [1]:
# 1. Load and split the dataset

import pandas as pd
from sklearn.model_selection import train_test_split


# Read the full review table from disk.
file_path = "movie_data.csv"
df: pd.DataFrame = pd.read_csv(file_path)

split_seed = 42  # shared by both splits so the partition is repeatable

# Hold out 30% of all rows as the test set, stratified on the sentiment column.
train_val_data, test_data = train_test_split(
    df,
    test_size=0.3,
    stratify=df['sentiment'],
    random_state=split_seed,
)

# From the remaining rows, set aside 5000 for validation (again stratified).
train_data, val_data = train_test_split(
    train_val_data,
    test_size=5000,
    stratify=train_val_data['sentiment'],
    random_state=split_seed,
)

train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30000 entries, 8666 to 12357
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     30000 non-null  object
 1   sentiment  30000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 703.1+ KB


In [2]:
# separate the raw data into text and label data, as we will need to tokenize the text data

def _texts_and_labels(frame: pd.DataFrame) -> tuple[list[str], list[int]]:
    """Return the 'review' and 'sentiment' columns of ``frame`` as plain lists."""
    return frame['review'].tolist(), frame['sentiment'].tolist()


# One (texts, labels) pair per split; the texts are tokenized later.
train_texts, train_labels = _texts_and_labels(train_data)
val_texts, val_labels = _texts_and_labels(val_data)
test_texts, test_labels = _texts_and_labels(test_data)

print(f"Number of training samples: {len(train_texts)}")

Number of training samples: 30000


In [3]:
# 2. Pick a suitable pre-trained model and the corresponding tokenizer. 
# Choose the distilbert-base-uncased model due to limited computational resources.

from transformers import DistilBertTokenizer

# Name of the checkpoint whose tokenizer is loaded; local_files_only=True
# restricts loading to files already available locally.
pretrained_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(
    pretrained_name,
    local_files_only=True,
)

In [4]:
# 3. Tokenize the input text from the train, validation and test datasets
def _encode_texts(texts: list[str]):
    """Tokenize ``texts`` with the notebook's tokenizer using one shared set of options.

    Truncation and padding are both enabled and the result is returned as
    PyTorch tensors ('pt'), so every split is encoded the same way.
    """
    return tokenizer(
        texts,
        truncation=True,      # over-long reviews are cut off (the training output shows 512 columns)
        padding=True,         # shorter reviews are padded so the rows stack into one tensor
        return_tensors='pt',  # return PyTorch tensors
    )


train_encodings = _encode_texts(train_texts)
val_encodings = _encode_texts(val_texts)
test_encodings = _encode_texts(test_texts)

In [5]:
# List the fields the tokenizer produced for the training split.
train_encodings.keys()

dict_keys(['input_ids', 'attention_mask'])

In [6]:
# Print the shape of the token-id tensor, then display the ids of the first training review.
print(train_encodings['input_ids'].shape)
train_encodings['input_ids'][0]

torch.Size([30000, 512])


tensor([  101,  1037,  6919,  2143,  3805,  1997,  2049,  2051,  1010,  1026,
         7987,  1013,  1028,  1026,  7987,  1013,  1028,  1045,  2228,  2061,
         1010,  1999,  1996, 12021,  1005,  1055,  2009,  2001,  2035,  2055,
         3045,  1010, 22040,  2003,  2204,  1029,  3342,  2008,  2028,  1029,
         1045,  2031,  2464,  2023,  2143,  2062,  2008,  2322,  2335,  1010,
         2000,  2033,  2023,  2003,  1037,  2613,  5532,  2479,  2143,  1010,
         1045,  2562,  3666,  2138,  2045,  2003,  2467,  2242,  2062,  2000,
         4553,  2055,  2122, 25077,  3494,  2008,  1045,  2074,  2293,  1010,
         8201,  9092,  5149,  1010,  1998, 20368, 13675, 10698,  2078,  1010,
         2024,  3432,  6919,  1010,  2036, 12218,  1040,  1005, 12262,  1010,
        17935,  7346,  2272,  1999,  2012,  1037,  2485,  2117,  1010,  2123,
         1005,  1056,  2131,  2033,  3308,  2045,  2024,  2116,  2062,  2307,
         2836,  1005,  1055,  1999,  2023,  2143,  1010,  1998, 

In [7]:
# Print the shape of the attention-mask tensor, then the mask of the first training review
# (the output shows 1s followed by 0s over the trailing positions).
print(train_encodings['attention_mask'].shape)
print(train_encodings['attention_mask'][0])

torch.Size([30000, 512])
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0

In [8]:
# Display every (field name, tensor) pair held by the training encodings.
train_encodings.items()

dict_items([('input_ids', tensor([[ 101, 1037, 6919,  ...,    0,    0,    0],
        [ 101, 1045, 2387,  ...,    0,    0,    0],
        [ 101, 2043, 1005,  ...,    0,    0,    0],
        ...,
        [ 101, 1045, 1005,  ...,    0,    0,    0],
        [ 101, 2011, 3411,  ...,    0,    0,    0],
        [ 101, 3294, 5993,  ...,    0,    0,    0]])), ('attention_mask', tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]))])

In [9]:
# 4. Encapsulate all data fields, including the labels within a Dataset class:
from torch.utils.data import Dataset
import torch


class MovieReviewDataset(Dataset):
    """Dataset that pairs tokenizer output with sentiment labels.

    Indexing returns a dict holding one row of every encoding field plus a
    two-element float ``labels`` tensor.
    """

    def __init__(self, encodings: dict, labels: list[int]) -> None:
        """Keep references to the encodings and the labels.

        Args:
            encodings: Mapping from field name to an indexable collection with
                one entry per sample (the notebook passes tokenizer output).
            labels: One integer label per sample; 1 is treated as positive.
        """
        self.labels: list[int] = labels
        self.encodings: dict = encodings

    def __len__(self) -> int:
        """Return the number of samples, taken from the label list."""
        return len(self.labels)

    def __getitem__(self, idx) -> dict[str, torch.Tensor]:
        """Build the sample at position ``idx`` as a dict of tensors."""
        sample: dict[str, torch.Tensor] = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]

        # Label 1 becomes [0., 1.]; every other label becomes [1., 0.].
        # compute_metrics in this notebook takes an argmax over the labels, so
        # it relies on this two-column layout.
        # NOTE(review): these are float one-hot vectors rather than integer
        # class indices; the loss the model applies may depend on the label
        # dtype/shape - confirm this is the intended loss.
        one_hot = [0., 1.] if self.labels[idx] == 1 else [1., 0.]
        sample['labels'] = torch.tensor(one_hot)

        return sample


# Wrap each split's encodings and labels in a MovieReviewDataset
# (order: training, validation, test).
train_dataset, val_dataset, test_dataset = (
    MovieReviewDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

In [10]:
# 5. Create a DataLoader for the training dataset. The DataLoader will take care of batching the data and shuffling it.
from torch.utils.data import DataLoader

batch_size = 32


def _make_loader(dataset, *, shuffle: bool) -> DataLoader:
    """Create a DataLoader over ``dataset`` using the notebook-wide batch size."""
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


# Only the training loader shuffles; validation and test keep their order.
train_dataloader = _make_loader(train_dataset, shuffle=True)
val_dataloader = _make_loader(val_dataset, shuffle=False)
test_dataloader = _make_loader(test_dataset, shuffle=False)

In [11]:
# 6. After completing the data preparation, tokenization and custom DataSet and DataLoader, 
# the next step is loading the pre-trained model and fine-tuning it with the datasets just prepared.

from transformers import DistilBertForSequenceClassification
from torch.optim import AdamW

# Seed PyTorch *before* building the model. The classification head
# ('pre_classifier' / 'classifier') is not part of the checkpoint and is
# randomly initialised when the model is created, so seeding only later
# (in the Trainer cell) leaves that initialisation unreproducible.
torch.manual_seed(42)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Load the pre-trained weights from local files only and move the model to the device.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    local_files_only=True).to(device)

# Define the optimizer
optimizer = AdamW(model.parameters(),
                  lr=5e-5,  # learning rate of 5e-5 is a good starting point for fine-tuning BERT models
                  weight_decay=0.01)  # weight decay of 0.01 is a common choice for regularization

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Using the Trainer API to train Transformer models


In [None]:
# 7. Use the Trainer API from Hugging Face Transformers to train the model.
# First, prepare the necessary configurations and initialize a Trainer object:

from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
import evaluate
import numpy as np

torch.manual_seed(42)  # fix PyTorch's random seed for reproducibility

# Logging, evaluation and checkpointing all run on the same 50-step cadence.
step_interval = 50

training_args = TrainingArguments(
    # where checkpoints/predictions and logs are written
    output_dir='./results',
    logging_dir='./logs',
    # schedule
    num_train_epochs=3,
    warmup_steps=50,  # warmup steps for the learning-rate scheduler
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # log / evaluate / save every `step_interval` steps
    logging_steps=step_interval,
    eval_strategy="steps",
    eval_steps=step_interval,
    save_strategy="steps",
    save_steps=step_interval,
    # model selection: higher accuracy is better, and the best model is
    # loaded back once training ends
    metric_for_best_model="accuracy",
    greater_is_better=True,
    load_best_model_at_end=True,
)

# Define the compute_metrics function to compute accuracy
def compute_metrics(eval_pred) -> dict[str, float]:
    """Compute accuracy for evaluating the model.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` is an array of shape
            (num_samples, num_classes). ``labels`` is either one row per sample
            with one column per class (the one-hot layout produced by
            MovieReviewDataset) or a 1-D array of class indices.

    Returns:
        A dict with a single key, ``"accuracy"``: the fraction of samples whose
        highest-scoring class equals the true class.
    """
    # Unpack the logits and labels from the eval_pred tuple
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)  # predicted class index per sample

    # One-hot rows are reduced to class indices; 1-D labels are used as they are.
    labels = np.asarray(labels)
    true_labels = labels if labels.ndim == 1 else np.argmax(labels, axis=-1)

    # Accuracy is computed directly with NumPy. This function runs at every
    # evaluation step, so re-loading a metric object on each call would be
    # repeated, avoidable work.
    return {"accuracy": float(np.mean(preds == true_labels))}


# Stop training early after 5 evaluations without improvement.
early_stopping = EarlyStoppingCallback(early_stopping_patience=5)

# Assemble the Trainer from the pieces defined above.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,  # reports accuracy at each evaluation
    # (optimizer, scheduler): the AdamW instance defined earlier, with no
    # scheduler passed explicitly.
    optimizers=(optimizer, None),
    callbacks=[early_stopping],
)

In [13]:
# 8. Train the model
# Report free/total GPU memory before training. `device` falls back to the CPU
# when CUDA is unavailable, and the CUDA memory query only works for a CUDA
# device, so the check is skipped in that case instead of crashing the cell.
if device.type == 'cuda':
    free_mem, total_mem = torch.cuda.mem_get_info(device=device)
    print(f"Free GPU memory: {free_mem/1024**3:.2f} GB / {total_mem/1024**3:.2f} GB")
else:
    print("CUDA is not available; training will run on the CPU.")

trainer.train()

Free GPU memory: 6.65 GB / 8.00 GB


Step,Training Loss,Validation Loss,Accuracy
50,0.6151,0.367332,0.8662
100,0.3404,0.288157,0.8874
150,0.2836,0.271197,0.89
200,0.2752,0.257851,0.8996
250,0.2802,0.273859,0.8914
300,0.278,0.238523,0.9062
350,0.2548,0.263075,0.902
400,0.2509,0.231817,0.9112
450,0.2623,0.226824,0.9088
500,0.2286,0.217945,0.9178


TrainOutput(global_step=1750, training_loss=0.19960899571010043, metrics={'train_runtime': 3077.9649, 'train_samples_per_second': 29.24, 'train_steps_per_second': 0.914, 'total_flos': 7416054846357504.0, 'train_loss': 0.19960899571010043, 'epoch': 1.8656716417910446})

In [None]:
# 9. Score the trained model on the validation set.
# (load_best_model_at_end=True was requested in the training arguments, so this
# is presumably the best checkpoint rather than the last one.)
val_metrics = trainer.evaluate(val_dataset)
val_metrics

{'eval_loss': 0.19786538183689117,
 'eval_accuracy': 0.936,
 'eval_runtime': 44.0313,
 'eval_samples_per_second': 113.556,
 'eval_steps_per_second': 3.566,
 'epoch': 1.8656716417910446}

In [16]:
# 10. Score the same model on the held-out test set.
test_metrics = trainer.evaluate(test_dataset)
test_metrics

{'eval_loss': 0.20091913640499115,
 'eval_accuracy': 0.9325333333333333,
 'eval_runtime': 128.775,
 'eval_samples_per_second': 116.482,
 'eval_steps_per_second': 3.642,
 'epoch': 1.8656716417910446}

<b>Best practice</b><br>
<br>
Here are some best practices for fine-tuning BERT:

- <b>Data is king</b>: You should prioritize high-quality and well-labeled data.
- <b>Start small</b>: You can begin with smaller pre-trained models like BERT-base or DistilBERT. They’re less demanding on your computational power compared to larger models like BERT-large.
- <b>Automate hyperparameter tuning</b>: You may utilize automated hyperparameter tuning libraries (e.g., Hyperopt, Optuna) to search for optimal hyperparameters. This can save you time and let your computer do the heavy lifting.
- <b>Implement early stopping</b>: You should monitor a validation metric (such as validation loss or accuracy) during training. If it has not improved for several consecutive evaluations, stop training. This early stopping strategy can prevent unnecessary training iterations.
