# Movie Review Sentiment Analysis

In [29]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
from datasets import load_dataset
from sklearn.metrics import accuracy_score
import time
import os

## Configuration

In [7]:
# --- Tunable settings for the whole notebook -------------------------------
MODEL_NAME = 'distilbert-base-uncased'  # Checkpoint used for both tokenizer and encoder
MAX_LENGTH = 256  # Reviews are truncated/padded to this many tokens
BATCH_SIZE = 16  # Samples per optimisation step
EPOCHS = 1  # Number of full passes over the training split
LEARNING_RATE = 2e-5  # Peak learning rate handed to the optimizer
MODEL_SAVE_PATH = "sentiment_classifier.pth"  # Where the fine-tuned weights are written

# Seed PyTorch's global RNG so that DataLoader shuffling, dropout masks and the
# random initialisation of the classification head are repeatable across a
# "Restart Kernel -> Run All".
SEED = 42
torch.manual_seed(SEED)

## Set Device

In [4]:
# Pick the fastest available backend: CUDA first, then Apple-Silicon MPS,
# and finally plain CPU. The MPS probe only runs when CUDA is unavailable.
DEVICE = torch.device(
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using device: {DEVICE}")

Using device: mps


## Load Dataset

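- Load the well-known IMDB movie review dataset from the Hugging Face Hub using the `datasets` library (25,000 labeled training reviews and 25,000 labeled test reviews)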

In [9]:
# Fetch the IMDB dataset through the Hugging Face `datasets` library.
# The result is indexed by split name later in the notebook
# (dataset['train'], dataset['test']).
print("Loading dataset...")
dataset = load_dataset("imdb")

Loading dataset...


Downloading readme: 0.00B [00:00, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

## Load Tokenizer

- `AutoTokenizer`
  - Provided by Hugging Face
- `distilbert-base-uncased`
  - `distilbert`
    - A smaller, faster version of BERT
  - `base`
    - Refers to the "base" size of the model (as opposed to "large").
  - `uncased`
    - The model was pre-trained on text that was converted to all lowercase
    - Therefore, our tokenizer will also convert all our input text to lowercase before processing it

In [8]:
# Load the tokenizer published with the MODEL_NAME checkpoint so that the text
# is encoded with the same vocabulary as the encoder loaded from MODEL_NAME.
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Loading tokenizer...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

## Custom Dataset Class

In [13]:
class ImdbDataset(Dataset):
    """Map-style dataset that tokenizes one review at a time, on access.

    Args:
        dataset_split: Indexable collection of records exposing ``'text'`` and
            ``'label'`` keys.
        tokenizer: Callable that encodes a string and returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every encoded review is padded or truncated to.
    """

    _TENSOR_FIELDS = ('input_ids', 'attention_mask')

    def __init__(self, dataset_split, tokenizer, max_length):
        self.dataset, self.tokenizer, self.max_length = dataset_split, tokenizer, max_length

    def __len__(self):
        """Return the number of records in the wrapped split."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Encode record ``idx`` and return its tensors together with the label."""
        record = self.dataset[idx]
        review, sentiment = record['text'], record['label']

        tokenized = self.tokenizer(
            review,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            add_special_tokens=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # return_tensors='pt' adds a leading batch axis; flatten it away so the
        # DataLoader can stack individual samples into a batch itself.
        sample = {field: tokenized[field].flatten() for field in self._TENSOR_FIELDS}
        sample['label'] = torch.tensor(sentiment, dtype=torch.long)
        return sample

In [12]:
# Wrap both raw splits in the tokenizing dataset, using the same tokenizer and
# sequence length for each.
train_dataset, test_dataset = (
    ImdbDataset(dataset[split_name], tokenizer, MAX_LENGTH)
    for split_name in ('train', 'test')
)

# Only the training batches are shuffled; the test loader keeps dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)
print("Dataset prepared.")

Dataset prepared.


### Dataset Inspection

In [23]:
# Peek at the first raw training record (before tokenization): its review text
# and its label.
print(f"Sample text: {dataset['train'][0]['text']}")
print(f"Sample label: {dataset['train'][0]['label']}")

Sample text: I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far

In [14]:
# Size of each wrapped split; ImdbDataset.__len__ returns the length of the
# underlying split.
num_train_samples = len(train_dataset)
num_test_samples = len(test_dataset)

print(f"Number of training samples: {num_train_samples}")
print(f"Number of test samples: {num_test_samples}")

Number of training samples: 25000
Number of test samples: 25000


In [15]:
# len() of a DataLoader is the number of batches it yields per pass, which
# also determines the number of optimizer steps per epoch.
num_train_batches = len(train_loader)
num_test_batches = len(test_loader)

print(f"Number of training batches per epoch: {num_train_batches}")
print(f"Number of test batches: {num_test_batches}")

Number of training batches per epoch: 1563
Number of test batches: 1563


In [21]:
# Run one record through ImdbDataset.__getitem__ to check what the model will
# actually receive.
first_sample = train_dataset[0]

# Separate the target from the model inputs (input_ids, attention_mask).
first_sample_label = first_sample['label']
first_sample_data = {key: val for key, val in first_sample.items() if key != 'label'}

print("\n--- Inspecting a Single Sample ---")
print(f"Label: {first_sample_label}")
print(f"Data type: {type(first_sample_data)}")
print(f"Sample Data Input Ids: {first_sample_data['input_ids']}")
print(f"Input IDs shape: {first_sample_data['input_ids'].shape}")
print(f"Attention Mask shape: {first_sample_data['attention_mask'].shape}")


--- Inspecting a Single Sample ---
Label: 0
Data type: <class 'dict'>
Sample Data Input Ids: tensor([  101,  1045, 12524,  1045,  2572,  8025,  1011,  3756,  2013,  2026,
         2678,  3573,  2138,  1997,  2035,  1996,  6704,  2008,  5129,  2009,
         2043,  2009,  2001,  2034,  2207,  1999,  3476,  1012,  1045,  2036,
         2657,  2008,  2012,  2034,  2009,  2001,  8243,  2011,  1057,  1012,
         1055,  1012,  8205,  2065,  2009,  2412,  2699,  2000,  4607,  2023,
         2406,  1010,  3568,  2108,  1037,  5470,  1997,  3152,  2641,  1000,
         6801,  1000,  1045,  2428,  2018,  2000,  2156,  2023,  2005,  2870,
         1012,  1026,  7987,  1013,  1028,  1026,  7987,  1013,  1028,  1996,
         5436,  2003,  8857,  2105,  1037,  2402,  4467,  3689,  3076,  2315,
        14229,  2040,  4122,  2000,  4553,  2673,  2016,  2064,  2055,  2166,
         1012,  1999,  3327,  2016,  4122,  2000,  3579,  2014,  3086,  2015,
         2000,  2437,  2070,  4066,  1997,  4516

## Build the Model

In [19]:
class SentimentClassifier(nn.Module):
    """Pretrained transformer encoder with a dropout + linear classification head.

    Args:
        n_classes: Number of output classes (width of the returned logits).
        model_name: Hugging Face checkpoint to load as the encoder. When
            omitted, the notebook-level ``MODEL_NAME`` is used.
        dropout: Dropout probability applied to the pooled representation
            before the linear layer.
    """

    def __init__(self, n_classes, model_name=None, dropout=0.3):
        super().__init__()
        # Resolve the default at call time so the class follows MODEL_NAME even
        # if the configuration cell is edited and re-run after this one.
        checkpoint = MODEL_NAME if model_name is None else model_name

        # Pretrained encoder (attribute names are kept stable so previously
        # saved state_dicts still load).
        self.bert = AutoModel.from_pretrained(checkpoint)

        # Regularisation between the encoder and the classification layer.
        self.drop = nn.Dropout(p=dropout)

        # Linear head mapping the encoder's hidden size to one logit per class.
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class logits for a batch of encoded reviews.

        Args:
            input_ids: Token-id tensor produced by the tokenizer.
            attention_mask: Mask tensor produced alongside ``input_ids``.

        Returns:
            The output of the linear head (one row of logits per input row).
        """
        # The encoder returns an output object; its `last_hidden_state` holds
        # one vector per token position.
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Use the vector at position 0 (the [CLS] token the tokenizer prepends)
        # as the summary representation of the whole review.
        pooled_output = outputs.last_hidden_state[:, 0]
        return self.out(self.drop(pooled_output))


model = SentimentClassifier(n_classes=2).to(DEVICE)  # 2 classes: positive and negative

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

## Training and Evaluation Functions

In [24]:
# Use PyTorch's own AdamW: the `transformers.AdamW` class is deprecated upstream
# in favour of `torch.optim.AdamW`. `eps` and `weight_decay` are pinned to the
# values the transformers implementation used by default so the update rule
# stays as close as possible to the previous setup. NOTE: PyTorch's AdamW always
# applies Adam bias correction; there is no equivalent of `correct_bias=False`.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    eps=1e-6,
    weight_decay=0.0,
)

# The scheduler is stepped once per batch, so the schedule length is the
# number of batches per epoch times the number of epochs.
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,  # no warm-up phase
    num_training_steps=total_steps
)
criterion = nn.CrossEntropyLoss().to(DEVICE)



In [25]:
def train_epoch(model, data_loader, criterion, optimizer, scheduler, device):
    """Run one optimisation pass over ``data_loader`` and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Sized iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        Sum of per-batch losses divided by ``len(data_loader)``.

    Raises:
        ZeroDivisionError: If ``data_loader`` has length zero.
    """
    model.train()
    total_loss = 0

    # Start from clean gradients. In a notebook an interrupted run (or any
    # earlier backward() call) can leave stale .grad values behind, which would
    # otherwise be accumulated into the first update of this epoch.
    optimizer.zero_grad()

    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        total_loss += loss.item()

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Helps prevent exploding gradients
        optimizer.step()
        scheduler.step()
        # Reset for the next batch so gradients never accumulate across steps.
        optimizer.zero_grad()

    return total_loss / len(data_loader)

In [26]:
def eval_model(model, data_loader, criterion, device):
    """Score ``model`` on ``data_loader`` without updating any weights.

    The model is switched to eval mode and gradients are disabled for the
    whole pass. Batches are mappings with ``'input_ids'``,
    ``'attention_mask'`` and ``'label'`` tensors.

    Returns:
        A ``(mean_batch_loss, accuracy)`` tuple, where the loss is the sum of
        per-batch losses divided by ``len(data_loader)`` and the accuracy is
        computed over every sample seen.
    """
    model.eval()
    running_loss = 0
    predicted, expected = [], []

    with torch.no_grad():
        for batch in data_loader:
            labels = batch['label'].to(device)
            logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            running_loss += criterion(logits, labels).item()

            # The predicted class is the index of the largest logit per row.
            predicted.extend(torch.argmax(logits, dim=1).cpu().numpy())
            expected.extend(labels.cpu().numpy())

    return running_loss / len(data_loader), accuracy_score(expected, predicted)

## Main Training Loop

In [30]:
# Fine-tune for EPOCHS passes over the training split, reporting loss,
# accuracy and wall-clock duration after each epoch.
print("Starting training...")
total_start_time = time.time()

for epoch in range(EPOCHS):
    epoch_start_time = time.time()
    
    print(f'--- Epoch {epoch + 1}/{EPOCHS} ---')
    train_loss = train_epoch(model, train_loader, criterion, optimizer, scheduler, DEVICE)
    print(f'Training Loss: {train_loss:.4f}')
    
    # NOTE: the numbers labelled "Validation" below are computed on test_loader,
    # i.e. the test split doubles as the validation set in this example. They
    # must not be used for model selection if the test score is also reported.
    val_loss, val_accuracy = eval_model(model, test_loader, criterion, DEVICE)
    print(f'Validation Loss: {val_loss:.4f} | Validation Accuracy: {val_accuracy:.4f}')
    
    # The epoch duration covers both the training pass and the evaluation pass.
    epoch_end_time = time.time()
    epoch_duration = epoch_end_time - epoch_start_time
    print(f"Epoch {epoch + 1} duration: {epoch_duration / 60:.2f} minutes")

total_end_time = time.time()
total_training_time = total_end_time - total_start_time
print("\n--- Training Complete ---")
print(f"Total training time: {total_training_time / 60:.2f} minutes")

Starting training...
--- Epoch 1/1 ---
Training Loss: 0.2667
Validation Loss: 0.2307 | Validation Accuracy: 0.9098
Epoch 1 duration: 27.58 minutes

--- Training Complete ---
Total training time: 27.58 minutes


## Save the fine-tuned model's state dictionary

In [31]:
# Persist only the learned parameters (the state_dict), not the module object
# itself, to the path configured in MODEL_SAVE_PATH.
torch.save(model.state_dict(), MODEL_SAVE_PATH)
print(f"Model saved to {MODEL_SAVE_PATH}")

Model saved to sentiment_classifier.pth
