**Sentiment Analysis with Transformers**


This project implements a Transformer-based text classification model trained on the IMDb movie review dataset. The model leverages components of the Transformer architecture, such as multi-head attention, positional encoding, and feed-forward networks, to process text data and predict binary sentiment labels (positive or negative).



In [2]:
!pip install torch torchvision torchaudio datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (179 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from datasets import load_dataset
import math

Step 1: Defining the Transformer Components


---


**Scaled Dot-Product Attention** - Computes "attention scores" from the input query, key, and value tensors. An attention score measures how important one word in a sentence is with respect to another, which helps the model focus on the most relevant parts of the input sequence.


In [4]:
class ScaledDotProductAttention(nn.Module):
    """Parameter-free scaled dot-product attention.

    Computes ``softmax(Q @ K^T / sqrt(d_k)) @ V`` over the last two
    dimensions of the inputs, where ``d_k`` is the size of the last
    dimension of ``query``.
    """

    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

    def forward(self, query, key, value, mask=None):
        """Apply attention.

        Args:
            query: Tensor whose last two dimensions are (query_len, d_k).
            key: Tensor whose last two dimensions are (key_len, d_k).
            value: Tensor whose last two dimensions are (key_len, d_v).
            mask: Optional tensor broadcastable to the score shape
                (..., query_len, key_len). Positions where ``mask == 0``
                are excluded from attention.

        Returns:
            Tuple ``(output, attn)``: the attended values and the attention
            weights (each row of ``attn`` sums to 1 over the key dimension).
        """
        d_k = query.size(-1)
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            # Use the most negative finite value of the score dtype instead of
            # a hard-coded -1e9: -1e9 is not representable in float16 and makes
            # masked_fill raise there. For float32 the softmax result is the
            # same (masked positions get weight 0).
            fill_value = torch.finfo(scores.dtype).min
            scores = scores.masked_fill(mask == 0, fill_value)
        attn = F.softmax(scores, dim=-1)
        output = torch.matmul(attn, value)
        return output, attn


**Multi-Head Attention** - Extends the attention mechanism by running multiple attention "heads" in parallel. Each head works on its own slice of the embedding dimensions (`d_model / num_heads` per head), so different heads can focus on different parts of the input simultaneously. This gives the model a broader understanding of the relationships between words.



In [5]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention built on ``ScaledDotProductAttention``.

    The ``d_model`` features are projected, split into ``num_heads`` heads of
    size ``d_k = d_model // num_heads``, attended per head, concatenated, and
    projected once more.
    """

    def __init__(self, d_model, num_heads):
        """Create the projection layers.

        Args:
            d_model: Feature size of the inputs and outputs.
            num_heads: Number of attention heads; must divide ``d_model``.

        Raises:
            ValueError: If ``num_heads`` is not positive or does not divide
                ``d_model`` evenly.
        """
        super(MultiHeadAttention, self).__init__()
        # Fail fast: otherwise the reshape in forward() breaks later with an
        # opaque shape error (or ZeroDivisionError for num_heads == 0).
        if num_heads <= 0 or d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by a positive "
                f"num_heads ({num_heads})"
            )
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        self.out = nn.Linear(d_model, d_model)
        # Parameter-free, so build it once instead of on every forward call.
        self.attention = ScaledDotProductAttention()

    def forward(self, query, key, value, mask=None):
        """Run multi-head attention.

        Args:
            query, key, value: Tensors of shape (batch, seq_len, d_model).
            mask: Optional mask passed unchanged to the scaled dot-product
                attention; it must broadcast against per-head scores of shape
                (batch, num_heads, query_len, key_len).

        Returns:
            Tuple ``(output, attn)`` where ``output`` has shape
            (batch, query_len, d_model) and ``attn`` holds the per-head
            attention weights.
        """
        batch_size = query.size(0)

        # (batch, seq, d_model) -> (batch, num_heads, seq, d_k)
        query = self.query(query).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        key = self.key(key).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        value = self.value(value).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)

        attn_output, attn = self.attention(query, key, value, mask)

        # Merge the heads back: (batch, num_heads, seq, d_k) -> (batch, seq, d_model)
        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)

        output = self.out(attn_output)
        return output, attn

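**Position-wise Feedforward Network** - A two-layer fully connected network (with a ReLU activation and dropout in between) that processes each position of the sequence independently after attention has been applied.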

In [6]:
class PositionwiseFeedForward(nn.Module):
    """Two linear layers with ReLU and dropout in between.

    Maps ``d_model`` features to ``d_ff`` and back to ``d_model``; the layers
    act on the last dimension of the input only.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return ``fc2(dropout(relu(fc1(x))))``."""
        hidden = self.fc1(x)
        hidden = F.relu(hidden)
        hidden = self.dropout(hidden)
        return self.fc2(hidden)


**Positional Encoding** - Helps the model understand the order of the words in the sequence.

In [7]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to a batch of embeddings.

    Even feature columns hold ``sin(pos * w_i)`` and odd columns hold
    ``cos(pos * w_i)`` with ``w_i = 10000 ** (-2i / d_model)``.
    """

    def __init__(self, d_model, max_len=5000):
        """Precompute the encoding table.

        Args:
            d_model: Feature size of the embeddings (odd sizes are supported).
            max_len: Longest sequence length the table covers.
        """
        super(PositionalEncoding, self).__init__()

        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2, dtype=torch.float32) * -(math.log(10000.0) / d_model)
        )
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        encoding = torch.zeros(max_len, d_model)
        encoding[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns, so
        # trim the angles to match instead of failing on a shape mismatch.
        encoding[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Register as a buffer so the table follows model.to(device) and is
        # never treated as a trainable parameter. It is non-persistent so the
        # state_dict stays unchanged (the table was a plain attribute before).
        self.register_buffer("encoding", encoding.unsqueeze(0), persistent=False)

    def forward(self, x):
        """Return ``x`` plus the encoding for its first ``x.size(1)`` positions.

        Args:
            x: Tensor of shape (batch, seq_len, d_model).

        Raises:
            ValueError: If ``seq_len`` exceeds the ``max_len`` of the table.
        """
        seq_len = x.size(1)
        max_len = self.encoding.size(1)
        if seq_len > max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds positional encoding max_len {max_len}"
            )
        # .to() is a no-op once the buffer lives on the same device as x; it is
        # kept so inputs on another device keep working as before.
        return x + self.encoding[:, :seq_len].to(x.device)


**Transformer Encoder Layer** - Combines attention and feedforward layers into a single processing unit.

In [8]:
class TransformerEncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward.

    Each sub-layer is wrapped as ``LayerNorm(x + Dropout(sublayer(x)))``.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, src, src_mask=None):
        """Apply the block to ``src``; ``src_mask`` is forwarded to the attention."""
        # Self-attention sub-layer (attention weights are discarded).
        attended, _ = self.self_attn(src, src, src, src_mask)
        src = self.norm1(src + self.dropout1(attended))

        # Feed-forward sub-layer.
        transformed = self.feed_forward(src)
        return self.norm2(src + self.dropout2(transformed))

**Transformer Encoder** - Stacks multiple encoder layers to process sequences in-depth.

In [9]:
class TransformerEncoder(nn.Module):
    """Token embedding + positional encoding + a stack of encoder layers."""

    def __init__(self, num_layers, d_model, num_heads, d_ff, input_vocab_size, max_len=5000, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(input_vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len)
        encoder_layers = [
            TransformerEncoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        ]
        self.layers = nn.ModuleList(encoder_layers)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_mask=None):
        """Encode a batch of token ids.

        ``src`` is looked up in the embedding table, scaled by
        ``sqrt(d_model)``, given positional information, passed through
        dropout, and then through every encoder layer in order.
        ``src_mask`` is handed to each layer unchanged.
        """
        scale = math.sqrt(self.embedding.embedding_dim)
        hidden = self.embedding(src) * scale
        hidden = self.dropout(self.pos_encoding(hidden))

        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden, src_mask)

        return hidden

Step 2 - Dataset Loading and Preparation

Things being done
1. Loading IMDb dataset
2. Tokenising the text sequence
3. Building vocabulary by counting frequency of each word in sequence and assigning unique ID to each word
4. Defining pipelines - Converting text to token IDs and converting labels to 1s and 0s.


---




In [10]:
# Dataset and Dataloader
# Fetch the IMDb dataset through the Hugging Face `datasets` library.
imdb_dataset = load_dataset(path="imdb")

# Sanity check: print the first five examples of the training split.
print(imdb_dataset["train"][:5])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

{'text': ['I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far b

In [11]:
def simple_tokenizer(text):
    """Lower-case ``text`` and split it on runs of whitespace.

    Punctuation and markup are not stripped; they stay attached to the
    neighbouring token.
    """
    lowered = text.lower()
    return lowered.split()

# Vocabulary creation
from collections import Counter
from itertools import chain

# Count every token of the training reviews (tokens are streamed, no
# intermediate list of token lists is built).
counter = Counter(chain.from_iterable(map(simple_tokenizer, imdb_dataset['train']['text'])))

# Ids 0 and 1 are reserved for the special tokens, so real tokens are numbered
# from 2 upwards in the order they were first seen.
vocab = dict(zip(counter, range(2, len(counter) + 2)))
vocab.update({'<unk>': 0, '<pad>': 1})

# Pipelines
def text_pipeline(text):
    """Convert raw text into a list of vocabulary ids.

    Tokens missing from ``vocab`` are mapped to the id of ``'<unk>'``.
    """
    token_ids = []
    for token in simple_tokenizer(text):
        token_ids.append(vocab.get(token, vocab['<unk>']))
    return token_ids

def label_pipeline(label):
    """Map a dataset label to the integer class id (1 = positive, 0 = negative).

    The Hugging Face IMDb dataset stores labels as integers (0 = negative,
    1 = positive), so comparing only against the string ``'pos'`` turned every
    label into 0. Integer labels are now handled, and the string form
    ``'pos'`` keeps working.

    Args:
        label: Integer-like label (0 or 1) or a string label (``'pos'`` is
            positive, any other string is negative).

    Returns:
        ``1`` for a positive label, ``0`` otherwise.
    """
    if isinstance(label, str):
        return 1 if label == 'pos' else 0
    return 1 if label == 1 else 0

def collate_batch(batch):
    """Turn a list of dataset examples into padded tensors.

    Each example is expected to provide ``'text'`` and ``'label'`` entries.

    Returns:
        Tuple ``(texts, labels)``: ``texts`` is an int64 tensor of shape
        (batch, longest_sequence) padded with the ``'<pad>'`` id, and
        ``labels`` is an int64 tensor of shape (batch,).
    """
    label_ids = [label_pipeline(example['label']) for example in batch]
    token_tensors = [
        torch.tensor(text_pipeline(example['text']), dtype=torch.int64)
        for example in batch
    ]
    padded_texts = nn.utils.rnn.pad_sequence(
        token_tensors, batch_first=True, padding_value=vocab['<pad>']
    )
    return padded_texts, torch.tensor(label_ids, dtype=torch.int64)

# Both loaders share the batch size and the padding collate function; only the
# training split is shuffled.
batch_size = 32
train_dataloader = DataLoader(
    imdb_dataset['train'],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_batch,
)
test_dataloader = DataLoader(
    imdb_dataset['test'],
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_batch,
)


Step 3 - Initialising the model


---



In [12]:
# Training setup
input_vocab_size = len(vocab)


class SentimentClassifier(nn.Module):
    """Transformer encoder followed by a linear classification head.

    The bare encoder emits ``d_model`` features per token. Using those
    directly as class logits makes the loss a ``d_model``-way softmax for a
    2-class task and lets argmax pick a non-existent class. The head projects
    every token to ``num_classes`` scores, so averaging over the sequence
    dimension yields proper (batch, num_classes) logits.
    """

    def __init__(self, encoder, d_model, num_classes=2):
        super().__init__()
        self.encoder = encoder
        self.classifier = nn.Linear(d_model, num_classes)

    def forward(self, src, src_mask=None):
        """Return per-token class scores of shape (batch, seq_len, num_classes)."""
        return self.classifier(self.encoder(src, src_mask))


d_model = 512
model = SentimentClassifier(
    TransformerEncoder(num_layers=6, d_model=d_model, num_heads=8, d_ff=2048, input_vocab_size=input_vocab_size),
    d_model=d_model,
    num_classes=2,
)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)


Step 4 - Training the model


---



In [None]:
def _pooled_logits(model, texts, pad_id):
    """Run the model on a padded batch and average its outputs over real tokens.

    Padding positions are hidden from attention and left out of the mean, so
    the result does not depend on how much padding the batch needed.

    Args:
        model: Callable as ``model(texts, mask)`` returning a tensor of shape
            (batch, seq_len, features).
        texts: int64 tensor of token ids, shape (batch, seq_len).
        pad_id: Id used for padding in ``texts``.

    Returns:
        Tensor of shape (batch, features).
    """
    is_token = texts != pad_id  # (batch, seq_len), True at real tokens
    # (batch, 1, 1, seq_len) broadcasts over heads and query positions and
    # blocks attention to padded keys.
    output = model(texts, is_token[:, None, None, :])
    weights = is_token.unsqueeze(-1).to(output.dtype)
    # clamp guards against division by zero for an all-padding (empty) row
    return (output * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1)


num_epochs = 5
pad_id = vocab['<pad>']
# Batches are moved to wherever the model's parameters live.
device = next(model.parameters()).device

print("Starting training...")
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    print(f"Epoch {epoch + 1} started...")
    for batch_idx, (texts, labels) in enumerate(train_dataloader):
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()
        logits = _pooled_logits(model, texts, pad_id)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        if batch_idx % 10 == 0:
            print(f"Batch {batch_idx}, Loss: {loss.item():.4f}")
    print(f"Epoch {epoch + 1} completed, Loss: {total_loss:.4f}")

# Testing loop
print("Starting testing...")
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for batch_idx, (texts, labels) in enumerate(test_dataloader):
        texts, labels = texts.to(device), labels.to(device)
        logits = _pooled_logits(model, texts, pad_id)
        predictions = torch.argmax(logits, dim=1)
        correct += (predictions == labels).sum().item()
        total += labels.size(0)
        if batch_idx % 10 == 0:
            print(f"Batch {batch_idx}, Correct: {correct}, Total: {total}")
print(f"Test Accuracy: {correct / total:.2%}")


Starting training...
Epoch 1 started...
