### Example LSTM model for sentiment prediction on the IMDB dataset
Source: [blogpost](https://github.com/hassaanbinaslam/myblog/blob/main/posts/2022-11-09-pytorch-lstm-imdb-sentiment-prediction.ipynb)

In [1]:
!conda install pytorch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 pytorch-cuda=12.1 -c pytorch -c nvidia

Channels:
 - pytorch
 - nvidia
 - defaults
Platform: linux-64
Collecting package metadata (repodata.json): done
Solving environment: done

# All requested packages already installed.



In [2]:
!pip install matplotlib pandas portalocker torchtext==0.18 datasets



In [3]:
from platform import python_version
import numpy, matplotlib, pandas, torch, torchtext

print("python==" + python_version())
print("numpy==" + numpy.__version__)
print("torch==" + torch.__version__)
print("torchtext==" + torchtext.__version__)
print("matplotlib==" + matplotlib.__version__)

python==3.12.9
numpy==2.0.1
torch==2.3.0
torchtext==0.18.0+cpu
matplotlib==3.10.1


In [4]:
from datasets import load_dataset
from torch.utils.data.dataset import random_split

torch.manual_seed(1)

train_dataset_raw = load_dataset("imdb", split="train")
test_dataset_raw = load_dataset("imdb", split="test")

  from .autonotebook import tqdm as notebook_tqdm


Check the size of the downloaded data.

In [5]:
print("Train dataset size: ", len(list(train_dataset_raw)))
print("Test dataset size: ", len(list(test_dataset_raw)))

Train dataset size:  25000
Test dataset size:  25000


In [6]:
train_dataset_raw_processed = []
for el in train_dataset_raw:
    y = "pos" if el["label"] == 1 else "neg"
    train_dataset_raw_processed.append((y, el["text"]))
test_dataset_raw_processed = []
for el in test_dataset_raw:
    y = "pos" if el["label"] == 1 else "neg"
    test_dataset_raw_processed.append((y, el["text"]))

### Split train data further into train and validation set

Both train and test datasets have 25000 reviews. Therefore, we can split the training set further into the train and validation sets.

In [7]:
train_set_size = 20000
valid_set_size = 5000

train_dataset, valid_dataset = random_split(list(train_dataset_raw_processed), [20000, 5000])

In [8]:
len(train_dataset)

20000

In [9]:
import re
def tokenizer(text):
    # step 1. remove HTML tags. they are not helpful in understanding the sentiments of a review
    # step 2: use lowercase for all text to keep symmetry
    # step 3: extract emoticons. keep them as they are important sentiment signals
    # step 4: remove punctuation marks
    # step 5: put back emoticons
    # step 6: generate word tokens
    text = re.sub("<[^>]*>", "", text)
    text = text.lower()
    emoticons = re.findall("(?::|;|=)(?:-)?(?:\)|\(|D|P)", text)
    text = re.sub("[\W]+", " ", text)
    text = text + " ".join(emoticons).replace("-", "")
    tokenized = text.split()
    return tokenized

  emoticons = re.findall("(?::|;|=)(?:-)?(?:\)|\(|D|P)", text)
  text = re.sub("[\W]+", " ", text)


In [10]:
##
# step 1: convert reviews into tokens
# step 2: find frequency of tokens

from collections import Counter

token_counts = Counter()

for label, line in train_dataset:
    tokens = tokenizer(line)
    token_counts.update(tokens)
 
print('IMDB vocab size:', len(token_counts))

IMDB vocab size: 69023


In [11]:
##
# step 3: sort the token based on their frequency
# step 4: put the sorted tokens in OrderedDict
# step 5: convert token to integers using vocab object

from collections import OrderedDict
from torchtext.vocab import vocab

sorted_by_freq_tuples = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)

vb = vocab(ordered_dict)

vb.insert_token("<pad>", 0)  # special token for padding
vb.insert_token("<unk>", 1)  # special token for unknown words
vb.set_default_index(1)

# print some token indexes from vocab
for token in ["this", "is", "an", "example"]:
    print(token, " --> ", vb[token])

this  -->  11
is  -->  7
an  -->  35
example  -->  457




In [12]:
##
# inline lambda functions for text and label precessing
text_pipeline = lambda x: [vb[token] for token in tokenizer(x)]
label_pipeline = lambda x: 1.0 if x == "pos" else 0.0

In [13]:
##
# setting device on GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


In [14]:
##
# a function to apply pre-processing steps at a batch level
import torch.nn as nn

def collate_batch(batch):
    label_list, text_list, lengths = [], [], []

    # iterate over all reviews in a batch
    for _label, _text in batch:
        # label preprocessing
        # print(f"label is {_label}")
        label_list.append(label_pipeline(_label))
        # text preprocessing
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)

        # store the processed text in a list
        text_list.append(processed_text)
        
        # store the length of processed text
        # this will come handy in future when we want to know the original size of a text (without padding)
        lengths.append(processed_text.size(0))
    
    label_list = torch.tensor(label_list)
    lengths = torch.tensor(lengths)
    
    # pad the processed reviews to make their lengths consistant
    padded_text_list = nn.utils.rnn.pad_sequence(
        text_list, batch_first=True)
    
    # return
    # 1. a list of processed and padded review texts
    # 2. a list of processed labels
    # 3. a list of review text original lengths (before padding)
    return padded_text_list.to(device), label_list.to(device), lengths.to(device)

## Batching the training, validation, and test dataset

Let's proceed on creating DataLoaders for train, valid, and test data with `batch_size = 32`

In [15]:
from torch.utils.data import DataLoader
batch_size = 32

train_dl = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch
)
valid_dl = DataLoader(
    valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch
)
test_dl = DataLoader(
    test_dataset_raw_processed, batch_size=batch_size, shuffle=False, collate_fn=collate_batch
)

## Define model training and evaluation pipelines
I have defined two simple functions to train and evaluate the model in this section.

In [16]:
##
# model training pipeline
# https://github.com/rasbt/machine-learning-book/blob/main/ch15/ch15_part2.ipynb
def train(dataloader):
    model.train()
    total_acc, total_loss = 0, 0
    for text_batch, label_batch, lengths in dataloader:
        optimizer.zero_grad()
        pred = model(text_batch, lengths)[:, 0]
        loss = loss_fn(pred, label_batch)
        loss.backward()
        optimizer.step()
        total_acc += ((pred >= 0.5).float() == label_batch).float().sum().item()
        total_loss += loss.item() * label_batch.size(0)
    return total_acc / len(dataloader.dataset), total_loss / len(dataloader.dataset)


# model evaluation pipeline
def evaluate(dataloader):
    model.eval()
    total_acc, total_loss = 0, 0
    with torch.no_grad():
        for text_batch, label_batch, lengths in dataloader:
            pred = model(text_batch, lengths)[:, 0]
            loss = loss_fn(pred, label_batch)
            total_acc += ((pred >= 0.5).float() == label_batch).float().sum().item()
            total_loss += loss.item() * label_batch.size(0)
    return total_acc / len(dataloader.dataset), total_loss / len(dataloader.dataset)

## RNN model configuration, loss function, and optimizer
We have seen the review text, which can be long sequences. We will use the LSTM layer for capturing the long-term dependencies. Our sentiment analysis model is composed of the following layers

* Start with an **Embedding layer**. Placing the embedding layer is similar to one-hot-encoding, where each word token is converted to a separate feature (or vector or column). But this can lead to too many features (curse of dimensionality or dimensional explosion). To avoid this, we try to map tokens to fixed-size vectors (or columns). In such a feature matrix, different elements denote different tokens. Tokens that are closed are also placed together. Further, during training, we also learn and update the positioning of tokens. Similar tokens are placed into closer and closer locations. Such a matrix layer is termed an embedding layer.
* After the embedding layer, there is the RNN layer (LSTM to be specific).
* Then we have a fully connected layer followed by activation and another fully connected layer.
* Finally, we have a logistic sigmoid layer for prediction

In [17]:
##
# https://github.com/rasbt/machine-learning-book/blob/main/ch15/ch15_part2.ipynb
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, batch_first=True)
        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(
            out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True
        )
        out, (hidden, cell) = self.rnn(out)
        out = hidden[-1, :, :]
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

In [18]:
vocab_size = len(vb)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64

torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size)
model = model.to(device)

### Define model loss function and optimizer
For loss function (or criterion), I have used [Binary Cross Entropy](https://pytorch.org/docs/stable/generated/torch.nn.BCELoss.html), and for loss optimization, I have used [Adam algorithm](https://pytorch.org/docs/stable/generated/torch.optim.Adam.html)

In [19]:
torch.manual_seed(1)

loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

## Model training and evaluation
Let's run the pipeline for ten epochs and compare the training and validation accuracy.

In [20]:
num_epochs = 10
for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dl)
    acc_valid, loss_valid = evaluate(valid_dl)
    print(
        f"Epoch {epoch} train accuracy: {acc_train:.4f}; val accuracy: {acc_valid:.4f}"
    )

Epoch 0 train accuracy: 0.6096; val accuracy: 0.6852
Epoch 1 train accuracy: 0.7257; val accuracy: 0.7452
Epoch 2 train accuracy: 0.7466; val accuracy: 0.6284
Epoch 3 train accuracy: 0.7253; val accuracy: 0.5366
Epoch 4 train accuracy: 0.7972; val accuracy: 0.7492
Epoch 5 train accuracy: 0.8619; val accuracy: 0.7784
Epoch 6 train accuracy: 0.8911; val accuracy: 0.8040
Epoch 7 train accuracy: 0.9162; val accuracy: 0.8574
Epoch 8 train accuracy: 0.9328; val accuracy: 0.8598
Epoch 9 train accuracy: 0.9504; val accuracy: 0.8634


### Evaluate sentiments on random texts
Let's create another helper method to evaluate sentiments on random texts.

In [21]:
def classify_review(text):
    text_list, lengths = [], []

    # process review text with text_pipeline
    # note: "text_pipeline" has dependency on data vocabulary
    processed_text = torch.tensor(text_pipeline(text), dtype=torch.int64)
    text_list.append(processed_text)

    # get processed review tokens length
    lengths.append(processed_text.size(0))
    lengths = torch.tensor(lengths)
        
    # change the dimensions from (torch.Size([8]), torch.Size([1, 8]))
    # nn.utils.rnn.pad_sequence(text_list, batch_first=True) does this too
    padded_text_list = torch.unsqueeze(processed_text, 0)

    # move tensors to correct device
    padded_text_list = padded_text_list.to(device)
    lengths = lengths.to(device)

    # get prediction
    model.eval()
    pred = model(padded_text_list, lengths)
    print("model pred: ", pred)

    # positive or negative review
    review_class = 'negative' # else case
    if (pred>=0.5) == 1:
        review_class = "positive"

    print("review type: ", review_class)

In [22]:
##
# create two random texts with strong positive and negative sentiments
pos_review = 'i love this movie. it was so good.'
neg_review = 'slow and boring. waste of time.'

In [23]:
classify_review(pos_review)

model pred:  tensor([[0.8522]], device='cuda:0', grad_fn=<SigmoidBackward0>)
review type:  positive


In [24]:
classify_review(neg_review)

model pred:  tensor([[0.0029]], device='cuda:0', grad_fn=<SigmoidBackward0>)
review type:  negative
