## Upgraded Sentiment Analysis

### 1. Preparing Data

In [1]:
import torch
import torchtext
from torchtext import datasets
from torch.utils.data import  DataLoader
from torchtext.data import utils
from torchtext import vocab
from torchtext.data import functional
from torch.utils.data.dataset import random_split
from torch.nn.utils.rnn import pad_sequence
from torch.optim.lr_scheduler import StepLR
import torch.nn as nn
from tqdm import tqdm
import functools
import sys

#### Build Vocabulary

In [2]:
# Load Dataset
train_iter, test_iter = datasets.IMDB()

tokenizer = utils.get_tokenizer("basic_english")

def yield_tokens(text_iter):
    for _, text in text_iter:
        yield tokenizer(text)
        
special_tokens = ["<unk>", "<pad>"]
        
vocabulary = vocab.build_vocab_from_iterator(yield_tokens(train_iter),
                                            min_freq=1,
                                            specials=special_tokens)
vocabulary.set_default_index(vocabulary["<unk>"])

#### Build Dataset and Vocabulary

In [3]:
text_pipeline = lambda x : vocabulary(tokenizer(x))
label_pipeline = lambda x: 0. if x=='neg' else 1.
BATCH_SIZE = 1000

# Load Dataset
train_iter, test_iter = datasets.IMDB()

train_dataset, test_dataset = functional.to_map_style_dataset(train_iter), \
                                functional.to_map_style_dataset(test_iter)

train_dataset = train_dataset[:20000]
test_dataset = test_dataset[:20000]

num_test = int(len(test_dataset)*0.90)
split_test, split_valid = random_split(test_dataset, [num_test, len(test_dataset)-num_test])

def collate_batch(batch, pad_index):
    label_list, text_list, lengths = [], [], []
    for (label, text) in batch:
        processed_text = torch.tensor(text_pipeline(text), dtype=torch.int64)
        lengths.append(len(processed_text))
        text_list.append(processed_text)
        label_list.append(label_pipeline(label))
    seq_list = pad_sequence(text_list, batch_first=True, padding_value=pad_index)
    label_seq = torch.unsqueeze(torch.tensor(label_list), 1)
    return seq_list, label_seq, lengths

pad_index = vocabulary["<pad>"]
collate_batch = functools.partial(collate_batch, pad_index=pad_index)

train_loader = DataLoader(dataset=train_dataset, 
                         batch_size=BATCH_SIZE,
                         shuffle=True,
                         collate_fn=collate_batch,)
valid_loader = DataLoader(dataset=split_valid, 
                         batch_size=BATCH_SIZE,
                         shuffle=False,
                         collate_fn=collate_batch,)
test_loader = DataLoader(dataset=split_test, 
                         batch_size=BATCH_SIZE,
                         shuffle=False,
                         collate_fn=collate_batch,)

### 2. Define Model

#### Different RNN Architecture
We'll be using a different RNN architecture called a Long Short-Term Memory (LSTM). Why is an LSTM better than a standard RNN? Standard RNNs suffer from the vanishing gradient problem. LSTMs overcome this by having an extra recurrent state called a cell, $c$ - which can be thought of as the "memory" of the LSTM - and the use use multiple gates which control the flow of information into and out of the memory. For more information, go here. We can simply think of the LSTM as a function of $x_t$, $h_t$ and $c_t$, instead of just $x_t$ and $h_t$.

$$(h_t, c_t) = \text{LSTM}(x_t, h_t, c_t)$$
Thus, the model using an LSTM looks something like (with the embedding layers omitted):

<img src="img/lstm.png"/>

The initial cell state, $c_0$, like the initial hidden state is initialized to a tensor of all zeros. The sentiment prediction is still, however, only made using the final hidden state, not the final cell state, i.e. $\hat{y}=f(h_T)$.

#### Bidirectional RNN
The concept behind a bidirectional RNN is simple. As well as having an RNN processing the words in the sentence from the first to the last (a forward RNN), we have a second RNN processing the words in the sentence from the last to the first (a backward RNN). At time step $t$, the forward RNN is processing word $x_t$, and the backward RNN is processing word $x_{T-t+1}$.

In PyTorch, the hidden state (and cell state) tensors returned by the forward and backward RNNs are stacked on top of each other in a single tensor.

We make our sentiment prediction using a concatenation of the last hidden state from the forward RNN (obtained from final word of the sentence), $h_T^\rightarrow$, and the last hidden state from the backward RNN (obtained from the first word of the sentence), $h_T^\leftarrow$, i.e. $\hat{y}=f(h_T^\rightarrow, h_T^\leftarrow)$

The image below shows a bi-directional RNN, with the forward RNN in orange, the backward RNN in green and the linear layer in silver.

<img src="img/bidirec.png"/>

#### Mulit-layer RNN
Multi-layer RNNs (also called deep RNNs) are another simple concept. The idea is that we add additional RNNs on top of the initial standard RNN, where each RNN added is another layer. The hidden state output by the first (bottom) RNN at time-step $t$ will be the input to the RNN above it at time step $t$. The prediction is then made from the final hidden state of the final (highest) layer.

The image below shows a multi-layer unidirectional RNN, where the layer number is given as a superscript. Also note that each layer needs their own initial hidden state, $h_0^L$.

<img src="img/multilayer.png"/>

#### Regularization
Although we've added improvements to our model, each one adds additional parameters. Without going into overfitting into too much detail, the more parameters you have in in your model, the higher the probability that your model will overfit (memorize the training data, causing a low training error but high validation/testing error, i.e. poor generalization to new, unseen examples). To combat this, we use regularization. More specifically, we use a method of regularization called dropout. Dropout works by randomly dropping out (setting to 0) neurons in a layer during a forward pass. The probability that each neuron is dropped out is set by a hyperparameter and each neuron with dropout applied is considered indepenently. One theory about why dropout works is that a model with parameters dropped out can be seen as a "weaker" (less parameters) model. The predictions from all these "weaker" models (one for each forward pass) get averaged together withinin the parameters of the model. Thus, your one model can be thought of as an ensemble of weaker models, none of which are over-parameterized and thus should not overfit.

#### Implementation Details
Another addition to this model is that we are not going to learn the embedding for the <pad> token. This is because we want to explitictly tell our model that padding tokens are irrelevant to determining the sentiment of a sentence. This means the embedding for the pad token will remain at what it is initialized to (we initialize it to all zeros later). We do this by passing the index of our pad token as the padding_idx argument to the nn.Embedding layer.

To use an LSTM instead of the standard RNN, we use nn.LSTM instead of nn.RNN. Also, note that the LSTM returns the output and a tuple of the final hidden state and the final cell state, whereas the standard RNN only returned the output and final hidden state.

As the final hidden state of our LSTM has both a forward and a backward component, which will be concatenated together, the size of the input to the nn.Linear layer is twice that of the hidden dimension size.

Implementing bidirectionality and adding additional layers are done by passing values for the num_layers and bidirectional arguments for the RNN/LSTM.

Dropout is implemented by initializing an nn.Dropout layer (the argument is the probability of dropping out each neuron) and using it within the forward method after each layer we want to apply dropout to. Note: never use dropout on the input or output layers (text or fc in this case), you only ever want to use dropout on intermediate layers. The LSTM has a dropout argument which adds dropout on the connections between hidden states in one layer to hidden states in the next layer.

As we are passing the lengths of our sentences to be able to use packed padded sequences, we have to add a second argument, text_lengths, to forward.

Before we pass our embeddings to the RNN, we need to pack them, which we do with nn.utils.rnn.packed_padded_sequence. This will cause our RNN to only process the non-padded elements of our sequence. The RNN will then return packed_output (a packed sequence) as well as the hidden and cell states (both of which are tensors). Without packed padded sequences, hidden and cell are tensors from the last element in the sequence, which will most probably be a pad token, however when using packed padded sequences they are both from the last non-padded element in the sequence. Note that the lengths argument of packed_padded_sequence must be a CPU tensor so we explicitly make it one by using .to('cpu').

We then unpack the output sequence, with nn.utils.rnn.pad_packed_sequence, to transform it from a packed sequence to a tensor. The elements of output from padding tokens will be zero tensors (tensors where every element is zero). Usually, we only have to unpack output if we are going to use it later on in the model. Although we aren't in this case, we still unpack the sequence just to show how it is done.

The final hidden state, hidden, has a shape of [num layers * num directions, batch size, hid dim]. These are ordered: [forward_layer_0, backward_layer_0, forward_layer_1, backward_layer 1, ..., forward_layer_n, backward_layer n]. As we want the final (top) layer forward and backward hidden states, we get the top two hidden layers from the first dimension, hidden[-2,:,:] and hidden[-1,:,:], and concatenate them together before passing them to the linear layer (after applying dropout).

In [11]:
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, 
                 embed_dim, 
                 hidden_dim,
                 output_dim, 
                 n_layers,
                 bidirectional,
                 dropout,
                 pad_index):
        super(TextClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_index)
        self.dropout = nn.Dropout(dropout)
        self.rnn = nn.GRU(embed_dim,
                          hidden_dim,
                          num_layers=n_layers,
                          bidirectional=bidirectional,
                          dropout=dropout)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self,X, x_lengths):
        embedded = self.embedding(X)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, x_lengths, 
                                                            batch_first=True, enforce_sorted=False)
        output, (hidden, cell) = self.rnn(packed_embedded)
        dropped = self.dropout(hidden[-1, :, :])
        out = self.fc(dropped)
        return self.sigmoid(out)

### 3. Build and Train Model

#### Define Hyperparamters

In [12]:
vocab_size = len(vocabulary)
EMBEDDING_DIM = 50
HIDDEN_DIM = 50
OUTPUT_DIM = 1
N_LAYERS = 1
BIDIRECTIONAL = False
DROPOUT = 0.5
pad_index = vocabulary["<pad>"]

classifier = TextClassifier(vocab_size,
                           embed_dim=EMBEDDING_DIM,
                           hidden_dim=HIDDEN_DIM,
                           output_dim=OUTPUT_DIM,
                           n_layers=N_LAYERS,
                           bidirectional=BIDIRECTIONAL,
                           dropout=DROPOUT,
                           pad_index=pad_index)

#### Count Trainable Parameters

In [13]:
def count_parameters(model):
    return sum(p.numel() for p in classifier.parameters() if p.requires_grad)

print(f'The model has {count_parameters(classifier):,} trainable parameters')

The model has 5,054,651 trainable parameters


#### Use Pretrained Embedding Vectors

In [14]:
vectors = torchtext.vocab.GloVe(name='6B', dim=50)
pretrained_embedding = vectors.get_vecs_by_tokens(vocabulary.get_itos())
classifier.embedding.weight.data = pretrained_embedding
classifier.embedding.weight.requires_grad = False

In [15]:
def count_parameters(model):
    return sum(p.numel() for p in classifier.parameters() if p.requires_grad)

print(f'The model has {count_parameters(classifier):,} trainable parameters')

The model has 20,451 trainable parameters


#### Train Model

In [16]:
def train(dataloader, model):
    model.train()
    for texts, labels, lengths in tqdm(dataloader, desc='training...', file=sys.stdout):
        optimizer.zero_grad()
        outputs = model(texts, lengths)
        #outputs = outputs.reshape(-1)
        loss = criterion(outputs, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()
        
def evaluate(dataloader, model):
    model.eval()
    n_samples, n_accurates = 0, 0
    with torch.no_grad():
        for texts, labels, lengths in dataloader:
            outputs = model(texts, lengths)
            #outputs = outputs.reshape(-1)
            n_samples += labels.size(0)
            n_accurates += (torch.round(outputs)==labels).sum().item()
    return n_accurates/n_samples

In [17]:
N_EPOCHS = 1
LR = 0.05

# Criterion, Optimizer, learning rate scheduler
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(classifier.parameters(), lr=LR)
scheduler = StepLR(optimizer, step_size=1, gamma=0.5)

for epoch in range(1, N_EPOCHS+1):
    train(train_loader, classifier)
    accu_train = evaluate(train_loader, classifier)
    accu_val = evaluate(valid_loader, classifier)
    scheduler.step()
    print(f"| Epoch: {epoch}/{N_EPOCHS} | train_accuracy: {accu_train: .3f} | val_accuracy :  {accu_val: .3f}")
    

# Test with test set
accu_test = evaluate(test_loader, classifier)
print('='*60)
print(f"Test Accuracy: {accu_test: .3f}")

training...: 100%|██████████████████████████████████████████████████████████████████| 20/20 [1:42:21<00:00, 307.07s/it]
| Epoch: 1/1 | train_accuracy:  0.637 | val_accuracy :   0.643
Test Accuracy:  0.634


In [18]:
# torch.save(classifier, "text_classifier_SA_pretrained_lstm.pth")

def predict_sentiment(text, model, tokenizer, vocab):
    tokens = tokenizer(text)
    txt_length = torch.tensor([len(tokens)])
    ids = [vocab[t] for t in tokens]
    tensor = torch.LongTensor(ids).unsqueeze(dim=0)
    prediction = model(tensor, txt_length)
    prediction = torch.round(prediction).item()
    predicted_polarity = "pos" if prediction==1 else "neg"
    return predicted_polarity

In [19]:
text = "This film is great!"

predict_sentiment(text, classifier, tokenizer, vocabulary)

'neg'

In [20]:
text = "This film is not great, it's terrible!"

predict_sentiment(text, classifier, tokenizer, vocabulary)

'neg'