# Sentiment Analysis using RNNs

This notebook addresses the task of sentiment analysis on text data. We use the [IMDB movie review dataset](https://ai.stanford.edu/~amaas/data/sentiment/). In this dataset, the task is to classify the sentiment of IMDB movie review comments as positive or negative. There are 25k labeled training and 25k labeled test examples in the initial data. For feasibility as an I2DL exercise, very long reviews (e.g. longer than 150 words) should be ignored.

This notebook uses PyTorch's own LSTM and Embedding implementations. For the exercise, students should be asked to implement these layers themselves.

## Setup

In [None]:
import os
import re
import random
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils import clip_grad_norm_
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence

## Data Download
The data is publicly available, and this notebook uses the provided train and test splits. For the exercise we should create our own splits. Otherwise one can overfit to the test set, whose labels are available.

In [None]:
# The "!" prefix runs each line as a shell command from the notebook.
# Download the IMDB review archive and unpack it into the working directory;
# the loader functions below read the reviews from ./aclImdb/.
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xvf aclImdb_v1.tar.gz

## Data Preparation
Preprocessing text data is tricky. There are two important steps we have to take care of:

1. As we use a word-based language model, we need to <b>tokenize</b> each text into a list of words. Tokenization is relatively easy in English, but can be tricky in some other languages.

2. We need to assign an integer index as an id to each word. The mapping that contains word -> id is often called <b>vocabulary</b>.

The helper functions below load and tokenize the data while creating the vocabulary.

In [None]:
def load_train_data(max_vocab_size=5000, max_len=250):
    """Load the IMDB training split, tokenize it and build the vocabulary.

    Reviews are read from ``./aclImdb/train/pos`` and ``./aclImdb/train/neg``.
    Each review is split on runs of non-word characters and lower-cased.
    Reviews with more than ``max_len`` tokens are skipped and do not
    contribute to the word frequencies.

    Args:
        max_vocab_size: Maximum number of regular words that get their own
            id (the two special tokens come on top of this). Infrequent
            words can add noise to training, so only the most common words
            are kept; every other word is mapped to ``<unk>``.
        max_len: Maximum number of tokens a review may have to be kept.

    Returns:
        A tuple ``(word2id, id2word, data)``. ``word2id`` maps word ->
        integer id, ``id2word`` is the inverse mapping (not needed for
        classification, but useful e.g. for text generation), and ``data``
        is a list of ``(raw_text, word_ids, label)`` tuples where label is
        1 for positive and 0 for negative reviews. The same ``word2id``
        must be used for the test data.
    """
    base_dir = './aclImdb/train/'
    samples = []  # (raw_text, word_list, label) before id conversion

    # 1. Collect and tokenize all reviews while counting word frequencies
    # on the fly.
    word_freqs = Counter()
    for label in ('pos', 'neg'):
        label_dir = os.path.join(base_dir, label)
        # os.listdir returns entries in arbitrary order. Sorting makes the
        # sample order, and the tie-breaking between equally frequent
        # words, reproducible across runs and platforms.
        for text_file in sorted(os.listdir(label_dir)):
            # Read explicitly as UTF-8 so the result does not depend on the
            # platform's default encoding.
            path = os.path.join(label_dir, text_file)
            with open(path, encoding='utf-8') as f:
                text = f.read().strip()
            # Words without special characters, in lower case.
            words = [s.lower() for s in re.split(r'\W+', text) if s]
            if len(words) > max_len:  # Ignore very long sequences
                continue
            word_freqs.update(words)
            samples.append((text, words, 1 if label == 'pos' else 0))

    # 2. Create the "vocabulary", a dictionary word -> integer id.
    # There are two special words:
    #   <eos>: end of sequence, used for padding
    #   <unk>: unknown, used for infrequent words
    word2id = {'<eos>': 0, '<unk>': 1}
    # most_common(n) already returns every word when n exceeds the number
    # of distinct words.
    for word, _ in word_freqs.most_common(max_vocab_size):
        word2id[word] = len(word2id)

    # 3. Store the inverse vocabulary (id -> word).
    id2word = {idx: word for word, idx in word2id.items()}

    # 4. Replace words with their integer ids.
    unk_id = word2id['<unk>']
    data = [
        (text, [word2id.get(word, unk_id) for word in words], label)
        for text, words, label in samples
    ]
    return word2id, id2word, data


def load_test_data(word2id, max_len=250):
    """Load and tokenize the IMDB test split with a given vocabulary.

    Follows the same logic as the training loader: reviews are read from
    ``./aclImdb/test/pos`` and ``./aclImdb/test/neg``, split on runs of
    non-word characters and lower-cased. Reviews with more than ``max_len``
    tokens are skipped.

    Args:
        word2id: Vocabulary (word -> integer id) built from the training
            data. It must contain the ``'<unk>'`` key, which is used for
            every word that is not in the vocabulary.
        max_len: Maximum number of tokens a review may have to be kept.

    Returns:
        A list of ``(raw_text, word_ids, label)`` tuples where label is 1
        for positive and 0 for negative reviews.
    """
    base_dir = './aclImdb/test/'
    unk_id = word2id['<unk>']
    data = []

    for label in ('pos', 'neg'):
        label_dir = os.path.join(base_dir, label)
        # os.listdir returns entries in arbitrary order; sort so the sample
        # order is reproducible across runs and platforms.
        for text_file in sorted(os.listdir(label_dir)):
            # Read explicitly as UTF-8 so the result does not depend on the
            # platform's default encoding.
            path = os.path.join(label_dir, text_file)
            with open(path, encoding='utf-8') as f:
                text = f.read().strip()
            words = [s.lower() for s in re.split(r'\W+', text) if s]
            if len(words) > max_len:  # Ignore very long sequences
                continue
            word_ids = [word2id.get(word, unk_id) for word in words]
            data.append((text, word_ids, 1 if label == 'pos' else 0))

    return data

We can now wrap the above functions in a PyTorch Dataset class.

In [None]:
class IMDB(Dataset):
    """IMDB sentiment dataset, stored longest review first.

    With ``is_train=True`` the training reviews are loaded and the
    vocabulary is created and exposed as ``self.w2i``. With
    ``is_train=False`` the test reviews are loaded using the vocabulary
    passed in as ``w2i`` (which must be the training vocabulary).

    Each item is a dict with:
        'data':  1-D long tensor holding the word ids of one review
        'label': scalar float tensor, 1 for positive and 0 for negative
    """

    def __init__(self, is_train=True, w2i=None, max_len=150):
        # NOTE: reviews longer than max_len are skipped by the loaders, so
        # a larger max_len means more data (and, per the original author's
        # note, a better test accuracy).
        self.max_len = max_len
        self.is_train = is_train

        if not is_train:
            assert w2i, 'You must provide the training w2i for the test data!'
            samples = load_test_data(w2i, max_len=max_len)
        else:
            assert not w2i, 'Provided vocabulary is not supported in training'
            # The inverse vocabulary (id -> word) is ignored for now.
            w2i, _, samples = load_train_data(max_len=max_len)

        self.w2i = w2i

        # Batching supports sequences of different lengths, but grouping
        # similar lengths still matters: otherwise a lot of padding wastes
        # computational resources.
        #
        # Descending order puts the longest sequences first, so an
        # out-of-memory error caused by a too large batch size shows up
        # right at the beginning of training.
        samples.sort(key=lambda sample: len(sample[1]), reverse=True)
        self.data = samples

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        _raw_text, word_ids, target = self.data[index]
        item = {}
        item['data'] = torch.tensor(word_ids, dtype=torch.long)
        item['label'] = torch.tensor(target, dtype=torch.float32)
        return item


Now we can load the data. The cell below loads the entire dataset into RAM, so it may take a while.

In [None]:
# The loaders skip every review with more than max_len words, so a larger
# max_len increases the training data size and, per the original author's
# note, the test accuracy. Picking a small max_len is important for those
# who use a CPU.
max_len = 150

# This will create the vocab (w2i) and store it as a member variable
train_dataset = IMDB(is_train=True, max_len=max_len)

# When the goal is submitting formal results, there shouldn't be a
# max_len on test/val data even if there is one on training data.
# Here we make this exception because otherwise things become too slow.
# The test data reuses the vocabulary created from the training data.
test_dataset = IMDB(is_train=False, w2i=train_dataset.w2i, max_len=max_len)

# Compare the output with the original 25k reviews in each split
print(len(train_dataset), len(test_dataset))

## Minibatching
Now we come to the hardest part of dealing with text data :D! Unlike in images, sequences may have different lengths. Therefore, it is necessary to pad sentences to make them the same size. 

To be able to use PyTorch's standard DataLoader class, we have to define a custom collate function (see [here](https://pytorch.org/docs/stable/data.html) for its use). We make use of PyTorch's [pad_sequence](https://pytorch.org/docs/stable/generated/torch.nn.utils.rnn.pad_sequence.html) function. By default, sequences are padded with 0, which is the id of the \<eos> word.

Apart from data and labels, note that we also keep track of sequence lengths. The lengths will be used by the model to ignore padded elements during back-propagation.

In [None]:
def collate(batch):
    """Merge a list of dataset items into one padded mini-batch.

    The items are ordered from longest to shortest sequence before padding.
    ``pack_padded_sequence`` expects that order by default, so the batch is
    valid no matter in which order the sampler delivered the items (e.g.
    with ``shuffle=True``). The sort is stable, so a batch that already
    arrives in decreasing length order is left unchanged.

    Args:
        batch: List of dicts with a 1-D ``'data'`` tensor of word ids and a
            scalar ``'label'`` tensor.

    Returns:
        A dict with:
            'data':    T x B tensor of word ids, padded with 0 (<eos>),
                       where T is the longest sequence length in the batch
            'label':   B x 1 tensor of labels
            'lengths': 1-D tensor with the unpadded length of each sequence
        All three follow the same (length-sorted) item order.
    """
    assert isinstance(batch, list)
    ordered = sorted(batch, key=lambda b: len(b['data']), reverse=True)
    sequences = [b['data'] for b in ordered]
    return {
        'data': pad_sequence(sequences),
        'label': torch.stack([b['label'] for b in ordered]).view(-1, 1),
        'lengths': torch.tensor([len(seq) for seq in sequences])
    }


## Model Creation

### Model Components
Our text classifier model consists of three stages:

1. <b>Embedding layer</b> converts integer indices to dense vectors. It is nothing but a randomly initialized matrix. Each row (or column, depending on the implementation) of it corresponds to a word. The embedding can also be initialized with transfer learning (using Word2Vec, GloVe, etc.), but we don't do it here.

2. <b>RNN</b> processes the embeddings. In this implementation, the last hidden state of the RNN is used for classification. Alternatively, hidden state history could be used via some sort of attention.

3. <b>Output layer</b> is an MLP (could be a Linear model) that produces probabilities from the last hidden state.

### Minibatching
The pack_padded_sequence function is used to process padded batches in PyTorch. However, it is a complex mechanism related to the cuDNN backend. When students are implementing the padding stuff, they should be asked to use the padded representation and select the last hidden state using a loop at the end.

### Possible Enhancements
Bidirectional RNNs, Dropouts, and multiple layers are possible model enhancements. However, except for dropout of embedding outputs and MLP input/hidden layers, enhancements can be too hard for an exercise.


In [None]:
# Define the model
class TextClassifier(nn.Module):
    """Binary text classifier: embedding -> single-layer LSTM -> MLP.

    The last hidden state of the LSTM is fed to a small MLP whose sigmoid
    output is the probability of the positive class.

    Args:
        vocab_size: Number of rows of the embedding matrix (one per word id).
        embedding_dim: Size of each word embedding.
        hidden_size: Size of the LSTM hidden state and of the MLP hidden
            layer.
    """

    # TODO: num_layers, bidirectional, dropout?
    def __init__(self, vocab_size, embedding_dim, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_size)
        self.output = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1),
            nn.Sigmoid()
        )

    def forward(self, sequence, lengths=None):
        """Return the positive-class probability for each sequence.

        Args:
            sequence: T x N tensor of word ids (time-major).
            lengths: Optional unpadded length of each of the N sequences
                (tensor or list). When given, the padded positions are
                skipped by the LSTM. The lengths may be in any order.

        Returns:
            N x 1 tensor of probabilities.
        """
        embeddings = self.embedding(sequence)
        if lengths is not None:
            # pack_padded_sequence needs the lengths on the CPU.
            lengths = torch.as_tensor(lengths).cpu()
            # enforce_sorted=False lets PyTorch sort the batch internally
            # and return the hidden states in the original order, so
            # callers need not pre-sort batches by length.
            embeddings = pack_padded_sequence(
                embeddings, lengths, enforce_sorted=False
            )

        last_state = self.rnn(embeddings)[-1]
        if isinstance(last_state, tuple):  # includes cell state
            last_state = last_state[0]

        # NOTE: Below code should change for bidirectional + multi-layer
        last_state = last_state.squeeze(0)  # N x D

        return self.output(last_state)

## Training
It is just a regular PyTorch training loop. Note the gradient clipping, which avoids the exploding gradients problem, and the collate_fn argument used for batching.

In [None]:
# Training configs
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using {}...\n'.format(device))

epochs = 5
model = TextClassifier(len(train_dataset.w2i), 64, 64).to(device)
optim = torch.optim.Adam(model.parameters())
gclip = 20  # max gradient norm, guards against exploding gradients


def _length_sorted_collate(batch):
    """Collate a batch after ordering it from longest to shortest sequence.

    Shuffled batches arrive in arbitrary length order, while the model packs
    the padded batch and expects decreasing lengths.
    """
    return collate(sorted(batch, key=lambda b: len(b['data']), reverse=True))


# The dataset is stored sorted by length, with the positive reviews loaded
# before the negative ones. Without shuffling, every epoch sees the same
# batch order and most mini-batches contain a single label, which hurts
# optimization. Shuffle the training data and restore the length order
# inside each batch instead.
train_loader = DataLoader(
    train_dataset, batch_size=4, collate_fn=_length_sorted_collate,
    shuffle=True, drop_last=True
)
# Evaluation does not need shuffling; the length-sorted order keeps the
# sequences within a batch similar in length.
test_loader = DataLoader(
    test_dataset, batch_size=4, collate_fn=collate, drop_last=False
)

# Training loop
for e in range(epochs):
    print('Epoch {}...'.format(e))
    model.train()
    num_corrects = 0
    num_labels = 0
    total_loss = 0.0
    for i, data in enumerate(train_loader):
        seq = data['data'].to(device)
        label = data['label'].to(device)
        seq_lens = data['lengths']  # stays on the CPU, as packing requires

        optim.zero_grad()
        pred = model(seq, seq_lens)
        loss = F.binary_cross_entropy(pred, label, reduction='mean')
        loss.backward()
        clip_grad_norm_(model.parameters(), max_norm=gclip)
        optim.step()

        # Running statistics over the epoch so far; the mean batch loss is
        # weighted by the batch size to get a per-example average.
        num_corrects += ((pred > 0.5) == label).sum().item()
        num_labels += label.numel()
        total_loss += loss.item() * label.numel()
        if i % 500 == 0:
            print('Iter: {}, Loss: {}, Accuracy: {}'.format(
                i, total_loss / num_labels, num_corrects / num_labels
            ))

    print('Training loss/accuracy: {}/{}'.format(
        total_loss / num_labels, num_corrects / num_labels
    ))

    print('\nStarting evaluation...')
    model.eval()
    with torch.no_grad():
        num_corrects = 0
        num_labels = 0
        for i, data in enumerate(test_loader):
            seq = data['data'].to(device)
            label = data['label'].to(device)
            seq_lens = data['lengths']

            pred = model(seq, seq_lens) > 0.5

            num_corrects += (pred == label).sum().item()
            num_labels += label.numel()
            if i % 500 == 0:
                print('Iter: {}, Accuracy: {}'.format(
                    i, num_corrects / num_labels
                ))
    print('Accuracy: {}'.format(num_corrects / num_labels))

    print('\n')

In [None]:
# Interactive demo: type a review and get the predicted sentiment.
# Enter "exit" to stop.
w2i = train_dataset.w2i
unk_id = w2i['<unk>']
model.eval()
while True:
    text = input()
    if text == 'exit':
        break

    # Tokenize exactly like the data loaders: split on non-word characters,
    # drop the empty strings that re.split produces at the text boundaries
    # (e.g. after a trailing "!") and lower-case. Keeping the empty strings
    # would feed spurious <unk> tokens to the model.
    tokens = [s.lower() for s in re.split(r'\W+', text) if s]
    if not tokens:
        # Nothing to classify (empty input or punctuation only).
        print('Please enter at least one word.')
        print()
        continue

    words = torch.tensor(
        [w2i.get(token, unk_id) for token in tokens]
    ).long().to(device).view(-1, 1)  # T x B

    # No gradients are needed for inference.
    with torch.no_grad():
        pred = model(words).item()
    sent = pred > 0.5

    print('Sentiment -> {}, Confidence -> {}'.format(
        ':)' if sent else ':(', pred if sent else 1 - pred
    ))
    print()