In [None]:
!pip install torch
!pip install torchdata
!pip install --upgrade torch torchdata
!pip install portalocker
!pip install torchtext==0.10.0

[31mERROR: Could not find a version that satisfies the requirement torchtext==0.10.0 (from versions: 0.1.1, 0.2.0, 0.2.1, 0.2.3, 0.3.1, 0.4.0, 0.5.0, 0.6.0, 0.12.0, 0.13.0, 0.13.1, 0.14.0, 0.14.1, 0.15.1, 0.15.2, 0.16.0, 0.16.1, 0.16.2, 0.17.0, 0.17.1, 0.17.2, 0.18.0)[0m[31m
[0m[31mERROR: No matching distribution found for torchtext==0.10.0[0m[31m
[0m

# Data Processing

First, we import our data.

In [None]:
import torch
import torch.nn as nn
import torchtext

In [None]:
from torchtext.datasets import IMDB

# Load the two IMDB splits (train first, then test)
training_set, testing_set = (
    IMDB(split=split_name) for split_name in ('train', 'test')
)

Next, some basic data preprocessing: we split the original training data into a training set and a validation set.

In [None]:
import portalocker
from torch.utils.data.dataset import random_split

# Materialise the test datapipe as an in-memory list
testing_set = list(testing_set)

# Seed the RNG so the train/validation partition is reproducible
torch.manual_seed(1)

# Hold out 5000 of the 25000 original training reviews for validation,
# which leaves 20000 reviews for training
train_examples = list(training_set)
training_set, validation_set = random_split(train_examples, [20000, 5000])

Now we define a tokenisation function that splits each review into words (so we can find the unique words) and removes unnecessary punctuation.

In [None]:
import re
from collections import Counter, OrderedDict


token_counts = Counter()

# Raw strings avoid invalid-escape warnings for `\W`, `\)` and `\(`.
_HTML_TAG = re.compile(r'<[^>]*>')
_EMOTICON = re.compile(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)')
_NON_WORD = re.compile(r'\W+')


def tokenizer(text):
    """Split a raw review into lowercase word tokens followed by emoticon tokens.

    Steps:
      1. Remove HTML tags such as ``<br />``.
      2. Collect emoticons (eyes ``:``, ``;`` or ``=``, optional ``-`` nose,
         mouth ``)``, ``(``, ``D`` or ``P``); the nose is dropped, so ``:-)``
         becomes ``:)``.
      3. Lowercase the remaining text and replace every run of non-word
         characters with a single space.
      4. Return the words, followed by the emoticons, as a list of strings.

    Args:
        text: the raw review string.

    Returns:
        A list of tokens: words first, emoticons last.
    """
    text = _HTML_TAG.sub('', text)
    # Search the original-case text: the pattern's `D`/`P` mouths are
    # uppercase and could never match once the text had been lowercased.
    emoticons = _EMOTICON.findall(text)
    # Take the emoticons out before word splitting so that `:D` / `:P` do not
    # leave a stray `d` / `p` word behind.
    remainder = _EMOTICON.sub(' ', text)
    words = _NON_WORD.sub(' ', remainder.lower())
    emoticon_text = ' '.join(emoticons).replace('-', '')
    # The explicit separator keeps the final word from being fused with the
    # first emoticon when the text does not end in punctuation
    # (previously "great :) movie" produced the token "movie:)").
    return (words + ' ' + emoticon_text).split()

Now we count the total number of unique words in the training set.

In [None]:
# Accumulate token frequencies over every review in the training split
for label, review in training_set:
    token_counts.update(tokenizer(review))

# Each distinct token is one key of the counter
print('Unique Words:', len(token_counts))

Unique Words: 69023


Now we map each unique word to a unique integer index using torchtext.

In [None]:
from torchtext.vocab import vocab

# Rank (token, count) pairs from most to least frequent
sorted_by_freq_tuples = token_counts.most_common()
ordered_dict = OrderedDict(sorted_by_freq_tuples)

vocab = vocab(ordered_dict)

# Reserve index 0 for the padding token and index 1 for unknown tokens
for reserved_index, special_token in enumerate(("<pad>", "<unk>")):
    vocab.insert_token(special_token, reserved_index)
# Tokens missing from the vocabulary resolve to index 1
vocab.set_default_index(1)

Now we define the pipelines that encode a review's text as a list of integer indices and convert its label into 1.0 (positive review) or 0.0 (negative review).

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def text_pipeline(x):
    """Encode a raw review string as a list of vocabulary indices.

    The string is tokenized with `tokenizer` and each token is looked up
    in `vocab`.
    """
    return [vocab[token] for token in tokenizer(x)]


def label_pipeline(x):
    """Convert a raw IMDB label into a float target.

    Returns 1.0 for a positive review and 0.0 otherwise. Both label
    encodings are accepted: the integer form (1 ~ negative, 2 ~ positive)
    and the string form ('pos' for positive).

    Deciding on the label value itself replaces the earlier torchtext
    version comparison. That comparison relied on `pkg_resources`, which
    setuptools has deprecated, and it would silently map every label to 0.0
    if the installed version fell on the wrong side of the threshold.
    """
    return 1. if x in (2, 'pos') else 0.

Now we wrap the encoder and transformation functions in a collate function that turns a list of samples into a padded batch.

In [None]:
def collate_batch(batch):
    """Turn a list of (label, text) samples into padded tensors on `device`.

    Each label goes through `label_pipeline` and each text through
    `text_pipeline`. The encoded texts are padded to the longest sequence in
    the batch, with the batch dimension first.

    Returns:
        A tuple (padded_texts, labels, lengths), where `lengths` holds the
        unpadded length of every encoded text.
    """
    targets, encoded_texts = [], []
    for raw_label, raw_text in batch:
        targets.append(label_pipeline(raw_label))
        encoded_texts.append(
            torch.tensor(text_pipeline(raw_text), dtype=torch.int64))

    # Record the true lengths before padding hides them
    seq_lengths = torch.tensor([seq.size(0) for seq in encoded_texts])
    targets = torch.tensor(targets)
    padded_texts = nn.utils.rnn.pad_sequence(encoded_texts, batch_first=True)

    return tuple(
        tensor.to(device) for tensor in (padded_texts, targets, seq_lengths))

Now we batch the datasets with data loaders.

In [None]:
from torch.utils.data import DataLoader
# Small unshuffled loader used to pull out one example batch
dataloader = DataLoader(
    training_set,
    batch_size=4,
    shuffle=False,
    collate_fn=collate_batch,
)
first_batch = next(iter(dataloader))
text_batch, label_batch, length_batch = first_batch

In [None]:
batch_size = 32

# Settings shared by all three loaders
loader_kwargs = dict(batch_size=batch_size, collate_fn=collate_batch)

# Only the training loader shuffles its samples
train_dl = DataLoader(training_set, shuffle=True, **loader_kwargs)
valid_dl = DataLoader(validation_set, shuffle=False, **loader_kwargs)
test_dl = DataLoader(testing_set, shuffle=False, **loader_kwargs)


# Toy embedding layer: 10 indices, 3-dimensional vectors, index 0 is padding
embedding = nn.Embedding(10, 3, padding_idx=0)

# a batch of 2 samples of 4 indices each
text_encoded_input = torch.LongTensor([
    [1, 2, 4, 5],
    [4, 3, 2, 0],
])

# Building the RNN

We will build a bidirectional LSTM: the bidirectional recurrent layer processes each input sequence in both the forward and the backward direction.

In [None]:
class RNN(nn.Module):
    """Bidirectional LSTM classifier producing one sigmoid output per sample.

    Pipeline: embedding -> bidirectional LSTM -> linear -> ReLU -> linear
    -> sigmoid.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: size of each embedding vector.
        rnn_hidden_size: hidden size of the LSTM, per direction.
        fc_hidden_size: width of the hidden fully connected layer.
    """

    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        # Index 0 is the padding index of the embedding table
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn = nn.LSTM(
            embed_dim,
            rnn_hidden_size,
            batch_first=True,
            bidirectional=True,
        )
        # Two directions are concatenated, hence twice the hidden size
        self.fc1 = nn.Linear(2 * rnn_hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        """Return sigmoid outputs of shape (batch, 1).

        Args:
            text: padded batch of token indices, batch dimension first.
            lengths: unpadded length of every sequence in `text`.
        """
        embedded = self.embedding(text)
        # Pack the batch so the LSTM skips the padded time steps
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded,
            lengths.cpu().numpy(),
            enforce_sorted=False,
            batch_first=True,
        )
        _, (hidden, cell) = self.rnn(packed)
        # Join the last two entries of the final hidden state
        # (one per direction of the bidirectional LSTM)
        features = torch.cat((hidden[-2], hidden[-1]), dim=1)
        activated = self.relu(self.fc1(features))
        return self.sigmoid(self.fc2(activated))

Next, we define the train and evaluate functions.

In [None]:
def train(dataloader):
    """Train the global `model` for one pass over `dataloader`.

    Uses the global `optimizer` and `loss_fn`. A prediction counts as
    correct when thresholding it at 0.5 reproduces the label.

    Returns:
        A tuple (accuracy, mean loss), both averaged over
        `len(dataloader.dataset)`.
    """
    model.train()
    correct = 0
    loss_sum = 0
    for tokens, targets, seq_lengths in dataloader:
        optimizer.zero_grad()
        probs = model(tokens, seq_lengths)[:, 0]
        batch_loss = loss_fn(probs, targets)
        batch_loss.backward()
        optimizer.step()

        hits = (probs >= 0.5).float() == targets
        correct += hits.float().sum().item()
        # Weight the batch loss by the batch size so the final average
        # is per sample
        loss_sum += batch_loss.item() * targets.size(0)

    num_samples = len(dataloader.dataset)
    return correct / num_samples, loss_sum / num_samples

def evaluate(dataloader):
    """Score the global `model` on `dataloader` without updating it.

    Runs in eval mode with gradients disabled, using the global `loss_fn`.
    A prediction counts as correct when thresholding it at 0.5 reproduces
    the label.

    Returns:
        A tuple (accuracy, mean loss), both averaged over
        `len(dataloader.dataset)`.
    """
    model.eval()
    correct = 0
    loss_sum = 0
    with torch.no_grad():
        for tokens, targets, seq_lengths in dataloader:
            probs = model(tokens, seq_lengths)[:, 0]
            batch_loss = loss_fn(probs, targets)

            hits = (probs >= 0.5).float() == targets
            correct += hits.float().sum().item()
            # Weight the batch loss by the batch size so the final average
            # is per sample
            loss_sum += batch_loss.item() * targets.size(0)

    num_samples = len(dataloader.dataset)
    return correct / num_samples, loss_sum / num_samples

Now to begin running it

In [None]:
# Model hyperparameters
vocab_size = len(vocab)   # one embedding row per vocabulary entry
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64

# Seed before construction so the initial weights are reproducible
torch.manual_seed(1)
model = RNN(
    vocab_size,
    embed_dim,
    rnn_hidden_size,
    fc_hidden_size,
).to(device)

Training it

In [None]:
num_epochs = 20

# Binary cross-entropy on the model's sigmoid outputs, optimised with Adam
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.002)

torch.manual_seed(1)

for epoch in range(num_epochs):
    # One training pass, then a validation pass
    train_metrics = train(train_dl)
    valid_metrics = evaluate(valid_dl)
    acc_train, loss_train = train_metrics
    acc_valid, loss_valid = valid_metrics
    print(f'Epoch {epoch} accuracy: {acc_train:.4f} val_accuracy: {acc_valid:.4f}')

Epoch 0 accuracy: 0.6113 val_accuracy: 0.6826
Epoch 1 accuracy: 0.7007 val_accuracy: 0.6758
Epoch 2 accuracy: 0.8338 val_accuracy: 0.8362
Epoch 3 accuracy: 0.9065 val_accuracy: 0.8650
Epoch 4 accuracy: 0.9319 val_accuracy: 0.8620
Epoch 5 accuracy: 0.9615 val_accuracy: 0.8598
Epoch 6 accuracy: 0.9791 val_accuracy: 0.8772
Epoch 7 accuracy: 0.9873 val_accuracy: 0.8742
Epoch 8 accuracy: 0.9925 val_accuracy: 0.8736
Epoch 9 accuracy: 0.9938 val_accuracy: 0.8734
Epoch 10 accuracy: 0.9971 val_accuracy: 0.8692
Epoch 11 accuracy: 0.9951 val_accuracy: 0.8676
Epoch 12 accuracy: 0.9978 val_accuracy: 0.8710
Epoch 13 accuracy: 0.9925 val_accuracy: 0.8726
Epoch 14 accuracy: 0.9970 val_accuracy: 0.8666
Epoch 15 accuracy: 0.9998 val_accuracy: 0.8756
Epoch 16 accuracy: 1.0000 val_accuracy: 0.8772
Epoch 17 accuracy: 1.0000 val_accuracy: 0.8758
Epoch 18 accuracy: 1.0000 val_accuracy: 0.8780
Epoch 19 accuracy: 1.0000 val_accuracy: 0.8778


Testing It

In [None]:
# Reload the test split and materialise it as a list
test_dataset = IMDB(split='test')
test_examples = list(test_dataset)

# Unshuffled loader over the held-out test reviews
test_dl = DataLoader(
    test_examples,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_batch,
)

In [None]:
# Score the trained model on the held-out test set.
# The loss is bound to a real name instead of `_`: in a notebook, `_` is
# IPython's "last output" variable, and assigning to it overrides that value
# in the namespace.
acc_test, loss_test = evaluate(test_dl)
print(f'test_accuracy: {acc_test:.4f}')

test_accuracy: 0.8580
