## LSTM for IMDB Sentiment Analysis

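In this notebook we will train a bidirectional LSTM in PyTorch to classify IMDB movie reviews as positive or negative. We load the data with `torchtext`, build a vocabulary, define the model, train it, and evaluate it on the test set.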

In [1]:
import torch
from torchtext import data
from torchtext import datasets
import random
import torch.nn as nn
import torch.optim as optim

## Download and Read Data

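You should install `spacy` using `pip install spacy`. The `spacy` tokenizer also needs an English language model, which you can download with e.g. `python -m spacy download en` (the exact model name depends on your spaCy version).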

In [2]:
# Field describing the review text; tokenize = 'spacy' selects spaCy tokenization.
TEXT = data.Field(tokenize = 'spacy')
# Field describing the sentiment label, stored as float because the loss used
# later in this notebook (nn.BCEWithLogitsLoss) takes floating-point targets.
LABEL = data.LabelField(dtype = torch.float)

In [3]:
# Load the IMDB train and test splits, using TEXT for the reviews and LABEL for the sentiment.
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

In [4]:
# Sanity check: dump the attributes of the first training example (its tokenized text appears below).
print(vars(train_data.examples[0]))

{'text': ['Saw', 'this', 'on', 'SBS', 'TV', 'here', 'in', 'Australia', 'the', 'other', 'week', ',', 'where', 'it', 'was', 'titled', '"', 'Laputa', ':', 'Castle', 'in', 'the', 'sky', '"', '.', 'I', 'had', 'enabled', 'subtitles', 'and', 'I', 'think', 'SBS', 'provided', 'their', 'own', 'for', 'that', ',', 'which', ',', 'as', 'usual', ',', 'was', 'of', 'very', 'good', 'quality.<br', '/><br', '/>Just', 'looked', 'up', '"', 'Laputa', '"', 'on', 'Wikipedia', 'and', 'it', 'confirms', 'what', 'I', 'suspected', '...', 'the', 'floating', 'island', 'of', 'this', 'tale', 'is', 'taken', 'from', 'the', 'classic', 'Jonathan', 'Swift', 'novel', '"', 'Gulliver', "'s", 'travels', '"', ',', 'which', 'was', 'published', 'in', 'the', 'early', 'to', 'mid', '1700s.<br', '/><br', '/>Anyway', ',', 'this', 'is', 'an', 'engaging', 'Japanese', 'fairytale', ',', 'which', 'features', 'an', 'English', 'speaking', 'voice', '-', 'cast', '.', 'It', "'s", 'suitable', 'for', 'young', 'children', ',', 'I', 'think', ',', 'b

In [5]:
SEED = 1230245

# Seed every RNG this notebook relies on so a fresh "Run All" is repeatable.
random.seed(SEED)
torch.manual_seed(SEED)

# random.seed() returns None, so it must not be passed as `random_state` directly.
# split() expects the value of random.getstate(); hand it the freshly seeded state.
# NOTE: this cell rebinds `train_data` to the smaller training portion, so it is
# not idempotent -- re-run the data-loading cell before re-running this one.
train_data, valid_data = train_data.split(random_state = random.getstate())


## Build Vocabulary

In [6]:
# Upper bound on the number of distinct tokens kept in TEXT's vocabulary
# (presumably not counting the special <unk>/<pad> entries -- confirm against the torchtext docs).
MAX_VOCAB_SIZE = 5000

# Both vocabularies are built from the training portion only.
TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

## Explore Features

In [7]:
# The 20 most frequent tokens counted while building the vocabulary, with their counts.
print(TEXT.vocab.freqs.most_common(20))

[('the', 203504), (',', 193259), ('.', 165794), ('and', 109870), ('a', 109754), ('of', 101079), ('to', 93898), ('is', 76519), ('in', 61391), ('I', 54429), ('it', 53535), ('that', 49318), ('"', 44684), ("'s", 43642), ('this', 42666), ('-', 36803), ('/><br', 35496), ('was', 35217), ('as', 30564), ('with', 29981)]


In [8]:
# First 10 entries of the index-to-string table (index i -> token).
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is']


In [9]:
# String-to-index mapping used to encode the sentiment labels.
print(LABEL.vocab.stoi)

defaultdict(None, {'neg': 0, 'pos': 1})


## Training

In [10]:
BATCH_SIZE = 16

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# One iterator per split, all sharing the same batch size and target device.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
)

### Model Definition 

In [11]:
class RNN(nn.Module):
    """Embedding -> (multi-layer, optionally bidirectional) LSTM -> linear classifier.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden state (per direction).
        output_dim: number of values produced per example.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM also reads the sequence backwards.
        dropout: dropout probability applied to the embeddings, between LSTM
            layers (only when n_layers > 1) and to the final hidden state.
        pad_idx: index of the padding token, passed to nn.Embedding as padding_idx.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout, pad_idx):

        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)

        # nn.LSTM only applies dropout *between* layers; with a single layer it
        # has no effect and merely emits a warning, so pass 0 in that case.
        self.rnn = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional,
                           dropout=dropout if n_layers > 1 else 0.0)

        # The classifier sees one final hidden state per direction.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths=None):
        """Compute one row of outputs per sequence in the batch.

        Args:
            text: integer tensor of token indices shaped (seq_len, batch); the
                LSTM is built with its default batch_first=False.
            text_lengths: optional true (unpadded) length of every sequence in
                the batch. When given, the batch is packed so the LSTM stops at
                each sequence's last real token instead of running over padding.
                When omitted, the padded batch is fed to the LSTM as-is.

        Returns:
            Tensor of shape (batch, output_dim).
        """

        embedded = self.dropout(self.embedding(text))

        if text_lengths is not None:
            # pack_padded_sequence wants the lengths on the CPU; enforce_sorted=False
            # lets callers pass batches that are not sorted by length.
            lengths = torch.as_tensor(text_lengths).cpu()
            embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths,
                                                         enforce_sorted=False)

        _, (hidden, _) = self.rnn(embedded)

        if self.rnn.bidirectional:
            # Last layer: hidden[-2] is the forward direction, hidden[-1] the backward one.
            final_hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        else:
            # Unidirectional: only the top layer's final state exists.
            final_hidden = hidden[-1,:,:]

        return self.fc(self.dropout(final_hidden))

In [36]:
# Model hyperparameters.
INPUT_DIM = len(TEXT.vocab)                  # one embedding row per vocabulary entry
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1                               # a single logit per review
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]    # index of the padding token

model = RNN(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)

## Training the model

In [37]:
# Move the model and the loss to the target device *before* building the
# optimizer, as the torch.optim documentation recommends, so the optimizer is
# constructed over the parameters in their final location.
model = model.to(device)
criterion = nn.BCEWithLogitsLoss().to(device)

# Adam with its default hyperparameters.
optimizer = optim.Adam(model.parameters())

In [38]:
def binary_accuracy(preds, y):
    """
    Fraction of correct predictions in a batch (e.g. 0.8 for 8 out of 10), as a tensor.

    `preds` holds raw logits; they are passed through a sigmoid and rounded to
    0/1 before being compared element-wise with the targets `y`.
    """

    predicted_labels = torch.sigmoid(preds).round()
    # Float so the matches can be summed and divided.
    matches = predicted_labels.eq(y).float()
    return matches.sum() / len(matches)

In [39]:
def train(model, iterator, optimizer, criterion):
    """
    Run one optimisation pass over `iterator`.

    `iterator` must yield (text, cls) pairs and support len(). For every batch
    the model output is squeezed on dim 1, scored with `criterion`, and one
    optimizer step is taken.

    Returns (mean per-batch loss, mean per-batch accuracy).
    """

    model.train()

    total_loss, total_acc = 0, 0

    for text, cls in iterator:

        optimizer.zero_grad()

        logits = model(text).squeeze(1)
        batch_loss = criterion(logits, cls)
        batch_acc = binary_accuracy(logits, cls)

        batch_loss.backward()
        optimizer.step()

        total_loss += batch_loss.item()
        total_acc += batch_acc.item()

    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches

In [40]:
def evaluate(model, iterator, criterion):
    """
    Score `model` on every batch of `iterator` without updating its weights.

    `iterator` must yield (text, cls) pairs and support len(). The model is put
    in eval mode and gradients are disabled for the whole pass.

    Returns (mean per-batch loss, mean per-batch accuracy).
    """

    model.eval()

    running_loss = 0
    running_acc = 0

    # No backward pass happens here, so skip gradient tracking.
    with torch.no_grad():
        for text, cls in iterator:
            logits = model(text).squeeze(1)
            running_loss += criterion(logits, cls).item()
            running_acc += binary_accuracy(logits, cls).item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [None]:
N_EPOCHS = 10

# Lowest validation loss seen so far and the (0-based) epoch that produced it.
best_valid_loss = float('inf')
best_epoch = None

for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    # One checkpoint per epoch: 'model.pt0', 'model.pt1', ...
    torch.save(model.state_dict(), 'model.pt' + str(epoch))

    # Remember which epoch generalised best so its checkpoint can be picked later.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        best_epoch = epoch

    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

# best_epoch stays None if no epoch ran or every validation loss was NaN.
if best_epoch is not None:
    print(f'Best Val. Loss: {best_valid_loss:.3f} at epoch {best_epoch+1:02} (checkpoint: model.pt{best_epoch})')

Epoch: 01
	Train Loss: 0.685 | Train Acc: 54.72%
	 Val. Loss: 0.690 |  Val. Acc: 52.25%


In [45]:
# Restore the checkpoint written after the final training epoch. The name is
# derived from N_EPOCHS so it keeps matching the training loop if the epoch
# count changes. Note this is the *last* epoch's weights, not necessarily the
# ones with the lowest validation loss.
final_checkpoint = 'model.pt' + str(N_EPOCHS - 1)

# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
model.load_state_dict(torch.load(final_checkpoint, map_location = device))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.345 | Test Acc: 85.71%
