# LSTM vs. GRU
## Junqing Zou (jz862)

## Preparing Data

In [1]:
import torch
from torchtext import data
from torchtext import datasets
import random

# Set the seed used for every random number generator in this notebook
SEED = 1234

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

# TEXT deals with the review, and LABEL deals with the sentiment
TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(tensor_type=torch.FloatTensor)

# Split the datasets into training set and test set
train, test = datasets.IMDB.splits(TEXT, LABEL)

# Further split the training set into training set and validation set.
# random.seed() returns None, so its result must not be passed as random_state:
# seed Python's RNG first, then hand the resulting state to split() explicitly.
random.seed(SEED)
train, valid = train.split(random_state=random.getstate())

In [2]:
# Use pre-trained word embeddings
# The vocabularies are built from the training split only; max_size caps the
# number of words kept for TEXT and "glove.6B.100d" selects the vectors to attach.
TEXT.build_vocab(train, max_size=25000, vectors="glove.6B.100d")
LABEL.build_vocab(train)

In [3]:
# Create the iterators
# One iterator per split, all sharing the same batch size.
BATCH_SIZE = 64

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, valid, test), 
    batch_size=BATCH_SIZE, 
    # examples are keyed by the number of tokens in their review text
    sort_key=lambda x: len(x.text), 
    # NOTE(review): presumably makes each iterator stop after one pass over its split - confirm against torchtext docs
    repeat=False)

## Build the Model

In [4]:
import torch.nn as nn

class RNN_LSTM(nn.Module):
    """
    Sentence classifier: embedding layer -> LSTM -> linear layer.

    The embedding layer transforms the word indices into dense embedding vectors.
    The LSTM takes the dense vectors and the previous hidden state to calculate the next hidden state.
    The linear layer takes the final hidden state and transforms it to the output dimension.
    Bidirectionality and additional layers are obtained by passing "bidirectional" and
    "num_layers" to nn.LSTM. Dropout is applied through an nn.Dropout layer (its argument is
    the probability of zeroing each element) to the embeddings and to the final hidden state.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_dim: size of the LSTM hidden state (per direction).
        output_dim: number of values produced per example.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM reads the sequence in both directions.
        dropout: dropout probability.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        # Remembered so that forward() knows how to assemble the final hidden state.
        self.bidirectional = bidirectional

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # nn.LSTM only applies dropout between stacked layers, so it is disabled
        # for a single layer (where PyTorch would otherwise emit a warning).
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional,
                           dropout=dropout if n_layers > 1 else 0)
        # The classifier input is hid dim * num directions, not always hid dim * 2.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """
        Compute the model output for a batch of index sequences.

        Args:
            x: LongTensor of word indices, [sent len, batch size].

        Returns:
            Tensor of shape [batch size, output dim] (raw scores, no sigmoid applied).
        """

        #x = [sent len, batch size]

        embedded = self.dropout(self.embedding(x))

        #embedded = [sent len, batch size, emb dim]

        output, (hidden, cell) = self.rnn(embedded)

        #output = [sent len, batch size, hid dim * num directions]
        #hidden = [num layers * num directions, batch size, hid. dim]
        #cell = [num layers * num directions, batch size, hid. dim]

        if self.bidirectional:
            # last layer: forward direction is hidden[-2], backward direction is hidden[-1]
            hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        else:
            # last layer only; hidden[-2] would belong to the layer below (or not exist)
            hidden = hidden[-1,:,:]

        hidden = self.dropout(hidden)

        #hidden = [batch size, hid. dim * num directions]

        # No squeeze here: squeezing dim 0 would drop the batch axis for a batch of one.
        return self.fc(hidden)

    
class RNN_GRU(nn.Module):
    """
    Sentence classifier: embedding layer -> GRU -> linear layer.

    Same architecture as the LSTM variant, except that the recurrent layer is an
    nn.GRU, which returns a hidden state only (there is no cell state).

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_dim: size of the GRU hidden state (per direction).
        output_dim: number of values produced per example.
        n_layers: number of stacked GRU layers.
        bidirectional: whether the GRU reads the sequence in both directions.
        dropout: dropout probability.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        # Remembered so that forward() knows how to assemble the final hidden state.
        self.bidirectional = bidirectional

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # nn.GRU only applies dropout between stacked layers, so it is disabled
        # for a single layer (where PyTorch would otherwise emit a warning).
        self.rnn = nn.GRU(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional,
                          dropout=dropout if n_layers > 1 else 0)
        # The classifier input is hid dim * num directions, not always hid dim * 2.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """
        Compute the model output for a batch of index sequences.

        Args:
            x: LongTensor of word indices, [sent len, batch size].

        Returns:
            Tensor of shape [batch size, output dim] (raw scores, no sigmoid applied).
        """

        #x = [sent len, batch size]

        embedded = self.dropout(self.embedding(x))

        #embedded = [sent len, batch size, emb dim]

        output, hidden = self.rnn(embedded)

        #output = [sent len, batch size, hid dim * num directions]
        #hidden = [num layers * num directions, batch size, hid. dim]

        if self.bidirectional:
            # last layer: forward direction is hidden[-2], backward direction is hidden[-1]
            hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        else:
            # last layer only; hidden[-2] would belong to the layer below (or not exist)
            hidden = hidden[-1,:,:]

        hidden = self.dropout(hidden)

        #hidden = [batch size, hid. dim * num directions]

        # No squeeze here: squeezing dim 0 would drop the batch axis for a batch of one.
        return self.fc(hidden)

In [5]:
# Create one instance each of the RNN_LSTM and RNN_GRU classes, using the same hyperparameters for both
INPUT_DIM = len(TEXT.vocab)  # one embedding row per vocabulary entry
EMBEDDING_DIM = 100  # matches the 100-dimensional "glove.6B.100d" vectors
HIDDEN_DIM = 256
OUTPUT_DIM = 1  # a single score per review
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

model_LSTM = RNN_LSTM(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)
model_GRU = RNN_GRU(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)

In [6]:
# Fetch the pre-trained vectors attached to the vocabulary and check their shape
pretrained_embeddings = TEXT.vocab.vectors

print(pretrained_embeddings.shape)

torch.Size([25002, 100])


In [7]:
# Overwrite the LSTM model's initial embedding weights, in place, with the pre-trained vectors
model_LSTM.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.1123,  0.3113,  0.3317,  ..., -0.4576,  0.6191,  0.5304],
        [ 0.0306, -0.0086,  0.1552,  ..., -0.9847,  0.4392,  0.3018],
        [ 0.3614,  0.1344,  0.0411,  ..., -0.1543, -1.0218, -0.5138]])

In [8]:
# Overwrite the GRU model's initial embedding weights, in place, with the pre-trained vectors
model_GRU.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.1123,  0.3113,  0.3317,  ..., -0.4576,  0.6191,  0.5304],
        [ 0.0306, -0.0086,  0.1552,  ..., -0.9847,  0.4392,  0.3018],
        [ 0.3614,  0.1344,  0.0411,  ..., -0.1543, -1.0218, -0.5138]])

## Train the Model

In [9]:
# First, we create the optimizers. An optimizer is the algorithm used to update the parameters of a module. Here, we use the Adam algorithm.
import torch.optim as optim

# One optimizer per model, each with Adam's default settings
optimizer_LSTM = optim.Adam(model_LSTM.parameters())
optimizer_GRU = optim.Adam(model_GRU.parameters())

In [10]:
# We define the loss function, which is "binary cross entropy with logits"
# (it takes the models' raw scores; no sigmoid is applied before the loss).
criterion = nn.BCEWithLogitsLoss()

# If PyTorch detects a GPU, we can place the models and the criterion on the GPU; otherwise everything stays on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_LSTM = model_LSTM.to(device)
model_GRU = model_GRU.to(device)
criterion = criterion.to(device)

In [11]:
import torch.nn.functional as F

def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8

    Args:
        preds: tensor of raw model scores (logits), one per example.
        y: tensor of 0/1 targets with the same shape as preds.

    Returns:
        0-dim float tensor holding the fraction of correct predictions.
    """

    #squash the scores into (0, 1) and round to the closest integer (0 or 1);
    #torch.sigmoid replaces the deprecated F.sigmoid
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for averaging
    #mean() divides by the total number of elements, whatever the tensor's shape
    return correct.mean()

In [12]:
def train(model, iterator, optimizer, criterion):
    """
    Run one training epoch and return (mean loss, mean accuracy) over its batches.

    The model is switched to training mode with `model.train()`. For every batch
    the gradients are zeroed, `batch.text` is fed through the model, the loss and
    accuracy are computed against `batch.label`, the gradients are obtained with
    `loss.backward()` and the parameters are updated with `optimizer.step()`.

    Both returned values are sums of the per-batch values divided by the number
    of batches, `len(iterator)`.
    """
    model.train()

    running_loss, running_acc = 0, 0

    for batch in iterator:
        optimizer.zero_grad()

        # [batch size, 1] -> [batch size], to line up with the labels
        logits = model(batch.text).squeeze(1)
        targets = batch.label

        batch_loss = criterion(logits, targets)
        batch_acc = binary_accuracy(logits, targets)

        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [13]:
def evaluate(model, iterator, criterion):
    """
    Compute (mean loss, mean accuracy) over the batches of `iterator` without training.

    Similar to 'train', except that the model is put in evaluation mode with
    `model.eval()`, everything runs under `torch.no_grad()`, and no parameters
    are updated.
    """
    model.eval()

    running_loss, running_acc = 0, 0

    with torch.no_grad():
        for batch in iterator:
            # [batch size, 1] -> [batch size], to line up with the labels
            logits = model(batch.text).squeeze(1)
            targets = batch.label

            running_loss += criterion(logits, targets).item()
            running_acc += binary_accuracy(logits, targets).item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

## Run the LSTM Model

In [14]:
# Do 5 epochs and output the training and validation loss, accuracy
N_EPOCHS = 5

for epoch in range(N_EPOCHS):

    # one pass over the training split (updates the LSTM), then a pass over the validation split
    train_loss, train_acc = train(model_LSTM, train_iterator, optimizer_LSTM, criterion)
    valid_loss, valid_acc = evaluate(model_LSTM, valid_iterator, criterion)
    
    # epochs are reported 1-based; accuracies are printed as percentages
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc*100:.2f}%, Val. Loss: {valid_loss:.3f}, Val. Acc: {valid_acc*100:.2f}%')

  return Variable(arr, volatile=not train)


Epoch: 01, Train Loss: 0.668, Train Acc: 59.04%, Val. Loss: 0.595, Val. Acc: 69.58%
Epoch: 02, Train Loss: 0.691, Train Acc: 52.52%, Val. Loss: 0.692, Val. Acc: 51.67%
Epoch: 03, Train Loss: 0.675, Train Acc: 57.51%, Val. Loss: 0.634, Val. Acc: 64.29%
Epoch: 04, Train Loss: 0.529, Train Acc: 73.84%, Val. Loss: 0.412, Val. Acc: 82.14%
Epoch: 05, Train Loss: 0.300, Train Acc: 87.92%, Val. Loss: 0.314, Val. Acc: 87.35%


In [15]:
# Evaluate the trained LSTM model on the held-out test split
test_loss, test_acc = evaluate(model_LSTM, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc*100:.2f}%')

  return Variable(arr, volatile=not train)


Test Loss: 0.350, Test Acc: 85.47%


## Run the GRU Model

In [16]:
# Do 5 epochs and output the training and validation loss, accuracy
N_EPOCHS = 5

for epoch in range(N_EPOCHS):

    # one pass over the training split (updates the GRU), then a pass over the validation split
    train_loss, train_acc = train(model_GRU, train_iterator, optimizer_GRU, criterion)
    valid_loss, valid_acc = evaluate(model_GRU, valid_iterator, criterion)
    # release cached GPU memory held by PyTorch's allocator after each epoch
    torch.cuda.empty_cache()
    
    # epochs are reported 1-based; accuracies are printed as percentages
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc*100:.2f}%, Val. Loss: {valid_loss:.3f}, Val. Acc: {valid_acc*100:.2f}%')

  return Variable(arr, volatile=not train)


Epoch: 01, Train Loss: 0.688, Train Acc: 55.38%, Val. Loss: 0.607, Val. Acc: 67.12%
Epoch: 02, Train Loss: 0.402, Train Acc: 81.85%, Val. Loss: 0.316, Val. Acc: 86.28%
Epoch: 03, Train Loss: 0.231, Train Acc: 90.89%, Val. Loss: 0.248, Val. Acc: 90.06%
Epoch: 04, Train Loss: 0.164, Train Acc: 93.99%, Val. Loss: 0.264, Val. Acc: 89.87%
Epoch: 05, Train Loss: 0.115, Train Acc: 95.94%, Val. Loss: 0.283, Val. Acc: 90.17%


In [17]:
# Evaluate the trained GRU model on the held-out test split
test_loss, test_acc = evaluate(model_GRU, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc*100:.2f}%')

  return Variable(arr, volatile=not train)


Test Loss: 0.336, Test Acc: 87.35%


## Conclusion

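In this run, GRU is slightly more accurate than LSTM: after 5 epochs it reaches 87.35% test accuracy versus 85.47% for the LSTM. The GRU also converges faster, exceeding 86% validation accuracy by epoch 2, while the LSTM only reaches 87.35% validation accuracy at epoch 5.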

## User Input

In [18]:
import spacy
# Load spaCy's English pipeline; only its tokenizer is used by the prediction helpers
nlp = spacy.load('en')

def predict_sentiment_LSTM(sentence):
    """
    Score a raw sentence with the LSTM model.

    The sentence is tokenized with spaCy, mapped to vocabulary indices, and fed
    through the model as a batch of one; the sigmoid of the model output is returned.

    Args:
        sentence: the text to score.

    Returns:
        A Python float between 0 and 1. In this notebook's recorded outputs,
        values near 0 go with negative reviews and values near 1 with positive ones.

    Raises:
        ValueError: if the sentence yields no tokens.
    """
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if not tokenized:
        # an empty sequence cannot be run through the recurrent layer
        raise ValueError("sentence must contain at least one token")
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    # [sent len] -> [sent len, 1]: add the batch dimension the model expects
    tensor = torch.LongTensor(indexed).to(device).unsqueeze(1)
    # Inference only: switch dropout off and skip gradient tracking.
    model_LSTM.eval()
    with torch.no_grad():
        # torch.sigmoid replaces the deprecated F.sigmoid
        prediction_LSTM = torch.sigmoid(model_LSTM(tensor))
    return prediction_LSTM.item()

def predict_sentiment_GRU(sentence):
    """
    Score a raw sentence with the GRU model.

    The sentence is tokenized with spaCy, mapped to vocabulary indices, and fed
    through the model as a batch of one; the sigmoid of the model output is returned.

    Args:
        sentence: the text to score.

    Returns:
        A Python float between 0 and 1. In this notebook's recorded outputs,
        values near 0 go with negative reviews and values near 1 with positive ones.

    Raises:
        ValueError: if the sentence yields no tokens.
    """
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if not tokenized:
        # an empty sequence cannot be run through the recurrent layer
        raise ValueError("sentence must contain at least one token")
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    # [sent len] -> [sent len, 1]: add the batch dimension the model expects
    tensor = torch.LongTensor(indexed).to(device).unsqueeze(1)
    # Inference only: switch dropout off and skip gradient tracking.
    model_GRU.eval()
    with torch.no_grad():
        # torch.sigmoid replaces the deprecated F.sigmoid
        prediction_GRU = torch.sigmoid(model_GRU(tensor))
    return prediction_GRU.item()

In [19]:
# LSTM score for a clearly negative sentence (the recorded output is close to 0)
predict_sentiment_LSTM("This film is terrible")



0.03824625164270401

In [20]:
# GRU score for the same negative sentence (the recorded output is close to 0)
predict_sentiment_GRU("This film is terrible")



0.09794376790523529

In [21]:
# LSTM score for a clearly positive sentence (the recorded output is close to 1)
predict_sentiment_LSTM("This film is great")



0.9506759643554688

In [22]:
# GRU score for the same positive sentence (the recorded output is close to 1)
predict_sentiment_GRU("This film is great")



0.9664276242256165