# Sentiment classification with LSTM
In this notebook we will use LSTMs to do sentiment classification on the [imdb dataset](http://ai.stanford.edu/~amaas/data/sentiment/). 

In [1]:
import numpy as np 
import pandas as pd 
import os
import spacy
import string
import re
import numpy as np
from spacy.symbols import ORTH
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 

## Dataset

To get the data: <br>
`wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz`

In [2]:
def unpack_dataset():
    ! mkdir -p data/aclImdb
    ! wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
    ! tar -zxvf aclImdb_v1.tar.gz -C data

In [3]:
#unpack_dataset()

In [4]:
from pathlib import Path
PATH = Path("data/aclImdb/")
list(PATH.iterdir())

[PosixPath('data/aclImdb/imdbEr.txt'),
 PosixPath('data/aclImdb/imdb.vocab'),
 PosixPath('data/aclImdb/train'),
 PosixPath('data/aclImdb/test'),
 PosixPath('data/aclImdb/README')]

In [5]:
path = PATH/"train/pos/0_9.txt"
path.read_text()

'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

## Tokenization

In [6]:
# first time run this
#!python3 -m spacy download en

In [7]:
re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)
def sub_br(x): return re_br.sub("\n", x)

my_tok = spacy.load('en')
def spacy_tok(x): return [tok.text for tok in my_tok.tokenizer(sub_br(x))]

In [8]:
path = PATH/"train/pos/0_9.txt"
spacy_tok(path.read_text())[:10]

['Bromwell', 'High', 'is', 'a', 'cartoon', 'comedy', '.', 'It', 'ran', 'at']

### Computing vocab2index

In [9]:
pos_files = list((PATH/"train"/"pos").iterdir())
neg_files = list((PATH/"train"/"neg").iterdir())
all_files = pos_files + neg_files
all_files[:5]

[PosixPath('data/aclImdb/train/pos/10544_8.txt'),
 PosixPath('data/aclImdb/train/pos/9530_9.txt'),
 PosixPath('data/aclImdb/train/pos/9901_8.txt'),
 PosixPath('data/aclImdb/train/pos/11951_8.txt'),
 PosixPath('data/aclImdb/train/pos/7441_7.txt')]

In [10]:
counts = Counter()
for path in all_files:
    counts.update(spacy_tok(path.read_text()))

In [11]:
#counts

In [12]:
len(counts.keys())

103688

In [13]:
for word in list(counts):
    if counts[word] < 5:
        del counts[word]

In [14]:
len(counts.keys())

33914

In [15]:
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)

In [16]:
#vocab2index

## Dataset

In [17]:
# note that spacy_tok takes a while run it just once
def encode_sentence(path, vocab2index, N=400, padding_start=True):
    x = spacy_tok(path.read_text())
    enc = np.zeros(N, dtype=np.int32)
    enc1 = np.array([vocab2index.get(w, vocab2index["UNK"]) for w in x])
    l = min(N, len(enc1))
    if padding_start:
        enc[:l] = enc1[:l]
    else:
        enc[N-l:] = enc1[:l]
    return enc, l

In [18]:
path = PATH/"train/neg/211_4.txt"
encode_sentence(path, vocab2index, N=400, padding_start=False)

(array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            1,  1782,     4,  2723,     4,    29,    36,    37,    15,
         2388,   320,     6,    23,   351,     8,   316,   254,   680,
         1382,   103,   334,  7723,   103, 12825,     3,   369,   951,
            3, 16948,    19,  1791,    55,  1573,    36,     1,   153,
      

In [19]:
class ImdbDataset(Dataset):
    def __init__(self, PATH, train="train", N=400, padding_start=True):
        self.path_to_images = PATH/train
        self.pos_files = list((self.path_to_images/"pos").iterdir())
        self.neg_files = list((self.path_to_images/"neg").iterdir())
        self.files = self.pos_files + self.neg_files
        # pos 1, neg 0
        self.y = np.concatenate((np.ones(len(self.pos_files), dtype=int),
                                np.zeros(len(self.neg_files), dtype=int)), axis=0)
        # it is important to run encode_sentence in the init
        self.X = [encode_sentence(path, vocab2index, N, padding_start) for path in self.files]
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        x, s = self.X[idx]
        return x, s, self.y[idx]

In [20]:
train_ds_v0 = ImdbDataset(PATH, padding_start=False)
valid_ds_v0 = ImdbDataset(PATH, "test", padding_start=False)

In [21]:
batch_size = 1000
train_dl_v0 = DataLoader(train_ds_v0, batch_size=batch_size, shuffle=True)
valid_dl_v0 = DataLoader(valid_ds_v0, batch_size=batch_size)

In [22]:
train_ds_v0[1]

(array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0

## Understanding LSTMs 

In [23]:
# Input dim is the dimension of the embedding for each word (2 in the example)
# Output dim is the dimension of the hidden layer (4 in this example)
# batch_first – If True, then the input and output tensors are provided as (batch, seq, feature). 
lstm = nn.LSTM(2, 4, batch_first=True)  

In [24]:
inputs = [torch.randn(1, 2) for _ in range(5)] # make a sequence of length 5
inputs = torch.cat(inputs).view(1, len(inputs), -1)
inputs

tensor([[[ 1.0611,  0.9548],
         [-0.3836,  1.2201],
         [-1.4101, -1.9007],
         [ 1.9819, -1.6735],
         [-1.1701,  1.1169]]])

In [25]:
# RNNs with batch_first=True assume this input shape
# input shape should be bash_size x seq_len x embedding dimension
inputs.shape

torch.Size([1, 5, 2])

In [26]:
out, (hidden, cell) = lstm(inputs)

In [27]:
print(out.shape)
out

torch.Size([1, 5, 4])


tensor([[[-0.0772,  0.1723,  0.0313, -0.1882],
         [-0.1118,  0.2238, -0.0149, -0.1851],
         [ 0.0273, -0.1614, -0.3221,  0.0415],
         [ 0.2766, -0.5186, -0.0278,  0.1104],
         [ 0.0777,  0.0040, -0.1215,  0.0532]]], grad_fn=<TransposeBackward0>)

In [28]:
hidden

tensor([[[ 0.0777,  0.0040, -0.1215,  0.0532]]], grad_fn=<StackBackward>)

## LSTM V0 model

In [29]:
class LSTMV0Model(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super(LSTMV0Model,self).__init__()
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(0.5)
        
    def forward(self, x):
        x = self.embeddings(x)
        x = self.dropout(x)
        out_pack, (ht, ct) = self.lstm(x)
        return self.linear(ht[-1])

In [35]:
def train_epocs_v0(model, epochs=10, lr=0.001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, s, y in train_dl:
            # s is not used in this model
            x = x.long().cuda()
            y = y.float().cuda()
            y_pred = model(x)
            optimizer.zero_grad()
            loss = F.binary_cross_entropy_with_logits(y_pred, y.unsqueeze(1))
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc = val_metrics_v0(model, val_dl)
        if i % 5 == 1:
            print("train loss %.3f val loss %.3f and val accuracy %.3f" % (sum_loss/total, val_loss, val_acc))

In [36]:
def val_metrics_v0(model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    for x, s, y in valid_dl:
        # s is not used here
        x = x.long().cuda()
        y = y.float().unsqueeze(1).cuda()
        y_hat = model(x)
        loss = F.binary_cross_entropy_with_logits(y_hat, y)
        y_pred = y_hat > 0
        correct += (y_pred.float() == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
    return sum_loss/total, correct/total

In [37]:
batch_size = 5000
train_dl = DataLoader(train_ds_v0, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(valid_ds_v0, batch_size=batch_size)

In [38]:
vocab_size = len(words)
print(vocab_size)
model_v0 = LSTMV0Model(vocab_size, 50, 50).cuda()

33916


In [39]:
train_epocs_v0(model_v0, epochs=30, lr=0.01)

train loss 0.683 val loss 0.675 and val accuracy 0.570
train loss 0.594 val loss 0.863 and val accuracy 0.584
train loss 0.485 val loss 0.856 and val accuracy 0.664
train loss 0.359 val loss 1.108 and val accuracy 0.631
train loss 0.270 val loss 1.236 and val accuracy 0.649
train loss 0.236 val loss 0.763 and val accuracy 0.761


In [40]:
train_epocs_v0(model_v0, epochs=30, lr=0.001)

train loss 0.201 val loss 0.840 and val accuracy 0.766
train loss 0.183 val loss 0.878 and val accuracy 0.757
train loss 0.179 val loss 0.924 and val accuracy 0.753
train loss 0.171 val loss 0.992 and val accuracy 0.740
train loss 0.163 val loss 0.965 and val accuracy 0.753
train loss 0.161 val loss 1.058 and val accuracy 0.744


## LSTM model with variable length

In [41]:
# dataset with padding at the end
train_ds = ImdbDataset(PATH)
valid_ds = ImdbDataset(PATH, "test")

### Debugging our model

In [42]:
batch_size = 7
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

x,s,y = next(iter(train_dl)) # here s is the length of the sentences

In [43]:
x.shape, s.shape

(torch.Size([7, 400]), torch.Size([7]))

In [44]:
s

tensor([162, 342, 168,  59, 166, 400, 181])

In [45]:
y

tensor([0, 1, 1, 1, 1, 0, 0])

In [46]:
# sort by length so we can use pack_padded_sequence
s, index = s.sort(0, descending=True)
x = x[index]

In [47]:
s

tensor([400, 342, 181, 168, 166, 162,  59])

In [48]:
index

tensor([5, 1, 6, 2, 4, 0, 3])

In [49]:
y[index]

tensor([0, 1, 0, 1, 1, 0, 1])

In [50]:
vocab_size = len(words)
embedding_dim = 10
embed = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

In [51]:
x = embed(x.long())
x.shape

torch.Size([7, 400, 10])

In [52]:
hidden_dim = 9
lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)

In [53]:
# RNN will not perform calculation on pad elements if pack_padded_sequence is used
x_pack = pack_padded_sequence(x, list(s), batch_first=True)

In [54]:
out_pack, (ht, ct) = lstm(x_pack)

In [55]:
## final hidden layer
ht.shape

torch.Size([1, 7, 9])

In [56]:
ht[-1].shape

torch.Size([7, 9])

In [57]:
linear = nn.Linear(hidden_dim, 1)
y_hat = linear(ht[-1])
y_hat

tensor([[-0.0057],
        [ 0.0407],
        [ 0.1412],
        [ 0.1036],
        [ 0.0486],
        [ 0.0872],
        [ 0.1097]], grad_fn=<AddmmBackward>)

In [58]:
index.unsqueeze(1).shape

torch.Size([7, 1])

In [59]:
# takes back to the original ordering
h = torch.zeros_like(y_hat).scatter_(0, index.unsqueeze(1), y_hat)

In [60]:
h

tensor([[ 0.0872],
        [ 0.0407],
        [ 0.1036],
        [ 0.1097],
        [ 0.0486],
        [-0.0057],
        [ 0.1412]], grad_fn=<ScatterBackward0>)

In [61]:
y_hat

tensor([[-0.0057],
        [ 0.0407],
        [ 0.1412],
        [ 0.1036],
        [ 0.0486],
        [ 0.0872],
        [ 0.1097]], grad_fn=<AddmmBackward>)

In [62]:
index

tensor([5, 1, 6, 2, 4, 0, 3])

### Model v1
Running this model on the GPU.

In [63]:
class LSTMModel(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super(LSTMModel,self).__init__()
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(0.5)
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)
        
    def forward(self, x, s):
        # sorting
        s, sort_index = torch.sort(s, 0,descending=True)
        s = s.numpy().tolist()
        x = x[sort_index]
        x = self.embeddings(x)
        x = self.dropout(x)
        x_pack = pack_padded_sequence(x, s, batch_first=True)
        out_pack, (ht, ct) = self.lstm(x_pack)
        out = self.linear(ht[-1])
        return torch.zeros_like(out).scatter_(0, sort_index.unsqueeze(1).cuda(), out) 

In [64]:
def train_epocs(model, epochs=10, lr=0.001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, s, y in train_dl:
            x = x.long().cuda()
            y = y.float().cuda()
            y_pred = model(x, s)
            optimizer.zero_grad()
            loss = F.binary_cross_entropy_with_logits(y_pred, y.unsqueeze(1))
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc = val_metrics(model, valid_dl)
        if i % 5 == 1:
            print("train loss %.3f val loss %.3f and val accuracy %.3f" % (sum_loss/total, val_loss, val_acc))

In [65]:
def val_metrics(model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    for x, s, y in valid_dl:
        x = x.long().cuda()
        y = y.float().unsqueeze(1).cuda()
        y_hat = model(x, s)
        loss = F.binary_cross_entropy_with_logits(y_hat, y)
        y_pred = y_hat > 0
        correct += (y_pred.float() == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
    return sum_loss/total, correct/total

In [66]:
batch_size = 2000
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

In [69]:
vocab_size = len(words)
print(vocab_size)
model = LSTMModel(vocab_size, 50, 50).cuda()

33916


In [70]:
train_epocs(model, epochs=30, lr=0.01)

train loss 0.667 val loss 0.623 and val accuracy 0.661
train loss 0.588 val loss 0.682 and val accuracy 0.632
train loss 0.328 val loss 0.381 and val accuracy 0.840
train loss 0.216 val loss 0.384 and val accuracy 0.854
train loss 0.144 val loss 0.373 and val accuracy 0.869
train loss 0.102 val loss 0.432 and val accuracy 0.869


In [71]:
train_epocs(model, epochs=30, lr=0.001)

train loss 0.080 val loss 0.456 and val accuracy 0.868
train loss 0.073 val loss 0.486 and val accuracy 0.867
train loss 0.070 val loss 0.478 and val accuracy 0.868
train loss 0.065 val loss 0.485 and val accuracy 0.868
train loss 0.061 val loss 0.504 and val accuracy 0.869
train loss 0.060 val loss 0.507 and val accuracy 0.869


In [72]:
train_epocs(model, epochs=30, lr=0.001)

train loss 0.058 val loss 0.520 and val accuracy 0.867
train loss 0.057 val loss 0.528 and val accuracy 0.867
train loss 0.052 val loss 0.536 and val accuracy 0.868
train loss 0.050 val loss 0.538 and val accuracy 0.866
train loss 0.049 val loss 0.549 and val accuracy 0.867
train loss 0.048 val loss 0.557 and val accuracy 0.866


In [73]:
def save_model(m, p): torch.save(m.state_dict(), p)
    
def load_model(m, p): m.load_state_dict(torch.load(p))

In [76]:
! mkdir $PATH/"models"

In [77]:
p = PATH/"models/model-86.pth"
save_model(model, p)

In [78]:
val_metrics(model, valid_dl)

(0.5580890059471131, tensor(0.8661, device='cuda:0'))

In [79]:
load_model(model, p)

## GRU model with dropout

In [80]:
class GRUModel(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(GRUModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.dropout = nn.Dropout(0.5)
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)
        
    def forward(self, x, s):
        s, sort_index = torch.sort(s, 0,descending=True)
        s = s.numpy().tolist()
        x = x[sort_index]
        x = self.embeddings(x)
        x = self.dropout(x)
        x_pack = pack_padded_sequence(x, list(s), batch_first=True)
        out_pack, ht= self.gru(x_pack)
        out = self.linear(ht[-1])
        return torch.zeros_like(out).scatter_(0, sort_index.unsqueeze(1).cuda(), out)

In [81]:
vocab_size = len(words)
print(vocab_size)
model2 = GRUModel(vocab_size, 50, 50).cuda()

33916


In [82]:
train_epocs(model2, epochs=30, lr=0.01)

train loss 0.671 val loss 0.742 and val accuracy 0.601
train loss 0.447 val loss 0.440 and val accuracy 0.800
train loss 0.216 val loss 0.400 and val accuracy 0.865
train loss 0.129 val loss 0.460 and val accuracy 0.869
train loss 0.086 val loss 0.524 and val accuracy 0.872
train loss 0.052 val loss 0.638 and val accuracy 0.865


In [83]:
p = PATH/"models/model-gru-87.pth"
save_model(model2, p)

## Bidirectional and multiple layers GRUs / LSTMs

In [84]:
batch_size = 7
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

x,s,y = next(iter(train_dl)) # here s is the length of the sentences

In [85]:
vocab_size = len(words)
embedding_dim = 10
hidden_dim = 9
embed = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
lstm1 = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
lstm2 = nn.LSTM(embedding_dim, hidden_dim, num_layers=2, batch_first=True, dropout=0.1, bidirectional=True)

In [86]:
s, index = s.sort(0, descending=True)
x = x[index]
x = embed(x.long())
x_pack = pack_padded_sequence(x, list(s), batch_first=True)

In [87]:
lstm_out, (ht, ct) = lstm1(x)

In [88]:
ht.shape

torch.Size([2, 7, 9])

In [89]:
ht[-2,:,:].shape

torch.Size([7, 9])

In [90]:
lstm_out, (ht2, ct2) = lstm2(x)

In [91]:
ht2.shape

torch.Size([4, 7, 9])

In [92]:
ht2[-2,:,:].shape, ht2[-1,:,:].shape

(torch.Size([7, 9]), torch.Size([7, 9]))

In [93]:
#concat the final forward (ht[-2,:,:]) and backward (ht[-1,:,:]) hidden layers      
h = torch.cat((ht2[-2,:,:], ht2[-1,:,:]), dim = 1)
h.shape

torch.Size([7, 18])

In [311]:
# This is slow

In [94]:
class LSTMBiModel(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super(LSTMBiModel,self).__init__()
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=2, batch_first=True,
                            dropout=0.3, bidirectional=True)
        self.linear = nn.Linear(2*hidden_dim, 1)
        
    def forward(self, x, s):
        s, sort_index = torch.sort(s, 0,descending=True)
        s = s.numpy().tolist()
        x = x[sort_index]
        x = self.embeddings(x)
        x_pack = pack_padded_sequence(x, s, batch_first=True)
        out_pack, (ht, ct) = self.lstm(x_pack)
        h = torch.cat((ht[-2,:,:], ht[-1,:,:]), dim = 1)
        h = self.linear(h)
        return torch.zeros_like(h).scatter_(0, sort_index.unsqueeze(1).cuda(), h)

In [95]:
vocab_size = len(words)
model3 = LSTMBiModel(vocab_size, 50, 50).cuda()

In [96]:
train_epocs(model3, epochs=15, lr=0.01)

train loss 0.413 val loss 0.404 and val accuracy 0.821
train loss 0.342 val loss 0.448 and val accuracy 0.808
train loss 0.344 val loss 0.439 and val accuracy 0.809


## Bi GRUS

In [97]:
class GRUBiModel(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super(GRUBiModel,self).__init__()
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.gru = nn.GRU(embedding_dim, hidden_dim, num_layers=2, batch_first=True,
                            dropout=0.3, bidirectional=True)
        self.linear = nn.Linear(2*hidden_dim, 1)
        
    def forward(self, x, s):
        x = self.embeddings(x)
        x_pack = pack_padded_sequence(x, s, batch_first=True)
        out_pack, ht = self.gru(x_pack)
        h = torch.cat((ht[-2,:,:], ht[-1,:,:]), dim = 1)
        return self.linear(h)

## Exercise:
Start with pre-trained embeddings.

## References

The model in this notebook is adapted from this [pytorch tutorial](https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html). 