# Sentiment classification with LSTM
In this notebook we will use LSTMs to do sentiment classification on the [imdb dataset](http://ai.stanford.edu/~amaas/data/sentiment/). 

In [1]:
import numpy as np 
import pandas as pd 
import os
import spacy
import string
import re
import numpy as np
from spacy.symbols import ORTH
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

## Dataset

To get the data: <br>
`wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz`

In [2]:
from pathlib import Path

# Root directory of the extracted aclImdb archive.
# Set the IMDB_PATH environment variable to point at your own copy; the
# original author's location is kept as the default for backward compatibility.
PATH = Path(os.environ.get("IMDB_PATH", "/data2/yinterian/aclImdb/"))
list(PATH.iterdir())

[PosixPath('/data2/yinterian/aclImdb/README'),
 PosixPath('/data2/yinterian/aclImdb/model-gru-86.pth'),
 PosixPath('/data2/yinterian/aclImdb/model-gru.pth'),
 PosixPath('/data2/yinterian/aclImdb/model-78.pth'),
 PosixPath('/data2/yinterian/aclImdb/test'),
 PosixPath('/data2/yinterian/aclImdb/imdbEr.txt'),
 PosixPath('/data2/yinterian/aclImdb/train'),
 PosixPath('/data2/yinterian/aclImdb/imdb.vocab')]

In [3]:
path = PATH/"train/pos/0_9.txt"
path.read_text()

'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

## Tokenization

In [4]:
# first time run this
#!python3 -m spacy download en

In [5]:
# Matches HTML line-break tags such as <br>, <br/>, <br /> or < BR >.
re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)


def sub_br(x):
    """Return `x` with every HTML line-break tag replaced by a newline."""
    cleaned = re_br.sub("\n", x)
    return cleaned

# Shortcut names such as 'en' are no longer supported by spaCy v3, so load the
# English model by its package name (this also works with spaCy v2).
# Install it with: python -m spacy download en_core_web_sm
my_tok = spacy.load('en_core_web_sm')


def spacy_tok(x):
    """Tokenize the raw review text `x` into a list of token strings.

    HTML line breaks are first converted to newlines with `sub_br`; only the
    tokenizer component of the loaded spaCy pipeline is applied.
    """
    return [tok.text for tok in my_tok.tokenizer(sub_br(x))]

In [6]:
path = PATH/"train/pos/0_9.txt"
spacy_tok(path.read_text())[:10]

['Bromwell', 'High', 'is', 'a', 'cartoon', 'comedy', '.', 'It', 'ran', 'at']

### Computing vocab2index

In [7]:
pos_files = list((PATH/"train"/"pos").iterdir())
neg_files = list((PATH/"train"/"neg").iterdir())
all_files = pos_files + neg_files
all_files[:5]

[PosixPath('/data2/yinterian/aclImdb/train/pos/8030_9.txt'),
 PosixPath('/data2/yinterian/aclImdb/train/pos/8819_10.txt'),
 PosixPath('/data2/yinterian/aclImdb/train/pos/6316_8.txt'),
 PosixPath('/data2/yinterian/aclImdb/train/pos/4781_8.txt'),
 PosixPath('/data2/yinterian/aclImdb/train/pos/10085_10.txt')]

In [8]:
# Token frequency table over every file listed in `all_files`.
counts = Counter()
for path in all_files:
    review_tokens = spacy_tok(path.read_text())
    counts.update(review_tokens)

In [9]:
#counts

In [10]:
len(counts.keys())

103578

In [11]:
# Remove rare tokens (seen fewer than 5 times) from `counts` in place.
# The keys to drop are collected first so the Counter is not mutated while
# it is being iterated.
rare_words = [word for word, freq in counts.items() if freq < 5]
for word in rare_words:
    del counts[word]

In [12]:
len(counts.keys())

33918

In [13]:
# Index 0 is the padding entry ("") and index 1 the unknown-word entry ("UNK");
# the remaining words are numbered in the iteration order of `counts`.
words = ["", "UNK"]
words.extend(counts)
vocab2index = {word: idx for idx, word in enumerate(words)}

In [14]:
#vocab2index

## Encoding reviews and the PyTorch Dataset

In [15]:
# Possible improvement: when a review has more than N tokens, take a random
# window of N tokens instead of always keeping the first N.
def encode_sentence(path, vocab2index, N=400):
    """Encode the review stored at `path` as a fixed-length array of word ids.

    Args:
        path: object with a `read_text()` method (e.g. a `pathlib.Path`).
        vocab2index: dict mapping token -> integer id; must contain "UNK",
            whose id is used for tokens missing from the dict.
        N: length of the returned array.

    Returns:
        An int32 numpy array of length N. Only the first N tokens are kept;
        shorter reviews are left-padded with zeros.
    """
    tokens = spacy_tok(path.read_text())
    padded = np.zeros(N, dtype=np.int32)
    ids = np.array([vocab2index.get(tok, vocab2index["UNK"]) for tok in tokens])
    keep = min(N, len(ids))
    # Right-align the kept ids so the padding zeros sit at the front.
    padded[N - keep:] = ids[:keep]
    return padded

In [16]:
path = PATH/"train/neg/211_4.txt"
encode_sentence(path, vocab2index, N=400)

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           1,   774,   101,  2247,   101,   239,    22,  3051,   106,
         455,   834,   123,    52,   940,   131,  1999,   276,  3050,
        1040,    94,   416,  4813,    94,  4814,    76,  2336,  1100,
          76, 31038,    47,   510,   145,  1661,    22,     1,    33,
          25, 18194,

In [17]:
class ImdbDataset(Dataset):
    """Dataset over one split of the aclImdb directory tree.

    Files are listed from `PATH/<train>/pos` and `PATH/<train>/neg`; positive
    reviews get label 1 and negative reviews label 0. Each item is the pair
    `(encode_sentence(file, vocab, N), label)`.

    Args:
        PATH: root directory (a `pathlib.Path`) containing the split folders.
        train: name of the split sub-directory, e.g. "train" or "test".
        N: fixed sequence length passed to `encode_sentence`.
        vocab2index: optional token -> id mapping. When omitted, the
            notebook-level `vocab2index` is looked up at item-access time,
            which is the original behaviour.
    """
    def __init__(self, PATH, train="train", N=400, vocab2index=None):
        self.N = N
        # Attribute name kept for backward compatibility; it holds the split directory.
        self.path_to_images = PATH/train
        self.pos_files = list((self.path_to_images/"pos").iterdir())
        self.neg_files = list((self.path_to_images/"neg").iterdir())
        self.files = self.pos_files + self.neg_files
        self.vocab2index = vocab2index
        # pos 1, neg 0 -- same order as self.files
        self.y = np.concatenate((np.ones(len(self.pos_files), dtype=int),
                                np.zeros(len(self.neg_files), dtype=int)), axis=0)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        path = self.files[idx]
        # Prefer the vocabulary given at construction; otherwise fall back to
        # the module-level `vocab2index` (only evaluated in that case).
        vocab = self.vocab2index if self.vocab2index is not None else vocab2index
        return encode_sentence(path, vocab, self.N), self.y[idx]

In [18]:
train_ds = ImdbDataset(PATH)
test_ds = ImdbDataset(PATH, "test")

In [19]:
batch_size = 1000
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=batch_size)

In [20]:
test_ds[0]

(array([  293,  4435,   181,    76,   147,    52,    57,  1734,   132,
          238,   137,   523,   463,  7319,    25,  3047,   180,  1879,
         2744,  1694,    74,  4420,    52,  1261,    22, 14488, 31950,
        16989,    47, 14488,     3,   185,  2019,  1331,    76,  1087,
         5949,   250,    74,    90,   283,  6908,  1328, 14488,   123,
         3408,     1,     1,    76,     1,   108,  3073,   755,   134,
        16794,   131,     3,   577,  3040,    47,   392,  1087,   145,
           51,    74,   608,   227,   978,   118,   333,    47,   342,
         4439,    79,   140, 15342,    74,   283,   611,   147,    52,
         1258,    25,   470,     3, 23727,    47,   777,  3608,    64,
           25,   963, 24554,    47,   777,    40,  8328, 11888,   411,
           25,   963,    24,   442,   870,  3418,   123,    52,   252,
          166,  5810,    25,  2712,   106,   508, 10875,   392,    58,
           70,   145,  5928,    25,   120,   131,    74,  3662,   577,
      

## Understanding LSTMs 

In [21]:
# Input dim is the dimension of the embedding for each word (2 in the example)
# Output dim is the dimension of the hidden layer (4 in this example)
lstm = nn.LSTM(2, 4)  

In [22]:
inputs = [torch.randn(1, 2) for _ in range(5)] # make a sequence of length 5
inputs = torch.cat(inputs).view(len(inputs), 1, -1)
inputs

tensor([[[ 0.7282,  0.6077]],

        [[-0.3187, -0.7117]],

        [[-1.5429,  0.0020]],

        [[ 0.0564,  0.0937]],

        [[ 1.0420,  1.4207]]])

In [23]:
# RNNs assume this input shape
# input shape should be seq_len x batch_size x embedding dimension
inputs.shape

torch.Size([5, 1, 2])

In [24]:
# LSTMs carry two state tensors instead of one: the hidden state h and the cell state c
hidden = (torch.zeros(1, 1, 4), torch.zeros(1, 1, 4))

In [25]:
out, hidden = lstm(inputs, hidden)

In [26]:
print(out.shape)
out

torch.Size([5, 1, 4])


tensor([[[ 0.0284, -0.0016,  0.1897,  0.0487]],

        [[-0.0148, -0.0602,  0.1194,  0.1238]],

        [[ 0.0118,  0.0869,  0.1618,  0.1522]],

        [[-0.0112,  0.0267,  0.2195,  0.1340]],

        [[ 0.0082,  0.0329,  0.4125,  0.0744]]])

In [27]:
hidden

(tensor([[[ 0.0082,  0.0329,  0.4125,  0.0744]]]),
 tensor([[[ 0.0166,  0.1283,  0.7120,  0.3235]]]))

### Debugging our model

In [28]:
batch_size = 7
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

In [29]:
x,y = next(iter(train_dl))

In [30]:
x.shape

torch.Size([7, 400])

In [31]:
vocab_size = len(words)
embedding_dim = 10
embed = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

In [32]:
x = embed(x.long())
x.shape

torch.Size([7, 400, 10])

In [33]:
## x should have dimensions seq_len, batch_size, embedding dimension
x = x.transpose(0,1)
x.shape

torch.Size([400, 7, 10])

In [34]:
hidden_dim = 9
lstm = nn.LSTM(embedding_dim, hidden_dim)

In [35]:
h = (torch.zeros(1, batch_size, hidden_dim),
     torch.zeros(1, batch_size, hidden_dim))

In [36]:
out, hidden = lstm(x, h)

In [37]:
out.shape

torch.Size([400, 7, 9])

In [38]:
h1, h2 = hidden
h1.shape

torch.Size([1, 7, 9])

## Model

In [39]:
class LSTMModel(torch.nn.Module) :
    """Embedding -> single-layer LSTM -> linear layer producing one score per example.

    Args:
        vocab_size: number of rows in the embedding table (row 0 is padding).
        embedding_dim: size of each word embedding.
        hidden_dim: size of the LSTM hidden state.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super(LSTMModel,self).__init__()
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.linearOut = nn.Linear(hidden_dim, 1)

    def forward(self, inputs, hidden=None) :
        """Run the model on a batch of word-id sequences.

        Args:
            inputs: integer tensor of word ids, batch dimension first.
            hidden: optional `(h_0, c_0)` tuple as returned by `init_hidden`;
                when None, nn.LSTM starts from zero states.

        Returns:
            `(scores, (h_n, c_n))` where `scores` has one column per example.
        """
        x = self.embeddings(inputs)
        # nn.LSTM (batch_first=False) expects seq_len x batch x embedding_dim.
        x = x.transpose(0,1)
        lstm_out, lstm_h = self.lstm(x, hidden)
        # Use the output at the last time step as the sequence summary.
        x = lstm_out[-1]
        x = self.linearOut(x)
        return x, lstm_h

    def init_hidden(self, batch_size):
        """Return zero `(h_0, c_0)` states on the same device as the model."""
        # Before we've done anything, we don't have any hidden state.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        # Allocating on the model's device (instead of a hard-coded .cuda())
        # lets the model run on CPU-only machines as well.
        device = self.linearOut.weight.device
        shape = (1, batch_size, self.hidden_dim)
        return (torch.zeros(shape, device=device),
                torch.zeros(shape, device=device))

In [40]:
def train_epocs(model, epochs=10, lr=0.001):
    """Train `model` with Adam on the notebook-level `train_dl` loader.

    After every epoch the mean training loss is printed and `test_metrics`
    is called on the model.

    Args:
        model: module exposing `init_hidden(batch_size)` and
            `forward(x, hidden) -> (scores, state)`.
        epochs: number of passes over `train_dl`.
        lr: Adam learning rate.

    Returns:
        The same `model` object, trained in place.
    """
    # Send each batch to wherever the model lives instead of assuming CUDA.
    device = next(model.parameters()).device
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for _ in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y in train_dl:
            x = x.long().to(device)
            y = y.float().to(device)
            hidden = model.init_hidden(y.shape[0])
            optimizer.zero_grad()
            y_pred, _ = model(x, hidden)
            # y_pred has one column per example, so give y the same shape.
            loss = F.binary_cross_entropy_with_logits(y_pred, y.unsqueeze(1))
            loss.backward()
            optimizer.step()
            # Weight by batch size so the epoch figure is a per-example mean.
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        print("train loss %.3f" % (sum_loss/total))
        test_metrics(model)
    return model

In [41]:
def test_metrics(model):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    for x, y in test_dl:
        x = x.long().cuda()
        y = y.float().cuda().unsqueeze(1)
        hidden = model.init_hidden(y.shape[0])
        y_hat, _ = model(x, hidden)
        loss = F.binary_cross_entropy_with_logits(y_hat, y)
        y_pred = y_hat > 0
        correct += (y_pred.float() == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
    print("test loss %.3f and accuracy %.3f" % (sum_loss/total, correct/total))

In [42]:
train_ds = ImdbDataset(PATH, "train", 400)
test_ds = ImdbDataset(PATH, "test", 400)

In [43]:
batch_size = 1000
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=batch_size)

In [44]:
vocab_size = len(words)
print(vocab_size)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMModel(vocab_size, 50, 100).to(device)

33920


In [45]:
model = train_epocs(model, epochs=20, lr=0.01)

train loss 0.676
test loss 0.661 and accuracy 0.597
train loss 0.595
test loss 0.578 and accuracy 0.712
train loss 0.534
test loss 0.545 and accuracy 0.752
train loss 0.376
test loss 0.462 and accuracy 0.797
train loss 0.277
test loss 0.434 and accuracy 0.830
train loss 0.203
test loss 0.480 and accuracy 0.791
train loss 0.206
test loss 0.499 and accuracy 0.824
train loss 0.185
test loss 0.569 and accuracy 0.775
train loss 0.136
test loss 0.536 and accuracy 0.826
train loss 0.083
test loss 0.617 and accuracy 0.824
train loss 0.049
test loss 0.642 and accuracy 0.816
train loss 0.035
test loss 0.730 and accuracy 0.823
train loss 0.023
test loss 0.770 and accuracy 0.821
train loss 0.015
test loss 0.823 and accuracy 0.821
train loss 0.011
test loss 0.861 and accuracy 0.820
train loss 0.009
test loss 0.932 and accuracy 0.814
train loss 0.008
test loss 0.947 and accuracy 0.825
train loss 0.006
test loss 0.991 and accuracy 0.816
train loss 0.007
test loss 0.980 and accuracy 0.821
train loss 0

In [46]:
def save_model(m, p):
    """Write the state_dict of model `m` to path `p`."""
    torch.save(m.state_dict(), p)


def load_model(m, p, map_location=None):
    """Load the state_dict stored at `p` into model `m`, in place.

    Args:
        m: model whose parameters are overwritten.
        p: path of a checkpoint written by `save_model`.
        map_location: forwarded to `torch.load`. When None, tensors are mapped
            to the device of `m`'s first parameter, so a checkpoint saved from
            a GPU model can be loaded into a CPU model.
    """
    if map_location is None:
        first_param = next(m.parameters(), None)
        if first_param is not None:
            map_location = first_param.device
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    m.load_state_dict(torch.load(p, map_location=map_location))

In [51]:
p = PATH/"model-81.pth"
save_model(model, p)

In [52]:
test_metrics(model)

test loss 0.939 and accuracy 0.817


In [49]:
load_model(model, p)

## GRU model with dropout

In [53]:
class GRUModel(torch.nn.Module) :
    """Embedding -> dropout -> single-layer GRU -> linear layer producing one score per example.

    Args:
        vocab_size: number of rows in the embedding table (row 0 is padding).
        embedding_dim: size of each word embedding.
        hidden_dim: size of the GRU hidden state.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(GRUModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # Dropout is applied to the word embeddings before the recurrent layer.
        self.dropout = nn.Dropout(0.3)
        # The GRU takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.gru = nn.GRU(embedding_dim, hidden_dim)
        self.linearOut = nn.Linear(hidden_dim, 1)

    def forward(self, inputs, hidden=None) :
        """Run the model on a batch of word-id sequences.

        Args:
            inputs: integer tensor of word ids, batch dimension first.
            hidden: optional initial state as returned by `init_hidden`;
                when None, nn.GRU starts from a zero state.

        Returns:
            `(scores, h_n)` where `scores` has one column per example.
        """
        x = self.embeddings(inputs)
        x = self.dropout(x)
        # nn.GRU (batch_first=False) expects seq_len x batch x embedding_dim.
        x = x.transpose(0,1)
        gru_out, gru_h = self.gru(x, hidden)
        # Use the output at the last time step as the sequence summary.
        x = gru_out[-1]
        x = self.linearOut(x)
        return x, gru_h

    def init_hidden(self, batch_size):
        """Return a zero initial state on the same device as the model."""
        # Before we've done anything, we don't have any hidden state.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        # Allocating on the model's device (instead of a hard-coded .cuda())
        # lets the model run on CPU-only machines as well.
        device = self.linearOut.weight.device
        return torch.zeros(1, batch_size, self.hidden_dim, device=device)

In [56]:
vocab_size = len(words)
print(vocab_size)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model2 = GRUModel(vocab_size, 50, 50).to(device)

33920


In [57]:
model2 = train_epocs(model2, epochs=10, lr=0.01)

train loss 0.676
test loss 0.644 and accuracy 0.637
train loss 0.595
test loss 0.560 and accuracy 0.737
train loss 0.455
test loss 0.543 and accuracy 0.776
train loss 0.380
test loss 0.374 and accuracy 0.839
train loss 0.274
test loss 0.338 and accuracy 0.867
train loss 0.199
test loss 0.314 and accuracy 0.879
train loss 0.264
test loss 0.399 and accuracy 0.857
train loss 0.151
test loss 0.343 and accuracy 0.880
train loss 0.111
test loss 0.414 and accuracy 0.872
train loss 0.087
test loss 0.421 and accuracy 0.872


In [58]:
p = PATH/"model-gru-87.pth"
save_model(model2, p)

## Exercise:
Start with pre-trained embeddings.

## References

The model in this notebook is adapted from this [pytorch tutorial](https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html). 