# Sentiment classification with LSTM
In this notebook we will use LSTMs to perform sentiment classification on the [IMDb movie review dataset](http://ai.stanford.edu/~amaas/data/sentiment/): each review is labeled as either positive or negative.

In [1]:
import numpy as np 
import pandas as pd 
import os
import spacy
import string
import re
import numpy as np
from spacy.symbols import ORTH
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

## Dataset

To get the data: <br>
`wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz`

In [2]:
from pathlib import Path
# NOTE: machine-specific absolute path. Point PATH at the directory where
# aclImdb_v1.tar.gz was extracted before running the rest of the notebook.
PATH = Path("/data2/yinterian/aclImdb/")
# Show the top-level contents of the dataset directory.
list(PATH.iterdir())

[PosixPath('/data2/yinterian/aclImdb/README'),
 PosixPath('/data2/yinterian/aclImdb/test'),
 PosixPath('/data2/yinterian/aclImdb/imdbEr.txt'),
 PosixPath('/data2/yinterian/aclImdb/train'),
 PosixPath('/data2/yinterian/aclImdb/imdb.vocab')]

In [3]:
# Read one positive training review to see what the raw text looks like.
path = PATH/"train/pos/0_9.txt"
path.read_text()

'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

## Tokenization

In [4]:
# Run the following command once (first use only) to download the spaCy English model:
#!python3 -m spacy download en

In [5]:
# Matches HTML line-break tags such as <br>, <br/> or <BR /> (case-insensitive).
re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)

def sub_br(x):
    """Return `x` with every HTML line-break tag replaced by a newline."""
    return re_br.sub("\n", x)

my_tok = spacy.load('en')

def spacy_tok(x):
    """Tokenize `x` with the spaCy tokenizer and return the token strings.

    HTML line-break tags are converted to newlines before tokenizing.
    """
    cleaned = sub_br(x)
    return [tok.text for tok in my_tok.tokenizer(cleaned)]

In [6]:
# Sanity check: tokenize a sample review and look at its first 10 tokens.
path = PATH/"train/pos/0_9.txt"
spacy_tok(path.read_text())[:10]

['Bromwell', 'High', 'is', 'a', 'cartoon', 'comedy', '.', 'It', 'ran', 'at']

### Computing vocab2index

In [7]:
# Collect every training review file (positive and negative); the vocabulary
# is built from the training split only.
pos_files = list((PATH/"train"/"pos").iterdir())
neg_files = list((PATH/"train"/"neg").iterdir())
all_files = pos_files + neg_files
all_files[:5]

[PosixPath('/data2/yinterian/aclImdb/train/pos/8030_9.txt'),
 PosixPath('/data2/yinterian/aclImdb/train/pos/8819_10.txt'),
 PosixPath('/data2/yinterian/aclImdb/train/pos/6316_8.txt'),
 PosixPath('/data2/yinterian/aclImdb/train/pos/4781_8.txt'),
 PosixPath('/data2/yinterian/aclImdb/train/pos/10085_10.txt')]

In [8]:
# Count how often each token occurs across all training reviews.
# Every file is read and tokenized here, so this cell does a full pass over the training set.
counts = Counter()
for path in all_files:
    counts.update(spacy_tok(path.read_text()))

In [9]:
#counts

In [10]:
len(counts.keys())

103578

In [11]:
# Drop rare tokens: anything seen fewer than 5 times is removed from `counts`.
# Iterate over a snapshot of the items because `counts` is mutated in the loop.
for word, freq in list(counts.items()):
    if freq >= 5:
        continue
    del counts[word]

In [12]:
len(counts.keys())

33918

In [13]:
# Index 0 is reserved for padding ("") and index 1 for unknown tokens ("UNK");
# every remaining token in `counts` gets the next free index.
words = ["", "UNK"]
vocab2index = {token: position for position, token in enumerate(words)}
for word in counts:
    words.append(word)
    vocab2index[word] = len(words) - 1

In [14]:
#vocab2index

## Encoding reviews and building the PyTorch Dataset

In [55]:
def encode_sentence(path, vocab2index, N=120):
    """Encode the review stored at `path` as a fixed-length array of token ids.

    The text is tokenized with `spacy_tok`; tokens missing from `vocab2index`
    are mapped to the id of "UNK". Only the first `N` ids are kept, and they
    are written at the end of the output, so shorter reviews are left-padded
    with zeros.

    Returns an int32 numpy array of length `N`.
    """
    tokens = spacy_tok(path.read_text())
    token_ids = np.array([vocab2index.get(tok, vocab2index["UNK"]) for tok in tokens])
    n_kept = min(N, len(token_ids))
    padded = np.zeros(N, dtype=np.int32)
    # Right-align the kept ids; everything before them stays 0 (padding).
    padded[N - n_kept:] = token_ids[:n_kept]
    return padded

In [56]:
# Example: encode one negative review into 400 ids; short reviews are left-padded with 0.
path = PATH/"train/neg/211_4.txt"
encode_sentence(path, vocab2index, N=400)

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           1,   774,   101,  2247,   101,   239,    22,  3051,   106,
         455,   834,   123,    52,   940,   131,  1999,   276,  3050,
        1040,    94,   416,  4813,    94,  4814,    76,  2336,  1100,
          76, 31038,    47,   510,   145,  1661,    22,     1,    33,
          25, 18194,

In [57]:
class ImdbDataset(Dataset):
    """Dataset over the review files in `PATH/<train>/pos` and `PATH/<train>/neg`.

    Each item is a pair `(encoded_review, label)` where `encoded_review` is the
    output of `encode_sentence` for the file and `label` is 1 for a positive
    review and 0 for a negative one.

    Parameters
    ----------
    PATH : pathlib.Path
        Root directory of the dataset.
    train : str
        Name of the split sub-directory (e.g. "train" or "test").
    N : int
        Length of every encoded review (passed to `encode_sentence`).
    vocab2index : dict, optional
        Token -> id mapping used for encoding. When omitted, the notebook-level
        `vocab2index` is looked up each time an item is fetched, which is the
        original behavior.
    """
    def __init__(self, PATH, train="train", N=120, vocab2index=None):
        self.N = N
        self.vocab2index = vocab2index
        # Attribute name kept for backward compatibility; it holds the split directory.
        self.path_to_images = PATH/train
        self.pos_files = list((self.path_to_images/"pos").iterdir())
        self.neg_files = list((self.path_to_images/"neg").iterdir())
        self.files = self.pos_files + self.neg_files
        # pos 1, neg 0 -- labels are laid out in the same order as self.files
        self.y = np.concatenate((np.ones(len(self.pos_files), dtype=int),
                                np.zeros(len(self.neg_files), dtype=int)), axis=0)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        path = self.files[idx]
        # Prefer the mapping given at construction; otherwise fall back to the
        # module-level `vocab2index` so existing callers keep working.
        vocab = vocab2index if self.vocab2index is None else self.vocab2index
        return encode_sentence(path, vocab, self.N), self.y[idx]

In [58]:
# Build the train/test datasets with the default encoding length (N=120).
train_ds = ImdbDataset(PATH)
test_ds = ImdbDataset(PATH, "test")

In [59]:
# Only the training loader is shuffled; the test loader keeps the dataset order.
batch_size = 1000
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=batch_size)

In [60]:
test_ds[0]

(array([  293,  4435,   181,    76,   147,    52,    57,  1734,   132,
          238,   137,   523,   463,  7319,    25,  3047,   180,  1879,
         2744,  1694,    74,  4420,    52,  1261,    22, 14488, 31950,
        16989,    47, 14488,     3,   185,  2019,  1331,    76,  1087,
         5949,   250,    74,    90,   283,  6908,  1328, 14488,   123,
         3408,     1,     1,    76,     1,   108,  3073,   755,   134,
        16794,   131,     3,   577,  3040,    47,   392,  1087,   145,
           51,    74,   608,   227,   978,   118,   333,    47,   342,
         4439,    79,   140, 15342,    74,   283,   611,   147,    52,
         1258,    25,   470,     3, 23727,    47,   777,  3608,    64,
           25,   963, 24554,    47,   777,    40,  8328, 11888,   411,
           25,   963,    24,   442,   870,  3418,   123,    52,   252,
          166,  5810,    25,  2712,   106,   508, 10875,   392,    58,
           70,   145,  5928], dtype=int32), 1)

## LSTMs

In [21]:
lstm = nn.LSTM(2, 4)  # input_size=2, hidden_size=4 (each output step has 4 features)

In [22]:
inputs = [torch.randn(1, 2) for _ in range(5)] # make a sequence of length 5
# Stack the 5 steps into one tensor of shape (seq_len=5, batch=1, input_size=2).
inputs = torch.cat(inputs).view(len(inputs), 1, -1)
inputs

tensor([[[ 0.7992,  0.7052]],

        [[-1.0743, -0.4082]],

        [[-1.0387, -0.1603]],

        [[-0.0198,  1.4817]],

        [[-1.9672,  1.0904]]])

In [23]:
inputs.shape

torch.Size([5, 1, 2])

In [24]:
# Initial LSTM state as a pair of zero tensors, each of shape
# (num_layers=1, batch=1, hidden_size=4).
hidden = (torch.zeros(1, 1, 4), torch.zeros(1, 1, 4))

In [25]:
# `out` has one row per time step; `hidden` is rebound to the state pair
# returned by the LSTM after the last step.
out, hidden = lstm(inputs, hidden)

In [26]:
out

tensor([[[ 0.0737, -0.0569,  0.0293,  0.0154]],

        [[ 0.0243, -0.1587,  0.0312, -0.1934]],

        [[ 0.0166, -0.2160,  0.0273, -0.2424]],

        [[ 0.1015, -0.2339,  0.0241, -0.0654]],

        [[ 0.0615, -0.3624,  0.0121, -0.1566]]])

In [27]:
hidden

(tensor([[[ 0.0615, -0.3624,  0.0121, -0.1566]]]),
 tensor([[[ 0.2976, -1.5159,  0.0268, -0.3277]]]))

### Debugging our model

In [28]:
# Use a tiny batch so intermediate tensor shapes are easy to inspect.
# NOTE: this rebinds the notebook-level `batch_size` and `train_dl`.
batch_size = 7
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

In [29]:
# Grab a single batch (encoded reviews, labels) from the training loader.
x,y = next(iter(train_dl))

In [30]:
x.shape

torch.Size([7, 120])

In [31]:
# One embedding row per vocabulary entry; index 0 is the padding id.
vocab_size = len(words)
embedding_dim = 10
embed = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

In [32]:
# Look up an embedding vector for every token id in the batch.
# NOTE: this overwrites `x`, so re-running this cell requires fetching the batch again first.
x = embed(x.long())
x.shape

torch.Size([7, 120, 10])

In [33]:
## x should have dimensions seq_len, batch_size, embedding dimension
# NOTE: this overwrites `x`; running the cell a second time would swap the axes back.
x = x.transpose(0,1)
x.shape

torch.Size([120, 7, 10])

In [34]:
# LSTM that maps embedding_dim-sized inputs to hidden_dim-sized states.
hidden_dim = 9
lstm = nn.LSTM(embedding_dim, hidden_dim)

In [35]:
# Zero initial state pair, each of shape (1, batch_size, hidden_dim).
h = (torch.zeros(1, batch_size, hidden_dim),
     torch.zeros(1, batch_size, hidden_dim))

In [36]:
# Run the whole batch through the LSTM starting from the zero state.
out, hidden = lstm(x, h)

In [37]:
out.shape

torch.Size([120, 7, 9])

In [38]:
# `hidden` is the pair of state tensors returned by the LSTM; inspect the shape of the first one.
h1, h2 = hidden
h1.shape

torch.Size([1, 7, 9])

## Model

In [39]:
class LSTMModel(torch.nn.Module):
    """Embedding -> single-layer LSTM -> linear layer producing one logit per review.

    Parameters
    ----------
    vocab_size : int
        Number of rows in the embedding table (index 0 is the padding id).
    embedding_dim : int
        Size of each token embedding.
    hidden_dim : int
        Size of the LSTM hidden state.
    batch_size : int
        Default batch size used by `init_hidden` when none is given.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, batch_size):
        super(LSTMModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.linearOut = nn.Linear(hidden_dim, 1)

    def forward(self, inputs, hidden=None):
        """Return `(logits, lstm_state)` for a batch of token ids.

        `inputs` is a (batch, seq_len) tensor of token ids. `hidden` is the
        initial `(h_0, c_0)` pair; when it is None the LSTM starts from zeros.
        `logits` has shape (batch, 1).
        """
        x = self.embeddings(inputs)
        # nn.LSTM (batch_first=False) expects (seq_len, batch, embedding_dim).
        x = x.transpose(0, 1)
        lstm_out, lstm_h = self.lstm(x, hidden)
        # Classify from the output at the last time step.
        x = self.linearOut(lstm_out[-1])
        return x, lstm_h

    def init_hidden(self, batch_size=None):
        """Return a zero `(h_0, c_0)` pair on the same device/dtype as the model.

        The axes semantics are (num_layers, minibatch_size, hidden_dim).
        `batch_size` defaults to the value given at construction; pass the
        actual batch size to handle a smaller final batch.
        """
        if batch_size is None:
            batch_size = self.batch_size
        # new_zeros inherits device and dtype from the model's parameters, so
        # this works on CPU and GPU without hard-coding .cuda().
        weight = self.embeddings.weight
        return (weight.new_zeros(1, batch_size, self.hidden_dim),
                weight.new_zeros(1, batch_size, self.hidden_dim))

In [40]:
def train_epocs(model, epochs=10, lr=0.001):
    """Train `model` with Adam for `epochs` passes over the notebook-level `train_dl`.

    After every epoch the sample-weighted mean training loss is printed and
    `test_metrics(model)` is called. Batches are moved to whatever device the
    model's parameters live on, so the model must be placed on the desired
    device before calling this function.

    Parameters
    ----------
    model : the model to train; must provide `init_hidden()` and return
        `(logits, state)` when called as `model(x, hidden)`.
    epochs : number of passes over `train_dl`.
    lr : Adam learning rate.

    Returns the trained model (the same object that was passed in).
    """
    # Follow the model's device instead of hard-coding .cuda().
    device = next(model.parameters()).device
    parameters = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for _ in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y in train_dl:
            x = x.long().to(device)
            y = y.float().to(device)
            hidden = model.init_hidden()
            # Clear gradients left over from the previous step before the new forward/backward.
            optimizer.zero_grad()
            y_pred, _ = model(x, hidden)
            loss = F.binary_cross_entropy_with_logits(y_pred, y.unsqueeze(1))
            loss.backward()
            optimizer.step()
            # Weight each batch loss by its size so the epoch mean is per-sample.
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        print("train loss %.3f" % (sum_loss/total))
        test_metrics(model)
    return model

In [41]:
def test_metrics(model):
    """Evaluate `model` on the notebook-level `test_dl` and print loss and accuracy.

    The model is switched to eval mode and gradients are disabled for the whole
    pass. A review is predicted positive when its logit is greater than 0. The
    printed loss is the sample-weighted mean binary cross-entropy.
    """
    model.eval()
    # Follow the model's device instead of hard-coding .cuda().
    device = next(model.parameters()).device
    correct = 0
    total = 0
    sum_loss = 0.0
    # No autograd graph is needed for evaluation; skipping it saves memory.
    with torch.no_grad():
        for x, y in test_dl:
            x = x.long().to(device)
            y = y.float().to(device).unsqueeze(1)
            hidden = model.init_hidden()
            y_hat, _ = model(x, hidden)
            loss = F.binary_cross_entropy_with_logits(y_hat, y)
            y_pred = y_hat > 0
            # .item() keeps the running count a plain Python number.
            correct += (y_pred.float() == y).sum().item()
            total += y.shape[0]
            sum_loss += loss.item()*y.shape[0]
    print("test loss %.3f and accuracy %.3f" % (sum_loss/total, correct/total))

In [61]:
# Rebuild the datasets with a longer encoding length (N=400 ids per review).
train_ds = ImdbDataset(PATH, "train", 400)
test_ds = ImdbDataset(PATH, "test", 400)

In [62]:
# NOTE: `batch_size` is also passed to LSTMModel below, where it fixes the size
# of the state returned by init_hidden(), so every batch must contain exactly this many reviews.
batch_size = 5000
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=batch_size)

In [63]:
vocab_size = len(words)
print(vocab_size)
# Arguments: vocab_size, embedding_dim=50, hidden_dim=100, batch_size; the model is moved to the GPU.
model = LSTMModel(vocab_size, 50, 100, batch_size).cuda()

33920


In [None]:
# Train for 20 epochs; train loss and test loss/accuracy are printed after each epoch.
model = train_epocs(model, epochs=20, lr=0.01)

train loss 0.696
test loss 0.687 and accuracy 0.532
train loss 0.680
test loss 0.663 and accuracy 0.598
train loss 0.728
test loss 0.668 and accuracy 0.598
train loss 0.669
test loss 0.678 and accuracy 0.581
train loss 0.668
test loss 0.668 and accuracy 0.591
train loss 0.643
test loss 0.646 and accuracy 0.619
train loss 0.589
test loss 0.615 and accuracy 0.677
train loss 0.576
test loss 0.591 and accuracy 0.688
train loss 0.534
test loss 0.612 and accuracy 0.658
train loss 0.516
test loss 0.608 and accuracy 0.672
train loss 0.480
test loss 0.594 and accuracy 0.694
train loss 0.421
test loss 0.570 and accuracy 0.728
train loss 0.367
test loss 0.615 and accuracy 0.734
train loss 0.351
test loss 0.564 and accuracy 0.733
train loss 0.317
test loss 0.570 and accuracy 0.744
train loss 0.279
test loss 0.553 and accuracy 0.767
train loss 0.236
test loss 0.549 and accuracy 0.784
train loss 0.215
test loss 0.605 and accuracy 0.762
train loss 0.226
test loss 0.580 and accuracy 0.764
train loss 0

## Fighting overfitting 

## References

The model in this notebook is adapted from the [PyTorch sequence models tutorial](https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html).