In [1]:
import torch
import torch.nn as nn

In [2]:
input_dim = 5
hidden_dim = 10
n_layers = 1

# If batch_first = True then the input and output tensors are (batch, seq, feature)
# Otherwise it's, (seq, batch, feature )
lstm_layer = nn.LSTM(input_dim, hidden_dim, n_layers, batch_first=True)

In [3]:
batch_size = 1
seq_len = 1

# Randomise an input tensor
inp = torch.randn(batch_size, seq_len, input_dim)

# Initialise the hidden state
hidden_state = torch.randn(n_layers, batch_size, hidden_dim)

# Initialise the cell state
cell_state = torch.randn(n_layers, batch_size, hidden_dim)

# Make a tuple of hidden and cell states
hidden = (hidden_state, cell_state)

print("Input shape:",inp.shape)
print("Hidden state shape:", hidden_state.shape)
print("Cell state shape", cell_state.shape)


Input shape: torch.Size([1, 1, 5])
Hidden state shape: torch.Size([1, 1, 10])
Cell state shape torch.Size([1, 1, 10])


In [4]:
print(inp)

tensor([[[ 0.0964,  0.4761,  1.5061,  1.3182, -0.5897]]])


In [5]:
# Remember this is a tuple of hidden_state and cell_state
print(hidden)

(tensor([[[ 0.3679,  0.9422,  0.8958, -1.6662,  0.7875, -0.7595, -0.8883,
           0.8247, -1.0505,  1.7329]]]), tensor([[[ 0.7525, -0.2488, -1.0012,  1.6008, -2.7763, -0.7171, -0.7462,
          -1.6271,  0.6473, -0.2141]]]))


In [6]:
# Perform forward pass
# Remember hidden is a tuple of hidden_state and cell_state
out, hidden = lstm_layer(inp, hidden)

print(out)

tensor([[[ 0.1810, -0.2740, -0.3724,  0.2596, -0.0844, -0.2676, -0.3954,
          -0.1689,  0.0517, -0.2117]]], grad_fn=<TransposeBackward0>)


In [7]:
print(hidden)

(tensor([[[ 0.1810, -0.2740, -0.3724,  0.2596, -0.0844, -0.2676, -0.3954,
          -0.1689,  0.0517, -0.2117]]], grad_fn=<StackBackward>), tensor([[[ 0.3271, -0.3798, -0.6802,  0.3936, -0.3623, -0.7226, -0.7636,
          -0.6220,  0.0659, -0.4625]]], grad_fn=<StackBackward>))


In [8]:
# Output shape is the same as the hidden state
# Also note that the output is in fact the hidden_state
# This is because we have a sequence length of 1

print("Output shape: ", out.shape)

Output shape:  torch.Size([1, 1, 10])


In [9]:
# Now increase the sequence length and see what happens
seq_len = 3
inp = torch.randn(batch_size, seq_len, input_dim)

print(inp.shape)

out, hidden = lstm_layer(inp, hidden)

# The output will have the same sequence length as the input - you get an output for each input in sequence
print(out.shape)

# The cell hidden and cell states are still the same dimension as before
print(hidden[0].shape)
print(hidden[1].shape)


torch.Size([1, 3, 5])
torch.Size([1, 3, 10])
torch.Size([1, 1, 10])
torch.Size([1, 1, 10])


In [10]:
# Obtaining the last output - this is a many to one scenario where we just take the last output of the sequence
print(out)
out = out.squeeze()[-1, :]
print(out)
print(out.shape)

tensor([[[-0.0125, -0.0631, -0.1049,  0.3259, -0.0635, -0.3160, -0.1554,
          -0.2281,  0.0431, -0.0153],
         [-0.1226,  0.0285,  0.0116,  0.2910, -0.0378, -0.2754, -0.0009,
          -0.1127,  0.0512, -0.0146],
         [-0.1392,  0.0695,  0.0348,  0.2763,  0.1415, -0.2827,  0.0290,
          -0.0816,  0.1464, -0.0390]]], grad_fn=<TransposeBackward0>)
tensor([-0.1392,  0.0695,  0.0348,  0.2763,  0.1415, -0.2827,  0.0290, -0.0816,
         0.1464, -0.0390], grad_fn=<SliceBackward>)
torch.Size([10])


In [11]:
print(out)

tensor([-0.1392,  0.0695,  0.0348,  0.2763,  0.1415, -0.2827,  0.0290, -0.0816,
         0.1464, -0.0390], grad_fn=<SliceBackward>)


In [12]:
# Now let's imagine a stock market example of what the data & topology would look like
input_dim = 5  # This is the number of features at each time step
hidden_dim = 10  # Hidden dimension
n_layers = 2  # How deep is the model

# If batch_first = True then the input and output tensors are (batch, seq, feature)
# Otherwise it's, (seq, batch, feature )
lstm_layer = nn.LSTM(input_dim, hidden_dim, n_layers, batch_first=True)

# 100 different sequences in the training batch
# Perhaps a rolling window of training examples?
# Perhaps 100 sequences randonly sampled in a time period (with or without overlap allowed)
batch_size = 100  
seq_len = 1000  # 1000 consecutive days of data for each training sequence

# Then the training label could be:
# The absolute value on the 1001th day (would need to standardise / normalise the data for this to generalise?)
# The sign of the value on the 1001th data vs the 1000th day (binary)?

# Randomise an input tensor
inp = torch.randn(batch_size, seq_len, input_dim)

# Initialise the hidden state
hidden_state = torch.randn(n_layers, batch_size, hidden_dim)

# Initialise the cell state
cell_state = torch.randn(n_layers, batch_size, hidden_dim)

# Make a tuple of hidden and cell states
hidden = (hidden_state, cell_state)

print("Input shape:",inp.shape)
print("Hidden state shape:", hidden_state.shape)
print("Cell state shape", cell_state.shape)


Input shape: torch.Size([100, 1000, 5])
Hidden state shape: torch.Size([2, 100, 10])
Cell state shape torch.Size([2, 100, 10])


In [13]:
import bz2
from collections import Counter

# Regular expressions toolkit
import re

# Natural language toolkit
import nltk

import numpy as np

# punkt is a pre-trained tokenizer in English. Divides text into a list of sentences using an unsupervised algo
# to build a model for abbreviationof words, collocations and words that start sentences.
nltk.download('punkt')

train_file = bz2.BZ2File('../data/amazon_reviews/train.ft.txt.bz2')
test_file = bz2.BZ2File('../data/amazon_reviews/test.ft.txt.bz2')

train_file = train_file.readlines()
test_file = test_file.readlines()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\q2\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
num_train = 800000  # We're training on the first 800,000 reviews in the dataset
num_test = 200000  # Using 200,000 reviews from test set

train_file = [x.decode('utf-8') for x in train_file[:num_train]]
test_file = [x.decode('utf-8') for x in test_file[:num_test]]

In [15]:
print(train_file[0])
print(test_file[0])

__label__2 Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^

__label__2 Great CD: My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I'm in a good mood it makes me feel better. A bad mood just evaporates like sugar in the rain. This CD just oozes LIFE. Vocals are jusat STUUNNING and lyrics just kill. One of life's hidden gems. This is a desert isle CD in my book. Why she never made it big is just beyond me. Everytime I play this, no matter black, white, young, old, male, female EVERYBODY says one thing "Who was that singing ?"



In [16]:
print(train_file[0].split(' ')[0])
print(train_file[0].split(' ', 1)[1][:-1].lower())
print(train_file[0].split(' ', 1)[1][:-1])


__label__2
stuning even for the non-gamer: this sound track was beautiful! it paints the senery in your mind so well i would recomend it even to people who hate vid. game music! i have played the game chrono cross but out of all of the games i have ever played it has the best music! it backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. it would impress anyone who cares to listen! ^_^
Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^


In [17]:
# Extracting labels from sentences

# Assumes the label is everything up to the first space in the review. The split actually splits the sentence into a 
# list - each element of the list is a word / character seq.  The zeroeth element in the list is the label 
# '__label__2' (positive) or # or '__label__1' (negative)

train_labels = [0 if x.split(' ')[0] == '__label__1' else 1 for x in train_file]

# .split(' ',1) means split the review into the first word (the label) and everything else. .split(' ',2) would split the
# review into '__label__2' , 'Stuning' and everything else.
# Then the index slice [1][:-1] accesses this 2nd part [1] and all the characters [:-1] and converst them all to lower case

train_sentences = [x.split(' ', 1)[1][:-1].lower() for x in train_file]

# Do exactly the same for the test data
test_labels = [0 if x.split(' ')[0] == '__label__1' else 1 for x in test_file]
test_sentences = [x.split(' ', 1)[1][:-1].lower() for x in test_file]



In [18]:
# Some simple cleaning of data
# This replaces any numbers with the character '0'

for i in range(len(train_sentences)):
    train_sentences[i] = re.sub('\d','0',train_sentences[i])

print(test_sentences[2])

for i in range(len(test_sentences)):
    test_sentences[i] = re.sub('\d','0',test_sentences[i])

print(test_sentences[2])



batteries died within a year ...: i bought this charger in jul 2003 and it worked ok for a while. the design is nice and convenient. however, after about a year, the batteries would not hold a charge. might as well just get alkaline disposables, or look elsewhere for a charger that comes with batteries that have better staying power.
batteries died within a year ...: i bought this charger in jul 0000 and it worked ok for a while. the design is nice and convenient. however, after about a year, the batteries would not hold a charge. might as well just get alkaline disposables, or look elsewhere for a charger that comes with batteries that have better staying power.


In [19]:
# Modify URLs to <url>
for i in range(len(train_sentences)):
    if 'www.' in train_sentences[i] or 'http:' in train_sentences[i] or 'https:' in train_sentences[i] or '.com' in train_sentences[i]:
        train_sentences[i] = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", train_sentences[i])
        
for i in range(len(test_sentences)):
    if 'www.' in test_sentences[i] or 'http:' in test_sentences[i] or 'https:' in test_sentences[i] or '.com' in test_sentences[i]:
        test_sentences[i] = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", test_sentences[i])

In [20]:
words = Counter()  # Dictionary that will map a word to the number of times it appeared in all the training sentences

for i, sentence in enumerate(train_sentences):
    
    if i == 0:
        print(i, train_sentences[i])
        
    # The sentences will be stored as a list of words/tokens
    train_sentences[i] = []
    
    for word in nltk.word_tokenize(sentence):  # Tokenizing the words
        
        if i == 0:
            print(word)
        
        # Not sure why we're doing the case conversion again
        words.update([word.lower()])  # Converting all the words to lowercase
                
        train_sentences[i].append(word)
        
    if i==0:
        print(train_sentences[i])
        print(words)
        
    if i%20000 == 0:
        print(str((i*100)/num_train) + "% done")
        
print("100% done")

0 stuning even for the non-gamer: this sound track was beautiful! it paints the senery in your mind so well i would recomend it even to people who hate vid. game music! i have played the game chrono cross but out of all of the games i have ever played it has the best music! it backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. it would impress anyone who cares to listen! ^_^
stuning
even
for
the
non-gamer
:
this
sound
track
was
beautiful
!
it
paints
the
senery
in
your
mind
so
well
i
would
recomend
it
even
to
people
who
hate
vid
.
game
music
!
i
have
played
the
game
chrono
cross
but
out
of
all
of
the
games
i
have
ever
played
it
has
the
best
music
!
it
backs
away
from
crude
keyboarding
and
takes
a
fresher
step
with
grate
guitars
and
soulful
orchestras
.
it
would
impress
anyone
who
cares
to
listen
!
^_^
['stuning', 'even', 'for', 'the', 'non-gamer', ':', 'this', 'sound', 'track', 'was', 'beautiful', '!', 'it', 'paints', 'the', 'senery', '

In [21]:
print(words['stunning'])

2242


In [22]:
# Removing the words that only appear once
words = {k:v for k,v in words.items() if v>1}

# Sorting the words according to the number of appearances, with the most common word being first
words = sorted(words, key=words.get, reverse=True)

# Adding padding and unknown to our vocabulary so that they will be assigned an index
words = ['_PAD','_UNK'] + words

# Dictionaries to store the word to index mappings and vice versa
word2idx = {o:i for i,o in enumerate(words)}
idx2word = {i:o for i,o in enumerate(words)}

In [23]:
for i, sentence in enumerate(train_sentences):
    # Looking up the mapping dictionary and assigning the index to the respective words
    train_sentences[i] = [word2idx[word] if word in word2idx else 0 for word in sentence]

for i, sentence in enumerate(test_sentences):
    # For test sentences, we have to tokenize the sentences as well
    test_sentences[i] = [word2idx[word.lower()] if word.lower() in word2idx else 0 for word in nltk.word_tokenize(sentence)]

In [24]:
print(test_sentences[0])

[40, 99, 13, 28, 1448, 4272, 57, 31, 10, 3, 40, 1781, 10, 85, 1730, 2, 5, 27, 907, 8, 11, 99, 16, 152, 6, 5, 140, 90, 9, 2, 68, 5, 122, 14, 7, 42, 1847, 9, 210, 58, 243, 108, 2, 7, 133, 1847, 46, 29316, 38, 2642, 14, 3, 2379, 2, 11, 99, 46, 18845, 160, 2, 934, 30, 0, 0, 6, 560, 46, 1285, 2, 31, 10, 160, 21, 2336, 4156, 2, 11, 12, 7, 3570, 14981, 99, 14, 28, 24, 2, 182, 102, 130, 147, 9, 239, 12, 46, 827, 58, 2, 2587, 5, 263, 11, 4, 72, 601, 444, 4, 579, 4, 416, 4, 153, 4, 1690, 4, 1255, 1816, 521, 31, 179, 33, 80, 18, 17, 829, 61, 32]


In [25]:
print(idx2word[40])
print(word2idx['great'])
print(words[40], words[99], words[13], words[28], words[1448], words[4272])

great
40
great cd : my lovely pat


In [26]:
# Defining a function that either shortens sentences or pads sentences with 0 to a fixed length
# These comments keep referring to sentences but in fact it's multiple sentences inside a review
# The term should be review not sentence

def pad_input(sentences, seq_len):
    features = np.zeros((len(sentences), seq_len),dtype=int)
    for ii, review in enumerate(sentences):
        if len(review) != 0:
            features[ii, -len(review):] = np.array(review)[:seq_len]
    return features

seq_len = 200  # The length that the sentences will be padded/shortened to

train_sentences = pad_input(train_sentences, seq_len)
test_sentences = pad_input(test_sentences, seq_len)

print(test_sentences[0])

# Converting our labels into numpy arrays
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)

[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0    40    99    13    28  1448  4272    57    31    10     3
    40  1781    10    85  1730     2     5    27   907     8    11    99
    16   152     6     5   140    90     9     2    68     5   122    14
     7    42  1847     9   210    58   243   108     2     7   133  1847
    46 29316    38  2642    14     3  2379     2    11    99    46 18845
   160     2   934    30     0     0     6   560    46  1285     2    31
    10   160    21  2336  4156     2    11    12     7  3570 14981    99
    14    28    24     2   182   102   130   147   

In [27]:
# This splits the test populaton (200k reviews) into 50% validation and 50% test set
split_frac = 0.5 # 50% validation, 50% test

print(len(test_sentences))

split_id = int(split_frac * len(test_sentences))
val_sentences, test_sentences = test_sentences[:split_id], test_sentences[split_id:]
val_labels, test_labels = test_labels[:split_id], test_labels[split_id:]


200000


In [28]:
print(len(test_sentences))

100000


In [29]:
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn

train_data = TensorDataset(torch.from_numpy(train_sentences), torch.from_numpy(train_labels))
val_data = TensorDataset(torch.from_numpy(val_sentences), torch.from_numpy(val_labels))
test_data = TensorDataset(torch.from_numpy(test_sentences), torch.from_numpy(test_labels))

print(len(train_data))
print(len(val_data))
print(len(test_data))

batch_size = 400

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

800000
100000
100000


In [30]:
# torch.cuda.is_available() checks and returns a Boolean True if a GPU is available, else it'll return False
is_cuda = torch.cuda.is_available()

print(torch.cuda.is_available())

# If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
if is_cuda:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
    


True


In [31]:
# This network includes a word embedding layer that is also trained

class SentimentNet(nn.Module):
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        super(SentimentNet, self).__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x, hidden):
        batch_size = x.size(0)
        x = x.long()
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        out = self.dropout(lstm_out)
        out = self.fc(out)
        out = self.sigmoid(out)
        
        out = out.view(batch_size, -1)
        out = out[:,-1]
        return out, hidden
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device))
        return hidden

In [32]:
vocab_size = len(word2idx) + 1
output_size = 1
embedding_dim = 400
hidden_dim = 512
n_layers = 2

model = SentimentNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
model.to(device)

lr=0.005
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [33]:
epochs = 2
counter = 0
print_every = 1000
clip = 5
valid_loss_min = np.Inf

model.train()
for i in range(epochs):
    h = model.init_hidden(batch_size)
    
    for inputs, labels in train_loader:
        counter += 1
        h = tuple([e.data for e in h])
        inputs, labels = inputs.to(device), labels.to(device)
        model.zero_grad()
        output, h = model(inputs, h)
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        if counter%print_every == 0:
            val_h = model.init_hidden(batch_size)
            val_losses = []
            model.eval()
            for inp, lab in val_loader:
                val_h = tuple([each.data for each in val_h])
                inp, lab = inp.to(device), lab.to(device)
                out, val_h = model(inp, val_h)
                val_loss = criterion(out.squeeze(), lab.float())
                val_losses.append(val_loss.item())
                
            model.train()
            print("Epoch: {}/{}...".format(i+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))
            if np.mean(val_losses) <= valid_loss_min:
                torch.save(model.state_dict(), './state_dict.pt')
                print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,np.mean(val_losses)))
                valid_loss_min = np.mean(val_losses)

Epoch: 1/2... Step: 1000... Loss: 0.143612... Val Loss: 0.184799
Validation loss decreased (inf --> 0.184799).  Saving model ...
Epoch: 1/2... Step: 2000... Loss: 0.167584... Val Loss: 0.168625
Validation loss decreased (0.184799 --> 0.168625).  Saving model ...
Epoch: 2/2... Step: 3000... Loss: 0.137612... Val Loss: 0.171890
Epoch: 2/2... Step: 4000... Loss: 0.105190... Val Loss: 0.169851


In [34]:
# Loading the best model
model.load_state_dict(torch.load('./state_dict.pt'))

test_losses = []
num_correct = 0
h = model.init_hidden(batch_size)

model.eval()
for inputs, labels in test_loader:
    h = tuple([each.data for each in h])
    inputs, labels = inputs.to(device), labels.to(device)
    output, h = model(inputs, h)
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())
    pred = torch.round(output.squeeze())  # Rounds the output to 0/1
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)

print("Test loss: {:.3f}".format(np.mean(test_losses)))
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}%".format(test_acc*100))

Test loss: 0.163
Test accuracy: 93.746%
