#### Sentiment Analysis using RNN
#### PyTorch

In [47]:
import os
import time
import numpy as np
from tqdm import tqdm
from string import punctuation
from collections import Counter
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [48]:
# read sentiments and reviews data from the text files
review_list = []
label_list = []
for label in ['pos', 'neg']:
    for fname in tqdm(os.listdir(
        f'./data/aclImdb/train/{label}/')):
        if 'txt' not in fname:
            continue
        with open(os.path.join(f'./data/aclImdb/train/{label}/',
                              fname), encoding="utf8") as f:
            review_list += [f.read()]
            label_list += [label]
print ('Number of reviews :', len(review_list))

100%|██████████████████████████████████| 12500/12500 [00:00<00:00, 81577.50it/s]
100%|██████████████████████████████████| 12500/12500 [00:00<00:00, 85676.23it/s]

Number of reviews : 25000





In [49]:
# pre-processing review text
review_list = [review.lower() for review in review_list]
review_list = [''.join([letter for letter in review
                        if letter not in punctuation])
                        for review in tqdm(review_list)]
# accumulate all review texts together
reviews_blob = ' '.join(review_list)
# generate list of all words of all reviews
review_words = reviews_blob.split()
# get the word counts
count_words = Counter(review_words)
# sort words as per counts (decreasing order)
total_review_words = len(review_words)
sorted_review_words = count_words.most_common(total_review_words)
print(sorted_review_words[:10])

100%|██████████████████████████████████| 25000/25000 [00:00<00:00, 31828.49it/s]


[('the', 334691), ('and', 162228), ('a', 161940), ('of', 145326), ('to', 135042), ('is', 106855), ('in', 93028), ('it', 77099), ('i', 75719), ('this', 75190)]


##### A word to integer dict. is created. Each word is given a unique integer for further embedding. Here , teh vocab. includes all teh words used in the training review set.
##### example:  [i, like, this, movie] gets converted to [9, 38, 10, 17]

In [50]:
# create word to integer (token) dictionary
# in order to encode text as numbers
vocab_to_token = {word:idx+1 for idx,
                  (word, count) in enumerate(sorted_review_words)}
print(list(vocab_to_token.items())[:10])
print('vocab_to_token list length:',len(list(vocab_to_token.items())))
print('vocab_to_token length:',len(vocab_to_token))

[('the', 1), ('and', 2), ('a', 3), ('of', 4), ('to', 5), ('is', 6), ('in', 7), ('it', 8), ('i', 9), ('this', 10)]
vocab_to_token list length: 121364
vocab_to_token length: 121364


In [51]:
reviews_tokenized = []
for review in review_list:
    word_to_token = [vocab_to_token[word] for word in 
                     review.split()]
    reviews_tokenized.append(word_to_token)
print(review_list[0])
print()
print (reviews_tokenized[0])

zentropa has much in common with the third man another noirlike film set among the rubble of postwar europe like ttm there is much inventive camera work there is an innocent american who gets emotionally involved with a woman he doesnt really understand and whose naivety is all the more striking in contrast with the nativesbr br but id have to say that the third man has a more wellcrafted storyline zentropa is a bit disjointed in this respect perhaps this is intentional it is presented as a dreamnightmare and making it too coherent would spoil the effect br br this movie is unrelentingly grimnoir in more than one sense one never sees the sun shine grim but intriguing and frightening

[13147, 43, 72, 7, 1118, 16, 1, 837, 132, 153, 43770, 19, 272, 756, 1, 15187, 4, 6606, 2322, 38, 57550, 47, 6, 72, 4354, 384, 160, 47, 6, 33, 1309, 313, 36, 201, 2099, 560, 16, 3, 245, 26, 144, 62, 372, 2, 600, 14623, 6, 31, 1, 51, 3226, 7, 2212, 16, 1, 43771, 12, 18, 437, 25, 5, 129, 11, 1, 837, 132, 43, 

In [52]:
### Optional for Understanding 
tmp_review='i like this movie'
tmp_review_tokenised=[vocab_to_token[word] for word in tmp_review.split()]
print(tmp_review_tokenised)

[9, 38, 10, 17]


In [53]:
# encode sentiments as 0 or 1
encoded_label_list = [1 if label =='pos'
                      else 0 for label in label_list]
reviews_len = [len(review) for review in reviews_tokenized]
reviews_tokenized = [reviews_tokenized[i] 
                     for i, l in enumerate(reviews_len)
                     if l>0 ]
encoded_label_list = np.array([encoded_label_list[i]
                              for i, l in enumerate(reviews_len)
                              if l> 0 ], dtype='float32')

##### Input dim to RNN should be consistent. But reviews will have variable length. Hence, padding/truncating is done to bring the sequence length to 512. 

In [54]:
def pad_sequence(reviews_tokenized, sequence_length):
    ''' returns the tokenized review sequences padded with 0's or truncated to the sequence_length.
    '''
    padded_reviews = np.zeros((len(reviews_tokenized), sequence_length), dtype = int)
    
    for idx, review in enumerate(reviews_tokenized):
        review_len = len(review)
        
        if review_len <= sequence_length:
            zeroes = list(np.zeros(sequence_length-review_len))
            new_sequence = zeroes+review
        elif review_len > sequence_length:
            new_sequence = review[0:sequence_length]
        
        padded_reviews[idx,:] = np.array(new_sequence)
    
    return padded_reviews

sequence_length = 512
padded_reviews = pad_sequence(reviews_tokenized=reviews_tokenized, sequence_length=sequence_length)
#plt.hist(reviews_len)

In [55]:
### Optional for Understanding 
### seq length is set to 20 here
tmp_padded_review=pad_sequence(reviews_tokenized=[tmp_review_tokenised], sequence_length=20)
(tmp_padded_review)

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         9, 38, 10, 17]])

In [56]:
train_val_split = 0.75
train_X = padded_reviews[:int(train_val_split*len(padded_reviews))]
train_y = encoded_label_list[:int(train_val_split*len(padded_reviews))]
validation_X = padded_reviews[int(train_val_split*len(padded_reviews)):]
validation_y = encoded_label_list[int(train_val_split*len(padded_reviews)):]

In [57]:
# generate torch datasets
train_dataset = TensorDataset(torch.from_numpy(train_X).to(device), torch.from_numpy(train_y).to(device))
validation_dataset = TensorDataset(torch.from_numpy(validation_X).to(device), torch.from_numpy(validation_y).to(device))

batch_size = 32
# torch dataloaders (shuffle data)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
validation_dataloader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=True)

In [58]:
# get a batch of train data
train_data_iter = iter(train_dataloader)
X_example, y_example = next(train_data_iter)
print('Example Input size: ', X_example.size()) # batch_size, seq_length
print('Example Input:\n', X_example)
print()
print('Example Output size: ', y_example.size()) # batch_size
print('Example Output:\n', y_example)

Example Input size:  torch.Size([32, 512])
Example Input:
 tensor([[   25,    22,   107,  ...,   166,  1789,  4191],
        [    0,     0,     0,  ...,     6,  3846,   161],
        [    0,     0,     0,  ...,    55,   306,  2156],
        ...,
        [  437,   320,    42,  ...,   187,    54,   193],
        [    0,     0,     0,  ...,  2146,     1,  1571],
        [    0,     0,     0,  ..., 22996,    12,  1311]])

Example Output size:  torch.Size([32])
Example Output:
 tensor([1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 0., 1., 0., 1.,
        1., 0., 1., 1., 1., 0., 0., 0., 0., 1., 1., 1., 0., 1.])


##### Note: Embedding layer
When an embedding layer is created, an embedding matrix is initialised with random vectors having dimensions of (num_embeddings(vocab_size), embedding_dim). This is basically our lookup table where our words are mapped to indexes.Given an input word or token, represented by its index in the vocabulary, you pass this index to the embedding layer which then looks up the corresponding row in the embedding matrix. The embedding vector is then extracted from the row as output which is of the dimension embedding_dim. During training, the embedding vectors are updated through backpropagation to minimize the loss. This means the vectors are adjusted to better represent the semantics and relationships between words for the given task here.

##### like in time-series rnn, the input_dim can be (ex:t-5,t-4,t-3,t-2,t-1,t-0) & output can be (t+1). 
##### here we have seq lenth of 512 tokens. Each token is represented by dense-vector of size 100. Hence 512x100 will be processd one by one to get a binary output??  Basically, it processes word by word in a sequencial mannaer? 

In [59]:
class RNN(nn.Module):
    def __init__(self, input_dimension, embedding_dimension, 
                 hidden_dimension, output_dimension):
        super().__init__()
        self.embedding_layer = nn.Embedding(input_dimension,
                                        embedding_dimension)
        self.rnn_layer = nn.RNN(embedding_dimension, 
                                hidden_dimension,
                                num_layers=1)
        self.fc_layer = nn.Linear(hidden_dimension,
                                  output_dimension)
    def forward(self, sequence):
        # sequence shape = (sequence_length, batch_size)
        embedding = self.embedding_layer(sequence)
        # embedding shape = [sequence_length, batch_size, 
        #                    embedding_dimension]
        output, hidden_state = self.rnn_layer(embedding)
        # output shape = [sequence_length, batch_size, 
        #                 hidden_dimension]
        # hidden_state shape = [1, batch_size, 
        #                      hidden_dimension]
        final_output = self.fc_layer(
            hidden_state[-1,:,:].squeeze(0))
        return final_output

In [60]:
# +1 to account for padding
input_dimension = len(vocab_to_token)+1 
embedding_dimension = 100
hidden_dimension = 32
output_dimension = 1
rnn_model = RNN(input_dimension, embedding_dimension,
                hidden_dimension, output_dimension)

optim = torch.optim.Adam(rnn_model.parameters())
loss_func = nn.BCEWithLogitsLoss()

rnn_model = rnn_model.to(device)
loss_func = loss_func.to(device)

In [61]:
def accuracy_metric(predictions, ground_truth):
    """
    Returns 0-1 accuracy for the given set 
    of predictions and ground truth
    """
    # round predictions to either 0 or 1
    rounded_predictions = \
        torch.round(torch.sigmoid(predictions))
    # convert into float for division
    success = (rounded_predictions == ground_truth).float()
    accuracy = success.sum() / len(success)
    return accuracy

In [62]:
def train(model, dataloader, optim, loss_func):
    loss = 0
    accuracy = 0
    model.train()
    for sequence, sentiment in dataloader:
        optim.zero_grad()
        preds = model(sequence.T).squeeze()
        loss_curr = loss_func(preds, sentiment)
        accuracy_curr = accuracy_metric(preds, sentiment)
        loss_curr.backward()
        optim.step()
        loss += loss_curr.item()
        accuracy += accuracy_curr.item()
    return loss/len(dataloader), accuracy/len(dataloader)

In [63]:
def validate(model, dataloader, loss_func):
    loss = 0
    accuracy = 0
    model.eval()
    with torch.no_grad():
        for sequence, sentiment in dataloader:
            preds = model(sequence.T).squeeze()
            loss_curr = loss_func(preds, sentiment)
            accuracy_curr = accuracy_metric(preds, sentiment)
            loss += loss_curr.item()
            accuracy += accuracy_curr.item()
    return loss/len(dataloader), accuracy/len(dataloader)

In [64]:
num_epochs = 1
best_validation_loss = float('inf')
for ep in range(num_epochs):
    time_start = time.time()
    training_loss, train_accuracy = train(rnn_model, 
                                          train_dataloader,
                                          optim, loss_func)
    validation_loss, validation_accuracy = validate(
        rnn_model, validation_dataloader, loss_func)
    time_end = time.time()
    time_delta = time_end - time_start
    if validation_loss < best_validation_loss:
        best_validation_loss = validation_loss
        torch.save(rnn_model.state_dict(), 'rnn_model.pt')
    print(f'epoch number: {ep+1} | time elapsed: {time_delta}s')
    print(f'training loss: {training_loss:.3f} | training accuracy: {train_accuracy*100:.2f}%')
    print(f'\tvalidation loss: {validation_loss:.3f} |  validation accuracy: {validation_accuracy*100:.2f}%')

epoch number: 1 | time elapsed: 66.82937407493591s
training loss: 0.619 | training accuracy: 66.51%
	validation loss: 0.923 |  validation accuracy: 29.96%


In [65]:
def sentiment_inference(model, sentence):
    model.eval()
    # text transformations
    sentence = sentence.lower()
    sentence = ''.join([c for c in sentence
                       if c not in punctuation])
    tokenized = [vocab_to_token.get(token, 0)
                 for token in sentence.split()]
    tokenized = np.pad(tokenized,
                       (512-len(tokenized), 0), 'constant')
    # model inference
    model_input = torch.LongTensor(tokenized).to(device)
    model_input = model_input.unsqueeze(1)
    pred = torch.sigmoid(model(model_input))
    return pred.item()

In [66]:
print(sentiment_inference(rnn_model,
                          "This film is horrible"))
print(sentiment_inference(rnn_model,
                          "Director tried too hard but \
                           this film is bad"))
print(sentiment_inference(rnn_model,
                          "This film will be houseful for weeks"))
print(sentiment_inference(rnn_model,
                          " I just really loved the movie"))

0.3454607427120209
0.3327416479587555
0.302849143743515
0.29525476694107056
