In [None]:
import torch
import torchtext
from torchtext.legacy import data
from torch.utils.data import DataLoader, random_split
import pandas as pd
import random
from tqdm import tqdm
import torch.nn.functional as F
from torch import nn

torchtext.__version__
# torchtext.__version__

In [4]:
!wget https://github.com/rasbt/python-machine-learning-book-3rd-edition/raw/master/ch08/movie_data.csv.gz

--2021-05-08 09:38:47--  https://github.com/rasbt/python-machine-learning-book-3rd-edition/raw/master/ch08/movie_data.csv.gz
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/rasbt/python-machine-learning-book-3rd-edition/master/ch08/movie_data.csv.gz [following]
--2021-05-08 09:38:47--  https://raw.githubusercontent.com/rasbt/python-machine-learning-book-3rd-edition/master/ch08/movie_data.csv.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26521894 (25M) [application/octet-stream]
Saving to: ‘movie_data.csv.gz’


2021-05-08 09:38:48 (210 MB/s) - ‘movie_data.csv.gz’ saved [26521894/26521894]



In [5]:
!gunzip -f movie_data.csv.gz

In [6]:
from get_dataset import get_dataset
help(get_dataset)

Help on function get_dataset in module get_dataset:

get_dataset(include_lengths=False)
    returns the
    TEXT, LABEL, train_set, val_set and test_set
    include_lengths=False



In [7]:
SEED = 42  # The answer to life, the universe, and everything

# Seed every random number generator this notebook imports, so that a
# "Restart Kernel -> Run All" produces the same numbers each time.
random.seed(SEED)        # Python's built-in RNG (imported in the first cell)
torch.manual_seed(SEED)  # PyTorch RNG
# Ask cuDNN to pick deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [8]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    available_device = "cuda"
else:
    available_device = "cpu"

device = torch.device(available_device)
print(device)

cuda


## Creating the dataset and iterator

### Dataset 

In [9]:
TEXT, LABEL, train_set, val_set, test_set = get_dataset(include_lengths=True)

50000


In [10]:
MAX_VOCAB_SIZE = 25000

TEXT.build_vocab(train_set,
                max_size=MAX_VOCAB_SIZE,
                vectors="glove.6B.100d",
                unk_init = torch.Tensor.normal_)

.vector_cache/glove.6B.zip: 862MB [02:41, 5.33MB/s]                           
100%|█████████▉| 399602/400000 [00:15<00:00, 26602.05it/s]

In [11]:
LABEL.build_vocab(train_set)

In [12]:
### 10 common word train_set
top_commom_words = TEXT.vocab.freqs.most_common(10)
top_commom_words

[('the', 464964),
 (',', 441024),
 ('.', 378804),
 ('a', 250866),
 ('and', 250750),
 ('of', 231373),
 ('to', 214406),
 ('is', 173465),
 ('in', 141132),
 ('I', 125873)]

In [None]:
TEXT.vocab.vectors

tensor([[ 1.9269,  1.4873,  0.9007,  ...,  0.1233,  0.3499,  0.6173],
        [ 0.7262,  0.0912, -0.3891,  ...,  0.0821,  0.4440, -0.7240],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.3509, -0.8664,  1.1617,  ..., -0.5238, -1.9368,  0.2217],
        [ 0.6168, -1.0092, -0.0051,  ..., -0.0352, -0.2554,  0.0779],
        [ 0.4369,  0.3981, -0.2551,  ..., -0.3327,  0.4569,  0.6567]])

In [None]:
TEXT.vocab.vectors.shape

torch.Size([25002, 100])

In [None]:
LABEL.vocab.freqs

Counter({'1': 20235, '0': 20265})

In [17]:
LABEL.vocab.stoi

defaultdict(None, {'0': 0, '1': 1})

### Data iterator

In [13]:
BATCH_SIZE = 64  # number of reviews per mini-batch

# https://torchtext.readthedocs.io/en/latest/data.html#bucketiterator
# Build one iterator per split (train / validation / test), all with the same
# batch size. `sort_key` measures an example by the number of tokens in its
# REVIEWS field.
train_iterator, val_iterator, test_iterator = data.BucketIterator.splits(
                                              (train_set, val_set, test_set),
                                              batch_size=BATCH_SIZE,
                                              sort_within_batch=True,          # order each batch by sort_key; the model feeds batches to pack_padded_sequence, which by default expects lengths in decreasing order
                                              sort_key=lambda x: len(x.REVIEWS))

In [14]:
print(len(train_iterator))
for t_i in train_iterator:
    print(t_i.REVIEWS[0].size())
    print(t_i.LABEL.size())
    break

for v_i in val_iterator:
    print(v_i.REVIEWS[0].size())
    print(v_i.LABEL.size())
    break

633
torch.Size([223, 64])
torch.Size([64])
torch.Size([52, 64])
torch.Size([64])


## Defining the model

#### Learning about embedding layers

In [None]:
emb_l = nn.Embedding(32, 10, padding_idx=0)
emb_l.weight.data.shape

torch.Size([32, 10])

In [None]:
batch_text_dummy =  [[2,0,1,0,30], [1, 0, 29, 0 , 0]]  # extra 0's for padding
input_ =  torch.LongTensor(batch_text_dummy).T
input_.shape  # seq_len, batch_Size

torch.Size([5, 2])

In [None]:
out_ = emb_l(input_)
out_.shape

torch.Size([5, 2, 10])

In [None]:
out_[:,0,:]

tensor([[-0.8888,  1.3465, -1.2052,  0.5617,  0.5860, -0.1069,  0.7700,  0.8317,
         -0.8908, -0.0312],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000],
        [ 0.4977,  0.6553,  0.9393,  0.7443,  1.3530,  1.3392, -0.0201, -0.0159,
         -0.6893, -1.1100],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000],
        [-1.0366, -2.0125,  0.2895, -0.1463, -1.6394,  0.0331, -1.6140,  0.6373,
          1.8592, -1.4831]], grad_fn=<SliceBackward>)

In [None]:
# jut_[1]
out_[:,1,:]

tensor([[ 1.3623, -0.4913,  1.1900, -1.3396,  0.2789,  0.0026,  0.6211, -1.4329,
         -0.9110, -0.8595],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000],
        [-0.3568, -0.1084, -0.7495, -0.4504,  1.3501, -0.9847, -1.2822, -1.1335,
          0.5987,  1.7185],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000]], grad_fn=<SliceBackward>)

In [None]:
rnn  = nn.RNN(10, 3)
rnn.all_weights[0][0].shape

torch.Size([3, 10])

In [None]:
rnn.all_weights[0][1].shape

torch.Size([3, 3])

In [None]:
rnn.all_weights[0][2].shape

torch.Size([3])

In [None]:
rnn.all_weights[0][3].shape

torch.Size([3])

In [None]:
out_rnn, hid = rnn(out_)

In [None]:
out_rnn.shape

torch.Size([5, 2, 3])

In [None]:
hid.shape

torch.Size([1, 2, 3])

### Model architecture

In [15]:
class Sentiment_Analyzer2(nn.Module):
    """Embedding -> (optionally bidirectional) multi-layer LSTM -> linear head.

    The model consumes a padded batch of token indices together with the true
    length of every sequence, packs the batch so the LSTM skips padding, and
    classifies each sequence from the final hidden state of the last layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_dim: LSTM hidden size (per direction).
        out_dim: number of output units of the linear head.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM also reads the sequence backwards.
        dropout: dropout probability, applied to the embeddings, between LSTM
            layers (only when there is more than one layer) and to the final
            hidden state.
        pad_idx: vocabulary index of the padding token; its embedding row is
            handled by ``nn.Embedding(padding_idx=...)``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, 
                 out_dim, n_layers, bidirectional, dropout, pad_idx):
        super().__init__()

        # Remembered so forward() knows how to assemble the final hidden state.
        self.bidirectional = bidirectional
        num_directions = 2 if bidirectional else 1

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        self.lstm = nn.LSTM(input_size=embedding_dim,
                            hidden_size=hidden_dim,
                            num_layers=n_layers,
                            bidirectional=bidirectional,
                            # Inter-layer dropout is meaningless for a single
                            # layer and makes PyTorch emit a warning.
                            dropout=dropout if n_layers > 1 else 0.0)

        # A bidirectional LSTM yields a forward and a backward final state that
        # are concatenated, so the linear layer sees twice the hidden size; a
        # unidirectional LSTM yields a single state of size hidden_dim.
        self.fc = nn.Linear(num_directions * hidden_dim, out_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return one row of ``out_dim`` raw scores (logits) per sequence.

        Args:
            text: token indices, shape [sent_len, batch_size].
            text_lengths: true (unpadded) length of each sequence, shape
                [batch_size]. Sequences must be ordered by decreasing length,
                which is what pack_padded_sequence expects by default.

        Returns:
            Tensor of shape [batch_size, out_dim].
        """
        # embeds = [sent_len, batch_size, embed_dim]
        embeds = self.dropout(self.embedding(text))

        # Pack the batch so the LSTM only processes non-padded positions.
        # The lengths argument of pack_padded_sequence must be a CPU tensor,
        # hence the explicit .to('cpu').
        # https://androidkt.com/pads-and-pack-variable-length-sequences-in-pytorch/
        packed_embeds = nn.utils.rnn.pack_padded_sequence(embeds, text_lengths.to('cpu'))

        # With packed input, hidden and cell come from the last *non-padded*
        # element of every sequence rather than from a trailing pad token.
        # hidden = [num_layers * num_directions, batch_size, hid_dim]
        # cell   = [num_layers * num_directions, batch_size, hid_dim]
        packed_output, (hidden, cell) = self.lstm(packed_embeds)

        # Unpacking is not needed for classification; it is kept for reference.
        # pad_packed_sequence returns a (padded_output, lengths) pair.
        # output = [sent_len, batch_size, hid_dim * num_directions]
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)

        if self.bidirectional:
            # Last layer: forward state is hidden[-2], backward is hidden[-1].
            final_hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        else:
            # Single direction: the last layer's state is simply hidden[-1].
            final_hidden = hidden[-1,:,:]

        # final_hidden = [batch_size, hid_dim * num_directions]
        final_hidden = self.dropout(final_hidden)

        return self.fc(final_hidden)

### Initialize the model

In [16]:
PAD_IDX  =  TEXT.vocab.stoi[TEXT.pad_token]
UNK_IDX  =  TEXT.vocab.stoi[TEXT.unk_token]
print(PAD_IDX)
print(UNK_IDX)
TEXT.pad_token

1
0


'<pad>'

In [18]:
# vocab_size, embedding_dim, hidden_dim, 
# out_dim, n_layers, bidirectional, dropout, pad_idx):

EMBEDDING_DIM = 100  # since glove has 100d

args = {
    "vocab_size" : len(TEXT.vocab),
    "embedding_dim" : EMBEDDING_DIM,
    "hidden_dim" : 256,
    "out_dim" : 1,
    "n_layers": 2,
    "bidirectional": True,
    "dropout": 0.5,
    "pad_idx": PAD_IDX 
}

model = Sentiment_Analyzer2(**args).to(device)

In [44]:
def count_parameters(model):
    """Return the total number of elements across the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        # Parameters that do not require gradients are not counted.
        if param.requires_grad:
            total += param.numel()
    return total

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 4,810,857 trainable parameters


### Copy the pretrained embeddings into the model's embedding layer

In [19]:
pretrained_embeds = TEXT.vocab.vectors
pretrained_embeds.shape

torch.Size([25002, 100])

In [20]:
model.embedding.weight.data.copy_(pretrained_embeds)

tensor([[ 1.9269,  1.4873,  0.9007,  ...,  0.1233,  0.3499,  0.6173],
        [ 0.7262,  0.0912, -0.3891,  ...,  0.0821,  0.4440, -0.7240],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.1385,  0.2386,  0.1118,  ..., -0.2863, -0.2599,  0.1893],
        [-0.4029,  0.1755, -0.5962,  ..., -0.7070,  0.3583,  0.0524],
        [-1.9296,  0.0574,  0.5448,  ..., -0.0727, -0.1165, -0.3921]],
       device='cuda:0')

#### UNK and PAD value to zeros

As our <unk> and <pad> tokens aren't in the pre-trained vocabulary, they have been initialized using unk_init (an $\mathcal{N}(0,1)$ distribution) when building our vocab. It is preferable to initialize them both to all zeros to explicitly tell our model that, initially, they are irrelevant for determining sentiment.
    
We do this by manually setting their row in the embedding weights matrix to zeros. We get their row by finding the index of the tokens, which we have already done for the padding index.

Note: like initializing the embeddings, this should be done on the weight.data and not the weight!

In [21]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

model.embedding.weight.data

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.1385,  0.2386,  0.1118,  ..., -0.2863, -0.2599,  0.1893],
        [-0.4029,  0.1755, -0.5962,  ..., -0.7070,  0.3583,  0.0524],
        [-1.9296,  0.0574,  0.5448,  ..., -0.0727, -0.1165, -0.3921]],
       device='cuda:0')

### Define the criterion and optimizer

In [22]:
from torch.optim import Adam

lr = 0.005
optimizer = Adam(model.parameters(), lr=lr)


"""
Next, we'll define our loss function. In PyTorch this is commonly called a criterion.

The loss function here is binary cross entropy with logits.

Our model currently outputs an unbounded real number (a logit). As our labels are either 0 or 1, 
we want to restrict the predictions to a number between 0 and 1. We do this using the sigmoid function, which maps a logit to a value between 0 and 1.
"""

criterion = nn.BCEWithLogitsLoss()
# The BCEWithLogitsLoss criterion carries out both the sigmoid and the binary cross entropy steps.

### Accuracy Function

In [23]:
def batch_accuracy(preds, y):
    """Return the fraction of a batch's predictions that match the labels.

    preds: raw model scores (logits); they are passed through a sigmoid and
           rounded to the nearest of 0 / 1 before comparison.
    y:     labels to compare against; in this notebook it has the same
           shape as `preds`.

    Returns a Python float: number of matches divided by len() of the
    comparison result.
    """
    probabilities = torch.sigmoid(preds)
    hits = torch.round(probabilities).eq(y).float()
    n_examples = len(hits)
    return (hits.sum() / n_examples).item()

### Evaluating Function

In [35]:
def evaluate(model, iterator, criterion):
    """Run `model` over every batch of `iterator` without tracking gradients.

    Each batch must expose `REVIEWS` as a (token tensor, lengths) pair and
    `LABEL` as the target tensor. Tokens and labels are moved to the
    notebook-level `device`; accuracy is computed on the CPU.

    Returns a tuple (mean loss per batch, mean accuracy per batch).
    """
    model.eval()
    loss_sum, acc_sum = 0, 0

    with torch.no_grad():
        for batch in iterator:
            tokens, lengths = batch.REVIEWS
            labels = batch.LABEL

            # The model emits one column of logits; drop that dimension.
            logits = model(tokens.to(device), lengths).squeeze(1)

            batch_loss = criterion(logits, labels.to(device))
            acc_sum += batch_accuracy(logits.to('cpu'), labels)
            loss_sum += batch_loss.item()

    n_batches = len(iterator)
    return loss_sum / n_batches, acc_sum / n_batches

### Training Loop

In [45]:
NUM_EPOCHS = 2

In [26]:
!nvidia-smi

Sat May  8 10:00:20 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P0    28W /  70W |   1110MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [32]:
from tqdm import notebook

In [46]:
# Train for NUM_EPOCHS passes over train_iterator, showing a progress bar per
# epoch and evaluating on val_iterator at the end of every epoch.
for epoch in range(NUM_EPOCHS):
    loop = notebook.tqdm(train_iterator,
                total=len(train_iterator),
                leave=True)
    # Validation metrics are reset at the start of every epoch, so the bar
    # shows 0 for both while the epoch is still running.
    val_loss = 0
    val_acc = 0
    model.train()
    for batch in loop:
        # batch.REVIEWS is a (token tensor, lengths) pair.
        reviews, text_lengths = batch.REVIEWS
        x = reviews.to(device)
        y = batch.LABEL
        out = model(x, text_lengths)
        # The model emits one column of logits; drop that dimension.
        scores = out.squeeze(1)
        
        # Clear gradients left over from the previous step before backprop.
        optimizer.zero_grad()
        
        loss = criterion(scores, y.to(device))
        loss.backward()
        
        optimizer.step()
        
        # Accuracy of the current batch only, computed on the CPU.
        train_acc = batch_accuracy(scores.to('cpu'), batch.LABEL)
        loop.set_description(f"Epoch [{epoch + 1}/{NUM_EPOCHS}]")
        loop.set_postfix(loss=loss.item(), train_acc=train_acc, val_loss=val_loss, val_acc=val_acc)
    
    # val_loss / val_acc are also read by a later cell, so after the loop they
    # hold the metrics of the final epoch.
    val_loss, val_acc  = evaluate(model, val_iterator, criterion)
        
    # NOTE(review): this runs after the bar has finished iterating; confirm
    # that the updated postfix is still rendered.
    loop.set_postfix(loss=loss.item(), train_acc=train_acc, val_loss=val_loss, val_acc=val_acc)
 
print("Done training")

HBox(children=(FloatProgress(value=0.0, max=633.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=633.0), HTML(value='')))


Done training


In [47]:
print(val_loss)
print(val_acc)

0.28451992781229424
0.8998679577464789


#### Test Evaluation

In [48]:
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.260 | Test Acc: 90.72%


### Results

In [39]:
import spacy
spacy_en = spacy.load('en_core_web_sm')

In [40]:
def tokenizer(text):
    """Split `text` with the tokenizer of the loaded `spacy_en` pipeline and
    return the tokens' text as a list of strings."""
    pieces = []
    for token in spacy_en.tokenizer(text):
        pieces.append(token.text)
    return pieces
tokenizer("The quick fox can't jump over a lazy dog.")

['The', 'quick', 'fox', 'ca', "n't", 'jump', 'over', 'a', 'lazy', 'dog', '.']

In [41]:
def predict_sentiment(model, review):
    """Score a single raw review string with the trained model.

    The review is tokenized with `tokenizer`, mapped to vocabulary indices
    through `TEXT.vocab.stoi`, shaped as a one-example batch
    ([sent_len, 1]) and passed through `model` together with its length.

    Args:
        model: a module called as ``model(tokens, lengths)`` that returns a
            single logit for a one-example batch.
        review: the review text.

    Returns:
        float in [0, 1]: the sigmoid of the model's logit. Values near 1
        correspond to label '1' (presumably "positive" - confirm against
        the dataset).

    Raises:
        ValueError: if the review yields no tokens; a zero-length sequence
            cannot be packed for the LSTM.
    """
    model.eval()
    tokens = tokenizer(review)
    if not tokens:
        raise ValueError("review must contain at least one token")

    indexed = [TEXT.vocab.stoi[t] for t in tokens]

    # Shape [sent_len, 1]: the model expects the batch dimension second.
    in_tensor = torch.LongTensor(indexed).unsqueeze(1).to(device)
    # Lengths stay on the CPU, as required when packing the sequence.
    length_tensor = torch.LongTensor([len(indexed)])

    # Inference only: no need to build the autograd graph.
    with torch.no_grad():
        pred = torch.sigmoid(model(in_tensor, length_tensor))
    return pred.item()

In [50]:
predict_sentiment(model, "An awesome movie,no chessy scence so which makes it nice ! Must watch")

0.981316328048706