- https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/4%20-%20Convolutional%20Sentiment%20Analysis.ipynb

In [1]:
import torch
import torchtext
from torchtext.legacy import data
from torch.utils.data import DataLoader, random_split
import pandas as pd
import random
from tqdm import tqdm
import torch.nn.functional as F
from torch import nn

torchtext.__version__
# torchtext.__version__

'0.9.1'

In [6]:
!wget https://github.com/rasbt/python-machine-learning-book-3rd-edition/raw/master/ch08/movie_data.csv.gz

--2021-05-09 02:48:55--  https://github.com/rasbt/python-machine-learning-book-3rd-edition/raw/master/ch08/movie_data.csv.gz
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/rasbt/python-machine-learning-book-3rd-edition/master/ch08/movie_data.csv.gz [following]
--2021-05-09 02:48:55--  https://raw.githubusercontent.com/rasbt/python-machine-learning-book-3rd-edition/master/ch08/movie_data.csv.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26521894 (25M) [application/octet-stream]
Saving to: ‘movie_data.csv.gz.1’


2021-05-09 02:48:56 (274 MB/s) - ‘movie_data.csv.gz.1’ saved [26521894/26521894

In [7]:
!gunzip -f movie_data.csv.gz

In [3]:
SEED = 42  # The answer to life, the universe, and everything

# Seed every RNG this notebook relies on in one place: Python's `random`
# (previously seeded only as a side effect of the split cell) and torch.
random.seed(SEED)
torch.manual_seed(SEED)
# Ask cuDNN to pick deterministic kernels so GPU runs are repeatable.
torch.backends.cudnn.deterministic = True

In [4]:
# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    available_device = "cuda"
else:
    available_device = "cpu"

device = torch.device(available_device)
print(device)

cuda


## Creating the dataset and iterator

### Dataset 

In [8]:
"""
Declare how each CSV column is turned into tensors.

TEXT is tokenized with spaCy's `en_core_web_sm` pipeline and batched with the
batch dimension first. A Field also accepts a `preprocessing` function that
runs after tokenization and before numericalization (tokens -> indexes);
none is passed in this notebook.
LABEL is a LabelField whose values use dtype torch.float.
"""

TEXT = data.Field(
    tokenize='spacy',
    tokenizer_language='en_core_web_sm',
    batch_first=True,
)

LABEL = data.LabelField(dtype=torch.float)

# (attribute name on each example, Field that processes that column).
# NOTE(review): assumes the CSV columns are (review text, label) in that order - confirm.
FIELDS = list(zip(('TEXT', 'LABEL'), (TEXT, LABEL)))

dataset = data.TabularDataset(
    path='movie_data.csv',
    fields=FIELDS,
    format='csv',
    skip_header=True,
)

In [9]:
# random.seed() returns None, so `random_state=random.seed(SEED)` passed None
# and only worked through the side effect of seeding the global RNG.
# Seed explicitly and hand the resulting generator state to split() instead.

# 90% train+val / 10% test.
random.seed(SEED)
train_set, test_set = dataset.split(split_ratio=[0.9, 0.1],
                                    random_state=random.getstate())

# Re-seed so the second split starts from the same known state, then carve
# 10% of the remaining data off as the validation set.
random.seed(SEED)
train_set, val_set = train_set.split(split_ratio=[0.9, 0.1],
                                     random_state=random.getstate())

### Build Vocab using the glove 100d 

In [10]:
MAX_VOCAB_SIZE = 25000

# Build the token vocabulary from the training split only, capped at
# MAX_VOCAB_SIZE entries, and attach the 100-d GloVe vectors to it.
# `unk_init` fills the rows of tokens that have no pre-trained vector.
TEXT.build_vocab(
    train_set,
    unk_init=torch.Tensor.normal_,
    vectors="glove.6B.100d",
    max_size=MAX_VOCAB_SIZE,
)

.vector_cache/glove.6B.zip: 862MB [02:40, 5.36MB/s]                           
100%|█████████▉| 398162/400000 [00:14<00:00, 27720.92it/s]

In [11]:
# Build the label vocabulary (label string -> index) from the training split.
LABEL.build_vocab(train_set)

In [None]:
# The ten most frequent tokens in the training split, with their counts.
top_commom_words = TEXT.vocab.freqs.most_common(10)
top_commom_words

[('the', 464964),
 (',', 441024),
 ('.', 378804),
 ('a', 250866),
 ('and', 250750),
 ('of', 231373),
 ('to', 214406),
 ('is', 173465),
 ('in', 141132),
 ('I', 125873)]

In [12]:
# Embedding matrix attached to the vocabulary by build_vocab (one row per entry).
TEXT.vocab.vectors

tensor([[ 1.9269,  1.4873,  0.9007,  ...,  0.1233,  0.3499,  0.6173],
        [ 0.7262,  0.0912, -0.3891,  ...,  0.0821,  0.4440, -0.7240],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.1385,  0.2386,  0.1118,  ..., -0.2863, -0.2599,  0.1893],
        [-0.4029,  0.1755, -0.5962,  ..., -0.7070,  0.3583,  0.0524],
        [-1.9296,  0.0574,  0.5448,  ..., -0.0727, -0.1165, -0.3921]])

In [None]:
# Rows = vocabulary entries, columns = embedding width (100 for glove.6B.100d).
TEXT.vocab.vectors.shape

torch.Size([25002, 100])

In [14]:
# Label counts in the training split - a quick class-balance check.
LABEL.vocab.freqs

Counter({'0': 20265, '1': 20235})

In [13]:
# Mapping from label string to the integer index used as the target.
LABEL.vocab.stoi

defaultdict(None, {'0': 0, '1': 1})

### Data iterator

In [15]:
BATCH_SIZE = 64


def _num_tokens(example):
    """Sort key for the iterators: number of tokens in an example's TEXT field."""
    return len(example.TEXT)


# https://torchtext.readthedocs.io/en/latest/data.html#bucketiterator
# One iterator per split, all sharing the same batch size and sort key;
# sort_within_batch orders the examples inside each batch by that key.
train_iterator, val_iterator, test_iterator = data.BucketIterator.splits(
    (train_set, val_set, test_set),
    sort_key=_num_tokens,
    sort_within_batch=True,
    batch_size=BATCH_SIZE,
)

In [16]:
# Number of batches per training epoch.
print(len(train_iterator))

# Show the TEXT / LABEL tensor sizes of the first batch from the training
# iterator, then from the validation iterator.
for peek_iterator in (train_iterator, val_iterator):
    for first_batch in peek_iterator:
        print(first_batch.TEXT.size())
        print(first_batch.LABEL.size())
        break

633
torch.Size([64, 223])
torch.Size([64])
torch.Size([64, 52])
torch.Size([64])


## Defining the model

### Model architecture

In [31]:
class ConvMaxEmbeds(nn.Module):
    """Convolve n-gram windows of an embedded sentence, then max-pool over time.

    Every filter has shape [filter_size x embed_dim], so it covers
    `filter_size` consecutive tokens across their full embedding width.

    Args:
        in_channel: number of channels of the embedded input.
        n_filters: number of convolution filters (output channels).
        filter_size: number of consecutive tokens each filter spans.
        embed_dim: embedding width; each filter spans all of it.
    """

    def __init__(self, in_channel, n_filters, filter_size, embed_dim):
        super().__init__()
        # Remembered so forward() can pad inputs shorter than one filter window.
        self.filter_size = filter_size
        # Honour `in_channel`; it used to be ignored in favour of a hard-coded 1.
        self.conv = nn.Conv2d(in_channels=in_channel,
                              out_channels=n_filters,
                              kernel_size=(filter_size, embed_dim))

    def forward(self, x):
        """Return the per-filter maximum activation, shape [batch, n_filters].

        Args:
            x: tensor of shape [batch, in_channel, seq_len, embed_dim].
        """
        # A sequence shorter than the filter cannot be convolved; append zero
        # rows along the sequence axis until exactly one window fits.
        missing = self.filter_size - x.shape[2]
        if missing > 0:
            x = F.pad(x, (0, 0, 0, missing))

        # conv: [batch, n_filters, seq_len - filter_size + 1, 1]
        x = self.conv(x).squeeze(3)
        # x = [batch, n_filters, seq_len - filter_size + 1]

        # Pool over the whole remaining sequence axis: one value per filter.
        pooled = F.max_pool1d(x, x.shape[2])
        # pooled = [batch, n_filters, 1]
        return pooled.squeeze(2)

In [32]:
"""
Note: a sentence shorter than the largest filter (5 tokens here) needs to be
padded up to that length before the convolutions can be applied.
The IMDb data has no reviews that short, so training is unaffected,
but keep it in mind when using your own data.
"""
class ConvSentimentAnalyzer(nn.Module):
    """CNN sentiment classifier over word embeddings.

    Three parallel ConvMaxEmbeds branches with filter heights 3, 4 and 5 look
    for tri-grams, 4-grams and 5-grams; their pooled outputs are concatenated,
    passed through dropout and mapped to `out_dim` scores by a linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: width of each embedding vector.
        n_filters: number of filters in each of the three branches.
        out_dim: size of the final linear layer's output.
        dropout: probability passed to nn.Dropout.
        pad_idx: index passed to nn.Embedding as `padding_idx`.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters,
                 out_dim, dropout, pad_idx):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        # One branch per n-gram size, each with `n_filters` filters.
        self.conv_max_1 = ConvMaxEmbeds(1, n_filters, 3, embedding_dim)
        self.conv_max_2 = ConvMaxEmbeds(1, n_filters, 4, embedding_dim)
        self.conv_max_3 = ConvMaxEmbeds(1, n_filters, 5, embedding_dim)

        # Weighs the evidence from all 3 * n_filters n-gram detectors.
        self.fc = nn.Linear(3 * n_filters, out_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Map token indices [batch_size, sent_len] to scores [batch_size, out_dim]."""
        # [batch_size, sent_len] -> [batch_size, sent_len, embed_dim], then add
        # a channel axis of size 1: [batch_size, 1, sent_len, embed_dim]
        embeds = self.embedding(text).unsqueeze(1)

        # Each branch returns [batch_size, n_filters].
        branches = (self.conv_max_1, self.conv_max_2, self.conv_max_3)
        pooled_per_branch = [branch(embeds) for branch in branches]

        # features = [batch_size, n_filters * 3]
        features = torch.cat(pooled_per_branch, dim=1)

        return self.fc(self.dropout(features))

### Initialize the model

In [25]:
# Vocabulary indices of the padding and unknown-word special tokens.
PAD_IDX, UNK_IDX = (
    TEXT.vocab.stoi[special_token]
    for special_token in (TEXT.pad_token, TEXT.unk_token)
)
print(PAD_IDX)
print(UNK_IDX)
TEXT.pad_token

1
0


'<pad>'

In [33]:
"""
Keyword arguments expected by ConvSentimentAnalyzer:
vocab_size, embedding_dim, n_filters, out_dim, dropout, pad_idx
"""

EMBEDDING_DIM = 100  # must match the 100-d GloVe vectors

args = dict(
    vocab_size=len(TEXT.vocab),
    embedding_dim=EMBEDDING_DIM,
    n_filters=100,
    out_dim=1,
    dropout=0.5,
    pad_idx=PAD_IDX,
)

# Build the network, then move its parameters to the selected device.
model = ConvSentimentAnalyzer(**args)
model = model.to(device)

In [34]:
def count_parameters(model):
    """Return the total number of elements in `model`'s trainable parameters.

    Parameters whose `requires_grad` flag is False are not counted.
    """
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

# Report the trainable-parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,620,801 trainable parameters


### Copy the pre-trained embeddings into the model's embedding layer

In [35]:
# Vectors attached to the TEXT vocabulary; the shape should match the
# model's embedding table (vocab size x embedding dim).
pretrained_embeds = TEXT.vocab.vectors
pretrained_embeds.shape

torch.Size([25002, 100])

In [36]:
# In-place copy of the vocabulary vectors into the embedding weights.
model.embedding.weight.data.copy_(pretrained_embeds)

tensor([[ 1.9269,  1.4873,  0.9007,  ...,  0.1233,  0.3499,  0.6173],
        [ 0.7262,  0.0912, -0.3891,  ...,  0.0821,  0.4440, -0.7240],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.1385,  0.2386,  0.1118,  ..., -0.2863, -0.2599,  0.1893],
        [-0.4029,  0.1755, -0.5962,  ..., -0.7070,  0.3583,  0.0524],
        [-1.9296,  0.0574,  0.5448,  ..., -0.0727, -0.1165, -0.3921]],
       device='cuda:0')

#### Set the UNK and PAD embeddings to zeros

As our `<unk>` and `<pad>` tokens aren't in the pre-trained vocabulary, they were initialized using `unk_init` (an $\mathcal{N}(0,1)$ distribution) when we built our vocab. It is preferable to initialize them both to all zeros to explicitly tell our model that, initially, they are irrelevant for determining sentiment.
    
We do this by manually setting their row in the embedding weights matrix to zeros. We get their row by finding the index of the tokens, which we have already done for the padding index.

Note: like initializing the embeddings, this should be done on `weight.data` and not on `weight` itself!

In [37]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

# Overwrite the embedding rows of both special tokens with zeros.
for special_idx in (UNK_IDX, PAD_IDX):
    model.embedding.weight.data[special_idx] = torch.zeros(EMBEDDING_DIM)

# Display the weights to confirm the two rows are now zero.
model.embedding.weight.data

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.1385,  0.2386,  0.1118,  ..., -0.2863, -0.2599,  0.1893],
        [-0.4029,  0.1755, -0.5962,  ..., -0.7070,  0.3583,  0.0524],
        [-1.9296,  0.0574,  0.5448,  ..., -0.0727, -0.1165, -0.3921]],
       device='cuda:0')

### Define the criterion and optimizer

In [38]:
from torch.optim import Adam

# Loss function (commonly called a "criterion" in PyTorch): binary cross
# entropy with logits.
#
# The model outputs one unbounded real number per review, while the labels are
# either 0 or 1, so the prediction has to be squashed into (0, 1) with a
# sigmoid before the binary cross entropy is taken. BCEWithLogitsLoss carries
# out both the sigmoid and the binary cross entropy steps.
criterion = nn.BCEWithLogitsLoss()

lr = 0.005
optimizer = Adam(model.parameters(), lr=lr)

### Accuracy Function

In [39]:
def batch_accuracy(preds, y):
    """Return the fraction of correct predictions in one batch as a Python float.

    Args:
        preds: raw model scores (logits); the callers in this notebook pass
            a tensor of shape [batch_size].
        y: target labels (0. or 1.) with the same shape as `preds`.
    """
    # Sigmoid maps the logits into (0, 1); rounding turns them into 0/1 labels.
    predicted_labels = torch.sigmoid(preds).round()
    hits = torch.eq(predicted_labels, y).to(torch.float)
    return (hits.sum() / len(hits)).item()

### Evaluating Function

In [40]:
def evaluate(model, iterator, criterion):
    """Run `model` over `iterator` without gradients and average the metrics.

    Args:
        model: network called with the batch's TEXT tensor.
        iterator: yields batches exposing `.TEXT` and `.LABEL`.
        criterion: loss called as criterion(scores, labels).

    Returns:
        (mean loss, mean accuracy), each averaged over the number of batches
        (every batch weighs the same, whatever its size).
    """
    batch_losses = []
    batch_accs = []

    model.eval()
    with torch.no_grad():
        for batch in iterator:
            inputs = batch.TEXT.to(device)
            targets = batch.LABEL

            # [batch, 1] -> [batch]
            scores = model(inputs).squeeze(1)

            batch_loss = criterion(scores, targets.to(device))
            # Accuracy is compared on the CPU, where the labels are used as-is.
            batch_accs.append(batch_accuracy(scores.to('cpu'), targets))
            batch_losses.append(batch_loss.item())

    n_batches = len(iterator)
    return sum(batch_losses) / n_batches, sum(batch_accs) / n_batches

### Training Loop

In [41]:
# Number of full passes over the training iterator.
NUM_EPOCHS = 5

In [None]:
!nvidia-smi

Sat May  8 12:03:00 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   53C    P0    27W /  70W |   1082MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [42]:
from tqdm import notebook

In [56]:
# Validation metrics shown in the progress bar. They are initialised once,
# before the epoch loop, so each epoch's bar shows the metrics of the previous
# epoch instead of being reset to 0 (which made the bar always report 0).
val_loss = 0
val_acc = 0

for epoch in range(NUM_EPOCHS):
    loop = notebook.tqdm(train_iterator,
                         total=len(train_iterator),
                         leave=True)
    # The label is the same for the whole epoch, so set it once, not per batch.
    loop.set_description(f"Epoch [{epoch + 1}/{NUM_EPOCHS}]")

    # evaluate() switches the model to eval mode; switch back every epoch.
    model.train()
    for batch in loop:
        x = batch.TEXT.to(device)
        y = batch.LABEL

        optimizer.zero_grad()

        # [batch, 1] -> [batch]
        scores = model(x).squeeze(1)

        loss = criterion(scores, y.to(device))
        loss.backward()

        optimizer.step()

        # Accuracy is for reporting only, so compute it on a detached copy.
        train_acc = batch_accuracy(scores.detach().to('cpu'), y)

        loop.set_postfix(loss=loss.item(), train_acc=train_acc, val_loss=val_loss, val_acc=val_acc)

    val_loss, val_acc = evaluate(model, val_iterator, criterion)

    loop.set_postfix(loss=loss.item(), train_acc=train_acc, val_loss=val_loss, val_acc=val_acc)

print("Done training")

HBox(children=(FloatProgress(value=0.0, max=633.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=633.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=633.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=633.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=633.0), HTML(value='')))


Done training


In [57]:
# Validation loss and accuracy computed after the last training epoch.
print(val_loss)
print(val_acc)

0.5546609364764791
0.8807658452383229


#### Test Evaluation

In [58]:
# Evaluate the trained model on the held-out test split.
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.503 | Test Acc: 88.79%


### Results

In [46]:
import spacy
# Same spaCy pipeline name that the TEXT field's tokenizer was configured with.
spacy_en = spacy.load('en_core_web_sm')

In [47]:
def tokenizer(text):
    """Split `text` into a list of token strings with spaCy's tokenizer."""
    token_texts = []
    for tok in spacy_en.tokenizer(text):
        token_texts.append(tok.text)
    return token_texts
tokenizer("The quick fox can't jump over a lazy dog.")

['The', 'quick', 'fox', 'ca', "n't", 'jump', 'over', 'a', 'lazy', 'dog', '.']

In [53]:
def predict_sentiment(model, review, min_len=5):
    """Return the model's sigmoid score in (0, 1) for a single review string.

    Args:
        model: network mapping a [1, seq_len] index tensor to a single score.
        review: raw review text.
        min_len: minimum number of tokens fed to the model; shorter reviews
            are right-padded with the vocabulary's pad token. The default, 5,
            equals the largest filter height used by the model.

    Returns:
        The sigmoid of the model's output as a Python float.
    """
    model.eval()

    tokens = tokenizer(review)
    shortfall = min_len - len(tokens)
    if shortfall > 0:
        # Use the vocabulary's own pad token rather than a hard-coded literal.
        tokens += [TEXT.pad_token] * shortfall

    # Numericalize: token string -> vocabulary index.
    indexed = [TEXT.vocab.stoi[t] for t in tokens]

    # [seq_len] -> [1, seq_len]: a batch containing just this review.
    in_tensor = torch.tensor(indexed, dtype=torch.long, device=device).unsqueeze(0)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        pred = torch.sigmoid(model(in_tensor))
    return pred.item()

In [54]:
# NOTE(review): this cell's execution count (54) precedes the training cell (56),
# so the score below appears to come from the model before training - confirm.
predict_sentiment(model, "An awesome movie,no chessy scence so which makes it nice ! Must watch")

0.39230817556381226

In [59]:
# Same review as the previous cell, executed after the training loop.
predict_sentiment(model, "An awesome movie,no chessy scence so which makes it nice ! Must watch")

0.9999291896820068

In [60]:
# Short negative example; scores near 0 correspond to label '0'.
predict_sentiment(model, "This film is terrible")

2.1012567685829708e-07

In [61]:
# Short positive example; scores near 1 correspond to label '1'.
predict_sentiment(model, "This film is great")

0.999956488609314