In [2]:
!nvidia-smi

Sat Jun 12 03:34:28 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
import torch
import torchtext
from torchtext.legacy import data
from torch.utils.data import DataLoader, random_split
import pandas as pd
import random
from tqdm import tqdm
import torch.nn.functional as F
from torch import nn

torchtext.__version__
# torchtext.__version__

'0.9.1'

## Learning N-grams

In [4]:
sentence = "The cow jumps over the moon"
sentence_list = sentence.split(" ")
sentence_list

['The', 'cow', 'jumps', 'over', 'the', 'moon']

In [5]:
for x,y,z in (zip(*[sentence_list[i:] for i in range(3)])):
    # zip maps the list elements entered from the for loop together
    print(x, y, z)
    # print(y)m

The cow jumps
cow jumps over
jumps over the
over the moon


In [1]:
def generate_ngrams(tokenized_sentence, n=2):
    # sentence_list = sentence.split(" ")
    n_grams = set(zip(*[tokenized_sentence[i:] for i in range(n)]))
    # zip maps the list elements entered from the for loop together
    # x = []
    for n_gram in n_grams:
        # print(n_gram)
        tokenized_sentence.append(' '.join(n_gram))
    return tokenized_sentence

t_sentence = ['This', 'film', 'is', 'terrible']
x = generate_ngrams(t_sentence)
x

['This', 'film', 'is', 'terrible', 'film is', 'is terrible', 'This film']

In [6]:
!wget https://github.com/rasbt/python-machine-learning-book-3rd-edition/raw/master/ch08/movie_data.csv.gz

--2021-06-12 03:34:56--  https://github.com/rasbt/python-machine-learning-book-3rd-edition/raw/master/ch08/movie_data.csv.gz
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/rasbt/python-machine-learning-book-3rd-edition/master/ch08/movie_data.csv.gz [following]
--2021-06-12 03:34:57--  https://raw.githubusercontent.com/rasbt/python-machine-learning-book-3rd-edition/master/ch08/movie_data.csv.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26521894 (25M) [application/octet-stream]
Saving to: ‘movie_data.csv.gz’


2021-06-12 03:34:57 (166 MB/s) - ‘movie_data.csv.gz’ saved [26521894/26521894]



In [7]:
!gunzip -f movie_data.csv.gz

In [8]:
SEED = 42  # The answer to life, the universe, and everything

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [9]:
available_device= "cuda" if torch.cuda.is_available() else "cpu"
# available_device= "cpu"
device = torch.device(available_device)
print(device)
# device

cuda


## Creating the dataset and iterator

### Dataset 

In [10]:
"""
TorchText Fields have a preprocessing argument. 
A function passed here will be applied to a sentence after it has been tokenized,
but before it has been numericalized(transformed from a list of tokens to a list of indexes). 
This is where we'll pass our generate_ngrams function.

"""

TEXT = data.Field(tokenize='spacy',  # tokenizer
              tokenizer_language='en_core_web_sm',  # none
              preprocessing=generate_ngrams
              )

LABEL = data.LabelField(dtype=torch.float)

FIELDS = [('TEXT', TEXT), ('LABEL', LABEL)]

dataset = data.TabularDataset(path='movie_data.csv',
                                  format='csv',
                                  fields=FIELDS,
                                  skip_header=True)

In [11]:
train_set, test_set = dataset.split(split_ratio=[0.9, 0.1],
                                    random_state=random.seed(SEED))

train_set, val_set = train_set.split(split_ratio=[0.9, 0.1],
                                        random_state=random.seed(SEED))

### Build Vocab using the glove 100d 

In [12]:
MAX_VOCAB_SIZE = 25000

TEXT.build_vocab(train_set,
                max_size=MAX_VOCAB_SIZE,
                vectors="glove.6B.100d",
                unk_init = torch.Tensor.normal_)

.vector_cache/glove.6B.zip: 862MB [02:41, 5.34MB/s]                           
 99%|█████████▉| 397222/400000 [00:13<00:00, 29562.08it/s]

In [13]:
LABEL.build_vocab(train_set)

In [14]:
### 10 common word train_set
top_commom_words = TEXT.vocab.freqs.most_common(10)
top_commom_words

[('the', 464964),
 (',', 441024),
 ('.', 378804),
 ('a', 250866),
 ('and', 250750),
 ('of', 231373),
 ('to', 214406),
 ('is', 173465),
 ('in', 141132),
 ('I', 125873)]

In [15]:
TEXT.vocab.vectors

tensor([[ 1.9269,  1.4873,  0.9007,  ...,  0.1233,  0.3499,  0.6173],
        [ 0.7262,  0.0912, -0.3891,  ...,  0.0821,  0.4440, -0.7240],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.2478,  0.0297,  0.3133,  ..., -0.5366,  0.0615,  0.2137],
        [ 0.6144,  1.0666,  0.9477,  ..., -0.2019,  0.1443, -0.3882],
        [ 0.6160,  0.1285,  0.4471,  ...,  0.5756,  0.9194, -0.2133]])

In [16]:
TEXT.vocab.vectors.shape

torch.Size([25002, 100])

In [17]:
LABEL.vocab.freqs

Counter({'0': 20265, '1': 20235})

In [18]:
LABEL.vocab.stoi

defaultdict(None, {'0': 0, '1': 1})

### Data iterator

In [19]:
BATCH_SIZE = 64

# https://torchtext.readthedocs.io/en/latest/data.html#bucketiterator
train_iterator, val_iterator, test_iterator = data.BucketIterator.splits(
                                              (train_set, val_set, test_set),
                                              batch_size=BATCH_SIZE,
                                              sort_within_batch=True,          #
                                              sort_key=lambda x: len(x.TEXT))

In [21]:
print(len(train_iterator))
for t_i in train_iterator:
    print(t_i.TEXT.size())
    print(t_i.LABEL.size())
    break

for v_i in val_iterator:
    print(v_i.TEXT.size())
    print(v_i.LABEL.size())
    break

633
torch.Size([117, 64])
torch.Size([64])
torch.Size([103, 64])
torch.Size([64])


## Defining the model

### Model architecture

In [22]:
"""
Instead, it first calculates the word embedding for each word using the Embedding layer (blue), 
then calculates the average of all of the word embeddings (pink)
and feeds this through the Linear layer (silver), and that's it!
"""
class AvgPoolFastText(nn.Module):
    def __init__(self, vocab_size, embedding_dim, 
                 out_dim, pad_idx):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        
        self.fc = nn.Linear(embedding_dim, out_dim)
            
    def forward(self, text):
        # text =  [sent_len, batch_size]
        embeds = self.embedding(text)
        # embeds = [sent_len, batch_size, embed_dim]
        embeds = embeds.permute(1,0,2)
        # embeds = [batch_size, sent_len, embed_dim]

        """
        We implement the averaging with the avg_pool2d (average pool 2-dimensions) function.
        you can think of the word embeddings as a 2-dimensional grid, 
        where the words are along one axis `[1]`
        and the dimensions of the word embeddings are along the other `[3]`
        The avg_pool2d uses a filter of size embeds.shape[1] (i.e. the length of the sentence) by 1.

        We calculate the average value of all elements covered by the filter,
        Our filter covers each column 
        then the filter then slides to the right, along the embedings dimension
        
        n filter covers all embedding dimensions we get a [1xembed_dim] tensor
        """
        avg_pooled = F.avg_pool2d(embeds, (embeds.shape[1], 1))        
        #avg_pooled = [batch size, 1 , embed_dim]

        avg_pooled_squeeze = avg_pooled.squeeze(1)
        #avg_pooled = [batch size , embed_dim]

        return self.fc(avg_pooled_squeeze)

### Initialize the model

In [23]:
PAD_IDX  =  TEXT.vocab.stoi[TEXT.pad_token]
UNK_IDX  =  TEXT.vocab.stoi[TEXT.unk_token]
print(PAD_IDX)
print(UNK_IDX)
TEXT.pad_token

1
0


'<pad>'

In [24]:
# vocab_size, embedding_dim, hidden_dim, 
# out_dim, n_layers, bidirectional, dropout, pad_idx):

EMBEDDING_DIM = 100  # since glove has 100d

args = {
    "vocab_size" : len(TEXT.vocab),
    "embedding_dim" : EMBEDDING_DIM,
    "out_dim" : 1,
    "pad_idx": PAD_IDX 
}

model = AvgPoolFastText(**args).to(device)

In [25]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,500,301 trainable parameters


### Copy the pretrained embeds to the models embedings layers

In [26]:
pretrained_embeds = TEXT.vocab.vectors
pretrained_embeds.shape

torch.Size([25002, 100])

In [27]:
model.embedding.weight.data.copy_(pretrained_embeds)

tensor([[ 1.9269,  1.4873,  0.9007,  ...,  0.1233,  0.3499,  0.6173],
        [ 0.7262,  0.0912, -0.3891,  ...,  0.0821,  0.4440, -0.7240],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.2478,  0.0297,  0.3133,  ..., -0.5366,  0.0615,  0.2137],
        [ 0.6144,  1.0666,  0.9477,  ..., -0.2019,  0.1443, -0.3882],
        [ 0.6160,  0.1285,  0.4471,  ...,  0.5756,  0.9194, -0.2133]],
       device='cuda:0')

#### UNK and PAD value to zeros

As our <unk> and <pad> token aren't in the pre-trained vocabulary they have been initialized using unk_init (an $\mathcal{N}(0,1)$ distribution) when building our vocab. It is preferable to initialize them both to all zeros to explicitly tell our model that, initially, they are irrelevant for determining sentiment.
    
We do this by manually setting their row in the embedding weights matrix to zeros. We get their row by finding the index of the tokens, which we have already done for the padding index.

Note: like initializing the embeddings, this should be done on the weight.data and not the weight!

In [28]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

model.embedding.weight.data

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.2478,  0.0297,  0.3133,  ..., -0.5366,  0.0615,  0.2137],
        [ 0.6144,  1.0666,  0.9477,  ..., -0.2019,  0.1443, -0.3882],
        [ 0.6160,  0.1285,  0.4471,  ...,  0.5756,  0.9194, -0.2133]],
       device='cuda:0')

### Define the criterion and optimizer

In [29]:
from torch.optim import Adam

lr = 0.005
optimizer = Adam(model.parameters(), lr=lr)


"""
Next, we'll define our loss function. In PyTorch this is commonly called a criterion.

The loss function here is binary cross entropy with logits.

Our model currently outputs an unbound real number. As our labels are either 0 or 1, 
we want to restrict the predictions to a number between 0 and 1. We do this using the sigmoid or logit functions.
"""

criterion = nn.BCEWithLogitsLoss()
# The BCEWithLogitsLoss criterion carries out both the sigmoid and the binary cross entropy steps.

### Accuracy Function

In [30]:
def batch_accuracy(preds, y):
    # preds = [batch_size]
    # y = [batch_size, 1]
    
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc  = correct.sum() / len(correct)
    
    return acc.item()

### Evaluating Function

In [31]:
def evaluate(model, iterator, criterion):
    total_loss = 0
    total_acc = 0
    
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            x = batch.TEXT.to(device)
            y = batch.LABEL
            #  print(text_lengths.shape)
            out = model(x)
#           k out = model(batch.REVIEWS)
            scores = out.squeeze(1)
            
            loss = criterion(scores, y.to(device))
            acc = batch_accuracy(scores.to('cpu'),y)
            
            total_acc+=acc
            total_loss+=loss.item()
    
    return total_loss/len(iterator), total_acc/len(iterator)

### Training Loop

In [32]:
NUM_EPOCHS = 5

In [None]:
!nvidia-smi

Sat May  8 12:03:00 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   53C    P0    27W /  70W |   1082MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [33]:
from tqdm import notebook

In [34]:
for epoch in range(NUM_EPOCHS):
    loop = notebook.tqdm(train_iterator,
                total=len(train_iterator),
                leave=True)
    val_loss = 0
    val_acc = 0
    model.train()
    for batch in loop:
        x = batch.TEXT.to(device)
        y = batch.LABEL
#         print(text_lengths.shape)
        out = model(x)
        scores = out.squeeze(1)
        
        optimizer.zero_grad()
        
        loss = criterion(scores, y.to(device))
        loss.backward()
        
        optimizer.step()
        
        train_acc = batch_accuracy(scores.to('cpu'), batch.LABEL)
        
        loop.set_description(f"Epoch [{epoch + 1}/{NUM_EPOCHS}]")
        loop.set_postfix(loss=loss.item(), train_acc=train_acc, val_loss=val_loss, val_acc=val_acc)
                             
        # break
    
    val_loss, val_acc  = evaluate(model, val_iterator, criterion)
        
    loop.set_postfix(loss=loss.item(), train_acc=train_acc, val_loss=val_loss, val_acc=val_acc)
 
print("Done training")

HBox(children=(FloatProgress(value=0.0, max=633.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=633.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=633.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=633.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=633.0), HTML(value='')))


Done training


In [35]:
print(val_loss)
print(val_acc)

0.3204576610259607
0.9045774651245332


#### Test Evaluation

In [36]:
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.297 | Test Acc: 90.57%


### Results

In [37]:
import spacy
spacy_en = spacy.load('en_core_web_sm')

In [38]:
def tokenizer(text): # create a tokenizer function
    return [tok.text for tok in spacy_en.tokenizer(text)]
tokenizer("The quick fox can't jump over a lazy dog.")

['The', 'quick', 'fox', 'ca', "n't", 'jump', 'over', 'a', 'lazy', 'dog', '.']

In [53]:
def predict_sentiment(model, review):
    model.eval()
    tokens = tokenizer(review)
    ngram_tokens = generate_ngrams(tokens)

    # numericalized
    indexed = [TEXT.vocab.stoi[t] for t in ngram_tokens]
    # length = [len(indexed)]
    print(ngram_tokens)
    print(indexed)
    in_tensor = torch.LongTensor(indexed).to(device)
    in_tensor = in_tensor.unsqueeze(1)
    # length_tensor = torch.LongTensor(length)

    pred = torch.sigmoid(model(in_tensor))
    return pred.item()

In [52]:
predict_sentiment(model, "A Okay movie, plot is simple")

[175, 4292, 20, 3, 148, 9, 1071, 1311, 283, 15182, 10374, 0, 0]


1.0

In [55]:
predict_sentiment(model, "An awesome movie, Must watch")

['An', 'awesome', 'movie', ',', 'Must', 'watch', ', Must', 'movie ,', 'awesome movie', 'Must watch', 'An awesome']
[1661, 2483, 20, 3, 20490, 142, 0, 283, 0, 0, 0]


1.0

In [56]:
predict_sentiment(model, "This film is terrible")

['This', 'film', 'is', 'terrible', 'film is', 'is terrible', 'This film']
[70, 24, 9, 654, 291, 6526, 804]


1.0248406475937834e-28