In [1]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Seed Python's, NumPy's and PyTorch's random number generators with the same
# value so that a fresh "Restart & Run All" starts from the same random state.
SEED = 515
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Ask cuDNN to use deterministic algorithms (only relevant when running on a GPU).
torch.backends.cudnn.deterministic = True

# Transformers for Sentiment Analysis
This notebook follows this tutorial: https://github.com/bentrevett/pytorch-sentiment-analysis.

# Preparing Data

In [2]:
from transformers import BertTokenizer

# One-time setup: fetch the pre-trained tokenizer by name and save a local copy.
# Uncomment these two lines on the first run; afterwards the local copy is enough.
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# tokenizer.save_pretrained('./transformers_cache/bert-base-uncased/')
# Load the tokenizer from the local copy and show its vocabulary size.
tokenizer = BertTokenizer.from_pretrained('./transformers_cache/bert-base-uncased/')
print(len(tokenizer.vocab))

30522


In [3]:
# This will tokenize and lower case the data in a way that is consistent with the pre-trained transformer model.
tokens = tokenizer.tokenize("Hello WORLD how ARE yoU?")
print(tokens)

indexes = tokenizer.convert_tokens_to_ids(tokens)
print(indexes)

['hello', 'world', 'how', 'are', 'you', '?']
[7592, 2088, 2129, 2024, 2017, 1029]


In [4]:
# `cls_token`: The classifier token which is used when doing sequence classification (classification of the whole
# sequence instead of per-token classification). It is the first token of the sequence when built with special tokens.
init_token = tokenizer.cls_token
# `sep_token`: The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
# for sequence classification or for a text and a question for question answering. It is also used as the last token of 
# a sequence built with special tokens.
eos_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

print(init_token, eos_token, pad_token, unk_token)

[CLS] [SEP] [PAD] [UNK]


In [5]:
# Vocabulary ids of the same four special tokens. The torchtext field defined
# later is given these ids rather than the token strings.
init_token_idx = tokenizer.cls_token_id
eos_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id

print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [6]:
# Maximum input length the tokenizer records for 'bert-base-uncased'
# (512 in the saved output); used below to truncate long reviews.
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']
print(max_input_length)

def tokenize_and_cut(sentence):
    """Tokenize `sentence` with the BERT tokenizer and truncate the result.

    At most `max_input_length - 2` tokens are kept. The two spare positions
    leave room for the start and end tokens that the `TEXT` field adds to
    every example, so the final sequence still fits the model's input size.

    Args:
        sentence: Raw text to tokenize.

    Returns:
        The list of token strings, cut to `max_input_length - 2` entries.
    """
    return tokenizer.tokenize(sentence)[:max_input_length - 2]

512


Now we define our fields. The transformer expects the batch dimension to be first, so we set `batch_first = True`. As we already have the vocabulary for our text, provided by the transformer, we set `use_vocab = False` to tell torchtext that we'll be handling the vocabulary side of things. We pass our `tokenize_and_cut` function as the tokenizer. The `preprocessing` argument is a function that takes in the example after it has been tokenized; this is where we will convert the tokens to their indexes. Finally, we define the special tokens, making note that we are defining them to be their index value and not their string value, i.e. `100` instead of `[UNK]`. This is because the sequences will already be converted into indexes. We also set `include_lengths = True`, so the text of each batch comes back as a pair: the padded indexes and the length of each sequence.

In [7]:
import torchtext
from torchtext.data import Field, LabelField, BucketIterator

# `use_vocab`: Whether to use a Vocab object. If False, the data in this field should already be numerical.
# `tokenize` cuts each review to fit the model; `preprocessing` then maps the tokens to ids.
# `include_lengths=True` makes each batch's `text` a (padded ids, lengths) pair rather than a single tensor.
TEXT = Field(batch_first=True, use_vocab=False, 
             tokenize=tokenize_and_cut, preprocessing=tokenizer.convert_tokens_to_ids, 
             init_token=init_token_idx, eos_token=eos_token_idx, pad_token=pad_token_idx, unk_token=unk_token_idx,
             include_lengths=True)
# Labels are stored as floats -- presumably for a binary loss later on; confirm against the training code.
LABEL = LabelField(dtype=torch.float)

# Load the IMDB train/test splits (dataset root: ./data), then carve a validation
# set out of the training data using torchtext's default split settings.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so this step was
# interrupted in the recorded run -- re-run it to completion before relying on later cells.
train_data, test_data = torchtext.datasets.IMDB.splits(TEXT, LABEL, root='data')
train_data, valid_data = train_data.split()

KeyboardInterrupt: 

In [26]:
# Note: The text has already been numericalized. 
print(train_data[0].text[:20])
print(tokenizer.convert_ids_to_tokens(train_data[0].text[:20]))
print(train_data[0].label)

[2061, 1045, 9357, 2006, 1996, 3617, 4942, 29234, 2099, 6833, 2028, 2305, 1037, 3232, 1997, 2086, 3283, 1998, 2245, 1045]
['so', 'i', 'flipped', 'on', 'the', 'digital', 'sub', '##scribe', '##r', 'channels', 'one', 'night', 'a', 'couple', 'of', 'years', 'ago', 'and', 'thought', 'i']
pos


In [27]:
# Build the label vocabulary from the training split only.
LABEL.build_vocab(train_data)

In [28]:
# NOTE(review): the saved output of the next cell lists far more than four
# sequence lengths, so it was presumably produced with a larger batch size.
# Re-run the notebook to refresh that output.
BATCH_SIZE = 4

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One iterator per split, all sharing the same batch size and target device.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size=BATCH_SIZE, device=device)

In [29]:
# Pull a single batch to inspect what the iterator yields. `next(iter(...))`
# raises StopIteration if the iterator is empty, whereas `for ...: break`
# would silently leave `batch` undefined or pointing at a stale value from an
# earlier run of this cell.
batch = next(iter(train_iterator))
# `TEXT` was created with `include_lengths=True`, so `batch.text` is a
# (padded token ids, sequence lengths) pair.
text, text_lens = batch.text
print(text)
print(text_lens)

tensor([[  101,  1996,  2088,  ...,     0,     0,     0],
        [  101,  1012,  1012,  ...,     0,     0,     0],
        [  101,  2023,  3185,  ...,     0,     0,     0],
        ...,
        [  101,  1045,  4912,  ...,     0,     0,     0],
        [  101,  1045,  2064,  ...,     0,     0,     0],
        [  101,  1996, 22640,  ...,     0,     0,     0]])
tensor([287, 198, 339, 512, 152, 143, 221, 185, 198, 149, 284, 254, 128, 512,
        345, 234, 512, 461, 147, 264, 241, 297, 113, 319, 288, 125, 512, 187,
        171, 377, 162, 296, 234, 249, 317, 235, 177, 203,  65, 240, 362, 218,
        156, 512, 185, 155, 176, 429, 512, 217, 315, 152, 255, 186, 132, 191,
        512, 448, 182, 512,  72, 142, 113,  82, 321, 236, 316, 512, 165, 316,
        117, 287, 122, 197, 512, 205,  75, 431, 369, 237, 194, 512, 512, 133,
        422, 218, 282, 512, 295,  72, 164, 177, 177, 160, 159, 236, 228, 195,
        190, 318, 255, 208, 183, 275, 406, 206, 181, 209, 268, 199, 498, 337,
        158, 2

# Build Model

In [12]:
from transformers import BertModel

# One-time setup: fetch the pre-trained model by name and save a local copy.
# Uncomment these two lines on the first run; afterwards the local copy is enough.
# bert = BertModel.from_pretrained('bert-base-uncased')
# bert.save_pretrained('./transformers_cache/bert-base-uncased/')
# Load the pre-trained model from the local copy.
bert = BertModel.from_pretrained('./transformers_cache/bert-base-uncased/')

In [13]:
def count_parameters(model):
    """Count the trainable scalar parameters of `model`.

    Args:
        model: Any object exposing `parameters()` that yields tensors
            (e.g. a `torch.nn.Module`).

    Returns:
        The total number of elements across all parameters whose
        `requires_grad` flag is set; frozen parameters are ignored.
    """
    trainable = 0
    for param in model.parameters():
        # Frozen parameters are not updated during training, so skip them.
        if param.requires_grad:
            trainable += param.numel()
    return trainable

count_parameters(bert)

109482240

Instead of using an embedding layer to get embeddings for our text, we'll be using the pre-trained transformer model. These embeddings will then be fed into a GRU to produce a prediction for the sentiment of the input sentence. We get the embedding dimension size (called the `hidden_size`) from the transformer via its config attribute. The rest of the initialization is standard.

Within the forward pass, we wrap the transformer in a `no_grad` block to ensure no gradients are calculated over this part of the model. The transformer returns the embeddings for the whole sequence as well as a *pooled* output. The [documentation](https://huggingface.co/transformers/model_doc/bert.html#transformers.BertModel) states that the pooled output is "usually not a good summary of the semantic content of the input, you’re often better with averaging or pooling the sequence of hidden-states for the whole input sequence", hence we will not be using it. The rest of the forward pass is the standard implementation of a recurrent model, where we take the hidden state from the final time-step and pass it through a linear layer to get our predictions.

In [32]:
bert.config.hidden_size

768

In [40]:
# Feed only the padded token ids (the first element of the `batch.text` pair)
# through the pre-trained transformer. The notebook uses the transformer as a
# frozen feature extractor, so run it under `torch.no_grad()` to avoid
# building an autograd graph for this inspection.
with torch.no_grad():
    bert_outputs = bert(batch.text[0])
# Display the result, as the bare call did before.
bert_outputs

KeyboardInterrupt: 

In [37]:
# Encode one sentence with the special start/end tokens added, then prepend a
# batch dimension so the ids form a single-row 2-D tensor.
torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)

tensor([[  101,  7592,  1010,  2026,  3899,  2003, 10140,   102]])

In [1]:
# NOTE(review): the saved output is a NameError and the execution count is 1,
# i.e. this cell ran on a fresh kernel before `batch` existed. Re-run the
# cells that build `batch` first, or drop this scratch cell.
batch.text[0].size()

NameError: name 'batch' is not defined