# Brainstorm 
Understanding the Hugging Face tokenizer object and how to encode sentences using them

In [1]:
from tokenizers import Tokenizer

In [2]:
tokenizer = Tokenizer.from_pretrained("bert-base-uncased")

In [7]:
embeddings = tokenizer.encode('Hi, how are you?')
print(embeddings)
print(tokenizer.decode(embeddings.ids))

Encoding(num_tokens=8, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
hi, how are you?


In [9]:
embeddings = tokenizer.encode('Hi, how are you?')
embeddings2 = tokenizer.encode('How are you? Hi. ')
print(embeddings.ids)
print(embeddings2.ids)

[101, 7632, 1010, 2129, 2024, 2017, 1029, 102]
[101, 2129, 2024, 2017, 1029, 7632, 1012, 102]


In [19]:
print(tokenizer.decode(embeddings.ids[0:2]))
print(tokenizer.decode(embeddings2.ids[-3:-1]))
print(tokenizer.decode([101, 7632, 1009]))

hi
hi.
hi +


In [22]:
print(tokenizer.decode(embeddings.ids[0:3]))
print(tokenizer.decode(embeddings2.ids[-4:-1]))
print(tokenizer.decode([101, 7632, 1011]))

hi,
? hi.
hi -


In [23]:
embeddings = tokenizer.encode('I am johnny')
embeddings2 = tokenizer.encode('I am tired')
print(embeddings.ids)
print(embeddings2.ids)

[101, 1045, 2572, 5206, 102]
[101, 1045, 2572, 5458, 102]


In [24]:
print(tokenizer.decode(embeddings.ids[-2:-1]))
print(tokenizer.decode(embeddings2.ids[-2:-1]))

johnny
tired


In [37]:
embeddings = tokenizer.encode('queen')
embeddings2 = tokenizer.encode('princess')
print(embeddings.ids)
print(embeddings2.ids)

embeddings3 = tokenizer.encode('king')
embeddings4 = tokenizer.encode('prince')
print(embeddings3.ids)
print(embeddings4.ids)

[101, 3035, 102]
[101, 4615, 102]
[101, 2332, 102]
[101, 3159, 102]


In [35]:
print(tokenizer.decode([3035]))

queen


In [38]:
from transformers import BertModel, BertTokenizer
model_name = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(model_name)
# load
model = BertModel.from_pretrained(model_name)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [44]:
input_ids = tokenizer.encode('king', add_special_tokens=True)
input_ids2 = tokenizer.encode('king arthur', add_special_tokens=True)
input_ids3 = tokenizer.encode('king kong', add_special_tokens=True)
input_ids4 = tokenizer.encode('arthur', add_special_tokens=True)

In [45]:
print(input_ids)
print(input_ids2)
print(input_ids3)
print(input_ids4)

[101, 2332, 102]
[101, 2332, 4300, 102]
[101, 2332, 4290, 102]
[101, 4300, 102]


In [None]:
input_text = "Here is some text to encode"
# tokenizer-> token_id
input_ids = tokenizer.encode(input_text, add_special_tokens=True)
# input_ids: [101, 2182, 2003, 2070, 3793, 2000, 4372, 16044, 102]
input_ids = torch.tensor([input_ids])

with torch.no_grad():
    last_hidden_states = model(input_ids)[0] # Models outputs are now tuples
last_hidden_states = last_hidden_states.mean(1)
print(last_hidden_states)

# BERT Word Embeddings Tutorial 
From https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/

In [5]:
import torch
from transformers import BertTokenizer, BertModel

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
#logging.basicConfig(level=logging.INFO)

import matplotlib.pyplot as plt

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [6]:
# Define a new example sentence with multiple meanings of the word "bank"
text = "After stealing money from the bank vault, the bank robber was seen " \
       "fishing on the Mississippi river bank."

# Add the special tokens.
marked_text = "[CLS] " + text + " [SEP]"

# Split the sentence into tokens.
tokenized_text = tokenizer.tokenize(marked_text)

# Map the token strings to their vocabulary indeces.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Display the words with their indeces.
for tup in zip(tokenized_text, indexed_tokens):
    print('{:<12} {:>6,}'.format(tup[0], tup[1]))

[CLS]           101
after         2,044
stealing     11,065
money         2,769
from          2,013
the           1,996
bank          2,924
vault        11,632
,             1,010
the           1,996
bank          2,924
robber       27,307
was           2,001
seen          2,464
fishing       5,645
on            2,006
the           1,996
mississippi   5,900
river         2,314
bank          2,924
.             1,012
[SEP]           102


In [7]:
# Mark each of the 22 tokens as belonging to sentence "1".
segments_ids = [1] * len(tokenized_text)

print (segments_ids)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [8]:
# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

In [14]:
print(tokens_tensor)
print(segments_tensors)

tensor([[  101,  2044, 11065,  2769,  2013,  1996,  2924, 11632,  1010,  1996,
          2924, 27307,  2001,  2464,  5645,  2006,  1996,  5900,  2314,  2924,
          1012,   102]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


In [15]:
# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [16]:
# Run the text through BERT, and collect all of the hidden states produced
# from all 12 layers. 
with torch.no_grad():

    outputs = model(tokens_tensor, segments_tensors)

    # Evaluating the model will return a different number of objects based on 
    # how it's  configured in the `from_pretrained` call earlier. In this case, 
    # becase we set `output_hidden_states = True`, the third item will be the 
    # hidden states from all layers. See the documentation for more details:
    # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
    hidden_states = outputs[2]

In [25]:
print(type(outputs))
print(outputs.hidden_states)
print(len(outputs.hidden_states))# Model has 13 layers
print(len(outputs.hidden_states[0])) # Model has one batch
print(len(outputs.hidden_states[0][0])) # Model took 22 tokens in the sentence
print(len(outputs.hidden_states[0][0][0])) # Model has 768 features in its hidden unit

<class 'transformers.modeling_outputs.BaseModelOutputWithPoolingAndCrossAttentions'>
(tensor([[[ 0.1686, -0.2858, -0.3261,  ..., -0.0276,  0.0383,  0.1640],
         [ 0.2329,  0.1390,  0.2979,  ..., -0.0655,  0.8885,  0.5109],
         [ 0.2257, -0.7165, -0.7255,  ...,  0.4844,  0.6030, -0.0957],
         ...,
         [-0.0374, -0.6155, -1.4419,  ...,  0.0793, -0.0811, -0.3802],
         [-0.0228,  0.4207, -0.3288,  ...,  0.4464,  0.5178,  0.5501],
         [-0.2350,  0.1566, -0.0462,  ..., -0.4206,  0.3074, -0.2288]]]), tensor([[[ 0.0522,  0.0595, -0.2179,  ...,  0.2280, -0.0712,  0.0148],
         [ 0.3819,  0.1475,  0.2414,  ...,  0.3397,  0.7607,  0.4999],
         [ 0.1705, -0.6168, -0.7296,  ...,  0.8631,  0.6274, -0.3727],
         ...,
         [ 0.6982, -0.4554, -1.7845,  ...,  0.3308,  0.0710, -0.5187],
         [-0.0905,  0.1862, -0.4437,  ...,  0.2244,  0.1810,  0.3740],
         [-0.0825,  0.0466, -0.1526,  ..., -0.2033,  0.3370, -0.1767]]]), tensor([[[-0.0357, -0.2022, 

In [26]:
# Concatenate the tensors for all layers. We use `stack` here to
# create a new dimension in the tensor.
# Converting a basemodeloutput object into a tensor object
token_embeddings = torch.stack(hidden_states, dim=0)

token_embeddings.size()

torch.Size([13, 1, 22, 768])

In [29]:
print(token_embeddings)
print(type(token_embeddings))

tensor([[[[ 1.6855e-01, -2.8577e-01, -3.2613e-01,  ..., -2.7571e-02,
            3.8253e-02,  1.6400e-01],
          [ 2.3295e-01,  1.3898e-01,  2.9788e-01,  ..., -6.5465e-02,
            8.8849e-01,  5.1089e-01],
          [ 2.2572e-01, -7.1647e-01, -7.2547e-01,  ...,  4.8439e-01,
            6.0302e-01, -9.5701e-02],
          ...,
          [-3.7402e-02, -6.1545e-01, -1.4419e+00,  ...,  7.9256e-02,
           -8.1097e-02, -3.8018e-01],
          [-2.2755e-02,  4.2067e-01, -3.2878e-01,  ...,  4.4641e-01,
            5.1775e-01,  5.5010e-01],
          [-2.3496e-01,  1.5656e-01, -4.6245e-02,  ..., -4.2065e-01,
            3.0737e-01, -2.2883e-01]]],


        [[[ 5.2195e-02,  5.9528e-02, -2.1788e-01,  ...,  2.2799e-01,
           -7.1235e-02,  1.4849e-02],
          [ 3.8188e-01,  1.4754e-01,  2.4141e-01,  ...,  3.3967e-01,
            7.6073e-01,  4.9991e-01],
          [ 1.7047e-01, -6.1683e-01, -7.2964e-01,  ...,  8.6309e-01,
            6.2739e-01, -3.7271e-01],
          ...,
   

In [30]:
token_embeddings = torch.squeeze(token_embeddings) # Remove any dimension with just size = 1
token_embeddings.size()

torch.Size([13, 22, 768])

In [38]:
# Swap dimensions 0 and 1.
token_embeddings = token_embeddings.permute(1,0,2)

token_embeddings.size()

torch.Size([22, 13, 768])

Many different ways to get the vector representation of the sentence vector - we have 22 x 13 x 768 vector

In [39]:
# Stores the token vectors, with shape [22 x 768]
token_vecs_sum = []

# `token_embeddings` is a [22 x 12 x 768] tensor.

# For each token in the sentence...
for token in token_embeddings:

    # `token` is a [12 x 768] tensor

    # Sum the vectors from the last four layers.
    sum_vec = torch.sum(token[-4:], dim=0)
    
    # Use `sum_vec` to represent `token`.
    token_vecs_sum.append(sum_vec)

print ('Shape is: %d x %d' % (len(token_vecs_sum), len(token_vecs_sum[0])))

Shape is: 22 x 768


In [44]:
print((hidden_states[-2].shape)) # Getting the second last layer [22x768]
print((hidden_states[-2][0].shape)) # Getting the second last layer [22x768] ignoring he batch layer
token_vecs = hidden_states[-2][0]

# Calculate the average of all 22 token vectors.
sentence_embedding = torch.mean(token_vecs, dim=0)
print(sentence_embedding.shape)

torch.Size([1, 22, 768])
torch.Size([22, 768])
torch.Size([768])


In [36]:
# t1 = torch.FloatTensor([
#     [[2,1,1],[3,2,2]],
#     [[4,3,3],[5,4,4]],
#     [[6,5,5],[7,6,6]]
# ])
# print(t1)
# print(t1.shape)

tensor([[[2., 1., 1.],
         [3., 2., 2.]],

        [[4., 3., 3.],
         [5., 4., 4.]],

        [[6., 5., 5.],
         [7., 6., 6.]]])
torch.Size([3, 2, 3])


In [37]:
# t1 = t1.permute(2,1,0)
# print(t1)

tensor([[[2., 4., 6.],
         [3., 5., 7.]],

        [[1., 3., 5.],
         [2., 4., 6.]],

        [[1., 3., 5.],
         [2., 4., 6.]]])


In [45]:
from scipy.spatial.distance import cosine

# Calculate the cosine similarity between the word bank 
# in "bank robber" vs "river bank" (different meanings).
diff_bank = 1 - cosine(token_vecs_sum[10], token_vecs_sum[19])

# Calculate the cosine similarity between the word bank
# in "bank robber" vs "bank vault" (same meaning).
same_bank = 1 - cosine(token_vecs_sum[10], token_vecs_sum[6])

print('Vector similarity for  *similar*  meanings:  %.2f' % same_bank)
print('Vector similarity for *different* meanings:  %.2f' % diff_bank)

Vector similarity for  *similar*  meanings:  0.94
Vector similarity for *different* meanings:  0.69
