In [1]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model = BertModel.from_pretrained('bert-base-uncased')

In [6]:
named_params = list(model.named_parameters())

print("Number of layers:", len(named_params))
print('===== Embedding Layer =====')
for p in named_params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n===== First Encoder =====\n')
for p in named_params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n===== Output Layer =====\n')
for p in named_params[-2:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

Number of layers: 199
===== Embedding Layer =====
embeddings.word_embeddings.weight                       (30522, 768)
embeddings.position_embeddings.weight                     (512, 768)
embeddings.token_type_embeddings.weight                     (2, 768)
embeddings.LayerNorm.weight                                   (768,)
embeddings.LayerNorm.bias                                     (768,)

===== First Encoder =====

encoder.layer.0.attention.self.query.weight               (768, 768)
encoder.layer.0.attention.self.query.bias                     (768,)
encoder.layer.0.attention.self.key.weight                 (768, 768)
encoder.layer.0.attention.self.key.bias                       (768,)
encoder.layer.0.attention.self.value.weight               (768, 768)
encoder.layer.0.attention.self.value.bias                     (768,)
encoder.layer.0.attention.output.dense.weight             (768, 768)
encoder.layer.0.attention.output.dense.bias                   (768,)
encoder.layer.0.attention

In [None]:
# The pooler is a seperate linear and tanh activated layer that acts on the cls token's representation
# this pooled_ouput is often used as a representation for the entire sentance.

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer.encode('Sinan loves a beautiful day')


[101, 8254, 2319, 7459, 1037, 3376, 2154, 102]

In [8]:
# run tokens through the model

#1. Turn token_with_unkown_words into a tensor (will be size (8,))
#2. Unsqueeze a first dimension to simulate batches, the resulting shape is (1,8)

response = model(torch.tensor(tokenizer.encode('Sinan loves a beautiful day')).unsqueeze(0))


In [9]:
# embedding for each token, the first one being the [CLS] token
response.last_hidden_state

tensor([[[-0.2327,  0.1515, -0.0448,  ..., -0.5192,  0.4195,  0.2948],
         [ 0.3051, -0.6614,  0.2500,  ..., -0.9809,  0.2551,  0.2400],
         [-0.3610, -0.8759,  0.4542,  ..., -1.1120,  0.1791,  0.0664],
         ...,
         [ 0.0689, -0.0364,  0.4940,  ..., -0.6558,  0.2227, -0.3868],
         [-0.2657, -0.4257,  0.0056,  ...,  0.1352,  0.3596, -0.4585],
         [ 0.6100,  0.0263, -0.2532,  ..., -0.0680, -0.3901, -0.3541]]],
       grad_fn=<NativeLayerNormBackward0>)

In [10]:
# This layer is trained on top of the Embedding of the cls token. meant to represent the entire sentance as a whole

response.pooler_output.shape

torch.Size([1, 768])

In [11]:
model.pooler

BertPooler(
  (dense): Linear(in_features=768, out_features=768, bias=True)
  (activation): Tanh()
)

In [12]:
#to create the pooler ourselves we will grab the final represntation of the cls token

CLS_embedding = response.last_hidden_state[:,0,:].unsqueeze(0)

In [13]:
model.pooler(CLS_embedding).shape

torch.Size([1, 768])

In [14]:
(model.pooler(CLS_embedding) == response.pooler_output).all()

tensor(True)

# 4.2 Word Piece Tokenization

## lets start by taking a look at the Bert tokenizer

Let's use the `from_pretrained` method to grab the uncased bert-base tokenizer.

A list of all availible modules can be found on their sike : https/huggingface.co/transformers/pretrained_models.html

In [15]:
from transformers import BertModel, BertTokenizer
import torch
from sklearn.metrics.pairwise import cosine_similarity

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

print(f'Lenght of the vocabulary: {len(tokenizer.vocab)}')

Lenght of the vocabulary: 30522


In [16]:
text = "A simple sentence!"
tokens = tokenizer.encode(text) # get token ids per BERT-base's vocabulary
print(tokens)
# 101 is cls 102 is sep, and 999 is our exclamation point

[101, 1037, 3722, 6251, 999, 102]


In [17]:
# decode will re-construct the sentence with the added [CLS] and [SEP] tokens
# we can also decode and get rid of the cls and sep tokens
tokenizer.decode(tokens)

'[CLS] a simple sentence! [SEP]'

In [22]:
text = "My friend told me about this class and I love it so far! she was right."
tokens = tokenizer.encode(text)
print(tokens)

[101, 2026, 2767, 2409, 2033, 2055, 2023, 2465, 1998, 1045, 2293, 2009, 2061, 2521, 999, 2016, 2001, 2157, 1012, 102]


In [25]:
# a nicer printout of token ids and token strings
def pretty_print_tokens(tokens, tokenizer):
    print(f'Text: {text}. number of tokens: {len(tokens)}')
    for t in tokens:
            print(f'Token: {t}, token string: {tokenizer.decode([t])}')
pretty_print_tokens(tokens, tokenizer)
# in this case every word corresponds to a specific word

Text: My friend told me about this class and I love it so far! she was right.. number of tokens: 20
Token: 101, token string: [CLS]
Token: 2026, token string: my
Token: 2767, token string: friend
Token: 2409, token string: told
Token: 2033, token string: me
Token: 2055, token string: about
Token: 2023, token string: this
Token: 2465, token string: class
Token: 1998, token string: and
Token: 1045, token string: i
Token: 2293, token string: love
Token: 2009, token string: it
Token: 2061, token string: so
Token: 2521, token string: far
Token: 999, token string: !
Token: 2016, token string: she
Token: 2001, token string: was
Token: 2157, token string: right
Token: 1012, token string: .
Token: 102, token string: [SEP]


In [24]:
'sinan' in tokenizer.vocab

#sinan is not in the vocabulary, so it is broken down into subwords

False

sinan will be broken down into `sin` and `##an`. The ## indicates it is part of another word and this is different from just `an`. I belive this would make sense as prefixes/suffixes can be encoded with a lot more meaning when given the `##` to it.

In [26]:
text_with_unkown_words = "Sinan is our instructor for this awesomesuace class"
tokens_with_unknown_words = tokenizer.encode(text_with_unkown_words)
pretty_print_tokens(tokens_with_unknown_words, tokenizer)
# awesomesauce becomes three words

Text: My friend told me about this class and I love it so far! she was right.. number of tokens: 13
Token: 101, token string: [CLS]
Token: 8254, token string: sin
Token: 2319, token string: ##an
Token: 2003, token string: is
Token: 2256, token string: our
Token: 9450, token string: instructor
Token: 2005, token string: for
Token: 2023, token string: this
Token: 12476, token string: awesome
Token: 6342, token string: ##su
Token: 10732, token string: ##ace
Token: 2465, token string: class
Token: 102, token string: [SEP]


In [28]:
#encode plus gives us the token ids, attention mask (a sequence of ones or zeros if that 
#token should be included in attention calculation), useful for training time
# also token_type_ids (sequence of zeros and ones) lets us know if we are passing one or two sequences into bert

tokens= tokenizer.encode_plus(text_with_unkown_words)
print(tokens)

# calling tokenizer directly does the same thing as encode_plus

{'input_ids': [101, 8254, 2319, 2003, 2256, 9450, 2005, 2023, 12476, 6342, 10732, 2465, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [34]:
#python is the 6th token
python_pet = tokenizer.encode('I love my pet python')

#python is the 6th token
python_language = tokenizer.encode('I love coding in python')

# we want to do a cosine similarity between the two python tokens

#contextful embedding of 'python in 'ilove my pet python'
python_pet_embedding=model(torch.tensor(python_pet).unsqueeze(0))[0][:,5,:].detach().numpy()

#look closely above we are taking the 6th token, which is python and then outputting to an numpy array

python_language_embedding=model(torch.tensor(python_language).unsqueeze(0))[0][:,5,:].detach().numpy()
# contextful embedding of snake in 'snake'
snake_alone_embedding=model(torch.tensor(tokenizer.encode('snake')).unsqueeze(0))[0][:,1,:].detach().numpy()

#contextful embedding of 'programming' in 'programming'
programming_alone_embedding=model(torch.tensor(tokenizer.encode('programming')).unsqueeze(0))[0][:,1,:].detach().numpy()

In [35]:
#now lets check the cosine similarities

cosine_similarity(python_language_embedding, snake_alone_embedding)

array([[0.58434767]], dtype=float32)

In [36]:
cosine_similarity(python_pet_embedding, snake_alone_embedding)

array([[0.6928657]], dtype=float32)

In [37]:
cosine_similarity(python_language_embedding, programming_alone_embedding)

array([[0.5614741]], dtype=float32)

# 4.3 the man embeddings of a BERT

In [38]:
from transformers import BertModel, BertTokenizer
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [39]:
"""
word_embeddings = context-free word embeddings
position_embeddings = encodes word position
token_type_embeddings == 0 or 1. Used to look up segment embeddings
"""
model.embeddings

BertEmbeddings(
  (word_embeddings): Embedding(30522, 768, padding_idx=0)
  (position_embeddings): Embedding(512, 768)
  (token_type_embeddings): Embedding(2, 768)
  (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [40]:
example_phrase= "I am Sinan"

tokenizer.encode(example_phrase, return_tensors='pt')

tensor([[ 101, 1045, 2572, 8254, 2319,  102]])

In [41]:
model.embeddings.word_embeddings(tokenizer.encode(example_phrase, return_tensors='pt'))

tensor([[[ 0.0136, -0.0265, -0.0235,  ...,  0.0087,  0.0071,  0.0151],
         [-0.0211,  0.0059, -0.0179,  ...,  0.0163,  0.0122,  0.0073],
         [-0.0437, -0.0150,  0.0029,  ..., -0.0282,  0.0474, -0.0448],
         [-0.0022, -0.0876,  0.0143,  ...,  0.0232, -0.0024, -0.0213],
         [-0.0614, -0.0044, -0.0755,  ..., -0.0522, -0.0310, -0.0248],
         [-0.0145, -0.0100,  0.0060,  ..., -0.0250,  0.0046, -0.0015]]],
       grad_fn=<EmbeddingBackward0>)

In [42]:
model.embeddings.word_embeddings(tokenizer.encode('I am Matt', return_tensors='pt'))
# first and last row of this one are the same as the last one because of the cls and sep token

tensor([[[ 0.0136, -0.0265, -0.0235,  ...,  0.0087,  0.0071,  0.0151],
         [-0.0211,  0.0059, -0.0179,  ...,  0.0163,  0.0122,  0.0073],
         [-0.0437, -0.0150,  0.0029,  ..., -0.0282,  0.0474, -0.0448],
         [-0.0381, -0.0026,  0.0130,  ...,  0.0038, -0.0279, -0.0082],
         [-0.0145, -0.0100,  0.0060,  ..., -0.0250,  0.0046, -0.0015]]],
       grad_fn=<EmbeddingBackward0>)

In [44]:
model.embeddings.position_embeddings

Embedding(512, 768)

In [46]:
model.embeddings.position_embeddings(torch.LongTensor(range(6)))

tensor([[ 1.7505e-02, -2.5631e-02, -3.6642e-02,  ...,  3.3437e-05,
          6.8312e-04,  1.5441e-02],
        [ 7.7580e-03,  2.2613e-03, -1.9444e-02,  ...,  2.8910e-02,
          2.9753e-02, -5.3247e-03],
        [-1.1287e-02, -1.9644e-03, -1.1573e-02,  ...,  1.4908e-02,
          1.8741e-02, -7.3140e-03],
        [-4.1949e-03, -1.1852e-02, -2.1180e-02,  ...,  2.2455e-02,
          5.2826e-03, -1.9723e-03],
        [-5.6087e-03, -1.0445e-02, -7.2288e-03,  ...,  2.0837e-02,
          3.5402e-03,  4.7708e-03],
        [-3.0871e-03, -1.8956e-02, -1.8930e-02,  ...,  7.4045e-03,
          2.0183e-02,  3.4077e-03]], grad_fn=<EmbeddingBackward0>)

In [50]:
model.embeddings.LayerNorm(
    model.embeddings.word_embeddings(tokenizer.encode(example_phrase, return_tensors='pt'))+\
    model.embeddings.position_embeddings(torch.LongTensor(range(6))) +\
    model.embeddings.token_type_embeddings(torch.LongTensor([0,0,0,0,0,0]))
)
#above is the same as below (We are just adding the embeddings and sending them thorugh

model.embeddings(tokenizer.encode(example_phrase,return_tensors='pt'))

tensor([[[ 1.6855e-01, -2.8577e-01, -3.2613e-01,  ..., -2.7571e-02,
           3.8253e-02,  1.6400e-01],
         [-3.4026e-04,  5.3974e-01, -2.8805e-01,  ...,  7.5731e-01,
           8.9008e-01,  1.6575e-01],
         [-6.3496e-01,  1.9748e-01,  2.5116e-01,  ..., -4.0819e-02,
           1.3468e+00, -6.9357e-01],
         [ 2.8197e-01, -1.0037e+00,  3.5063e-01,  ...,  8.5378e-01,
           3.9389e-01, -8.4527e-02],
         [-7.3509e-01,  3.3429e-01, -8.3037e-01,  ..., -2.1545e-01,
          -6.6517e-02, -2.6881e-02],
         [-3.2507e-01, -3.1879e-01, -1.1632e-01,  ..., -3.9602e-01,
           4.1120e-01, -7.7552e-02]]], grad_fn=<NativeLayerNormBackward0>)