In [1]:
# BERT — or Bidirectional Encoder Representations from Transformers — is a hugely popular transformer model used for almost everything in NLP.
# Through 12 (or so) encoder layers, BERT encodes a huge amount of information into a set of dense vectors.
# Each dense vector typically contains 768 values — and we usually have 512 of these vectors for each sentence encoded by BERT.
# These vectors contain what we can view as numerical representations of language. We can also extract those vectors — 
# from different layers if wanted — but typically from the final layer.
# Now, with two correctly encoded dense vectors, we can use a similarity metric like Cosine similarity to calculate their semantic similarity.
# Vectors that are more aligned are more semantically alike, and vise-versa.

# But there’s one problem, each sequence is represented by 512 vectors — not one vector.
# So, this is where another — brilliant — adaption of BERT comes into play. Sentence-BERT allows us to create a single vector that represents our
# full sequence, otherwise known as a sentence vector [2].
# We have two ways of implementing SBERT — the easy way using the sentence-tranformers library, or the slightly less easy way using transformers and PyTorch.
# We’ll cover both, starting with the transformers with PyTorch approach so that we can get an intuition for how these vectors are built.
# If you’ve used the HF transformers library, the first few steps will look very familiar. 
# We initialize our SBERT model and tokenizer, tokenize our text, and process our tokens through the model.

In [5]:
# Lets use the not so easy way (transformers and PyTorch)
from transformers import AutoTokenizer, AutoModel
import torch
a = "purple is the best city in the forest"
b = "there is an art to getting your way and throwing bananas on to the street is not it"  # this is very similar to 'g'
c = "it is not often you find soggy bananas on the street"
d = "green should have smelled more tranquil but somehow it just tasted rotten"
e = "joyce enjoyed eating pancakes with ketchup"
f = "as the asteroid hurtled toward earth becky was upset her dentist appointment had been canceled"
g = "to get your way you must not bombard the road with yellow fruit"  # this is very similar to 'b'

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
model = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')

# Tokenize all of our sentences.
tokens = tokenizer([a, b, c, d, e, f, g],
                          max_length = 128,
                          truncation = True,
                          padding = 'max_length',
                          return_tensors = 'pt')
tokens.keys()
tokens['input_ids'][0]

tensor([ 101, 6379, 2003, 1996, 2190, 2103, 1999, 1996, 3224,  102,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0])

In [7]:
# Process our tokenized tensors through the model.
outputs = model(**tokens)
print(outputs.keys())

embeddings = outputs.last_hidden_state
print(embeddings[0])
print(embeddings[0].shape)

odict_keys(['last_hidden_state', 'pooler_output'])
tensor([[-0.6239, -0.2058,  0.0411,  ...,  0.1490,  0.5681,  0.2381],
        [-0.3694, -0.1485,  0.3780,  ...,  0.4204,  0.5553,  0.1441],
        [-0.7221, -0.3813,  0.2031,  ...,  0.0761,  0.5162,  0.2813],
        ...,
        [-0.1894, -0.3711,  0.3034,  ...,  0.1536,  0.3265,  0.1376],
        [-0.2496, -0.5227,  0.2341,  ...,  0.3419,  0.3164,  0.0256],
        [-0.3311, -0.4430,  0.3492,  ...,  0.3655,  0.2910,  0.0728]],
       grad_fn=<SelectBackward0>)
torch.Size([128, 768])


In [12]:
# We have our vectors of length 768 — but these are not sentence vectors as we have a vector representation for each token in our sequence
# (128 here as we are using SBERT — for BERT-base this is 512). We need to perform a mean pooling operation to create the sentence vector.

# The first thing we do is multiply each value in our embeddings tensor by its respective attention_mask value.
# The attention_mask contains ones where we have ‘real tokens’ (eg not padding tokens), and zeros elsewhere — 
# this operation allows us to ignore non-real tokens.

mask = tokens['attention_mask'].unsqueeze(-1).expand(embeddings.size()).float()
print(mask.shape)
print(mask[0])

# Now we have a masking array that has an equal shape to our output embeddings - we multiply those together to apply the masking operation on our outputs.
masked_embeddings = embeddings * mask
print(masked_embeddings[0])

# Sum the remaining embeddings along axis 1 to get a total value in each of our 768 values.
summed = torch.sum(masked_embeddings, 1)
print(summed.shape)

# Next, we count the number of values that should be given attention in each position of the tensor (+1 for real tokens, +0 for non-real).
counted = torch.clamp(mask.sum(1), min=1e-9)
print(counted.shape)

# Finally, we get our mean-pooled values as the summed embeddings divided by the number of values that should be given attention, counted.
mean_pooled = summed / counted
print(mean_pooled.shape)

torch.Size([7, 128, 768])
tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
tensor([[-0.6239, -0.2058,  0.0411,  ...,  0.1490,  0.5681,  0.2381],
        [-0.3694, -0.1485,  0.3780,  ...,  0.4204,  0.5553,  0.1441],
        [-0.7221, -0.3813,  0.2031,  ...,  0.0761,  0.5162,  0.2813],
        ...,
        [-0.0000, -0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0000, -0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0000, -0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       grad_fn=<SelectBackward0>)
torch.Size([7, 768])
torch.Size([7, 768])
torch.Size([7, 768])


In [13]:
# Now these are our sentence vectors, using those we can measure similarity by calculating the cosine similarity between each.
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [14]:
# convert to numpy array from torch tensor
mean_pooled = mean_pooled.detach().numpy()

# calculate similarities (will store in array)
scores = np.zeros((mean_pooled.shape[0], mean_pooled.shape[0]))
for i in range(mean_pooled.shape[0]):
    scores[i, :] = cosine_similarity(
        [mean_pooled[i]],
        mean_pooled
    )[0]

print(scores)

# Now, think back to the earlier note about sentences b and g having essentially identical meaning whilst not sharing any of the same keywords.
# We’d hope SBERT and its superior semantic representations of language to identify these two sentences as similar — and 
# lo-and-behold the similarity between both is our second-highest score at 0.66 (circled above)

[[ 1.00000024  0.1869276   0.28297707  0.29628253  0.27451017  0.1017627
   0.21696275]
 [ 0.1869276   1.00000024  0.72058785  0.51428944  0.11749659  0.1930695
   0.66182357]
 [ 0.28297707  0.72058785  1.00000012  0.4886443   0.23568958  0.17157143
   0.5599308 ]
 [ 0.29628253  0.51428944  0.4886443   0.99999976  0.26985505  0.37889433
   0.52388823]
 [ 0.27451015  0.1174966   0.23568957  0.26985502  0.99999988  0.23422134
  -0.01599768]
 [ 0.10176268  0.19306949  0.1715714   0.37889433  0.23422134  1.
   0.22319689]
 [ 0.21696277  0.66182357  0.5599308   0.52388823 -0.0159977   0.22319691
   0.99999994]]


In [15]:
# Fortunately there is a much easier way to do all these, Which is just to use sentence-transformers directly

from sentence_transformers import SentenceTransformer
model = SentenceTransformer('bert-base-nli-mean-tokens')

# We encode the sentences (producing our mean-pooled sentence embeddings) like so:
sentence_embeddings = model.encode([a, b, c, d, e, f, g])

# And calculate the cosine similarity just like before.
scores = np.zeros((sentence_embeddings.shape[0], sentence_embeddings.shape[0]))
for i in range(sentence_embeddings.shape[0]):
    scores[i, :] = cosine_similarity(
        [sentence_embeddings[i]],
        sentence_embeddings
    )[0]
print(scores)

[[ 1.          0.18691427  0.28292829  0.2962288   0.27452093  0.10171202
   0.21698235]
 [ 0.18691427  0.99999994  0.72063208  0.5142017   0.11751355  0.19312078
   0.66177475]
 [ 0.28292829  0.72063208  0.99999988  0.48864788  0.23571709  0.1716553
   0.55989563]
 [ 0.2962288   0.5142017   0.48864788  1.          0.26980412  0.37895399
   0.52387094]
 [ 0.27452087  0.11751357  0.2357171   0.26980412  0.99999976  0.23412059
  -0.01596567]
 [ 0.101712    0.19312075  0.1716553   0.3789539   0.23412059  0.99999976
   0.22327027]
 [ 0.21698233  0.66177469  0.55989563  0.52387094 -0.01596566  0.2232703
   0.99999994]]
