In [1]:
import os
import tqdm
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the tokenizer and the encoder from the same checkpoint so the vocabulary
# and the weights are guaranteed to match.
MODEL_NAME = "sentence-transformers/bert-base-nli-mean-tokens"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

In [3]:
# Small demo corpus of four sentences; every later cell embeds and compares these.
sentences = [
    "Three years later, the coffin was still full of Jello.",
    "The fish dreamed of escaping the fishbowl and into the toilet where he saw his friend go.",
    "The person box was packed with jelly many dozens of months later.",
    "He found a leprechaun in his walnut shell.",
]


In [6]:
# Preview a batched encoding (padded to the longest sentence in the batch).
# Calling the tokenizer directly replaces the deprecated `batch_encode_plus`.
tokenizer(sentences, return_tensors="pt", padding=True, truncation=True, max_length=128)

{'input_ids': tensor([[  101,  2093,  2086,  2101,  1010,  1996, 13123,  2001,  2145,  2440,
          1997, 15333,  7174,  1012,   102,     0,     0,     0,     0,     0,
             0,     0],
        [  101,  1996,  3869, 13830,  1997, 13002,  1996,  3869, 18912,  2140,
          1998,  2046,  1996, 11848,  2073,  2002,  2387,  2010,  2767,  2175,
          1012,   102],
        [  101,  1996,  2711,  3482,  2001,  8966,  2007, 20919,  2116,  9877,
          1997,  2706,  2101,  1012,   102,     0,     0,     0,     0,     0,
             0,     0],
        [  101,  2002,  2179,  1037,  3393, 28139,  7507,  4609,  1999,  2010,
         18489,  5806,  1012,   102,     0,     0,     0,     0,     0,     0,
             0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0

In [5]:
# Encode each sentence on its own, padded/truncated to a fixed 128 positions so
# the per-sentence tensors all share one length and can be stacked afterwards.
tokens = {"input_ids": [], "attention_mask": []}
for sentence in sentences:
    # Calling the tokenizer directly replaces the deprecated `encode_plus`.
    toks = tokenizer(sentence, max_length=128, truncation=True, padding="max_length", return_tensors="pt")
    # Index [0] drops the leading batch axis so each list entry is one 1-D row.
    tokens['input_ids'].append(toks['input_ids'][0])
    tokens['attention_mask'].append(toks['attention_mask'][0])


In [6]:
# Stack the per-sentence rows into one batch tensor per field.
# The is_tensor guard makes the cell safe to re-run: calling torch.stack on an
# already-stacked tensor (instead of a list of tensors) would raise.
tokens = {
    field: rows if torch.is_tensor(rows) else torch.stack(rows)
    for field, rows in tokens.items()
}

In [7]:
# Inference only: disable autograd so no computation graph is built or kept
# alive for the activations.
with torch.no_grad():
    outputs = model(**tokens)

In [8]:
# List the fields available on the model output object.
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [9]:
# Token-level hidden states from the final layer; these get pooled into one
# vector per sentence in the following cells.
embeddings = outputs["last_hidden_state"]
embeddings.shape

torch.Size([4, 128, 768])

In [10]:
# Mean pooling, step 1: pull out the attention mask that will be used to weight
# the token embeddings when building a single vector per sentence.
attention_mask = tokens["attention_mask"]
attention_mask.shape

torch.Size([4, 128])

In [12]:
# Broadcast the mask over the embedding axis so it lines up element-wise with
# `embeddings`: [sentences, tokens] -> [sentences, tokens, embedding_dim].
mask = attention_mask[..., None].expand_as(embeddings).float()

In [13]:
# Sanity check: the expanded mask should have the same shape as `embeddings`.
mask.shape

torch.Size([4, 128, 768])

In [17]:
# Zero out every embedding whose mask entry is 0 so it cannot contribute to the sum.
masked_embeddings = torch.mul(embeddings, mask)
masked_embeddings.shape

torch.Size([4, 128, 768])

In [18]:
# Collapse the token axis (dim 1): one summed vector per sentence.
summed = masked_embeddings.sum(dim=1)
summed.shape

torch.Size([4, 768])

In [20]:
# Per-sentence count of unmasked positions (the denominator of the mean).
# The clamp keeps the later division safe if a row's mask sums to zero.
summed_mask = mask.sum(dim=1).clamp(min=1e-9)
summed_mask.shape

torch.Size([4, 768])

In [21]:
# Masked mean: summed embeddings divided by the number of contributing positions.
mean_pooled = torch.div(summed, summed_mask)

In [22]:
# One pooled vector per sentence.
mean_pooled.shape

torch.Size([4, 768])

In [24]:
# Convert to NumPy for scikit-learn. `mean_pooled` is rebound from a tensor to
# an ndarray here, so guard the conversion: without it, re-running this cell
# fails because an ndarray has no `.detach()`. `.cpu()` is a no-op on CPU and
# keeps the conversion valid if the tensor lives on another device.
if torch.is_tensor(mean_pooled):
    mean_pooled = mean_pooled.detach().cpu().numpy()

# Cosine similarity of the first sentence against each of the remaining ones.
cosine_similarity(mean_pooled[:1], mean_pooled[1:])

array([[0.33088914, 0.7219258 , 0.5548363 ]], dtype=float32)

In [25]:
import umap

In [28]:
# Project the sentence vectors to 2-D. UMAP's optimisation is stochastic, so
# pin the seed to make the coordinates reproducible from run to run.
umap_embeddings = umap.UMAP(
    n_neighbors=2,
    n_components=2,
    metric='cosine',
    random_state=42,
).fit_transform(mean_pooled)

In [31]:
# Check the shape of the UMAP projection.
umap_embeddings.shape

(4, 2)

In [32]:
# Display the projected coordinates.
umap_embeddings

array([[ 1.3746895, -8.148749 ],
       [ 1.4944549, -7.2197933],
       [ 0.6015909, -5.8285623],
       [ 0.5158273, -7.859742 ]], dtype=float32)