## USE
https://radix.ai/blog/2021/3/a-guide-to-building-document-embeddings-part-2/

In [1]:
import numpy as np
import tensorflow_hub as hub
import tensorflow_text

my_document = [
    "This is my document.",
    "It consists of multiple sentences.",
    "This is the third and final sentence.",
]

use_m = hub.load(
    "https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3"
)
sentence_embs = use_m(my_document).numpy()
doc_emb = np.mean(sentence_embs, axis=0)

In [2]:
sentence_embs.shape

(3, 512)

In [3]:
sentence_embs

array([[ 0.01277125,  0.04824684,  0.02633651, ..., -0.00436524,
        -0.04923647,  0.01573452],
       [ 0.06279358, -0.04311891, -0.06096669, ...,  0.02568746,
        -0.0052941 , -0.04394786],
       [-0.00691685, -0.00399939, -0.00667829, ..., -0.01735331,
         0.05321863,  0.0097335 ]], dtype=float32)

## BERT
https://huggingface.co/sentence-transformers/all-mpnet-base-v2 

In [4]:
import numpy as np
from sentence_transformers import SentenceTransformer

my_document = [
    "This is my document.",
    "It consists of multiple sentences.",
    "This is the third and final sentence.",
]
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
sentence_embs = model.encode(my_document)



In [5]:
sentence_embs.shape

(3, 768)

In [6]:
sentence_embs

array([[-0.04731807,  0.014176  ,  0.00761583, ...,  0.00303538,
        -0.04867401, -0.03072098],
       [ 0.07334287, -0.03173891, -0.00699191, ..., -0.00666402,
        -0.02057767, -0.02304883],
       [ 0.11271803, -0.00349732,  0.00105874, ...,  0.02205897,
        -0.08708036, -0.04711103]], dtype=float32)

## RoBERT

In [7]:
from transformers import BertConfig, RobertaForMaskedLM, RobertaTokenizer

model_path = "pdelobelle/robbert-v2-dutch-base"
tokenizer = RobertaTokenizer.from_pretrained(model_path)
config = BertConfig.from_pretrained(model_path, output_hidden_states=True)

robbert = RobertaForMaskedLM.from_pretrained(model_path, config=config)
robbert.eval()

inputs = tokenizer(my_document, return_tensors="pt", padding=True)

# For every sentence, take the embedding of the first element [CLS] of the final layer. Explanation of the indices:
# 1 - The model outputs a tuple, the first element contains the output of the model, the second the hidden states.
# -1 - The hidden states of the last layer.
# : - All elements in the batch (the sentences).
# 0 - First element of every sentence (including start and end tokens) -> [CLS] token.
sentence_embs = robbert(**inputs)[1][-1][:, 0].detach().numpy()
doc_emb = np.mean(sentence_embs, axis=0)

You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


In [8]:
sentence_embs.shape

(3, 768)

In [9]:
sentence_embs

array([[-1.0459678 , -0.17232835, -0.6760791 , ..., -0.99047405,
         0.92589414, -1.3538346 ],
       [-0.48277706,  0.72309726,  0.12228713, ..., -1.1843195 ,
         0.15349264, -1.4755102 ],
       [-0.50207144,  0.08903795, -0.19226997, ..., -0.90981984,
         1.0100853 , -1.6406622 ]], dtype=float32)