# Using SBERT models without installing sentence-transformers

This is a brief guide on how to run pretrained sentence-transformers (SBERT) models without installing the sentence-transformers package.

However, this still requires the `transformers` library (and PyTorch) to run :(

In [1]:
!pip install -U transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

Mean pooling: take the attention mask into account so that masked-out (padding) tokens do not contribute to the average.

In [3]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over dimension 1, weighted by the attention mask.

    Args:
        model_output: Indexable model output; element 0 is used as the token
            embeddings (presumably shaped (batch, seq_len, hidden) - confirm
            against the model in use).
        attention_mask: Mask with one dimension fewer than the token
            embeddings; it is expanded along a new trailing axis to match them.

    Returns:
        Tensor holding, per row, the masked sum of the token embeddings divided
        by the masked token count.
    """
    hidden_states = model_output[0]

    # Broadcast the mask to the embedding shape so it can weight every feature.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()

    masked_sum = (hidden_states * mask).sum(dim = 1)
    # Clamp the denominator so a fully masked row does not divide by zero.
    token_count = mask.sum(dim = 1).clamp(min = 1e-9)

    return masked_sum / token_count

In [4]:
# Checkpoint identifier, defined once so the tokenizer and the model are
# always loaded from the same pretrained checkpoint.
MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

In [5]:
# Reference sentence that every candidate is compared against.
target_sentence = "I am very happy"

# Candidate sentences to score against the reference sentence.
sentences = ["I am totally happy today", "I am sad"]

Get the sentence embedding - this takes more code than the original sentence-transformers API.

In [6]:
def get_sentence_embedding(sentence):
    """Encode text into an L2-normalized, mean-pooled embedding.

    Uses the notebook-level `tokenizer` and `model`.

    Args:
        sentence: Text passed straight to the tokenizer (this notebook calls
            it with a single string).

    Returns:
        Tensor of mean-pooled embeddings, L2-normalized along dimension 1.
    """
    batch = tokenizer(sentence, padding = True, truncation = True, return_tensors = 'pt')

    # Gradient tracking is disabled for the forward pass.
    with torch.no_grad():
        outputs = model(**batch)

    pooled = mean_pooling(outputs, batch['attention_mask'])
    return F.normalize(pooled, p = 2, dim = 1)

In [7]:
# Embed the reference sentence once.
target_embedding = get_sentence_embedding(target_sentence)

# One embedding per entry of `sentences`, in the same order. Each sentence is
# encoded on its own, exactly as the reference sentence is.
sentence_embeddings = [get_sentence_embedding(sentence) for sentence in sentences]

Compute the cosine similarity score between the target sentence and each candidate sentence.

In [8]:
# Score every candidate against the reference embedding and report the result.
for candidate, candidate_embedding in zip(sentences, sentence_embeddings):
    score = F.cosine_similarity(target_embedding, candidate_embedding).numpy()
    # The score comes back as an array; print its first element.
    print(f'Similarity score between {target_sentence} and {candidate} is\n', score[0])

Similarity score between I am very happy and I am totally happy today is
 0.7668243
Similarity score between I am very happy and I am sad is
 0.41084984
