In [None]:
%%capture
# Install sentence-transformers into the running kernel's environment (%pip, not !pip);
# the %%capture cell magic on the first line silences pip's output for this cell.
%pip install sentence_transformers

In [2]:
from sentence_transformers import SentenceTransformer, util
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Example sentences that make up the searchable corpus.
corpus = [
    "A man is drinking water.",
    "A man is drinking a cup of tea.",
    "The woman is carrying a baby.",
    "A man is riding a cycle.",
    "A girl is playing violin.",
    "Two men are pushing a car in the parking lot.",
    "A man is riding a black horse on an enclosed ground.",
    "A monkey is eating a banana.",
    "A tiger is climbing a tree.",
]

# Sentence-embedding model, loaded by name; it is reused for both the corpus and the queries.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

modules.json: 100%|██████████| 349/349 [00:00<00:00, 3.01MB/s]
config_sentence_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 144kB/s]
README.md: 100%|██████████| 10.7k/10.7k [00:00<00:00, 7.60MB/s]
sentence_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 90.4kB/s]
config.json: 100%|██████████| 612/612 [00:00<00:00, 1.00MB/s]
pytorch_model.bin: 100%|██████████| 90.9M/90.9M [00:15<00:00, 5.98MB/s]
tokenizer_config.json: 100%|██████████| 350/350 [00:00<00:00, 604kB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 710kB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 1.52MB/s]
special_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 194kB/s]
1_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 268kB/s]


In [4]:
# Embed the whole corpus once up front so that every query can be scored
# against the same precomputed tensor instead of re-encoding the corpus.
corpus_embeddings = embedder.encode(
    corpus,
    convert_to_tensor=True,
)

In [6]:
# Query sentences to look up in the corpus.
queries = [
    "A man is drinking coffee.",
    "Someone in a gorilla costume is eating a banana",
    "A tiger is near a tree to hunt a prey.",
]

# Report at most 5 matches per query; the cap shrinks automatically when the
# corpus holds fewer than 5 sentences, so torch.topk never asks for more
# entries than exist.
top_k = min(5, len(corpus))

for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # cos_sim yields a score matrix (one row per query embedding); we passed a
    # single query, so row 0 holds its similarity to every corpus sentence.
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    print("\n\n======================\n\n")
    print("Query:", query)
    # Use the actual top_k in the header so the label stays correct for small corpora.
    print(f"\nTop {top_k} most similar sentences in corpus:")

    # Convert to plain Python floats/ints before indexing the list and formatting,
    # rather than relying on implicit conversion of 0-dim tensors.
    best_scores = top_results.values.tolist()
    best_indices = top_results.indices.tolist()
    for score, idx in zip(best_scores, best_indices):
        print(corpus[idx], "(Score: {:.4f})".format(score))





Query: A man is drinking coffee.

Top 5 most similar sentences in corpus:
A man is drinking a cup of tea. (Score: 0.6578)
A man is drinking water. (Score: 0.5808)
A man is riding a cycle. (Score: 0.1873)
A tiger is climbing a tree. (Score: 0.1672)
A man is riding a black horse on an enclosed ground. (Score: 0.1299)




Query: Someone in a gorilla costume is eating a banana

Top 5 most similar sentences in corpus:
A monkey is eating a banana. (Score: 0.7489)
A tiger is climbing a tree. (Score: 0.2181)
A man is drinking water. (Score: 0.1145)
A man is drinking a cup of tea. (Score: 0.1144)
A man is riding a black horse on an enclosed ground. (Score: 0.0729)




Query: A tiger is near a tree to hunt a prey.

Top 5 most similar sentences in corpus:
A tiger is climbing a tree. (Score: 0.7822)
A monkey is eating a banana. (Score: 0.2203)
A man is riding a black horse on an enclosed ground. (Score: 0.0832)
A girl is playing violin. (Score: 0.0776)
The woman is carrying a baby. (Score: 0.0