In [2]:
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm as notebook_tqdm

In [3]:
# Checkpoint identifier of the sentence-embedding model used throughout the notebook.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)

In [4]:
# Toy corpus: three short sentences on unrelated topics, so that a semantic
# query has one clearly best match.
documents = [
    "Machine learning enables computers to learn from data.",  # 0: machine learning
    "Cats and dogs can be great pets.",                        # 1: pets
    "Quantum physics explores the behavior of particles.",     # 2: physics
]

In [5]:
# Encode the whole corpus in a single call. The result has one row per
# document; with this model each row has 384 components.
embeddings = model.encode(documents, show_progress_bar=True)
print(f"Embeddings shape: {embeddings.shape}")


Batches: 100%|██████████| 1/1 [00:00<00:00, 48.04it/s]

Embeddings shape: (3, 384)





In [13]:
# Preview each document next to the first 10 components of its embedding.
# NOTE: the printed label says "Top 10", but the slice takes the leading ten
# entries of the vector, not the ten largest values.
for doc_idx, text in enumerate(documents):
    leading_components = embeddings[doc_idx, :10]
    print(f"Document {doc_idx}: {text}")
    print("Top 10 dimensions of the embedding:", leading_components)
    print()

Document 0: Machine learning enables computers to learn from data.
Top 10 dimensions of the embedding: [-0.01599952  0.01208692  0.08066807  0.01379563  0.02841322 -0.0147575
 -0.03639276 -0.0995675   0.0190883   0.00763975]

Document 1: Cats and dogs can be great pets.
Top 10 dimensions of the embedding: [ 4.72100601e-02  3.04640089e-05  7.89149627e-02  3.25913467e-02
 -1.09277025e-01  1.97223872e-02 -1.91940013e-02 -7.74343833e-02
 -1.89232938e-02  2.05149129e-02]

Document 2: Quantum physics explores the behavior of particles.
Top 10 dimensions of the embedding: [-0.06063841 -0.04609358 -0.01987925  0.13131666 -0.03221935  0.05527749
 -0.01502854 -0.03053921  0.02734886  0.04802484]



In [7]:
!pip install faiss-cpu --quiet
import numpy as np
import faiss

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [8]:
# Build a flat FAISS index over the document embeddings, compared by
# Euclidean (L2) distance.
dim = embeddings.shape[1]  # vector width is read from the data, not hard-coded
index = faiss.IndexFlatL2(dim)

# FAISS works on C-contiguous float32 matrices. Convert explicitly rather than
# with np.array(...), which always copies yet keeps whatever dtype it is given;
# ascontiguousarray is a no-op when the input already has the right layout.
index.add(np.ascontiguousarray(embeddings, dtype=np.float32))

# Sanity check: one stored vector per document.
assert index.ntotal == len(documents), "index size does not match the corpus size"

In [9]:
# Echo the index object so the cell output shows its repr
# (a SWIG proxy wrapping a faiss::IndexFlatL2 instance).
index

<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x70b41c075f20> >

In [10]:
# Semantic search example: embed a free-text query and look up its two
# closest documents in the index.
query = "favorite pets"
query_vec = model.encode([query])  # list input, so the query is encoded as a batch of one
D, I = index.search(query_vec, k=2)  # D: distances, I: positions in `documents`; one row per query

# Only one query was submitted, so take row 0 of each result matrix.
nearest_ids = I[0]
nearest_distances = D[0]
nearest_docs = [documents[doc_id] for doc_id in nearest_ids]

print("Nearest document indices:", nearest_ids)
print("Distances:", nearest_distances)
print("Nearest documents:", nearest_docs)

Nearest document indices: [1 2]
Distances: [0.7805519 1.8087312]
Nearest documents: ['Cats and dogs can be great pets.', 'Quantum physics explores the behavior of particles.']
