In [189]:
from sentence_transformers import SentenceTransformer
import numpy as np
from numpy import dot
import chromadb

In [190]:
# Sample corpus: six short first-person notes (fitness, study, budgeting, ML)
# defined inline; these are the documents that get embedded and indexed.
SENTENCES: list[str] = [
    "I missed my workout yesterday but I'll hit the gym today.",
    "My goal is to get a HD in my finance exam.",
    "I need to make a better budget for myself.",
    "I spent $50 on coffee last month, need to budget better.",
    "ChatGPT retrieval-augmented generations improves accuracy.",
    "Running 5km in under 25 minutes is my next target.",
]

# Echo the corpus as the cell output.
SENTENCES

["I missed my workout yesterday but I'll hit the gym today.",
 'My goal is to get a HD in my finance exam.',
 'I need to make a better budget for myself.',
 'I spent $50 on coffee last month, need to budget better.',
 'ChatGPT retrieval-augmented generations improves accuracy.',
 'Running 5km in under 25 minutes is my next target.']

In [191]:
# Embed every sentence with the all-MiniLM-L6-v2 SentenceTransformer model.
# normalize_embeddings=True asks for unit-length rows, so a plain dot product
# between two rows can be read as their cosine similarity.
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(SENTENCES, normalize_embeddings=True)

# Display (n_sentences, embedding_dim) as the cell output.
embeddings.shape

(6, 384)

In [192]:
# Assert that every embedding is unit-length (up to float32 rounding error);
# this is what allows a dot product to stand in for cosine similarity.
norms = np.linalg.norm(embeddings, axis=1)
assert np.allclose(norms, 1.0, atol=1e-5), f"embeddings are not unit-length: {norms}"
print(norms)

[1.         1.         1.         1.         1.         0.99999994]


In [193]:
# Manual cosine similarity between sentence 1 (finance exam) and sentence 3
# (coffee budget). Dividing by the norms keeps this a true cosine even if the
# vectors are not exactly unit-length. Cosine ranges over [-1, 1]: values near
# 1 mean very similar, values near 0 mean unrelated, so a small positive score
# is only weak evidence of relatedness.
vec_a, vec_b = embeddings[1], embeddings[3]
cos = dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))
"cosine:", cos

('cosine:', np.float32(0.15977593))

In [194]:
def most_similar(idx, vectors=None):
    """Rank all other rows by dot-product similarity to row ``idx``.

    Parameters
    ----------
    idx : int
        Row index of the query vector. Negative indices count from the end.
    vectors : array-like of shape (n, d), optional
        Matrix of row vectors to compare. Defaults to the notebook-level
        ``embeddings`` array. The scores are cosine similarities only when
        the rows are unit-length.

    Returns
    -------
    list of (index, score) tuples
        Every row except the query itself, sorted from most to least similar.

    Raises
    ------
    IndexError
        If ``idx`` is out of range for ``vectors``.
    """
    if vectors is None:
        vectors = embeddings
    vectors = np.asarray(vectors)

    query = vectors[idx]  # out-of-range idx raises IndexError here
    sims = vectors @ query

    # Map a negative idx onto its real row position so the query row is
    # always filtered out of the ranking (comparing against -1 never matches).
    self_pos = idx % len(vectors)

    order = sims.argsort()[::-1]
    return [(i, sims[i]) for i in order if i != self_pos]

# Rank the other sentences against sentence 0 (the missed-workout sentence).
most_similar(0)

[(np.int64(5), np.float32(0.30851096)),
 (np.int64(3), np.float32(0.13859929)),
 (np.int64(2), np.float32(0.044685468)),
 (np.int64(1), np.float32(0.032101903)),
 (np.int64(4), np.float32(0.028044892))]

In [195]:
# Spin up an in-memory Chroma client and (re)build the "memory" collection.
# get_or_create_collection plus clearing any existing ids keeps this cell
# re-runnable: an unconditional delete_collection("memory") raises when the
# collection does not exist yet, e.g. on a fresh kernel.
client = chromadb.Client()
collection = client.get_or_create_collection("memory")

# Drop whatever an earlier run of this cell left behind so the collection
# holds exactly the current corpus.
stale_ids = collection.get()["ids"]
if stale_ids:
    collection.delete(ids=stale_ids)

# Unit-length document vectors, one per sentence.
embeddings = model.encode(SENTENCES, normalize_embeddings=True)

collection.add(
    documents=SENTENCES,
    embeddings=embeddings.tolist(),
    ids=[f"id{i}" for i in range(len(SENTENCES))],
)

In [199]:
# Query: embed the text the same way as the indexed documents (unit-length)
# and fetch the 5 nearest stored sentences.
query = "I hate coffee."
q_emb = model.encode([query], normalize_embeddings=True)
result = collection.query(query_embeddings=q_emb.tolist(), n_results=5)

# One list of documents per query embedding; show the hits for the only query.
result["documents"][0]

['I spent $50 on coffee last month, need to budget better.',
 'I need to make a better budget for myself.',
 "I missed my workout yesterday but I'll hit the gym today.",
 'Running 5km in under 25 minutes is my next target.',
 'ChatGPT retrieval-augmented generations improves accuracy.']

In [200]:
# Distances for the query, one inner list per query embedding (smaller = closer);
# presumably aligned with the order of result["documents"][0] — confirm.
# NOTE(review): presumably squared L2, Chroma's default metric — confirm.
result["distances"]

[[0.9683874845504761,
  1.6685576438903809,
  1.859717607498169,
  1.8615751266479492,
  1.9338407516479492]]

In [201]:
# Duplicate check: print the total number of stored documents next to the
# number of distinct ones; equal counts mean nothing was stored twice.
docs = collection.get()["documents"]
n_total = len(docs)
n_unique = len(set(docs))
print(n_total, n_unique)

6 6
