In [40]:
from fastembed import TextEmbedding

embedder = TextEmbedding("jinaai/jina-embeddings-v2-small-en")
query = "I just discovered the course. Can I join now?"
embedding_generator = embedder.embed([query])
embedding = list(embedding_generator)[0]  

print(embedding.shape)  
print(embedding.min())  

(512,)
-0.11726373885183883


In [41]:
import numpy as np

query1 = "I just discovered the course. Can I join now?"
query2 = "Is it possible for me to enroll in the class?"

embedding1 = list(embedder.embed([query1]))[0]
embedding2 = list(embedder.embed([query2]))[0]


print("Norm query1:", np.linalg.norm(embedding1))
print("Norm query2:", np.linalg.norm(embedding2))


similarity = embedding1.dot(embedding2)
print("Cosine similarity:", similarity)


Norm query1: 1.0
Norm query2: 0.9999999999999999
Cosine similarity: 0.8580827139966509


In [42]:
query = "I just discovered the course. Can I join now?"
doc = "Can I still join the course after the start date?"

embedding_query = list(embedder.embed([query]))[0]
embedding_doc = list(embedder.embed([doc]))[0]

similarity = embedding_query.dot(embedding_doc)
print("Cosine similarity:", similarity)


Cosine similarity: 0.9008528895674548


In [34]:
# Document texts
documents = [
    "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
    "Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.",
    "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00...",
    "You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK...",
    "Star the repo! Share it with friends if you find it useful ❣️\nCreate a PR if you see you can improve the text or the structure of the repository."
]

# Embed query and docs
q = list(embedder.embed([query]))[0]
V = np.vstack(list(embedder.embed(documents)))

# Compute cosine similarities
similarities = V.dot(q)
most_similar_index = int(np.argmax(similarities))

print("Most similar document index:", most_similar_index)


Most similar document index: 1


In [35]:
documents = [
    {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks...",
     'question': 'Course - Can I still join the course after the start date?'},
    {'text': 'Yes, we will keep all the materials after the course finishes...',
     'question': 'Course - Can I follow the course after it finishes?'},
    {'text': 'The purpose of this document is to capture frequently asked technical questions...',
     'question': 'Course - When will the course start?'},
    {'text': 'You can start by installing and setting up all the dependencies and requirements...',
     'question': 'Course - What can I do before the course starts?'},
    {'text': 'Star the repo! Share it with friends if you find it useful ❣️...',
     'question': 'How can we contribute to the course?'}
]


full_texts = [doc['question'] + ' ' + doc['text'] for doc in documents]


V = np.vstack(list(embedder.embed(full_texts)))


similarities = V.dot(q)
best_index = int(np.argmax(similarities))

print("Best document index:", best_index)


Best document index: 0


In [36]:
embedder = TextEmbedding("BAAI/bge-small-en")
embedding = list(embedder.embed(["example sentence"]))[0]

print("Embedding dimension:", len(embedding))  


Embedding dimension: 384


In [37]:
import qdrant_client
from qdrant_client.models import Distance, VectorParams, PointStruct
import requests


docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
documents_raw = requests.get(docs_url).json()


documents = []
for course in documents_raw:
    if course['course'] != 'machine-learning-zoomcamp':
        continue
    for doc in course['documents']:
        doc['course'] = course['course']
        documents.append(doc)


embedder = TextEmbedding("BAAI/bge-small-en")


points = []
for idx, doc in enumerate(documents):
    full_text = doc['question'] + ' ' + doc['text']
    vector = list(embedder.embed([full_text]))[0]
    points.append(PointStruct(id=idx, vector=vector, payload=doc))


client = qdrant_client.QdrantClient(":memory:")  # or use local/server
client.recreate_collection(
    collection_name="ml_zoomcamp_faq",
    vectors_config=VectorParams(size=384, distance=Distance.COSINE),
)
client.upsert(collection_name="ml_zoomcamp_faq", points=points)


query = "I just discovered the course. Can I join now?"
query_vector = list(embedder.embed([query]))[0]

search_result = client.search(
    collection_name="ml_zoomcamp_faq",
    query_vector=query_vector,
    limit=1  
)

print("Top result score:", search_result[0].score)


Top result score: 0.8703173398971558


  client.recreate_collection(
  search_result = client.search(
