In [None]:
!pip install fastembed

In [3]:
from fastembed import TextEmbedding
import numpy as np 

In [4]:
model = TextEmbedding(model_name='jinaai/jina-embeddings-v2-small-en')

Fetching 5 files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:12<00:00,  2.53s/it]


In [5]:
query = 'I just discovered the course. Can I join now?'

In [12]:
embedding = list(model.embed([query]))[0]

In [32]:
embedding_array = np.array(embedding)
embedding_array

array([-7.63946260e-02, -7.30555505e-02,  5.86501763e-02,  3.92670571e-02,
       -1.41797115e-02, -4.68342854e-02,  2.69862822e-02,  3.49039060e-02,
        1.53419217e-03, -1.10407363e-02,  5.57556285e-02, -2.88360142e-02,
       -4.87239498e-02, -9.90923923e-02,  6.02579764e-02, -1.86105379e-02,
       -8.81040139e-03, -3.01994831e-02, -1.42541815e-02, -3.87884212e-02,
       -2.76125662e-02, -6.94957533e-03,  3.20653319e-02, -5.17838394e-03,
        8.37959894e-02, -8.87329606e-02, -7.30260625e-02,  5.92846802e-02,
        4.07805674e-02,  7.72684822e-02, -4.60029697e-02,  3.98426895e-02,
        1.74821510e-02,  8.70263712e-03, -3.09291054e-02,  2.20470391e-02,
        4.76479896e-02,  1.76566196e-02, -3.62013627e-02, -3.53408062e-02,
       -5.92537995e-03,  2.97091202e-02,  8.05883752e-02,  1.07132722e-02,
       -5.38192519e-02, -9.62881111e-03, -1.17263739e-01,  3.36546374e-02,
        7.05763168e-03,  7.70159791e-03, -5.17482529e-02,  5.46955574e-02,
       -7.80289904e-02,  

In [30]:
array_size = embedding_array.size
print(f'The size of array: {array_size}')

The size of array: 512


In [28]:
min_val = embedding_array.min()
print(f'The Minimal value in this array is : {round(min_val, 3)}')

The Minimal value in this array is : -0.117


In [35]:
norm_len = np.linalg.norm(embedding_array)
print(f'Normalized vector length {norm_len}')

Normalized vector length 1.0


In [36]:
embedding_array.dot(embedding_array)

np.float64(1.0000000000000002)

In [37]:
doc = 'Can I still join the course after the start date?'

In [42]:
doc_embedding = list(model.embed([doc]))[0]
doc_embedding_np = np.array(doc_embedding)

In [47]:
cosine_similarity = embedding_array.dot(doc_embedding_np)
print(f'The cosine similarity between the vector for the query and the vector for the document: {round(cosine_similarity, 2)}')

The cosine similarity between the vector for the query and the vector for the document: 0.9


In [89]:
documents = [
    {
        'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
        'section': 'General course-related questions',
        'question': 'Course - Can I still join the course after the start date?',
        'course': 'data-engineering-zoomcamp'
    },
    {
        'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
        'section': 'General course-related questions',
        'question': 'Course - Can I follow the course after it finishes?',
        'course': 'data-engineering-zoomcamp'
    },
    {
        'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
        'section': 'General course-related questions',
        'question': 'Course - When will the course start?',
        'course': 'data-engineering-zoomcamp'
    },
    {
        'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
        'section': 'General course-related questions',
        'question': 'Course - What can I do before the course starts?',
        'course': 'data-engineering-zoomcamp'
    },
    {
        'text': 'Star the repo! Share it with friends if you find it useful ❣️\nCreate a PR if you see you can improve the text or the structure of the repository.',
        'section': 'General course-related questions',
        'question': 'How can we contribute to the course?',
        'course': 'data-engineering-zoomcamp'
    }
]

In [86]:
from fastembed.embedding import TextEmbedding
import numpy as np
from sklearn.preprocessing import normalize

# Initialize embedding model
model = TextEmbedding(model_name="jinaai/jina-embeddings-v2-small-en")

# Embed the query
query = "I just discovered the course. Can I join now?"
query_embedding = list(model.embed([query]))[0]

# Embed all documents
doc_texts = [doc['text'] for doc in documents]
doc_embeddings = list(model.embed(doc_texts))

# Convert to numpy arrays
query_vec = np.array(query_embedding).reshape(1, -1)  # shape (1, 512)
doc_matrix = np.array(doc_embeddings)                # shape (5, 512)

# Normalize for cosine similarity
query_norm = normalize(query_vec)
doc_matrix_norm = normalize(doc_matrix)

# Compute cosine similarities
cosine_similarities = np.dot(doc_matrix_norm, query_norm.T).flatten()

# Find the index with the highest similarity
best_match_index = int(np.argmax(cosine_similarities))
print(f"Best matching document index: {best_match_index}")


Best matching document index: 1


In [107]:
doc_fulltext = [doc['question'] + ' ' + doc['text'] for doc in documents]
doc_fulltext_embedding = list(model.embed(doc_fulltext))

doc_fulltext_array = np.array(doc_fulltext_embedding)

doc_fulltext_norm = normalize(doc_fulltext_array)

cosine_similarity_fulltext = np.dot(doc_fulltext_norm, query_norm.T).flatten()
cosine_similarity_fulltext

array([0.85145432, 0.84365942, 0.8408287 , 0.7755158 , 0.80860078])

In [110]:
best_match_index_fulltext = int(np.argmax(cosine_similarity_fulltext))
print(f"Best matching document index: {best_match_index_fulltext}")

Best matching document index: 0


In [112]:
from fastembed import TextEmbedding
import numpy as np 

dims = []
for model in TextEmbedding.list_supported_models():
    dims.append(model['dim'])
print(np.array(dims).min())

384


In [115]:
from fastembed.embedding import TextEmbedding
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance, PointStruct
import requests

# Step 1: Load documents
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []
for course in documents_raw:
    if course['course'] == 'machine-learning-zoomcamp':
        for doc in course['documents']:
            doc['course'] = course['course']
            documents.append(doc)

# Step 2: Embedding model (384 dim)
model = TextEmbedding(model_name="BAAI/bge-small-en-v1.5")

# Step 3: Qdrant client
qdrant = QdrantClient(":memory:")  # Or Qdrant running locally

# Step 4: Create collection
qdrant.recreate_collection(
    collection_name="ml-zoomcamp-faq",
    vectors_config=VectorParams(size=384, distance=Distance.COSINE)
)

# Step 5: Prepare and insert data
points = []
for i, doc in enumerate(documents):
    text = doc['question'] + " " + doc['text']
    embedding = list(model.embed([text]))[0]
    points.append(PointStruct(id=i, vector=embedding, payload={"text": text}))

qdrant.upsert(collection_name="ml-zoomcamp-faq", points=points)

# Step 6: Query
query = "I just discovered the course. Can I join now?"
query_vector = list(model.embed([query]))[0]

# Step 7: Search
results = qdrant.search(
    collection_name="ml-zoomcamp-faq",
    query_vector=query_vector,
    limit=1
)

# Step 8: Get highest score
print(f"Highest score: {results[0].score}")


  qdrant.recreate_collection(


Highest score: 0.7397804856300354


  results = qdrant.search(
