In [1]:
from qdrant_client import QdrantClient, models

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Client for the Qdrant instance at http://localhost:6333; later cells use this
# object for collection management, upserts and queries.
client = QdrantClient("http://localhost:6333")

In [3]:
import requests 

docs_url = "https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json"

# Fail loudly here instead of leaving `documents_raw` undefined: previously a
# non-200 response was silently ignored and the error only surfaced cells later
# as a NameError. The timeout keeps the cell from hanging on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

In [8]:
from fastembed import TextEmbedding
import numpy as np 

# Collect the output dimensionality of every model fastembed supports and
# print the smallest one.
dims = [spec['dim'] for spec in TextEmbedding.list_supported_models()]
print(np.min(dims))

384


In [5]:
import json 

# Vector size used for the collection; also used here to list candidate models.
EMBEDDING_DIMENSIONALITY = 512

# Keep only the supported models whose output size matches, then dump each
# model description as pretty-printed JSON.
matching_models = [
    spec
    for spec in TextEmbedding.list_supported_models()
    if spec["dim"] == EMBEDDING_DIMENSIONALITY
]
for spec in matching_models:
    print(json.dumps(spec, indent=2))

{
  "model": "BAAI/bge-small-zh-v1.5",
  "sources": {
    "hf": "Qdrant/bge-small-zh-v1.5",
    "url": "https://storage.googleapis.com/qdrant-fastembed/fast-bge-small-zh-v1.5.tar.gz",
    "_deprecated_tar_struct": true
  },
  "model_file": "model_optimized.onnx",
  "description": "Text embeddings, Unimodal (text), Chinese, 512 input tokens truncation, Prefixes for queries/documents: not so necessary, 2023 year.",
  "license": "mit",
  "size_in_GB": 0.09,
  "additional_files": [],
  "dim": 512,
  "tasks": {}
}
{
  "model": "Qdrant/clip-ViT-B-32-text",
  "sources": {
    "hf": "Qdrant/clip-ViT-B-32-text",
    "url": null,
    "_deprecated_tar_struct": false
  },
  "model_file": "model.onnx",
  "description": "Text embeddings, Multimodal (text&image), English, 77 input tokens truncation, Prefixes for queries/documents: not necessary, 2021 year",
  "license": "mit",
  "size_in_GB": 0.25,
  "additional_files": [],
  "dim": 512,
  "tasks": {}
}
{
  "model": "jinaai/jina-embeddings-v2-small-e

In [8]:
# Embedding model name passed to models.Document for both indexing and querying.
# It has to produce vectors of size EMBEDDING_DIMENSIONALITY, because that is the
# vector size the collection is created with.
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [10]:
collection_name = "zoomcamp-rag"

# Create the collection only on first run so the cell can be re-executed safely.
if not client.collection_exists(collection_name):
    vector_params = models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY,
        distance=models.Distance.COSINE,
    )
    client.create_collection(
        collection_name=collection_name,
        vectors_config=vector_params,
    )
else:
    print(f"collection {collection_name} already exists")

collection zoomcamp-rag already exists


In [11]:
# Build one point per FAQ document across all courses.
# The point id is the running position in `points` (0, 1, 2, ...), which gives
# the same sequential ids as a manual counter without binding a module-level
# variable named `id` that would shadow the builtin for the rest of the session.
points = []

for course in documents_raw:
    for doc in course['documents']:
        points.append(
            models.PointStruct(
                id=len(points),
                # The raw text plus model name is passed instead of a
                # precomputed vector.
                vector=models.Document(
                    text=doc['text'],
                    model=model_handle
                ),
                # Payload fields returned with search hits; "course" is also
                # the field used for filtered search.
                payload={
                    "text": doc['text'],
                    "section": doc['section'],
                    "course": course['course']
                }
            )
        )

In [12]:
# Inspect the first point to check the id / vector / payload structure.
points[0]

PointStruct(id=0, vector=Document(text="The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.", model='jinaai/jina-embeddings-v2-small-en', options=None), payload={'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register

In [11]:
# Write every prepared point into the collection in a single call. The vectors
# are models.Document objects rather than precomputed embeddings.
# NOTE(review): presumably the client embeds them before upload - confirm.
client.upsert(collection_name=collection_name, points=points)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [13]:
def search(query, limit=1):
    """Run a similarity query against the collection.

    Args:
        query: Text to search for; wrapped in a models.Document using
            `model_handle`, the same model name used for the stored points.
        limit: Maximum number of hits to request (default 1).

    Returns:
        The response object from `client.query_points`, with payloads included.
    """
    query_document = models.Document(text=query, model=model_handle)
    return client.query_points(
        collection_name=collection_name,
        query=query_document,
        limit=limit,
        with_payload=True,
    )

In [14]:
import random

# Pick one FAQ entry to use as a retrieval spot check. A locally seeded
# generator makes the pick reproducible under Restart & Run All without
# touching the global `random` state.
SAMPLE_SEED = 42
sample_rng = random.Random(SAMPLE_SEED)

course = sample_rng.choice(documents_raw)
course_piece = sample_rng.choice(course['documents'])
print(json.dumps(course_piece, indent=2))

{
  "text": "Since the version 1.29 the list_experiments method was deprecated and then removed in the later version\nYou should use search_experiments instead\nAdded by Alex Litvinov",
  "section": "Module 2: Experiment tracking",
  "question": "MlflowClient object has no attribute 'list_experiments'"
}


In [16]:
# Query with the sampled entry's question, using the default limit of 1.
result = search(course_piece['question'])

Fetching 5 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:04<00:00,  1.12it/s]


In [17]:
# Show the raw query response (ids, scores and payloads).
result

QueryResponse(points=[ScoredPoint(id=880, version=1, score=0.85721594, payload={'text': 'Problem: CLI commands (mlflow experiments list) do not return experiments\nSolution description: need to set environment variable for the Tracking URI:\n$ export MLFLOW_TRACKING_URI=http://127.0.0.1:5000\nAdded and Answered by Dino Vitale', 'section': 'Module 2: Experiment tracking', 'course': 'mlops-zoomcamp'}, vector=None, shard_key=None, order_value=None)])

In [18]:
# Show the question, the top retrieved text and the sampled entry's own answer
# together for a manual comparison.
# Guard against an empty result so the cell reports it instead of raising
# IndexError on `result.points[0]`.
retrieved_points = result.points
retrieved_text = (
    retrieved_points[0].payload['text'] if retrieved_points else "<no results returned>"
)

print(f"Question: \n{course_piece['question']}\n")
print(f"Top Retrieved Answer:\n{retrieved_text}\n")
print(f"Original Answer:\n{course_piece['text']}")

Question: 
MlflowClient object has no attribute 'list_experiments'

Top Retrieved Answer:
Problem: CLI commands (mlflow experiments list) do not return experiments
Solution description: need to set environment variable for the Tracking URI:
$ export MLFLOW_TRACKING_URI=http://127.0.0.1:5000
Added and Answered by Dino Vitale

Original Answer:
Since the version 1.29 the list_experiments method was deprecated and then removed in the later version
You should use search_experiments instead
Added by Alex Litvinov


In [19]:
# Index the "course" payload field as a keyword field; this is the field the
# filtered search matches on.
client.create_payload_index(collection_name=collection_name, field_name="course", field_schema="keyword")

UpdateResult(operation_id=3, status=<UpdateStatus.COMPLETED: 'completed'>)

In [20]:
def search_in_course(query, course="mlops-zoomcamp", limit=1):
    """Run a similarity query restricted to a single course.

    Args:
        query: Text to search for; wrapped in a models.Document using
            `model_handle`.
        course: Value the "course" payload field must match
            (default "mlops-zoomcamp").
        limit: Maximum number of hits to request (default 1).

    Returns:
        The response object from `client.query_points`, with payloads included.
    """
    # Only points whose "course" payload equals the requested course qualify.
    course_condition = models.FieldCondition(
        key="course",
        match=models.MatchValue(value=course),
    )
    query_document = models.Document(text=query, model=model_handle)

    return client.query_points(
        collection_name=collection_name,
        query=query_document,
        query_filter=models.Filter(must=[course_condition]),
        limit=limit,
        with_payload=True,
    )

In [21]:
# Example of a filtered query: search only the data-engineering course and
# print the text of the top hit.
late_homework_response = search_in_course(
    "What if I submit homework late?",
    "data-engineering-zoomcamp",
)
print(late_homework_response.points[0].payload['text'])

No, late submissions are not allowed. But if the form is still not closed and it’s after the due date, you can still submit the homework. confirm your submission by the date-timestamp on the Course page.y
Older news:[source1] [source2]
