In [1]:
from qdrant_client import QdrantClient, models

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Client for the Qdrant server expected to be listening on localhost:6333; reused by every cell below.
qd_client = QdrantClient('http://localhost:6333')

In [3]:
# Show which collections currently exist on the server (also a quick connectivity check).
qd_client.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='zoomcamp-sparse'), CollectionDescription(name='zoomcamp-faq'), CollectionDescription(name='zoomcamp-rag')])

In [4]:
import requests

docs_url = "https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json"

# Use a timeout so a stalled connection cannot hang the kernel, and fail
# loudly on a non-2xx response: silently skipping the assignment would leave
# `documents_raw` undefined and surface later as a confusing NameError.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

In [5]:
collection_name = 'zoomcamp-sparse'

if qd_client.collection_exists(collection_name):
    print(f'Collection {collection_name} already exists')
else:
    # Declare a single named sparse vector ("bm25") with the IDF modifier.
    # The previous `index=SparseIndexParams(type="text", field="text")`
    # argument was removed: SparseIndexParams has no `type`/`field` options,
    # so this branch failed validation whenever the collection did not exist.
    qd_client.create_collection(
        collection_name=collection_name,
        sparse_vectors_config={
            "bm25": models.SparseVectorParams(
                modifier=models.Modifier.IDF,
            )
        },
    )

Collection zoomcamp-sparse already exists


In [6]:
import uuid

# Index every FAQ entry of every course under the "bm25" sparse vector.
# Point IDs are derived deterministically (uuid5) from the entry's content, so
# re-running this cell overwrites the same points instead of appending a
# duplicate copy of the whole corpus, which random uuid4 IDs would do.
qd_client.upsert(
    collection_name=collection_name,
    points=[
        models.PointStruct(
            id=uuid.uuid5(
                uuid.NAMESPACE_URL,
                "\n".join((
                    course['course'],
                    doc['section'],
                    doc.get('question', ''),
                    doc['text'],
                )),
            ).hex,
            vector={
                # The answer text is what gets tokenised and scored.
                "bm25": models.Document(
                    text=doc['text'],
                    model="Qdrant/bm25",
                ),
            },
            payload={
                'text': doc['text'],
                'section': doc['section'],
                'course': course['course'],
            },
        )
        for course in documents_raw
        for doc in course['documents']
    ],
)

Fetching 18 files: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 18/18 [00:00<00:00, 25.01it/s]


UpdateResult(operation_id=4, status=<UpdateStatus.COMPLETED: 'completed'>)

In [7]:
def search(query: str, limit: int = 1):
    """Query the sparse "bm25" vector of `collection_name` for `query`.

    Args:
        query: Free-text query; wrapped in a `models.Document` with the
            "Qdrant/bm25" model handle before being sent.
        limit: Maximum number of hits to request. Defaults to 1, which is the
            value that used to be hard-coded.

    Returns:
        The `points` attribute of the `query_points` response (payloads
        included).
    """
    model_handle = "Qdrant/bm25"
    results = qd_client.query_points(
        collection_name=collection_name,
        query=models.Document(
            text=query,
            model=model_handle,
        ),
        using="bm25",  # name of the sparse vector to search
        limit=limit,
        with_payload=True,
    )

    return results.points

In [8]:
# Run one sample question through `search`; `result` holds the returned list of points.
result = search('the course has already started? Can I still enroll?')

In [9]:
# Stored answer text of the first returned point.
result[0].payload['text']

"Solution:\nCheck if you’re on the Developer Plan. As per the prerequisites, you'll need to be enrolled in the Team Plan or Enterprise Plan to set up a CI Job in dbt Cloud.\nSo If you're on the Developer Plan, you'll need to upgrade to utilise CI Jobs.\nNote from another user: I’m in the Team Plan (trial period) but the option is still disabled. What worked for me instead was this. It works for the Developer (free) plan."

In [10]:
# Score attached to the first returned point.
result[0].score

15.356087

In [11]:
import random 
import json 

# Seed the global RNG so the same course/entry is sampled on every run.
random.seed(202506)

# Two draws, in this order: first a course, then one entry from its "documents" list.
course = random.choice(documents_raw)
course_piece = random.choice(course["documents"])
# Pretty-print the sampled entry.
print(json.dumps(course_piece, indent=2))

{
  "text": "Even though the upload works using aws cli and boto3 in Jupyter notebook.\nSolution set the AWS_PROFILE environment variable (the default profile is called default)",
  "section": "Module 4: Deployment",
  "question": "Uploading to s3 fails with An error occurred (InvalidAccessKeyId) when calling the PutObject operation: The AWS Access Key Id you provided does not exist in our records.\""
}
