In [20]:
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, Distance, VectorParams
from sentence_transformers import SentenceTransformer
from openai import OpenAI # We'll use this to talk to vLLM's OpenAI-compatible API
import uuid

vLLM and qdrant servers are running in background through docker
- vLLM is inference server
- qdrant is vector db

1Ô∏è‚É£ Connect to services


In [5]:
qdrant = QdrantClient(url="http://localhost:6333")  # your Qdrant Docker port
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

In [21]:
# Configure OpenAI to point to local vLLM server
client = OpenAI(
    base_url="http://localhost:8000/v1",  # vLLM API endpoint
    api_key="EMPTY"  # not required for local vLLM
)

In [8]:
COLLECTION_NAME = "rag_demo"

# -------------------------------
# 2Ô∏è‚É£ Create collection (once)
# -------------------------------
qdrant.recreate_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=VectorParams(size=384, distance=Distance.COSINE),
)

  qdrant.recreate_collection(


True

 3Ô∏è‚É£ Add sample text data

In [9]:
docs = [
    "The Taj Mahal is located in Agra, India. It was built by Mughal Emperor Shah Jahan.",
    "The Great Wall of China was built to protect against invasions from the north.",
    "The Colosseum in Rome was used for gladiator fights and public spectacles.",
]

In [10]:
vectors = embedding_model.encode(docs)

In [11]:
vectors.shape

(3, 384)

In [12]:
# PointStruct is used to define each point in Qdrant. We create a unique ID, the vector, and a payload (metadata).
# upsert is used to add points to the collection. It is Update + Insert.
 
for i in range(len(docs)):
    point = PointStruct(
        id=uuid.uuid4().int >> 64,  # Qdrant needs int IDs
        vector=vectors[i].tolist(),
        payload={"text": docs[i]},
    )
    qdrant.upsert(collection_name=COLLECTION_NAME, points=[point])

4Ô∏è‚É£ Define retrieval function

In [16]:
def retrieve_context(query, top_k=2):
    query_vec = embedding_model.encode([query])[0]

    results = qdrant.query_points(
        collection_name=COLLECTION_NAME,
        query=query_vec,
        limit=top_k,
        with_payload=True,
    )

    context = "\n".join([r.payload["text"] for r in results.points])
    return context

5Ô∏è‚É£ Ask question via vLLM (OpenAI-compatible)

In [22]:
def rag_query(question):
    context = retrieve_context(question)
    prompt = f"""
    You are a helpful assistant. Use the following context to answer the question.

    Context:
    {context}

    Question: {question}

    Answer:
    """

    response = client.chat.completions.create(
        model="mistralai/Mistral-7B-Instruct-v0.2",  # your vLLM model
        messages=[
            {"role": "system", "content": "You are a knowledgeable assistant."},
            {"role": "user", "content": prompt},
        ],
        temperature=0.3,
    )

    return response.choices[0].message.content


6Ô∏è‚É£ Test it!

In [None]:
query = "Who built the Taj Mahal?"
print("üîç Question:", query)
print("üí¨ Answer:", rag_query(query))