# Sparse Vectors with BM25

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/thierrypdamiba/qdrant-etl-cookbook/blob/main/notebooks/etl/sparse_vectors_bm25.ipynb)

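Implement hybrid search in Qdrant by storing sparse BM25-style vectors alongside dense embeddings. The sparse vectors here are hashed term-frequency counts; the collection's IDF modifier supplies the inverse-document-frequency part of the score.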

In [None]:
!pip install -q qdrant-client sentence-transformers

In [None]:
from collections import Counter
from qdrant_client import QdrantClient, models
from sentence_transformers import SentenceTransformer

In [None]:
# Local in-memory Qdrant instance: nothing is persisted between kernel sessions.
client = QdrantClient(":memory:")
# Sentence-embedding model used to produce the dense vectors.
dense_model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# Create collection with both dense and sparse vectors.
# Each point stores two named vectors: "dense" and "bm25".
client.create_collection(
    collection_name="hybrid_collection",
    vectors_config={
        # size must equal the length of the embeddings produced by dense_model.
        "dense": models.VectorParams(size=384, distance=models.Distance.COSINE),
    },
    sparse_vectors_config={
        "bm25": models.SparseVectorParams(
            # Ask Qdrant to apply IDF weighting to this sparse vector, so the
            # stored values only need to be raw term frequencies.
            modifier=models.Modifier.IDF,
        ),
    },
)

In [None]:
def simple_sparse_vector(text: str, num_buckets: int = 30000):
    """Create a simple sparse vector from term frequencies.

    The text is lower-cased and split on whitespace. Each distinct token is
    mapped to a bucket index with a deterministic hash, and the number of
    times the token occurs becomes the value stored at that index.

    Args:
        text: Raw document or query text.
        num_buckets: Size of the hashed index space; indices fall in
            ``[0, num_buckets)``. Defaults to 30000.

    Returns:
        A ``models.SparseVector`` whose indices are unique and whose values
        are float term counts.
    """
    # Function-scope import keeps this cell runnable on its own.
    import zlib

    term_counts = Counter(text.lower().split())

    # The built-in hash() of a str is salted per interpreter process, so it
    # would hand out different indices after a kernel restart and queries
    # would no longer line up with previously indexed documents. CRC32 gives
    # the same index for the same token in every process.
    weights: dict[int, float] = {}
    for token, count in term_counts.items():
        bucket = zlib.crc32(token.encode("utf-8")) % num_buckets
        # Distinct tokens can land in the same bucket; sum their counts so
        # every index appears exactly once in the sparse vector.
        weights[bucket] = weights.get(bucket, 0.0) + float(count)

    return models.SparseVector(
        indices=list(weights),
        values=list(weights.values()),
    )

In [None]:
# Sample corpus; every string becomes one point in the collection.
documents = [
    "Qdrant is a vector database for similarity search",
    "It supports both dense and sparse vectors",
    "Hybrid search combines keyword and semantic matching",
    "BM25 is a classic information retrieval scoring function",
    "Dense vectors capture semantic meaning of text",
]

# Build one point per document. The position in `documents` is used as the
# point id, and each point carries both named vectors plus the raw text as
# payload.
points = [
    models.PointStruct(
        id=doc_id,
        vector={
            "dense": dense_model.encode(text).tolist(),
            "bm25": simple_sparse_vector(text),
        },
        payload={"text": text},
    )
    for doc_id, text in enumerate(documents)
]

client.upsert(collection_name="hybrid_collection", points=points)
print(f"Loaded {len(points)} documents with dense + sparse vectors")

In [None]:
# Search using dense vector
# Only the "dense" named vector is queried here; the "bm25" sparse vector
# is not used by this query.
query = "How does keyword search work?"
# Embed the query with the same model that encoded the documents.
query_dense = dense_model.encode(query).tolist()

response = client.query_points(
    collection_name="hybrid_collection",
    query=query_dense,
    using="dense",  # name of the vector to search against
    limit=3,  # return at most three hits
)
results = response.points

print("Dense search results:")
for r in results:
    print(f"  Score: {r.score:.4f} | {r.payload['text']}")