# Semantic Chunking for Better RAG

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/thierrypdamiba/qdrant-etl-cookbook/blob/main/notebooks/etl/semantic_chunking.ipynb)

Split documents into semantically coherent chunks instead of fixed-size windows. This notebook uses sentence-embedding similarity to find natural breakpoints, then loads the resulting chunks into Qdrant.

In [None]:
# Install the packages this notebook imports (-q keeps pip's output quiet).
!pip install -q qdrant-client sentence-transformers numpy

In [None]:
import numpy as np
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance
from sentence_transformers import SentenceTransformer

In [None]:
# Qdrant client pointed at the special ":memory:" location (no server URL is given).
client = QdrantClient(location=":memory:")

# Sentence-embedding model used both for chunking and for search queries.
model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

In [None]:
def semantic_chunk(text: str, model, threshold: float = 0.5, min_chunk_size: int = 50):
    """Split text into chunks at semantic breakpoints.

    The text is whitespace-normalised and split into sentences after
    sentence-ending punctuation (``.``, ``!`` or ``?``) that is followed by
    whitespace. Every sentence is embedded, and a new chunk is started
    wherever the cosine similarity between two consecutive sentences drops
    below ``threshold`` -- provided the chunk built so far is already at
    least ``min_chunk_size`` characters long.

    Args:
        text: The document to chunk.
        model: Any object with an ``encode(list_of_str)`` method returning
            one embedding vector per input string.
        threshold: Similarity below which two neighbouring sentences are
            considered to belong to different chunks.
        min_chunk_size: Minimum length, in characters, a chunk must reach
            before it may be closed at a breakpoint.

    Returns:
        A list of chunk strings in document order. Empty or whitespace-only
        input yields an empty list.
    """
    import re  # local import keeps this cell self-contained

    # Collapse newlines and runs of whitespace so chunks read as clean prose.
    normalized = ' '.join(text.split())

    # Split only where terminal punctuation is followed by whitespace. The
    # punctuation stays attached to its sentence, so tokens such as "v1.5"
    # or "qdrant.io" are never broken apart and joining the sentences with a
    # single space reproduces the normalised text exactly.
    sentences = [s for s in re.split(r'(?<=[.!?])\s+', normalized) if s]

    if not sentences:
        # Nothing to chunk; avoid emitting an empty-string chunk.
        return []
    if len(sentences) == 1:
        return sentences

    # Embed all sentences in one call.
    embeddings = np.asarray(model.encode(sentences), dtype=float)

    # Cosine similarity between each sentence and its successor.
    norms = np.linalg.norm(embeddings, axis=1)
    dots = np.sum(embeddings[:-1] * embeddings[1:], axis=1)
    denominators = norms[:-1] * norms[1:]
    # A zero-norm embedding has no direction; treat its similarity as 0.0
    # instead of dividing by zero and propagating NaN.
    similarities = np.divide(
        dots,
        denominators,
        out=np.zeros_like(dots),
        where=denominators > 0,
    )

    # Walk the sentence pairs, closing the current chunk at each breakpoint.
    chunks = []
    current_chunk = [sentences[0]]

    for next_sentence, sim in zip(sentences[1:], similarities):
        if sim < threshold and len(' '.join(current_chunk)) >= min_chunk_size:
            chunks.append(' '.join(current_chunk))
            current_chunk = [next_sentence]
        else:
            # Either the topic continues or the chunk is still too short.
            current_chunk.append(next_sentence)

    # current_chunk always holds at least one sentence here.
    chunks.append(' '.join(current_chunk))

    return chunks

In [None]:
# Sample document covering four distinct topics, one per paragraph
document = """
Qdrant is a vector similarity search engine. It provides a production-ready 
service with a convenient API. Qdrant supports filtering and payload indexing.

Machine learning models convert data into vector embeddings. These embeddings 
capture semantic meaning in high-dimensional space. Similar items have vectors 
that are close together.

Docker makes it easy to deploy Qdrant. You can use Docker Compose for 
production setups. Kubernetes is recommended for large-scale deployments 
with automatic scaling.

Python is the most popular language for working with vector databases. 
The qdrant-client library provides a convenient interface. TypeScript 
and Rust clients are also available.
"""

# Chunk the sample document at semantic breakpoints
semantic_chunks = semantic_chunk(document, model, threshold=0.5)

print(f"Semantic chunks ({len(semantic_chunks)}):")
for i, chunk in enumerate(semantic_chunks):
    # Show at most the first 100 characters; "..." marks a truncated chunk
    preview = f"{chunk[:100]}..." if len(chunk) > 100 else chunk
    print(f"\n  Chunk {i}: {preview}")

In [None]:
# Load semantic chunks into Qdrant
collection_name = "semantic_chunks"

# Create the collection only when it is missing, so re-running this cell
# does not fail on an already-existing collection.
if not client.collection_exists(collection_name):
    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(
            # Ask the model for its embedding size instead of hard-coding it,
            # so swapping the model keeps the collection consistent.
            size=model.get_sentence_embedding_dimension(),
            distance=Distance.COSINE,
        ),
    )

# Embed every chunk in a single batched call rather than one call per chunk.
vectors = model.encode(semantic_chunks)

points = [
    PointStruct(
        id=i,
        vector=vector.tolist(),
        payload={"text": chunk, "chunk_index": i},
    )
    for i, (chunk, vector) in enumerate(zip(semantic_chunks, vectors))
]

client.upsert(collection_name=collection_name, points=points)
print(f"Loaded {len(points)} semantic chunks")

In [None]:
# Search: embed the question with the same model and fetch the closest chunks
query = "How do I deploy Qdrant?"
query_vec = model.encode(query).tolist()

response = client.query_points(
    collection_name="semantic_chunks",
    query=query_vec,
    limit=3,  # return the three best-scoring chunks
)

print(f"Query: '{query}'\n")
for r in response.points:
    # One print per hit: score line, indented chunk text, then a blank line
    print(f"Score: {r.score:.4f}\n  {r.payload['text']}\n")