# Extract PDF Text and Load into Qdrant

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/thierrypdamiba/qdrant-etl-cookbook/blob/main/notebooks/etl/pdf_to_qdrant.ipynb)

Parse PDFs with PyMuPDF, split the extracted text into overlapping chunks, embed each chunk with a sentence-transformers model, and store the vectors in Qdrant for use in RAG pipelines.

In [None]:
!pip install -q qdrant-client sentence-transformers PyMuPDF

In [None]:
import fitz  # PyMuPDF
import tempfile
from pathlib import Path
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance
from sentence_transformers import SentenceTransformer

In [None]:
# In-memory Qdrant instance: nothing is persisted once the process exits.
client = QdrantClient(location=":memory:")

# Sentence-embedding model used for both the document chunks and the queries.
model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

In [None]:
# Create a sample PDF for demo
#
# NamedTemporaryFile(delete=False) atomically creates the file and hands back
# its name; tempfile.mktemp() is deprecated because another process can claim
# the returned name before we write to it.
with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
    pdf_path = Path(tmp_file.name)

pages_text = [
    "Qdrant is a vector similarity search engine and vector database. It provides a production-ready service with a convenient API to store, search, and manage points (vectors with an additional payload). Qdrant is tailored to extended filtering support.",
    "HNSW (Hierarchical Navigable Small World) is the primary indexing algorithm used in Qdrant. It builds a multi-layer graph structure that enables efficient approximate nearest neighbor search. The key parameters are m (number of connections) and ef_construct (search depth during construction).",
    "Quantization in Qdrant reduces memory usage by compressing vector representations. Scalar quantization converts float32 to int8, reducing memory by 4x. Binary quantization provides up to 32x reduction but works best with high-dimensional vectors like OpenAI embeddings.",
]

page_margin = 72  # one inch, in PDF points

# The context manager closes the document even if writing a page fails.
with fitz.open() as doc:
    for text in pages_text:
        page = doc.new_page()
        text_area = fitz.Rect(
            page_margin,
            page_margin,
            page.rect.width - page_margin,
            page.rect.height - page_margin,
        )
        # insert_text() writes a single unwrapped line, so a long paragraph
        # would run past the right page edge; insert_textbox() wraps the text
        # inside the rectangle. A negative return value means the text did
        # not fit and nothing was written.
        if page.insert_textbox(text_area, text, fontsize=12) < 0:
            raise ValueError("Sample text does not fit on the page")

    doc.save(str(pdf_path))

print(f"Created sample PDF with {len(pages_text)} pages")

In [None]:
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
    """Split text into overlapping chunks.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.
    Chunks that contain only whitespace are dropped.

    Args:
        text: The text to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between neighbouring chunks;
            must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in document order. An empty or whitespace-only ``text``
        yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        # overlap == chunk_size would give a zero step, a larger overlap a
        # negative step (silently producing no chunks at all), and a negative
        # overlap would skip text between chunks.
        raise ValueError(
            f"overlap must be in [0, chunk_size), got overlap={overlap} "
            f"with chunk_size={chunk_size}"
        )

    step = chunk_size - overlap
    text_length = len(text)
    chunks = []
    for start in range(0, text_length, step):
        chunk = text[start : start + chunk_size]
        if chunk.strip():
            chunks.append(chunk)
        # Once a chunk reaches the end of the text, any further window would
        # lie entirely inside this one and only add a duplicate tail.
        if start + chunk_size >= text_length:
            break
    return chunks

In [None]:
# Create the collection that will hold the PDF chunk vectors.
# The vector size is read from the embedding model (384 for all-MiniLM-L6-v2)
# so the collection stays consistent if a different model is loaded.
embedding_dim = model.get_sentence_embedding_dimension()

client.create_collection(
    collection_name="pdf_collection",
    vectors_config=VectorParams(size=embedding_dim, distance=Distance.COSINE),
)

In [None]:
# Read the PDF page by page, chunk and embed each page's text, and upsert the
# resulting points into the "pdf_collection" collection.
points = []
point_id = 0

# The context manager closes the PDF even if chunking or embedding fails.
with fitz.open(str(pdf_path)) as doc:
    page_count = len(doc)  # captured here because doc is closed after the block

    for page_num, page in enumerate(doc):
        chunks = chunk_text(page.get_text())
        if not chunks:
            continue  # page produced no non-blank text

        # Encode all chunks of the page in one batch instead of one call each.
        embeddings = model.encode(chunks).tolist()

        for chunk, embedding in zip(chunks, embeddings):
            points.append(
                PointStruct(
                    id=point_id,
                    vector=embedding,
                    # "page" is the 0-based index of the page within the PDF.
                    payload={"text": chunk, "page": page_num, "source": str(pdf_path)},
                )
            )
            point_id += 1

# Skip the request entirely when the PDF yielded no text.
if points:
    client.upsert(collection_name="pdf_collection", points=points)
print(f"Loaded {point_id} chunks from {page_count} pages")

In [None]:
# Search the PDF content
# Embed the question with the same model used for the chunks, then fetch the
# three nearest chunks.
query_vector = model.encode("How does quantization reduce memory?").tolist()

# query_points() is the replacement for the deprecated search() method; the
# scored hits are returned in the `points` attribute of the response.
results = client.query_points(
    collection_name="pdf_collection",
    query=query_vector,
    limit=3,
).points

for r in results:
    print(f"Score: {r.score:.4f} | Page {r.payload['page']}")
    # Show only the first 120 characters of the chunk as a preview.
    print(f"  {r.payload['text'][:120]}...")
    print()