# Docling PDF Reader example

In [None]:
import rich
from rich.pretty import pprint
import warnings
import os
from dotenv import load_dotenv

# Load settings via python-dotenv before anything reads the environment;
# later cells look up `MILVUS_URL` and `HF_TOKEN` through `os.environ`.
load_dotenv()

# Set TOKENIZERS_PARALLELISM explicitly so Hugging Face tokenizers does not emit
# its "process just got forked" warning once the tokenizer has been used and a
# subprocess is spawned afterwards. `setdefault` keeps any value the user (or
# the loaded environment) already provided.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# Silence noisy third-party warnings that would otherwise clutter the output.
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic")

## Basic usage

In order to load PDF data with Docling, we use a `DoclingPDFReader`.

To demonstrate a basic RAG pipeline, we set:
- `export_type` to Markdown, so that the documents are exported as Markdown text
- `chunk_docs` to `False`, to just get back the exported documents (alternatively, with `chunk_docs=True` the reader would also perform chunking internally, so the returned nodes would already be chunked)

In [None]:
from llama_index.readers.docling.base import DoclingPDFReader

# Reader configuration: export as Markdown (the alternative is the native JSON
# format) and return the raw content instead of chunking inside the reader.
reader_options = {
    "export_type": DoclingPDFReader.ExportType.MARKDOWN,
    "chunk_docs": False,
}
reader = DoclingPDFReader(**reader_options)

# `file_path` accepts a local PDF path or a URL (or an iterable thereof).
source_pdf = "https://arxiv.org/pdf/2408.09869"
docs = reader.load_data(file_path=source_pdf)

Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

Let's preview a doc sample:

In [None]:
import rich

# Preview only the first 700 characters of the first loaded document,
# followed by an ellipsis to signal the truncation.
first_doc = docs[0]
md_snippet = first_doc.text[:700]
preview_text = f"{md_snippet}..."
rich.print(preview_text)

## RAG demo

Next, we split the documents into chunks and preview a sample chunk:

In [None]:
from llama_index.core.node_parser import MarkdownNodeParser

# Turn the loaded documents into nodes using the Markdown-aware parser.
node_parser = MarkdownNodeParser()
nodes = node_parser.get_nodes_from_documents(documents=docs)

# Pretty-print a single sample node; the limits keep the output compact.
sample_node = nodes[7]
preview_limits = {"max_length": 2, "max_string": 250, "max_depth": 2}
pprint(sample_node, **preview_limits)

Setting up the embed model:

In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Embedding model shared by the vector store setup (to determine the vector
# dimension) and by the index (to embed the nodes).
embed_model_name = "BAAI/bge-small-en-v1.5"
embed_model = HuggingFaceEmbedding(model_name=embed_model_name)

Setting up the vector store:

In [None]:
from tempfile import TemporaryDirectory
from llama_index.vector_stores.milvus import MilvusVectorStore

# NOTE: the temporary directory is created unconditionally, because the
# fallback URI is built before the environment lookup happens; `tmp_dir` stays
# bound so the directory object is kept alive.
tmp_dir = TemporaryDirectory()
fallback_uri = f"{tmp_dir.name}/milvus_demo.db"

# Use the URI from MILVUS_URL when set; otherwise fall back to a database file
# inside the temporary directory.
milvus_uri = os.environ.get("MILVUS_URL", default=fallback_uri)

# Determine the vector dimension by embedding a throwaway string and
# measuring the length of the resulting vector.
embedding_dim = len(embed_model.get_text_embedding("hi"))

vector_store = MilvusVectorStore(
    uri=milvus_uri,
    collection_name="docling_collection",
    dim=embedding_dim,
    overwrite=True,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Setting up the index:

In [None]:
from llama_index.core import StorageContext, VectorStoreIndex

# Storage context backed by the vector store configured earlier.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Build the index over the parsed nodes with the chosen embedding model,
# displaying a progress bar while it runs.
index_options = {
    "embed_model": embed_model,
    "storage_context": storage_context,
    "show_progress": True,
}
index = VectorStoreIndex(nodes=nodes, **index_options)

Generating embeddings:   0%|          | 0/33 [00:00<?, ?it/s]

Setting up the LLM:

In [None]:
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI

# Read the Hugging Face token from the environment; this is None when the
# variable is not set.
HF_TOKEN = os.getenv("HF_TOKEN")

# LLM served through the Hugging Face Inference API.
llm_model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
llm = HuggingFaceInferenceAPI(token=HF_TOKEN, model_name=llm_model_name)

And now we are ready to ask questions about our document content.

In [None]:
# Create a query engine on top of the index, answering with the configured LLM.
query_engine = index.as_query_engine(llm=llm)

# Ask a question about the indexed document and pretty-print the response;
# the limits truncate long fields and deep nesting in the printed output.
question = "Can I use OCR with Docling?"
query_res = query_engine.query(question)
display_limits = {"max_length": 5, "max_string": 70, "max_depth": 4}
pprint(query_res, **display_limits)