In [7]:
from dotenv import load_dotenv
import os

# Load environment variables via python-dotenv, then fail fast if the OpenAI
# credential is absent so later cells do not fail with an opaque auth error.
load_dotenv()

openai_api_key = os.getenv("OPENAI_API_KEY")

if openai_api_key:
    print("OPENAI_API_KEY is set.")
else:
    raise ValueError("OPENAI_API_KEY is not set in the environment variables.")

OPENAI_API_KEY is set.


In [15]:
from langchain_openai import OpenAIEmbeddings

# Embedding model handed to the vector store to encode chunks and queries.
embedding = OpenAIEmbeddings(model="text-embedding-3-large")

In [12]:
import tiktoken
from docling_core.transforms.chunker.tokenizer.openai import OpenAITokenizer
from langchain_docling import DoclingLoader
from docling.chunking import HybridChunker

# Token budget for a single chunk. The chunks produced here are encoded by the
# embedding model (text-embedding-3-large), so the limit must follow that
# model's per-input maximum rather than a chat model's context window.
# 8191 equals the default `embedding_ctx_length` of LangChain's OpenAIEmbeddings.
EMBED_MAX_TOKENS = 8191

# cl100k_base is the tiktoken encoding used to count tokens for chunk sizing.
enc = tiktoken.get_encoding("cl100k_base")

tokenizer = OpenAITokenizer(
    tokenizer=enc,
    max_tokens=EMBED_MAX_TOKENS,  # upper bound on tokens per chunk
)

# Source document: the NeurIPS 2017 paper PDF, fetched by URL.
FILE_PATH = "https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"

# The loader converts the PDF with Docling and splits it with the
# token-aware HybridChunker configured above.
loader = DoclingLoader(file_path=FILE_PATH, chunker=HybridChunker(tokenizer=tokenizer))

In [13]:
# Run the loader; the result is the list of chunk documents used by later cells.
docs: list = loader.load()

2026-01-05 14:01:21,819 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2026-01-05 14:01:21,843 - INFO - Going to convert document batch...
2026-01-05 14:01:21,845 - INFO - Initializing pipeline for StandardPdfPipeline with options hash e15bc6f248154cc62f8db15ef18a8ab7
2026-01-05 14:01:21,850 - INFO - Auto OCR model selected ocrmac.
2026-01-05 14:01:21,851 - INFO - Accelerator device: 'mps'
2026-01-05 14:01:23,965 - INFO - Accelerator device: 'mps'
2026-01-05 14:01:24,832 - INFO - Processing document NIPS-2017-attention-is-all-you-need-Paper.pdf
2026-01-05 14:01:35,989 - INFO - Finished converting document NIPS-2017-attention-is-all-you-need-Paper.pdf in 16.83 sec.


In [14]:
# Peek at the first three chunks to sanity-check the conversion.
# The loop variable must stay `d`: the `=` specifier echoes the expression text.
preview = docs[:3]
for d in preview:
    print(f"- {d.page_content=}")

- d.page_content='Attention Is All You Need\nAshish Vaswani ∗ Google Brain avaswani@google.com, 1 = Noam Shazeer ∗ Google Brain noam@google.com. Ashish Vaswani ∗ Google Brain avaswani@google.com, 2 = Niki Parmar ∗ Google Research nikip@google.com. Ashish Vaswani ∗ Google Brain avaswani@google.com, 3 = Jakob Uszkoreit ∗ Google Research usz@google.com. Llion Jones ∗ Google Research llion@google.com, 1 = Aidan N. Gomez ∗ † University of Toronto aidan@cs.toronto.edu. Llion Jones ∗ Google Research llion@google.com, 2 = Łukasz Kaiser ∗ Google Brain lukaszkaiser@google.com. Llion Jones ∗ Google Research llion@google.com, 3 = Łukasz Kaiser ∗ Google Brain lukaszkaiser@google.com'
- d.page_content='Abstract\nThe dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the T

In [17]:
from langchain_milvus import Milvus
from pathlib import Path
from tempfile import mkdtemp

# Keep the vector database file inside a freshly created temporary directory.
db_dir = Path(mkdtemp())
milvus_uri = str(db_dir / "vector.db")

# Embed every chunk in `docs` and index it in the "vectordb" collection.
connection_args = {"uri": milvus_uri}
index_params = {"index_type": "FLAT"}

vectorstore = Milvus.from_documents(
    documents=docs,
    embedding=embedding,
    collection_name="vectordb",
    connection_args=connection_args,
    index_params=index_params,
    drop_old=True,
)

  from pkg_resources import DistributionNotFound, get_distribution
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2026-01-05 14:10:05,768 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [19]:
# Number of chunks the retriever returns per query.
TOP_K: int = 3

In [20]:
# Expose the vector store as a retriever limited to the TOP_K best matches.
search_options = {"k": TOP_K}
retriever = vectorstore.as_retriever(search_kwargs=search_options)

In [26]:
from langchain_openai import ChatOpenAI

# Chat model that writes the final answer from the retrieved context.
llm = ChatOpenAI(
    model="gpt-4o",
    temperature=0,
)

In [23]:
from langchain_core.prompts import PromptTemplate

# Answering prompt for the RAG chain. It has two placeholders:
#   {context} - the retrieved excerpts
#   {input}   - the user's question
PROMPT = PromptTemplate.from_template(
    template="""You must answer using ONLY the context below. Do not use outside knowledge.

CONTEXT (each excerpt includes its citation tag like [source:... page:... chunk:...])
---------------------
{context}
---------------------

QUERY: {input}

CITATION RULES
- Every factual claim must end with a citation tag copied from the context, like: [source:XYZ page:12 chunk:5].
- If a sentence contains multiple claims from different excerpts, include multiple citation tags at the end of that sentence.
- Do NOT invent citation tags. Use only tags that appear in the context verbatim.

Return exactly:
1) Final answer (short, 2–6 sentences, with citations)
2) Key points (3–7 bullets, each bullet with citations)
3) Assumptions (or "None")
If the context doesn't support the answer, say: "Not answerable from context."

RESPONSE:
"""
)


In [24]:
# The question sent through the RAG chain.
QUESTION: str = "What is the structure of a Transformer model?"

In [27]:
from operator import itemgetter

from langchain_classic.chains import create_retrieval_chain
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate


def _first_page_no(metadata):
    """Return the first page number found in a chunk's Docling metadata, or "?".

    NOTE(review): assumes the `dl_meta` layout `doc_items[*].prov[*].page_no`
    written by Docling's chunker -- confirm against the installed docling-core.
    Any other shape falls back to "?" instead of raising.
    """
    dl_meta = metadata.get("dl_meta")
    if not isinstance(dl_meta, dict):
        return "?"
    for item in dl_meta.get("doc_items") or []:
        if not isinstance(item, dict):
            continue
        for prov in item.get("prov") or []:
            if isinstance(prov, dict) and "page_no" in prov:
                return prov["page_no"]
    return "?"


def _add_citation_tags(retrieved):
    """Return copies of the retrieved documents with a `citation` metadata entry.

    The tag has the form `[source:<file name> page:<n> chunk:<rank>]`, where
    `<rank>` is the 1-based position of the document in the retrieval result.
    """
    tagged = []
    for chunk_no, doc in enumerate(retrieved, start=1):
        # Use only the last path segment so the tag stays short enough to copy.
        source_name = str(doc.metadata.get("source", "unknown")).rsplit("/", 1)[-1]
        tag = f"[source:{source_name} page:{_first_page_no(doc.metadata)} chunk:{chunk_no}]"
        tagged.append(
            Document(
                page_content=doc.page_content,
                metadata={**doc.metadata, "citation": tag},
            )
        )
    return tagged


# PROMPT requires every claim to end with a citation tag copied verbatim from
# the context. Without an explicit `document_prompt` the stuff chain renders
# only `page_content`, so no tag would ever reach the model. Render the tag
# in front of each excerpt instead.
document_prompt = PromptTemplate.from_template("{citation}\n{page_content}")

# create_retrieval_chain feeds the whole input dict to a non-retriever
# runnable, so pick out the question before calling the retriever.
tagged_retriever = itemgetter("input") | retriever | _add_citation_tags

question_answer_chain = create_stuff_documents_chain(
    llm,
    PROMPT,
    document_prompt=document_prompt,
)
rag_chain = create_retrieval_chain(tagged_retriever, question_answer_chain)
resp_dict = rag_chain.invoke({"input": QUESTION})

2026-01-05 14:21:52,038 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2026-01-05 14:21:57,654 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [28]:
import json

# List each retrieved chunk with its text (JSON-escaped so it stays on one
# line) followed by its metadata, leaving out the vector store's "pk" field.
for source_no, doc in enumerate(resp_dict["context"], start=1):
    print(f"Source {source_no}:")
    print(f"  text: {json.dumps(doc.page_content)}")
    for key, val in doc.metadata.items():
        if key == "pk":
            continue
        print(f"  {key}: {val}")

Source 1:
  text: "3 Model Architecture\nMost competitive neural sequence transduction models have an encoder-decoder structure [5, 2, 29]. Here, the encoder maps an input sequence of symbol representations ( x 1 , ..., x n ) to a sequence of continuous representations z = ( z 1 , ..., z n ) . Given z , the decoder then generates an output sequence ( y 1 , ..., y m ) of symbols one element at a time. At each step the model is auto-regressive [9], consuming the previously generated symbols as additional input when generating the next.\nThe Transformer follows this overall architecture using stacked self-attention and point-wise, fully connected layers for both the encoder and decoder, shown in the left and right halves of Figure 1, respectively."
  source: https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf
  dl_meta: {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/18', '