In [5]:
from dotenv import load_dotenv
import os

load_dotenv()

openai_api_key = os.getenv("OPENAI_API_KEY")

if not openai_api_key:
    raise ValueError("OPENAI_API_KEY is not set in the environment variables.")
else:
    print("OPENAI_API_KEY is set.")

OPENAI_API_KEY is set.


In [6]:
from langchain_openai import OpenAIEmbeddings

embedding = OpenAIEmbeddings(
    model="text-embedding-3-large",
)

In [7]:
import tiktoken
from docling_core.transforms.chunker.tokenizer.openai import OpenAITokenizer
from langchain_docling import DoclingLoader
from docling.chunking import HybridChunker

enc = tiktoken.get_encoding("cl100k_base")

tokenizer = OpenAITokenizer(
    tokenizer=enc,
    max_tokens=128 * 1024,  # set to the model's context window
)

FILE_PATH = "https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"

loader = DoclingLoader(file_path=FILE_PATH, chunker=HybridChunker(tokenizer=tokenizer))

In [8]:
docs = loader.load()

2026-01-06 00:37:41,065 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2026-01-06 00:37:41,165 - INFO - Going to convert document batch...
2026-01-06 00:37:41,165 - INFO - Initializing pipeline for StandardPdfPipeline with options hash e15bc6f248154cc62f8db15ef18a8ab7
2026-01-06 00:37:41,187 - INFO - Loading plugin 'docling_defaults'
2026-01-06 00:37:41,189 - INFO - Registered picture descriptions: ['vlm', 'api']
2026-01-06 00:37:41,196 - INFO - Loading plugin 'docling_defaults'
2026-01-06 00:37:41,200 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2026-01-06 00:37:42,023 - INFO - Auto OCR model selected ocrmac.
2026-01-06 00:37:42,030 - INFO - Loading plugin 'docling_defaults'
2026-01-06 00:37:42,033 - INFO - Registered layout engines: ['docling_layout_default', 'docling_experimental_table_crops_layout']
2026-01-06 00:37:42,038 - INFO - Accelerator device: 'mps'
2026-01-06 00:37:45,071 - INFO - Loading plugin 'docling_defaul

In [9]:
for d in docs[:3]:
    print(f"- {d.page_content=}")

- d.page_content='Attention Is All You Need\nAshish Vaswani ∗ Google Brain avaswani@google.com, 1 = Noam Shazeer ∗ Google Brain noam@google.com. Ashish Vaswani ∗ Google Brain avaswani@google.com, 2 = Niki Parmar ∗ Google Research nikip@google.com. Ashish Vaswani ∗ Google Brain avaswani@google.com, 3 = Jakob Uszkoreit ∗ Google Research usz@google.com. Llion Jones ∗ Google Research llion@google.com, 1 = Aidan N. Gomez ∗ † University of Toronto aidan@cs.toronto.edu. Llion Jones ∗ Google Research llion@google.com, 2 = Łukasz Kaiser ∗ Google Brain lukaszkaiser@google.com. Llion Jones ∗ Google Research llion@google.com, 3 = Łukasz Kaiser ∗ Google Brain lukaszkaiser@google.com'
- d.page_content='Abstract\nThe dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the T

In [10]:
from langchain_milvus import Milvus
from pathlib import Path
from tempfile import mkdtemp

milvus_uri = str(Path(mkdtemp()) / "vector.db")

vectorstore = Milvus.from_documents(
    documents=docs,
    embedding=embedding,
    collection_name="vectordb",
    connection_args={"uri": milvus_uri},
    index_params={"index_type": "FLAT"},
    drop_old=True,
)

  from pkg_resources import DistributionNotFound, get_distribution
2026-01-06 00:38:00,933 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [11]:
TOP_K = 3

In [12]:
retriever = vectorstore.as_retriever(search_kwargs={"k": TOP_K})

In [13]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o", temperature=0)

In [14]:
from langchain_core.prompts import PromptTemplate

PROMPT = PromptTemplate.from_template(
"""You must answer using ONLY the context below. Do not use outside knowledge.

CONTEXT (each excerpt includes its citation tag like [source:... page:... chunk:...])
---------------------
{context}
---------------------

QUERY: {input}

CITATION RULES
- Every factual claim must end with a citation tag copied from the context, like: [source:XYZ page:12 chunk:5].
- If a sentence contains multiple claims from different excerpts, include multiple citation tags at the end of that sentence.
- Do NOT invent citation tags. Use only tags that appear in the context verbatim.

Return exactly:
1) Final answer (short, 2–6 sentences, with citations)
2) Key points (3–7 bullets, each bullet with citations)
3) Assumptions (or "None")
If the context doesn't support the answer, say: "Not answerable from context."

RESPONSE:
"""
)


In [15]:
queries = ['mechanics of scaled dot product attention',
 'key aspects of multi head attention']

In [16]:
import json

def show_rag_response(resp_dict: dict) -> None:
    for i, doc in enumerate(resp_dict["context"]):
        print(f"Source {i + 1}:")
        print(f"  text: {json.dumps(doc.page_content)}")
        for key in doc.metadata:
            if key != "pk":
                val = doc.metadata.get(key)
                print(f"  {key}: {val}")

In [17]:
from langchain_classic.chains import create_retrieval_chain
from langchain_classic.chains.combine_documents import create_stuff_documents_chain

In [18]:
for query in queries:
    question_answer_chain = create_stuff_documents_chain(llm, PROMPT)
    rag_chain = create_retrieval_chain(retriever, question_answer_chain)
    resp_dict = rag_chain.invoke({"input": query})
    print(f"\n\n=== RAG Response for query: {query} ===")
    print()
    show_rag_response(resp_dict)

2026-01-06 00:38:04,206 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2026-01-06 00:38:21,617 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"




=== RAG Response for query: mechanics of scaled dot product attention ===

Source 1:
  text: "Scaled Dot-Product Attention\nFigure 2: (left) Scaled Dot-Product Attention. (right) Multi-Head Attention consists of several attention layers running in parallel.\nquery with all keys, divide each by \u221a d k , and apply a softmax function to obtain the weights on the values.\nIn practice, we compute the attention function on a set of queries simultaneously, packed together into a matrix Q . The keys and values are also packed together into matrices K and V . We compute the matrix of outputs as:\n<!-- formula-not-decoded -->\nThe two most commonly used attention functions are additive attention [2], and dot-product (multiplicative) attention. Dot-product attention is identical to our algorithm, except for the scaling factor of 1 \u221a d k . Additive attention computes the compatibility function using a feed-forward network with a single hidden layer. While the two are similar in theoreti

2026-01-06 00:38:23,150 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2026-01-06 00:38:27,772 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"




=== RAG Response for query: key aspects of multi head attention ===

Source 1:
  text: "3.2.2 Multi-Head Attention\nInstead of performing a single attention function with d model-dimensional keys, values and queries, we found it beneficial to linearly project the queries, keys and values h times with different, learned linear projections to d k , d k and d v dimensions, respectively. On each of these projected versions of queries, keys and values we then perform the attention function in parallel, yielding d v -dimensional output values. These are concatenated and once again projected, resulting in the final values, as depicted in Figure 2.\nMulti-head attention allows the model to jointly attend to information from different representation subspaces at different positions. With a single attention head, averaging inhibits this.\n4 To illustrate why the dot products get large, assume that the components of q and k are independent random variables with mean 0 and variance 1 . Then thei