In [2]:
from dotenv import load_dotenv
import os

# Load environment variables via python-dotenv before reading the key.
load_dotenv()

# Fail fast when the key is missing or empty; otherwise just confirm it is
# present. The key itself is never printed.
openai_api_key = os.environ.get("OPENAI_API_KEY")

if openai_api_key:
    print("OPENAI_API_KEY is set.")
else:
    raise ValueError("OPENAI_API_KEY is not set in the environment variables.")

OPENAI_API_KEY is set.


In [3]:
from langchain.chat_models import init_chat_model
from deepagents import create_deep_agent

  from pydantic.v1.fields import FieldInfo as FieldInfoV1


**RAG.**
Build the RAG chain and define the retrieval function that the deep agent will use as a tool.

In [4]:
# Document that gets loaded, chunked and indexed into the vector store.
FILE_PATH = (
    "https://proceedings.neurips.cc/paper_files/paper/2017/file/"
    "3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"
)

# Number of chunks the retriever returns per query.
TOP_K = 3

# Sample sub-queries.
queries = [
    "mechanics of scaled dot product attention",
    "key aspects of multi head attention",
]

In [5]:
from pathlib import Path
from tempfile import mkdtemp

import tiktoken
from docling_core.transforms.chunker.tokenizer.openai import OpenAITokenizer
from docling.chunking import HybridChunker
from langchain_docling import DoclingLoader

from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_milvus import Milvus

from langchain_core.prompts import PromptTemplate
from langchain_classic.chains import create_retrieval_chain
from langchain_classic.chains.combine_documents import create_stuff_documents_chain

# ---------------------------------------------------------------------------
# RAG pipeline: load + chunk the PDF, index it in Milvus, and build a
# retrieval chain that answers a query from the retrieved chunks.
# Produces: embedding, tokenizer, docs, vectorstore, retriever, llm, rag_chain.
# ---------------------------------------------------------------------------

embedding = OpenAIEmbeddings(
    model="text-embedding-3-large",
)

# Upper bound on chunk size, in tokens. The chunks are (a) embedded with
# text-embedding-3-large and (b) stuffed TOP_K at a time into the gpt-4o
# prompt, so the budget has to follow the embedding model's input limit
# (8191 tokens per OpenAI's embeddings documentation), not a chat model's
# 128k context window.
EMBEDDING_MAX_TOKENS = 8191

enc = tiktoken.get_encoding("cl100k_base")
tokenizer = OpenAITokenizer(
    tokenizer=enc,
    max_tokens=EMBEDDING_MAX_TOKENS,
)

# Load FILE_PATH through Docling and split it with the token-aware chunker.
loader = DoclingLoader(file_path=FILE_PATH, chunker=HybridChunker(tokenizer=tokenizer))
docs = loader.load()

# The vector store lives in a file inside a fresh temporary directory, so
# every run of this cell builds a new index from scratch.
milvus_uri = str(Path(mkdtemp()) / "vector.db")

vectorstore = Milvus.from_documents(
    documents=docs,
    embedding=embedding,
    collection_name="vectordb",
    connection_args={"uri": milvus_uri},
    index_params={"index_type": "FLAT"},
    drop_old=True,
)

retriever = vectorstore.as_retriever(search_kwargs={"k": TOP_K})

# Named RAG_PROMPT (not PROMPT) so it cannot be confused with, or clobber on
# an out-of-order re-run, the plain-string PROMPT that a later cell defines
# for the main agent's system prompt.
# NOTE(review): the template tells the model that every excerpt carries a
# [source:... page:... chunk:...] tag, but no document_prompt is passed to
# create_stuff_documents_chain below. Confirm that the rendered {context}
# really contains such tags; otherwise the citation rules cannot be satisfied.
RAG_PROMPT = PromptTemplate.from_template(
"""You must answer using ONLY the context below. Do not use outside knowledge.

CONTEXT (each excerpt includes its citation tag like [source:... page:... chunk:...])
---------------------
{context}
---------------------

QUERY: {input}

CITATION RULES
- Every factual claim must end with a citation tag copied from the context, like: [source:XYZ page:12 chunk:5].
- If a sentence contains multiple claims from different excerpts, include multiple citation tags at the end of that sentence.
- Do NOT invent citation tags. Use only tags that appear in the context verbatim.

Return exactly:
1) Final answer (short, 2–6 sentences, with citations)
2) Key points (3–7 bullets, each bullet with citations)
3) Assumptions (or "None")
If the context doesn't support the answer, say: "Not answerable from context."

RESPONSE:
"""
)

# temperature=0 keeps the answer generation as repeatable as the API allows.
llm = ChatOpenAI(model="gpt-4o", temperature=0)

# Retrieval chain: `retriever` fetches TOP_K chunks for the "input" key and
# the stuff-documents chain formats them into RAG_PROMPT for `llm`.
question_answer_chain = create_stuff_documents_chain(llm, RAG_PROMPT)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

2026-01-06 23:39:02,487 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2026-01-06 23:39:02,575 - INFO - Going to convert document batch...
2026-01-06 23:39:02,576 - INFO - Initializing pipeline for StandardPdfPipeline with options hash e15bc6f248154cc62f8db15ef18a8ab7
2026-01-06 23:39:02,584 - INFO - Loading plugin 'docling_defaults'
2026-01-06 23:39:02,586 - INFO - Registered picture descriptions: ['vlm', 'api']
2026-01-06 23:39:02,592 - INFO - Loading plugin 'docling_defaults'
2026-01-06 23:39:02,595 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2026-01-06 23:39:03,188 - INFO - Auto OCR model selected ocrmac.
2026-01-06 23:39:03,198 - INFO - Loading plugin 'docling_defaults'
2026-01-06 23:39:03,204 - INFO - Registered layout engines: ['docling_layout_default', 'docling_experimental_table_crops_layout']
2026-01-06 23:39:03,208 - INFO - Accelerator device: 'mps'
2026-01-06 23:39:05,105 - INFO - Loading plugin 'docling_defaul

In [6]:
from langchain.tools import tool

# Tool handed to the vectorstore retrieval sub-agent.
#
# Args:
#     queries: sub-queries to answer; each non-blank, distinct query is run
#         through `rag_chain` once.
# Returns:
#     dict mapping each processed query string (unchanged) to whatever
#     `rag_chain.invoke({"input": query})` returned for it. Keys follow the
#     order of first appearance in `queries`; an empty list yields {}.
#
# NOTE(review): with @tool the docstring is presumably what the model sees as
# the tool description, so its text is deliberately left untouched here.
@tool
def retrieve_from_vectorstore(queries: list[str]) -> dict:
    """Retrieve and generate answers from the vectorstore for a list of queries.
    This function invokes the RAG chain for each query and returns aggregated results."""

    # Blank strings carry no information need and would only trigger a
    # pointless embedding + LLM round trip, so they are skipped. Duplicates
    # would map to the same result key anyway, so each distinct query is
    # invoked once; dict.fromkeys keeps first-seen order.
    distinct_queries = dict.fromkeys(q for q in queries if q.strip())

    return {query: rag_chain.invoke({"input": query}) for query in distinct_queries}

**Prepare settings for subagents**

In [None]:
# Few-shot examples for the decomposition sub-agent, kept as
# (title, input query, expected JSON output) so they are easy to add or edit.
_DECOMPOSITION_EXAMPLES = (
    (
        "Example 1 single information need",
        "How does dropout prevent overfitting in neural networks",
        '{"sub_queries":["dropout regularization reducing neural network overfitting"]}',
    ),
    (
        "Example 2 two distinct information needs",
        "effects of microplastics on marine food webs and human health risks",
        '{"sub_queries":["microplastics impacts on marine food webs","human health risks from microplastic exposure"]}',
    ),
    (
        "Example 3 avoid over splitting broad impact",
        "What are the latest advancements in natural language processing and how do they impact machine learning models",
        '{"sub_queries":["recent advancements in natural language processing","impact of recent NLP advances on machine learning models"]}',
    ),
    (
        "Example 4 three distinct information needs",
        "role of gut microbiome in obesity and type 2 diabetes and dietary interventions",
        '{"sub_queries":["gut microbiome links to obesity","gut microbiome links to type 2 diabetes","dietary interventions modulating gut microbiome in metabolic disease"]}',
    ),
)

# Rendered examples: five lines per example, examples separated by one blank
# line, no leading or trailing whitespace.
examples = "\n\n".join(
    f"{title}\nInput query:\n{user_query}\nOutput:\n{expected}"
    for title, user_query, expected in _DECOMPOSITION_EXAMPLES
)


# System prompt of the query decomposition sub-agent; the rendered examples
# are spliced in right after the "Here are the examples" line.
query_decomposition_prompt = f"""
You are an academic query decomposition and refinement agent for vector retrieval.
You will receive a query from a user which may contain single or multiple distinct information needs.

Here are the examples which are done well:
{examples}

Goal:
Return a list of refined sub-queries for retrieving relevant academic context from a vector database.

Output (strict):
- Return ONLY an object with key "sub_queries" containing a list of strings.
- Each string must be a refined retrieval query.
- Do NOT add any other keys or any extra text.

Decomposition rules:
- If the input expresses ONE coherent information need, return exactly 1 refined query.
- If the input contains multiple distinct information needs different questions topics or aspects return 2 to 5 sub-queries.
- Each sub-query must be atomic one concept or aspect only.
- Avoid overlap no near-duplicate sub-queries.

Refinement rules apply to every sub-query:
- Preserve the user intent exactly do not change meaning.
- Do NOT add new concepts not present or clearly implied.
- Use precise academic or technical terminology.
- Prefer noun phrases avoid questions and full sentences.
- Remove filler words stopwords and conversational phrasing.
- Length 5 to 12 words per sub-query max 12.
- Do NOT use punctuation quotes Boolean operators or special syntax.
- Do NOT include years author names or datasets unless explicitly present in the input.
- Properly identify distinct aspects if the input is broad or vague.
- Do NOT over-split sub-queries unnecessarily.
- Try to keep related concepts together.

Return the structured output now.
"""

# Model id and short description used when registering the sub-agent.
query_decomposition_model = "gpt-5-mini"
query_decomposition_description = "Analyzes research queries and generates optimized, non-overlapping sub-queries for vector database retrieval."

In [11]:
# Model id and short description used when registering the retrieval sub-agent.
vectorstore_retrieval_model = "gpt-5-nano"
vectorstore_retrieval_description = "Executes batch retrieval of academic context from vector database and preserves citations and metadata for structured result aggregation."

# System prompt of the retrieval sub-agent, written one source line per
# prompt line. The text starts and ends with a newline.
vectorstore_retrieval_prompt = (
    "\n"
    "You are a vectorstore retrieval orchestration agent. Your primary responsibility is to retrieve academic context from a vector database using refined sub-queries.\n"
    "\n"
    "Task:\n"
    "You will receive one or more sub-queries from the query decomposition agent. Your role is to use the retrieve_from_vectorstore tool to fetch relevant academic papers, citations, and context for each sub-query.\n"
    "\n"
    "Instructions:\n"
    "1. Accept the list of sub-queries provided by the upstream agent.\n"
    "2. Call retrieve_from_vectorstore with ALL sub-queries at once to retrieve relevant documents and answers from the vector database.\n"
    "3. The tool returns a dictionary mapping each sub-query to its retrieved context and generated answer.\n"
    "4. Aggregate and structure the retrieved results, ensuring no loss of information.\n"
    "5. Present the aggregated retrieval results in a clear, hierarchical format organized by sub-query.\n"
    "\n"
    "Output format:\n"
    "- Organize results by sub-query as keys\n"
    "- Include retrieved context, citations, and synthesized answers for each sub-query\n"
    "- Preserve all citation metadata (source, page, chunk information)\n"
    "- Flag any sub-queries that returned insufficient context\n"
    "\n"
    "Important:\n"
    "- Always use retrieve_from_vectorstore to access the vector database. Do not attempt to retrieve information manually.\n"
    "- Ensure all sub-queries are passed to the tool in a single batch for efficiency.\n"
    "- If retrieval fails for any sub-query, clearly indicate which ones had issues.\n"
)

**Main Deep Agent**
- Decomposes user queries into subqueries
- Retrieves relevant information from the vectorstore using subqueries
- Formats and structures the retrieved context

In [9]:
from langchain.agents import create_agent
from deepagents.middleware.subagents import SubAgentMiddleware
from langchain.chat_models import init_chat_model

# Chat model that drives the main (orchestrating) agent.
MAIN_AGENT_MODEL = "gpt-5-mini"
model = init_chat_model(model=MAIN_AGENT_MODEL)

In [12]:
from typing_extensions import TypedDict
from typing import List, Annotated

# Result record for one sub-query: the sub-query text, the context retrieved
# for it, the citation identifiers, and the synthesized answer.
# All four keys are required.
# NOTE(review): the second Annotated argument is a plain string; confirm the
# structured-output layer actually surfaces it as a field description.
AggregatedContext = TypedDict(
    "AggregatedContext",
    {
        "sub_query": Annotated[str, "The sub-query string"],
        "retrieved_context": Annotated[str, "The retrieved context string"],
        "citations": Annotated[List[str], "List of citation identifiers"],
        "synthesized_answer": Annotated[str, "The synthesized answer string"],
    },
)
    
# Top-level structured response: a single required key, "results", holding
# one AggregatedContext record per sub-query.
AggregatedContextList = TypedDict(
    "AggregatedContextList",
    {
        "results": Annotated[List[AggregatedContext], "List of aggregated context for each sub-query"],
    },
)

In [13]:
# System prompt of the main orchestrating agent, one list entry per prompt
# line. The empty first and last entries reproduce the leading and trailing
# newline of the prompt text.
_PROMPT_LINES = [
    "",
    "You are the main retrieval orchestrator for academic question answering. You coordinate sub-agents for query decomposition and vectorstore retrieval, then deliver a concise, citation-backed response.",
    "",
    "Workflow:",
    "1) Send the user query to query_decomposition_subagent. Expect an object with key sub_queries.",
    "2) If decomposition fails or returns no sub-queries, fall back to a single sub-query equal to the original user query.",
    "3) Send the full sub_queries list to vectorstore_retrieval_subagent (retrieve_from_vectorstore tool) in one batch. Expect a dictionary mapping each sub-query to retrieved context and an answer with citations.",
    "4) Aggregate all sub-query answers into a single, coherent response.",
    "",
    "Response format (strict):",
    "- Final answer: 2–6 sentences, each factual claim ends with citations from the retrieved context.",
    "- Key points: 3–7 bullets, each bullet ends with citations.",
    '- Assumptions: list assumptions made, or "None".',
    "",
    "Rules:",
    "- Never invent citations; only use those provided by the retrieval results.",
    "- If any sub-query returned insufficient context, say so explicitly for that sub-query.",
    '- If no context supports an answer, reply: "Not answerable from context.".',
    "- Keep wording concise and academic; avoid markdown tables and superfluous formatting.",
    "",
]

PROMPT = "\n".join(_PROMPT_LINES)

In [15]:
# Sub-agent spec: query decomposition. No "tools" entry is given for it.
query_decomposition_subagent = {
    "name": "query_decomposition_subagent",
    "description": query_decomposition_description,
    "system_prompt": query_decomposition_prompt,
    "model": query_decomposition_model,
}

# Sub-agent spec: retrieval. It is the only one given the
# retrieve_from_vectorstore tool.
vectorstore_retrieval_subagent = {
    "name": "vectorstore_retrieval_subagent",
    "description": vectorstore_retrieval_description,
    "system_prompt": vectorstore_retrieval_prompt,
    "tools": [retrieve_from_vectorstore],
    "model": vectorstore_retrieval_model,
}

# Middleware that registers both sub-agents with the main agent.
subagent_middleware = SubAgentMiddleware(
    default_model="gpt-4o",
    default_tools=[],
    subagents=[
        query_decomposition_subagent,
        vectorstore_retrieval_subagent,
    ],
)

# Main agent: orchestrates the sub-agents under PROMPT; AggregatedContextList
# is passed as the response format.
agent = create_agent(
    model=model,
    system_prompt=PROMPT,
    middleware=[subagent_middleware],
    response_format=AggregatedContextList,
)

In [16]:
# Run the full pipeline on a single user question.
user_question = "What is the self attention mechanism and how does it work in transformer models?"
result = agent.invoke({"messages": [{"role": "user", "content": user_question}]})

2026-01-07 00:22:22,912 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2026-01-07 00:22:24,956 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2026-01-07 00:22:27,413 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2026-01-07 00:22:38,678 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2026-01-07 00:22:44,411 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2026-01-07 00:22:48,829 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2026-01-07 00:22:50,554 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2026-01-07 00:22:57,314 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2026-01-07 00:22:58,337 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200

In [20]:
# Show the per-sub-query records from the agent's structured response.
result["structured_response"]["results"]

[{'sub_query': 'self attention mechanism definition',
  'retrieved_context': 'Self-attention, sometimes called intra-attention is an attention mechanism relating different positions of a single sequence in order to compute a representation of the sequence. [cite:463398441214279686]\nIn a self-attention layer all of the keys, values, and queries come from the same place, specifically the output of the previous layer, allowing each position to attend to all positions in the previous layer. [cite:463398441214279690]\nThe Transformer is the first transduction model relying entirely on self-attention to compute representations of its input and output without using sequence-aligned RNNs or convolution. [cite:463398441214279686]',
  'citations': ['463398441214279686', '463398441214279690'],
  'synthesized_answer': 'Self-attention, also called intra-attention, relates different positions within a single sequence to compute contextualized representations; queries, keys, and values are all deriv