In [10]:
from dotenv import load_dotenv
import os

# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

openai_api_key = os.environ.get("OPENAI_API_KEY")

# Guard clause: stop immediately when the key is missing or empty.
if not openai_api_key:
    raise ValueError("OPENAI_API_KEY is not set in the environment variables.")

# Only confirm presence; the key itself is never printed.
print("OPENAI_API_KEY is set.")

OPENAI_API_KEY is set.


In [11]:
def read_markdown_file(filepath):
    """Read a Markdown file and return its full content as a string.

    Failures are raised instead of being returned as text: the callers in
    this notebook use the return value directly as prompt content, so an
    error message returned as a string would silently become the prompt.

    Args:
        filepath: Path of the file to read; it is opened as UTF-8 text.

    Returns:
        str: The complete content of the file.

    Raises:
        FileNotFoundError: If no file exists at ``filepath``.
        OSError: For any other I/O failure raised by ``open``/``read``.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return f.read()
    except FileNotFoundError as exc:
        # Keep the original, path-specific wording but surface it as an exception.
        raise FileNotFoundError(f"The file at {filepath} was not found.") from exc

In [12]:
from langchain.chat_models import init_chat_model
from deepagents import create_deep_agent

  from pydantic.v1.fields import FieldInfo as FieldInfoV1


**RAG.**
Build the RAG chain and define a retrieval function that the deep agent will use as a tool.

In [13]:
# Source document for the RAG index (the value is a URL pointing at a PDF).
FILE_PATH = "https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"

# Value passed as "k" in the retriever's search_kwargs.
TOP_K = 3

# Sample retrieval queries.
queries = [
    "mechanics of scaled dot product attention",
    "key aspects of multi head attention",
]

In [14]:
from pathlib import Path
from tempfile import mkdtemp

import tiktoken
from docling_core.transforms.chunker.tokenizer.openai import OpenAITokenizer
from docling.chunking import HybridChunker
from langchain_docling import DoclingLoader

from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_milvus import Milvus

from langchain_core.prompts import PromptTemplate
from langchain_classic.chains import create_retrieval_chain
from langchain_classic.chains.combine_documents import create_stuff_documents_chain

# Embedding model used by the Milvus vector store built below.
embedding = OpenAIEmbeddings(
    model="text-embedding-3-large",
)

# Token counter handed to the chunker.
# NOTE(review): the 128k budget reads like a chat-model context window, while
# the chunks are embedded with text-embedding-3-large — confirm the intended limit.
enc = tiktoken.get_encoding("cl100k_base")
tokenizer = OpenAITokenizer(
    tokenizer=enc,
    max_tokens=128 * 1024,  # set to the model's context window
)

# Load FILE_PATH through Docling and split it with the hybrid chunker.
loader = DoclingLoader(file_path=FILE_PATH, chunker=HybridChunker(tokenizer=tokenizer))
docs = loader.load()

# Milvus database file inside a fresh temporary directory
# (a new directory is created on every run and is not removed here).
milvus_uri = str(Path(mkdtemp()) / "vector.db")

vectorstore = Milvus.from_documents(
    documents=docs,
    embedding=embedding,
    collection_name="vectordb",
    connection_args={"uri": milvus_uri},
    index_params={"index_type": "FLAT"},
    drop_old=True,  # rebuild the collection instead of appending to an old one
)

retriever = vectorstore.as_retriever(search_kwargs={"k": TOP_K})

# Deliberately NOT named `PROMPT`: a later cell binds `PROMPT` to the
# orchestrator's system prompt (a plain string) and passes it to the agent.
# Sharing the name meant that re-running this cell swapped a PromptTemplate
# into that slot.
retriever_prompt = PromptTemplate.from_template(read_markdown_file("../prompts/retriever_prompt.md"))

llm = ChatOpenAI(model="gpt-4o", temperature=0)

# Answer-generation chain over the retrieved documents, wrapped with the retriever.
question_answer_chain = create_stuff_documents_chain(llm, retriever_prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

2026-01-07 18:12:44,583 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2026-01-07 18:12:44,674 - INFO - Going to convert document batch...
2026-01-07 18:12:44,675 - INFO - Initializing pipeline for StandardPdfPipeline with options hash e15bc6f248154cc62f8db15ef18a8ab7
2026-01-07 18:12:44,683 - INFO - Loading plugin 'docling_defaults'
2026-01-07 18:12:44,685 - INFO - Registered picture descriptions: ['vlm', 'api']
2026-01-07 18:12:44,691 - INFO - Loading plugin 'docling_defaults'
2026-01-07 18:12:44,695 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2026-01-07 18:12:45,563 - INFO - Auto OCR model selected ocrmac.
2026-01-07 18:12:45,570 - INFO - Loading plugin 'docling_defaults'
2026-01-07 18:12:45,572 - INFO - Registered layout engines: ['docling_layout_default', 'docling_experimental_table_crops_layout']
2026-01-07 18:12:45,577 - INFO - Accelerator device: 'mps'
2026-01-07 18:12:47,129 - INFO - Loading plugin 'docling_defaul

In [15]:
from langchain.tools import tool

@tool
def retrieve_from_vectorstore(queries: list[str]) -> dict:
    """Retrieve and generate answers from the vectorstore for a list of queries.
    This function invokes the RAG chain for each query and returns aggregated results."""
    # NOTE(review): the docstring wording is left untouched because `@tool`
    # presumably exposes it to the model as the tool description — confirm.
    #
    # One RAG-chain call per query, keyed by the query text; a repeated query
    # keeps only the result of its last call.
    return {
        sub_query: rag_chain.invoke({"input": sub_query})
        for sub_query in queries
    }

**Prepare settings for subagents**

In [16]:
# Worked examples that are substituted into the query-decomposition prompt.
examples = read_markdown_file(
    "../prompts/query_decomposition_examples.md"
)
# Cell output: preview of the first 100 characters.
examples[:100]

'Example 1 single information need\nInput query:\nHow does dropout prevent overfitting in neural networ'

In [17]:
# Raw template text; it is rendered below with `examples`.
query_decomposition_template = read_markdown_file("../prompts/query_decomposition_prompt.md")

# Store the RENDERED prompt under the name the agent cell passes as the
# sub-agent's system prompt. Previously the rendered text was only displayed,
# so the sub-agent received the template with its placeholder unfilled.
# The template is re-read on every run, so re-running this cell never
# formats already-rendered text.
query_decomposition_prompt = query_decomposition_template.format(examples=examples)

# Cell output: preview of the first 500 characters of the rendered prompt.
query_decomposition_prompt[:500]

'You are an academic query decomposition and refinement agent for vector retrieval.\nYou will receive a query from a user which may contain single or multiple distinct information needs.\n\nHere are the examples which are done well:\nExample 1 single information need\nInput query:\nHow does dropout prevent overfitting in neural networks\nOutput:\n{"sub_queries":["dropout regularization reducing neural network overfitting"]}\n\nExample 2 two distinct information needs\nInput query:\neffects of microplastics o'

In [18]:
# Settings for the query-decomposition sub-agent: model id and the
# description string registered for it.
query_decomposition_model = "gpt-5-mini"
query_decomposition_description = (
    "Analyzes research queries and generates optimized, non-overlapping sub-queries for vector database retrieval."
)

In [19]:
# System prompt for the sub-agent that runs the vectorstore retrieval tool.
vectorstore_retrieval_prompt = read_markdown_file(
    "../prompts/vectorstore_retrieval_prompt.md"
)
# Cell output: preview of the first 500 characters.
vectorstore_retrieval_prompt[:500]

'You are a vectorstore retrieval orchestration agent. Your primary responsibility is to retrieve academic context from a vector database using refined sub-queries.\n\nTask:\nYou will receive one or more sub-queries from the query decomposition agent. Your role is to use the retrieve_from_vectorstore tool to fetch relevant academic papers, citations, and context for each sub-query.\n\nInstructions:\n1. Accept the list of sub-queries provided by the upstream agent.\n2. Call retrieve_from_vectorstore with '

In [20]:
# Settings for the vectorstore-retrieval sub-agent: model id and the
# description string registered for it.
vectorstore_retrieval_model = "gpt-5-nano"
vectorstore_retrieval_description = (
    "Executes batch retrieval of academic context from vector database and preserves citations and metadata for structured result aggregation."
)

**Main Deep Agent**
- Decomposes user queries into subqueries
- Retrieves relevant information from the vectorstore using subqueries
- Formats and structures the retrieved context

In [21]:
from langchain.agents import create_agent
from deepagents.middleware.subagents import SubAgentMiddleware
from langchain.agents.middleware import TodoListMiddleware
from langchain.chat_models import init_chat_model

# Chat model handed to `create_agent` as the orchestrator's model.
model = init_chat_model(
    model="gpt-5-mini",
)

In [22]:
from typing_extensions import TypedDict
from typing import List, Annotated

# Schema of one per-sub-query result in the agent's structured response.
# `#` comments are used instead of class docstrings so that nothing extra is
# attached to the classes themselves.
# NOTE(review): the strings below are plain `Annotated` metadata; whether the
# consumer surfaces them as field descriptions is not visible here — confirm.
class AggregatedContext(TypedDict):
    sub_query: Annotated[
        str,
        "The sub-query string",
    ]
    retrieved_context: Annotated[
        str,
        "The retrieved context string",
    ]
    citations: Annotated[
        list[str],
        "List of citation identifiers",
    ]
    synthesized_answer: Annotated[
        str,
        "The synthesized answer string",
    ]


# Top-level structured response: a list of per-sub-query results.
class AggregatedContextList(TypedDict):
    results: Annotated[
        list[AggregatedContext],
        "List of aggregated context for each sub-query",
    ]

In [23]:
# System prompt of the main retrieval orchestrator; the agent cell passes
# `PROMPT` as `system_prompt`.
PROMPT = read_markdown_file(
    "../prompts/retrieval_orchestrator_prompt.md"
)
# Cell output: preview of the first 500 characters.
PROMPT[:500]

'You are the main retrieval orchestrator for academic question answering. You coordinate sub-agents for query decomposition and vectorstore retrieval, then deliver a concise, citation-backed response.\n\nWorkflow:\n1) Send the user query to query_decomposition_subagent. Expect an object with key sub_queries.\n2) If decomposition fails or returns no sub-queries, fall back to a single sub-query equal to the original user query.\n3) Send the full sub_queries list to vectorstore_retrieval_subagent (retrie'

In [24]:
# System prompt given to the to-do list middleware.
todo_list_prompt = read_markdown_file(
    "../prompts/todo_list_prompt.md"
)
# Cell output: preview of the first 500 characters.
todo_list_prompt[:500]

"Always use the write_todos tool to break down the user's academic query into the following sequential steps:\n        \n1. DECOMPOSE: Analyze the user's query and identify distinct information needs. Send to query_decomposition_subagent to generate refined sub-queries.\n2. VALIDATE: Ensure the decomposed sub-queries are non-overlapping and atomic (each addresses one concept/aspect).\n3. RETRIEVE: Pass all sub-queries to vectorstore_retrieval_subagent in a single batch to fetch relevant academic cont"

In [25]:
# Middleware are built in the same order in which they are listed below.
todo_middleware = TodoListMiddleware(
    system_prompt=todo_list_prompt,
)

# Sub-agent without tools of its own: turns the user query into sub-queries.
query_decomposition_subagent = {
    "name": "query_decomposition_subagent",
    "description": query_decomposition_description,
    "system_prompt": query_decomposition_prompt,
    "model": query_decomposition_model,
}

# Sub-agent that owns the retrieval tool.
vectorstore_retrieval_subagent = {
    "name": "vectorstore_retrieval_subagent",
    "description": vectorstore_retrieval_description,
    "system_prompt": vectorstore_retrieval_prompt,
    "tools": [retrieve_from_vectorstore],
    "model": vectorstore_retrieval_model,
}

subagent_middleware = SubAgentMiddleware(
    default_model="gpt-4o",
    default_tools=[],
    subagents=[
        query_decomposition_subagent,
        vectorstore_retrieval_subagent,
    ],
)

# Orchestrator agent: to-do planning + sub-agent delegation, with the final
# answer requested in the AggregatedContextList shape.
agent = create_agent(
    model=model,
    system_prompt=PROMPT,
    middleware=[todo_middleware, subagent_middleware],
    response_format=AggregatedContextList,
)

In [26]:
# Single-turn user question sent to the orchestrator agent.
user_question = "What is the self attention mechanism and how does it work in transformer models?"
result = agent.invoke(
    {"messages": [{"role": "user", "content": user_question}]}
)

2026-01-07 18:13:23,794 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2026-01-07 18:13:26,864 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2026-01-07 18:13:34,717 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2026-01-07 18:13:37,819 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2026-01-07 18:13:42,018 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2026-01-07 18:13:45,909 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2026-01-07 18:13:47,958 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2026-01-07 18:13:54,826 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2026-01-07 18:13:56,050 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200

In [27]:
# Cell output: the per-sub-query entries of the agent's structured response.
result["structured_response"]["results"]

[{'sub_query': 'self attention mechanism definition purpose',
  'retrieved_context': 'Vaswani et al., "Attention Is All You Need" (Transformer paper) â€” sections 2 Background, 3.2.3 Applications of Attention in our Model, and 4 Why Self-Attention describe self-attention (intra-attention) as an attention mechanism that relates different positions of a single sequence to compute a sequence representation; they motivate it by reduced path length for long-range dependencies, parallelizability, and interpretability versus RNNs/convolutions.',
  'citations': ['Vaswani et al., 2017 (NIPS paper)'],
  'synthesized_answer': 'Self-attention (intra-attention) relates different positions within one sequence to compute contextualized representations, reducing the path length for long-range dependencies and enabling parallel computation across positions [Vaswani et al., 2017].'},
 {'sub_query': 'self attention computation in transformer models',
  'retrieved_context': 'Vaswani et al., "Attention Is 

In [28]:
# Print the name of every tool call attached to one message of the run.
# The loop variable is `tool_call`, not `tool`: a top-level `for tool in ...`
# rebinds the notebook-global `tool` (the decorator imported from
# `langchain.tools`), which breaks any later re-run of the `@tool` cell.
# NOTE(review): index 3 is assumed to be the message that issued the tool
# calls in this particular run — confirm if the workflow changes.
for tool_call in result['messages'][3].tool_calls:
    print(tool_call['name'])

task


In [29]:
from rich.console import Console
from rich.panel import Panel
from rich.text import Text
from rich.table import Table

console = Console()

# Horizontal rule printed above and below each section banner.
_RULE = "=" * 80


def _shorten(text, limit):
    """Return ``text`` cut to ``limit`` characters, with "..." appended when it was cut."""
    return text[:limit] + "..." if len(text) > limit else text


def _message_text(msg):
    """Return the text to display for one message of the agent transcript.

    Uses ``msg.content`` when the attribute exists, otherwise ``str(msg)``.
    Non-string content is stringified so the string operations below cannot
    fail. When the text is blank, the names of ALL tool calls on the message
    are listed; a message with blank content and no tool calls yields just
    the "Tool Calls: " label instead of raising.
    """
    content = msg.content if hasattr(msg, "content") else str(msg)
    if not isinstance(content, str):
        content = str(content)
    if content.strip():
        return content
    tool_calls = getattr(msg, "tool_calls", None) or []
    return "Tool Calls: " + ", ".join(call["name"] for call in tool_calls)


console.print("\n" + _RULE)
console.print(Panel.fit("🤖 Agent Execution Summary", style="bold cyan"))
console.print(_RULE + "\n")

# One entry per message: a short header line followed by the full text.
for step, msg in enumerate(result["messages"], 1):
    msg_content = _message_text(msg)
    header = Text(f"Step {step}: ", style="bold yellow")
    header.append(_shorten(msg_content, 50), style="cyan")
    console.print(header)
    console.print(Text(msg_content, style="white dim"))
    console.print()

# Display structured results
console.print("\n" + _RULE)
console.print(Panel.fit("📊 Retrieved Context & Answers", style="bold green"))
console.print(_RULE + "\n")

# A missing or empty structured response simply prints nothing below.
structured_response = result.get("structured_response") or {}
for idx, item in enumerate(structured_response.get("results") or [], 1):
    # Sub-query panel
    console.print(Panel(
        Text(item.get("sub_query", "N/A"), style="bold white"),
        title=f"Sub-Query {idx}",
        border_style="cyan",
        expand=False
    ))

    # Synthesized answer
    if item.get("synthesized_answer"):
        console.print(Text("Answer:", style="bold green"))
        console.print(Text(item["synthesized_answer"], style="green"))

    # Citations
    if item.get("citations"):
        console.print(Text("\nCitations:", style="bold magenta"))
        for citation in item["citations"]:
            console.print(Text(f"  • {citation}", style="magenta"))

    # Context preview (first 200 characters)
    if item.get("retrieved_context"):
        console.print(Text("\nContext Preview:", style="bold yellow"))
        console.print(Text(_shorten(item["retrieved_context"], 200), style="yellow dim"))

    console.print("\n")
