In [1]:
# Central Configuration Dictionary to manage all system parameters
config = {
    "data_dir": "./data", # Directory to store raw and cleaned data
    "vector_store_dir": "./vector_store", # Directory to persist our vector store
    "llm_provider": "openai", # The LLM provider we are using
    "reasoning_llm":"gpt-4o", # The powerful model for planning and synthesis
    "fast_llm": "gpt-4o-mini", # A faster, cheaper model for simpler tasks like the baseline RAG
    "embedding_model": "text-embedding-3-small", # The model for creating document embeddings
    "reranker_model":"cross-encoder/ms-marco-MiniLM-L-6-v2", # The model for precision reranking
    "max_reasoning_iterations": 7, # A safeguard to prevent the agent from getting into an infinite loop
    "top_k_retrieval": 10, # Number of documents for initial broad recall
    "top_n_rerank": 3, # Number of documents to keep after precision reranking
}

In [2]:
import os # For interacting with the operating system (e.g., managing environment variables)
import re # For regular expression operations, useful for text cleaning
import json # For working with JSON data
from getpass import getpass # To securely prompt for user input like API keys without echoing to the screen
from pprint import pprint # For pretty-printing Python objects, making them more readable
import uuid # To generate unique identifiers
from typing import List, Dict, TypedDict, Literal, Optional # For type hinting to create clean, readable, and maintainable code

# Helper function to securely set environment variables if they are not already present
def _set_env(var: str):
    # Check if the environment variable is not already set
    if not os.environ.get(var): # If not, prompt the user to enter it securely
        os.environ[var] = getpass(f"Enter your{var}: ")

# Set the API keys for the services we will use
_set_env("OPENAI_API_KEY")
# For accessing OpenAI models (GPT-4o, embeddings)
_set_env("LANGSMITH_API_KEY")
# For tracing and debugging with LangSmith
_set_env("TAVILY_API_KEY") # For the web search tool
# Enable LangSmith tracing to get detailed logs and visualizations of our agent's execution
os.environ["LANGSMITH_TRACING"] = "true"
# Define a project name in LangSmith to organize our runs
os.environ["LANGSMITH_PROJECT"] = "Advanced-Deep-Thinking-RAG"

In [77]:
import os
import re
from bs4 import BeautifulSoup

def parse_local_10k(html_path, clean_path):
    """
    Parse a locally downloaded SEC 10-K iXBRL HTML file and extract cleaned human-readable text.
    """

    print(f"Parsing local 10-K file: {html_path}")

    with open(html_path, "r", encoding="utf-8") as f:
        html = f.read()

    soup = BeautifulSoup(html, "html.parser")

    # 1) 去掉明显不需要的标签
    for tag in soup(["script", "style", "noscript", "header", "footer"]):
        tag.decompose()

    # 2) 去掉带命名空间的标签 (iXBRL: ix:, xbrli:, us-gaap: 等)
    #    BeautifulSoup 会把它们当成名字里带冒号的 tag
    for tag in soup.find_all():
        if ":" in tag.name:
            tag.decompose()

    # 3) 从常见容器中抽取文本
    text_parts = []
    for tag in soup.find_all(["p", "div", "li", "td"]):
        t = tag.get_text(" ", strip=True)
        if not t:
            continue

        # 过滤明显是 XBRL / 垃圾行
        if any(bad in t for bad in ["us-gaap:", "xbrli:", "iso4217:", "dei:"]):
            continue
        if len(t) < 5:
            continue

        text_parts.append(t)

    # 4) 合并 & 清洗
    text = "\n\n".join(text_parts)

    # 连续空行压缩
    clean_text = re.sub(r"\n{3,}", "\n\n", text)
    # 多空格压缩
    clean_text = re.sub(r"[ \t]{2,}", " ", clean_text).strip()

    os.makedirs(os.path.dirname(clean_path), exist_ok=True)
    with open(clean_path, "w", encoding="utf-8") as f:
        f.write(clean_text)

    print(f"✅ Cleaned text saved to: {clean_path}")
    print("\n--- Preview ---\n")
    print(clean_text[:1000] + "...")


In [78]:
import os

# 定义数据文件夹
data_dir = os.path.join(os.getcwd(), "data")
os.makedirs(data_dir, exist_ok=True)  # 如果不存在则自动创建

# 三个变量定义
url_10k = "https://www.sec.gov/Archives/edgar/data/1045810/000104581023000017/nvda-20230129.htm"
doc_path_raw = os.path.join(data_dir, "nvda_10k_raw.html")
doc_path_clean = os.path.join(data_dir, "nvda_10k_clean.txt")

print("当前工作目录：", os.getcwd())
print("原始文件路径：", doc_path_raw)
print("清洗文件路径：", doc_path_clean)



当前工作目录： c:\Users\22959\Documents\WXSK_Works\Agentic RAG Project
原始文件路径： c:\Users\22959\Documents\WXSK_Works\Agentic RAG Project\data\nvda_10k_raw.html
清洗文件路径： c:\Users\22959\Documents\WXSK_Works\Agentic RAG Project\data\nvda_10k_clean.txt


In [79]:
data_dir = os.path.join(os.getcwd(), "data")
html_path = os.path.join(data_dir, "nvda_20230129.htm")
clean_path = os.path.join(data_dir, "nvda_10k_clean.txt")

parse_local_10k(html_path, clean_path)

Parsing local 10-K file: c:\Users\22959\Documents\WXSK_Works\Agentic RAG Project\data\nvda_20230129.htm
✅ Cleaned text saved to: c:\Users\22959\Documents\WXSK_Works\Agentic RAG Project\data\nvda_10k_clean.txt

--- Preview ---

Table of Contents

Table of Contents

UNITED STATES

SECURITIES AND EXCHANGE COMMISSION

Washington, D.C. 20549

____________________________________________________________________________________________

ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934

ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934

For the fiscal year ended

TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934

TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934

Commission file number:

ORATION

(Exact name of registrant as specified in its charter) (State or other jurisdiction of (I.R.S. Employer Incorporation or Organization) Identification No.

In [92]:
complex_query = (
    "Based on NVIDIA's 2023 10-K filing, identify their key risks related to competition. "
    "Then, find recent news (post-filing, from 2024) about AMD's AI chip strategy and explain "
    "how this new strategy directly addresses or exacerbates one of NVIDIA's stated risks."
)

In [93]:
from langchain_community.document_loaders import TextLoader  # 文本加载器
from langchain_text_splitters import RecursiveCharacterTextSplitter  # 文本切分器


print("Loading and chunking the document...")

# Initialize the loader with the path to our cleaned 10-K file
loader = TextLoader(doc_path_clean, encoding='utf-8')

# Load the document into memory
documents = loader.load()

# Initialize the text splitter with a defined chunk size and overlap
# chunk_size=1000: Each chunk will be approximately 1000 characters long.
# chunk_overlap=150: Each chunk will share 150 characters with the previous one to maintain context.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=150
)

# Split the loaded document into smaller, manageable chunks
doc_chunks = text_splitter.split_documents(documents)

print(f"Document loaded and split into {len(doc_chunks)} chunks.")


Loading and chunking the document...
Document loaded and split into 359 chunks.


In [94]:
from langchain_community.vectorstores import Chroma  # The vector store we will use
from langchain_openai import OpenAIEmbeddings  # The function to create embeddings

print("Creating baseline vector store...")

# Initialize the embedding function using the model specified in our config
embedding_function = OpenAIEmbeddings(model=config['embedding_model'])

# Create the Chroma vector store from our document chunks
# This process takes each chunk, creates an embedding for it, and indexes it.
baseline_vector_store = Chroma.from_documents(
    documents=doc_chunks,
    embedding=embedding_function
)

# Create a retriever from the vector store
# The retriever is the component that will actually perform the search.
# search_kwargs={"k": 3}: This tells the retriever to return the top 3 most relevant chunks for any given query.
baseline_retriever = baseline_vector_store.as_retriever(search_kwargs={"k": 3})

print(f"Vector store created with {baseline_vector_store._collection.count()} embeddings.")


Creating baseline vector store...
Vector store created with 1109 embeddings.


In [95]:
from langchain_core.prompts import ChatPromptTemplate  # For creating prompt templates
from langchain_openai import ChatOpenAI  # The OpenAI chat model interface
from langchain_core.runnables import RunnablePassthrough  # A tool to pass inputs through the chain
from langchain_core.output_parsers import StrOutputParser  # To parse the LLM's output as a simple string

# This template instructs the LLM on how to behave.
# {context}: This is where we will inject the content from our retrieved documents.
# {question}: This is where the user's original question will go.
template = """
You are an AI financial analyst. Answer the question based only on the following context:
{context}

Question: {question}
"""

# Create a prompt template from the text
prompt = ChatPromptTemplate.from_template(template)

# We use our 'fast_llm' for this simple task, as defined in our config
llm = ChatOpenAI(model=config["fast_llm"], temperature=0)

# A helper function to format the list of retrieved documents into a single string
def format_docs(docs):
    return "\n\n---\n\n".join(doc.page_content for doc in docs)

# The complete RAG chain defined using LCEL's pipe (|) syntax
baseline_rag_chain = (
    {
        "context": baseline_retriever | format_docs,
        "question": RunnablePassthrough()
    }
    # The context is generated by taking the question, passing it to the retriever, and formatting the result
    # The original question is passed through unchanged
    | prompt  # The dictionary is then passed to the prompt template
    | llm  # The formatted prompt is passed to the language model
    | StrOutputParser()  # The LLM's output message is parsed into a string
)


In [96]:
from rich.console import Console  # For pretty-printing output with markdown
from rich.markdown import Markdown

# Initialize the rich console for better output formatting
console = Console()

# Our complex, multi-hop, multi-source query
complex_query_adv = (
    "Based on NVIDIA's 2023 10-K filing, identify their key risks related to competition. "
    "Then, find recent news (post-filing, from 2024) about AMD's AI chip strategy and explain "
    "how this new strategy directly addresses or exacerbates one of NVIDIA's stated risks."
)

print("Executing complex query on the baseline RAG chain...")

# Invoke the chain with our challenging query
baseline_result = baseline_rag_chain.invoke(complex_query_adv)

console.print("\n--- BASELINE RAG FAILED OUTPUT ---")

# Print the result using markdown formatting for readability
console.print(Markdown(baseline_result))


Executing complex query on the baseline RAG chain...


In [97]:
from typing import List, Optional, Literal
from langchain_core.documents import Document
from pydantic import BaseModel, Field


# Pydantic model for a single step in the agent's reasoning plan
class Step(BaseModel):
    # A specific, answerable sub-question for this research step
    sub_question: str = Field(
        description="A specific, answerable question for this step."
    )

    # The agent's justification for why this step is necessary
    justification: str = Field(
        description="A brief explanation of why this step is necessary to answer the main query."
    )

    # The specific tool to use for this step: either internal document search or external web search
    tool: Literal["search_10k", "search_web"] = Field(
        description="The tool to use for this step."
    )

    # A list of critical keywords to improve the accuracy of the search
    keywords: List[str] = Field(
        description="A list of critical keywords for searching relevant document sections."
    )

    # (Optional) A likely document section to perform a more targeted, filtered search within
    document_section: Optional[str] = Field(
        description=(
            "A likely document section title (e.g., 'Item 1A. Risk Factors') "
            "to search within. Only for 'search_10k' tool."
        )
    )


In [98]:
from typing import List
from pydantic import BaseModel, Field


# Pydantic model for the overall plan, which is a list of individual steps
class Plan(BaseModel):
    # A list of Step objects that outlines the full research plan
    steps: List[Step] = Field(
        description="A detailed, multi-step plan to answer the user's query."
    )


In [99]:
from typing import List, TypedDict
from langchain_core.documents import Document


# A TypedDict to store the results of a completed step in our research history
class PastStep(TypedDict):
    step_index: int  # The index of the completed step (e.g., 1, 2, 3)
    sub_question: str  # The sub-question that was addressed in this step
    retrieved_docs: List[Document]  # The precise documents retrieved and reranked for this step
    summary: str  # The agent's one-sentence summary of the findings from this step


In [100]:
from typing import List, TypedDict
from langchain_core.documents import Document


# The main state dictionary that will be passed between all nodes in our LangGraph agent
class RAGState(TypedDict):
    original_question: str  # The initial, complex query from the user that starts the process
    plan: Plan  # The multi-step plan generated by the Planner Agent
    past_steps: List[PastStep]  # A cumulative history of completed research steps and their findings
    current_step_index: int  # The index of the current step in the plan being executed
    retrieved_docs: List[Document]  # Documents retrieved in the current step (results of broad recall)
    reranked_docs: List[Document]  # Documents after precision reranking in the current step
    synthesized_context: str  # The concise, distilled context generated from the reranked docs
    final_answer: str  # The final, synthesized answer to the user's original question


In [101]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from rich.pretty import pprint as rprint

# The system prompt that instructs the LLM how to behave as a planner
planner_prompt = ChatPromptTemplate.from_messages([
    (
        "system",
        """You are an expert research planner. Your task is to create a clear, multi-step plan 
        to answer a complex user query by retrieving information from multiple sources.

        You have two tools available:
        1. `search_10k`: Use this to search for information within NVIDIA's 2023 10-K financial filing. 
           This is best for historical facts, financial data, and stated company policies or risks 
           from that specific time period.
        2. `search_web`: Use this to search the public internet for recent news, competitor information, 
           or any topic that is not specific to NVIDIA's 2023 10-K.

        Decompose the user's query into a series of simple, sequential sub-questions. 
        For each step, decide which tool is more appropriate.
        For `search_10k` steps, also identify the most likely section of the 10-K 
        (e.g., 'Item 1A. Risk Factors', 'Item 7. Management's Discussion and Analysis...').
        It is critical to use the exact section titles found in a 10-K filing where possible."""
    ),
    (
        "human",
        "User Query: {question}"
    )  # The user's original, complex query
])


In [102]:
# Initialize our powerful reasoning model, as defined in the config
reasoning_llm = ChatOpenAI(model=config["reasoning_llm"], temperature=0)

# Create the planner agent by piping the prompt to the LLM and instructing it to use our structured 'Plan' output
planner_agent = planner_prompt | reasoning_llm.with_structured_output(Plan)
print("Tool-Aware Planner Agent created successfully.")

# Let's test the planner agent with our complex query to see its output
print("\n--- Testing Planner Agent ---")
test_plan = planner_agent.invoke({"question": complex_query_adv})

# Use rich's pretty print for a clean, readable display of the Pydantic object
rprint(test_plan)


Tool-Aware Planner Agent created successfully.

--- Testing Planner Agent ---


In [103]:
from langchain_core.output_parsers import StrOutputParser  # To parse the LLM's output as a simple string
from langchain_core.prompts import ChatPromptTemplate

# The prompt for our query rewriter, instructing it to act as a search expert
query_rewriter_prompt = ChatPromptTemplate.from_messages([
    (
        "system",
        """You are a search query optimization expert. Your task is to rewrite a given sub-question
        into a highly effective search query for a vector database or web search engine, using keywords 
        and context from the research plan.

        The rewritten query should be specific, use terminology likely to be found in the target source 
        (a financial 10-K or news articles), and be structured to retrieve the most relevant text snippets."""
    ),
    (
        "human",
        "Current sub-question: {sub_question}\n\n"
        "Relevant keywords from plan: {keywords}\n\n"
        "Context from past steps:\n{past_context}"
    )
])


In [104]:
# Create the agent by piping the prompt to our reasoning LLM and a string output parser
query_rewriter_agent = query_rewriter_prompt | reasoning_llm | StrOutputParser()
print("Query Rewriter Agent created successfully.")

# Let's test the rewriter agent. We'll pretend we've already completed the first two steps of our plan.
print("\n--- Testing Query Rewriter Agent ---")

# Test sub-question: final synthesis-style query
test_sub_q = (
    "How does AMD's 2024 AI chip strategy potentially exacerbate the competitive "
    "risks identified in NVIDIA's 10-K?"
)

# Relevant keywords from the hypothetical plan
test_keywords = [
    "impact",
    "threaten",
    "competitive pressure",
    "market share",
    "technological change",
]

# Mock past context, simulating prior steps' findings
test_past_context = (
    "Step 1 Summary: NVIDIA's 10-K lists intense competition and rapid technological "
    "change as key risks. "
    "Step 2 Summary: AMD launched its MI300X AI accelerator in 2024 to directly compete "
    "with NVIDIA's H100."
)

# Invoke the agent with our test data
rewritten_q = query_rewriter_agent.invoke(
    {
        "sub_question": test_sub_q,
        "keywords": test_keywords,
        "past_context": test_past_context,
    }
)

print(f"Original sub-question: {test_sub_q}")
print(f"Rewritten Search Query: {rewritten_q}")


Query Rewriter Agent created successfully.

--- Testing Query Rewriter Agent ---
Original sub-question: How does AMD's 2024 AI chip strategy potentially exacerbate the competitive risks identified in NVIDIA's 10-K?
Rewritten Search Query: "AMD 2024 AI chip strategy impact on NVIDIA competitive risks 10-K, MI300X vs H100, market share threat, technological change"


In [109]:
import re

raw_text = documents[0].page_content

# 只匹配独立一行的 ITEM 标题：
# - 行首可有空格
# - ITEM + 数字(+可选字母) + '.' 
# - 后面标题内容不包含句号，一直到行尾（防止把整句正文吃进来）
section_header_pattern = re.compile(
    r"""
    ^\s*                                  # 行首空格
    ITEM\s+(?P<num>\d+[A-Z]?)\.\s+        # ITEM 1. / ITEM 1A.
    (?P<title>[^.\n]{1,200}?)\s*          # 标题：无句号，不跨行，长度限制防止乱飙
    $                                     # 必须到行尾结束
    """,
    re.IGNORECASE | re.MULTILINE | re.VERBOSE
)

matches = list(section_header_pattern.finditer(raw_text))

section_titles = []
sections_content = []

for i, m in enumerate(matches):
    num = m.group("num").upper()
    title = m.group("title").strip()

    clean_title = f"ITEM {num}. {title}"
    section_titles.append(clean_title)

    start = m.end()
    end = matches[i + 1].start() if i + 1 < len(matches) else len(raw_text)
    content = raw_text[start:end].strip()
    sections_content.append(content)

print(f"Identified {len(section_titles)} document sections.")
print(f"Section content blocks: {len(sections_content)}")
for t in section_titles:
    print(f"- {t}")

assert len(section_titles) == len(sections_content), "Mismatch between titles and content sections"


Identified 22 document sections.
Section content blocks: 22
- ITEM 1. BUSINESS
- ITEM 1A. RISK FACTORS
- ITEM 1B. UNRESOLVED STAFF COMMENTS
- ITEM 2. PROPERTIES
- ITEM 3. LEGAL PROCEEDINGS
- ITEM 4. MINE SAFETY DISCLOSURES
- ITEM 5. MARKET FOR REGISTRANT’S COMMON EQUITY, RELATED STOCKHOLDER MATTERS AND ISSUER PURCHASES OF EQUITY SECURITIES
- ITEM 6. [RESERVED]
- ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS
- ITEM 7A. QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK
- ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY DATA
- ITEM 9. CHANGES IN AND DISAGREEMENTS WITH ACCOUNTANTS ON ACCOUNTING AND FINANCIAL DISCLOSURE
- ITEM 9A. CONTROLS AND PROCEDURES
- ITEM 9B. OTHER INFORMATION
- ITEM 9C. DISCLOSURE REGARDING FOREIGN JURISDICTIONS THAT PREVENT INSPECTIONS
- ITEM 10. DIRECTORS, EXECUTIVE OFFICERS AND CORPORATE GOVERNANCE
- ITEM 11. EXECUTIVE COMPENSATION
- ITEM 12. SECURITY OWNERSHIP OF CERTAIN BENEFICIAL OWNERS AND MANAGEMENT AND R

In [111]:
import uuid  # We'll use this to give each chunk a unique ID, which is good practice

# This list will hold our new, metadata-rich document chunks
doc_chunks_with_metadata = []

# Loop through each section's content along with its title using enumerate
for i, content in enumerate(sections_content):
    # Get the corresponding title for the current content block
    section_title = section_titles[i]

    # Use the same text splitter as before, but this time, we run it ONLY on the content of the current section
    section_chunks = text_splitter.split_text(content)

    # Now, loop through the smaller chunks created from this one section
    for chunk in section_chunks:
        # Generate a unique ID for this specific chunk
        chunk_id = str(uuid.uuid4())

        # Create a new LangChain Document object for the chunk
        doc_chunks_with_metadata.append(
            Document(
                page_content=chunk,
                # This is the most important part: we attach the metadata
                metadata={
                    "section": section_title,       # The section this chunk belongs to
                    "source_doc": doc_path_clean,   # Where the document came from
                    "id": chunk_id                  # The unique ID for this chunk
                }
            )
        )

print(f"Created {len(doc_chunks_with_metadata)} chunks with section metadata.")
print("\n--- Sample Chunk with Metadata ---")

# To prove it worked, let's find a chunk that we know should be in the 'Risk Factors' section and print it
sample_chunk = next(
    c
    for c in doc_chunks_with_metadata
    if "risk factors" in c.metadata.get("section", "").lower()
)
print(sample_chunk)


Created 368 chunks with section metadata.

--- Sample Chunk with Metadata ---
page_content='In evaluating NVIDIA, the following risk factors should be considered in addition to the other information in this Annual Report on Form 10-K. Purchasing or owning NVIDIA common stock involves investment risks including, but not limited to, the risks described below. Any one of the following risks could harm our business, financial condition, results of operations or reputation, which could cause our stock price to decline, and you may lose all or a part of your investment. Additional risks, trends and uncertainties not presently known to us or that we currently believe are immaterial may also harm our business, financial condition, results of operations or reputation.

Risk Factors Summary

Risks Related to Our Industry and Markets

• Failure to meet the evolving needs of our industry and markets may adversely impact our financial results.

Table of Contents

Table of Contents

• Competition in

In [112]:
from typing import Literal
from pydantic import BaseModel


class RetrievalDecision(BaseModel):
    # The chosen retrieval strategy. Must be one of these three options.
    strategy: Literal["vector_search", "keyword_search", "hybrid_search"]
    # The agent's justification for its choice.
    justification: str


In [113]:
retrieval_supervisor_prompt = ChatPromptTemplate.from_messages([
    (
        "system",
        """You are a retrieval strategy expert. Based on the user's query, you must decide the best retrieval strategy.
You have three options:
1. `vector_search`: Best for conceptual, semantic, or similarity-based queries.
2. `keyword_search`: Best for queries with specific, exact terms, names, or codes (e.g., 'Item 1A', 'Hopper architecture').
3. `hybrid_search`: A good default that combines both, but may be less precise than a targeted strategy."""
    ),
    (
        "human",
        "User Query: {sub_question}"  # The rewritten search query will be passed here.
    ),
])


In [114]:
# Create the agent by piping our prompt to the reasoning LLM and structuring its output with our Pydantic class
retrieval_supervisor_agent = retrieval_supervisor_prompt | reasoning_llm.with_structured_output(RetrievalDecision)
print("Retrieval Supervisor Agent created.")

# Let's test it with two different types of queries to see how it behaves
print("\n--- Testing Retrieval Supervisor Agent ---")

query1 = "revenue growth for the Compute & Networking segment in fiscal year 2023"
decision1 = retrieval_supervisor_agent.invoke({"sub_question": query1})
print(f"Query: '{query1}'")
print(f"Decision: {decision1.strategy}, Justification: {decision1.justification}")

query2 = "general sentiment about market competition and technological innovation"
decision2 = retrieval_supervisor_agent.invoke({"sub_question": query2})
print(f"\nQuery: '{query2}'")
print(f"Decision: {decision2.strategy}, Justification: {decision2.justification}")


Retrieval Supervisor Agent created.

--- Testing Retrieval Supervisor Agent ---
Query: 'revenue growth for the Compute & Networking segment in fiscal year 2023'
Decision: keyword_search, Justification: The query is looking for specific information related to 'revenue growth', 'Compute & Networking segment', and 'fiscal year 2023'. These are precise terms and likely to be found in structured data or reports, making keyword search the most effective strategy.

Query: 'general sentiment about market competition and technological innovation'
Decision: vector_search, Justification: The query is conceptual and seeks an understanding of general sentiment, which involves interpreting the nuances of market competition and technological innovation. Vector search is best suited for capturing the semantic meaning and context of such broad topics.


In [116]:
import numpy as np  # A fundamental library for numerical operations in Python
from rank_bm25 import BM25Okapi  # The library for implementing the BM25 keyword search algorithm

print("Creating advanced vector store with metadata...")
# We create a new Chroma vector store, this time using our metadata-rich chunks
advanced_vector_store = Chroma.from_documents(
    documents=doc_chunks_with_metadata,
    embedding=embedding_function)
print(f"Advanced vector store created with {advanced_vector_store._collection.count()} embeddings.")

Creating advanced vector store with metadata...
Advanced vector store created with 1477 embeddings.


In [117]:
print("\nBuilding BM25 index for keyword search...")
# Create a list where each element is a list of words from a document
tokenized_corpus = [doc.page_content.split(" ") for doc in doc_chunks_with_metadata]
# Create a list of all unique document IDs
doc_ids = [doc.metadata["id"] for doc in doc_chunks_with_metadata]
# Create a mapping from a document's ID back to the full Document object for easy lookup
doc_map = {doc.metadata["id"]: doc for doc in doc_chunks_with_metadata}
# Initialize the BM25Okapi index with our tokenized corpus
bm25 = BM25Okapi(tokenized_corpus)


Building BM25 index for keyword search...


In [118]:
# Strategy 1: Pure Vector Search with Metadata Filtering
def vector_search_only(query: str, section_filter: str = None, k: int = 10):
    # This dictionary defines the metadata filter. ChromaDB will only search documents that match this.
    filter_dict = {"section": section_filter} if section_filter and "Unknown" not in section_filter else None
    # Perform the similarity search with the optional filter
    return advanced_vector_store.similarity_search(query, k=k, filter=filter_dict)

# Strategy 2: Pure Keyword Search (BM25)
def bm25_search_only(query: str, k: int = 10):
    # Tokenize the incoming query
    tokenized_query = query.split(" ")
    # Get the BM25 scores for the query against all documents in the corpus
    bm25_scores = bm25.get_scores(tokenized_query)
    # Get the indices of the top k documents
    top_k_indices = np.argsort(bm25_scores)[::-1][:k]
    # Use our doc_map to return the full Document objects for the top results
    return [doc_map[doc_ids[i]] for i in top_k_indices]

# Strategy 3: Hybrid Search with Reciprocal Rank Fusion (RRF)
def hybrid_search(query: str, section_filter: str = None, k: int = 10):
    # 1. Perform a keyword search
    bm25_docs = bm25_search_only(query, k=k)
    # 2. Perform a semantic search with the metadata filter
    semantic_docs = vector_search_only(query, section_filter=section_filter, k=k)
    # 3. Combine and re-rank the results using Reciprocal Rank Fusion (RRF)
    # Get a unique set of all documents found by either search method
    all_docs = {doc.metadata["id"]: doc for doc in bm25_docs + semantic_docs}.values()
    # Create lists of just the document IDs from each search result
    ranked_lists = [[doc.metadata["id"] for doc in bm25_docs], [doc.metadata["id"] for doc in semantic_docs]]
    
    # Initialize a dictionary to store the RRF scores for each document
    rrf_scores = {}
    # Loop through each ranked list (BM25 and Semantic)
    for doc_list in ranked_lists:
        # Loop through each document ID in the list with its rank (i)
        for i, doc_id in enumerate(doc_list):
            if doc_id not in rrf_scores:
                rrf_scores[doc_id] = 0
            # The RRF formula: add 1 / (rank + k) to the score. We use k=61 as a standard default.
            rrf_scores[doc_id] += 1 / (i + 61)
    
    # Sort the document IDs based on their final RRF scores in descending order
    sorted_doc_ids = sorted(rrf_scores.keys(), key=lambda x: rrf_scores[x], reverse=True)
    # Return the top k Document objects based on the fused ranking
    final_docs = [doc_map[doc_id] for doc_id in sorted_doc_ids[:k]]
    return final_docs

print("\nAll retrieval strategy functions ready.")


All retrieval strategy functions ready.


In [126]:
# Test Keyword Search to see if it can precisely find a specific section
print("\n--- Testing Keyword Search ---")
test_query = "Item 1A. Risk Factors"
test_results = bm25_search_only(test_query)
print(f"Query: {test_query}")
print(f"Found {len(test_results)} documents.")
for res in test_results:
    print(f"- {res.metadata['section']} (ID: {res.metadata['id']})")


--- Testing Keyword Search ---
Query: Item 1A. Risk Factors
Found 10 documents.
- ITEM 1. BUSINESS (ID: 8c9add87-8b66-4dcd-a4e0-b6d840f015c8)
- ITEM 1. BUSINESS (ID: 4e6a4237-df16-4170-afe9-2c067dc31e4d)
- ITEM 1. BUSINESS (ID: d88cde03-bcdc-46e3-8d77-21cd4408b5ba)
- ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS (ID: c75084c8-a1d6-4364-9e62-e6b95df4fd62)
- ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS (ID: 79d09d22-bd91-47ba-b626-1221b29bed00)
- ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY DATA (ID: 8bb72dc4-9565-41b6-9e41-7a09412b80b6)
- ITEM 1A. RISK FACTORS (ID: 672b09ea-2424-477a-9125-a2bd92e578c1)
- ITEM 3. LEGAL PROCEEDINGS (ID: 69a94f4d-3e86-441c-b19f-ddb6b214f139)
- ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS (ID: 6c9eb122-a9e9-496a-bcdd-3818fa294ed5)
- ITEM 5. MARKET FOR REGISTRANT’S COMMON EQUITY, RELATED STOCKHOLDER MATTERS AND 

In [129]:
from sentence_transformers import CrossEncoder  # The library for using cross-encoder models

print("Initializing CrossEncoder reranker...")
# Initialize the CrossEncoder model using the name from our central config dictionary.
# The library will automatically download the model from the Hugging Face Hub if it's not cached.
reranker = CrossEncoder(config["reranker_model"])

Initializing CrossEncoder reranker...


In [130]:
def rerank_documents_function(query: str, documents: List[Document]) -> List[Document]:
    # If we have no documents to rerank, return an empty list immediately.
    if not documents:
        return []

    # Create the pairs of [query, document_content] that the cross-encoder needs.
    pairs = [(query, doc.page_content) for doc in documents]

    # Use the reranker to predict a relevance score for each pair. This returns a list of scores.
    scores = reranker.predict(pairs)

    # Combine the original documents with their new scores.
    doc_scores = list(zip(documents, scores))

    # Sort the list of (document, score) tuples in descending order based on the score.
    doc_scores.sort(key=lambda x: x[1], reverse=True)

    # Extract just the Document objects from the top N sorted results.
    # The number of documents to keep is controlled by 'top_n_rerank' in our config.
    reranked_docs = [doc for doc, score in doc_scores[:config["top_n_rerank"]]]

    return reranked_docs

In [131]:
# The prompt for our distiller agent, instructing it to synthesize and be concise
distiller_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You are a helpful assistant. Your task is to synthesize the 
            following retrieved document snippets into a single, concise paragraph.
            The goal is to provide a clear and coherent context that directly 
            answers the question: '{question}'. Focus on removing redundant 
            information and organizing the content logically. 
            Answer only with the synthesized context."""
        ),
        (
            "human", 
            "Retrieved Documents:\n{context}"
        )  # The content of our top 3 reranked documents will be passed here
    ]
)

In [132]:
# Create the agent by piping our prompt to the reasoning LLM and a string output parser
distiller_agent = distiller_prompt | reasoning_llm | StrOutputParser()
print("Contextual Distiller Agent created.")

Contextual Distiller Agent created.


In [None]:
from langchain_tavily import TavilySearch
# Initialize the Tavily search tool.
# k=3: This parameter instructs the tool to return the top 3 most relevant search results for a given query.
web_search_tool = TavilySearch(max_results=3,)

In [137]:
def web_search_function(query: str) -> List[Document]:
    # 调用 Tavily 工具
    resp: dict = web_search_tool.invoke({"query": query})
    # resp 是一个 dict，我们从中取出 "results" 列表
    results = resp.get("results", [])

    docs: List[Document] = []
    for item in results:
        item: dict
        # Tavily 返回里通常有 content / url / title 等字段
        content = item.get("content") or item.get("raw_content") or ""
        if not content:
            continue
        url = item.get("url", "")
        docs.append(
            Document(
                page_content=content,
                metadata={"source": url} if url else {}
            )
        )

    return docs

In [138]:
print("\n--- Testing Web Search Tool ---")
test_query_web = "AMD AI chip strategy 2024"
test_results_web = web_search_function(test_query_web)
print(f"Found {len(test_results_web)} results for query: '{test_query_web}'")
if test_results_web:
    print(f"Top result snippet: {test_results_web[0].page_content[:250]}...")


--- Testing Web Search Tool ---
Found 5 results for query: 'AMD AI chip strategy 2024'
Top result snippet: Feb 3(Reuters) - AMD (AMD.O), opens new tab investors will closely examine the chip designer's artificial intelligence strategy when it reports fourth-quarter results on Tuesday as Big Tech's shift to custom silicon raises doubts about its place in t...


In [139]:
# The prompt for our reflection agent, instructing it to be concise and factual
reflection_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You are a research assistant. Based on the retrieved context 
            for the current sub-question, write a concise, one-sentence summary 
            of the key findings. This summary will be added to our research history. 
            Be factual and to the point."""
        ),
        (
            "human",
            "Current sub-question: {sub_question}\n\nDistilled context:\n{context}"
        )
    ]
)

In [140]:
# Create the agent by piping our prompt to the reasoning LLM and a string output parser
reflection_agent = reflection_prompt | reasoning_llm | StrOutputParser()
print("Reflection Agent created.")

Reflection Agent created.


In [141]:
class Decision(BaseModel):
    # The decision must be one of these two actions.
    next_action: Literal["CONTINUE_PLAN", "FINISH"]
    # The agent must justify its decision.
    justification: str

In [None]:
# The prompt for our policy agent, instructing it to act as a master strategist
policy_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You are a master strategist. Your role is to analyze the research progress 
            and decide the next action. You have the original question, the initial plan, 
            and a log of completed steps with their summaries.
            - If the collected information in the Research History is sufficient to 
            comprehensively answer the Original Question, decide to FINISH.
            - Otherwise, if the plan is not yet complete, decide to CONTINUE_PLAN."""
        ),
        (
            "human",
            "Original Question: {question}\n\nInitial Plan:\n{plan}\n\nResearch History (Completed Steps):\n{history}"
        )
    ]
)

In [145]:
# Create the agent by piping our prompt to the reasoning LLM 
# and structuring its output with our Decision class
policy_agent = (
    policy_prompt 
    | reasoning_llm.with_structured_output(Decision)
)
print("Policy Agent created.")

# Now, let's test the policy agent with two different states of our research process
print("\n--- Testing Policy Agent (Incomplete State) ---")

# First, a state where only Step 1 is complete.
plan_str = json.dumps(
    [s.model_dump() for s in test_plan.steps]
)
incomplete_history = (
    "Step 1 Summary: NVIDIA's 10-K states that the semiconductor industry "
    "is intensely competitive and subject to rapid technological change."
)

decision1 = policy_agent.invoke(
    {
        "question": complex_query_adv,
        "plan": plan_str, 
        "history": incomplete_history
    }
)
print(
    f"Decision: {decision1.next_action}, "
    f"Justification: {decision1.justification}"
)

print("\n--- Testing Policy Agent (Complete State) ---")

# Second, a state where both Step 1 and Step 2 are complete.
complete_history = (
    incomplete_history 
    + "\nStep 2 Summary: In 2024, AMD launched its MI300X accelerator "
    "to directly compete with NVIDIA in the AI chip market, "
    "gaining adoption from major cloud providers."
)

decision2 = policy_agent.invoke(
    {
        "question": complex_query_adv,
        "plan": plan_str,
        "history": complete_history
    }
)
print(
    f"Decision: {decision2.next_action}, "
    f"Justification: {decision2.justification}"
)

Policy Agent created.

--- Testing Policy Agent (Incomplete State) ---
Decision: CONTINUE_PLAN, Justification: The research so far has only identified the general competitive nature of the semiconductor industry as a risk for NVIDIA. However, the specific details of NVIDIA's key risks related to competition, as outlined in their 2023 10-K filing, have not been fully explored. Additionally, we have not yet gathered any information on AMD's 2024 AI chip strategy, which is crucial to understanding how it might impact NVIDIA's competitive position. Therefore, it is necessary to continue with the plan to gather comprehensive information on both NVIDIA's specific risks and AMD's strategy.

--- Testing Policy Agent (Complete State) ---
Decision: CONTINUE_PLAN, Justification: The research has identified NVIDIA's general competitive risks and AMD's recent AI chip strategy. However, the analysis of how AMD's strategy specifically addresses or exacerbates NVIDIA's stated risks is still pending. C