In [48]:
from dotenv import load_dotenv
import os

# Load variables from a .env file (if any) into the process environment.
load_dotenv()

openai_api_key = os.getenv("OPENAI_API_KEY")

# Confirm the key is available without echoing its value; an unset or empty
# key stops the notebook here rather than at the first model call.
if openai_api_key:
    print("OPENAI_API_KEY is set.")
else:
    raise ValueError("OPENAI_API_KEY is not set in the environment variables.")

OPENAI_API_KEY is set.


In [49]:
from langchain.tools import tool
from langchain_tavily import TavilySearch
from typing import Optional, Literal, Annotated

@tool
def can_perform_web_search() -> bool:
    """
        Check if web search can be performed.
    """
    # NOTE: LangChain's @tool exposes the docstring above to the model as the
    # tool description, so its wording is intentionally left unchanged.
    # Search is considered available when TAVILY_API_KEY is set and non-empty.
    return bool(os.getenv("TAVILY_API_KEY"))

@tool
def web_search(
    query: Annotated[str, "The search query"],
    topic: Annotated[
        Optional[Literal["general", "news", "finance"]],
        "The topic to search within",
    ] = "general",
    max_results: Annotated[
        Optional[int],
        "The maximum number of results to return",
    ] = 5,
) -> list[dict]:
    """
        Search the web using TavilySearch. 
        You can specify a topic and the maximum number of results to return in order to refine the search.
    """
    # NOTE: the docstring and the Annotated descriptions are what the model
    # sees for this tool, so their text is intentionally left unchanged.
    # A new TavilySearch client is configured on every call from the requested
    # topic and result limit, then run against the query.
    # NOTE(review): the declared return type is carried over as-is; confirm it
    # matches what TavilySearch.run actually returns.
    return TavilySearch(max_results=max_results, topic=topic).run(query)

In [50]:
from langchain.agents import create_agent
from langchain.chat_models import init_chat_model

# Chat model that drives the agent; only the model name is specified here.
model = init_chat_model(model="gpt-5-mini")
# Tools offered to the agent: the search itself plus its availability check.
tools = [web_search, can_perform_web_search]

In [51]:
def read_markdown_file(filepath, *, strict=False):
    """Read a Markdown file and return its content as a string (UTF-8).

    Args:
        filepath: Path of the file to read; passed straight to ``open``.
        strict: When ``False`` (the default) a failed read is reported through
            the return value, exactly as before. When ``True`` the underlying
            exception propagates instead, so a failed read can never be
            mistaken for file content by the caller.

    Returns:
        The file's text on success. With ``strict=False`` a failure yields a
        message instead: ``"Error: The file at ... was not found."`` for a
        missing file, or ``"An error occurred: ..."`` for any other exception.

    Raises:
        FileNotFoundError: Only with ``strict=True``, when the file is missing.
        Exception: Only with ``strict=True``, any other error raised while
            opening or reading the file.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return f.read()
    except FileNotFoundError:
        if strict:
            raise
        return f"Error: The file at {filepath} was not found."
    except Exception as e:
        if strict:
            raise
        return f"An error occurred: {e}"

In [52]:
# NOTE: this path is relative to the kernel's working directory.
PROMPT_PATH = "../prompts/reasoning_prompt.md"

# read_markdown_file reports a failed read through its return value, and that
# message would then be formatted and sent as the system prompt unnoticed.
# Check for the file up front so a missing template stops the notebook here.
if not os.path.isfile(PROMPT_PATH):
    raise FileNotFoundError(f"Prompt template not found: {PROMPT_PATH}")

PROMPT = read_markdown_file(PROMPT_PATH)
# Preview only the beginning of the template.
PROMPT[:500]

'You are a reasoning agent that crafts clear, well-supported answers.\n\nTools available:\n- can_perform_web_search: returns True/False indicating if web search is available.\n- web_search: performs a Tavily search (gated by Human-in-the-Loop approval).\n\nTool usage policy (strict):\n- Before any web_search call, you MUST first call can_perform_web_search.\n- If can_perform_web_search returns False, DO NOT call web_search; continue using only the provided context.\n- If can_perform_web_search returns Tru'

In [53]:
from langchain.agents.middleware import HumanInTheLoopMiddleware 
from langgraph.checkpoint.memory import InMemorySaver

In [54]:
# Per-tool approval policy: the availability check runs without review,
# while every web_search call pauses for an approve/reject decision.
approval_policy = {
    "can_perform_web_search": False,
    "web_search": {"allowed_decisions": ["approve", "reject"]},
}

hitl_middleware = HumanInTheLoopMiddleware(
    interrupt_on=approval_policy,
    description_prefix="Tool execution pending approval",
)

# The in-memory checkpointer stores the paused run so it can be resumed later
# with the same thread_id.
agent = create_agent(
    model=model,
    tools=tools,
    middleware=[hitl_middleware],
    checkpointer=InMemorySaver(),
)

In [55]:
# Question the agent is asked to answer; it is substituted into the prompt template.
user_question = "What is the self attention mechanism and how does it work in transformer models, also how anthropic explains it?"
# Hard-coded context handed to the agent alongside the question: one record per
# sub-query, each with the keys "sub_query", "retrieved_context", "citations"
# and "synthesized_answer".
context = [
  {
    "sub_query": "self attention mechanism definition and purpose",
    "retrieved_context": "Self-attention, sometimes called intra-attention is an attention mechanism relating different positions of a single sequence in order to compute a representation of the sequence (Vaswani et al., 2017, Section 2). The Transformer uses self-attention in encoder and decoder layers to allow each position to attend to all positions in the previous layer, enabling modeling of dependencies without recurrence (Vaswani et al., 2017, Section 3.2.3). Self-attention connects all positions with a constant number of sequential operations, improving parallelization and shortening path lengths for long-range dependencies compared to recurrent layers (Vaswani et al., 2017, Section 4).",
    "citations": [
      "Vaswani et al., 2017 - Attention Is All You Need; Section 2, 3.2.3, 4; https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"
    ],
    "synthesized_answer": "Self-attention (intra-attention) relates positions within a single sequence to compute contextualized representations, enabling the model to represent each token with information from all other tokens in the sequence [Vaswani et al., 2017, Section 2]."
  },
  {
    "sub_query": "self attention operation within transformer architecture",
    "retrieved_context": "In a self-attention layer all keys, values and queries come from the same source (the previous layer) and each position can attend to all positions in that layer; in the decoder self-attention is masked to prevent leftward (future) information flow and the model also uses encoder-decoder attention where decoder queries attend encoder keys/values (Vaswani et al., 2017, Section 3.2.3). The Transformer implements multi-head attention and scaled dot-product attention to compute weights and aggregate values, enabling parallel computation and flexible representation learning (Vaswani et al., 2017, Sections 3.2.3 and 4).",
    "citations": [
      "Vaswani et al., 2017 - Attention Is All You Need; Section 3.2.3, 4; https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"
    ],
    "synthesized_answer": "Transformer self-attention forms queries, keys, and values from the same input, computes attention weights (e.g., scaled dot-product, often via multiple heads), applies those weights to values to produce context-aware outputs, uses masking in decoder self-attention to preserve autoregression, and includes encoder–decoder attention to let the decoder attend to encoder outputs [Vaswani et al., 2017, Sections 3.2.3 and 4]."
  }
]

In [56]:
import uuid

# The question and the context are both passed to the prompt template, so the
# whole request is sent as a single system message.
messages = [
    ("system", PROMPT.format(user_question=user_question, context=context)),
]

# Use a fresh thread id on every run of this cell. The agent's checkpointer
# stores state per thread_id, so a fixed id would make a re-run continue the
# previous (possibly still interrupted) conversation instead of starting over.
config = {"configurable": {"thread_id": str(uuid.uuid4())}}

In [57]:
# First pass: run the agent on the prepared messages under the configured thread.
agent_input = {"messages": messages}
result = agent.invoke(agent_input, config=config)

In [58]:
# Show the pending approval request(s), if any. The '__interrupt__' entry is
# only present when the run paused; a run in which the agent never requested
# web_search has no such key, so fall back to an empty list instead of raising.
result.get("__interrupt__", [])

[Interrupt(value={'action_requests': [{'name': 'web_search', 'args': {'query': 'Anthropic explanation of self-attention mechanism transformer self-attention Anthropics blog', 'topic': 'general', 'max_results': 5}, 'description': "Tool execution pending approval\n\nTool: web_search\nArgs: {'query': 'Anthropic explanation of self-attention mechanism transformer self-attention Anthropics blog', 'topic': 'general', 'max_results': 5}"}], 'review_configs': [{'action_name': 'web_search', 'allowed_decisions': ['approve', 'reject']}]}, id='892bfa223e10a0caeb7afeff592745c0')]

In [59]:
from langgraph.types import Command

# Resume only when the first run actually paused for approval; otherwise the
# first invocation already holds the final messages.
pending = result.get("__interrupt__")
if pending:
    # The interrupt payload lists every tool call awaiting review; supply one
    # 'approve' decision per request, in the same order.
    action_requests = pending[0].value["action_requests"]
    res = agent.invoke(
        Command(
            resume={"decisions": [{"type": "approve"} for _ in action_requests]}
        ),
        config=config,
    )
else:
    res = result

In [60]:
from IPython.display import Markdown, display

# Render the content of the last message in the resumed run as Markdown.
final_message = res["messages"][-1]
display(Markdown(final_message.content))

Restatement: Self-attention is an attention mechanism that lets each position (token) in a single sequence compute a context-aware representation by attending to (looking at) other positions in that same sequence [Vaswani et al., 2017](https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf).

How it works in Transformer models (stepwise, concise)
- Inputs are embedded and (optionally) positionally encoded; for a given layer each token’s representation produces three vectors by learned linear projections: a query q, a key k, and a value v. All q/k/v come from the same source in self-attention (the previous layer) [Vaswani et al., 2017](https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf).
- Attention weights: for a query q at position i, compute dot-products with all keys k_j to measure relevance, scale by sqrt(d_k) to stabilize gradients, then apply softmax across j to get normalized attention weights α_{ij} (this is “scaled dot-product attention”) [Vaswani et al., 2017](https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf).
- Context output: form a weighted sum of the values using those weights: output_i = Σ_j α_{ij} v_j. That output is the context-aware representation for position i in that layer [Vaswani et al., 2017](https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf).
- Masking and decoder use: in decoder self-attention the softmax is masked so positions cannot attend to future tokens (preserving autoregressive generation); encoder–decoder attention uses decoder queries over encoder keys/values so the decoder can condition on encoder outputs [Vaswani et al., 2017](https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf).
- Multi‑head attention: the model runs several attention “heads” (different learned q/k/v projections) in parallel so each head can learn different kinds of relationships; their outputs are concatenated and linearly projected to produce the final per-position output for the layer [Vaswani et al., 2017](https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf).
- Practical benefits: self-attention connects every pair of positions via a small number of sequential operations (improving parallelism) and shortens the path for long-range dependency modeling compared with recurrent layers [Vaswani et al., 2017](https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf).

How Anthropic (Transformer Circuits viewpoint) explains self-attention
- Anthropic’s Transformer Circuits work presents attention not only as a mathematical weighting but as components in “circuits”: individual attention heads act like feature detectors that route or copy specific features (directions) from tokens into other token representations; the value and output matrices together can act as a low‑rank map that takes features detected by attention and writes them back into the model’s embedding space [referenced commentary: 3Blue1Brown pointing to Anthropic’s Transformer Circuits](https://www.youtube.com/watch?v=eMlx5fFNoYc&vl=en). This perspective emphasizes interpreting attention heads as modular computations within larger circuits rather than only as abstract weight matrices [3Blue1Brown (references Transformer Circuits)].
- In that interpretability framing, you can think of (a) a head’s keys/queries as the pattern detector that decides when to copy or route a feature, and (b) the values and subsequent output projection as the mechanism that places the copied feature into the receiving token’s representation — together enabling specific, human-interpretable transformations inside the network (a view developed in the Transformer Circuits analyses cited by Anthropic) [3Blue1Brown (references Transformer Circuits)].

Sources
- Vaswani et al., “Attention Is All You Need” (Transformer self-attention definitions, scaled dot-product, multi-head, masking, encoder–decoder attention) [Vaswani et al., 2017](https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf).
- Commentary referencing Anthropic’s Transformer Circuits (overview of interpretability framing: heads as feature detectors and the combined role of value/output matrices) [3Blue1Brown video referencing Transformer Circuits](https://www.youtube.com/watch?v=eMlx5fFNoYc&vl=en).