# Notebook 7 · Adaptive Routing and Cost Control

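Adaptive RAG activates only the components necessary for a request. We will measure the cost impact of routing short factual questions to a cheaper model and using the full agent stack only for analytical prompts. The router below uses a simple word-count heuristic (fewer than 12 words counts as "short") as a stand-in for a real query classifier.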

In [None]:
from typing import Any, Dict

from langchain_openai import ChatOpenAI as LangChainChatOpenAI

from pprint import pprint

from shared import (
    DEFAULT_MODEL,
    RetrievalContext,
    build_baseline_chain,
    build_retrieval_context,
    pretty_print_json,
    time_execution,
)


In [None]:
# Two model tiers for the router: a cheaper one for short questions and a
# larger one for everything else. Both use the same temperature of 0.0.
LIGHT_MODEL_NAME = 'gpt-4.1-mini'
RICH_MODEL_NAME = 'gpt-4.1'
ROUTER_TEMPERATURE = 0.0

light_model = LangChainChatOpenAI(model=LIGHT_MODEL_NAME, temperature=ROUTER_TEMPERATURE)
rich_model = LangChainChatOpenAI(model=RICH_MODEL_NAME, temperature=ROUTER_TEMPERATURE)

# Retrieval context and the baseline QA chain both come from the project's
# `shared` helpers; the chain is built on the context's retriever.
# NOTE(review): top_k=3 presumably caps the passages fetched per query; confirm in shared.
context = build_retrieval_context(top_k=3)
qa_chain = build_baseline_chain(context.retriever)


In [None]:
def adaptive_answer(question: str, max_light_words: int = 12) -> Dict[str, Any]:
    """Answer ``question`` with a model picked by a word-count heuristic.

    Questions with fewer than ``max_light_words`` whitespace-separated words
    are sent to ``light_model`` (mode ``'lightweight'``); all others are sent
    to ``rich_model`` (mode ``'full'``). The retrieval chain ``qa_chain`` is
    run on every question, independent of the selected mode.

    Args:
        question: The user question, passed unchanged to both the selected
            chat model and the retrieval chain.
        max_light_words: Word-count cutoff below which the lightweight model
            is used. Defaults to 12.

    Returns:
        A dict with three keys: ``'mode'`` (``'lightweight'`` or ``'full'``),
        ``'llm_response'`` (the selected model's reply text) and
        ``'retrieval_augmented'`` (whatever ``qa_chain.run`` returns).
    """
    is_short = len(question.split()) < max_light_words
    llm, mode = (light_model, 'lightweight') if is_short else (rich_model, 'full')

    # invoke() replaces the deprecated predict(); the reply text that
    # predict() used to return directly is the message's .content.
    llm_response = llm.invoke(question).content

    # NOTE(review): the retrieval chain runs even in lightweight mode, so the
    # routing only changes which model produces `llm_response`. Confirm whether
    # retrieval should be skipped for short questions to realise the savings.
    # NOTE(review): .run() is kept because the chain's input/output keys are
    # defined in shared.build_baseline_chain and are not visible here.
    retrieval_augmented = qa_chain.run(question)

    return {
        'mode': mode,
        'llm_response': llm_response,
        'retrieval_augmented': retrieval_augmented,
    }

# Demo: this question has 10 words, under the 12-word cutoff used by
# adaptive_answer, so it is routed to the lightweight model.
demo_question = 'List the steps to invite a contractor to a workspace.'
adaptive_answer(demo_question)


## Suggested experiments

* Swap the routing heuristic for a classifier fine-tuned on historical queries.
* Track token usage (e.g. via LangChain callbacks) to quantify savings.
* Combine with the verified RAG notebook to enforce grounding in the full mode.