# Imports

In [None]:
#!pip install -r requirements.txt

In [None]:
import os, sys, json, re
from datetime import datetime
from typing import List, Optional, Literal, TypedDict, Annotated
import operator
from dotenv import load_dotenv, find_dotenv
from pydantic import BaseModel, Field, ConfigDict
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langgraph.graph import StateGraph, END
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
import requests
import xml.etree.ElementTree as ET
import numpy as np
import unicodedata

# Setup

In [None]:
# Load environment variables from the nearest .env file, searching upwards
# from the current working directory.
load_dotenv(find_dotenv(usecwd=True))

# Endpoint and credentials for the OpenAI-compatible gateway.
# The fallback strings are placeholders only (the original cell had bare words
# here, which is a syntax error): set LITELLM_BASE_URL / LITELLM_API_KEY in the
# environment or .env file, or replace the placeholders with real values.
BASE_URL = os.getenv("LITELLM_BASE_URL", "add your base url")
API_KEY = os.getenv("LITELLM_API_KEY", "add your api key")
MODEL_NAME = os.getenv("MODEL_NAME", "qwen3-32b")

# Single chat model instance shared by every agent node below.
llm = ChatOpenAI(base_url=BASE_URL, api_key=API_KEY, model=MODEL_NAME, temperature=0.7, max_tokens=2000)

# Pydantic Models

In [None]:
class Paper(BaseModel):
    # One literature-search hit. Only `title` is required; the remaining
    # fields fall back to the defaults below.
    title: str
    abstract: str = ""
    year: Optional[int] = None
    source: str = "arxiv"
    url: Optional[str] = None

In [None]:
class QueryClassification(BaseModel):
    # Structured router verdict parsed from the LLM reply.
    # (Comments are used instead of a docstring so the schema handed to the
    # output parser stays unchanged.)
    query_type: Literal["conceptual", "design", "implementation", "planning"]
    confidence: Literal["high", "medium", "low"]
    reasoning: str
    needs_memory: bool
    is_followup: bool
    # Free-form agent names as returned by the LLM; they are mapped to real
    # node names later via normalize_agent_name.
    target_agents: List[str]

In [None]:
class TrendAnalysis(BaseModel):
    # Trend summary produced by the research analyst from the retrieved papers.
    trends: List[str]
    emerging_directions: List[str]
    confidence: Literal["high", "medium", "low"]

In [None]:
class ContradictionAnalysis(BaseModel):
    # Gap analysis produced by the research analyst: contradictions in the
    # literature, open problems, and the opportunities that follow from them.
    contradictions: List[str]
    unsolved_problems: List[str]
    opportunities: List[str]

In [None]:
class Hypothesis(BaseModel):
    # Hypothesis returned by the hypothesis generator.
    statement: str
    triz_principles: List[str]
    rationale: str
    # Score reported by the LLM itself (the prompt asks for 1-10); the
    # heuristic from calc_novelty is stored separately in the graph state.
    novelty_score: int

In [None]:
class ExperimentPlan(BaseModel):
    # Experiment plan returned by the experiment designer.
    feasibility: Literal["high", "medium", "low"]
    steps: List[str]
    resources: List[str]
    # Free-text duration (e.g. "4-6 weeks"); calc_feasibility scans it for "year".
    duration: str
    challenges: List[str]

In [None]:
class MemoryEntry(BaseModel):
    # One persisted interaction, as stored by MemStore.
    query: str
    response_summary: str
    agents_used: List[str]
    key_findings: List[str] = []

In [None]:
def merge_lists(l, r):
    """Concatenate two lists, treating a falsy operand (e.g. None) as empty.

    Used as the reducer for list-valued state fields.
    """
    left = l if l else []
    right = r if r else []
    return left + right

In [None]:
class AgentState(TypedDict):
    # Shared state passed between graph nodes. Fields wrapped in
    # Annotated[..., reducer] are combined with that reducer (list
    # concatenation here); the other fields hold a single value.

    # --- request and routing ---
    user_query: str
    classification: Optional[QueryClassification]
    current_agent: Optional[str]
    agents_activated: Annotated[List[str], operator.add]

    # --- research analyst output ---
    literature_data: Optional[dict]
    papers: Annotated[List, merge_lists]
    trends: Optional[TrendAnalysis]
    gaps: Optional[ContradictionAnalysis]

    # --- hypothesis generator output ---
    hypothesis: Optional[Hypothesis]
    novelty_score: Optional[dict]

    # --- experiment designer output ---
    experiment_plan: Optional[ExperimentPlan]
    feasibility_score: Optional[dict]

    # --- synthesis, logging and memory ---
    final_response: Optional[str]
    messages: Annotated[List[str], operator.add]
    session_history: Annotated[List, merge_lists]
    notes: Annotated[List[str], operator.add]
    memory_context: Optional[str]

In [None]:
def create_initial_state(q):
    """Build the starting graph state for query ``q``.

    Every AgentState field whose annotation mentions ``List`` starts as an
    empty list, every other field starts as None, and ``user_query`` is set
    to ``q``.
    """
    state = {}
    for field, annotation in AgentState.__annotations__.items():
        state[field] = [] if 'List' in str(annotation) else None
    state['user_query'] = q
    return state

# Tools

In [None]:
def clean_text(text):
    """Return ``text`` reduced to ASCII with whitespace collapsed.

    Falsy input yields "". Characters are NFKD-decomposed first so accented
    letters keep their base letter; anything still non-ASCII is dropped.
    """
    if not text:
        return ""
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ascii', 'ignore').decode('ascii')
    # split()/join collapses every whitespace run (including newlines) to one space
    return ' '.join(ascii_only.split())

In [None]:
def search_arxiv(query, max_results=10):
    """Search the arXiv export API and return the hits as ``Paper`` objects.

    Args:
        query: Free-text query; sent as ``all:<query>``.
        max_results: Maximum number of entries requested from the API.

    Returns:
        A dict with ``papers_found`` (int), ``key_topics`` (titles of the
        first five papers) and ``papers`` (list of Paper). On any failure the
        same keys are returned empty, plus ``error`` with the exception text,
        so callers never have to handle an exception from this tool.
    """
    atom = '{http://www.w3.org/2005/Atom}'
    print(f"[TOOL] ArXiv: {query}")
    try:
        r = requests.get(
            "http://export.arxiv.org/api/query",
            params={'search_query': f'all:{query}', 'max_results': max_results},
            timeout=15,
        )
        # Surface HTTP errors as such instead of as a confusing XML parse error.
        r.raise_for_status()
        root = ET.fromstring(r.content)
        papers = []
        for entry in root.findall(f'{atom}entry'):
            title = entry.find(f'{atom}title')
            summary = entry.find(f'{atom}summary')
            if title is None or summary is None:
                continue
            # <published> is an ISO timestamp; keep the year only when it parses.
            published = (entry.findtext(f'{atom}published') or "").strip()
            year = int(published[:4]) if published[:4].isdigit() else None
            # <id> carries the abstract-page URL of the entry.
            url = (entry.findtext(f'{atom}id') or "").strip() or None
            papers.append(Paper(
                title=clean_text(title.text),
                abstract=clean_text(summary.text),
                year=year,
                url=url,
            ))
        print(f"[TOOL] ArXiv found {len(papers)} papers")
        return {"papers_found": len(papers), "key_topics": [p.title for p in papers[:5]], "papers": papers}
    except Exception as exc:
        # Best-effort tool: report the failure and return an empty result with
        # the same keys as the success path.
        print(f"[TOOL] ArXiv search failed: {exc}")
        return {"papers_found": 0, "key_topics": [], "papers": [], "error": str(exc)}


In [None]:
def calc_novelty(hyp, papers):
    """Heuristic novelty score (1-10) from word overlap with known papers.

    Words of four or more characters in ``hyp`` are compared against the
    title and abstract of the first ten papers; the lower the mean share of
    hypothesis words found in a paper, the higher the score. Returns a dict
    with ``score`` and ``reason``; the score is 7 when there are no papers or
    no usable words.
    """
    if not papers:
        return {"score": 7, "reason": "No papers"}

    word_pattern = r'\b\w{4,}\b'
    hw = set(re.findall(word_pattern, hyp.lower()))
    candidates = papers[:10]

    overlaps = []
    if hw:
        for paper in candidates:
            paper_words = set(re.findall(word_pattern, f"{paper.title} {paper.abstract}".lower()))
            overlaps.append(len(hw & paper_words) / len(hw))

    if not overlaps:
        return {"score": 7, "reason": "N/A"}

    mean_overlap = np.mean(overlaps)
    score = max(1, min(10, int((1 - mean_overlap) * 10)))
    return {"score": score, "reason": f"Overlap: {mean_overlap*100:.1f}%"}


In [None]:
def calc_feasibility(exp):
    """Heuristic feasibility rating for an experiment plan.

    Starts at 10, subtracts 2 if the resources mention heavy compute
    ('supercomputer', 'quantum', 'a100') and 2 more if the duration mentions
    'year'. Returns a dict with ``category`` and ``score``.
    """
    heavy_keywords = ('supercomputer', 'quantum', 'a100')
    resource_text = ' '.join(exp.resources).lower()

    s = 10
    if any(keyword in resource_text for keyword in heavy_keywords):
        s -= 2
    if 'year' in exp.duration.lower():
        s -= 2

    if s >= 7:
        category = "high"
    elif s >= 4:
        category = "medium"
    else:
        category = "low"
    return {"category": category, "score": max(1, s)}

In [None]:
# Names of TRIZ principles offered to the hypothesis generator: its prompt
# lists the first five, and its fallback hypothesis uses the first two.
TRIZ = ["Segmentation","Taking out","Local quality","Asymmetry","Merging","Universality","Dynamics","Feedback"]

# Memory and LLM utilities

In [None]:
def llm_retry():
    """Return the tenacity retry decorator shared by the LLM-calling helpers.

    Policy: up to 3 attempts, exponential wait bounded to 2-10 seconds, retry
    on any Exception, and re-raise the last error once attempts run out.
    """
    policy = {
        "stop": stop_after_attempt(3),
        "wait": wait_exponential(multiplier=1, min=2, max=10),
        "retry": retry_if_exception_type((Exception,)),
        "reraise": True,
    }
    return retry(**policy)

In [None]:
@llm_retry()
def invoke_with_parser(llm, parser, prompt_template, **kwargs):
    """Call the LLM and parse its reply into the parser's Pydantic model.

    Args:
        llm: Chat model exposing ``invoke(messages)``.
        parser: Output parser providing ``get_format_instructions()`` and
            ``parse(text)``.
        prompt_template: Prompt text. It is passed through ``str.format`` only
            when ``kwargs`` are supplied; otherwise it is sent verbatim.
        **kwargs: Values for the template placeholders.

    Returns:
        Whatever ``parser.parse`` returns for the cleaned-up reply.

    Raises:
        Any exception from the LLM call or from parsing, after the retry
        policy of ``llm_retry`` is exhausted.
    """
    prompt = prompt_template
    if kwargs:
        try:
            prompt = prompt_template.format(**kwargs)
        except (KeyError, IndexError, ValueError) as e:
            # Literal braces in the prompt (e.g. JSON examples) can raise any
            # of these; fall back to the unformatted text rather than fail.
            print(f"[LLM] Warning: Prompt formatting failed ({e}), using raw string.")

    response = llm.invoke([
        SystemMessage(content=f"Respond with valid JSON matching this schema:\n{parser.get_format_instructions()}"),
        HumanMessage(content=prompt)
    ])
    content = response.content.strip()
    # Drop the reasoning block some models emit before the answer.
    content = re.sub(r'<think>.*?</think>', '', content, flags=re.DOTALL).strip()
    # Strip markdown code fences around the JSON.
    if content.startswith("```"):
        content = re.sub(r'^```\w*\n?|\n?```', '', content)
    # Keep only the outermost JSON object/array if the reply has extra text.
    json_match = re.search(r'(\{[\s\S]*\}|\[[\s\S]*\])', content)
    if json_match:
        content = json_match.group(1)
    # Replace the literal "\x2014" sequence FIRST: the generic rewrite below
    # would otherwise turn it into "\u002014" and this replacement would never
    # match.
    content = content.replace(r'\x2014', '-')
    # "\xNN" is not a valid JSON escape; rewrite it as the equivalent "\u00NN".
    content = re.sub(r'\\x([0-9a-fA-F]{2})', r'\\u00\1', content)
    return parser.parse(content)

In [None]:
class MemStore:
    """Conversation memory persisted as a JSON list of MemoryEntry records.

    The history is loaded once at construction and the whole file is
    rewritten after every ``add``. Load and save errors are reported on
    stdout and otherwise ignored, so memory problems never stop a run.
    """

    def __init__(self, filepath="memory.json"):
        self.filepath = filepath
        self.history = self._load()
        self.notes = []

    def _load(self):
        """Return the entries stored in ``filepath``, or [] if missing/unreadable."""
        if os.path.exists(self.filepath):
            try:
                with open(self.filepath, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                return [MemoryEntry(**d) for d in data]
            except Exception as e:
                print(f"[MEMORY] error loading file: {e}")
        return []

    def _save(self):
        """Write the full history to ``filepath`` (best effort)."""
        try:
            # model_dump() is the Pydantic v2 replacement for the deprecated .dict().
            data = [entry.model_dump() for entry in self.history]
            # Serialize before opening the file so a serialization error
            # cannot leave a truncated memory file behind.
            payload = json.dumps(data, indent=2)
            with open(self.filepath, 'w', encoding='utf-8') as f:
                f.write(payload)
        except Exception as e:
            print(f"[MEMORY] error saving file: {e}")

    def add(self, q, r, a, f=None):
        """Append one interaction and persist it.

        Args:
            q: User query.
            r: Response summary.
            a: Names of the agents used.
            f: Optional list of key findings.
        """
        entry = MemoryEntry(
            query=q,
            response_summary=r,
            agents_used=a,
            key_findings=f or []
        )
        self.history.append(entry)
        self._save()

    def context(self, q, n=3):
        """Return the last ``n`` interactions as text lines, or "" if there are none.

        ``q`` is accepted for interface compatibility but not used: retrieval
        is purely by recency. Queries are cut to 60 and summaries to 80
        characters.
        """
        if not self.history:
            return ""
        return "\n".join(
            f"Q:{e.query[:60]}->R:{e.response_summary[:80]}" for e in self.history[-n:]
        )



In [None]:
# Module-level memory store used by the router and the memory nodes; with the
# default path it loads memory.json from the working directory if it exists.
mem = MemStore()

In [None]:
# Create one Pydantic output parser per structured LLM output type. Each
# parser supplies the format instructions sent to the model and validates
# the reply in invoke_with_parser.
query_parser = PydanticOutputParser(pydantic_object=QueryClassification)
trend_parser = PydanticOutputParser(pydantic_object=TrendAnalysis)
gap_parser = PydanticOutputParser(pydantic_object=ContradictionAnalysis)
hypothesis_parser = PydanticOutputParser(pydantic_object=Hypothesis)
experiment_parser = PydanticOutputParser(pydantic_object=ExperimentPlan)

# Agent nodes 

In [None]:
def router_node(state):
    """Router agent: classify the user query and choose target agents.

    Reads ``user_query`` from the state and recent memory context from the
    module-level ``mem``. Returns a state update with ``classification``,
    ``agents_activated`` and ``messages``. If the LLM call or parsing still
    fails after retries, a low-confidence "planning" classification targeting
    all three worker agents is returned instead of raising.
    """
    q, ctx = state["user_query"], mem.context(state["user_query"])
    # The prompt lists the valid agent names explicitly; without that list the
    # model tends to invent names that then have to be mapped back afterwards.
    prompt = f"""
    Classify this research query.

    Query: {q}
    {f'Previous context: {ctx}' if ctx else 'No previous context.'}

    Query types:
    - conceptual: Theory questions, concepts, comparisons
    - design: Architecture, hypothesis design, methodology
    - implementation: Code, practical how-to, technical details
    - planning: Full research workflow, complete plans

    Available agents (use these exact names in target_agents):
    - research_analyst: literature search, trend and gap analysis
    - hypothesis_generator: TRIZ-based hypothesis generation
    - experiment_designer: experiment planning and feasibility assessment

    Determine the query type, confidence, whether memory is needed, if it's a follow-up, and which agents should handle it.
    """
    try:
        classification = invoke_with_parser(llm, query_parser, prompt)
        print(f"[ROUTER] {classification.query_type} -> {classification.target_agents}")
        return {"classification": classification, "agents_activated": ["router"], "messages": [f"Router: {classification.query_type}"]}
    except Exception as e:
        print(f"[ROUTER] parse failed after retries: {e}, using fallback")
        # Fallback: run the full pipeline, and use memory only if some exists.
        fallback = QueryClassification(
            query_type="planning", confidence="low", reasoning="Fallback due to parse error",
            needs_memory=bool(ctx), is_followup=False,
            target_agents=["research_analyst", "hypothesis_generator", "experiment_designer"]
        )
        return {"classification": fallback, "agents_activated": ["router"], "messages": ["Router: fallback"]}


In [None]:
def research_node(state):
    """Research analyst: search arXiv, then extract trends and gaps via the LLM.

    Each of the two LLM analyses falls back to a generic placeholder result
    if parsing still fails after retries. Returns a state update with
    ``literature_data``, ``papers``, ``trends``, ``gaps``,
    ``agents_activated`` and ``messages``.
    """
    q = state["user_query"]
    print(f"[RESEARCH] {q}...")

    # Collapse whitespace in the query before sending it to the arXiv tool.
    normalized_query = ' '.join(q.split())
    lit = search_arxiv(normalized_query)
    papers = lit.get("papers", [])

    # One bullet per paper; this digest is shared by both prompts below.
    if papers:
        paper_digest = "\n".join(f"- {p.title}: {p.abstract}..." for p in papers)
    else:
        paper_digest = "No papers found"

    # --- trend analysis ---
    trend_prompt = f"""
    Analyze research trends from these papers.

    Papers:
    {paper_digest}

    Research focus: {q}

    Identify current trends, emerging directions, and your confidence level."""
    try:
        trends = invoke_with_parser(llm, trend_parser, trend_prompt)
    except Exception as e:
        print(f"[RESEARCH] trend parse failed: {e}")
        trends = TrendAnalysis(
            trends=["Emerging AI research"],
            emerging_directions=["Novel methodologies"],
            confidence="medium",
        )

    # --- gap analysis ---
    gap_prompt = f"""
    Find research gaps and opportunities from these papers.

    Papers:
    {paper_digest}

    Identify contradictions in the literature, unsolved problems, and research opportunities.
    """
    try:
        gaps = invoke_with_parser(llm, gap_parser, gap_prompt)
    except Exception as e:
        print(f"[RESEARCH] gap parse failed: {e}")
        gaps = ContradictionAnalysis(
            contradictions=["Limited scope"],
            unsolved_problems=["Scalability"],
            opportunities=["Novel approaches"],
        )

    print(f"[RESEARCH] {len(papers)} papers, {len(trends.trends)} trends, {len(gaps.opportunities)} opportunities")
    return {
        "literature_data": lit,
        "papers": papers,
        "trends": trends,
        "gaps": gaps,
        "agents_activated": ["research_analyst"],
        "messages": [f"Research: {len(papers)} papers"],
    }


In [None]:
def hypothesis_node(state):
    """Hypothesis generator: ask the LLM for a TRIZ-based hypothesis.

    Uses up to three trends and three opportunities from the state (or generic
    defaults when the research step did not run). On success the heuristic
    novelty from calc_novelty is attached; on any failure a templated fallback
    hypothesis with novelty 6 is returned.
    """
    q = state["user_query"]
    tr = state.get("trends")
    ga = state.get("gaps")
    papers = state.get("papers", [])
    print(f"[HYPOTHESIS] generating...")

    top_trends = (tr.trends if tr else ["Emerging research"])[:3]
    top_opportunities = (ga.opportunities if ga else ["Novel approaches"])[:3]
    trend_text = ', '.join(top_trends)
    opportunity_text = ', '.join(top_opportunities)
    principle_text = ', '.join(TRIZ[:5])

    hyp_prompt = f"""
    Generate a research hypothesis using TRIZ methodology.

    Research question: {q}

    Current trends: {trend_text}
    Research opportunities: {opportunity_text}

    TRIZ Principles to consider: {principle_text}

    Create a specific, testable hypothesis with:
    - A clear statement (minimum 20 characters)
    - Which TRIZ principles apply
    - Rationale for why this hypothesis matters
    - Novelty score (1-10)
    """
    try:
        hypothesis = invoke_with_parser(llm, hypothesis_parser, hyp_prompt)
        nv = calc_novelty(hypothesis.statement, papers)
        print(f"[HYPOTHESIS] created, novelty: {nv['score']}/10")
        return {
            "hypothesis": hypothesis,
            "novelty_score": nv,
            "agents_activated": ["hypothesis_generator"],
            "messages": [f"Hypothesis: novelty {nv['score']}"],
        }
    except Exception as e:
        print(f"[HYPOTHESIS] parse failed: {e}")
        fallback = Hypothesis(
            statement=f"Applying {TRIZ[0]} principle to {q[:40]} will improve research outcomes",
            triz_principles=TRIZ[:2],
            rationale="Addresses identified research gaps through systematic innovation",
            novelty_score=6,
        )
        return {
            "hypothesis": fallback,
            "novelty_score": {"score": 6},
            "agents_activated": ["hypothesis_generator"],
            "messages": ["Hypothesis: fallback"],
        }


In [None]:
def experiment_node(state):
    """Experiment designer: ask the LLM for a plan that tests the hypothesis.

    When no hypothesis is in the state, the query itself is used as the thing
    to test. On success the heuristic feasibility from calc_feasibility is
    attached; on any failure a generic five-step fallback plan is returned.
    """
    q = state["user_query"]
    h = state.get("hypothesis")
    print(f"[EXPERIMENT] designing...")

    if h:
        hypothesis_text = h.statement
    else:
        hypothesis_text = f"Test the approach: {q}"

    exp_prompt = f"""
    Design an experiment to test this hypothesis.

    Hypothesis: {hypothesis_text}
    Research context: {q}

    Create a practical experiment plan with:
    - Feasibility assessment (high/medium/low)
    - 3-7 concrete steps
    - Required resources
    - Estimated duration
    - Potential challenges
    """
    try:
        experiment = invoke_with_parser(llm, experiment_parser, exp_prompt)
        feas = calc_feasibility(experiment)
        step_count = len(experiment.steps)
        print(f"[EXPERIMENT] {step_count} steps, feasibility: {feas['category']}")
        return {
            "experiment_plan": experiment,
            "feasibility_score": feas,
            "agents_activated": ["experiment_designer"],
            "messages": [f"Experiment: {step_count} steps"],
        }
    except Exception as e:
        print(f"[EXPERIMENT] parse failed: {e}")
        fallback = ExperimentPlan(
            feasibility="medium",
            steps=["Define experimental setup", "Prepare datasets", "Implement approach", "Run experiments", "Analyze results"],
            resources=["Computing resources", "Datasets", "Evaluation metrics"],
            duration="4-6 weeks",
            challenges=["Data availability", "Computational constraints"],
        )
        return {
            "experiment_plan": fallback,
            "feasibility_score": {"category": "medium", "score": 7},
            "agents_activated": ["experiment_designer"],
            "messages": ["Experiment: fallback"],
        }


In [None]:
def mem_ret_node(state):
    """Memory retrieval node: put recent memory context into the state.

    ``memory_context`` is set to None when the store has nothing to offer.
    """
    context = mem.context(state["user_query"])
    print(f"[MEMORY] retrieved context: {len(context)} chars")
    memory_context = context if context else None
    return {"memory_context": memory_context, "agents_activated": ["memory_manager"]}

In [None]:
def mem_upd_node(state):
    """Memory update node: persist a summary of this interaction.

    Collects short key findings from whichever of hypothesis, trends, gaps
    and experiment plan are present, then stores them in the module-level
    ``mem`` together with the query, the first 150 characters of the final
    response, and the agents that ran.
    """
    findings = []
    h = state.get("hypothesis")
    if h and h.statement:
        findings.append(f"Hypothesis: {h.statement[:80]}")
    tr = state.get("trends")
    if tr and tr.trends:
        findings.append(f"Trends: {', '.join(tr.trends[:3])}")
    ga = state.get("gaps")
    if ga and ga.opportunities:
        findings.append(f"Opportunities: {', '.join(ga.opportunities[:3])}")
    e = state.get("experiment_plan")
    if e and e.steps:
        findings.append(f"Experiment: {len(e.steps)} steps, {e.feasibility} feasibility")

    # The key can be present with value None (the initial state sets it so),
    # in which case .get(key, "") would return None and slicing would fail.
    summary = (state.get("final_response") or "")[:150]
    # De-duplicate while keeping activation order; set() alone would make the
    # stored order vary between runs.
    agents = list(dict.fromkeys(state.get("agents_activated") or []))

    mem.add(state["user_query"], summary, agents, findings)
    print(f"[MEMORY] saved interaction with {len(findings)} key findings")
    return {"agents_activated": ["memory_manager"]}

In [None]:
@llm_retry()
def synth_llm_call(llm, parts):
    """Ask the LLM to merge the collected findings into one response.

    ``parts`` are joined with newlines into the human message; the stripped
    reply text is returned. Retried according to llm_retry.
    """
    messages = [
        SystemMessage(content="Synthesize multi-agent findings into a clear, comprehensive response."),
        HumanMessage(content="\n".join(parts)),
    ]
    reply = llm.invoke(messages)
    return reply.content.strip()

In [None]:
def synth_node(state):
    """Synthesizer node: turn the agents' findings into the final response.

    Builds a list of text parts from whatever is present in the state and
    asks the LLM to synthesize them. If the LLM call still fails after
    retries, the raw parts joined by newlines are used as the response.
    Returns a state update with ``final_response`` and ``agents_activated``.
    """
    q = state["user_query"]
    c = state.get("classification")
    tr, ga = state.get("trends"), state.get("gaps")
    h, e = state.get("hypothesis"), state.get("experiment_plan")
    # These keys may be present with value None (the initial state sets them
    # so); .get(key, {}) would then return None and .get below would fail.
    nv = state.get("novelty_score") or {}
    fv = state.get("feasibility_score") or {}
    print(f"[SYNTH] generating response...")

    parts = [f"Query: {q}"]
    if c:
        parts.append(f"Query type: {c.query_type}")
    if tr:
        parts.append(f"Trends: {', '.join(tr.trends)}")
    if ga:
        parts.append(f"Opportunities: {', '.join(ga.opportunities)}")
    if h:
        parts.append(f"Hypothesis: {h.statement}")
        parts.append(f"TRIZ principles: {', '.join(h.triz_principles)}")
        # Prefer the heuristic score; fall back to the LLM's own score.
        parts.append(f"Novelty score: {nv.get('score', h.novelty_score)}/10")
    if e:
        parts.append(f"Experiment: {len(e.steps)} steps, Duration: {e.duration}")
        parts.append(f"Feasibility: {fv.get('category', e.feasibility)}")

    try:
        response = synth_llm_call(llm, parts)
    except Exception as ex:
        print(f"[SYNTH] LLM failed after retries: {ex}")
        response = "\n".join(parts)
    print(f"[SYNTH] response: {len(response)} chars")
    return {"final_response": response, "agents_activated": ["synthesizer"]}

# Graph construction + run function

In [None]:
# Maps agent names returned by the LLM to actual graph node names; needed
# because the router sometimes returns names that are not node names.
# Lookups are done on a squashed form of the name (lower-cased, with spaces,
# underscores and hyphens removed), so only keys without those characters can
# ever match.
AGENT_NAME_MAP = {
    # research-related
    "research": "research_analyst", "researcher": "research_analyst", "research_analyst": "research_analyst",
    # squashed form of the canonical node name; without it the correct answer
    # "research_analyst" is not recognized
    "researchanalyst": "research_analyst",
    "research assistant": "research_analyst", "researchassistant": "research_analyst",
    "theory agent": "research_analyst", "theoryagent": "research_analyst",
    "literature": "research_analyst", "analyst": "research_analyst",
    "researchagent": "research_analyst", "theoryanalysis": "research_analyst",
    "researchdesign": "research_analyst", "researchplanningagent": "research_analyst",
    "literaturereview": "research_analyst", "literatureagent": "research_analyst",
    # hypothesis-related
    "hypothesis": "hypothesis_generator", "hypothesisgenerator": "hypothesis_generator",
    "hypothesis_generator": "hypothesis_generator", "hypothesisdesignagent": "hypothesis_generator",
    "design": "hypothesis_generator", "designer": "hypothesis_generator",
    "multiagentsystemsagent": "hypothesis_generator", "multiagentarchitect": "hypothesis_generator",
    "systemarchitecture": "hypothesis_generator", "systemdesignagent": "hypothesis_generator",
    "designagents": "hypothesis_generator", "trizexperts": "hypothesis_generator",
    "trizagent": "hypothesis_generator", "architectureagent": "hypothesis_generator",
    "multiagentsystemarchitect": "hypothesis_generator",
    # experiment-related
    "experiment": "experiment_designer", "experimentdesigner": "experiment_designer",
    "experiment_designer": "experiment_designer", "implementation": "experiment_designer",
    "planner": "experiment_designer", "researchplanner": "experiment_designer",
    "memorysystemdesigner": "experiment_designer", "memorymanagementagent": "experiment_designer",
    "langgraphdeveloperagent": "experiment_designer", "errorhandlingspecialist": "experiment_designer",
    "implementationagent": "experiment_designer", "codeagent": "experiment_designer",
    "developeragent": "experiment_designer", "practicalagent": "experiment_designer",
}

In [None]:
def normalize_agent_name(name):
    """Map an LLM-returned agent name to a graph node name, or None.

    Matching ignores case, spaces, underscores and hyphens on BOTH sides, so
    map keys written with underscores or spaces (such as "research_analyst"
    or "theory agent") are honored too. Non-string input yields None.
    """
    def _squash(text):
        return text.lower().replace(" ", "").replace("_", "").replace("-", "")

    if not isinstance(name, str):
        return None

    wanted = _squash(name)
    # Fast path: the squashed name is itself a key.
    direct = AGENT_NAME_MAP.get(wanted)
    if direct is not None:
        return direct
    # Slow path: compare against the squashed form of every alias.
    for alias, node in AGENT_NAME_MAP.items():
        if _squash(alias) == wanted:
            return node
    return None

In [None]:
def get_target_agents_normalized(target_agents):
    """Return the set of node names that the given agent names map to.

    Names that normalize_agent_name cannot map are dropped.
    """
    resolved = (normalize_agent_name(agent) for agent in target_agents)
    return {node for node in resolved if node}

In [None]:
def route_after_router(state):
    """Pick the node that follows the router.

    Memory retrieval comes first when the classification asks for it;
    otherwise "implementation" queries go straight to the experiment designer
    and every other query type (or a missing classification) starts with the
    research analyst.
    """
    classification = state.get("classification")
    if not classification:
        return "research_analyst"
    if classification.needs_memory:
        return "memory_retrieval"
    if classification.query_type == "implementation":
        return "experiment_designer"
    # conceptual, design and planning all start with research
    return "research_analyst"

In [None]:
def route_after_memory(state):
    """Pick the node that follows memory retrieval.

    The router's target agents are normalized to node names and the first
    match in pipeline order (research, hypothesis, experiment) wins. If none
    of them maps to a node, routing falls back to the query type.
    """
    classification = state.get("classification")
    if not classification:
        return "research_analyst"

    normalized_agents = get_target_agents_normalized(classification.target_agents)
    print(f"[ROUTE] target_agents={classification.target_agents} -> normalized={normalized_agents}")

    # Earliest pipeline stage among the requested agents wins.
    for node in ("research_analyst", "hypothesis_generator", "experiment_designer"):
        if node in normalized_agents:
            return node

    print(f"[ROUTE] fallback to query_type={classification.query_type}")
    if classification.query_type == "implementation":
        return "experiment_designer"
    # conceptual, design (needs research first) and planning
    return "research_analyst"

In [None]:
def route_after_research(state):
    """Pick the node that follows the research analyst.

    "design" and "planning" queries continue to the hypothesis generator;
    everything else (or a missing classification) goes to synthesis.
    """
    classification = state.get("classification")
    if classification and classification.query_type in ("design", "planning"):
        return "hypothesis_generator"
    return "synth"

In [None]:
def route_after_hypothesis(state):
    """Pick the node that follows the hypothesis generator.

    Only "planning" queries continue to the experiment designer; everything
    else (or a missing classification) goes to synthesis.
    """
    classification = state.get("classification")
    if classification and classification.query_type == "planning":
        return "experiment_designer"
    return "synth"

In [None]:
# Build the graph: register one node per agent, wire the conditional routes,
# then compile.
workflow = StateGraph(AgentState)
workflow.add_node("router", router_node)
workflow.add_node("memory_retrieval", mem_ret_node)
workflow.add_node("research_analyst", research_node)
workflow.add_node("hypothesis_generator", hypothesis_node)
workflow.add_node("experiment_designer", experiment_node)
workflow.add_node("synth", synth_node)
workflow.add_node("memory_update", mem_upd_node)
# Every run starts at the router.
workflow.set_entry_point("router")
# router -> memory retrieval / research / experiment.
# NOTE: "hypothesis_generator" is listed as a target here, but
# route_after_router never returns it.
workflow.add_conditional_edges("router", route_after_router, {
    "memory_retrieval": "memory_retrieval", "research_analyst": "research_analyst",
    "hypothesis_generator": "hypothesis_generator", "experiment_designer": "experiment_designer"
})
# memory retrieval -> first worker agent, chosen from the normalized target
# agents (falling back to the query type).
workflow.add_conditional_edges("memory_retrieval", route_after_memory, {
    "research_analyst": "research_analyst", "hypothesis_generator": "hypothesis_generator",
    "experiment_designer": "experiment_designer"
})
# research -> hypothesis (design/planning queries) or straight to synthesis.
workflow.add_conditional_edges("research_analyst", route_after_research, {
    "hypothesis_generator": "hypothesis_generator", "synth": "synth"
})
# hypothesis -> experiment (planning queries) or synthesis.
workflow.add_conditional_edges("hypothesis_generator", route_after_hypothesis, {
    "experiment_designer": "experiment_designer", "synth": "synth"
})
# Fixed tail of every run: experiment -> synth -> memory update -> END.
workflow.add_edge("experiment_designer", "synth")
workflow.add_edge("synth", "memory_update")
workflow.add_edge("memory_update", END)
graph = workflow.compile()
# Print the graph as a Mermaid diagram for visual inspection.
print(graph.get_graph().draw_mermaid())

In [None]:
def run(q):
    """Run one query through the multi-agent graph and print the outcome.

    Returns the final graph state, or an empty dict when the graph returns
    None or raises (the error and traceback are printed in that case).
    """
    bar = '---'
    print(f"\n{bar}\nQUERY: {q}\n{bar}")
    try:
        final_state = graph.invoke(create_initial_state(q))
        if final_state is None:
            print("[ERROR] graph returned None")
            return {}
        answer = final_state.get('final_response', '')
        agents = set(final_state.get('agents_activated', []))
        print(f"\n{bar}\nRESPONSE:\n{bar}\n{answer}\n{bar}\nAgents: {agents}\n")
        return final_state
    except Exception as e:
        print(f"[ERROR] graph execution failed: {e}")
        import traceback
        traceback.print_exc()
        return {}

# Experiments

In [None]:
# Exp 1: conceptual query.
# If the router classifies it as "conceptual" without needing memory, the
# route is router -> research_analyst -> synth -> memory_update.
r1 = run("What are the benefits of multi-agent systems for LLM orchestration vs single-agent approaches?")
if r1:
    c1 = r1.get('classification')
    # Report the detected query type and how many papers the arXiv tool found.
    print(f"Analysis: Type={c1.query_type if c1 else 'N/A'}, Papers={r1.get('literature_data',{}).get('papers_found',0) if r1.get('literature_data') else 0}")

In [None]:
# Exp 2: design query.
# If classified as "design", research is followed by the hypothesis generator
# before synthesis.
r2 = run("Design a hypothesis about improving agent communication in multi-agent LLM systems")
if r2:
    h2 = r2.get('hypothesis')
    nv2 = r2.get('novelty_score', {})
    # Report the first 60 characters of the hypothesis and its heuristic novelty.
    print(f"Analysis: Hypothesis={h2.statement[:60] if h2 else 'N/A'}..., Novelty={nv2.get('score','N/A') if nv2 else 'N/A'}")

In [None]:
# Exp 3: implementation query.
# If classified as "implementation" without needing memory, the router sends
# it straight to the experiment designer.
r3 = run("How to implement tool calling with error handling in LangGraph agents?")
if r3:
    c3 = r3.get('classification')
    # Report the detected query type and the set of agents that actually ran.
    print(f"Analysis: Type={c3.query_type if c3 else 'N/A'}, Agents={set(r3.get('agents_activated',[]))}")

In [None]:
# Exp 4: planning query.
# If classified as "planning", the full pipeline runs:
# research -> hypothesis -> experiment -> synth.
r4 = run("Create a research plan to investigate memory management strategies in multi-agent conversational AI")
if r4:
    e4 = r4.get('experiment_plan')
    f4 = r4.get('feasibility_score', {})
    # Report the number of plan steps and the heuristic feasibility category.
    print(f"Analysis: Steps={len(e4.steps) if e4 else 0}, Feasibility={f4.get('category','N/A') if f4 else 'N/A'}")

In [None]:
# Exp 5: follow-up query (tests memory).
# Relies on the interactions stored by the earlier experiments; memory is only
# retrieved into the state if the router sets needs_memory.
r5 = run("Expand on the previous hypothesis with more TRIZ principles")
if r5:
    c5 = r5.get('classification')
    # Report whether the router saw a follow-up and whether memory context was set.
    print(f"Analysis: Is followup={c5.is_followup if c5 else 'N/A'}, Memory used={r5.get('memory_context') is not None}")

In [None]:
# Additional miscellaneous queries on other topics, one per query style
# (theory, design, implementation, literature summary).
queries = [
    "Explain the theoretical trade-offs between Centralized Training with Decentralized Execution (CTDE) and fully decentralized learning in Multi-Agent Reinforcement Learning (MARL).",

    "Design a fault-tolerant multi-agent architecture for a cross-chain bridge that monitors liquidity pools and validates transactions between Ethereum and Solana.",

    "How to implement a custom shared experience replay buffer for multiple DQN agents using Ray and Redis in Python?",

    "Summarize the latest research trends in using Large Language Models for optimizing SQL query execution plans in distributed databases.",
    ]

# Run each query through the graph; the returned states are not kept.
for q in queries:
    run(q)


# Reflection and Evaluation

What worked:

The system handles routing well. In one of the examples the router hallucinated agent names, which resulted in an empty normalization set; the system then successfully triggered its fallback mechanism, so the request proceeded instead of crashing.

Memory management proved effective for follow-up queries. In Exp 5, the request to expand on the previous hypothesis pulled the stored context from memory, which allowed the agent to refine the specific TRIZ principles generated in Exp 2 without the query having to repeat the details.

Unexpected behaviour and failures:

In most of the experiments the system invented new agent names. I solved this by hardcoding a mapping from those names to the real ones, but I suspect it is really a matter of better prompting. I also count my hand-made tools for the feasibility and novelty calculations as failures, because they are simple heuristics and do not provide true intelligence.

Areas for growing:

Adding a reviewer agent and running it before the synthesis step would be a strong improvement. Using vector memory instead of just a JSON file would also help, because it would allow the system to retrieve specific segments of past conversations.

# Evaluation 

1) Routing

The router agent correctly classified the experiment queries into the appropriate types.

2) Subjective usefulness

The experiment outputs included real arXiv papers and structured hypotheses with novelty scores, even though I used simple heuristics for the novelty and feasibility calculations.

3) Did memory help?

Memory was most impactful in Experiment 5 (the follow-up query), so I can say that memory really helped.

4) Did tool calls make sense?

Yes, the tool calls made sense: the arXiv search tool supplied real papers, and even though the novelty and feasibility calculation tools feel synthetic to me, I still consider them useful.