In [15]:
import json
import os
from datetime import  datetime, timedelta
from typing import Any, Dict, Optional, Tuple
 
import pandas as pd
from dotenv import load_dotenv
from langfuse import Langfuse
from langfuse.openai import AzureOpenAI
 
load_dotenv()
 
langfuse = Langfuse()  # Auto-reads LANGFUSE_* env vars
 
AZURE_CLIENT = AzureOpenAI(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2024-08-01-preview",
)
JUDGE_MODEL = os.getenv("AZURE_OPENAI_MODEL")

In [22]:
x = langfuse.api.trace.get("3c7300a2c603e23e4cd87f07cdc5c34f")
y=x.dict()
y

{'id': '3c7300a2c603e23e4cd87f07cdc5c34f',
 'timestamp': datetime.datetime(2025, 11, 20, 9, 28, 20, 473000, tzinfo=datetime.timezone.utc),
 'name': 'call_llm',
 'input': {'model': 'azure/gpt-4o-mini',
  'config': {'system_instruction': '\n        **Role:** Host agent for the agent-to-agent protocol; delegates queries to specialized remote agents with maximum efficiency.\n\n**Core Directives:**\n\n* **Task Delegation:** Use the `send_message` function to assign precise, actionable tasks to remote agents.\n* **Full Context Provision:** If an agent repeatedly asks for user confirmation, it likely lacks conversation history. Include all relevant context in the task to prevent this.\n* **Autonomous Multi-Agent Engagement:** Engage any required agents directly—never seek user permission or preference. If multiple agents are needed, orchestrate them seamlessly.\n* **Intelligent Inter-Agent Collaboration:** Instruct agents to determine if they need data from another agent. **If Agent A says "I

In [25]:
input= y["input"]["contents"][0]["parts"][0]["text"]
print(input)
output=y["observations"][0]["output"]["result"]["artifacts"][0]["parts"][0]["text"]
print(output)
steps = []
import pprint
for item in y["input"]["contents"]:
    parts = item.get("parts", [])
    for part in parts:
        if "function_call" in part:
            args = part["function_call"]["args"]
            steps.append(f"Agent '{args.get('agent_name')}' is assigned a task {args.get('task')}")

        if "function_response" in part:
            artifacts = part["function_response"]["response"]["result"]["artifacts"]
            collected_texts = []
            for artifact in artifacts:
                for p in artifact.get("parts", []):
                    if p.get("kind") == "text":
                        collected_texts.append(p.get("text"))
            steps.append(f"Agent response: {' '.join(collected_texts)}")

output_parts = y["output"]["content"].get("parts", [])
collected_texts = []
for part in output_parts:
    if "function_call" in part:
        args = part["function_call"]["args"]
        collected_texts.append(f"Agent '{args.get('agent_name')}' was assigned a task {args.get('task')}")

steps.append(f"Final {' '.join(collected_texts)}")

pprint.pprint(steps)

what is the capital of Pakistan and write 10 lines about it.
Islamabad is the capital city of Pakistan, officially established in 1967 to replace Karachi as the nation's capital. Located in the northern part of the country near the Margalla Hills, the city was purposefully designed and built as a modern planned capital. The city is known for its well-organized layout, wide tree-lined avenues, and systematic sectoral divisions that create an orderly urban environment. Islamabad houses important government buildings, including the Presidential Palace, Parliament House, and Supreme Court of Pakistan. The city is characterized by its clean, green environment with numerous parks, gardens, and the scenic Margalla Hills National Park nearby. Home to several prestigious educational institutions like Quaid-i-Azam University and the International Islamic University, it serves as an important academic center. The population is diverse, consisting of government officials, diplomats, professionals,

In [None]:
# def extract_tool_calls(trace) -> str:
#     tools = []
#     for span in getattr(trace, "spans", []) or []:
#         name = getattr(span, "name", "")
#         if name in ["tool", "function", "retriever"] or ("tool" in name.lower()):
#             tools.append(f"- {name}: {getattr(span, 'input', '')}")
#     return "\n".join(tools) or "No tools used"

In [40]:
def extract_tool_calls(trace) -> int:
    """
    Accurately counts the number of agent reasoning steps:
    - Each LLM generation (call_llm)
    - Each meaningful tool execution (especially send_message to remote agents)
    """
    observations = trace.get('observations', [])
    
    llm_calls = 0
    tool_calls = 0

    for obs in observations:
        obs_type = obs.get('type')
        obs_name = str(obs.get('name', '')).lower()
        if obs_type == 'GENERATION' and 'call_llm' in obs_name:
            llm_calls += 1
        if obs_type == 'TOOL' and 'send_message' in obs_name:
            tool_calls += 1
        if obs_type == 'SPAN' and 'send_message' in obs_name:
            pass 

    total_steps = llm_calls + tool_calls
    print(f"LLM calls: {llm_calls}, Tool calls (send_message): {tool_calls}")
    return total_steps

In [39]:
count_agent_steps(y)

LLM calls: 1, Tool calls (send_message): 1


2

In [42]:
import json
import os
from datetime import  datetime, timedelta
from typing import Any, Dict, Optional, Tuple
 
import pandas as pd
from dotenv import load_dotenv
from langfuse import Langfuse
from langfuse.openai import AzureOpenAI
 
load_dotenv()
 
langfuse = Langfuse()  # Auto-reads LANGFUSE_* env vars
 
AZURE_CLIENT = AzureOpenAI(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2024-08-01-preview",
)
JUDGE_MODEL = os.getenv("AZURE_OPENAI_MODEL")

def evaluate_agent_trace(trace_id: str) -> Dict[str, Any]:
    trace = y
    if trace is None:
        print(f"Trace {trace_id} could not be fetched; skipping score calculation.")
        return {}
    question = input
    final_answer = output
    full_conversation = steps
 
    scores: Dict[str, Any] = {}
 
    scores["goal_completion"] = llm_judge(
        prompt_template="""
Question: {question}
Final Answer: {answer}
 
Did the agent fully solve the user's original goal or question?
Score 0.0–1.0 (1.0 = completely solved, no missing parts)
Respond ONLY with valid JSON: {{"score": 0.XX, "explanation": "one short sentence"}}
        """,
        question=question,
        answer=final_answer,
    )
 
    scores["tool_selection_accuracy"] = llm_judge(
        prompt_template="""
Question: {question}
All tool calls made: {tools_used}
 
Did the agent select the correct tools and in a logical order?
Score 0.0–1.0
JSON only: {{"score": 0.XX, "explanation": "..."}}
        """,
        question=question,
        tools_used=extract_tool_calls(trace),
    )
 
    scores["reasoning_quality"] = llm_judge(
        prompt_template="""
Full conversation (including thoughts and tool results):
{conversation}
 
Are the intermediate reasoning steps logical, coherent, and free of major jumps?
Score 0.0–1.0
JSON only.
        """,
        conversation=full_conversation,
    )
 
    scores["hallucinated_tool_output"] = llm_judge(
        prompt_template="""
Question: {question}
Conversation with tool results: {conversation}
 
Did the agent ever invent or hallucinate a tool result that wasn't actually returned?
0.0 = yes (bad), 1.0 = no hallucinations
JSON only.
        """,
        question=question,
        conversation=full_conversation,
    )
 
    scores["conciseness"] = llm_judge(
        prompt_template="""
Final Answer: {answer}
 
Is the final response concise and to-the-point (no unnecessary fluff/repetition)?
Score 0.0–1.0
JSON only.
        """,
        answer=final_answer,
    )
 
    start = getattr(trace, "start_time", None) or getattr(trace, "timestamp", None)
    end = getattr(trace, "end_time", None) or datetime.now()
    latency = (end - start).total_seconds() if (start and end) else None
 
    scores["total_steps"] = steps
    scores["latency_seconds"] = round(latency, 2) if latency is not None else None
    usage = getattr(trace, "usage", None)
    scores["total_tokens"] = (
        usage.total_tokens
        if usage and getattr(usage, "total_tokens", None) is not None
        else getattr(trace, "total_tokens", 0)
    )
    scores["task_success_binary"] = 1.0 if scores.get("goal_completion", 0) >= 0.90 else 0.0


    scores["plan_adherence"] = llm_judge(
        prompt_template="""
    Full trace (thoughts + actions): {conversation}

    First, extract the agent's explicit or implicit plan from the reasoning steps.
    Then evaluate: Did the agent follow its own plan without major unnecessary deviations?
    Score 0.0–1.0 (1.0 = perfectly adhered to a reasonable plan)
    Respond ONLY with valid JSON: {{"score": 0.XX, "explanation": "short reasoning about adherence"}}
        """,
        conversation=full_conversation,
    )

    scores["plan_quality"] = llm_judge(
        prompt_template="""
    Task/Question: {question}
    Extracted plan from agent's thoughts: {conversation}

    Evaluate the quality of the plan itself (regardless of execution): Is it logical, efficient, comprehensive, and likely to solve the task with minimal steps?
    Score 0.0–1.0 (1.0 = optimal or near-optimal plan)
    Respond ONLY with valid JSON: {{"score": 0.XX, "explanation": "short reasoning about plan quality"}}
        """,
        question=question,
        conversation=full_conversation,
    )

 
    score_descriptions = {
    # Original LLM-as-judge metrics
    "goal_completion":              "Did the agent fully solve the user's original goal/question?",
    "tool_selection_accuracy":      "Did the agent select the correct tools in a logical order?",
    "reasoning_quality":            "Are intermediate reasoning steps logical, coherent, no major jumps?",
    "hallucinated_tool_output":     "1.0 = no invented/hallucinated tool results (lower = bad)",
    "conciseness":                  "Is the final answer concise with no unnecessary fluff/repetition?",

    # New requested LLM-as-judge metrics
    "argument_correctness":         "Are arguments passed to every tool call correct, complete, and appropriate?",
    "plan_adherence":               "Did the agent faithfully follow its own (explicit or implicit) plan?",
    "plan_quality":                 "How good is the plan itself: logical, efficient, comprehensive, minimal steps?",

    # Additional recommended LLM-as-judge metrics
    "tool_efficiency":              "Did the agent use the minimal reasonable number of tool calls (no redundancy)?",
    "error_recovery":               "When tools failed or gave unexpected results, did the agent notice and recover well?",
    "state_awareness":              "Did the agent track obtained information and avoid repeating queries/actions?",
    "reasoning_traceability":       "Are the agent's thoughts clearly explained and easy for a human to follow?",
    "answer_groundedness":          "Is every claim in the final answer directly supported by real tool results?",
    "step_efficiency":              "Given task difficulty, is the number of steps/reasoning cycles reasonable?",
    "total_steps":                  "Number of reasoning + tool steps (lower = more efficient)",
    "latency_seconds":              "End-to-end latency in seconds (lower = faster)",
    "total_tokens":                 "Total tokens consumed across the whole trajectory",
    "task_success_binary":          "Strict binary success (1 if goal_completion ≥ 0.9 else 0)",
    }
 
    for name, value in scores.items():
        langfuse.create_score(
            trace_id=trace_id,
            name=name,
            value=float(value) if isinstance(value, (int, float)) else 0.0,
            comment=score_descriptions.get(name, ""),
            data_type="NUMERIC",
        )
 
    metric_summary = ", ".join(
        f"{name}={value:.3f}"
        for name, value in scores.items()
        if isinstance(value, (int, float))
    )
    print(f"Evaluated trace {trace_id} → {metric_summary}")
    return scores
 
 
def llm_judge(prompt_template: str, **kwargs) -> float:
    # if not AZURE_CLIENT or not JUDGE_MODEL:
    #     print("Azure OpenAI configuration missing. Returning 0.0 score.")
    #     return 0.0
 
    full_prompt = prompt_template.strip().format(**kwargs)
 
    try:
        response = AZURE_CLIENT.chat.completions.create(
            model=JUDGE_MODEL,
            temperature=0.0,
            response_format={"type": "json_object"},
            max_tokens=500,
            messages=[{"role": "system", "content": full_prompt}],
        )
        content = response.choices[0].message.content.strip()
        result = json.loads(content)
        return float(result.get("score", 0.0))
    except Exception as e:
        raw_content = locals().get("content", "N/A")
        print(f"Judge failed: {e}\nRaw: {raw_content}")
        return 0.0
    
evaluate_agent_trace(trace_id="3c7300a2c603e23e4cd87f07cdc5c34f")

LLM calls: 1, Tool calls (send_message): 1
Evaluated trace 3c7300a2c603e23e4cd87f07cdc5c34f → goal_completion=1.000, tool_selection_accuracy=1.000, reasoning_quality=1.000, hallucinated_tool_output=0.000, conciseness=0.500, total_tokens=0.000, task_success_binary=1.000, plan_adherence=1.000, plan_quality=0.800


{'goal_completion': 1.0,
 'tool_selection_accuracy': 1.0,
 'reasoning_quality': 1.0,
 'hallucinated_tool_output': 0.0,
 'conciseness': 0.5,
 'total_steps': ["Agent 'Capital Agent' is assigned a task User requested the capital of Pakistan and a 10-line description about it.",
  'Agent response: Islamabad',
  "Final Agent 'General Q&A Agent' was assigned a task User asked for a 10-line description of Islamabad, the capital of Pakistan."],
 'latency_seconds': None,
 'total_tokens': 0,
 'task_success_binary': 1.0,
 'plan_adherence': 1.0,
 'plan_quality': 0.8}

In [29]:
evaluate_agent_trace(trace_id="3c7300a2c603e23e4cd87f07cdc5c34f")

Azure OpenAI configuration missing. Returning 0.0 score.
Azure OpenAI configuration missing. Returning 0.0 score.
Azure OpenAI configuration missing. Returning 0.0 score.
Azure OpenAI configuration missing. Returning 0.0 score.
Azure OpenAI configuration missing. Returning 0.0 score.
Evaluated trace 3c7300a2c603e23e4cd87f07cdc5c34f → goal_completion=0.000, tool_selection_accuracy=0.000, reasoning_quality=0.000, hallucinated_tool_output=0.000, conciseness=0.000, total_steps=0.000, total_tokens=0.000, task_success_binary=0.000


{'goal_completion': 0.0,
 'tool_selection_accuracy': 0.0,
 'reasoning_quality': 0.0,
 'hallucinated_tool_output': 0.0,
 'conciseness': 0.0,
 'total_steps': 0,
 'latency_seconds': None,
 'total_tokens': 0,
 'task_success_binary': 0.0}