In [1]:
import os
import pandas as pd
from datetime import datetime, timedelta
from langfuse import Langfuse
from dotenv import load_dotenv

load_dotenv()

langfuse = Langfuse()

In [11]:
x = langfuse.api.trace.get("8c7d7623175e8a8be5a57756694985ce").dict()

In [12]:
x

{'id': '8c7d7623175e8a8be5a57756694985ce',
 'timestamp': datetime.datetime(2025, 11, 19, 10, 52, 36, 215000, tzinfo=datetime.timezone.utc),
 'name': 'send_message',
 'input': {'args': [],
  'kwargs': {'message_request': {'id': 'f8f0f333-e973-4d4e-b505-4be358bdd492',
    'jsonrpc': '2.0',
    'method': 'message/send',
    'params': {'configuration': None,
     'message': {'contextId': 'c438b2c3-e9ca-4d53-acf6-34a5cc252660',
      'extensions': None,
      'kind': 'message',
      'messageId': 'f8f0f333-e973-4d4e-b505-4be358bdd492',
      'metadata': None,
      'parts': [{'kind': 'text',
        'metadata': None,
        'text': 'User is inquiring about the capital of India and wants a detailed description in ten lines.'}],
      'referenceTaskIds': None,
      'role': 'user',
      'taskId': None},
     'metadata': None}}}},
 'output': {'id': 'f8f0f333-e973-4d4e-b505-4be358bdd492',
  'jsonrpc': '2.0',
  'result': {'artifacts': [{'artifactId': '8c59de2a-2198-4d53-92c2-6a375b3f2b2f',
   

In [None]:
# FINAL_WORKING_DEEPEVAL_NOV19_2025.py
import ast
from typing import Dict, List, Any

# DeepEval - latest 2025 API
from deepeval import evaluate
from deepeval.metrics import GEval, FaithfulnessMetric, ToxicityMetric
from deepeval.test_case import LLMTestCase
from deepeval.test_case import LLMTestCaseParams  # ← CRITICAL
import pprint
# Langfuse
from langfuse import Langfuse

# ===============================================================
# 1. YOUR TRACE (paste full string)
# ===============================================================
# raw_trace = '''{'id': '59b7f43597ffd7104da70b5ecd0b2d4b', ... 'externalId': None}'''
# trace_dict: Dict[str, Any] = ast.literal_eval(raw_trace)

# ===============================================================
# 2. Extract steps
# ===============================================================
def extract_steps(contents: List[Dict]) -> str:
    steps = []
    for part in contents:
        if part.get("role") == "user":
            for p in part["parts"]:
                if "text" in p:
                    steps.append(f"USER: {p['text']}")
                elif "function_response" in p:
                    try:
                        txt = p["function_response"]["response"]["result"]["artifacts"][0]["parts"][0]["text"]
                        steps.append(f"TOOL OUTPUT:\n{txt.strip()}")
                    except:
                        pass
        elif part.get("role") == "model":
            for p in part["parts"]:
                if "function_call" in p:
                    fc = p["function_call"]
                    agent = fc["args"]["agent_name"]
                    task = fc["args"]["task"]
                    steps.append(f"→ CALL: send_message('{agent}')\n    Task: {task}")
                elif "text" in p:
                    steps.append(f"FINAL ANSWER: {p['text']}")
    return "\n\n".join(steps)



# ===============================================================
# 3. Build test case
# ===============================================================
def build_test_case(trace: Dict) -> LLMTestCase:
    contents = trace["input"]["contents"]
    output = trace["output"]["content"]["parts"][0]["text"]
    user_query = next((p["text"] for c in contents if c["role"] == "user" for p in c["parts"] if "text" in p), "")

    full_trace = "=== FULL AGENT ORCHESTRATION TRACE ===\n" + extract_steps(contents)

    return LLMTestCase(
        input=user_query,
        actual_output=output,
        retrieval_context=[full_trace],  # This is how you pass intermediate steps now
    )

# ===============================================================
# 4. METRICS — ONLY THIS WAY WORKS IN NOV 2025
# ===============================================================
def get_metrics():
    return [
        GEval(
            name="Multi-Agent Orchestration Quality",
            criteria="Rate how well the routing agent orchestrated other agents using the trace in retrieval_context.",
            evaluation_steps=[
                "Did it call Get Dept Number first?",
                "Then Get Department Manager with d001?",
                "Only 2 calls total?",
                "No loops, no user involvement?",
                "Final answer is direct and correct?"
            ],
            evaluation_params=[
                LLMTestCaseParams.INPUT,
                LLMTestCaseParams.ACTUAL_OUTPUT,
                LLMTestCaseParams.RETRIEVAL_CONTEXT,
            ],
            model="gpt-4o-mini",
            threshold=0.8
        ),
        GEval(
            name="Efficiency & Minimal Steps",
            criteria="Was the task solved in the absolute minimum number of steps?",
            evaluation_steps=[
                "Exactly 2 agent calls",
                "No redundancy",
                "Perfect dependency resolution"
            ],
            evaluation_params=[
                LLMTestCaseParams.ACTUAL_OUTPUT,
                LLMTestCaseParams.RETRIEVAL_CONTEXT,
            ],
            model="gpt-4o-mini",
            threshold=0.9
        ),
        FaithfulnessMetric(threshold=0.9),
        ToxicityMetric(threshold=0.1),
    ]

# ===============================================================
# 5. Evaluate + Log
# ===============================================================
def evaluate_and_log(trace: Dict):
    test_case = build_test_case(trace)
    metrics = get_metrics()

    print("Running DeepEval evaluation...")
    results = evaluate(test_cases=[test_case], metrics=metrics)  # ← Correct call
    result = results.test_results[0]
    pprint.pprint(result)

    # scores = {
    #     "orchestration_quality": result.score_breakdown["Multi-Agent Orchestration Quality"],
    #     "efficiency_score": result.score_breakdown["Efficiency & Minimal Steps"],
    #     "faithfulness": result.score_breakdown["Faithfulness"],
    #     "toxicity": result.score_breakdown["Toxicity"],
    # }
    # scores["task_success"] = 1.0 if scores["orchestration_quality"] >= 8.5 and scores["faithfulness"] >= 0.9 else 0.0

    # # Optional: Langfuse
    # try:
    #     langfuse = Langfuse()
    #     lf_trace = langfuse.trace(name="Routing Agent Eval", input=test_case.input, output=test_case.actual_output)
    #     for k, v in scores.items():
    #         lf_trace.score(name=k.replace("_", "-"), value=float(v))
    # except:
    #     pass  # Ignore Langfuse warning

    # print("\n" + "="*60)
    # print("EVALUATION COMPLETE - NOV 19, 2025")
    # print("="*60)
    # print(f"Orchestration Quality : {scores['orchestration_quality']:.2f}/10")
    # print(f"Efficiency            : {scores['efficiency_score']:.2f}/10")
    # print(f"Faithfulness          : {scores['faithfulness']:.2f}")
    # print(f"Toxicity              : {scores['toxicity']:.2f}")
    # print(f"Task Success          : {'YES' if scores['task_success'] else 'NO'}")
    # print("="*60)

# ===============================================================
# 6. RUN
# ===============================================================
if __name__ == "__main__":
    evaluate_and_log(x)

KeyError: 'task'