# ODSC 25 AI Agent Evaluation Workshop

## Initial setup

Install dependencies.

In [15]:
%pip install openai pandas pydantic weave --quiet

/Users/emmanuel.turlay/Code/odsc-2025-agent-eval/.venv/bin/python: No module named pip
Note: you may need to restart the kernel to use updated packages.


Set environment variables.

In [16]:
import os
import dotenv

dotenv.load_dotenv()

# os.environ["OPENAI_API_KEY"] = ""
#os.environ["WANDB_API_KEY"] = ""

True

Sign up at https://wandb.com and go to https://wandb.ai/authorize to get your API key.

## Data analysis agent

### Definition

In [17]:
from typing import Optional, Dict, List, Any, Union
import pandas as pd
import weave
from openai import OpenAI
from pydantic import Field
import json

class DataAnalysisAgent(weave.Model):

    df: Optional[pd.DataFrame] = None

    conversation_history: List[Dict[str, Any]] = Field(default_factory=list)

    client: OpenAI = Field(default_factory=lambda: OpenAI(api_key=os.environ.get("OPENAI_API_KEY")))

    SYSTEM_PROMPT: weave.StringPrompt = Field(
        default_factory=lambda: weave.StringPrompt("""You are a data analysis assistant. You help users analyze datasets by using available tools.
                
When analyzing data:
1. First load the dataset if not already loaded
2. Understand what the user is asking
3. Use appropriate tools to gather information
4. Provide clear, accurate answers based on the data

Always explain your findings clearly and relate them back to the user's question.

Files are located in the data directory. For example, tips.csv is at data/tips.csv. Always use the correct file path.
""")
    )
    
    def model_post_init(self, __context: Any) -> None:
        """Called after the model is initialized"""
        super().model_post_init(__context)
        weave.publish(self.SYSTEM_PROMPT)

    @property
    def tool_registry(self) -> Dict[str, Any]:
        return {
            "load_csv": self.load_csv,
            "get_summary_statistics": self.get_summary_statistics,
            "calculate_correlation": self.calculate_correlation,
            "group_and_aggregate": self.group_and_aggregate,
            "filter_data": self.filter_data
        }

    @property
    def tool_schemas(self) -> List[Dict[str, Any]]:
        """Define the tools available to the agent"""
        return [
            {
                "type": "function",
                "function": {
                    "name": "load_csv",
                    "description": "Load a CSV file into memory for analysis",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "file_path": {
                                "type": "string",
                                "description": "Path to the CSV file"
                            }
                        },
                        "required": ["file_path"]
                    }
                }
            },
            {
                "type": "function",
                "function": {
                    "name": "get_summary_statistics",
                    "description": "Get summary statistics (mean, std, min, max, etc.) for numeric columns",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "columns": {
                                "type": "array",
                                "items": {"type": "string"},
                                "description": "List of column names to analyze. If not provided, analyzes all numeric columns."
                            }
                        }
                    }
                }
            },
            {
                "type": "function",
                "function": {
                    "name": "calculate_correlation",
                    "description": "Calculate the correlation coefficient between two numeric columns",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "column1": {
                                "type": "string",
                                "description": "First column name"
                            },
                            "column2": {
                                "type": "string",
                                "description": "Second column name"
                            }
                        },
                        "required": ["column1", "column2"]
                    }
                }
            },
            {
                "type": "function",
                "function": {
                    "name": "group_and_aggregate",
                    "description": "Group data by a column and calculate aggregate statistics",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "group_by": {
                                "type": "string",
                                "description": "Column to group by"
                            },
                            "agg_column": {
                                "type": "string",
                                "description": "Column to aggregate"
                            },
                            "agg_function": {
                                "type": "string",
                                "enum": ["mean", "sum", "count", "median"],
                                "description": "Aggregation function to apply"
                            }
                        },
                        "required": ["group_by", "agg_column", "agg_function"]
                    }
                }
            },
            {
                "type": "function",
                "function": {
                    "name": "filter_data",
                    "description": "Filter the dataset based on a condition",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "column": {
                                "type": "string",
                                "description": "Column to filter on"
                            },
                            "operator": {
                                "type": "string",
                                "enum": [">", "<", "==", ">=", "<="],
                                "description": "Comparison operator"
                            },
                            "value": {
                                "type": "number",
                                "description": "Value to compare against"
                            }
                        },
                        "required": ["column", "operator", "value"]
                    }
                }
            }
        ]

    @weave.op
    def load_csv(self, file_path: str) -> Dict[str, Any]:
        """Load a CSV file into a pandas DataFrame"""
        try:
            self.df = pd.read_csv(file_path)
            return {
                "status": "success",
                "message": f"Loaded dataset with {len(self.df)} rows and {len(self.df.columns)} columns",
                "columns": list(self.df.columns),
                "shape": self.df.shape,
                "head": self.df.head(3).to_dict()
            }
        except Exception as e:
            return {"status": "error", "message": str(e)}
    
    @weave.op
    def get_summary_statistics(self, columns: Optional[List[str]] = None) -> Dict[str, Any]:
        """Get summary statistics for specified columns or all numeric columns"""
        if self.df is None:
            return {"status": "error", "message": "No dataset loaded"}
        
        try:
            if columns:
                stats = self.df[columns].describe().to_dict()
            else:
                stats = self.df.describe().to_dict()
            
            return {
                "status": "success",
                "statistics": stats
            }
        except Exception as e:
            return {"status": "error", "message": str(e)}
    
    @weave.op
    def calculate_correlation(self, column1: str, column2: str) -> Dict[str, Any]:
        """Calculate correlation between two columns"""
        if self.df is None:
            return {"status": "error", "message": "No dataset loaded"}
        
        try:
            correlation = self.df[column1].corr(self.df[column2])
            return {
                "status": "success",
                "column1": column1,
                "column2": column2,
                "correlation": float(correlation)
            }
        except Exception as e:
            return {"status": "error", "message": str(e)}
    
    @weave.op
    def group_and_aggregate(self, group_by: str, agg_column: str, agg_function: str = "mean") -> Dict[str, Any]:
        """Group by a column and aggregate another column"""
        if self.df is None:
            return {"status": "error", "message": "No dataset loaded"}
        
        try:
            if agg_function == "mean":
                result = self.df.groupby(group_by)[agg_column].mean()
            elif agg_function == "sum":
                result = self.df.groupby(group_by)[agg_column].sum()
            elif agg_function == "count":
                result = self.df.groupby(group_by)[agg_column].count()
            elif agg_function == "median":
                result = self.df.groupby(group_by)[agg_column].median()
            else:
                return {"status": "error", "message": f"Unsupported aggregation: {agg_function}"}
            
            return {
                "status": "success",
                "group_by": group_by,
                "agg_column": agg_column,
                "agg_function": agg_function,
                "result": result.to_dict()
            }
        except Exception as e:
            return {"status": "error", "message": str(e)}
    
    @weave.op
    def filter_data(self, column: str, operator: str, value: Union[int, float, str]) -> Dict[str, Any]:
        """Filter the dataset based on a condition"""
        if self.df is None:
            return {"status": "error", "message": "No dataset loaded"}
        
        try:
            if operator == ">":
                filtered = self.df[self.df[column] > value]
            elif operator == "<":
                filtered = self.df[self.df[column] < value]
            elif operator == "==":
                filtered = self.df[self.df[column] == value]
            elif operator == ">=":
                filtered = self.df[self.df[column] >= value]
            elif operator == "<=":
                filtered = self.df[self.df[column] <= value]
            else:
                return {"status": "error", "message": f"Unsupported operator: {operator}"}
            
            return {
                "status": "success",
                "rows_matched": len(filtered),
                "sample": filtered.head(5).to_dict()
            }
        except Exception as e:
            return {"status": "error", "message": str(e)}
    
    @weave.op
    def execute_tool(self, tool_name: str, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Execute a tool and return the result"""
        if tool_name in self.tool_registry:
            return self.tool_registry[tool_name](**arguments)

        return {"status": "error", "message": f"Unknown tool: {tool_name}"}
    
    @weave.op
    def predict(self, query: str, max_iterations: int = 10) -> Dict[str, Any]:
        """
        Run the agent on a query
        Returns: (final_answer, execution_trace)
        """
        # Initialize conversation
        messages = [
            {
                "role": "system",
                "content": self.SYSTEM_PROMPT.format()
            },
            {
                "role": "user",
                "content": query
            }
        ]
        
        # Track execution for evaluation
        execution_trace = {
            "query": query,
            "tool_calls": [],
            "iterations": 0
        }
        
        for iteration in range(max_iterations):
            execution_trace["iterations"] = iteration + 1
            
            # Get response from LLM
            response = self.client.chat.completions.create(
                model="gpt-4o-mini",
                messages=messages,
                tools=self.tool_schemas,
                tool_choice="auto"
            )
            
            message = response.choices[0].message
            messages.append(message)
            
            # Check if we're done
            if not message.tool_calls:
                final_answer = message.content
                execution_trace["final_answer"] = final_answer
                return {
                    "answer": final_answer,
                    "execution_trace": execution_trace
                }
            
            # Execute tool calls
            for tool_call in message.tool_calls:
                tool_name = tool_call.function.name
                arguments = json.loads(tool_call.function.arguments)
                
                # Execute the tool
                result = self.execute_tool(tool_name, arguments)
                
                # Record tool call for evaluation
                execution_trace["tool_calls"].append({
                    "tool": tool_name,
                    "arguments": arguments,
                    "result": result
                })
                
                # Add tool result to conversation
                messages.append({
                    "role": "tool",
                    "tool_call_id": tool_call.id,
                    "content": json.dumps(result)
                })
        
        # Max iterations reached
        return {
            "answer": "Error: Maximum iterations reached",
            "execution_trace": execution_trace
        }

### Execution

In [18]:
weave.init("odsc-2025-agent-eval")
agent = DataAnalysisAgent()
agent.predict("Load the tips dataset from tips.csv and tell me how many rows it has.")

[36m[1mweave[0m: 📦 Published to https://wandb.ai/wandb/odsc-2025-agent-eval/weave/objects/StringPrompt/versions/F8OC8RdKNTFAetj7uJYkUJHhHXK7GSewQGrAK7Z8jeI
[36m[1mweave[0m: 🍩 https://wandb.ai/wandb/odsc-2025-agent-eval/r/call/019a27ec-5ee2-7e60-9fc3-c48188275715


{'answer': 'The tips dataset has a total of **244 rows** and **7 columns**. If you need further analysis or information from this dataset, feel free to ask!',
 'execution_trace': {'query': 'Load the tips dataset from tips.csv and tell me how many rows it has.',
  'tool_calls': [{'tool': 'load_csv',
    'arguments': {'file_path': 'data/tips.csv'},
    'result': {'status': 'success',
     'message': 'Loaded dataset with 244 rows and 7 columns',
     'columns': ['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'],
     'shape': (244, 7),
     'head': {'total_bill': {0: 16.99, 1: 10.34, 2: 21.01},
      'tip': {0: 1.01, 1: 1.66, 2: 3.5},
      'sex': {0: 'Female', 1: 'Male', 2: 'Male'},
      'smoker': {0: 'No', 1: 'No', 2: 'No'},
      'day': {0: 'Sun', 1: 'Sun', 2: 'Sun'},
      'time': {0: 'Dinner', 1: 'Dinner', 2: 'Dinner'},
      'size': {0: 2, 1: 3, 2: 3}}}}],
  'iterations': 2,
  'final_answer': 'The tips dataset has a total of **244 rows** and **7 columns**. If you need fu

# Evaluation

Evaluations are made of three components:

* An evaluation dataset: a list of input prompts with associated ground truth.
* A set of scorers: the logic to evaluate the generated outputs against the ground truth.
* The agent to evaluate

### String-based evals

A basic evaluation technique that searches for exact string matches in the agent's output.
We don't expect this scorer to yield 100% pass scores, since LLMs can be fuzzy around math and rounding but we should keep a high threshold.

In [19]:
@weave.op
def exact_match_scorer(output: Dict[str, Any], expected_contains: str) -> Dict[str, bool]:
    """Score based on whether answer contains expected string"""
    answer: str = output.get("answer", "")
    contains = expected_contains.lower() in answer.lower()
    return {"correct": contains}

### Numerical accuracy scorer

This scorer will compare the actual numerical values and log the difference.

In [20]:
@weave.op
def numeric_accuracy_scorer(output: Dict[str, Any], ground_truth: float, tolerance: float = 0.1) -> Dict[str, Any]:
    """
    Extract numeric value from answer and compare to ground truth
    """
    answer = output.get("answer", "")
    
    # Simple numeric extraction (you might need more sophisticated parsing)
    import re
    numbers = re.findall(r'\d+\.?\d*', answer)
    
    if not numbers:
        return {"correct": False, "score": 0.0, "reason": "No numeric value found"}
    
    # Take the first number found
    for number in numbers:
        extracted = float(number)
        difference = abs(extracted - ground_truth)
        if difference <= tolerance:
            return {
                "correct": True,
                "score": 1.0,
                "extracted_value": extracted,
                "ground_truth": ground_truth,
                "difference": difference
            }

    return {
        "correct": False,
        "score": 0.0,
        "extracted_value": extracted,
        "ground_truth": ground_truth,
        "difference": difference
    }

### The evaluation dataset

In [21]:
GROUND_TRUTH_DATASET = weave.Dataset(
    name="Ground Truth Dataset",
    rows=[
        {
            "id": "avg_tip",
            "query": "What is the average tip amount?",
            "expected_contains": "2.99",
            "ground_truth": 2.99,
            "tolerance": 0.1,
            "expected_tools": ["get_summary_statistics"]
        },
        {
            "id": "avg_tip_percentage",
            "query": "What is the average tip percentage?",
            "expected_contains": "15.14",
            "ground_truth": 15.14,
            "tolerance": 0.5,
            "expected_tools": ["get_summary_statistics", "group_and_aggregate"]
        },
        {
            "id": "row_count",
            "query": "How many rows are in the dataset?",
            "expected_contains": "244",
            "ground_truth": 244,
            "tolerance": 0,
            "expected_tools": ["load_csv"]
        },
        {
            "id": "correlation",
            "query": "What is the correlation between total_bill and tip?",
            "expected_contains": "correlation",
            "ground_truth": 0.68,  # Approximate
            "tolerance": 0.1,
            "expected_tools": ["calculate_correlation"]
        }
    ]
)

### Run the evaluations

In [22]:
agent = DataAnalysisAgent()
    
# Load dataset first
agent.predict("Load tips.csv")

evaluation = weave.Evaluation(
    name="Ground Truth Evaluation",
    dataset=GROUND_TRUTH_DATASET,
    scorers=[
        exact_match_scorer,
        numeric_accuracy_scorer,
    ]
)
    
# Run evaluation
results = await evaluation.evaluate(agent)

[36m[1mweave[0m: 📦 Published to https://wandb.ai/wandb/odsc-2025-agent-eval/weave/objects/StringPrompt/versions/F8OC8RdKNTFAetj7uJYkUJHhHXK7GSewQGrAK7Z8jeI
[36m[1mweave[0m: 🍩 https://wandb.ai/wandb/odsc-2025-agent-eval/r/call/019a27ec-6937-71fe-88b2-e8959ba6168a
[36m[1mweave[0m: 🍩 https://wandb.ai/wandb/odsc-2025-agent-eval/r/call/019a27ec-8552-75b4-b7fc-1ce7f9911fb5
[36m[1mweave[0m: Evaluated 1 of 4 examples
[36m[1mweave[0m: Evaluated 2 of 4 examples
[36m[1mweave[0m: Evaluated 3 of 4 examples
[36m[1mweave[0m: Evaluated 4 of 4 examples
[36m[1mweave[0m: Evaluation summary {
[36m[1mweave[0m:   "output": {
[36m[1mweave[0m:     "execution_trace": {
[36m[1mweave[0m:       "iterations": {
[36m[1mweave[0m:         "mean": 3.0
[36m[1mweave[0m:       }
[36m[1mweave[0m:     }
[36m[1mweave[0m:   },
[36m[1mweave[0m:   "exact_match_scorer": {
[36m[1mweave[0m:     "correct": {
[36m[1mweave[0m:       "true_count": 3,
[36m[1mweave[0m:       "tru

## Testing agentic behavior

### Tool selection

In [23]:
@weave.op
def tool_selection_scorer(output: Dict[str, Any], expected_tools: List[str], forbidden_tools: Optional[List[str]] = None) -> Dict[str, Any]:
    """
    Score based on whether correct tools were used
    """
    tools_used = output.get("tools_used", [])
    forbidden_tools = forbidden_tools or []
    
    # Check if all expected tools were used
    has_required = all(tool in tools_used for tool in expected_tools)
    
    # Check if any forbidden tools were used
    has_forbidden = any(tool in tools_used for tool in forbidden_tools)
    
    correct = has_required and not has_forbidden
    
    return {
        "correct": correct,
        "score": 1.0 if correct else 0.0,
        "expected_tools": expected_tools,
        "actual_tools": tools_used,
        "has_required_tools": has_required,
        "has_forbidden_tools": has_forbidden
    }

### Trajectory efficiency

In [24]:
@weave.op
def efficiency_scorer(output: Dict[str, Any], max_iterations: int = 5, 
                     max_tool_calls: int = 5) -> Dict[str, Any]:
    """
    Score based on execution efficiency
    """
    iterations = output.get("iterations", 0)
    num_tools = output.get("num_tool_calls", 0)
    
    efficient = iterations <= max_iterations and num_tools <= max_tool_calls
    
    # Score decreases with more iterations/tools
    score = 1.0
    if iterations > max_iterations:
        score *= (max_iterations / iterations)
    if num_tools > max_tool_calls:
        score *= (max_tool_calls / num_tools)
    
    return {
        "correct": efficient,
        "score": score,
        "iterations": iterations,
        "tool_calls": num_tools,
        "efficient": efficient
    }

### Evaluation dataset

In [25]:
TOOL_SELECTION_DATASET = weave.Dataset(
    name="Tool Selection Dataset",
    rows=[
        {
            "id": "correlation_test",
            "query": "What is the correlation between total_bill and tip?",
            "expected_tools": ["calculate_correlation"],
            "forbidden_tools": ["filter_data"]
        },
        {
            "id": "statistics_test",
            "query": "Show me statistics for the tip column",
            "expected_tools": ["get_summary_statistics"],
            "forbidden_tools": ["calculate_correlation"]
        },
        {
            "id": "groupby_test",
            "query": "What's the average tip by day of week?",
            "expected_tools": ["group_and_aggregate"],
            "forbidden_tools": ["filter_data"]
        },
        {
            "id": "comparison_test",
            "query": "Compare average tips between smokers and non-smokers",
            "expected_tools": ["group_and_aggregate"],
            "forbidden_tools": []
        }
    ]
)

### Run the evaluation

In [26]:
agent = DataAnalysisAgent()
agent.predict("Load tips.csv")
    
# Create Weave evaluation
evaluation = weave.Evaluation(
    name="Tool Selection Evaluation",
    dataset=TOOL_SELECTION_DATASET,
    scorers=[
        tool_selection_scorer,
        efficiency_scorer,
    ]
)

# Run evaluation
results = await evaluation.evaluate(agent)

[36m[1mweave[0m: 📦 Published to https://wandb.ai/wandb/odsc-2025-agent-eval/weave/objects/StringPrompt/versions/F8OC8RdKNTFAetj7uJYkUJHhHXK7GSewQGrAK7Z8jeI
[36m[1mweave[0m: 🍩 https://wandb.ai/wandb/odsc-2025-agent-eval/r/call/019a27ec-cb85-7ee8-bc09-87ff4c8b142c
[36m[1mweave[0m: 🍩 https://wandb.ai/wandb/odsc-2025-agent-eval/r/call/019a27ec-df54-70be-beaf-356b2528dac8
[36m[1mweave[0m: Evaluated 1 of 4 examples
[36m[1mweave[0m: Evaluated 2 of 4 examples
[36m[1mweave[0m: Evaluated 3 of 4 examples
[36m[1mweave[0m: Evaluated 4 of 4 examples
[36m[1mweave[0m: Evaluation summary {
[36m[1mweave[0m:   "output": {
[36m[1mweave[0m:     "execution_trace": {
[36m[1mweave[0m:       "iterations": {
[36m[1mweave[0m:         "mean": 3.0
[36m[1mweave[0m:       }
[36m[1mweave[0m:     }
[36m[1mweave[0m:   },
[36m[1mweave[0m:   "tool_selection_scorer": {
[36m[1mweave[0m:     "correct": {
[36m[1mweave[0m:       "true_count": 0,
[36m[1mweave[0m:       "

## Qualitative Evaluation – LLM-as-a-judge scoring

In [27]:
@weave.op
def llm_judge_scorer(output: Dict[str, Any], query: str, ground_truth: Any = None) -> Dict[str, Any]:
    """
    Use GPT-4 as a judge to evaluate answer quality
    """
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    
    answer = output.get("answer", "")
    
    evaluation_prompt = f"""You are evaluating a data analysis agent's response.

Query: {query}

Agent's Answer: {answer}

Ground Truth Data (for reference): {ground_truth if ground_truth is not None else "Not provided"}

Evaluate the answer on these criteria (score 1-5 for each):
1. ACCURACY: Is the numerical information correct?
2. COMPLETENESS: Does it fully answer the question?
3. CLARITY: Is the explanation clear and well-structured?
4. RELEVANCE: Does it stay focused on the question?

Provide scores and brief justification in JSON format:
{{
    "accuracy": <1-5>,
    "completeness": <1-5>,
    "clarity": <1-5>,
    "relevance": <1-5>,
    "justification": "<explanation>",
    "overall_pass": <true/false>
}}
"""
    
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": evaluation_prompt}],
            response_format={"type": "json_object"}
        )
        
        evaluation = json.loads(response.choices[0].message.content)
        
        # Calculate average score (1-5 scale normalized to 0-1)
        avg_score = (
            evaluation["accuracy"] + 
            evaluation["completeness"] + 
            evaluation["clarity"] + 
            evaluation["relevance"]
        ) / 4.0 / 5.0  # Normalize to 0-1
        
        return {
            "correct": evaluation["overall_pass"],
            "score": avg_score,
            **evaluation
        }
    except Exception as e:
        return {
            "correct": False,
            "score": 0.0,
            "error": str(e)
        }

In [28]:
agent = DataAnalysisAgent()
agent.predict("Load tips.csv")

# Create Weave evaluation
evaluation = weave.Evaluation(
    name="LLM-as-Judge Evaluation",
    dataset=GROUND_TRUTH_DATASET,
    scorers=[
        llm_judge_scorer,
    ]
)

# Run evaluation
results = await evaluation.evaluate(agent)

[36m[1mweave[0m: 📦 Published to https://wandb.ai/wandb/odsc-2025-agent-eval/weave/objects/StringPrompt/versions/F8OC8RdKNTFAetj7uJYkUJHhHXK7GSewQGrAK7Z8jeI
[36m[1mweave[0m: 🍩 https://wandb.ai/wandb/odsc-2025-agent-eval/r/call/019a27ed-034c-7161-8e27-c5ecd5ff5522
[36m[1mweave[0m: 🍩 https://wandb.ai/wandb/odsc-2025-agent-eval/r/call/019a27ed-0fab-7567-823c-172029c8bfcf
[36m[1mweave[0m: Evaluated 1 of 4 examples
[36m[1mweave[0m: Evaluated 2 of 4 examples
[36m[1mweave[0m: Evaluated 3 of 4 examples
[36m[1mweave[0m: Evaluated 4 of 4 examples
[36m[1mweave[0m: Evaluation summary {
[36m[1mweave[0m:   "output": {
[36m[1mweave[0m:     "execution_trace": {
[36m[1mweave[0m:       "iterations": {
[36m[1mweave[0m:         "mean": 4.0
[36m[1mweave[0m:       }
[36m[1mweave[0m:     }
[36m[1mweave[0m:   },
[36m[1mweave[0m:   "llm_judge_scorer": {
[36m[1mweave[0m:     "correct": {
[36m[1mweave[0m:       "true_count": 2,
[36m[1mweave[0m:       "true_