# DeepEval Integration with ValidMind - Comprehensive Demo

This notebook demonstrates the complete integration between [DeepEval](https://github.com/confident-ai/deepeval) and [ValidMind](https://github.com/validmind/validmind-library) through the new `LLMAgentDataset` class.

## What You'll Learn

1. **Setup & Installation** - Getting started with both frameworks
2. **Basic Usage** - Creating and evaluating simple LLM test cases
3. **RAG Evaluation** - Testing retrieval-augmented generation systems
4. **Agent Evaluation** - Evaluating LLM agents with tool usage
5. **Golden Templates** - Working with evaluation templates
6. **Custom Metrics** - Creating domain-specific evaluation criteria
7. **ValidMind Integration** - Leveraging ValidMind's testing infrastructure
8. **Production Patterns** - Real-world usage scenarios

## Key Benefits

- **30+ Evaluation Metrics**: Use all DeepEval metrics within ValidMind
- **Multiple Evaluation Scenarios**: Evaluate Q&A, RAG, and Agent systems
- **Production Ready**: Handle real-world LLM evaluation scenarios
- **Seamless Integration**: Full compatibility with ValidMind workflows


## Installation & Setup

First, let's install the required packages and set up our environment.


In [None]:
# Install required packages (uncomment to run)
# !pip install deepeval validmind openai

# For this demo, we'll also install some additional packages for better output
# !pip install tabulate pandas numpy


In [None]:
# Core imports
import os
import pandas as pd
import warnings
from deepeval.test_case import LLMTestCase, ToolCall, LLMTestCaseParams
from deepeval.dataset import Golden
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric, HallucinationMetric, GEval
import validmind as vm
from validmind.datasets.llm import LLMAgentDataset

warnings.filterwarnings('ignore')


## Section 1: Basic Usage - Simple Q&A Evaluation

Let's start with the simplest use case: evaluating a basic question-and-answer interaction with an LLM.


In [None]:
# Step 1: Create a simple LLM test case
print("Creating a simple Q&A test case...")

simple_test_case = LLMTestCase(
    input="What is machine learning?",
    actual_output="""Machine learning is a subset of artificial intelligence (AI) that enables 
    computers to learn and make decisions from data without being explicitly programmed for every task. 
    It uses algorithms to find patterns in data and make predictions or decisions based on those patterns.""",
    expected_output="""Machine learning is a method of data analysis that automates analytical 
    model building. It uses algorithms that iteratively learn from data, allowing computers to find 
    hidden insights without being explicitly programmed where to look.""",
    context=["Machine learning is a branch of AI that focuses on algorithms that can learn from data."]
)

# Step 2: Create LLMAgentDataset from the test case
print("\nCreating ValidMind dataset...")

simple_dataset = LLMAgentDataset.from_test_cases(
    test_cases=[simple_test_case],
    input_id="simple_qa_dataset"
)

# Display the dataset
print("\nDataset preview:")
display(simple_dataset.df)


In [None]:
import validmind as vm

def agent_fn(input):
    """Stand-in predict function used to build the demo model.

    The argument is accepted but never read: every call yields the same
    constant value, so no real agent is invoked here.
    """
    placeholder_prediction = 1.23
    return placeholder_prediction

    
# Wrap the stub predict function in a ValidMind model object so it can be
# handed to dataset methods that take a model (see the assign_scores call below).
vm_model = vm.init_model(
    predict_fn=agent_fn,
    input_id="test_model",
    __log=False  # NOTE(review): presumably skips logging the model to the ValidMind platform — confirm
)







In [None]:
simple_dataset._df

In [None]:
# NOTE(review): the custom "llm.AnswerRelevancy" test is only registered in a
# later cell of this notebook; confirm that "AnswerRelevancy" resolves to an
# available metric when the cells are executed top to bottom.
simple_dataset.assign_scores(vm_model, "AnswerRelevancy")

In [None]:
simple_dataset._df.head()

In [None]:
from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase
from validmind import tags, tasks
from validmind.vm_models.dataset import VMDataset
from validmind.errors import SkipTestError
from typing import Dict, Any

# Create custom ValidMind tests for DeepEval metrics
@vm.test("llm.AnswerRelevancy")
@tags("llm", "AnswerRelevancy", "deepeval")
@tasks("llm")
def AnswerRelevancy(dataset: VMDataset, threshold: float = 0.8) -> pd.DataFrame:
    """Score every row of the dataset with DeepEval's AnswerRelevancyMetric.

    Args:
        dataset: Dataset whose ``df`` has ``input`` and ``actual_output`` columns.
        threshold: Passing threshold handed to the DeepEval metric.

    Returns:
        DataFrame with one row per evaluated test case and the columns
        ``score``, ``name`` and ``reason`` copied from DeepEval's metric data.
    """
    # The metric configuration is the same for every row, so build it once and
    # honor the caller-supplied threshold instead of a hard-coded value.
    metric = AnswerRelevancyMetric(
        threshold=threshold,
        model="gpt-4o",
        include_reason=True,
    )

    results = []
    for _, row in dataset.df.iterrows():
        llm_test_case = LLMTestCase(
            input=row["input"],
            actual_output=row["actual_output"],
        )
        # One evaluate() call per row keeps each result unambiguously tied to
        # the row that produced it.
        evaluation = evaluate(test_cases=[llm_test_case], metrics=[metric])
        metric_data = evaluation.test_results[0].metrics_data[0]
        results.append({
            "score": metric_data.score,
            "name": metric_data.name,
            "reason": metric_data.reason,
        })

    return pd.DataFrame(results)
    
    

    # # To run metric as a standalone
    # # metric.measure(test_case)
    # # print(metric.score, metric.reason)

    # result = evaluate(test_cases=[test_case], metrics=[metric])
    # # print(result, result.reason)
    # print("--------------------------------")
    # result.test_results[0].metrics_data[0].score
    # result.test_results[0].metrics_data[0].name
    # result.test_results[0].metrics_data[0].reason
    # print("--------------------------------")



In [None]:
# Run AnswerRelevancy test; the dataset is passed through `inputs`, matching
# the other run_test call in this notebook
test_results = vm.tests.run_test(
    "llm.AnswerRelevancy",
    inputs={"dataset": simple_dataset},
)


In [None]:
from math import e
from validmind import tags, tasks
from validmind.datasets.llm import LLMAgentDataset
from validmind.vm_models.dataset import VMDataset
from validmind.errors import SkipTestError
from typing import Dict, Any
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric, HallucinationMetric , ContextualRelevancyMetric

# Create custom ValidMind tests for DeepEval metrics
@vm.test("llm.Faithfulness")
@tags("llm", "faithfulness", "deepeval")
@tasks("llm")
def Faithfulness(dataset: VMDataset, threshold: float = 0.8) -> list:
    """
    Evaluates the faithfulness of LLM responses using DeepEval's FaithfulnessMetric.

    Args:
        dataset: LLMAgentDataset whose ``df`` provides ``input`` and
            ``actual_output`` columns and, optionally, ``retrieval_context``
        threshold: Passing threshold handed to the metric (default: 0.8)

    Returns:
        List with one entry per dataset row: the value returned by
        ``metric.measure`` for that row's test case

    Raises:
        SkipTestError: If ``dataset`` is not an LLMAgentDataset
    """
    if not isinstance(dataset, LLMAgentDataset):
        raise SkipTestError("Dataset must be an LLMAgentDataset")

    # Use the metric this test is named after, built once for all rows, and
    # honor the caller-supplied threshold.
    metric = FaithfulnessMetric(threshold=threshold, model="gpt-4o")

    results = []
    for _, row in dataset.df.iterrows():
        raw_context = row.get("retrieval_context")
        # A missing cell can surface as None or as float NaN; list(NaN) would
        # raise TypeError, so both are normalised to None.
        context_missing = raw_context is None or (
            isinstance(raw_context, float) and pd.isna(raw_context)
        )
        llm_test_case = LLMTestCase(
            input=row["input"],
            actual_output=row["actual_output"],
            retrieval_context=None if context_missing else list(raw_context),
        )
        results.append(metric.measure(llm_test_case))

    return results

# @vm.test("llm.Hallucination")
# @tags("llm", "hallucination", "deepeval") 
# @tasks("llm")
# def Hallucination(dataset: VMDataset, threshold: float = 0.8) -> Dict[str, Any]:
#     """
#     Evaluates hallucination in LLM responses using DeepEval's HallucinationMetric.
    
#     Args:
#         dataset: VMDataset containing LLM inputs and outputs
#         threshold: Minimum score threshold (default: 0.8)
            
#     Returns:
#         Dictionary containing metric results and visualization
#     """
#     if not isinstance(dataset, LLMAgentDataset):
#         raise SkipTestError("Dataset must be an LLMAgentDataset")
        
#     metric = HallucinationMetric(threshold=threshold)
#     results = dataset.evaluate_with_deepeval(
#         metrics=[metric],
#         hyperparameters={
#             "model": "gpt-4", 
#             "prompt_template": "Evaluate hallucination: {{input}}"
#         }
#     )
    
#     return {
#         "metric_name": "Hallucination",
#         "score": results["hallucination_score"],
#         "passed": results["hallucination_score"] >= threshold,
#         "threshold": threshold
#     }

# # Create custom ValidMind tests for DeepEval metrics
# @vm.test("llm.AnswerRelevancy")
# @tags("llm", "answer_relevancy", "deepeval")
# @tasks("llm")
# def AnswerRelevancy(dataset: VMDataset, threshold = 0.7) -> Dict[str, Any]:
#     """
#     Evaluates the relevancy of LLM responses using DeepEval's AnswerRelevancyMetric.
    
#     Args:
#         dataset: VMDataset containing LLM inputs and outputs
#         params: Dictionary containing metric parameters
#             - threshold: Minimum score threshold (default: 0.7)
            
#     Returns:
#         Dictionary containing metric results and visualization
#     """
#     if not isinstance(dataset, LLMAgentDataset):
#         raise SkipTestError("Dataset must be an LLMAgentDataset")
        
#     metric = AnswerRelevancyMetric(threshold=threshold)
#     results = dataset.evaluate_with_deepeval(
#         metrics=[metric],
#         hyperparameters={
#             "model": "gpt-4",
#             "evaluation_type": "basic_qa",
#             "prompt_template": "Evaluate answer relevancy: {{input}}"
#         }
#     )
    
#     return {
#         "metric_name": "Answer Relevancy",
#         "score": results["answer_relevancy_score"],
#         "passed": results["answer_relevancy_score"] >= threshold,
#         "threshold": threshold
#     }

# @vm.test("llm.Faithfulness") 
# @tags("llm", "faithfulness", "deepeval")
# @tasks("llm")
# def Faithfulness(dataset: VMDataset, params: Dict[str, Any] = {"threshold": 0.8}) -> Dict[str, Any]:
#     """
#     Evaluates the faithfulness of LLM responses using DeepEval's FaithfulnessMetric.
    
#     Args:
#         dataset: VMDataset containing LLM inputs and outputs
#         params: Dictionary containing metric parameters
#             - threshold: Minimum score threshold (default: 0.8)
            
#     Returns:
#         Dictionary containing metric results and visualization
#     """
#     if not isinstance(dataset, LLMAgentDataset):
#         raise SkipTestError("Dataset must be an LLMAgentDataset")
        
#     metric = FaithfulnessMetric(threshold=params["threshold"])
#     results = dataset.evaluate_with_deepeval(
#         metrics=[metric],
#         hyperparameters={
#             "model": "gpt-4",
#             "prompt_template": "Evaluate faithfulness: {{input}}"
#         }
#     )
    
#     return {
#         "metric_name": "Faithfulness",
#         "score": results["faithfulness_score"],
#         "passed": results["faithfulness_score"] >= params["threshold"],
#         "threshold": params["threshold"]
#     }

# @vm.test("llm.Hallucination")
# @tags("llm", "hallucination", "deepeval") 
# @tasks("llm")
# def Hallucination(dataset: VMDataset, params: Dict[str, Any] = {"threshold": 0.3}) -> Dict[str, Any]:
#     """
#     Evaluates hallucination in LLM responses using DeepEval's HallucinationMetric.
    
#     Args:
#         dataset: VMDataset containing LLM inputs and outputs
#         params: Dictionary containing metric parameters
#             - threshold: Maximum hallucination score threshold (default: 0.3)
            
#     Returns:
#         Dictionary containing metric results and visualization
#     """
#     if not isinstance(dataset, LLMAgentDataset):
#         raise SkipTestError("Dataset must be an LLMAgentDataset")
        
#     metric = HallucinationMetric(threshold=params["threshold"])
#     results = dataset.evaluate_with_deepeval(
#         metrics=[metric],
#         hyperparameters={
#             "model": "gpt-4",
#             "prompt_template": "Evaluate hallucination: {{input}}"
#         }
#     )
    
#     return {
#         "metric_name": "Hallucination",
#         "score": results["hallucination_score"], 
#         "passed": results["hallucination_score"] <= params["threshold"],
#         "threshold": params["threshold"]
#     }


In [None]:
# Run the Faithfulness test
# NOTE(review): the test reads a `retrieval_context` column, but the single
# test case behind simple_dataset only sets `context` — confirm this dataset
# is the intended input for a retrieval-based metric.
print("Running Faithfulness test...")
faithfulness_result = vm.tests.run_test(
    "llm.Faithfulness",
    inputs={"dataset": simple_dataset},
    params={
        "threshold": 0.8,  # forwarded to the test function's `threshold` parameter
    }
)
print(f"Faithfulness test result: {faithfulness_result}")



In [None]:
# Step 3: Evaluate with DeepEval metrics
print("Setting up evaluation metrics...")

# Note: These metrics require an OpenAI API key to work
# For demonstration, we'll show the setup even if we can't run them

basic_metrics = [
    AnswerRelevancyMetric(threshold=0.7),
    FaithfulnessMetric(threshold=0.8),
    HallucinationMetric(threshold=0.3)  # Lower = less hallucination allowed
]

print("Metrics configured:")
for metric in basic_metrics:
    print(f"  - {metric.__class__.__name__}: threshold {getattr(metric, 'threshold', 'N/A')}")

# Check if we can run evaluation (requires API key).
# Test for a non-blank value rather than `is not None`: a variable that is set
# but empty is just as unusable as one that is missing.
api_key_available = bool((os.getenv("OPENAI_API_KEY") or "").strip())

if api_key_available:
    print("\nRunning evaluation...")
    try:
        results = simple_dataset.evaluate_with_deepeval(
            metrics=basic_metrics,
            hyperparameters={
                "model": "gpt-4",
                "evaluation_type": "basic_qa",
                "dataset_size": len(simple_dataset.test_cases)
            }
        )
        print("Evaluation completed!")
        print(f"Results: {results}")
    except Exception as exc:
        # Broad catch kept on purpose: this is a demo cell and the rest of the
        # notebook should stay runnable if the remote evaluation fails.
        print(f"Evaluation failed: {exc}")
else:
    print("\nWARNING: OpenAI API key not found - skipping evaluation")
    print("To run evaluation, set: os.environ['OPENAI_API_KEY'] = 'your-key'")
    print("For now, we'll demonstrate the evaluation setup")


## Section 2: RAG System Evaluation

Now let's evaluate a more complex use case: a Retrieval-Augmented Generation (RAG) system that retrieves relevant documents and generates responses based on them.


In [None]:
# Create multiple RAG test cases
print("Creating RAG evaluation test cases...")

rag_test_cases = [
    LLMTestCase(
        input="How do I return a product that doesn't fit?",
        actual_output="""You can return any product within 30 days of purchase for a full refund. 
        Simply visit our returns page on the website and follow the step-by-step instructions. 
        You'll need your order number and email address. No questions asked!""",
        expected_output="We offer a 30-day return policy for full refunds. Visit our returns page to start the process.",
        context=["Company policy allows 30-day returns for full refund with no restocking fees."],
        retrieval_context=[
            "Return Policy: All items can be returned within 30 days of purchase for a full refund.",
            "Return Process: Visit our website's returns page and enter your order details.",
            "Customer Service: Available 24/7 to help with returns and refunds.",
            "No restocking fees apply to returns within the 30-day window."
        ]
    ),
    LLMTestCase(
        input="What are your shipping options and costs?",
        actual_output="""We offer three shipping options: Standard (5-7 days, $5.99), 
        Express (2-3 days, $9.99), and Overnight (next day, $19.99). 
        Free shipping is available on orders over $50 with Standard delivery.""",
        expected_output="Multiple shipping options available with costs ranging from $5.99 to $19.99. Free shipping on orders over $50.",
        context=["Shipping information includes various speed and cost options."],
        retrieval_context=[
            "Standard Shipping: 5-7 business days, $5.99",
            "Express Shipping: 2-3 business days, $9.99", 
            "Overnight Shipping: Next business day, $19.99",
            "Free Standard Shipping on orders over $50"
        ]
    ),
    LLMTestCase(
        input="Do you have a warranty on electronics?",
        actual_output="""Yes, all electronics come with a manufacturer's warranty. 
        Most items have a 1-year warranty, while premium products may have 2-3 years. 
        We also offer extended warranty options for purchase.""",
        expected_output="Electronics include manufacturer warranty, typically 1-year, with extended options available.",
        context=["Electronics warranty information varies by product type and manufacturer."],
        retrieval_context=[
            "Electronics Warranty: Manufacturer warranty included with all electronic items",
            "Standard Coverage: 1 year for most electronics",
            "Premium Products: May include 2-3 year coverage",
            "Extended Warranty: Available for purchase at checkout"
        ]
    )
]

print(f"Created {len(rag_test_cases)} RAG test cases")

# Create RAG dataset
rag_dataset = LLMAgentDataset.from_test_cases(
    test_cases=rag_test_cases,
    input_id="rag_evaluation_dataset"
)

print(f"RAG Dataset: {rag_dataset}")
print(f"Shape: {rag_dataset.df.shape}")

# Show dataset structure
print("\nRAG Dataset Preview:")
display(rag_dataset.df[['input', 'actual_output', 'context', 'retrieval_context']].head())


## Section 3: LLM Agent Evaluation

Let's evaluate LLM agents that can use tools to accomplish tasks. This is one of the most advanced evaluation scenarios.


In [None]:
# Create LLM Agent test cases with tool usage
print("Creating Agent evaluation test cases...")

agent_test_cases = [
    LLMTestCase(
        input="What's the weather like in New York City today?",
        actual_output="""Based on current weather data, New York City is experiencing partly cloudy skies 
        with a temperature of 72°F (22°C). The humidity is at 60% and there's a light breeze from the west at 8 mph. 
        No precipitation is expected today.""",
        expected_output="Current weather in New York shows mild temperatures with partly cloudy conditions.",
        tools_called=[
            ToolCall(
                name="WeatherAPI",
                description="Fetches current weather information for a specified location",
                input_parameters={"city": "New York City", "units": "fahrenheit", "include_forecast": False},
                output={
                    "temperature": 72,
                    "condition": "partly_cloudy", 
                    "humidity": 60,
                    "wind_speed": 8,
                    "wind_direction": "west"
                },
                reasoning="User asked for current weather in NYC, so I need to call the weather API"
            )
        ],
        expected_tools=[
            ToolCall(
                name="WeatherAPI",
                description="Should fetch weather information for New York City",
                input_parameters={"city": "New York City"}
            )
        ]
    ),
    LLMTestCase(
        input="Calculate the compound interest on $1000 invested at 5% annually for 3 years",
        actual_output="""Using the compound interest formula A = P(1 + r)^t, where P = $1000, r = 0.05, and t = 3:
        A = 1000(1 + 0.05)^3 = 1000(1.05)^3 = 1000 × 1.157625 = $1,157.63
        The investment will grow to $1,157.63, earning $157.63 in compound interest.""",
        expected_output="The compound interest calculation shows the investment grows to approximately $1,157.63.",
        tools_called=[
            ToolCall(
                name="Calculator",
                description="Performs mathematical calculations",
                input_parameters={"expression": "1000 * (1.05 ** 3)", "precision": 2},
                output=1157.63,
                reasoning="Need to calculate compound interest using the standard formula"
            )
        ],
                 expected_tools=[
             ToolCall(
                 name="Calculator", 
                 description="Should perform compound interest calculation",
                 input_parameters={"calculation_type": "compound_interest"}
             )
         ]
    ),
    LLMTestCase(
        input="Send an email to john@example.com about our meeting tomorrow at 2 PM",
        actual_output="""I've sent an email to john@example.com with the subject "Meeting Reminder - Tomorrow at 2 PM". 
        The email confirms your meeting scheduled for tomorrow at 2:00 PM and asks John to confirm his attendance. 
        The email was successfully delivered to the recipient.""",
        expected_output="Email sent successfully to john@example.com about the 2 PM meeting tomorrow.",
        tools_called=[
            ToolCall(
                name="EmailSender",
                description="Sends emails to specified recipients",
                input_parameters={
                    "to": "john@example.com",
                    "subject": "Meeting Reminder - Tomorrow at 2 PM", 
                    "body": "Hi John,\n\nThis is a reminder about our meeting scheduled for tomorrow at 2:00 PM. Please confirm your attendance.\n\nBest regards"
                },
                output={"status": "sent", "message_id": "msg_12345", "timestamp": "2024-01-15T10:30:00Z"},
                reasoning="User requested to send email, so I need to use the email tool with appropriate content"
            )
        ],
                 expected_tools=[
             ToolCall(
                 name="EmailSender",
                 description="Should send an email about the meeting",
                 input_parameters={"recipient": "john@example.com"}
             )
         ]
    )
]

print(f"Created {len(agent_test_cases)} Agent test cases")

# Create Agent dataset
agent_dataset = LLMAgentDataset.from_test_cases(
    test_cases=agent_test_cases,
    input_id="agent_evaluation_dataset"
)

print(f"Agent Dataset: {agent_dataset}")
print(f"Shape: {agent_dataset.df.shape}")

# Analyze tool usage: count how often each tool name appears across all agent
# test cases. Counter is a dict subclass that keeps first-seen order, so the
# report lists tools in the order they were first called.
from collections import Counter

tool_usage = Counter(
    tool.name
    for case in agent_test_cases
    for tool in (case.tools_called or [])  # a case may have no tool calls
)

print("\nTool Usage Analysis:")
for tool, count in tool_usage.items():
    print(f"  - {tool}: {count} times")

print("\nAgent Dataset Preview:")
display(agent_dataset.df[['input', 'actual_output', 'tools_called']].head())


## Section 4: Working with Golden Templates

Golden templates are a powerful feature of DeepEval that allow you to define test inputs and expected outputs, then generate actual outputs at evaluation time.


In [None]:
# Create Golden templates
print("Creating Golden templates...")

goldens = [
    Golden(
        input="Explain the concept of neural networks in simple terms",
        expected_output="Neural networks are computing systems inspired by biological neural networks that constitute animal brains.",
        context=["Neural networks are a key component of machine learning and artificial intelligence."]
    ),
    Golden(
        input="What are the main benefits of cloud computing for businesses?", 
        expected_output="Cloud computing offers scalability, cost-effectiveness, accessibility, and reduced infrastructure maintenance.",
        context=["Cloud computing provides on-demand access to computing resources over the internet."]
    ),
    Golden(
        input="How does password encryption protect user data?",
        expected_output="Password encryption converts passwords into unreadable formats using cryptographic algorithms, protecting against unauthorized access.",
        context=["Encryption is a fundamental security technique used to protect sensitive information."]
    ),
    Golden(
        input="What is the difference between machine learning and deep learning?",
        expected_output="Machine learning is a broad field of AI, while deep learning is a subset that uses neural networks with multiple layers.",
        context=["Both are important areas of artificial intelligence with different approaches and applications."]
    )
]

print(f"Created {len(goldens)} Golden templates")

# Create dataset from goldens
golden_dataset = LLMAgentDataset.from_goldens(
    goldens=goldens,
    input_id="golden_templates_dataset"
)

print(f"Golden Dataset: {golden_dataset}")
print(f"Shape: {golden_dataset.df.shape}")

print("\nGolden Templates Preview:")
display(golden_dataset.df[['input', 'expected_output', 'context', 'type']].head())

# Mock LLM application function for demonstration
def mock_llm_application(input_text: str) -> str:
    """
    Return a canned answer for a prompt, standing in for a real LLM call.

    The prompt is lower-cased and checked against each known topic keyword in
    turn; the first keyword found selects the reply, which is returned with
    surrounding whitespace stripped. A prompt that mentions no known topic
    gets a generic acknowledgement that echoes the prompt back.
    """

    canned_answers = {
        "neural networks": """Neural networks are computational models inspired by the human brain. 
        They consist of interconnected nodes (neurons) that process information by learning patterns from data. 
        These networks can recognize complex patterns and make predictions, making them useful for tasks like 
        image recognition, natural language processing, and decision-making.""",

        "cloud computing": """Cloud computing provides businesses with flexible, scalable access to computing resources 
        over the internet. Key benefits include reduced upfront costs, automatic scaling based on demand, 
        improved collaboration through shared access, enhanced security through professional data centers, 
        and reduced need for internal IT maintenance.""",

        "password encryption": """Password encryption protects user data by converting passwords into complex, 
        unreadable strings using mathematical algorithms. When you enter your password, it's immediately encrypted 
        before storage or transmission. Even if data is intercepted, the encrypted password appears as random characters, 
        making it virtually impossible for attackers to determine the original password.""",

        "machine learning": """Machine learning is a broad approach to artificial intelligence where computers learn 
        to make predictions or decisions by finding patterns in data. Deep learning is a specialized subset that uses 
        artificial neural networks with multiple layers (hence 'deep') to process information in ways that mimic 
        human brain function, enabling more sophisticated pattern recognition and decision-making."""
    }

    # Case-insensitive substring match; dictionary order decides which topic
    # wins when a prompt mentions more than one.
    normalized_prompt = input_text.lower()
    first_match = next(
        (answer for topic, answer in canned_answers.items() if topic in normalized_prompt),
        None,
    )
    if first_match is not None:
        return first_match.strip()

    return f"Thank you for your question about: {input_text}. I'd be happy to provide a comprehensive answer based on current knowledge and best practices."

print(f"\nMock LLM application ready - will generate responses for {len(goldens)} templates")


In [None]:
# Convert goldens to test cases by generating actual outputs
print("Converting Golden templates to test cases...")

print("Before conversion:")
print(f"  - Test cases: {len(golden_dataset.test_cases)}")
print(f"  - Goldens: {len(golden_dataset.goldens)}")

# Convert goldens to test cases using our mock LLM
golden_dataset.convert_goldens_to_test_cases(mock_llm_application)

print("\nAfter conversion:")
print(f"  - Test cases: {len(golden_dataset.test_cases)}")
print(f"  - Goldens: {len(golden_dataset.goldens)}")

print("\nConversion completed!")

# Show the updated dataset
print("\nUpdated Dataset with Generated Outputs:")
dataset_df = golden_dataset.df
# Keep only rows that have a non-null, non-empty actual output
mask = pd.notna(dataset_df['actual_output']) & (dataset_df['actual_output'] != '')
converted_df = dataset_df[mask]

if converted_df.empty:
    print("No converted test cases found")
else:
    display(converted_df[['input', 'actual_output', 'expected_output']])

    # Character length of each output (values are stringified first)
    actual_lengths = pd.Series([len(str(x)) for x in converted_df['actual_output']])
    expected_lengths = pd.Series([len(str(x)) for x in converted_df['expected_output']])

    # The statistics live in this branch because the length series only exist
    # when there are converted rows; printing them unconditionally raised
    # NameError on an empty result.
    print("\nOutput Analysis:")
    print(f"Average actual output length: {actual_lengths.mean():.0f} characters")
    print(f"Average expected output length: {expected_lengths.mean():.0f} characters")
    print(f"Ratio (actual/expected): {(actual_lengths.mean() / expected_lengths.mean()):.2f}x")


## Section 5: ValidMind Integration

Now let's demonstrate how to integrate our LLMAgentDataset with ValidMind's testing framework.


In [None]:
# Initialize ValidMind
print("Integrating with ValidMind framework...")

# Imported here so the inheritance check below stands on its own, without
# relying on an earlier cell having imported it
from validmind.vm_models.dataset import VMDataset

try:
    # Initialize ValidMind
    vm.init()
    print("ValidMind initialized")

    # Register our datasets with ValidMind
    datasets_to_register = [
        (simple_dataset, "simple_qa_dataset"),
        (rag_dataset, "rag_evaluation_dataset"),
        (agent_dataset, "agent_evaluation_dataset"),
        (golden_dataset, "golden_templates_dataset")
    ]

    # Track failures so the summary reports what actually happened instead of
    # unconditionally claiming success
    failed_ids = []
    for dataset, dataset_id in datasets_to_register:
        try:
            vm.init_dataset(
                dataset=dataset.df,
                input_id=dataset_id,
                text_column="input",
                target_column="expected_output"
            )
        except Exception as exc:
            # Best-effort: one failed registration must not stop the others
            failed_ids.append(dataset_id)
            print(f"WARNING: Failed to register {dataset_id}: {exc}")
        else:
            print(f"Registered: {dataset_id}")

    registered_count = len(datasets_to_register) - len(failed_ids)
    print("\nValidMind Integration Complete:")
    print(f"  - Datasets registered: {registered_count}/{len(datasets_to_register)}")
    if failed_ids:
        print(f"  - Not registered: {', '.join(failed_ids)}")
    if registered_count:
        print("  - Ready for use in ValidMind test suites")
        print("  - Can be referenced by their input_id in test configurations")

except Exception as exc:
    print(f"ERROR: ValidMind integration failed: {exc}")
    print("Note: Some ValidMind features may require additional setup")

# Demonstrate dataset compatibility
print("\nDataset Compatibility Check:")
demo_datasets = [
    (simple_dataset, "Simple Q&A"),
    (rag_dataset, "RAG"),
    (agent_dataset, "Agent"),
    (golden_dataset, "Golden"),
]

# Verify inheritance with isinstance rather than printing a fixed SUCCESS or
# inferring it from the presence of a `df` attribute
all_inherit = all(isinstance(ds, VMDataset) for ds, _ in demo_datasets)
print(f"All datasets inherit from VMDataset: {'SUCCESS' if all_inherit else 'FAILED'}")

for dataset, name in demo_datasets:
    print(f"\n{name} Dataset:")
    print(f"  - Type: {type(dataset).__name__}")
    print(f"  - Inherits VMDataset: {isinstance(dataset, VMDataset)}")
    print(f"  - Has text_column: {hasattr(dataset, 'text_column')}")
    print(f"  - Has target_column: {hasattr(dataset, 'target_column')}")
    print(f"  - DataFrame shape: {dataset.df.shape}")
    print(f"  - Columns: {len(dataset.columns)}")


## Section 6: Custom Metrics with G-Eval

One of DeepEval's most powerful features is the ability to create custom evaluation metrics using G-Eval (Generative Evaluation).


In [None]:
# Create custom evaluation metrics using G-Eval
print("Creating custom evaluation metrics...")

# Custom metric 1: Technical Accuracy
technical_accuracy_metric = GEval(
    name="Technical Accuracy",
    criteria="""Evaluate whether the response is technically accurate and uses appropriate 
    terminology for the domain. Consider if the explanations are scientifically sound 
    and if technical concepts are explained correctly.""",
    evaluation_params=[
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.CONTEXT
    ],
    threshold=0.8
)

# Custom metric 2: Clarity and Comprehensiveness  
clarity_metric = GEval(
    name="Clarity and Comprehensiveness",
    criteria="""Assess whether the response is clear, well-structured, and comprehensive. 
    The response should be easy to understand, logically organized, and address all 
    aspects of the user's question without being overly verbose.""",
    evaluation_params=[
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT
    ],
    threshold=0.75
)

# Custom metric 3: Business Context Appropriateness
# The expected output is supplied alongside the input and the actual output.
business_context_metric = GEval(
    name="Business Context Appropriateness",
    threshold=0.7,
    evaluation_params=[
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.EXPECTED_OUTPUT,
    ],
    criteria="""Evaluate whether the response is appropriate for a business context. 
    Consider if the tone is professional, if the content is relevant to business needs, 
    and if it provides actionable information that would be valuable to a business user.""",
)

# Custom metric 4: Tool Usage Appropriateness (for agents)
# TOOLS_CALLED is part of the evaluation params so the judge can see which
# tools the agent invoked; with only INPUT and ACTUAL_OUTPUT it would have to
# infer tool usage from the final answer text alone.
# NOTE(review): test cases scored with this metric are expected to populate
# `tools_called` — confirm the agent test cases set it before evaluating.
tool_usage_metric = GEval(
    name="Tool Usage Appropriateness",
    criteria="""Evaluate whether the agent used appropriate tools for the given task. 
    Consider if the tools were necessary, if they were used correctly, and if the 
    agent's reasoning for tool selection was sound.""",
    evaluation_params=[
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.TOOLS_CALLED
    ],
    threshold=0.8
)

# Gather all four custom metrics in definition order
custom_metrics = [technical_accuracy_metric, clarity_metric, business_context_metric, tool_usage_metric]

# List each metric's name together with its threshold
print("Custom metrics created:")
for custom_metric in custom_metrics:
    label, cutoff = custom_metric.name, custom_metric.threshold
    print(f"  - {label}: threshold {cutoff}")

# Show which custom metric is intended for which demo dataset
print("\nMetric-Dataset Matching:")
metric_dataset_pairs = [
    ("Technical Accuracy", "golden_templates_dataset (tech questions)"),
    ("Clarity and Comprehensiveness", "simple_qa_dataset (general Q&A)"),
    ("Business Context Appropriateness", "rag_evaluation_dataset (business support)"),
    ("Tool Usage Appropriateness", "agent_evaluation_dataset (agent actions)"),
]

for pairing in metric_dataset_pairs:
    print("  - {} → {}".format(*pairing))

# Nothing is evaluated here; the lines below only describe what a real run would cover
print(
    "\nEvaluation Setup (Demo Mode):",
    "Note: Actual evaluation requires OpenAI API key",
    "These metrics would evaluate:",
    "  - Technical accuracy of AI/ML explanations",
    "  - Clarity of business support responses",
    "  - Appropriateness of agent tool usage",
    "  - Overall comprehensiveness across all domains",
    sep="\n",
)


## Section 7: Best Practices & Production Patterns

Let's wrap up with some best practices and real-world usage patterns for production systems.


In [None]:
# Walk through best practices and production patterns
print("Production Best Practices Summary")

# 1. Dataset Organization
print("\n1. Dataset Organization by Use Case:")
all_test_cases = simple_dataset.test_cases + rag_test_cases + agent_test_cases + golden_dataset.test_cases

# Each test case lands in exactly one bucket. The checks run in priority order:
# retrieval context first, then tool calls, then a keyword match on the input.
technical_keywords = ('neural', 'machine learning', 'encryption', 'cloud')
categorized_cases = {
    label: []
    for label in ("Simple Q&A", "RAG Systems", "Agent Systems", "Technical Content")
}

for test_case in all_test_cases:
    if getattr(test_case, 'retrieval_context', None):
        bucket = "RAG Systems"
    elif getattr(test_case, 'tools_called', None):
        bucket = "Agent Systems"
    else:
        lowered_input = test_case.input.lower()
        if any(keyword in lowered_input for keyword in technical_keywords):
            bucket = "Technical Content"
        else:
            bucket = "Simple Q&A"
    categorized_cases[bucket].append(test_case)

# Report how many test cases ended up in each bucket
for label, members in categorized_cases.items():
    print(f"  - {label}: {len(members)} test cases")

# 2. Metric Selection Strategy
print("\n2. Metric Selection Strategy:")

# Suggested metric names per use case (plain strings for display only)
metric_recommendations = {
    "Simple Q&A": [
        "AnswerRelevancyMetric",
        "GEval(Correctness)",
        "HallucinationMetric",
    ],
    "RAG Systems": [
        "FaithfulnessMetric",
        "ContextualRelevancyMetric",
        "AnswerRelevancyMetric",
    ],
    "Agent Systems": [
        "ToolCorrectnessMetric",
        "TaskCompletionMetric",
        "GEval(Tool Usage)",
    ],
    "Technical Content": [
        "GEval(Technical Accuracy)",
        "GEval(Clarity)",
        "BiasMetric",
    ],
}

# One heading line per use case, followed by one bullet line per suggested metric
for scenario, suggested in metric_recommendations.items():
    print(f"  - {scenario}:", *(f"    • {metric_name}" for metric_name in suggested), sep="\n")

# 3. Evaluation Frequency
print("\n3. Evaluation Frequency Recommendations:")

# Lifecycle stage -> how often evaluation is recommended at that stage
evaluation_schedule = dict(
    [
        ("Development", "Every code commit"),
        ("Staging", "Before each deployment"),
        ("Production", "Daily monitoring"),
        ("Model Updates", "Before and after model changes"),
        ("Dataset Updates", "When new training data is added"),
    ]
)

for lifecycle_stage in evaluation_schedule:
    cadence = evaluation_schedule[lifecycle_stage]
    print("  - {}: {}".format(lifecycle_stage, cadence))

# 4. Production Integration Example
print("\n4. Production Integration Pattern:")
# The snippet below lives in a string literal and is only printed for the
# reader; nothing inside it is executed by this cell. It sketches turning
# logged interactions into LLMTestCase objects, wrapping them in an
# LLMAgentDataset, and evaluating them with a fixed list of DeepEval metrics.
production_example = '''
# Example production integration
def evaluate_llm_system(production_logs, model_version):
    # Convert logs to test cases
    test_cases = []
    for log in production_logs:
        test_case = LLMTestCase(
            input=log['user_query'],
            actual_output=log['llm_response'],
            context=log.get('context', []),
            retrieval_context=log.get('retrieved_docs', [])
        )
        test_cases.append(test_case)
    
    # Create dataset
    dataset = LLMAgentDataset.from_test_cases(
        test_cases=test_cases,
        input_id=f"production_eval_{model_version}"
    )
    
    # Run evaluation
    metrics = [
        AnswerRelevancyMetric(threshold=0.8),
        FaithfulnessMetric(threshold=0.85),
        HallucinationMetric(threshold=0.2)
    ]
    
    results = dataset.evaluate_with_deepeval(
        metrics=metrics,
        hyperparameters={"model_version": model_version}
    )
    
    return results
'''

# Display the example verbatim
print(production_example)

# 5. Performance Optimization
print("\n5. Performance Optimization Tips:")

# Tips are shown as a numbered list, starting at 1
optimization_tips = [
    "Use batch evaluation for multiple test cases",
    "Cache evaluation results to avoid re-computation",
    "Run evaluations async when possible",
    "Set appropriate thresholds based on use case requirements",
    "Monitor evaluation costs and optimize API usage",
    "Use sampling for large datasets in development",
]

for position, advice in enumerate(optimization_tips, start=1):
    print("  {}. {}".format(position, advice))

# 6. Quality Assurance
print("\n6. Quality Assurance Guidelines:")

# Guidelines are printed as a numbered list, starting at 1
qa_guidelines = [
    "Maintain diverse test cases covering edge cases",
    "Regular review and update of evaluation criteria",
    "Track metric trends over time",
    "Set up alerts for significant performance drops",
    "Include human evaluation for critical use cases",
    "Document evaluation methodology and threshold rationale",
]

for ordinal, rule in enumerate(qa_guidelines, start=1):
    print(f"  {ordinal}. {rule}")

# Closing summary: only the test-case and metric counts are computed;
# the remaining lines are fixed text.
print("\nCurrent Demo Summary:")
print("  - Total test cases created: " + str(len(all_test_cases)))
print("  - Datasets created: 4")
print("  - Custom metrics defined: " + str(len(custom_metrics)))
print("  - ValidMind integration: SUCCESS")
print("  - Production patterns: SUCCESS")
