# AI Agent Evaluation Analysis Dashboard

This notebook provides analysis tools for the monitoring and evaluation framework implemented for the AI agent project. It helps you visualize metrics, analyze performance, and compare different agent configurations.

## Features
- Load and process evaluation results from the evaluation framework
- Visualize key metrics for agent performance
- Analyze user feedback and satisfaction
- Compare different agent configurations (A/B testing)
- Generate insights and recommendations

## Setup and Import Libraries

First, let's install the necessary packages and import libraries for data analysis and visualization.

In [None]:
# Install required packages (if not already installed)
!pip install matplotlib seaborn pandas numpy scikit-learn plotly tqdm

In [None]:
# Import necessary libraries
import os
import json
import glob
from pathlib import Path
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.metrics import confusion_matrix

# Set plotting style: seaborn's 'whitegrid' style and a 14x8 default size
# for matplotlib figures created in this notebook.
sns.set(style='whitegrid')
plt.rcParams['figure.figsize'] = (14, 8)

# Ensure plots are displayed inline in the notebook
%matplotlib inline

## 1. Load Evaluation Data

Let's load the evaluation results from the `evaluation_results` directory, together with the user feedback stored in `feedback_data.json`.

In [None]:
# Locations of the evaluation artifacts, relative to the current working directory
EVAL_RESULTS_PATH = Path('..') / 'evals' / 'evaluation_results'
FEEDBACK_PATH = Path('..') / 'evals' / 'feedback_data.json'

def load_latest_evaluation():
    """Return the parsed JSON of the most recent evaluation result.

    ``latest.json`` inside ``EVAL_RESULTS_PATH`` is used when it exists;
    otherwise the ``*.json`` file with the newest modification time is read.
    Returns ``None`` when no JSON file is found or when any error occurs
    (the error is printed rather than raised).
    """
    try:
        chosen = EVAL_RESULTS_PATH / 'latest.json'
        if not chosen.exists():
            # No explicit "latest" file: fall back to the newest file on disk.
            candidates = list(EVAL_RESULTS_PATH.glob('*.json'))
            if not candidates:
                return None
            chosen = max(candidates, key=lambda path: path.stat().st_mtime)

        with open(chosen, 'r') as handle:
            return json.load(handle)
    except Exception as e:
        print(f"Error loading evaluation results: {e}")
        return None

def load_all_evaluations(results_path=None):
    """Load all evaluation results as a list, oldest first.

    Args:
        results_path: Directory to scan for ``*.json`` files. When omitted,
            the notebook-level ``EVAL_RESULTS_PATH`` is used.

    Returns:
        A list of the parsed JSON objects. Each one is annotated with
        ``_filename`` (the file name) and ``_date`` (the file's modification
        time as a ``datetime``) and the list is sorted by ``_date``.
        ``latest.json`` is skipped. Files that cannot be read or parsed, or
        whose top-level JSON value is not an object, are reported and
        skipped so that one bad file does not discard the other results.
    """
    base_dir = EVAL_RESULTS_PATH if results_path is None else Path(results_path)

    try:
        # Sort by name first so files with identical mtimes keep a stable order.
        file_paths = sorted(base_dir.glob('*.json'))
    except OSError as e:
        print(f"Error loading evaluation results: {e}")
        return []

    results = []
    for file_path in file_paths:
        # Skip the latest.json symlink/copy
        if file_path.name == 'latest.json':
            continue

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            modified = datetime.fromtimestamp(file_path.stat().st_mtime)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Error loading evaluation results from {file_path.name}: {e}")
            continue

        if not isinstance(data, dict):
            print(f"Skipping {file_path.name}: expected a JSON object at the top level.")
            continue

        # Add filename/date information
        data['_filename'] = file_path.name
        data['_date'] = modified
        results.append(data)

    results.sort(key=lambda item: item['_date'])
    return results

def load_user_feedback():
    """Return the parsed contents of ``FEEDBACK_PATH``.

    Falls back to ``{"query_feedback": {}}`` when the file does not exist or
    when reading/parsing fails (the error is printed, not raised).
    """
    fallback = {"query_feedback": {}}
    try:
        if not FEEDBACK_PATH.exists():
            return fallback
        with open(FEEDBACK_PATH, 'r') as source:
            return json.load(source)
    except Exception as e:
        print(f"Error loading feedback data: {e}")
        return fallback

# Pull everything into memory once; the later cells reuse these three objects.
latest_eval = load_latest_evaluation()
all_evals = load_all_evaluations()
feedback_data = load_user_feedback()

# Quick sanity summary of what was found on disk
print(f"Number of evaluation results found: {len(all_evals)}")
print(
    f"Latest evaluation has {len(latest_eval.get('metrics', {}))} metrics"
    if latest_eval
    else "No evaluation results found. Generate some data first using enhanced_evaluation.py"
)
print(f"Number of user feedback entries: {len(feedback_data.get('query_feedback', {}))}")

## 2. Analyze Operational Metrics

Let's visualize the operational metrics from the evaluation results.

In [None]:
def extract_operational_metrics(eval_data):
    """Return the subset of ``eval_data['metrics']`` that looks operational.

    A metric is kept when its lower-cased name contains any of the substrings
    'duration', 'time', 'latency', 'tokens' or 'rate'. Matching is purely by
    substring, so any name containing e.g. 'rate' is included. Returns an
    empty dict when ``eval_data`` is falsy or has no 'metrics' key.
    """
    if not eval_data or 'metrics' not in eval_data:
        return {}

    keywords = ('duration', 'time', 'latency', 'tokens', 'rate')
    selected = {}
    for name, value in eval_data['metrics'].items():
        lowered = name.lower()
        if any(keyword in lowered for keyword in keywords):
            selected[name] = value
    return selected

def plot_operational_metrics(metrics):
    """Show a 2x2 grid of bar charts for the given operational metrics.

    Panels, in order: metrics whose name mentions time/duration/latency,
    token metrics that are not rates, metrics whose name mentions 'rate',
    and two derived efficiency ratios (computed only when the hyphenated
    token/duration keys they need are present). A panel is left without a
    trace when it has no data. Prints a message and returns when ``metrics``
    is empty.
    """
    if not metrics:
        print("No operational metrics available.")
        return

    panel_titles = [
        "Response Time (seconds)",
        "Token Usage",
        "Token Rate (tokens/second)",
        "Operational Efficiency",
    ]
    fig = make_subplots(rows=2, cols=2, subplot_titles=panel_titles)

    def pick(predicate):
        # Keep the metrics whose lower-cased name satisfies the predicate.
        return {name: value for name, value in metrics.items() if predicate(name.lower())}

    timing = pick(lambda key: 'time' in key or 'duration' in key or 'latency' in key)
    usage = pick(lambda key: 'token' in key and 'rate' not in key)
    rates = pick(lambda key: 'rate' in key)

    # Derived ratios; max(...) guards against division by zero.
    derived = {}
    has_completion = 'completion-tokens' in metrics
    if has_completion and 'prompt-tokens' in metrics:
        derived['completion-prompt-ratio'] = metrics['completion-tokens'] / max(metrics['prompt-tokens'], 1)
    if 'server-run-duration-in-seconds' in metrics and has_completion:
        derived['tokens-per-second'] = metrics['completion-tokens'] / max(metrics['server-run-duration-in-seconds'], 0.001)

    panels = (
        (timing, 'royalblue', 1, 1),
        (usage, 'mediumseagreen', 1, 2),
        (rates, 'darkorange', 2, 1),
        (derived, 'purple', 2, 2),
    )
    for series, colour, row, col in panels:
        if not series:
            continue
        fig.add_trace(
            go.Bar(
                x=list(series.keys()),
                y=list(series.values()),
                marker_color=colour
            ),
            row=row, col=col
        )

    fig.update_layout(
        height=600,
        title_text="Operational Metrics Analysis",
        showlegend=False,
    )

    fig.show()

# Operational metrics are only charted when an evaluation was loaded
if not latest_eval:
    print("No evaluation data available to analyze operational metrics.")
else:
    op_metrics = extract_operational_metrics(latest_eval)
    plot_operational_metrics(op_metrics)

## 3. Response Quality Analysis

Let's analyze the quality metrics of agent responses.

In [None]:
def extract_quality_metrics(eval_data):
    """Return the subset of ``eval_data['metrics']`` that describes quality.

    A metric is kept when its lower-cased name contains any of 'quality',
    'response_', 'coherence', 'relevance', 'completeness' or 'conciseness'.
    Returns an empty dict when ``eval_data`` is falsy or has no 'metrics' key.
    """
    if not eval_data or 'metrics' not in eval_data:
        return {}

    markers = ('quality', 'response_', 'coherence', 'relevance', 'completeness', 'conciseness')

    def looks_like_quality(name):
        lowered = name.lower()
        return any(marker in lowered for marker in markers)

    return {
        name: score
        for name, score in eval_data['metrics'].items()
        if looks_like_quality(name)
    }

def plot_quality_radar(metrics):
    """Create a radar chart for quality metrics, plus a gauge for the overall score.

    Args:
        metrics: Mapping of metric name to score. Metrics whose name contains
            'overall' are left out of the radar chart; the value stored under
            'overall_quality_score', if present, is shown on a separate gauge.

    The radar chart is skipped (with a message) when no non-overall metric is
    available, instead of failing on an empty category list. Both charts use a
    fixed 0-1 axis. Nothing is returned; figures are shown via ``fig.show()``.
    """
    if not metrics:
        print("No quality metrics available.")
        return

    # Select metrics to display (exclude overall score to avoid skewing)
    display_metrics = {k: v for k, v in metrics.items() if 'overall' not in k.lower()}

    if display_metrics:
        categories = list(display_metrics.keys())
        values = list(display_metrics.values())

        # Repeat the first point so the polygon is drawn closed
        categories.append(categories[0])
        values.append(values[0])

        fig = go.Figure()

        fig.add_trace(go.Scatterpolar(
            r=values,
            theta=categories,
            fill='toself',
            line=dict(color='rgb(67, 160, 71)'),
            fillcolor='rgba(67, 160, 71, 0.2)',
            name='Quality Metrics'
        ))

        fig.update_layout(
            polar=dict(
                radialaxis=dict(
                    visible=True,
                    range=[0, 1]
                )),
            title="Response Quality Metrics",
            showlegend=False
        )

        fig.show()
    else:
        # Only "overall" metrics were supplied: there is nothing to draw on the radar.
        print("No individual quality metrics available for the radar chart.")

    # Show overall quality score if available
    overall_score = metrics.get('overall_quality_score')
    if overall_score is not None:
        print(f"Overall Quality Score: {overall_score:.2f}/1.00")

        # Create a gauge chart for overall score
        fig = go.Figure(go.Indicator(
            mode="gauge+number",
            value=overall_score,
            domain={'x': [0, 1], 'y': [0, 1]},
            title={'text': "Overall Quality Score"},
            gauge={
                'axis': {'range': [0, 1]},
                'bar': {'color': "darkgreen"},
                'steps': [
                    {'range': [0, 0.33], 'color': "lightcoral"},
                    {'range': [0.33, 0.67], 'color': "khaki"},
                    {'range': [0.67, 1], 'color': "lightgreen"},
                ],
                # Red marker line drawn at 0.8 on the gauge
                'threshold': {
                    'line': {'color': "red", 'width': 4},
                    'thickness': 0.75,
                    'value': 0.8
                }
            }
        ))

        fig.show()

# Quality metrics are only charted when an evaluation was loaded
if not latest_eval:
    print("No evaluation data available to analyze quality metrics.")
else:
    quality_metrics = extract_quality_metrics(latest_eval)
    plot_quality_radar(quality_metrics)

## 4. Factual Accuracy Analysis

Let's analyze the factual accuracy metrics from the evaluation.

In [None]:
def extract_accuracy_metrics(eval_data):
    """Return the subset of ``eval_data['metrics']`` that relates to accuracy.

    A metric is kept when its lower-cased name contains any of 'accuracy',
    'factual', 'correct', 'error' or 'mistake'. Returns an empty dict when
    ``eval_data`` is falsy or has no 'metrics' key.
    """
    if not eval_data or 'metrics' not in eval_data:
        return {}

    terms = ['accuracy', 'factual', 'correct', 'error', 'mistake']
    matches = filter(
        lambda pair: any(term in pair[0].lower() for term in terms),
        eval_data['metrics'].items(),
    )
    return dict(matches)

def plot_accuracy_metrics(metrics):
    """Show a horizontal bar chart of accuracy metrics, ordered by score.

    Metrics are sorted by value in descending order before plotting, the
    x-axis is fixed to 0-1, and the figure height grows with the number of
    metrics. Prints a message and returns when ``metrics`` is empty.
    """
    if not metrics:
        print("No accuracy metrics available.")
        return

    # Highest score first
    ranked = sorted(metrics.items(), key=lambda pair: pair[1], reverse=True)
    names = [name for name, _ in ranked]
    scores = [score for _, score in ranked]

    bar_style = dict(
        color='rgba(58, 71, 180, 0.6)',
        line=dict(color='rgba(58, 71, 180, 1.0)', width=3)
    )
    fig = go.Figure(go.Bar(x=scores, y=names, orientation='h', marker=bar_style))

    chart_height = 400 + (len(metrics) * 30)
    fig.update_layout(
        title="Factual Accuracy Metrics",
        xaxis_title="Score",
        yaxis_title="Metric",
        xaxis=dict(range=[0, 1]),
        height=chart_height,
    )

    fig.show()

# Accuracy metrics are only charted when an evaluation was loaded
if not latest_eval:
    print("No evaluation data available to analyze accuracy metrics.")
else:
    accuracy_metrics = extract_accuracy_metrics(latest_eval)
    plot_accuracy_metrics(accuracy_metrics)

## 5. User Feedback Analysis

Let's analyze the user feedback data to understand user satisfaction.

In [None]:
def _parse_feedback_timestamp(raw_timestamp):
    """Convert a millisecond epoch timestamp to a ``datetime``; ``None`` if missing or invalid."""
    if not raw_timestamp:
        return None
    try:
        return datetime.fromtimestamp(int(raw_timestamp) / 1000)
    except (TypeError, ValueError, OverflowError, OSError):
        # Non-numeric or out-of-range timestamps should not abort the analysis.
        return None


def analyze_feedback(feedback_data):
    """Flatten user feedback into a DataFrame with one row per feedback entry.

    Args:
        feedback_data: Mapping with a 'query_feedback' key that maps each
            query string to a list of entry dicts. Each entry may carry
            'timestamp' (treated as milliseconds since the epoch), 'rating'
            and 'comments'.

    Returns:
        A DataFrame with the columns 'query', 'timestamp', 'rating',
        'comments' and 'date', or ``None`` (after printing a message) when
        there is no feedback. Missing ratings default to 0, missing comments
        to ''. 'date' is ``None``/NaT when the timestamp is missing or cannot
        be parsed. A single entry dict is accepted in place of a list, and
        entries that are not dicts are skipped.
    """
    query_feedback = (feedback_data or {}).get('query_feedback', {})

    if not query_feedback:
        print("No user feedback available.")
        return None

    # Convert to a flat list of rows for easier analysis
    feedback_list = []

    for query, entries in query_feedback.items():
        if isinstance(entries, dict):
            entries = [entries]
        for entry in entries or []:
            if not isinstance(entry, dict):
                continue
            feedback_list.append({
                'query': query,
                'timestamp': entry.get('timestamp', ''),
                'rating': entry.get('rating', 0),
                'comments': entry.get('comments', ''),
                'date': _parse_feedback_timestamp(entry.get('timestamp')),
            })

    if not feedback_list:
        print("No feedback entries found.")
        return None

    return pd.DataFrame(feedback_list)

def plot_feedback(feedback_df):
    """Visualize user feedback: rating distribution, daily average and trend.

    Args:
        feedback_df: DataFrame with a 'rating' column and a 'date' column
            (datetime or missing), as produced by ``analyze_feedback``.

    Draws a donut chart of the rating distribution and, when at least one
    row has a date, a bar chart of the mean rating per day and a
    time-ordered trend line. Rows without a date are left out of the two
    date-based charts. Summary statistics are printed afterwards. The input
    DataFrame is not modified. Prints a message and returns when there is
    no data.
    """
    if feedback_df is None or feedback_df.empty:
        print("No feedback data to visualize.")
        return

    # Create subplots for different visualizations
    fig = make_subplots(
        rows=2, cols=2,
        specs=[
            [{'type': 'pie'}, {'type': 'bar'}],
            [{'type': 'scatter', 'colspan': 2}, None]
        ],
        subplot_titles=[
            "Rating Distribution",
            "Average Rating by Date",
            "Rating Trend Over Time"
        ]
    )

    # 1. Rating distribution pie chart
    rating_counts = feedback_df['rating'].value_counts().sort_index()
    # Colour is tied to the rating itself so that, e.g., 5 stars is always
    # blue even when lower ratings are absent from the data.
    star_colors = {1: '#FF4136', 2: '#FF851B', 3: '#FFDC00', 4: '#2ECC40', 5: '#0074D9'}
    fig.add_trace(
        go.Pie(
            labels=[f"{rating} Star{'' if rating == 1 else 's'}" for rating in rating_counts.index],
            values=rating_counts.values,
            marker=dict(colors=[star_colors.get(rating, '#AAAAAA') for rating in rating_counts.index]),
            hole=0.4,
            textinfo='label+percent'
        ),
        row=1, col=1
    )

    # Only continue if we have date information. dropna/assign return new
    # objects, so the caller's DataFrame is left untouched.
    dated = feedback_df.dropna(subset=['date'])
    if not dated.empty:
        dated = dated.assign(date=pd.to_datetime(dated['date'])).sort_values('date')

        # 2. Average rating by date
        daily_ratings = dated.groupby(dated['date'].dt.date)['rating'].mean()
        if not daily_ratings.empty:
            fig.add_trace(
                go.Bar(
                    x=list(daily_ratings.index),
                    y=daily_ratings.values,
                    marker_color='teal'
                ),
                row=1, col=2
            )

        # 3. Rating trend over time
        fig.add_trace(
            go.Scatter(
                x=dated['date'],
                y=dated['rating'],
                mode='markers+lines',
                marker=dict(size=8),
                line=dict(width=2, dash='solid')
            ),
            row=2, col=1
        )

    # Update layout
    fig.update_layout(
        height=800,
        title_text="User Feedback Analysis",
    )

    fig.show()

    # Print summary statistics
    print(f"Total feedback entries: {len(feedback_df)}")
    print(f"Average rating: {feedback_df['rating'].mean():.2f}/5.00")
    print(f"Rating distribution:\n{rating_counts}")

# Analyze feedback data: flatten the loaded feedback into a DataFrame, then chart it.
# plot_feedback checks for a None/empty DataFrame itself, so no guard is needed here.
feedback_df = analyze_feedback(feedback_data)
plot_feedback(feedback_df)

## 6. A/B Testing Comparison

Let's compare different agent configurations or versions from multiple evaluation results.

In [None]:
def prepare_ab_testing_data(all_evaluations):
    """Build a comparison DataFrame with one row per evaluation that has metrics.

    Columns: 'version' (the evaluation's '_filename'), 'date' ('_date'
    formatted as YYYY-MM-DD), 'response_time' (server duration, falling back
    to client duration), 'tokens' (completion + prompt), 'quality',
    'accuracy' and 'feedback'. Missing metrics default to 0. Returns ``None``
    (after printing a message) when fewer than two evaluations are supplied
    or none of them has metrics.
    """
    if not all_evaluations or len(all_evaluations) < 2:
        print("Need at least two evaluation results for A/B testing comparison.")
        return None

    def summarize(evaluation):
        # Reduce one evaluation to the handful of metrics being compared.
        values = evaluation.get('metrics', {})
        client_time = values.get('client-run-duration-in-seconds', 0)
        fallback_accuracy = values.get('overall_accuracy', 0)
        recorded_at = evaluation.get('_date', datetime.now())
        return {
            'version': evaluation.get('_filename', 'Unknown'),
            'date': recorded_at.strftime('%Y-%m-%d'),
            'response_time': values.get('server-run-duration-in-seconds', client_time),
            'tokens': values.get('completion-tokens', 0) + values.get('prompt-tokens', 0),
            'quality': values.get('overall_quality_score', 0),
            'accuracy': values.get('factual_accuracy_score', fallback_accuracy),
            'feedback': values.get('average_user_rating', 0),
        }

    # Evaluations without any metrics are left out of the comparison.
    comparison_data = [
        summarize(evaluation)
        for evaluation in all_evaluations
        if evaluation.get('metrics', {})
    ]

    if comparison_data:
        return pd.DataFrame(comparison_data)

    print("No valid metrics found for comparison.")
    return None

def plot_ab_comparison(comparison_df):
    """Show a radar chart comparing evaluations, followed by the raw table.

    Each row of ``comparison_df`` becomes one polygon over five axes. Speed
    is ``1 - response_time / max(response_time)``; token efficiency is one
    minus the token count divided by the largest token count; quality and
    accuracy are plotted as-is; user satisfaction is ``feedback / 5``.
    The unnormalized table is then displayed, newest date first. Prints a
    message and returns when there is nothing to compare.
    """
    if comparison_df is None or comparison_df.empty:
        print("No comparison data to visualize.")
        return

    # Work on a copy so the caller's frame keeps only the raw columns
    normalized_df = comparison_df.copy()

    # Response time: lower is better, so the scaled value is inverted
    slowest = normalized_df['response_time'].max()
    if slowest > 0:
        normalized_df['response_time_normalized'] = 1 - (normalized_df['response_time'] / slowest)
    else:
        normalized_df['response_time_normalized'] = 0

    # Tokens: scaled here, inverted later when the polygon is built
    most_tokens = normalized_df['tokens'].max()
    if most_tokens > 0:
        normalized_df['tokens_normalized'] = normalized_df['tokens'] / most_tokens
    else:
        normalized_df['tokens_normalized'] = 0

    # Quality, accuracy, feedback: higher is better
    normalized_df['quality_normalized'] = normalized_df['quality']
    normalized_df['accuracy_normalized'] = normalized_df['accuracy']
    normalized_df['feedback_normalized'] = normalized_df['feedback'] / 5  # Assuming 5-star scale

    categories = ['Speed', 'Token Efficiency', 'Quality', 'Accuracy', 'User Satisfaction']
    # Repeat the first axis so each polygon is drawn closed
    categories_closed = categories + categories[:1]

    fig = go.Figure()

    for _, version_row in normalized_df.iterrows():
        token_share = version_row['tokens_normalized']
        if not (token_share <= 1):
            token_share = 1

        values = [
            version_row['response_time_normalized'],
            1 - token_share,  # Invert tokens for efficiency
            version_row['quality_normalized'],
            version_row['accuracy_normalized'],
            version_row['feedback_normalized']
        ]

        fig.add_trace(go.Scatterpolar(
            r=values + values[:1],
            theta=categories_closed,
            fill='toself',
            name=f"{version_row['version']} ({version_row['date']})"
        ))

    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 1]
            )),
        title="A/B Testing Comparison",
        showlegend=True
    )

    fig.show()

    # Raw (unnormalized) comparison table, newest first
    table_columns = ['version', 'date', 'response_time', 'tokens', 'quality', 'accuracy', 'feedback']
    display_df = comparison_df[table_columns].sort_values('date', ascending=False)

    display(display_df)

# Prepare and plot A/B testing comparison.
# prepare_ab_testing_data returns None when there is not enough data;
# plot_ab_comparison checks for that itself, so no guard is needed here.
comparison_df = prepare_ab_testing_data(all_evals)
plot_ab_comparison(comparison_df)

## 7. Generate Insights and Recommendations

Based on the analysis, let's generate insights and recommendations.

In [None]:
def generate_insights(latest_eval, all_evals, feedback_df):
    """Derive textual insights and recommendations from the evaluation data.

    Looks at the latest evaluation's response time (threshold 5 s),
    completion-to-prompt token ratio (threshold 0.5), quality score
    (threshold 0.7), accuracy (threshold 0.8), the mean user rating in
    ``feedback_df`` (threshold 4) and the quality change relative to the
    second-to-last entry of ``all_evals``. Missing metrics are treated as 0.

    Returns:
        A ``(insights, recommendations)`` tuple of string lists. Two generic
        recommendations are appended when fewer than two specific ones exist.
    """
    insights, recommendations = [], []
    observe = insights.append
    advise = recommendations.append

    # Without metrics there is nothing to analyse
    if not latest_eval or 'metrics' not in latest_eval:
        observe("No recent evaluation data available.")
        advise("Run evaluation tests using enhanced_evaluation.py to collect performance data.")
        return insights, recommendations

    metrics = latest_eval['metrics']

    # 1. Response time
    seconds = metrics.get('server-run-duration-in-seconds', 0)
    if seconds > 5:
        observe(f"Response time is relatively high ({seconds:.2f}s).")
        advise("Consider optimizing agent configuration or model selection for faster responses.")
    else:
        observe(f"Response time is good ({seconds:.2f}s).")

    # 2. Token usage
    completion_count = metrics.get('completion-tokens', 0)
    prompt_count = metrics.get('prompt-tokens', 0)
    if prompt_count > 0:
        ratio = completion_count / prompt_count
        if ratio < 0.5:
            observe(f"Low completion-to-prompt ratio ({ratio:.2f}).")
            advise("Review agent instructions to reduce unnecessary context in prompts.")

    # 3. Quality
    quality = metrics.get('overall_quality_score', 0)
    if quality < 0.7:
        observe(f"Response quality score is below target ({quality:.2f}).")
        advise("Improve agent instructions or consider using a more capable model.")
    else:
        observe(f"Response quality score is good ({quality:.2f}).")

    # 4. Accuracy (falls back to 'overall_accuracy')
    accuracy = metrics.get('factual_accuracy_score', metrics.get('overall_accuracy', 0))
    if accuracy < 0.8:
        observe(f"Factual accuracy is below target ({accuracy:.2f}).")
        advise("Enhance knowledge sources or improve grounding in responses.")
    else:
        observe(f"Factual accuracy is good ({accuracy:.2f}).")

    # 5. User feedback
    has_feedback = feedback_df is not None and not feedback_df.empty
    if not has_feedback:
        observe("No user feedback data available.")
        advise("Implement feedback collection in the user interface.")
    else:
        mean_rating = feedback_df['rating'].mean()
        if mean_rating < 4:
            observe(f"Average user rating is below target ({mean_rating:.2f}/5.00).")
            advise("Review user feedback comments to identify improvement areas.")
        else:
            observe(f"User satisfaction is good ({mean_rating:.2f}/5.00).")

    # 6. Trend: compare with the second-to-last stored evaluation
    if len(all_evals) >= 2:
        earlier = all_evals[-2].get('metrics', {})
        if earlier:
            before = earlier.get('overall_quality_score', 0)
            after = metrics.get('overall_quality_score', 0)

            if after > before:
                gain = after - before
                observe(f"Quality improved since last evaluation (+{gain:.2f}).")
            elif after < before:
                loss = before - after
                observe(f"Quality decreased since last evaluation (-{loss:.2f}).")
                advise("Review recent changes to identify potential regressions.")

    # Pad with generic advice when few specific recommendations were produced
    if len(recommendations) < 2:
        recommendations.extend([
            "Continue monitoring agent performance with regular evaluations.",
            "Consider implementing A/B testing to compare different agent configurations.",
        ])

    return insights, recommendations

# Generate insights and recommendations
insights, recommendations = generate_insights(latest_eval, all_evals, feedback_df)

# Print both lists as numbered sections
for heading, entries in (
    ("📊 Key Insights:", insights),
    ("\n📝 Recommendations:", recommendations),
):
    print(heading)
    for position, entry in enumerate(entries, 1):
        print(f"  {position}. {entry}")

## 8. Export Dashboard as HTML Report

Let's export the dashboard as an HTML report that can be shared with stakeholders.

In [None]:
def create_html_report(latest_eval, all_evals, feedback_df, insights, recommendations, report_path=None):
    """Create an HTML report with the evaluation results and return its path.

    Args:
        latest_eval: Latest evaluation dict; its 'metrics' feed the summary tiles.
        all_evals: List of evaluation dicts. When it holds at least two
            entries, ``prepare_ab_testing_data`` is called to build the
            comparison table.
        feedback_df: Feedback DataFrame with a 'rating' column, or ``None``.
        insights: Iterable of insight strings.
        recommendations: Iterable of recommendation strings.
        report_path: Where to write the report. Defaults to
            '../evals/evaluation_results/evaluation_report.html'.

    Returns:
        The ``Path`` of the written file, or ``None`` (after printing a
        message) when ``latest_eval`` is falsy.

    All interpolated text is HTML-escaped, the recommendations list is always
    closed (also when no comparison table can be produced), and the file is
    written as UTF-8 with a matching charset declaration.
    """
    if not latest_eval:
        print("No evaluation data available to create report.")
        return

    # Local import: `html` is not imported at file level.
    from html import escape

    # Summary tiles
    metrics = latest_eval.get('metrics', {})
    key_metrics = [
        ('Response Time', f"{metrics.get('server-run-duration-in-seconds', 0):.2f}s"),
        ('Quality Score', f"{metrics.get('overall_quality_score', 0):.2f}/1.00"),
        ('Accuracy', f"{metrics.get('factual_accuracy_score', metrics.get('overall_accuracy', 0)):.2f}/1.00"),
        ('Token Usage', f"{metrics.get('completion-tokens', 0) + metrics.get('prompt-tokens', 0)}"),
    ]

    if feedback_df is not None and not feedback_df.empty:
        key_metrics.append(('User Rating', f"{feedback_df['rating'].mean():.2f}/5.00"))

    generated_on = datetime.now().strftime('%Y-%m-%d %H:%M')

    # The document is collected as a list of fragments and joined once at the end.
    parts = [
        """<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>AI Agent Evaluation Report</title>
    <style>
        body { font-family: Arial, sans-serif; margin: 20px; }
        .header { background-color: #4285f4; color: white; padding: 20px; margin-bottom: 20px; }
        .section { margin-bottom: 30px; padding: 20px; border: 1px solid #ddd; border-radius: 5px; }
        .insights { background-color: #f8f9fa; }
        .recommendations { background-color: #e8f0fe; }
        h1 { margin: 0; }
        h2 { color: #4285f4; }
        .metric { display: inline-block; width: 30%; margin: 10px; padding: 15px; background-color: #f5f5f5; border-radius: 5px; text-align: center; }
        .metric-value { font-size: 24px; font-weight: bold; color: #4285f4; }
        .metric-label { font-size: 14px; color: #555; }
        table { width: 100%; border-collapse: collapse; }
        th, td { padding: 8px; text-align: left; border-bottom: 1px solid #ddd; }
        th { background-color: #f2f2f2; }
    </style>
</head>
<body>
    <div class="header">
        <h1>AI Agent Evaluation Report</h1>
""",
        f"        <p>Generated on {generated_on}</p>\n",
        """    </div>

    <div class="section">
        <h2>Summary Metrics</h2>
""",
    ]

    for label, value in key_metrics:
        parts.append(
            '        <div class="metric">\n'
            f'            <div class="metric-value">{escape(str(value))}</div>\n'
            f'            <div class="metric-label">{escape(str(label))}</div>\n'
            '        </div>\n'
        )

    # Insights section
    parts.append(
        '    </div>\n\n'
        '    <div class="section insights">\n'
        '        <h2>Key Insights</h2>\n'
        '        <ul>\n'
    )
    parts.extend(f"            <li>{escape(str(insight))}</li>\n" for insight in insights)

    # Recommendations section. It is closed here unconditionally so the markup
    # stays balanced whether or not a comparison table follows.
    parts.append(
        '        </ul>\n'
        '    </div>\n\n'
        '    <div class="section recommendations">\n'
        '        <h2>Recommendations</h2>\n'
        '        <ul>\n'
    )
    parts.extend(f"            <li>{escape(str(rec))}</li>\n" for rec in recommendations)
    parts.append('        </ul>\n    </div>\n')

    # Comparison table, only when at least two evaluations yield data
    comparison_df = None
    if all_evals and len(all_evals) >= 2:
        comparison_df = prepare_ab_testing_data(all_evals)

    if comparison_df is not None and not comparison_df.empty:
        parts.append(
            '\n    <div class="section">\n'
            '        <h2>Evaluation Comparison</h2>\n'
            '        <table>\n'
            '            <tr>\n'
            '                <th>Version</th>\n'
            '                <th>Date</th>\n'
            '                <th>Response Time</th>\n'
            '                <th>Tokens</th>\n'
            '                <th>Quality</th>\n'
            '                <th>Accuracy</th>\n'
            '                <th>User Rating</th>\n'
            '            </tr>\n'
        )
        for _, row in comparison_df.sort_values('date', ascending=False).iterrows():
            parts.append(
                '            <tr>\n'
                f"                <td>{escape(str(row['version']))}</td>\n"
                f"                <td>{escape(str(row['date']))}</td>\n"
                f"                <td>{row['response_time']:.2f}s</td>\n"
                f"                <td>{escape(str(row['tokens']))}</td>\n"
                f"                <td>{row['quality']:.2f}</td>\n"
                f"                <td>{row['accuracy']:.2f}</td>\n"
                f"                <td>{row['feedback']:.2f}</td>\n"
                '            </tr>\n'
            )
        parts.append('        </table>\n    </div>\n')

    parts.append(
        '\n    <div class="section">\n'
        '        <h2>About</h2>\n'
        '        <p>This report was generated by the AI Agent Evaluation Dashboard.</p>\n'
        '    </div>\n'
        '</body>\n'
        '</html>\n'
    )

    # Save to file
    if report_path is None:
        report_path = Path('../evals/evaluation_results/evaluation_report.html')
    else:
        report_path = Path(report_path)
    report_path.parent.mkdir(parents=True, exist_ok=True)

    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(''.join(parts))

    print(f"HTML report saved to {report_path}")
    return report_path

# The HTML report is only written when an evaluation was loaded
if not latest_eval:
    print("No evaluation data available to create report.")
else:
    report_path = create_html_report(latest_eval, all_evals, feedback_df, insights, recommendations)

## 9. Add Sample Data for Development

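If you don't have real evaluation data yet, you can use this cell to generate sample data for development and testing. Uncomment the `generate_sample_data()` call at the end of the cell, run it, and then re-run the notebook from section 1 so the new files are loaded.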

In [None]:
def generate_sample_data(eval_results_path=None, feedback_path=None):
    """Generate sample evaluation and feedback data for development.

    Args:
        eval_results_path: Directory for the sample evaluation files.
            Defaults to '../evals/evaluation_results'.
        feedback_path: File for the sample feedback JSON.
            Defaults to '../evals/feedback_data.json'.

    Writes three ``sample_eval_<timestamp>.json`` files dated one day apart
    (the newest dated now), copies the newest to ``latest.json`` and writes
    randomised feedback for five sample queries. Each sample file's
    'timestamp' field and modification time are set to its staggered date,
    so code that orders or labels evaluations by file mtime sees distinct
    dates. Existing files with the same names are overwritten.
    """
    import json
    import os
    import random
    from datetime import datetime, timedelta
    from pathlib import Path

    if eval_results_path is None:
        eval_results_path = Path('../evals/evaluation_results')
    else:
        eval_results_path = Path(eval_results_path)
    if feedback_path is None:
        feedback_path = Path('../evals/feedback_data.json')
    else:
        feedback_path = Path(feedback_path)

    # Create directories if they don't exist
    eval_results_path.mkdir(parents=True, exist_ok=True)
    feedback_path.parent.mkdir(parents=True, exist_ok=True)

    # Sample queries
    sample_queries = [
        "How does this agent work?",
        "What are the key features of this application?",
        "Can you explain how the file search works?",
        "Tell me about agent personalities",
        "How is monitoring and evaluation implemented?"
    ]

    num_evaluations = 3
    # Captured once so the staggered dates are exactly one day apart.
    now = datetime.now()

    for i in range(num_evaluations):
        # Create base metrics with some randomness; later samples score better
        base_quality = 0.7 + (i * 0.05) + random.uniform(-0.05, 0.05)
        base_accuracy = 0.75 + (i * 0.05) + random.uniform(-0.05, 0.05)
        base_response_time = 3 - (i * 0.3) + random.uniform(-0.2, 0.2)

        metrics = {
            # Operational metrics
            "server-run-duration-in-seconds": max(0.5, base_response_time),
            "client-run-duration-in-seconds": max(0.6, base_response_time + 0.1),
            "completion-tokens": random.randint(150, 250),
            "prompt-tokens": random.randint(300, 500),
            "token-generation-rate": random.uniform(10, 15),

            # Quality metrics
            "response_completeness": min(1.0, base_quality + random.uniform(-0.1, 0.1)),
            "response_relevance": min(1.0, base_quality + random.uniform(-0.1, 0.1)),
            "response_conciseness": min(1.0, base_quality + random.uniform(-0.1, 0.1)),
            "overall_quality_score": min(1.0, base_quality),

            # Accuracy metrics
            "factual_accuracy_score": min(1.0, base_accuracy),
            "citation_accuracy": min(1.0, base_accuracy + random.uniform(-0.1, 0.1)),
            "hallucination_rate": max(0, 1 - base_accuracy + random.uniform(-0.1, 0.1)),

            # User feedback metrics
            "average_user_rating": min(5.0, (base_quality + base_accuracy) * 2.5),

            # Tool metrics
            "tool_call_accuracy": min(1.0, 0.85 + (i * 0.03) + random.uniform(-0.05, 0.05)),
            "intent_resolution": min(1.0, 0.87 + (i * 0.03) + random.uniform(-0.05, 0.05)),
            "task_adherence": min(1.0, 0.9 + (i * 0.02) + random.uniform(-0.05, 0.05)),

            # Safety metrics
            "content_safety_score": min(1.0, 0.95 + random.uniform(-0.03, 0.03)),
            "attack_resistance": min(1.0, 0.92 + random.uniform(-0.03, 0.03)),
        }

        # Oldest sample first; the last one is dated "now".
        run_time = now - timedelta(days=num_evaluations - 1 - i)

        # Create the evaluation result
        result = {
            "version": f"v1.{i}",
            "timestamp": run_time.strftime("%Y-%m-%d %H:%M:%S"),
            "evaluation_name": f"sample-evaluation-{i+1}",
            "metrics": metrics,
            "config": {
                "model": "gpt-4o",
                "evaluators": ["operational", "quality", "accuracy", "feedback", "safety"]
            }
        }

        # Save to file
        file_path = eval_results_path / f"sample_eval_{run_time.strftime('%Y%m%d%H%M%S')}.json"
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2)

        # Align the file's mtime with its staggered date.
        run_epoch = run_time.timestamp()
        os.utime(file_path, (run_epoch, run_epoch))

        # Also save the last one as latest.json
        if i == num_evaluations - 1:
            with open(eval_results_path / 'latest.json', 'w', encoding='utf-8') as f:
                json.dump(result, f, indent=2)

    # Generate sample feedback data
    feedback_data = {"query_feedback": {}}

    for query in sample_queries:
        # Generate 2-5 feedback entries for each query
        entries = []
        for _ in range(random.randint(2, 5)):
            # Create timestamps over the past week (milliseconds since the epoch)
            days_ago = random.randint(0, 7)
            timestamp = int((now - timedelta(days=days_ago)).timestamp() * 1000)

            entries.append({
                "timestamp": timestamp,
                "rating": random.randint(3, 5),  # Skewed toward positive ratings
                "comments": random.choice([
                    "Good response, thanks!",
                    "Very helpful information",
                    "Could be more detailed",
                    "Perfect answer",
                    ""  # Some entries without comments
                ])
            })

        feedback_data["query_feedback"][query] = entries

    # Save feedback data
    with open(feedback_path, 'w', encoding='utf-8') as f:
        json.dump(feedback_data, f, indent=2)

    print(f"Generated {num_evaluations} sample evaluation files in {eval_results_path}")
    print(f"Generated sample feedback data with {len(sample_queries)} queries in {feedback_path}")

# Uncomment this line if you want to generate sample data
# generate_sample_data()

## Conclusion

This notebook has provided comprehensive analysis tools for the AI agent monitoring and evaluation framework. You can use these tools to:

1. Monitor agent performance over time
2. Compare different agent configurations
3. Analyze user feedback and satisfaction
4. Generate insights and recommendations for improvement

For ongoing monitoring, you can schedule this notebook to run regularly or integrate it into your continuous evaluation pipeline.