# Evaluation Metrics Analysis (Groq - Qwen3-32B)

This notebook analyzes the evaluation metrics produced by the agentic inference runs.

## Prerequisites
- All 5 batches completed (batch-1 to batch-5)
- Consolidated results available in `results_v2/`

## Metrics analyzed
- **String Metrics**: BLEU, Rouge-L, Jaro-Winkler, Jaccard Cypher
- **Output Metrics**: Pass@1, Jaccard Output
- **Composite Metrics**: LLMetric-Q, LLMetric
- **Agentic Metrics**: Iterations, Recovery Rate, First Attempt Success

In [None]:
# Setup
import sys
import os
from pathlib import Path

# Resolve the project root only once per session. This cell moves the working
# directory to the parent of the current one, so running it a second time
# would otherwise climb one directory higher on every re-run.
if "project_root" not in globals():
    _resolved_root = Path.cwd().parent
    os.chdir(_resolved_root)
    project_root = _resolved_root

# Put the project root on the import path (added at most once).
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns

# Plot defaults applied to the figures created below.
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': [12, 6],
    'font.size': 10,
})

print(f"Project root: {project_root}")

## 1. Load Consolidated Results

In [None]:
# Load experiment summary
results_dir = project_root / "results_v2"
summary_path = results_dir / "experiment_summary.json"

if summary_path.exists():
    # Read as UTF-8 explicitly instead of relying on the platform's default
    # text encoding.
    with open(summary_path, "r", encoding="utf-8") as f:
        summary = json.load(f)
    # Every field is looked up with a fallback so that a summary missing one
    # of them is still reported instead of raising KeyError.
    print(f"Loaded results from: {summary.get('timestamp', 'unknown')}")
    print(f"LLM Provider: {summary.get('llm_provider', 'groq')}")
    print(f"LLM Model: {summary.get('llm_model', 'qwen3-32b')}")
    print(f"Configurations: {list(summary.get('configurations', {}))}")
else:
    print("No consolidated results found.")
    print("Please run all 5 batch notebooks first, then consolidate in batch-5.")
    # A falsy value here makes the detailed-results loading step a no-op.
    summary = None

In [None]:
# Load detailed results for each configuration
# Maps configuration directory name -> DataFrame read from its agentic_results.csv
detailed_results = {}

if summary:
    for batch_num in range(1, 6):
        batch_dir = results_dir / f"batch_{batch_num}"
        if not batch_dir.exists():
            continue
        # iterdir() yields entries in arbitrary order; sorting keeps the
        # configuration order (and everything derived from it) reproducible.
        for config_dir in sorted(batch_dir.iterdir()):
            if not config_dir.is_dir() or config_dir.name == "checkpoints":
                continue
            csv_path = config_dir / "agentic_results.csv"
            if not csv_path.exists():
                continue
            if config_dir.name in detailed_results:
                # Same behaviour as before (the later batch wins), but no longer silent.
                print(f"Warning: {config_dir.name} was already loaded; replacing it with batch_{batch_num}")
            detailed_results[config_dir.name] = pd.read_csv(csv_path)
            print(f"Loaded {len(detailed_results[config_dir.name])} results for {config_dir.name}")

In [None]:
# Load consolidated summary CSV
consolidated_path = results_dir / "consolidated_summary.csv"

if consolidated_path.exists():
    df_summary = pd.read_csv(consolidated_path)
    print(f"\nLoaded consolidated summary: {len(df_summary)} configurations")
    display(df_summary)
else:
    print("Consolidated summary not found. Creating from batch summaries...")

    # Collect whichever per-batch summary files are present.
    batch_summary_paths = [
        results_dir / f"batch_{batch_num}" / f"batch_{batch_num}_summary.csv"
        for batch_num in range(1, 6)
    ]
    all_summaries = [pd.read_csv(path) for path in batch_summary_paths if path.exists()]

    if all_summaries:
        df_summary = pd.concat(all_summaries, ignore_index=True)
        display(df_summary)
    else:
        # Keep df_summary defined so any later use sees an empty frame
        # instead of failing with NameError.
        df_summary = pd.DataFrame()
        print("No batch summaries found either; df_summary is empty.")

## 2. Calculate Additional Metrics

In [None]:
# Import metrics calculators
from metrics.string_metrics import calculate_bleu, calculate_rouge_l, calculate_jaro_winkler
from metrics.output_metrics import calculate_jaccard_output
from metrics.composite_metrics import calculate_llmetric_q, calculate_llmetric


def _mean_or_zero(values, ndigits):
    """Return the mean of *values* rounded to *ndigits*, or 0 for an empty list."""
    return round(np.mean(values), ndigits) if values else 0


# Calculate detailed metrics for each configuration
metrics_data = []

for config_name, df in detailed_results.items():
    if df.empty:
        # Every rate below divides by the row count; a results file without
        # rows would produce NaN columns or a division by zero.
        print(f"Skipping {config_name}: no result rows")
        continue

    config_metrics = {
        "Configuration": config_name,
        "Total": len(df),
    }

    # Basic rates (the optional columns fall back to 0 when absent)
    config_metrics["Pass@1 (%)"] = round(df['pass_at_1'].mean() * 100, 2) if 'pass_at_1' in df else 0
    config_metrics["Success (%)"] = round(df['success'].mean() * 100, 2)
    config_metrics["KG Valid (%)"] = round(df['kg_valid'].mean() * 100, 2) if 'kg_valid' in df else 0

    # Iterations
    config_metrics["Avg Iterations"] = round(df['total_iterations'].mean(), 2)

    # Per-row string metrics and LLMetric-Q, computed in a single pass over
    # the rows that have both a generated query and a ground truth.
    bleu_scores = []
    rouge_scores = []
    jaro_scores = []
    llmetric_q_scores = []

    scorable_rows = df.dropna(subset=['final_query', 'ground_truth'])
    for _, row in scorable_rows.iterrows():
        truth = row['ground_truth']
        query = row['final_query']
        bleu_scores.append(calculate_bleu(truth, query))
        rouge_scores.append(calculate_rouge_l(truth, query))
        jaro_scores.append(calculate_jaro_winkler(truth, query))
        llmetric_q_scores.append(calculate_llmetric_q(
            ground_truth=truth,
            generated=query,
            pass_at_1=row.get('pass_at_1', False)
        ))

    config_metrics["Avg BLEU"] = _mean_or_zero(bleu_scores, 4)
    config_metrics["Avg Rouge-L"] = _mean_or_zero(rouge_scores, 4)
    config_metrics["Avg Jaro-Winkler"] = _mean_or_zero(jaro_scores, 4)
    config_metrics["Avg LLMetric-Q"] = _mean_or_zero(llmetric_q_scores, 2)

    # Calculate LLMetric (configuration level) from the values reported above
    config_metrics["LLMetric"] = round(calculate_llmetric(
        pass_at_1_rate=config_metrics["Pass@1 (%)"] / 100,
        kg_valid_rate=config_metrics["KG Valid (%)"] / 100,
        avg_llmetric_q=config_metrics["Avg LLMetric-Q"]
    ), 2)

    # Agentic metrics: a row with more than one iteration counts as
    # "initially failed"; it is "recovered" if it succeeded in the end.
    retried = df['total_iterations'] > 1
    first_attempt = df.loc[df['total_iterations'] == 1, 'success'].sum()
    recovered = int((retried & (df['success'] == True)).sum())
    initially_failed = int(retried.sum())

    config_metrics["First Attempt Success (%)"] = round(first_attempt / len(df) * 100, 2)
    config_metrics["Recovery Rate (%)"] = round(recovered / initially_failed * 100, 2) if initially_failed > 0 else 0

    metrics_data.append(config_metrics)

df_metrics = pd.DataFrame(metrics_data)
print("\nDetailed Metrics:")
display(df_metrics)

## 3. Visualizations

In [None]:
# Pass@1 Rate by Configuration
if len(df_metrics) > 0:
    fig, ax = plt.subplots(figsize=(14, 6))

    configs = df_metrics['Configuration'].tolist()
    pass_rates = df_metrics['Pass@1 (%)'].tolist()

    # One shade of blue per bar, from lighter to darker.
    colors = plt.cm.Blues(np.linspace(0.4, 0.8, len(configs)))
    bars = ax.bar(configs, pass_rates, color=colors)

    ax.set_xlabel('Configuration', fontsize=12)
    ax.set_ylabel('Pass@1 Rate (%)', fontsize=12)
    ax.set_title('Pass@1 Rate by Configuration (Groq - Qwen3-32B)', fontsize=14)
    # set_xticklabels only relabels existing ticks; pin the tick positions
    # first so that each label is guaranteed to sit under its own bar.
    ax.set_xticks(list(range(len(configs))))
    ax.set_xticklabels(configs, rotation=45, ha='right')

    # Print the value just above each bar.
    for bar, val in zip(bars, pass_rates):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
                f'{val:.1f}%', ha='center', va='bottom', fontsize=9)

    plt.tight_layout()
    plt.savefig(results_dir / 'pass_at_1_by_config.png', dpi=150, bbox_inches='tight')
    plt.show()

In [None]:
# LLMetric Comparison
if len(df_metrics) > 0:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # (axes, column, bar colour, y-axis label, title) for each panel
    llmetric_panels = [
        (axes[0], 'Avg LLMetric-Q', 'coral', 'LLMetric-Q Score', 'Average LLMetric-Q by Configuration'),
        (axes[1], 'LLMetric', 'seagreen', 'LLMetric Score', 'LLMetric by Configuration'),
    ]
    tick_positions = list(range(len(df_metrics)))

    for panel_ax, column, bar_color, y_label, panel_title in llmetric_panels:
        panel_ax.bar(df_metrics['Configuration'], df_metrics[column], color=bar_color)
        panel_ax.set_xlabel('Configuration')
        panel_ax.set_ylabel(y_label)
        panel_ax.set_title(panel_title)
        # set_xticklabels only relabels existing ticks; pin the tick positions
        # first so that each label is guaranteed to sit under its own bar.
        panel_ax.set_xticks(tick_positions)
        panel_ax.set_xticklabels(df_metrics['Configuration'], rotation=45, ha='right')

    plt.tight_layout()
    plt.savefig(results_dir / 'llmetric_comparison.png', dpi=150, bbox_inches='tight')
    plt.show()

In [None]:
# Agentic Metrics
if len(df_metrics) > 0:
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))

    # (axes, column, bar colour, y-axis label, title) for each panel
    agentic_panels = [
        (axes[0], 'Avg Iterations', 'mediumpurple', 'Average Iterations', 'Average Iterations Needed'),
        (axes[1], 'Recovery Rate (%)', 'darkorange', 'Recovery Rate (%)', 'Recovery Rate (Initially Failed -> Success)'),
        (axes[2], 'First Attempt Success (%)', 'teal', 'First Attempt Success (%)', 'First Attempt Success Rate'),
    ]
    tick_positions = list(range(len(df_metrics)))

    for panel_ax, column, bar_color, y_label, panel_title in agentic_panels:
        panel_ax.bar(df_metrics['Configuration'], df_metrics[column], color=bar_color)
        panel_ax.set_xlabel('Configuration')
        panel_ax.set_ylabel(y_label)
        panel_ax.set_title(panel_title)
        # set_xticklabels only relabels existing ticks; pin the tick positions
        # first so that each label is guaranteed to sit under its own bar.
        panel_ax.set_xticks(tick_positions)
        panel_ax.set_xticklabels(df_metrics['Configuration'], rotation=45, ha='right')

    # Reference line at one iteration; the legend is what makes its label visible.
    axes[0].axhline(y=1, color='red', linestyle='--', alpha=0.5, label='Single attempt')
    axes[0].legend()

    plt.tight_layout()
    plt.savefig(results_dir / 'agentic_metrics.png', dpi=150, bbox_inches='tight')
    plt.show()

## 4. Analysis by Prompt Type and Schema Format

In [None]:
# Parse configuration names
def parse_config(config_name):
    """Split a configuration name into ``(prompt, schema)`` at its first underscore.

    Everything before the first ``_`` is the prompt part and everything after
    it is the schema part. A name containing no underscore is returned
    unchanged, paired with ``'Unknown'``.
    """
    prompt, separator, schema = config_name.partition('_')
    if not separator:
        return config_name, 'Unknown'
    return prompt, schema

# Derive the Prompt and Schema columns from each configuration name.
parsed_pairs = [parse_config(name) for name in df_metrics['Configuration']]
df_metrics['Prompt'] = [prompt_part for prompt_part, _ in parsed_pairs]
df_metrics['Schema'] = [schema_part for _, schema_part in parsed_pairs]

# Aggregate by Prompt Type: mean of each listed metric, rounded to 2 decimals.
prompt_metric_columns = [
    'Pass@1 (%)',
    'KG Valid (%)',
    'LLMetric',
    'Avg Iterations',
    'Recovery Rate (%)',
]
prompt_agg = (
    df_metrics
    .groupby('Prompt')
    .agg(dict.fromkeys(prompt_metric_columns, 'mean'))
    .round(2)
)

print("\nAggregate by Prompt Type:")
display(prompt_agg)

In [None]:
# Aggregate by Schema Format: mean of each listed metric, rounded to 2 decimals.
schema_metric_columns = [
    'Pass@1 (%)',
    'KG Valid (%)',
    'LLMetric',
    'Avg Iterations',
    'Recovery Rate (%)',
]
schema_agg = (
    df_metrics
    .groupby('Schema')
    .agg(dict.fromkeys(schema_metric_columns, 'mean'))
    .round(2)
)

print("\nAggregate by Schema Format:")
display(schema_agg)

In [None]:
# Heatmap: Prompt vs Schema (drawn only when at least 9 configurations exist)
if len(df_metrics) >= 9:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    llmetric_ax, pass_ax = axes

    # Prompt x Schema grids: rows are prompts, columns are schemas.
    pivot_llmetric = df_metrics.pivot(index='Prompt', columns='Schema', values='LLMetric')
    pivot_pass = df_metrics.pivot(index='Prompt', columns='Schema', values='Pass@1 (%)')

    # Left panel: LLMetric
    sns.heatmap(pivot_llmetric, annot=True, fmt='.2f', cmap='YlGnBu', ax=llmetric_ax)
    llmetric_ax.set_title('LLMetric: Prompt x Schema')

    # Right panel: Pass@1
    sns.heatmap(pivot_pass, annot=True, fmt='.1f', cmap='YlGnBu', ax=pass_ax)
    pass_ax.set_title('Pass@1 Rate (%): Prompt x Schema')

    plt.tight_layout()
    plt.savefig(results_dir / 'heatmap_prompt_schema.png', dpi=150, bbox_inches='tight')
    plt.show()

## 5. Rankings

In [None]:
# Rank configurations by LLMetric, best first, with a 1-based "Rank" index.
sorted_by_llmetric = df_metrics.sort_values('LLMetric', ascending=False)
df_ranked = sorted_by_llmetric.reset_index(drop=True)
df_ranked.index += 1
df_ranked.index.name = 'Rank'

ranking_columns = [
    'Configuration',
    'Pass@1 (%)',
    'KG Valid (%)',
    'LLMetric',
    'Avg Iterations',
    'Recovery Rate (%)',
]

print("\nConfigurations Ranked by LLMetric:")
display(df_ranked[ranking_columns])

## 6. Export Results

In [None]:
# Write the per-configuration metrics table as CSV.
export_path = results_dir / 'metrics_summary.csv'
df_metrics.to_csv(export_path, index=False)
print(f"Exported metrics to: {export_path}")

# Write the same table plus the ranking and both aggregates to one workbook.
# This step is best-effort: any failure is reported and does not stop the cell.
excel_path = results_dir / 'metrics_summary.xlsx'
try:
    with pd.ExcelWriter(excel_path, engine='openpyxl') as excel_writer:
        # Only the summary sheet drops its index; on the other sheets the
        # index (rank or group key) is written as well.
        df_metrics.to_excel(excel_writer, sheet_name='Summary', index=False)
        df_ranked.to_excel(excel_writer, sheet_name='Ranked')
        prompt_agg.to_excel(excel_writer, sheet_name='By Prompt')
        schema_agg.to_excel(excel_writer, sheet_name='By Schema')
    print(f"Exported Excel to: {excel_path}")
except Exception as e:
    print(f"Could not export to Excel: {e}")

In [None]:
# Closing message with a pointer to the next notebook.
print(
    "\nEvaluation metrics analysis complete!",
    "Proceed to 03_comparative_analysis.ipynb for comparison with baseline.",
    sep="\n",
)