# Comparative Analysis: Baseline vs Agentic

This notebook compares the results of the agentic loop against the baseline (linear pipeline).

## Comparison:
- Pass@1 Rate improvement
- KG Validity improvement
- LLMetric improvement
- Impact of self-correction

In [None]:
# Setup
# Resolves the project root, makes it importable, loads the analysis
# libraries, and applies the plotting defaults used by every chart below.
import sys
from pathlib import Path

# The project root is taken to be the parent of the current working directory.
# NOTE(review): this presumably assumes the notebook is launched from a
# sub-folder directly under the project root - confirm.
project_root = Path.cwd().parent
# Prepend the root to sys.path once (the membership check keeps re-runs of
# this cell from adding duplicates).
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns

# Global plot styling: seaborn "whitegrid" theme and a 14x6 default figure size.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = [14, 6]

## 1. Load Baseline Results (from kg-axel)

In [None]:
# Placeholder baseline table for Axel's research (kg-axel).
# Every configuration starts with 0.0 for all three metrics; replace the
# zeros with the actual kg-axel numbers before interpreting any comparison,
# e.g. baseline_results["CoT_Full"]["llmetric"] = <actual value>.
# Keys have the form "<prefix>_<suffix>", generated prefix-major so the order
# is Zero-Shot_*, Few-Shot_*, CoT_*, each with Full, Nodes+Paths, Paths.

baseline_results = {
    f"{prefix}_{suffix}": {
        "pass_at_1_rate": 0.0,  # Update with actual value
        "kg_valid_rate": 0.0,   # Update with actual value
        "llmetric": 0.0,        # Update with actual value
    }
    for prefix in ("Zero-Shot", "Few-Shot", "CoT")
    for suffix in ("Full", "Nodes+Paths", "Paths")
}

print("Baseline results loaded (update with actual values from kg-axel)")

In [None]:
# Load baseline from CSV if available
# kg_axel_results_path = Path("/Users/tsimiscouse/Docs/Sarjana/Skripsi/kg-axel/results/...")

# Alternative: Load from kg-axel evaluation results
# if kg_axel_results_path.exists():
#     baseline_df = pd.read_csv(kg_axel_results_path)
#     print(baseline_df.head())

## 2. Load Agentic Results

In [None]:
# Load agentic results
# Reads <project_root>/results/experiment_summary.json into `agentic_summary`.
# When the file does not exist, `agentic_summary` is set to None; the later
# cells use its truthiness to decide whether to run.
results_dir = project_root / "results"
summary_path = results_dir / "experiment_summary.json"

if summary_path.exists():
    # Decode explicitly as UTF-8 instead of the platform's locale encoding so
    # the summary loads identically on every OS.
    with open(summary_path, "r", encoding="utf-8") as f:
        agentic_summary = json.load(f)
    # The timestamp is only used for this log line, so a summary without one
    # should not abort the load.
    print(f"Loaded agentic results from: {agentic_summary.get('timestamp', 'unknown timestamp')}")
else:
    print("No agentic results found. Please run inference first.")
    agentic_summary = None

## 3. Calculate Improvements

In [None]:
# Compare baseline vs agentic
# Builds `df_comparison` with one row per configuration found in the agentic
# summary: baseline value, agentic value and their difference for each of the
# three metrics, plus the recovery rate and average iteration count taken from
# the optional 'agentic_metrics' sub-dict.
# NOTE(review): pass_at_1_rate / kg_valid_rate are placed under "(%)" headers
# as-is while recovery_rate is multiplied by 100 - confirm the summary stores
# the former as percentages and the latter as a fraction.
if agentic_summary:
    comparison_rows = []

    for config_name, agentic_config in agentic_summary['configurations'].items():
        # A configuration with no baseline entry falls back to 0 for every
        # baseline metric, so its deltas equal the raw agentic values.
        baseline = baseline_results.get(config_name, {})

        baseline_pass = baseline.get('pass_at_1_rate', 0)
        baseline_kg = baseline.get('kg_valid_rate', 0)
        baseline_llm = baseline.get('llmetric', 0)

        agentic_pass = agentic_config['pass_at_1_rate']
        agentic_kg = agentic_config['kg_valid_rate']
        agentic_llm = agentic_config['llmetric']

        # Missing agentic metrics are reported as 0 rather than raising.
        agentic_metrics = agentic_config.get('agentic_metrics', {})

        comparison_rows.append({
            "Configuration": config_name,
            "Baseline Pass@1 (%)": baseline_pass,
            "Agentic Pass@1 (%)": round(agentic_pass, 2),
            "Pass@1 Δ (pp)": round(agentic_pass - baseline_pass, 2),
            "Baseline KG Valid (%)": baseline_kg,
            "Agentic KG Valid (%)": round(agentic_kg, 2),
            "KG Valid Δ (pp)": round(agentic_kg - baseline_kg, 2),
            "Baseline LLMetric": baseline_llm,
            "Agentic LLMetric": round(agentic_llm, 2),
            "LLMetric Δ": round(agentic_llm - baseline_llm, 2),
            "Recovery Rate (%)": round(agentic_metrics.get('recovery_rate', 0) * 100, 2),
            "Avg Iterations": round(agentic_metrics.get('avg_iterations', 0), 2),
        })

    df_comparison = pd.DataFrame(comparison_rows)
    # A bare `df_comparison` expression nested inside this `if` body is not
    # rendered by the notebook (only a cell's last top-level expression is),
    # so emit the table explicitly.
    print(df_comparison.to_string(index=False))

## 4. Visualize Improvements

In [None]:
# Pass@1 Improvement
# Grouped bar chart with one baseline bar and one agentic bar per
# configuration, saved to baseline_vs_agentic_pass1.png in results_dir.
if agentic_summary and baseline_results:
    fig, ax = plt.subplots(figsize=(14, 6))

    width = 0.35
    x = np.arange(len(df_comparison))

    # Baseline bars sit half a bar-width left of each tick, agentic bars half
    # a bar-width right, so each pair is centred on its configuration label.
    bars1 = ax.bar(
        x - width/2,
        df_comparison['Baseline Pass@1 (%)'],
        width,
        label='Baseline (Linear)',
        color='gray',
        alpha=0.7,
    )
    bars2 = ax.bar(
        x + width/2,
        df_comparison['Agentic Pass@1 (%)'],
        width,
        label='Agentic (Loop)',
        color='steelblue',
    )

    ax.set(
        xlabel='Configuration',
        ylabel='Pass@1 Rate (%)',
        title='Pass@1 Rate: Baseline vs Agentic',
    )
    ax.set_xticks(x)
    ax.set_xticklabels(df_comparison['Configuration'], rotation=45, ha='right')
    ax.legend()

    fig.tight_layout()
    fig.savefig(results_dir / 'baseline_vs_agentic_pass1.png', dpi=150)
    plt.show()

In [None]:
# Improvement Delta
# Three side-by-side panels (Pass@1, KG validity, LLMetric) showing the
# agentic-minus-baseline delta per configuration. Bars are green for a
# positive delta and red otherwise. Saved to improvement_delta.png in
# results_dir.
if agentic_summary and baseline_results:
    fig, axes = plt.subplots(1, 3, figsize=(16, 5))

    # (delta column, y-axis label, panel title) for each panel, left to right.
    panels = [
        ('Pass@1 Δ (pp)', 'Improvement (pp)', 'Pass@1 Improvement'),
        ('KG Valid Δ (pp)', 'Improvement (pp)', 'KG Validity Improvement'),
        ('LLMetric Δ', 'Improvement', 'LLMetric Improvement'),
    ]
    positions = np.arange(len(df_comparison))

    for ax, (column, ylabel, title) in zip(axes, panels):
        deltas = df_comparison[column]
        colors = ['green' if delta > 0 else 'red' for delta in deltas]

        ax.bar(positions, deltas, color=colors)
        ax.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
        ax.set_xlabel('Configuration')
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        # Fix the tick positions before assigning labels: set_xticklabels on
        # its own depends on whatever ticks the locator happens to produce and
        # triggers a matplotlib warning. Same pattern as the Pass@1 chart.
        ax.set_xticks(positions)
        ax.set_xticklabels(df_comparison['Configuration'], rotation=45, ha='right')

    plt.tight_layout()
    plt.savefig(results_dir / 'improvement_delta.png', dpi=150)
    plt.show()

## 5. Self-Correction Impact Analysis

In [None]:
# Analyze where self-correction helped
# For every configuration, prints the first-attempt success rate, the final
# rate (kg_valid_rate), the difference between the two, and the recovery rate.
# NOTE(review): first_attempt_success_rate and recovery_rate are multiplied by
# 100 while kg_valid_rate is used as-is - confirm the summary stores the first
# two as fractions and kg_valid_rate as a percentage, otherwise the difference
# below mixes units.
if agentic_summary:
    print("\n=== Self-Correction Impact Analysis ===")
    print("\nQuestions that were initially wrong but corrected after refinement:")

    for config_name, config in agentic_summary['configurations'].items():
        # Missing agentic metrics are reported as 0 rather than raising.
        agentic_metrics = config.get('agentic_metrics', {})

        first_attempt_rate = agentic_metrics.get('first_attempt_success_rate', 0) * 100
        final_success_rate = config['kg_valid_rate']
        recovery_rate = agentic_metrics.get('recovery_rate', 0) * 100

        improvement_from_correction = final_success_rate - first_attempt_rate

        print(f"\n{config_name}:")
        print(f"  First Attempt Success: {first_attempt_rate:.1f}%")
        print(f"  Final Success Rate: {final_success_rate:.1f}%")
        # The '+' format flag emits the correct sign; a hard-coded "+" prefix
        # would render a regression as "+-3.0pp".
        print(f"  Improvement from Correction: {improvement_from_correction:+.1f}pp")
        print(f"  Recovery Rate: {recovery_rate:.1f}%")

In [None]:
# Iterations to Success Distribution
# One small bar chart per configuration, built from the
# 'iterations_to_success' mapping in its agentic metrics (keys on the x axis,
# values as bar heights). Configurations without that mapping get a
# "No data" placeholder. Saved to iterations_distribution.png in results_dir.
if agentic_summary:
    configurations = agentic_summary['configurations']

    # Size the grid from the number of configurations instead of assuming
    # exactly nine: a fixed 3x3 grid raises IndexError for a tenth
    # configuration and leaves empty frames when there are fewer.
    n_cols = 3
    n_rows = max(1, (len(configurations) + n_cols - 1) // n_cols)
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, (config_name, config) in zip(axes, configurations.items()):
        agentic_metrics = config.get('agentic_metrics', {})
        iter_to_success = agentic_metrics.get('iterations_to_success', {})

        if iter_to_success:
            iterations = list(iter_to_success.keys())
            counts = list(iter_to_success.values())

            ax.bar(iterations, counts, color='steelblue')
            ax.set_xlabel('Iterations Needed')
            ax.set_ylabel('Count')
        else:
            ax.text(0.5, 0.5, 'No data', ha='center', va='center')
        ax.set_title(config_name)

    # Hide grid cells that have no configuration assigned to them.
    for ax in axes[len(configurations):]:
        ax.set_visible(False)

    plt.suptitle('Iterations to Success Distribution', fontsize=14)
    plt.tight_layout()
    plt.savefig(results_dir / 'iterations_distribution.png', dpi=150)
    plt.show()

## 6. Statistical Summary

In [None]:
# Overall statistics
# Prints mean, standard deviation, best (max) and worst (min) across all
# agentic configurations for Pass@1, KG validity and LLMetric.
if agentic_summary:
    print("\n=== Overall Experiment Statistics ===")

    per_config = list(agentic_summary['configurations'].values())
    # Collect all three metric series up front, in this order, before anything
    # is printed for an individual metric.
    all_pass1, all_kg, all_llm = (
        [entry[metric] for entry in per_config]
        for metric in ('pass_at_1_rate', 'kg_valid_rate', 'llmetric')
    )

    # (heading, values, unit suffix) - the two rates print with a trailing
    # "%", LLMetric prints as a bare number.
    for heading, values, unit in (
        ("Pass@1 Rate", all_pass1, "%"),
        ("KG Valid Rate", all_kg, "%"),
        ("LLMetric", all_llm, ""),
    ):
        print(f"\n{heading}:")
        print(f"  Mean: {np.mean(values):.2f}{unit}")
        print(f"  Std: {np.std(values):.2f}{unit}")
        print(f"  Best: {max(values):.2f}{unit}")
        print(f"  Worst: {min(values):.2f}{unit}")

In [None]:
# Best configuration
# Finds, for each metric, the (name, config) pair with the highest value and
# prints the winner. On ties the configuration listed first in the summary is
# reported.
if agentic_summary:
    leaders = {
        metric: max(
            agentic_summary['configurations'].items(),
            key=lambda item, metric=metric: item[1][metric],
        )
        for metric in ('pass_at_1_rate', 'kg_valid_rate', 'llmetric')
    }
    best_pass1_config = leaders['pass_at_1_rate']
    best_kg_config = leaders['kg_valid_rate']
    best_llm_config = leaders['llmetric']

    print("\n=== Best Configurations ===")
    pass1_name, pass1_cfg = best_pass1_config
    print(f"\nBest Pass@1: {pass1_name} ({pass1_cfg['pass_at_1_rate']:.2f}%)")
    kg_name, kg_cfg = best_kg_config
    print(f"Best KG Valid: {kg_name} ({kg_cfg['kg_valid_rate']:.2f}%)")
    llm_name, llm_cfg = best_llm_config
    print(f"Best LLMetric: {llm_name} ({llm_cfg['llmetric']:.2f})")

## 7. Export Comparison Results

In [None]:
# Export comparison to CSV
# Writes df_comparison (without the index column) to
# <results_dir>/comparative/baseline_vs_agentic.csv.
if agentic_summary:
    export_path = results_dir / "comparative" / "baseline_vs_agentic.csv"
    comparative_dir = export_path.parent
    # Create the output folder on first use; an existing folder is left as is.
    comparative_dir.mkdir(exist_ok=True)

    df_comparison.to_csv(export_path, index=False)
    print(f"Exported comparison to: {export_path}")