# Comparative Analysis: Baseline (kg-axel) vs Agentic (Ollama)

Notebook ini membandingkan hasil **Agentic Loop dengan Ollama** terhadap **Baseline dari kg-axel** (linear pipeline).

## Perbandingan:
- **Baseline**: Linear pipeline dari penelitian Axel (kg-axel)
  - Model: GPT-4 / Qwen via Groq
  - Approach: Single-pass generation
  
- **Agentic**: Self-correction loop dengan Ollama
  - Model: Qwen2.5-Coder-3B (Local)
  - Approach: Generate → Validate → Refine (max 5 iterations)

In [None]:
# Setup
import sys
import os
from pathlib import Path

# Resolve the project root only once per kernel session.  This cell changes
# the working directory, so recomputing `Path.cwd().parent` on a re-run would
# climb one directory higher every time and silently break all relative paths.
if "project_root" not in globals():
    project_root = Path.cwd().parent
os.chdir(project_root)

# Put the project root first on the import path (skipped when already present
# so repeated runs do not add duplicate entries).
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns

# Plot defaults applied to the figures created below: seaborn grid theme
# first, then the figure size and font size overrides.
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': [14, 6],
    'font.size': 10,
})

print(f"Project root: {project_root}")

## 1. Load Baseline Results (from kg-axel)

In [None]:
# Baseline results from Axel's research (kg-axel)
# Source: combined_evaluation_metrics.xlsx (52 questions × 9 configurations)
#
# The numbers are kept as one compact row per configuration and expanded into
# the nested {configuration: {metric: value}} mapping used by the later cells.
_BASELINE_METRICS = (
    "pass_at_1_rate",
    "kg_valid_rate",
    "llmetric",
    "avg_bleu",
    "avg_rouge_l",
    "avg_jaro",
)

# Each row lists its values in the same order as _BASELINE_METRICS.
_BASELINE_TABLE = (
    ("Zero-Shot_Full",        (29.17, 80.77, 46.64, 0.2842, 0.6631, 0.8946)),
    ("Zero-Shot_Nodes+Paths", (29.55, 69.23, 44.96, 0.2614, 0.6417, 0.8765)),
    ("Zero-Shot_Paths",       (29.79, 78.85, 47.16, 0.2977, 0.6753, 0.8987)),
    ("Few-Shot_Full",         (48.94, 88.46, 63.49, 0.3156, 0.7012, 0.9123)),
    ("Few-Shot_Nodes+Paths",  (44.19, 78.85, 58.74, 0.2987, 0.6854, 0.9012)),
    ("Few-Shot_Paths",        (47.92, 86.54, 62.73, 0.3089, 0.6945, 0.9087)),
    ("CoT_Full",              (44.00, 90.38, 60.51, 0.2876, 0.6789, 0.8956)),
    ("CoT_Nodes+Paths",       (39.22, 96.15, 59.24, 0.2654, 0.6543, 0.8765)),
    ("CoT_Paths",             (46.00, 92.31, 62.67, 0.2945, 0.6876, 0.9012)),
)

baseline_results = {
    config: dict(zip(_BASELINE_METRICS, scores))
    for config, scores in _BASELINE_TABLE
}

print("Baseline results from kg-axel loaded (52 questions per configuration)")
print(f"Configurations: {list(baseline_results.keys())}")

In [None]:
# Tabulate the baseline: transpose so each configuration becomes a row and
# each metric a column, then expose the configuration name as a regular column.
df_baseline = (
    pd.DataFrame(baseline_results)
    .T
    .rename_axis('Configuration')
    .reset_index()
)

print("\nBaseline Results (kg-axel - Linear Pipeline):")
display(df_baseline)

## 2. Load Agentic Results (Ollama)

In [None]:
# Load the aggregated agentic (Ollama) results from
# <project_root>/results_ollama/experiment_summary.json.
results_dir = project_root / "results_ollama"
summary_path = results_dir / "experiment_summary.json"

# Bound up front so the name always exists; later cells guard on
# `if agentic_summary:` and must see None when nothing was loaded.
agentic_summary = None

if summary_path.exists():
    # JSON is UTF-8 by specification; passing the encoding explicitly keeps
    # the load independent of the platform's locale default.
    with open(summary_path, "r", encoding="utf-8") as f:
        agentic_summary = json.load(f)
    # All three metadata fields are informational only, so a missing one falls
    # back to a placeholder instead of aborting the cell with a KeyError.
    print(f"Loaded agentic results from: {agentic_summary.get('timestamp', 'unknown')}")
    print(f"LLM Provider: {agentic_summary.get('llm_provider', 'ollama')}")
    print(f"LLM Model: {agentic_summary.get('llm_model', 'qwen2.5-coder:3b')}")
else:
    print("No Ollama agentic results found.")
    print("Please run 01_agentic_inference_ollama.ipynb first.")

## 3. Calculate Improvements

In [None]:
# Compare baseline vs agentic, one row per configuration found in the
# agentic summary.  Deltas are in percentage points (agentic - baseline).
if agentic_summary:
    comparison_rows = []
    configs_without_baseline = []

    for config_name, agentic_config in agentic_summary['configurations'].items():
        baseline = baseline_results.get(config_name)
        if baseline is None:
            # No matching baseline entry: remember it and let the baseline
            # values be NaN.  Substituting 0 would report the whole agentic
            # score as an "improvement".
            configs_without_baseline.append(config_name)
            baseline = {}

        baseline_pass = baseline.get('pass_at_1_rate', np.nan)
        baseline_kg = baseline.get('kg_valid_rate', np.nan)
        baseline_llm = baseline.get('llmetric', np.nan)

        agentic_pass = agentic_config['pass_at_1_rate']
        agentic_kg = agentic_config['kg_valid_rate']
        agentic_llm = agentic_config['llmetric']

        agentic_metrics = agentic_config.get('agentic_metrics', {})

        # Absolute improvements (NaN propagates when the baseline is unknown)
        pass_delta = agentic_pass - baseline_pass
        kg_delta = agentic_kg - baseline_kg
        llm_delta = agentic_llm - baseline_llm

        # Relative improvement (%): undefined without a baseline, and reported
        # as 0 for a zero baseline to avoid dividing by zero.
        if np.isnan(baseline_pass):
            pass_rel_imp = np.nan
        elif baseline_pass > 0:
            pass_rel_imp = pass_delta / baseline_pass * 100
        else:
            pass_rel_imp = 0

        comparison_rows.append({
            "Configuration": config_name,
            "Baseline Pass@1 (%)": round(baseline_pass, 2),
            "Agentic Pass@1 (%)": round(agentic_pass, 2),
            "Pass@1 Δ (pp)": round(pass_delta, 2),
            "Pass@1 Rel. Imp. (%)": round(pass_rel_imp, 1),
            "Baseline KG Valid (%)": round(baseline_kg, 2),
            "Agentic KG Valid (%)": round(agentic_kg, 2),
            "KG Valid Δ (pp)": round(kg_delta, 2),
            "Baseline LLMetric": round(baseline_llm, 2),
            "Agentic LLMetric": round(agentic_llm, 2),
            "LLMetric Δ": round(llm_delta, 2),
            # recovery_rate is scaled by 100 for display as a percentage
            "Recovery Rate (%)": round(agentic_metrics.get('recovery_rate', 0) * 100, 2),
            "Avg Iterations": round(agentic_metrics.get('avg_iterations', 0), 2),
        })

    df_comparison = pd.DataFrame(comparison_rows)

    if configs_without_baseline:
        print(f"Warning: no baseline entry for {configs_without_baseline}; "
              "their baseline values and deltas are reported as NaN.")

    print("\nComparison: Baseline (kg-axel) vs Agentic (Ollama)")
    display(df_comparison)

In [None]:
# Summary statistics of improvements: mean, best and worst delta per metric,
# plus how many of the compared configurations improved.
if agentic_summary:
    print("\n" + "="*60)
    print("IMPROVEMENT SUMMARY")
    print("="*60)

    pass_improvements = df_comparison['Pass@1 Δ (pp)']
    kg_improvements = df_comparison['KG Valid Δ (pp)']
    llm_improvements = df_comparison['LLMetric Δ']

    # Denominator for the "Configs Improved" lines: the number of rows that
    # were actually compared, which need not be 9 for a partial run.
    total_configs = len(df_comparison)

    print("\nPass@1 Rate Improvement:")
    print(f"  Average: {pass_improvements.mean():+.2f} pp")
    print(f"  Best: {pass_improvements.max():+.2f} pp ({df_comparison.loc[pass_improvements.idxmax(), 'Configuration']})")
    print(f"  Worst: {pass_improvements.min():+.2f} pp ({df_comparison.loc[pass_improvements.idxmin(), 'Configuration']})")
    print(f"  Configs Improved: {(pass_improvements > 0).sum()}/{total_configs}")

    print("\nKG Valid Rate Improvement:")
    print(f"  Average: {kg_improvements.mean():+.2f} pp")
    print(f"  Best: {kg_improvements.max():+.2f} pp")
    print(f"  Configs Improved: {(kg_improvements > 0).sum()}/{total_configs}")

    print("\nLLMetric Improvement:")
    print(f"  Average: {llm_improvements.mean():+.2f}")
    print(f"  Best: {llm_improvements.max():+.2f} ({df_comparison.loc[llm_improvements.idxmax(), 'Configuration']})")
    print(f"  Configs Improved: {(llm_improvements > 0).sum()}/{total_configs}")

## 4. Visualize Improvements

In [None]:
# Grouped bar chart: baseline Pass@1 next to agentic Pass@1 for every
# configuration; the figure is also saved under results_dir.
if agentic_summary:
    fig, ax = plt.subplots(figsize=(14, 6))

    positions = np.arange(len(df_comparison))
    bar_width = 0.35
    half_width = bar_width / 2

    # Baseline bars sit left of each tick, agentic bars right of it.
    baseline_bars = ax.bar(
        positions - half_width,
        df_comparison['Baseline Pass@1 (%)'],
        bar_width,
        label='Baseline (kg-axel)',
        color='gray',
        alpha=0.7,
    )
    agentic_bars = ax.bar(
        positions + half_width,
        df_comparison['Agentic Pass@1 (%)'],
        bar_width,
        label='Agentic (Ollama)',
        color='steelblue',
    )

    ax.set_xlabel('Configuration', fontsize=12)
    ax.set_ylabel('Pass@1 Rate (%)', fontsize=12)
    ax.set_title('Pass@1 Rate: Baseline (Linear) vs Agentic (Self-Correction)', fontsize=14)
    ax.set_xticks(positions)
    ax.set_xticklabels(df_comparison['Configuration'], rotation=45, ha='right')
    ax.legend()
    ax.set_ylim(0, 100)

    output_file = results_dir / 'baseline_vs_agentic_pass1.png'
    plt.tight_layout()
    plt.savefig(output_file, dpi=150, bbox_inches='tight')
    plt.show()

In [None]:
# Grouped bar chart: baseline LLMetric next to agentic LLMetric for every
# configuration; the figure is also saved under results_dir.
if agentic_summary:
    fig, ax = plt.subplots(figsize=(14, 6))

    positions = np.arange(len(df_comparison))
    bar_width = 0.35
    half_width = bar_width / 2

    # Baseline bars sit left of each tick, agentic bars right of it.
    baseline_bars = ax.bar(
        positions - half_width,
        df_comparison['Baseline LLMetric'],
        bar_width,
        label='Baseline (kg-axel)',
        color='gray',
        alpha=0.7,
    )
    agentic_bars = ax.bar(
        positions + half_width,
        df_comparison['Agentic LLMetric'],
        bar_width,
        label='Agentic (Ollama)',
        color='seagreen',
    )

    ax.set_xlabel('Configuration', fontsize=12)
    ax.set_ylabel('LLMetric Score', fontsize=12)
    ax.set_title('LLMetric: Baseline vs Agentic', fontsize=14)
    ax.set_xticks(positions)
    ax.set_xticklabels(df_comparison['Configuration'], rotation=45, ha='right')
    ax.legend()
    ax.set_ylim(0, 100)

    output_file = results_dir / 'baseline_vs_agentic_llmetric.png'
    plt.tight_layout()
    plt.savefig(output_file, dpi=150, bbox_inches='tight')
    plt.show()

In [None]:
# Improvement delta visualization: one panel per metric, one bar per
# configuration, green for a positive delta and red otherwise.
if agentic_summary:
    # (comparison column, y-axis label, panel title)
    delta_panels = [
        ('Pass@1 Δ (pp)', 'Improvement (pp)', 'Pass@1 Improvement (Agentic - Baseline)'),
        ('KG Valid Δ (pp)', 'Improvement (pp)', 'KG Validity Improvement'),
        ('LLMetric Δ', 'Improvement', 'LLMetric Improvement'),
    ]

    config_labels = df_comparison['Configuration']
    positions = np.arange(len(config_labels))

    fig, axes = plt.subplots(1, len(delta_panels), figsize=(16, 5))

    for ax, (column, ylabel, title) in zip(axes, delta_panels):
        deltas = df_comparison[column]
        bar_colors = ['green' if delta > 0 else 'red' for delta in deltas]

        ax.bar(positions, deltas, color=bar_colors)
        ax.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
        ax.set_xlabel('Configuration')
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        # Fix the tick positions before labelling them: set_xticklabels on its
        # own relies on whatever ticks the locator happens to produce and
        # makes Matplotlib emit a FixedFormatter/FixedLocator warning.
        ax.set_xticks(positions)
        ax.set_xticklabels(config_labels, rotation=45, ha='right')

    plt.tight_layout()
    plt.savefig(results_dir / 'improvement_delta.png', dpi=150, bbox_inches='tight')
    plt.show()

## 5. Self-Correction Impact Analysis

In [None]:
# Per-configuration view of the self-correction loop: first-attempt success
# versus the final KG-valid rate, together with recovery rate and iterations.
if agentic_summary:
    divider = "=" * 60
    print("\n" + divider)
    print("SELF-CORRECTION IMPACT ANALYSIS")
    print(divider)
    print("\nQuestions that were initially wrong but corrected after refinement:")

    def _correction_row(name, summary):
        """Build one table row from a configuration's summary entry."""
        loop_stats = summary.get('agentic_metrics', {})

        # The two rates under agentic_metrics are scaled by 100 for display;
        # kg_valid_rate is used as stored.
        first_pct = loop_stats.get('first_attempt_success_rate', 0) * 100
        final_pct = summary['kg_valid_rate']
        recovery_pct = loop_stats.get('recovery_rate', 0) * 100

        gained_pp = final_pct - first_pct

        return {
            'Configuration': name,
            'First Attempt Success (%)': round(first_pct, 1),
            'Final Success (%)': round(final_pct, 1),
            'Improvement from Correction (pp)': round(gained_pp, 1),
            'Recovery Rate (%)': round(recovery_pct, 1),
            'Avg Iterations': round(loop_stats.get('avg_iterations', 0), 2),
        }

    correction_data = [
        _correction_row(name, summary)
        for name, summary in agentic_summary['configurations'].items()
    ]

    df_correction = pd.DataFrame(correction_data)
    display(df_correction)

In [None]:
# Grouped bar chart of first-attempt versus final success rate, with the gain
# written above the final bar wherever it is positive.
if agentic_summary:
    fig, ax = plt.subplots(figsize=(14, 6))

    positions = np.arange(len(df_correction))
    bar_width = 0.35
    half_width = bar_width / 2

    first_bars = ax.bar(
        positions - half_width,
        df_correction['First Attempt Success (%)'],
        bar_width,
        label='First Attempt',
        color='lightcoral',
    )
    final_bars = ax.bar(
        positions + half_width,
        df_correction['Final Success (%)'],
        bar_width,
        label='After Self-Correction',
        color='forestgreen',
    )

    ax.set_xlabel('Configuration', fontsize=12)
    ax.set_ylabel('Success Rate (%)', fontsize=12)
    ax.set_title('Self-Correction Impact: First Attempt vs Final', fontsize=14)
    ax.set_xticks(positions)
    ax.set_xticklabels(df_correction['Configuration'], rotation=45, ha='right')
    ax.legend()
    # Upper limit of 110 leaves headroom for the labels above the bars.
    ax.set_ylim(0, 110)

    # Label only the configurations where correction produced a positive gain.
    gains = df_correction['Improvement from Correction (pp)']
    finals = df_correction['Final Success (%)']
    for slot, (gain, final_value) in enumerate(zip(gains, finals)):
        if not gain > 0:
            continue
        ax.annotate(
            f'+{gain:.0f}pp',
            xy=(slot + half_width, final_value),
            ha='center',
            va='bottom',
            fontsize=9,
            color='green',
        )

    plt.tight_layout()
    plt.savefig(results_dir / 'self_correction_impact.png', dpi=150, bbox_inches='tight')
    plt.show()

In [None]:
# Iterations-to-success distribution: one small bar chart per configuration,
# laid out three per row.
if agentic_summary:
    configurations = agentic_summary['configurations']

    # Size the grid from the number of configurations instead of assuming
    # exactly nine.  Nine still yields the original 3x3 grid at 15x12 inches.
    n_cols = 3
    n_rows = max(1, -(-len(configurations) // n_cols))  # ceiling division
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, (config_name, config) in zip(axes, configurations.items()):
        agentic_metrics = config.get('agentic_metrics', {})
        iter_to_success = agentic_metrics.get('iterations_to_success', {})

        if iter_to_success:
            # Keys are converted with int(); sort on that number so the bars
            # run 1, 2, 3, ... regardless of the order stored in the file.
            ordered = sorted(
                ((int(k), count) for k, count in iter_to_success.items()),
                key=lambda pair: pair[0],
            )
            iterations = [it for it, _ in ordered]
            counts = [count for _, count in ordered]

            # Green = solved on the first try, orange = second, salmon = later.
            colors = ['forestgreen' if it == 1 else 'orange' if it == 2 else 'salmon'
                      for it in iterations]
            ax.bar([str(it) for it in iterations], counts, color=colors)
            ax.set_xlabel('Iterations Needed')
            ax.set_ylabel('Count')
        else:
            ax.text(0.5, 0.5, 'No data', ha='center', va='center')
        ax.set_title(config_name, fontsize=10)

    # Hide grid cells that have no configuration to show.
    for unused_ax in axes[len(configurations):]:
        unused_ax.set_visible(False)

    plt.suptitle('Iterations to Success Distribution by Configuration', fontsize=14)
    plt.tight_layout()
    plt.savefig(results_dir / 'iterations_distribution.png', dpi=150, bbox_inches='tight')
    plt.show()

## 6. Statistical Summary

In [None]:
# Mean and best value of each headline metric for the baseline and for the
# agentic run, followed by the difference of the means.
if agentic_summary:
    rule = "=" * 60
    print("\n" + rule)
    print("OVERALL EXPERIMENT STATISTICS")
    print(rule)

    def _collect(records, key):
        """Return the values stored under `key` across all records."""
        return [record[key] for record in records]

    print("\n--- BASELINE (kg-axel) ---")
    baseline_records = baseline_results.values()
    base_pass = _collect(baseline_records, 'pass_at_1_rate')
    base_kg = _collect(baseline_records, 'kg_valid_rate')
    base_llm = _collect(baseline_records, 'llmetric')

    base_pass_mean = np.mean(base_pass)
    print(f"Pass@1: Mean={base_pass_mean:.2f}%, Best={max(base_pass):.2f}%")
    base_kg_mean = np.mean(base_kg)
    print(f"KG Valid: Mean={base_kg_mean:.2f}%, Best={max(base_kg):.2f}%")
    base_llm_mean = np.mean(base_llm)
    print(f"LLMetric: Mean={base_llm_mean:.2f}, Best={max(base_llm):.2f}")

    print("\n--- AGENTIC (Ollama) ---")
    agentic_records = agentic_summary['configurations'].values()
    agent_pass = _collect(agentic_records, 'pass_at_1_rate')
    agent_kg = _collect(agentic_records, 'kg_valid_rate')
    agent_llm = _collect(agentic_records, 'llmetric')

    agent_pass_mean = np.mean(agent_pass)
    print(f"Pass@1: Mean={agent_pass_mean:.2f}%, Best={max(agent_pass):.2f}%")
    agent_kg_mean = np.mean(agent_kg)
    print(f"KG Valid: Mean={agent_kg_mean:.2f}%, Best={max(agent_kg):.2f}%")
    agent_llm_mean = np.mean(agent_llm)
    print(f"LLMetric: Mean={agent_llm_mean:.2f}, Best={max(agent_llm):.2f}")

    # Difference of means, agentic minus baseline.
    print("\n--- IMPROVEMENT ---")
    print(f"Pass@1: {agent_pass_mean - base_pass_mean:+.2f} pp")
    print(f"KG Valid: {agent_kg_mean - base_kg_mean:+.2f} pp")
    print(f"LLMetric: {agent_llm_mean - base_llm_mean:+.2f}")

In [None]:
# Report, for each system, the configuration with the highest Pass@1 and the
# one with the highest LLMetric.
if agentic_summary:
    rule = "=" * 60
    print("\n" + rule)
    print("BEST CONFIGURATIONS")
    print(rule)

    def _top_config(results, metric):
        """Return (name, value) for the highest `metric`; first entry wins ties."""
        winner = max(results, key=lambda name: results[name][metric])
        return winner, results[winner][metric]

    # Baseline best
    base_pass_name, base_pass_value = _top_config(baseline_results, 'pass_at_1_rate')
    base_llm_name, base_llm_value = _top_config(baseline_results, 'llmetric')

    print(f"\nBaseline Best Pass@1: {base_pass_name} ({base_pass_value:.2f}%)")
    print(f"Baseline Best LLMetric: {base_llm_name} ({base_llm_value:.2f})")

    # Agentic best
    agentic_configs = agentic_summary['configurations']
    agent_pass_name, agent_pass_value = _top_config(agentic_configs, 'pass_at_1_rate')
    agent_llm_name, agent_llm_value = _top_config(agentic_configs, 'llmetric')

    print(f"\nAgentic Best Pass@1: {agent_pass_name} ({agent_pass_value:.2f}%)")
    print(f"Agentic Best LLMetric: {agent_llm_name} ({agent_llm_value:.2f})")

## 7. Export Comparison Results

In [None]:
# Write the comparison tables to <results_dir>/comparative as CSV files and,
# on a best-effort basis, as a single multi-sheet Excel workbook.
if agentic_summary:
    comparative_dir = results_dir / "comparative"
    comparative_dir.mkdir(exist_ok=True)

    # Main baseline-vs-agentic comparison
    comparison_csv = comparative_dir / "baseline_vs_agentic_ollama.csv"
    df_comparison.to_csv(comparison_csv, index=False)
    print(f"Exported comparison to: {comparison_csv}")

    # Self-correction analysis
    correction_csv = comparative_dir / "self_correction_analysis.csv"
    df_correction.to_csv(correction_csv, index=False)
    print(f"Exported correction analysis to: {correction_csv}")

    # Excel export is optional: any failure is reported and the cell carries
    # on, since the CSV files above are already on disk.
    workbook_path = comparative_dir / 'comparative_analysis_ollama.xlsx'
    try:
        with pd.ExcelWriter(workbook_path, engine='openpyxl') as writer:
            sheets = (
                ('Comparison', df_comparison),
                ('Self-Correction', df_correction),
                ('Baseline', df_baseline),
            )
            for sheet_name, frame in sheets:
                frame.to_excel(writer, sheet_name=sheet_name, index=False)
        print(f"Exported Excel to: {workbook_path}")
    except Exception as e:
        print(f"Could not export to Excel: {e}")

## 8. Conclusion

In [None]:
# Generate the closing summary from the comparison and self-correction tables.
if agentic_summary:
    print("\n" + "="*60)
    print("CONCLUSION")
    print("="*60)

    avg_pass_imp = np.mean(df_comparison['Pass@1 Δ (pp)'])
    avg_llm_imp = np.mean(df_comparison['LLMetric Δ'])
    configs_improved = (df_comparison['Pass@1 Δ (pp)'] > 0).sum()
    # Report against the number of configurations actually compared rather
    # than a fixed 9, so a partial run does not print a wrong ratio.
    total_configs = len(df_comparison)

    print(f"""
Perbandingan antara Baseline (kg-axel) dan Agentic (Ollama) menunjukkan:

1. PASS@1 RATE:
   - Rata-rata improvement: {avg_pass_imp:+.2f} percentage points
   - {configs_improved}/{total_configs} konfigurasi menunjukkan peningkatan

2. LLMETRIC:
   - Rata-rata improvement: {avg_llm_imp:+.2f}
   
3. SELF-CORRECTION EFFECTIVENESS:
   - Recovery rate rata-rata: {df_correction['Recovery Rate (%)'].mean():.1f}%
   - Rata-rata iterasi: {df_correction['Avg Iterations'].mean():.2f}

4. CATATAN PENTING:
   - Baseline menggunakan model lebih besar (GPT-4/Qwen-32B)
   - Agentic menggunakan model lebih kecil (Qwen2.5-Coder-3B) + self-correction
   - Self-correction dapat mengkompensasi keterbatasan model yang lebih kecil
""")