# Evaluation Metrics Analysis (Ollama - Local LLM)

Notebook ini menganalisis metrik evaluasi dari hasil agentic inference menggunakan **Ollama**.

## Metrik yang dianalisis:
- **String Metrics**: BLEU, Rouge-L, Jaro-Winkler, Jaccard Cypher
- **Output Metrics**: Pass@1, Jaccard Output
- **Composite Metrics**: LLMetric-Q, LLMetric
- **Agentic Metrics**: Iterations, Recovery Rate, First Attempt Success

In [13]:
# Setup
import sys
import os
from pathlib import Path

# Resolve the project root only once per kernel session.
# Recomputing `Path.cwd().parent` after the `os.chdir` below would climb one
# directory higher on every re-run of this cell and silently break all paths
# derived from `project_root`.
# NOTE(review): assumes the kernel starts in a directory directly below the
# project root — confirm against the repository layout.
if "project_root" not in globals():
    project_root = Path.cwd().parent

# `project_root` is absolute (derived from Path.cwd()), so repeating this
# chdir is harmless and keeps the working directory pinned to the root.
os.chdir(project_root)

# Put the project root on the import path, without adding duplicates.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns

# Shared plotting defaults for every figure in this notebook.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = [12, 6]
plt.rcParams['font.size'] = 10

print(f"Project root: {project_root}")

Project root: /Users/tsimiscouse/Docs/Sarjana/Skripsi/kg-luthfi


## 1. Load Ollama Results

In [14]:
# Load experiment summary from Ollama results
results_dir = project_root / "results_ollama"
summary_path = results_dir / "experiment_summary.json"

if summary_path.exists():
    # Decode explicitly as UTF-8 so parsing does not depend on the
    # platform's locale default encoding.
    with open(summary_path, "r", encoding="utf-8") as f:
        summary = json.load(f)
    print(f"Loaded results from: {summary['timestamp']}")
    # Provider and model fall back to fixed defaults when the summary omits them.
    print(f"LLM Provider: {summary.get('llm_provider', 'ollama')}")
    print(f"LLM Model: {summary.get('llm_model', 'qwen2.5-coder:3b')}")
    print(f"Configurations: {list(summary['configurations'].keys())}")
else:
    print("No Ollama results found. Please run 01_agentic_inference_ollama.ipynb first.")
    # Later cells guard on `if summary:` and skip their work when this is None.
    summary = None

No Ollama results found. Please run 01_agentic_inference_ollama.ipynb first.


In [3]:
# Load detailed results for each configuration
# Maps configuration name -> DataFrame read from that configuration's CSV.
detailed_results = {}

if summary:
    for config_name in summary['configurations']:
        csv_path = results_dir / config_name / "agentic_results.csv"
        # Configurations without a CSV on disk are skipped silently.
        if not csv_path.exists():
            continue
        config_frame = pd.read_csv(csv_path)
        detailed_results[config_name] = config_frame
        print(f"Loaded {len(config_frame)} results for {config_name}")

Loaded 5 results for Zero-Shot_Full
Loaded 5 results for Zero-Shot_Nodes+Paths
Loaded 5 results for Zero-Shot_Paths
Loaded 5 results for Few-Shot_Full
Loaded 5 results for Few-Shot_Nodes+Paths
Loaded 5 results for Few-Shot_Paths
Loaded 5 results for CoT_Full
Loaded 5 results for CoT_Nodes+Paths
Loaded 5 results for CoT_Paths


## 2. Summary Statistics

In [15]:
# Create summary DataFrame
if summary:
    # (output column, key in the configuration dict, rounding digits)
    quality_fields = (
        ("Pass@1 (%)", 'pass_at_1_rate', 2),
        ("KG Valid (%)", 'kg_valid_rate', 2),
        ("Avg BLEU", 'avg_bleu', 4),
        ("Avg Rouge-L", 'avg_rouge_l_f1', 4),
        ("Avg Jaro-Winkler", 'avg_jaro_winkler', 4),
        ("Avg Jaccard Output", 'avg_jaccard_output', 4),
        ("LLMetric-Q", 'avg_llmetric_q', 2),
        ("LLMetric", 'llmetric', 2),
    )
    # (output column, key in the agentic-metrics dict); scaled by 100 below.
    agentic_rate_fields = (
        ("Recovery Rate (%)", 'recovery_rate'),
        ("First Attempt Success (%)", 'first_attempt_success_rate'),
    )

    records = []
    for cfg_name, cfg in summary['configurations'].items():
        # A configuration without agentic metrics yields zeros for those columns.
        agentic_stats = cfg.get('agentic_metrics', {})

        record = {
            "Configuration": cfg_name,
            "Prompt": cfg['prompt_type'],
            "Schema": cfg['schema_format'],
        }
        for column, key, digits in quality_fields:
            record[column] = round(cfg[key], digits)
        record["Avg Iterations"] = round(agentic_stats.get('avg_iterations', 0), 2)
        for column, key in agentic_rate_fields:
            record[column] = round(agentic_stats.get(key, 0) * 100, 2)

        records.append(record)

    df_summary = pd.DataFrame(records)
    display(df_summary)

In [5]:
# Rank configurations by LLMetric
if summary:
    ranked_columns = [
        'Configuration',
        'Pass@1 (%)',
        'KG Valid (%)',
        'LLMetric',
        'Avg Iterations',
        'Recovery Rate (%)',
    ]

    # Highest LLMetric first, then renumber rows from zero.
    df_ranked = df_summary.sort_values('LLMetric', ascending=False).reset_index(drop=True)
    # Shift the index so ranks start at 1, and label it for display.
    df_ranked.index = (df_ranked.index + 1).rename('Rank')

    print("\nConfigurations Ranked by LLMetric:")
    display(df_ranked[ranked_columns])


Configurations Ranked by LLMetric:


Unnamed: 0_level_0,Configuration,Pass@1 (%),KG Valid (%),LLMetric,Avg Iterations,Recovery Rate (%)
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Zero-Shot_Nodes+Paths,80.0,100.0,88.74,1.2,100.0
2,Zero-Shot_Paths,80.0,100.0,88.74,1.2,100.0
3,Few-Shot_Full,80.0,100.0,88.69,1.0,0.0
4,Few-Shot_Nodes+Paths,80.0,100.0,88.69,1.0,0.0
5,Few-Shot_Paths,80.0,100.0,88.69,1.0,0.0
6,CoT_Paths,80.0,100.0,88.69,1.0,0.0
7,Zero-Shot_Full,80.0,100.0,88.64,1.2,100.0
8,CoT_Full,80.0,100.0,88.57,1.2,100.0
9,CoT_Nodes+Paths,80.0,100.0,88.57,1.2,100.0


## 3. Visualizations

In [16]:
# Pass@1 Rate by Configuration
if summary:
    fig, ax = plt.subplots(figsize=(14, 6))

    configs = list(df_summary['Configuration'])
    pass_rates = df_summary['Pass@1 (%)']

    # One shade of blue per bar, spread over the mid range of the colormap.
    colors = plt.cm.Blues(np.linspace(0.4, 0.8, len(configs)))
    bars = ax.bar(configs, pass_rates, color=colors)

    ax.set_xlabel('Configuration', fontsize=12)
    ax.set_ylabel('Pass@1 Rate (%)', fontsize=12)
    ax.set_title('Pass@1 Rate by Configuration (Ollama - Qwen2.5-Coder-3B)', fontsize=14)
    # Restyle the tick labels that bar() already produced instead of calling
    # set_xticklabels(): replacing labels without fixing the tick positions
    # triggers Matplotlib's fixed-ticks warning and can mislabel bars.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

    # Add value labels just above each bar
    for bar, val in zip(bars, pass_rates):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
                f'{val:.1f}%', ha='center', va='bottom', fontsize=9)

    plt.tight_layout()
    plt.savefig(results_dir / 'pass_at_1_by_config_ollama.png', dpi=150, bbox_inches='tight')
    plt.show()

In [18]:
# LLMetric Comparison
if summary:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # (df_summary column, y-axis label, panel title, bar color), left to right.
    llmetric_panels = [
        ('LLMetric-Q', 'LLMetric-Q Score', 'Average LLMetric-Q by Configuration', 'coral'),
        ('LLMetric', 'LLMetric Score', 'LLMetric by Configuration', 'seagreen'),
    ]

    for panel_ax, (column, ylabel, title, color) in zip(axes, llmetric_panels):
        panel_ax.bar(df_summary['Configuration'], df_summary[column], color=color)
        panel_ax.set_xlabel('Configuration')
        panel_ax.set_ylabel(ylabel)
        panel_ax.set_title(title)
        # Restyle the tick labels that bar() already produced instead of calling
        # set_xticklabels(): replacing labels without fixing the tick positions
        # triggers Matplotlib's fixed-ticks warning and can mislabel bars.
        plt.setp(panel_ax.get_xticklabels(), rotation=45, ha='right')

    plt.tight_layout()
    plt.savefig(results_dir / 'llmetric_comparison_ollama.png', dpi=150, bbox_inches='tight')
    plt.show()

In [19]:
# Agentic Metrics: Iterations and Recovery
if summary:
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))

    # (df_summary column, y-axis label, panel title, bar color), left to right.
    agentic_panels = [
        ('Avg Iterations', 'Average Iterations', 'Average Iterations Needed', 'mediumpurple'),
        ('Recovery Rate (%)', 'Recovery Rate (%)', 'Recovery Rate (Initially Failed → Success)', 'darkorange'),
        ('First Attempt Success (%)', 'First Attempt Success (%)', 'First Attempt Success Rate', 'teal'),
    ]

    for panel_ax, (column, ylabel, title, color) in zip(axes, agentic_panels):
        panel_ax.bar(df_summary['Configuration'], df_summary[column], color=color)
        panel_ax.set_xlabel('Configuration')
        panel_ax.set_ylabel(ylabel)
        panel_ax.set_title(title)
        # Restyle the tick labels that bar() already produced instead of calling
        # set_xticklabels(): replacing labels without fixing the tick positions
        # triggers Matplotlib's fixed-ticks warning and can mislabel bars.
        plt.setp(panel_ax.get_xticklabels(), rotation=45, ha='right')

    # Reference line at one iteration on the first panel. The legend call is
    # required: without it the 'Single attempt' label is never drawn.
    axes[0].axhline(y=1, color='red', linestyle='--', alpha=0.5, label='Single attempt')
    axes[0].legend()

    plt.tight_layout()
    plt.savefig(results_dir / 'agentic_metrics_ollama.png', dpi=150, bbox_inches='tight')
    plt.show()

## 4. Analysis by Prompt Type and Schema Format

In [20]:
# Aggregate by Prompt Type
if summary:
    # Columns averaged across every configuration sharing a prompt type.
    prompt_metric_columns = [
        'Pass@1 (%)',
        'KG Valid (%)',
        'LLMetric',
        'Avg Iterations',
        'Recovery Rate (%)',
    ]

    prompt_agg = (
        df_summary
        .groupby('Prompt')
        .agg(dict.fromkeys(prompt_metric_columns, 'mean'))
        .round(2)
    )

    print("\nAggregate by Prompt Type:")
    display(prompt_agg)

In [21]:
# Aggregate by Schema Format
if summary:
    # Columns averaged across every configuration sharing a schema format.
    schema_metric_columns = [
        'Pass@1 (%)',
        'KG Valid (%)',
        'LLMetric',
        'Avg Iterations',
        'Recovery Rate (%)',
    ]

    schema_agg = (
        df_summary
        .groupby('Schema')
        .agg(dict.fromkeys(schema_metric_columns, 'mean'))
        .round(2)
    )

    print("\nAggregate by Schema Format:")
    display(schema_agg)

In [22]:
# Heatmap: Prompt vs Schema
if summary:
    # Prompt types as rows, schema formats as columns, LLMetric as cell values.
    pivot_llmetric = df_summary.pivot(index='Prompt', columns='Schema', values='LLMetric')

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    llmetric_ax, pass_ax = axes

    # Left panel: LLMetric
    sns.heatmap(pivot_llmetric, annot=True, fmt='.2f', cmap='YlGnBu', ax=llmetric_ax)
    llmetric_ax.set_title('LLMetric: Prompt × Schema')

    # Right panel: Pass@1, using the same row/column layout
    pivot_pass = df_summary.pivot(index='Prompt', columns='Schema', values='Pass@1 (%)')
    sns.heatmap(pivot_pass, annot=True, fmt='.1f', cmap='YlGnBu', ax=pass_ax)
    pass_ax.set_title('Pass@1 Rate (%): Prompt × Schema')

    plt.tight_layout()
    plt.savefig(results_dir / 'heatmap_prompt_schema_ollama.png', dpi=150, bbox_inches='tight')
    plt.show()

## 5. Per-Question Analysis

In [12]:
# Analyze by complexity level
if detailed_results:
    # Stack every configuration's rows, tagging each with its configuration name.
    tagged_frames = [
        config_frame.assign(config=config_name)
        for config_name, config_frame in detailed_results.items()
    ]
    all_results = pd.concat(tagged_frames)

    # Source column -> display label; every column is averaged per group.
    complexity_labels = {
        'success': 'Success Rate',
        'pass_at_1': 'Pass@1 Rate',
        'total_iterations': 'Avg Iterations',
        'llmetric_q': 'Avg LLMetric-Q',
    }

    complexity_stats = (
        all_results
        .groupby('complexity')
        .agg(dict.fromkeys(complexity_labels, 'mean'))
        .round(4)
        .rename(columns=complexity_labels)
    )

    print("\nAnalysis by Complexity Level (All Configurations):")
    display(complexity_stats)


Analysis by Complexity Level (All Configurations):


Unnamed: 0_level_0,Success Rate,Pass@1 Rate,Avg Iterations,Avg LLMetric-Q
complexity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Easy,1.0,0.8,1.1111,90.0103


In [None]:
# Analyze by reasoning level
# Relies on `all_results` built by the complexity-analysis cell.
if detailed_results:
    # Source column -> display label; every column is averaged per group.
    reasoning_labels = {
        'success': 'Success Rate',
        'pass_at_1': 'Pass@1 Rate',
        'total_iterations': 'Avg Iterations',
        'llmetric_q': 'Avg LLMetric-Q',
    }

    reasoning_stats = (
        all_results
        .groupby('reasoning_level')
        .agg(dict.fromkeys(reasoning_labels, 'mean'))
        .round(4)
        .rename(columns=reasoning_labels)
    )

    print("\nAnalysis by Reasoning Level (All Configurations):")
    display(reasoning_stats)

## 6. Error Analysis

In [None]:
# Error type distribution
if summary:
    print("\n=== Error Type Distribution ===")

    # One record per (configuration, error type) pair.
    error_data = []
    for config_name, config in summary['configurations'].items():
        agentic_stats = config.get('agentic_metrics', {})
        counts_by_type = agentic_stats.get('error_type_distribution', {})
        recovery_by_type = agentic_stats.get('error_recovery_by_type', {})

        error_data.extend(
            {
                'Configuration': config_name,
                'Error Type': error_type,
                'Count': count,
                # Error types with no recorded recovery value are reported as 0.
                'Recovery Rate': recovery_by_type.get(error_type, 0) * 100,
            }
            for error_type, count in counts_by_type.items()
        )

    # Nothing is displayed or plotted when no errors were recorded.
    if error_data:
        df_errors = pd.DataFrame(error_data)
        display(df_errors)

        # Visualize error distribution
        fig, ax = plt.subplots(figsize=(12, 5))

        # Configurations as rows, error types as columns; missing pairs become 0.
        error_pivot = df_errors.pivot(
            index='Configuration', columns='Error Type', values='Count'
        ).fillna(0)
        error_pivot.plot(kind='bar', ax=ax)

        ax.set_xlabel('Configuration')
        ax.set_ylabel('Error Count')
        ax.set_title('Error Type Distribution by Configuration')
        ax.legend(title='Error Type')
        plt.xticks(rotation=45, ha='right')

        plt.tight_layout()
        plt.savefig(results_dir / 'error_distribution_ollama.png', dpi=150, bbox_inches='tight')
        plt.show()

## 7. Export Summary

In [None]:
# Export summary to CSV
if summary:
    export_path = results_dir / 'metrics_summary_ollama.csv'
    df_summary.to_csv(export_path, index=False)
    print(f"Exported summary to: {export_path}")

    # Also export to Excel with multiple sheets.
    # Best effort: any failure is reported and does not abort the notebook.
    excel_path = results_dir / 'metrics_summary_ollama.xlsx'
    try:
        with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
            df_summary.to_excel(writer, sheet_name='Summary', index=False)
            # The aggregate sheets are written only if their cells were run.
            if 'prompt_agg' in dir():
                prompt_agg.to_excel(writer, sheet_name='By Prompt')
            if 'schema_agg' in dir():
                schema_agg.to_excel(writer, sheet_name='By Schema')
        print(f"Exported Excel to: {results_dir / 'metrics_summary_ollama.xlsx'}")
    except Exception as e:
        print(f"Could not export to Excel: {e}")