# Batch 1: Agentic Text2Cypher Inference (Day 1/5)

**Configurations**: Zero-Shot_Full, Zero-Shot_Nodes+Paths

**Model**: Qwen2.5-Coder-32B-Instruct via OpenRouter API

**Questions**: 52 × 2 = 104 inferences

---

## Metrics Used (Formal Naming from Established Research)

| Category | Metric | Source |
|----------|--------|--------|
| First Attempt | Pass@1, KG Valid@1 | HumanEval/kg-axel |
| After Refinement | Pass@k, KG Valid@k | HumanEval/kg-axel |
| Improvement | Refinement Gain, Recovery Rate | Self-Refine (Madaan, 2023) |

---

## Rate Limit Strategy
- The full experiment is split into daily batches to stay within the OpenRouter API quota
- Estimated tokens per inference: ~5000 (with retries)
- This batch: 104 inferences (≈520,000 tokens at the estimate above)

In [1]:
# Setup
import sys
import os
from pathlib import Path

# Set project root
# Assumes the kernel starts in a direct subdirectory of the project root, so
# the root is the parent of the starting working directory.
#
# Resolve the root only once per kernel session: os.chdir() below changes the
# working directory, so recomputing Path.cwd().parent on a re-run of this cell
# would climb one directory higher every time it is executed.
if "project_root" not in globals():
    project_root = Path.cwd().parent
os.chdir(project_root)

# Put the project root first on sys.path so the project-local packages
# imported later (config, data, prompts, experiment) can be resolved.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

print(f"Project root: {project_root}")
print(f"Working directory: {os.getcwd()}")

Project root: /Users/tsimiscouse/Docs/Sarjana/Skripsi/kg-luthfi/agentic
Working directory: /Users/tsimiscouse/Docs/Sarjana/Skripsi/kg-luthfi/agentic


In [2]:
# Imports
import json
import pandas as pd
from datetime import datetime
from tqdm.notebook import tqdm

from config.settings import Settings
from config.llm_config import LLMConfig
from data.ground_truth_loader import GroundTruthLoader
from prompts.prompt_manager import PromptManager, PromptType, SchemaFormat
from experiment.batch_processor import BatchProcessor

In [3]:
# Initialize settings
# Build the experiment settings first, then the LLM configuration; both
# objects are reused by later cells.
settings = Settings()
llm_config = LLMConfig()

# Echo the values that decide how this batch is run, one per line.
print(
    f"LLM Provider: {llm_config.provider}",
    f"Model: {llm_config.model}",
    f"Max Iterations: {settings.max_iterations}",
    sep="\n",
)

LLM Provider: openrouter
Model: qwen/qwen-2.5-coder-32b-instruct
Max Iterations: 3


In [4]:
# Validate API Key
# A ValueError raised by llm_config.validate() is reported here instead of
# propagating; the success message is printed only when no error was raised.
try:
    llm_config.validate()
except ValueError as validation_error:
    print(f"Error: {validation_error}")
else:
    print("API Key validated successfully!")

API Key validated successfully!


In [5]:
# Load ground truth data
# `items` is the question set that every configuration in this batch runs on.
loader = GroundTruthLoader()
items = loader.load()

question_count = len(items)
print(f"Loaded {question_count} questions")

Loaded 52 questions


## Batch 1 Configurations

| # | Prompt Type | Schema Format | Config Name |
|---|-------------|---------------|-------------|
| 1 | Zero-Shot | Full Schema | Zero-Shot_Full |
| 2 | Zero-Shot | Nodes+Paths | Zero-Shot_Nodes+Paths |

In [6]:
# Define Batch 1 configurations
# Both runs use the zero-shot prompt and differ only in the schema format.
BATCH_1_CONFIGS = [
    {"prompt": PromptType.ZERO_SHOT, "schema": schema_format}
    for schema_format in (SchemaFormat.FULL_SCHEMA, SchemaFormat.NODES_PATHS)
]

# One inference per (configuration, question) pair.
n_configs = len(BATCH_1_CONFIGS)
n_questions = len(items)
print(f"Batch 1: {n_configs} configurations")
print(f"Total inferences: {n_configs * n_questions} = {n_configs} × {n_questions}")

Batch 1: 2 configurations
Total inferences: 104 = 2 × 52


In [7]:
# Setup results directory
# Results root under the project, and the Batch 1 subdirectory beneath it.
results_dir = project_root / "results_v2"
batch_results_dir = results_dir / "batch_1"

# Create both levels; directories that already exist are left as they are.
for directory in (results_dir, batch_results_dir):
    directory.mkdir(parents=True, exist_ok=True)

print(f"Results will be saved to: {batch_results_dir}")

Results will be saved to: /Users/tsimiscouse/Docs/Sarjana/Skripsi/kg-luthfi/agentic/results_v2/batch_1


In [9]:
# Initialize batch processor
# The checkpoint location sits inside this batch's results directory and is
# handed to BatchProcessor as a plain string.
checkpoint_dir = batch_results_dir / "checkpoints"
processor = BatchProcessor(
    settings=settings,
    llm_config=llm_config,
    checkpoint_dir=str(checkpoint_dir),
)

# Used by later cells to turn (prompt type, schema format) into a config name.
prompt_manager = PromptManager()

## Run Batch 1 Experiments

In [None]:
# Run experiments for Batch 1
#
# For every configuration in BATCH_1_CONFIGS:
#   1. run the batch through processor.process_batch (called with resume=True),
#   2. save the per-question results as JSON and CSV in the config directory,
#   3. print headline metrics for the configuration.
# An exception in one configuration is reported with its traceback and the
# loop continues with the next configuration.
batch_1_results = {}

for i, config in enumerate(BATCH_1_CONFIGS):
    config_name = prompt_manager.get_configuration_name(config["prompt"], config["schema"])
    print(f"\n{'='*60}")
    print(f"Configuration {i+1}/{len(BATCH_1_CONFIGS)}: {config_name}")
    print(f"{'='*60}")

    # Create config directory
    config_dir = batch_results_dir / config_name
    config_dir.mkdir(parents=True, exist_ok=True)

    # Process batch
    try:
        results = processor.process_batch(
            items=items,
            prompt_type=config["prompt"],
            schema_format=config["schema"],
            batch_id=f"batch1_{config_name}",
            resume=True
        )

        # Every rate below divides by the number of results. An empty result
        # set therefore cannot be scored, and recording it would also break
        # any later per-configuration aggregation, so skip it explicitly
        # instead of failing with ZeroDivisionError after writing empty files.
        total = len(results)
        if total == 0:
            print(f"Warning: no results returned for {config_name}; skipping save and stats.")
            continue

        batch_1_results[config_name] = results

        # Save results
        results_data = [state.to_dict() for state in results]

        # Explicit encoding keeps the file identical across platforms;
        # default=str stringifies values json cannot serialise natively.
        with open(config_dir / "agentic_results.json", "w", encoding="utf-8") as f:
            json.dump(results_data, f, indent=2, default=str)

        # Save as CSV with all formal metrics
        # Using formal naming from established research:
        # - Pass@1, Pass@k: HumanEval (OpenAI, 2021)
        # - KG Valid: kg-axel baseline
        # - Refinement Gain, Recovery Rate: Self-Refine (Madaan, 2023)
        df = pd.DataFrame([{
            "question_id": s.question_id,
            "question": s.question,
            "ground_truth": s.ground_truth_query,
            "final_query": s.final_query,
            # Pass@1 Metrics (First Attempt - Baseline Comparable)
            "pass_at_1": s.pass_at_1,
            "kg_valid_at_1": s.kg_valid_at_1,
            # Pass@k Metrics (Final - After Refinement)
            "pass_at_k": s.pass_at_k,
            "execution_accuracy": s.execution_accuracy,
            "kg_valid_at_k": s.kg_valid_at_k,
            # Self-Refine Metrics (Improvement)
            "refinement_gain": s.refinement_gain,
            "kg_valid_refinement_gain": s.kg_valid_refinement_gain,
            "was_recovered": s.was_recovered,
            "was_kg_recovered": s.was_kg_recovered,
            # Iteration info
            "total_iterations": s.total_iterations,
            # Backward compatibility
            "success": s.success,
            "kg_valid": s.kg_valid,
        } for s in results])
        df.to_csv(config_dir / "agentic_results.csv", index=False)

        # Quick stats using formal metric names.
        # Count each flag once, then derive the percentages from the counts.
        pass_at_1_count = sum(1 for s in results if s.pass_at_1)
        kg_valid_at_1_count = sum(1 for s in results if s.kg_valid_at_1)
        pass_at_k_count = sum(1 for s in results if s.pass_at_k)
        kg_valid_at_k_count = sum(1 for s in results if s.kg_valid_at_k)
        recovered_count = sum(1 for s in results if s.was_recovered)

        pass_at_1_rate = pass_at_1_count / total * 100
        kg_valid_at_1_rate = kg_valid_at_1_count / total * 100
        pass_at_k_rate = pass_at_k_count / total * 100
        kg_valid_at_k_rate = kg_valid_at_k_count / total * 100

        # Refinement gain is reported in percentage points (Pass@k - Pass@1).
        refinement_gain = pass_at_k_rate - pass_at_1_rate

        # Recovery rate is relative to first-attempt failures only; it is 0
        # when nothing failed on the first attempt.
        initially_failed = total - pass_at_1_count
        recovery_rate = (recovered_count / initially_failed * 100) if initially_failed > 0 else 0
        avg_iterations = sum(s.total_iterations for s in results) / total

        print(f"\n{config_name} Results (Formal Metrics):")
        print(f"  --- Pass@1 (First Attempt - Baseline Comparable) ---")
        print(f"  Pass@1 Rate: {pass_at_1_rate:.1f}%")
        print(f"  KG Valid@1 Rate: {kg_valid_at_1_rate:.1f}%")
        print(f"  --- Pass@k (After Refinement, k={settings.max_iterations}) ---")
        print(f"  Pass@k Rate: {pass_at_k_rate:.1f}%")
        print(f"  KG Valid@k Rate: {kg_valid_at_k_rate:.1f}%")
        print(f"  --- Self-Refine Metrics ---")
        print(f"  Refinement Gain: {refinement_gain:+.1f} pp")
        print(f"  Recovery Rate: {recovery_rate:.1f}%")
        print(f"  Avg Iterations: {avg_iterations:.2f}")

    except Exception as e:
        # Deliberately broad: one failing configuration must not stop the
        # remaining ones. The traceback is printed so the failure stays visible.
        print(f"Error processing {config_name}: {e}")
        import traceback
        traceback.print_exc()

## Batch 1 Summary

### Formal Metric Names (Based on Established Research)

| Metric | Source | Description |
|--------|--------|-------------|
| **Pass@1** | HumanEval (OpenAI, 2021) | First attempt success rate |
| **Pass@k** | HumanEval (OpenAI, 2021) | Success within k iterations |
| **KG Valid@1** | kg-axel (Baseline) | First attempt KG validity (syntax+schema+properties) |
| **KG Valid@k** | kg-axel (Baseline) | Final KG validity after refinement |
| **Refinement Gain** | Self-Refine (Madaan, 2023) | Pass@k - Pass@1 (absolute improvement) |
| **Recovery Rate** | Self-Refine (Madaan, 2023) | % of initial failures that were recovered |

In [None]:
# Generate batch summary with formal metric names
# Using naming from established research:
# - Pass@1, Pass@k: HumanEval (OpenAI, 2021)
# - KG Valid: kg-axel baseline research
# - Refinement Gain, Recovery Rate: Self-Refine (Madaan et al., 2023)
#
# Builds one summary row per configuration held in batch_1_results, shows the
# table, and saves it as batch_1_summary.csv in the batch results directory.

summary_rows = []

for config_name, results in batch_1_results.items():
    total = len(results)

    # Every percentage below divides by `total`; a configuration without
    # results cannot be summarised, so report it and move on.
    if total == 0:
        print(f"Warning: {config_name} has no results; omitted from summary.")
        continue

    # Pass@1 Metrics (First Attempt - Baseline Comparable)
    pass_at_1_count = sum(1 for s in results if s.pass_at_1)
    kg_valid_at_1_count = sum(1 for s in results if s.kg_valid_at_1)

    # Pass@k Metrics (After Refinement)
    pass_at_k_count = sum(1 for s in results if s.pass_at_k)
    kg_valid_at_k_count = sum(1 for s in results if s.kg_valid_at_k)

    # Percentages are computed once and reused for the gain column.
    pass_at_1_pct = pass_at_1_count / total * 100
    kg_valid_at_1_pct = kg_valid_at_1_count / total * 100
    pass_at_k_pct = pass_at_k_count / total * 100
    kg_valid_at_k_pct = kg_valid_at_k_count / total * 100

    # Self-Refine Metrics: recovery is measured against the items that
    # failed on the first attempt, and is 0 when there were none.
    recovered_count = sum(1 for s in results if s.was_recovered)
    initially_failed = total - pass_at_1_count
    recovery_rate = (recovered_count / initially_failed * 100) if initially_failed > 0 else 0

    kg_recovered_count = sum(1 for s in results if s.was_kg_recovered)
    kg_initially_invalid = total - kg_valid_at_1_count
    kg_recovery_rate = (kg_recovered_count / kg_initially_invalid * 100) if kg_initially_invalid > 0 else 0

    summary_rows.append({
        "Configuration": config_name,
        "N": total,
        # Pass@1 (Baseline Comparable)
        "Pass@1 (%)": round(pass_at_1_pct, 2),
        "KG Valid@1 (%)": round(kg_valid_at_1_pct, 2),
        # Pass@k (After Refinement)
        "Pass@k (%)": round(pass_at_k_pct, 2),
        "KG Valid@k (%)": round(kg_valid_at_k_pct, 2),
        # Self-Refine Metrics
        "Refine Gain (pp)": round(pass_at_k_pct - pass_at_1_pct, 2),
        "Recovery (%)": round(recovery_rate, 2),
        "KG Recovery (%)": round(kg_recovery_rate, 2),
        # Iteration Stats
        "Avg Iter": round(sum(s.total_iterations for s in results) / total, 2),
    })

if summary_rows:
    df_summary = pd.DataFrame(summary_rows)
    display(df_summary)

    # Save summary
    df_summary.to_csv(batch_results_dir / "batch_1_summary.csv", index=False)
    print(f"\nSummary saved to: {batch_results_dir / 'batch_1_summary.csv'}")

    # Metric explanation
    print("\n" + "="*70)
    print("METRIC NAMING (Based on Established Research)")
    print("="*70)
    print("Pass@1, Pass@k    : HumanEval Benchmark (OpenAI, 2021)")
    print("KG Valid@1/k      : kg-axel baseline research (comparable metric)")
    print("Refinement Gain   : Self-Refine (Madaan et al., 2023) - Pass@k - Pass@1")
    print("Recovery Rate     : Self-Refine - % of initial failures that were fixed")
    print("="*70)
    print("\nNOTE: 'KG Valid@1 (%)' should match kg-axel's 'KG Valid Query Rate'")
    print("      for baseline comparison.")
else:
    # Make the empty case visible rather than finishing silently.
    print("No Batch 1 results available to summarise; run the experiment cell first.")

In [12]:
# Save batch metadata
# Records how this batch was configured (configuration names, question and
# inference counts, model, iteration limit) together with a timestamp taken
# when this cell runs. Keyword order below is the key order in the JSON file.
batch_metadata = dict(
    batch_number=1,
    date=datetime.now().isoformat(),
    configurations=[
        prompt_manager.get_configuration_name(cfg["prompt"], cfg["schema"])
        for cfg in BATCH_1_CONFIGS
    ],
    total_questions=len(items),
    total_inferences=len(BATCH_1_CONFIGS) * len(items),
    llm_model=llm_config.model,
    max_iterations=settings.max_iterations,
)

metadata_path = batch_results_dir / "batch_1_metadata.json"
with open(metadata_path, "w") as metadata_file:
    json.dump(batch_metadata, metadata_file, indent=2)

print("Batch 1 completed!")
print("Continue with batch-2_01_agentic_inference.ipynb tomorrow.")

Batch 1 completed!
Continue with batch-2_01_agentic_inference.ipynb tomorrow.


In [13]:
# Cleanup
# Close the batch processor created earlier in this notebook. What close()
# actually releases is defined in experiment.batch_processor — presumably
# open connections or file handles; TODO confirm.
processor.close()