# Wisent-Guard: Benchmark Evaluation Demo

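This notebook demonstrates a full hallucination-detection pipeline using Wisent-Guard on the `truthfulqa_mc` benchmark.
The pipeline loads the benchmark, generates training data, trains a classifier, and evaluates the results; all of these steps are driven by a single `BenchmarkRunner.run_benchmark` call below.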

In [None]:
# Install dependencies (if running for the first time)
!pip install -e .
!pip install datasets torch transformers scikit-learn

In [None]:
# Imports and configuration
import logging
import json
from pathlib import Path
import torch

from wisent_guard.benchmarking.benchmark_runner import BenchmarkRunner

# Configure root logging at INFO level and create a logger for this notebook.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Parameters — edit these to benchmark a different dataset, model, or layer.
DATASET_NAME = "truthfulqa_mc"
MODEL_NAME = "meta-llama/Llama-3.1-8B"
LAYER = 15  # layer index handed to the runner; presumably the layer whose activations are used — confirm
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # use the GPU when one is available
OUTPUT_DIR = "benchmark_results"
# parents=True lets OUTPUT_DIR be changed to a nested path (e.g. "runs/truthfulqa")
# without mkdir raising FileNotFoundError; exist_ok=True keeps the cell re-runnable.
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

In [None]:
# Build the runner, then execute the benchmark with the parameters from the config cell.
runner = BenchmarkRunner()

# Keyword arguments forwarded unchanged to run_benchmark.
# model and tokenizer are passed as None together with model_name — presumably the
# runner then loads them itself from MODEL_NAME; verify against BenchmarkRunner.
benchmark_kwargs = dict(
    benchmark_names=[DATASET_NAME],
    model_name=MODEL_NAME,
    layer=LAYER,
    device=DEVICE,
    model=None,
    tokenizer=None,
    output_dir=OUTPUT_DIR,
)
results = runner.run_benchmark(**benchmark_kwargs)

In [None]:
# Print each task's metrics, one per line; float values are shown with four decimals,
# everything else with its default string form.
print("\n📊 Evaluation Results:")
for task_name, task_metrics in results.items():
    print(f"\nTask: {task_name}")
    for metric_name, metric_value in task_metrics.items():
        if isinstance(metric_value, float):
            print(f"{metric_name}: {metric_value:.4f}")
        else:
            print(f"{metric_name}: {metric_value}")

In [None]:
# Load results from file (optional).
# The file name below is this notebook's expectation of what the benchmark run
# writes into OUTPUT_DIR — NOTE(review): confirm it matches the runner's naming.
result_file = Path(OUTPUT_DIR) / f"{DATASET_NAME}_results.json"
if result_file.exists():
    # Read explicitly as UTF-8: without `encoding`, open() falls back to the
    # platform's locale encoding, which can mis-decode non-ASCII JSON content.
    with open(result_file, encoding="utf-8") as fh:
        saved = json.load(fh)
    print("\n💾 Saved Results:")
    print(json.dumps(saved, indent=2))
else:
    print("No saved results found. Please run the benchmark first.")