# Refrag Evaluation Results

Load evaluation summaries under the current run's `eval/` folder and display them in a paper-style table.

In [5]:
from pathlib import Path
import json
import pandas as pd

EVAL_ROOT = Path('eval')


In [6]:
def load_runs(eval_root: Path) -> list[dict]:
    """Collect per-run summaries (single runs or suites)."""
    runs: list[dict] = []

    def _append_run(run: dict, source: Path) -> None:
        metrics = run.get('metrics', {}) if isinstance(run, dict) else {}
        runs.append(
            {
                'name': run.get('baseline') or run.get('run_name') or run.get('model'),
                'model': run.get('model'),
                'accuracy_avg': metrics.get('accuracy_avg'),
                'perplexity_avg': metrics.get('perplexity_avg'),
                'lm_eval': run.get('lm_eval', {}),
                'ragas': run.get('ragas', {}),
                'source': str(source),
            }
        )

    for summary_path in eval_root.rglob('summary.json'):
        data = json.loads(summary_path.read_text())
        if isinstance(data, dict) and 'runs' in data:
            for run in data['runs']:
                _append_run(run, summary_path.parent)
        elif isinstance(data, dict):
            _append_run(data, summary_path.parent)

    return runs

def build_metric_rows(runs: list[dict]) -> list[dict]:
    """Flatten lm-eval metrics across runs and tasks."""
    rows: list[dict] = []
    for run in runs:
        tasks = run.get('lm_eval', {}).get('tasks', {}) or {}
        for task_name, metrics in tasks.items():
            if not isinstance(metrics, dict):
                continue
            for metric_name, value in metrics.items():
                rows.append(
                    {
                        'name': run.get('name'),
                        'model': run.get('model'),
                        'task': task_name,
                        'metric': metric_name,
                        'value': value,
                        'source': run.get('source'),
                    }
                )
    return rows

def format_mmlu_table(runs: list[dict]) -> pd.DataFrame:
    """Create a paper-style table for MMLU acc_norm ± stderr by model."""
    rows = []
    for run in runs:
        mmlu = (run.get('lm_eval', {}) or {}).get('tasks', {}).get('mmlu', {}) or {}
        acc_norm = mmlu.get('acc_norm,none') or mmlu.get('acc_norm') or mmlu.get('acc_norm,all')
        stderr = mmlu.get('acc_norm_stderr,none') or mmlu.get('acc_norm_stderr') or mmlu.get('acc_norm_stderr,all')
        if acc_norm is None:
            continue
        display_name = run.get('name') or run.get('model')
        cell = f"{acc_norm:.3f}" if stderr is None else f"{acc_norm:.3f} ± {stderr:.3f}"
        rows.append({'Model': display_name, 'MMLU acc_norm ± stderr': cell})
    return pd.DataFrame(rows).set_index('Model') if rows else pd.DataFrame(columns=['MMLU acc_norm ± stderr'])


In [7]:
runs = load_runs(EVAL_ROOT)
print(f"Loaded {len(runs)} run entries from {EVAL_ROOT.resolve()}")


Loaded 5 run entries from /home/vijai/code/Refrag/eval


In [8]:
if runs:
    run_df = pd.DataFrame(runs)
    cols = ['name', 'model', 'accuracy_avg', 'perplexity_avg', 'source']
    display(run_df[cols].sort_values(by=['accuracy_avg', 'perplexity_avg'], ascending=[False, True]).reset_index(drop=True))
else:
    print('No summaries found under eval/. Run evaluation first.')


Unnamed: 0,name,model,accuracy_avg,perplexity_avg,source
0,LLaMA2-7B-Full-Context,meta-llama/Llama-2-7b-chat-hf,0.256509,,eval/refrag_paper
1,llama2-7B_full_context,meta-llama/Llama-2-7b-chat-hf,0.256509,,eval/refrag_paper/llama2-7B_full_context
2,ibm-granite-granite-4.0-350M,ibm-granite/granite-4.0-350M,0.248076,,eval/ibm-granite-granite-4.0-350M
3,Granite-4.0-350M-FC,ibm-granite/granite-4.0-350M,0.196407,,eval/refrag_paper
4,ibm-granite-granite-4.0-350M,ibm-granite/granite-4.0-350M,0.196407,,eval/refrag_paper/ibm-granite-granite-4.0-350M


In [10]:
metric_rows = build_metric_rows(runs) if runs else []
if metric_rows:
    metric_df = pd.DataFrame(metric_rows)
    pivot = metric_df.pivot_table(index=['name', 'model'], columns=['task', 'metric'], values='value')
    display(pivot.T)
else:
    print('No per-task metrics available.')


Unnamed: 0_level_0,name,Granite-4.0-350M-FC,LLaMA2-7B-Full-Context,ibm-granite-granite-4.0-350M,llama2-7B_full_context
Unnamed: 0_level_1,model,ibm-granite/granite-4.0-350M,meta-llama/Llama-2-7b-chat-hf,ibm-granite/granite-4.0-350M,meta-llama/Llama-2-7b-chat-hf
task,metric,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
hellaswag,"acc,none",0.406100,0.577600,0.353050,0.577600
hellaswag,"acc_norm,none",0.518300,0.755300,0.409150,0.755300
hellaswag,"acc_norm_stderr,none",0.004997,0.004299,0.078875,0.004299
hellaswag,"acc_stderr,none",0.004911,0.004940,0.078832,0.004940
mathqa,"acc,none",0.346399,0.287772,0.223199,0.287772
...,...,...,...,...,...
piqa,"acc_norm,none",0.693689,0.772035,0.746844,0.772035
piqa,"acc_norm_stderr,none",0.010755,0.009788,0.072044,0.009788
piqa,"acc_stderr,none",0.010883,0.009909,0.081818,0.009909
winogrande,"acc,none",0.565114,0.664562,0.682557,0.664562


In [None]:
if runs:
    mmlu_df = format_mmlu_table(runs)
    if not mmlu_df.empty:
        display(mmlu_df)
    else:
        print('No MMLU metrics found in the loaded runs.')
else:
    print('No runs loaded to build MMLU table.')
