# ScholarRAG Evaluation Notebook

Use this notebook to inspect retrieval quality and LLM-judge scores.

Inputs:

- `runs/ask_results.jsonl`: generated by `evaluation/run_batch.py`
- `runs/ask_results_scored.jsonl`: same file after passing through `evaluation/llm_judge.py`

Update the paths below if you store results elsewhere.

In [1]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px

# Locations of the evaluation artefacts, resolved relative to the notebook's
# working directory. Edit RUN_DIR if the results are stored somewhere else.
RUN_DIR = Path('../runs')
RAW_FILE = RUN_DIR.joinpath('ask_results.jsonl')
JUDGED_FILE = RUN_DIR.joinpath('ask_results_scored.jsonl')

# Stop immediately when the raw batch output is absent: every cell below reads it.
if not RAW_FILE.exists():
    raise FileNotFoundError(
        'Generate runs/ask_results.jsonl via evaluation/run_batch.py first.'
    )

FileNotFoundError: Generate runs/ask_results.jsonl via evaluation/run_batch.py first.

In [None]:
def load_jsonl(path: Path):
    """Lazily yield one decoded JSON value per non-blank line of ``path``.

    Args:
        path: Location of a JSON Lines file (one JSON document per line).

    Yields:
        The value decoded from each line that is not empty or whitespace-only.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of the platform default encoding, so
    # non-ASCII text loads the same way on every machine.
    with path.open(encoding='utf-8') as f:
        for line in f:
            # Tolerate blank separator lines and a trailing newline at EOF.
            if line.strip():
                yield json.loads(line)

# Materialise the generator so the records can be counted and reused below.
records = [record for record in load_jsonl(RAW_FILE)]
len(records)

In [None]:
# Flatten to one row per entry of each record's `retrieved` list, repeating the
# query-level fields named in `meta` on every row.
df = pd.json_normalize(
    records,
    record_path='retrieved',
    meta=['query', 'k', 'year_from', 'year_to', 'relevant_ids'],
)
df.head()

## Retrieval Metrics

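These metrics assume each query object in the input file includes a `relevant_ids` list of known-good paper identifiers (e.g. OpenAlex IDs or DOIs). A retrieved paper counts as relevant when its `openalex_id` or `doi` appears in that list.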

In [None]:
def relevance_flag(row):
    """Return True when the row's OpenAlex ID or DOI is listed as relevant.

    Args:
        row: Any object exposing ``.get(key)`` (a DataFrame row when used with
            ``df.apply(..., axis=1)``, or a plain dict). The keys read are
            ``relevant_ids``, ``openalex_id`` and ``doi``.

    Returns:
        bool: True if at least one present identifier, compared as a string,
        occurs among the stringified ``relevant_ids``; False otherwise,
        including when ``relevant_ids`` is missing or empty.
    """
    def _is_missing(value):
        # None, or float NaN (NaN is the only value that is not equal to itself).
        return value is None or (isinstance(value, float) and value != value)

    rel = row.get('relevant_ids') or []
    if isinstance(rel, str):
        # A bare string is a single identifier, not a sequence to search inside;
        # otherwise `in` would do accidental substring matching.
        rel = [rel]
    relevant = {str(r) for r in rel if not _is_missing(r)}

    for candidate in (row.get('openalex_id'), row.get('doi')):
        # Drop missing identifiers *before* stringifying: str(None) == 'None' and
        # str(nan) == 'nan' are truthy and would slip past a truthiness guard.
        if _is_missing(candidate):
            continue
        text = str(candidate)
        if text and text in relevant:
            return True
    return False

# Label each retrieved row, make sure ranks are integers, then show a few
# random rows (at most three) as a sanity check.
relevance_by_row = df.apply(relevance_flag, axis=1)
df['is_relevant'] = relevance_by_row
df['rank'] = df['rank'].astype(int)
df.sample(n=min(3, len(df)))

In [None]:
def precision_at_k(group, k=10):
    """Fraction of relevant rows among the ``k`` best-ranked rows of ``group``.

    Args:
        group: DataFrame with a ``rank`` column and an ``is_relevant`` column.
        k: How many of the lowest-``rank`` rows to consider.

    Returns:
        Mean of ``is_relevant`` over the selected rows, or ``np.nan`` when no
        rows are selected.
    """
    best_ranked = group.nsmallest(k, 'rank')
    return np.nan if best_ranked.empty else best_ranked['is_relevant'].mean()

def recall_at_k(group, k=10):
    """Share of the known relevant IDs found among the ``k`` best-ranked rows.

    Args:
        group: DataFrame with ``rank``, ``is_relevant`` and ``relevant_ids``
            columns; the relevant-ID collection is read from the first row.
        k: How many of the lowest-``rank`` rows to consider.

    Returns:
        Number of relevant rows in the top ``k`` divided by the number of
        entries in ``relevant_ids``, or ``np.nan`` when that collection is
        empty or falsy.
    """
    known_relevant = group['relevant_ids'].iloc[0] or []
    denominator = len(known_relevant)
    if not denominator:
        # Recall is undefined without any ground-truth identifiers.
        return np.nan
    hits = group.nsmallest(k, 'rank')['is_relevant'].sum()
    return hits / denominator

# One summary row per cut-off: per-query precision and recall, averaged over queries.
rows = []
for k_val in (3, 5, 10):
    per_query = df.groupby('query')
    precision = per_query.apply(precision_at_k, k=k_val).mean()
    recall = per_query.apply(recall_at_k, k=k_val).mean()
    rows.append(dict(k=k_val, precision_at_k=precision, recall_at_k=recall))

pd.DataFrame(rows)

## LLM Judge Scores (optional)

Run `evaluation/llm_judge.py --input runs/ask_results.jsonl --output runs/ask_results_scored.jsonl` first.

In [None]:
# Load the LLM-judged results when available; `judged_df` stays None otherwise
# so later cells can skip the judge summary.
if JUDGED_FILE.exists():
    judged_records = list(load_jsonl(JUDGED_FILE))
    judged_df = pd.json_normalize(judged_records)
    # Jupyter only renders the final top-level expression of a cell, so a bare
    # `.head()` inside this `if` body would be silently discarded. Render the
    # preview explicitly with IPython's built-in display().
    display(judged_df[['query', 'evaluation.score', 'evaluation.verdict']].head())
else:
    judged_df = None
    print('LLM judge file not found. Run evaluation/llm_judge.py to populate it.')

In [None]:
# Summarise the judge output only when the scored file was loaded above.
if judged_df is not None:
    pass_rate = judged_df['evaluation.verdict'].eq('pass').mean()
    avg_score = judged_df['evaluation.score'].mean()
    print(f'Average judge score: {avg_score:.2f}')
    print(f'Pass rate: {pass_rate*100:.1f}%')
    fig = px.histogram(
        judged_df,
        x='evaluation.score',
        nbins=6,
        title='LLM Judge Score Distribution',
    )
    fig.show()