In [1]:
import pandas as pd

results = pd.read_json("rag_results_300_250.json")
results_dict = results.to_dict(orient="records")
test = results_dict[:2]

In [2]:
# prepare structured output
from pydantic import BaseModel, Field
from enum import Enum

class CheckName(str, Enum):
    answer_relevant = "answer_relevant"
    completeness = "completeness"
    grounding_accuracy = "grounded_accuracy"
    context_utilization = "context_utilization"
    chunk_coverage = "chunk_coverage"
    consistency = "consistency"
    faithful_to_source = "faithful_to_source"
    focused = "focused"
    uncertainty_handling = "uncertainty_handling"
  

CHECK_DESCRIPTIONS = {
    CheckName.answer_relevant: "The answer directly address the user's question.",
    CheckName.completeness: "The answer cover all key points requested.",
    CheckName.grounding_accuracy: "All claims are supported by retrieved chunks (no hallucinations).",
    CheckName.context_utilization: "The answer utilized the provided snippets well and kept generic knowledge to a minimal.",
    CheckName.chunk_coverage: "The answer utilized multiple chunks effectively and rarely missed relevant information.",
    CheckName.consistency: "The answer avoids conflicting statements and contradictions.",
    CheckName.faithful_to_source: "The wording or paraphasing in the answer does not misrepresent the tone or meaning of the source.",
    CheckName.focused: "The response is focused and free of fluff.",
    CheckName.uncertainty_handling: "The model explictly indicates uncertainty when chunks lack coverage."
}

class EvaluationCheck(BaseModel):
    check_name: CheckName = Field(description="The type of evaluation check")
    reasoning: str = Field(description="The reasoning behind the check result")
    check_pass: bool = Field(description="Whether the check passed (True) or failed (False)")
    
class EvaluationChecklist(BaseModel):
    checklist: list[EvaluationCheck] = Field(description="List of all evaluation checks")
    summary: str = Field(description="Evaluation summary")

In [3]:
from openai import OpenAI

client = OpenAI()

def run_judge_structured(instructions:str, user_query:str, output_format, model="gpt-5-nano"):
    """Send the prepared prompt to the LLM client and return the raw SDK response."""

    messages = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": user_query}]
    

    response = client.responses.parse(
        model=model,
        input=messages,
        text_format=output_format
    )
    return (response.output_parsed, response.usage)


In [4]:
from utils import judge_instructions

def run_eval(row):

    user_query = f"""
        <QUESTION>{row['question']}</QUESTION>
        <ANSWER>{row['answer']}</ANSWER>
        <CONTEXT>{row["context"]}</CONTEXT>
        """.strip()
    
    output = run_judge_structured(instructions=judge_instructions, user_query=user_query, output_format=EvaluationChecklist)
    return row, output

In [5]:
from utils import map_progress

from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=6) as pool:
    results = map_progress(pool, test, run_eval)

100%|██████████| 2/2 [00:35<00:00, 17.75s/it]


In [35]:
all_checks = []

for original_row, result in results:
    checklist, usage = result
    checks = checklist.checklist
    checks_formatted = {
        'question': original_row['question'],
        'input_tokens': usage.input_tokens,
        'output_tokens': usage.output_tokens, 
        'total_tokens': usage.total_tokens
    }
    for check in checks:
        checks_formatted[check.check_name] = check.check_pass
    all_checks.append(checks_formatted)

In [36]:
import pandas as pd
df_eval = pd.DataFrame(all_checks)
df_eval

Unnamed: 0,question,input_tokens,output_tokens,total_tokens,CheckName.answer_relevant,CheckName.completeness,CheckName.grounding_accuracy,CheckName.context_utilization,CheckName.chunk_coverage,CheckName.consistency,CheckName.focused,CheckName.uncertainty_handling
0,importance of reliable benchmarks in AI,1579,3476,5055,True,True,True,True,True,True,True,False
1,what is evaluation-driven development in ai?,1410,3172,4582,True,True,True,True,True,True,True,True


In [22]:
df_eval[df_eval.columns[1:]].mean()

CheckName.answer_relevant         1.0
CheckName.completeness            1.0
CheckName.grounding_accuracy      1.0
CheckName.context_utilization     1.0
CheckName.chunk_coverage          1.0
CheckName.consistency             1.0
CheckName.focused                 1.0
CheckName.uncertainty_handling    0.5
dtype: float64

In [2]:
from judge import run_eval_concurrent

run_eval_concurrent(test, "eval_test.json")

100%|██████████| 2/2 [00:24<00:00, 12.40s/it]


[{'question': 'importance of reliable benchmarks in AI',
  'input_tokens': 1579,
  'output_tokens': 2512,
  'total_tokens': 4091,
  'input_cost': 7.895e-05,
  'output_cost': 0.0010048,
  'total_cost': 0.00108375,
  <CheckName.answer_relevant: 'answer_relevant'>: True,
  <CheckName.completeness: 'completeness'>: True,
  <CheckName.grounding_accuracy: 'grounded_accuracy'>: True,
  <CheckName.context_utilization: 'context_utilization'>: True,
  <CheckName.chunk_coverage: 'chunk_coverage'>: True,
  <CheckName.consistency: 'consistency'>: True,
  <CheckName.focused: 'focused'>: True,
  <CheckName.uncertainty_handling: 'uncertainty_handling'>: True},
 {'question': 'what is evaluation-driven development in ai?',
  'input_tokens': 1410,
  'output_tokens': 1719,
  'total_tokens': 3129,
  'input_cost': 7.05e-05,
  'output_cost': 0.0006876,
  'total_cost': 0.0007581,
  <CheckName.answer_relevant: 'answer_relevant'>: True,
  <CheckName.completeness: 'completeness'>: True,
  <CheckName.grounding_ac

# Results

In [10]:
import pandas as pd

small_chunks = pd.read_json("eval_chunk_300_250.json")
small_chunks["total_cost"].sum() # cost 0.0431785

small_col_means = small_chunks[small_chunks.columns[7:]].mean()
small_col_means.mean() #0.890625

np.float64(0.890625)

In [88]:
small = pd.DataFrame(small_col_means, columns=["Score"])
small["Experiment"] = "Small (300)"
small

Unnamed: 0,Score,Experiment
answer_relevant,0.975,Small (300)
completeness,0.825,Small (300)
grounded_accuracy,0.9,Small (300)
context_utilization,1.0,Small (300)
chunk_coverage,0.95,Small (300)
consistency,1.0,Small (300)
focused,0.975,Small (300)
uncertainty_handling,0.5,Small (300)


In [15]:
medium_chunks = pd.read_json("eval_chunk_600_500.json")
medium_chunks["total_cost"].sum() #0.0440232
medium_col_means = medium_chunks[medium_chunks.columns[7:]].mean()
medium_col_means.mean() #0.91875

np.float64(0.91875)

In [87]:
medium = pd.DataFrame(medium_col_means, columns=["Score"])
medium["Experiment"] = "Medium (600)"
medium

Unnamed: 0,Score,Experiment
answer_relevant,1.0,Medium (600)
completeness,0.975,Medium (600)
grounded_accuracy,0.95,Medium (600)
context_utilization,1.0,Medium (600)
chunk_coverage,0.95,Medium (600)
consistency,1.0,Medium (600)
focused,1.0,Medium (600)
uncertainty_handling,0.475,Medium (600)


In [17]:
large_chunks = pd.read_json("eval_chunk_1000_833.json")
large_chunks["total_cost"].sum() #0.0460499
large_col_means = large_chunks[medium_chunks.columns[7:]].mean()
large_col_means.mean() #0.9125

np.float64(0.9125000000000001)

In [86]:
large = pd.DataFrame(large_col_means, columns=["Score"])
large["Experiment"] = "Large (1000)"
large

Unnamed: 0,Score,Experiment
answer_relevant,1.0,Large (1000)
completeness,0.925,Large (1000)
grounded_accuracy,0.975,Large (1000)
context_utilization,1.0,Large (1000)
chunk_coverage,0.95,Large (1000)
consistency,1.0,Large (1000)
focused,1.0,Large (1000)
uncertainty_handling,0.45,Large (1000)


In [91]:
import plotly.express as px

combined_results = pd.concat([small, medium, large])
combined_results.reset_index(inplace=True)
matrix = combined_results.pivot(index="Experiment", columns="index", values="Score")
fig = px.imshow(
    matrix, 
    color_continuous_scale="Viridis",
    labels=dict(color="Scores")
)
fig.update_layout(xaxis_title="Criteria", yaxis_title="Experiment")
fig.show()

In [96]:
means = pd.DataFrame(
   { "Experiment": ["Small (300)", "Medium (600)", "Large (1000)"],
    "Score": [small_col_means.mean(), medium_col_means.mean(), large_col_means.mean()],
    "Cost": [small_chunks["total_cost"].sum(), medium_chunks["total_cost"].sum(), large_chunks["total_cost"].sum()]
    }
)

long_means = means.melt(id_vars="Experiment", value_vars=["Score", "Cost"], var_name="metric", value_name="value")


fig = px.bar(long_means, x="Experiment", y="value", color="metric", barmode="group")
# fig.update_layout(width=800)
fig.show() 

In [98]:
fig = px.bar(
    data_frame=means, x="Experiment", y="Score", color="Experiment"
)
fig.show()
