1️⃣ Setup & Imports

In [27]:
import json
import pandas as pd
from pathlib import Path

from evaluation.pipeline import run_evaluation
from evaluation.metrics import (
    compute_retrieval_metrics,
    compute_answer_metrics,
    compute_rag_quality_score
)

2️⃣ Load Evaluation Dataset

In [28]:
# Location of the evaluation set, relative to the notebook's working directory.
EVAL_DATASET_PATH = Path("evaluation/eval_dataset.json")

# Read the file as UTF-8 text and parse it in a single step.
dataset_json = json.loads(EVAL_DATASET_PATH.read_text(encoding="utf-8"))

# The examples live under the top-level "samples" key; fall back to an
# empty list when that key is missing.
eval_dataset = dataset_json.get("samples", [])
print(f"Loaded {len(eval_dataset)} evaluation examples")

Loaded 8 evaluation examples


3️⃣ Run the Evaluation Pipeline

In [29]:
# Run the project's evaluation pipeline and keep its return value for the cells below.
# Per the recorded output, the call loads the evaluation examples itself, prints
# per-question progress, and saves eval_results.json under evaluation/reports.
# NOTE(review): `eval_dataset` loaded earlier is not passed in — confirm the
# pipeline reads the same dataset file.
results = run_evaluation()

Loaded 8 evaluation examples
[1/8] Question processed
[2/8] Question processed
[3/8] Question processed
[4/8] Question processed
[5/8] Question processed
[6/8] Question processed
[7/8] Question processed
[8/8] Question processed
Evaluation finished. 8 samples processed.
Results saved to evaluation\reports\eval_results.json


4️⃣ Convert the Results to a DataFrame for Easier Inspection

In [30]:
# Tabulate the pipeline results so they can be inspected and aggregated.
# NOTE(review): presumably `results` is a list of per-question dicts (one row
# each) — confirm against evaluation.pipeline.run_evaluation.
df = pd.DataFrame(results)

5️⃣ Display the First Few Rows

In [31]:
# Show up to 8 rows; with the 8-sample dataset loaded here this is the whole table.
df.head(8)

Unnamed: 0,id,question,expected_answer,rag_answer,retrieved_docs,retrieval_score,answer_score,rag_quality_score
0,Q1,What does the slippery road traffic sign mean ...,The Slippery When Wet traffic sign warns drive...,"Based on the provided context, the Slippery Wh...",[{'page_content': 'Traffic sign: Solid Broken ...,1.0,0.703704,0.852
1,Q2,What is the meaning of the oil slick traffic s...,The oil slick traffic sign indicates the prese...,"Based on the provided context, the oil slick t...",[{'page_content': 'Traffic sign: Oil Slick. Ca...,1.0,0.791667,0.896
2,Q3,What does a triangular warning sign generally ...,A triangular warning sign indicates a hazard o...,"Based on the provided context, a triangular wa...",[{'page_content': 'These are other vehicle cla...,0.0,0.809524,0.405
3,Q4,Where is parking on the pavement allowed?,Parking on the pavement is allowed in specific...,"Based on the provided context, parking on the ...",[{'page_content': 'over manhole covers and oth...,0.5,0.666667,0.583
4,Q5,What category do slippery road signs belong to?,Slippery road signs belong to warning signs.,"Based on the provided context, the category fo...",[{'page_content': 'Traffic sign: Oil Slick. Ca...,1.0,0.166667,0.583
5,Q6,How should a driver react when seeing a slippe...,"A driver should exercise caution, reduce speed...","Based on the provided context, the relevant in...",[{'page_content': 'Traffic sign: Slippery When...,1.0,0.863636,0.932
6,Q7,Is the slippery road sign a regulatory or warn...,"It is a warning sign, used to alert drivers to...","Unfortunately, the context provided does not i...",[{'page_content': 'Traffic sign: Riders Prohib...,1.0,0.928571,0.964
7,Q8,What legal role do traffic signs have under Ge...,Traffic signs are legally binding instructions...,"According to the provided context, traffic sig...",[{'page_content': 'Traffic sign: no passing ve...,1.0,0.714286,0.857


6️⃣ Additional Calculations: Average Metrics

In [32]:
# Mean of each score column over all rows of the results table.
avg_retrieval, avg_answer, avg_rag_quality = (
    df['retrieval_score'].mean(),
    df['answer_score'].mean(),
    df['rag_quality_score'].mean(),
)

# One line per metric, rounded to three decimals.
print(
    f"Average Retrieval Score: {avg_retrieval:.3f}",
    f"Average Answer Score: {avg_answer:.3f}",
    f"Average RAG Quality Score: {avg_rag_quality:.3f}",
    sep="\n",
)

Average Retrieval Score: 0.812
Average Answer Score: 0.706
Average RAG Quality Score: 0.759


7️⃣ Save the DataFrame to CSV for Further Analysis

In [33]:
# Directory for generated reports; created (with parents) when it does not exist yet.
REPORTS_DIR = Path("evaluation/reports")
REPORTS_DIR.mkdir(parents=True, exist_ok=True)

# Build the destination once so the export and the log message cannot drift apart.
csv_path = REPORTS_DIR / "eval_results.csv"
df.to_csv(csv_path, index=False)  # index=False: leave the row index out of the file
print(f"Results saved to {csv_path}")

Results saved to evaluation\reports\eval_results.csv
