In [None]:
# Enable IPython's autoreload extension for this kernel.
%load_ext autoreload
# Mode 2: re-import all modules (except those excluded with %aimport)
# before each cell is executed.
%autoreload 2

In [None]:
import time
from datetime import datetime

import pandas as pd
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    answer_correctness,
    context_recall,
    faithfulness,
)
from tqdm import tqdm

from utils import neo4j_driver

In [None]:
# Read the semicolon-separated benchmark file into a DataFrame and display it.
benchmark_path = "data/benchmark_data.csv"
test_data = pd.read_csv(benchmark_path, sep=";")
test_data

In [None]:
# Import the project-local ch08_tools module and immediately reload it,
# presumably so that edits made to the module on disk are picked up without
# restarting the kernel.
import importlib
import ch08_tools
importlib.reload(ch08_tools)

In [None]:
# Smoke test: send one hand-written question through get_answer and show
# both values it returns.
sample_question = "Who acted in the most movies?"
answer, context = ch08_tools.get_answer(sample_question)

print(f"Answer: {answer}", f"Context: {context}", sep="\n")

In [None]:
# Run every benchmark row through ch08_tools.get_answer and collect, per row:
#   ground_truths - stringified records returned by the row's "cypher" query
#   answers       - the answer returned by get_answer (None if the call failed)
#   contexts      - the 'content' field of each returned context item
#   latencies     - seconds spent inside the get_answer attempt
answers = []
ground_truths = []
latencies = []
contexts = []

for i, row in tqdm(test_data.iterrows(), total=len(test_data), desc="Processing rows"):
    # The reference result comes from executing the benchmark's own Cypher
    # query. This is intentionally not guarded: a failing reference query is
    # a problem with the benchmark data and should stop the run.
    ground_truth, _, _ = neo4j_driver.execute_query(row["cypher"])
    ground_truths.append([str(el.data()) for el in ground_truth])

    # Time with a monotonic clock. Subtracting naive wall-clock datetimes
    # gives wrong (even negative) durations if the system clock is adjusted
    # while the call is in flight (NTP step, DST transition).
    start = time.perf_counter()
    try:
        answer, context = ch08_tools.get_answer(row["question"])
        context = [el["content"] for el in context]
    except Exception as e:
        # Best effort: report the failure and record an empty result so the
        # remaining rows are still evaluated and all lists stay aligned.
        print(f"Error processing row {i}: {e}")
        answer, context = None, []
    latencies.append(time.perf_counter() - start)
    answers.append(answer)
    contexts.append(context)

In [None]:
# Dump the collected contexts, then the collected answers, for inspection.
for collected in (contexts, answers):
    print(collected)

In [None]:
# Attach the per-row results collected by the benchmark loop as new columns.
# Each ground-truth list is flattened to its string representation.
result_columns = {
    "ground_truth": list(map(str, ground_truths)),
    "answer": answers,
    "latency": latencies,
    "retrieved_contexts": contexts,
}
for column_name, column_values in result_columns.items():
    test_data[column_name] = column_values

In [None]:
test_data

In [None]:
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings import OllamaEmbeddings

# The chat model and the embedding model are both backed by the same
# Ollama model name.
ollama_model_name = "llama3.2"
llm = ChatOllama(model=ollama_model_name)
embeddings = OllamaEmbeddings(model=ollama_model_name)

In [None]:
# Replace every missing value in the frame (e.g. answers recorded as None)
# with a placeholder string before converting it to a Dataset.
eval_frame = test_data.fillna("I don't know")
dataset = Dataset.from_pandas(eval_frame)

# Score the dataset with the three imported ragas metrics, using the
# previously configured llm and embeddings objects.
eval_metrics = [answer_correctness, context_recall, faithfulness]
result = evaluate(dataset, metrics=eval_metrics, llm=llm, embeddings=embeddings)
print(result)

In [None]:
# Print the text representation of the Dataset that was passed to evaluate().
print(dataset)

In [None]:
# Copy each metric's per-row score from the evaluation result back onto the
# benchmark frame, one column per metric, then display the frame.
metric_names = ("answer_correctness", "context_recall", "faithfulness")
for metric_name in metric_names:
    metric_column = []
    for score_row in result.scores:
        metric_column.append(score_row[metric_name])
    test_data[metric_name] = metric_column
test_data