## Evaluation on the HotpotQA dataset

In [None]:
from evals.eval_on_hotpot import eval_on_hotpotQA
from evals.eval_on_hotpot import answer_with_cognee
from evals.eval_on_hotpot import answer_without_cognee
from evals.eval_on_hotpot import eval_answers
from cognee.base_config import get_base_config
from pathlib import Path
from tqdm import tqdm
import wget
import json
import statistics

### Getting the answers for the first `num_samples` questions of the dataset

In [None]:
answer_provider = answer_with_cognee # For native LLM answers use answer_without_cognee
num_samples = 10 # With cognee, it takes ~1m10s per sample

base_config = get_base_config()
data_root_dir = base_config.data_root_directory

if not Path(data_root_dir).exists():
    Path(data_root_dir).mkdir()

filepath = data_root_dir / Path("hotpot_dev_fullwiki_v1.json")
if not filepath.exists():
    url = 'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_fullwiki_v1.json'
    wget.download(url, out=data_root_dir)

with open(filepath, "r") as file:
    dataset = json.load(file)

instances = dataset if not num_samples else dataset[:num_samples]
answers = []
for instance in tqdm(instances, desc="Getting answers"):
    answer = await answer_provider(instance)
    answers.append(answer)

### Calculating the official HotpotQA benchmark metrics: F1 score and EM

In [None]:
from evals.deepeval_metrics import f1_score_metric
from evals.deepeval_metrics import em_score_metric

In [None]:
f1_metric = f1_score_metric()
eval_results = await eval_answers(instances, answers, f1_metric)
avg_f1_score = statistics.mean([result.metrics_data[0].score for result in eval_results.test_results])
print("F1 score: ", avg_f1_score)

In [None]:
em_metric = em_score_metric()
eval_results = await eval_answers(instances, answers, em_metric)
avg_em_score = statistics.mean([result.metrics_data[0].score for result in eval_results.test_results])
print("EM score: ", avg_em_score)

### Calculating a custom metric called Correctness
##### Correctness is judged by an LLM

In [8]:
from evals.deepeval_metrics import correctness_metric

In [None]:
eval_results = await eval_answers(instances, answers, correctness_metric) # note that instantiation is not needed for correctness metric as it is already an instance
avg_correctness_score = statistics.mean([result.metrics_data[0].score for result in eval_results.test_results])
print("Correctness score: ", avg_correctness_score)

### Using a metric from DeepEval

In [9]:
from deepeval.metrics import AnswerRelevancyMetric

In [None]:
relevancy_metric = AnswerRelevancyMetric()
eval_results = await eval_answers(instances, answers, relevancy_metric) # note that instantiation is not needed for correctness metric as it is already an instance
avg_relevancy_score = statistics.mean([result.metrics_data[0].score for result in eval_results.test_results])
print("Relevancy score: ", avg_relevancy_score)

### Answering and evaluating in one step

In [None]:
answer_provider = answer_without_cognee
f1_metric = f1_score_metric()
f1_score =  await eval_on_hotpotQA(answer_provider, num_samples=10, eval_metric=f1_metric) # takes ~1m10s per sample
print("F1 score: ", f1_score)