# Evaluation on the HotpotQA dataset

In [None]:
!pip install "cognee[deepeval]"

In [None]:
from evals.eval_on_hotpot import deepeval_answers, answer_qa_instance
from evals.qa_dataset_utils import load_qa_dataset
from evals.qa_metrics_utils import get_metrics
from evals.qa_context_provider_utils import qa_context_providers
from pathlib import Path
from tqdm import tqdm
import statistics
import random

## Load Dataset

In [None]:
# Number of QA instances to sample; a falsy value (0 / None) makes the
# answer-generation cells use the whole dataset instead.
num_samples = 10  # With cognee, it takes ~1m10s per sample
# Passed to load_qa_dataset and also used to build the contexts file name.
dataset_name_or_filename = "hotpotqa"
dataset = load_qa_dataset(dataset_name_or_filename)

## Define Context Provider
**Options**: 
- **cognee**: context with cognee 
- **no_rag**: raw context 
- **simple_rag**: context with simple rag 
- **brute_force**: context with brute force triplet search

### Here, "cognee" is used as context provider


In [None]:
# Look up the context provider in `qa_context_providers` by name.
context_provider_name = "cognee"
context_provider = qa_context_providers[context_provider_name]

#### Generate Answers for QA Instances
1. **Random Sampling**: Selects a random subset of `num_samples` instances, using a fixed seed; if `num_samples` is `0` or `None`, the whole dataset is used.
2. **Context Filename**: Defines the file path for storing contexts generated by the context provider.
3. **Answer Generation**: Iterates over the QA instances using `tqdm` for progress tracking and generates answers using the `answer_qa_instance` function asynchronously.

In [None]:
random.seed(42)
instances = dataset if not num_samples else random.sample(dataset, num_samples)

out_path = "out"
if not Path(out_path).exists():
    Path(out_path).mkdir()
contexts_filename = out_path / Path(
    f"contexts_{dataset_name_or_filename.split('.')[0]}_{context_provider_name}.json"
)

answers = []
for instance in tqdm(instances, desc="Getting answers"):
    answer = await answer_qa_instance(instance, context_provider, contexts_filename)
    answers.append(answer)

#### Define Metrics for Evaluation and Calculate Score
**Options**: 
- **Correctness**: Is the actual output factually correct based on the expected output?
- **Comprehensiveness**: How much detail does the answer provide to cover all aspects and details of the question?
- **Diversity**: How varied and rich is the answer in providing different perspectives and insights on the question?
- **Empowerment**: How well does the answer help the reader understand and make informed judgements about the topic?
- **Directness**: How specifically and clearly does the answer address the question?
- **F1 Score**: the harmonic mean of the precision and recall, using word-level Exact Match
- **EM Score**: the rate at which the predicted strings exactly match their references, ignoring whitespace and capitalization.

We can also calculate scores based on the same metrics with promptfoo.

##### Calculating `"Correctness"`

In [None]:
metric_name_list = ["Correctness"]
eval_metrics = get_metrics(metric_name_list)
eval_results = await deepeval_answers(instances, answers, eval_metrics["deepeval_metrics"])

In [None]:
# Collect the first metric's score from each test result, then average them.
correctness_scores = [
    test_result.metrics_data[0].score for test_result in eval_results.test_results
]
Correctness = statistics.mean(correctness_scores)
print(Correctness)

##### Calculating `"Comprehensiveness"`

In [None]:
metric_name_list = ["Comprehensiveness"]
eval_metrics = get_metrics(metric_name_list)
eval_results = await deepeval_answers(instances, answers, eval_metrics["deepeval_metrics"])

In [None]:
# Collect the first metric's score from each test result, then average them.
comprehensiveness_scores = [
    test_result.metrics_data[0].score for test_result in eval_results.test_results
]
Comprehensiveness = statistics.mean(comprehensiveness_scores)
print(Comprehensiveness)

##### Calculating `"Diversity"`

In [None]:
metric_name_list = ["Diversity"]
eval_metrics = get_metrics(metric_name_list)
eval_results = await deepeval_answers(instances, answers, eval_metrics["deepeval_metrics"])

In [None]:
# Collect the first metric's score from each test result, then average them.
diversity_scores = [
    test_result.metrics_data[0].score for test_result in eval_results.test_results
]
Diversity = statistics.mean(diversity_scores)
print(Diversity)

##### Calculating `"Empowerment"`

In [None]:
metric_name_list = ["Empowerment"]
eval_metrics = get_metrics(metric_name_list)
eval_results = await deepeval_answers(instances, answers, eval_metrics["deepeval_metrics"])

In [None]:
# Collect the first metric's score from each test result, then average them.
empowerment_scores = [
    test_result.metrics_data[0].score for test_result in eval_results.test_results
]
Empowerment = statistics.mean(empowerment_scores)
print(Empowerment)

##### Calculating `"Directness"`

In [None]:
metric_name_list = ["Directness"]
eval_metrics = get_metrics(metric_name_list)
eval_results = await deepeval_answers(instances, answers, eval_metrics["deepeval_metrics"])

In [None]:
# Collect the first metric's score from each test result, then average them.
directness_scores = [
    test_result.metrics_data[0].score for test_result in eval_results.test_results
]
Directness = statistics.mean(directness_scores)
print(Directness)

##### Calculating `"F1 Score"`

In [None]:
# Build the "F1" metric; the evaluation itself runs in the next cell.
metric_name_list = ["F1"]
eval_metrics = get_metrics(metric_name_list)

In [None]:
eval_results = await deepeval_answers(instances, answers, eval_metrics["deepeval_metrics"])

In [None]:
# Collect the first metric's score from each test result, then average them.
f1_scores = [
    test_result.metrics_data[0].score for test_result in eval_results.test_results
]
F1_score = statistics.mean(f1_scores)
print(F1_score)

##### Calculating `"Exact Match (EM) Score"`

In [None]:
metric_name_list = ["EM"]
eval_metrics = get_metrics(metric_name_list)
eval_results = await deepeval_answers(instances, answers, eval_metrics["deepeval_metrics"])

In [None]:
# Collect the first metric's score from each test result, then average them.
em_scores = [
    test_result.metrics_data[0].score for test_result in eval_results.test_results
]
EM = statistics.mean(em_scores)
print(EM)

### "no_rag" as context provider

In [None]:
# Look up the context provider in `qa_context_providers` by name.
context_provider_name = "no_rag"
context_provider = qa_context_providers[context_provider_name]

#### Generate Answers for QA Instances

In [None]:
random.seed(42)
instances = dataset if not num_samples else random.sample(dataset, num_samples)

out_path = "out"
if not Path(out_path).exists():
    Path(out_path).mkdir()
contexts_filename = out_path / Path(
    f"contexts_{dataset_name_or_filename.split('.')[0]}_{context_provider_name}.json"
)

answers = []
for instance in tqdm(instances, desc="Getting answers"):
    answer = await answer_qa_instance(instance, context_provider, contexts_filename)
    answers.append(answer)

#### Define Metrics for Evaluation and Calculate Score

##### Calculating `"Correctness"`

In [None]:
metric_name_list = ["Correctness"]
eval_metrics = get_metrics(metric_name_list)
eval_results = await deepeval_answers(instances, answers, eval_metrics["deepeval_metrics"])

In [None]:
# Collect the first metric's score from each test result, then average them.
correctness_scores = [
    test_result.metrics_data[0].score for test_result in eval_results.test_results
]
Correctness = statistics.mean(correctness_scores)
print(Correctness)

##### Calculating `"Comprehensiveness"`

In [None]:
metric_name_list = ["Comprehensiveness"]
eval_metrics = get_metrics(metric_name_list)
eval_results = await deepeval_answers(instances, answers, eval_metrics["deepeval_metrics"])

In [None]:
# Collect the first metric's score from each test result, then average them.
comprehensiveness_scores = [
    test_result.metrics_data[0].score for test_result in eval_results.test_results
]
Comprehensiveness = statistics.mean(comprehensiveness_scores)
print(Comprehensiveness)

##### Calculating `"Diversity"`

In [None]:
metric_name_list = ["Diversity"]
eval_metrics = get_metrics(metric_name_list)
eval_results = await deepeval_answers(instances, answers, eval_metrics["deepeval_metrics"])

In [None]:
# Collect the first metric's score from each test result, then average them.
diversity_scores = [
    test_result.metrics_data[0].score for test_result in eval_results.test_results
]
Diversity = statistics.mean(diversity_scores)
print(Diversity)

##### Calculating `"Empowerment"`

In [None]:
metric_name_list = ["Empowerment"]
eval_metrics = get_metrics(metric_name_list)
eval_results = await deepeval_answers(instances, answers, eval_metrics["deepeval_metrics"])

In [None]:
# Collect the first metric's score from each test result, then average them.
empowerment_scores = [
    test_result.metrics_data[0].score for test_result in eval_results.test_results
]
Empowerment = statistics.mean(empowerment_scores)
print(Empowerment)

##### Calculating `"Directness"`

In [None]:
metric_name_list = ["Directness"]
eval_metrics = get_metrics(metric_name_list)
eval_results = await deepeval_answers(instances, answers, eval_metrics["deepeval_metrics"])

In [None]:
# Collect the first metric's score from each test result, then average them.
directness_scores = [
    test_result.metrics_data[0].score for test_result in eval_results.test_results
]
Directness = statistics.mean(directness_scores)
print(Directness)

##### Calculating `"F1 Score"`

In [None]:
# Build the "F1" metric; the evaluation itself runs in the next cell.
metric_name_list = ["F1"]
eval_metrics = get_metrics(metric_name_list)

In [None]:
eval_results = await deepeval_answers(instances, answers, eval_metrics["deepeval_metrics"])

In [None]:
# Collect the first metric's score from each test result, then average them.
f1_scores = [
    test_result.metrics_data[0].score for test_result in eval_results.test_results
]
F1_score = statistics.mean(f1_scores)
print(F1_score)

##### Calculating `"Exact Match (EM) Score"`

In [None]:
metric_name_list = ["EM"]
eval_metrics = get_metrics(metric_name_list)
eval_results = await deepeval_answers(instances, answers, eval_metrics["deepeval_metrics"])

In [None]:
# Collect the first metric's score from each test result, then average them.
em_scores = [
    test_result.metrics_data[0].score for test_result in eval_results.test_results
]
EM = statistics.mean(em_scores)
print(EM)

### "simple_rag" as context provider


In [None]:
# Look up the context provider in `qa_context_providers` by name.
context_provider_name = "simple_rag"
context_provider = qa_context_providers[context_provider_name]

#### Generate Answers for QA Instances

In [None]:
random.seed(42)
instances = dataset if not num_samples else random.sample(dataset, num_samples)

out_path = "out"
if not Path(out_path).exists():
    Path(out_path).mkdir()
contexts_filename = out_path / Path(
    f"contexts_{dataset_name_or_filename.split('.')[0]}_{context_provider_name}.json"
)

answers = []
for instance in tqdm(instances, desc="Getting answers"):
    answer = await answer_qa_instance(instance, context_provider, contexts_filename)
    answers.append(answer)

#### Define Metrics for Evaluation and Calculate Score

##### Calculating `"Correctness"`

In [None]:
metric_name_list = ["Correctness"]
eval_metrics = get_metrics(metric_name_list)
eval_results = await deepeval_answers(instances, answers, eval_metrics["deepeval_metrics"])

In [None]:
# Collect the first metric's score from each test result, then average them.
correctness_scores = [
    test_result.metrics_data[0].score for test_result in eval_results.test_results
]
Correctness = statistics.mean(correctness_scores)
print(Correctness)

##### Calculating `"Comprehensiveness"`

In [None]:
metric_name_list = ["Comprehensiveness"]
eval_metrics = get_metrics(metric_name_list)
eval_results = await deepeval_answers(instances, answers, eval_metrics["deepeval_metrics"])

In [None]:
# Collect the first metric's score from each test result, then average them.
comprehensiveness_scores = [
    test_result.metrics_data[0].score for test_result in eval_results.test_results
]
Comprehensiveness = statistics.mean(comprehensiveness_scores)
print(Comprehensiveness)

##### Calculating `"Diversity"`

In [None]:
metric_name_list = ["Diversity"]
eval_metrics = get_metrics(metric_name_list)
eval_results = await deepeval_answers(instances, answers, eval_metrics["deepeval_metrics"])

In [None]:
# Collect the first metric's score from each test result, then average them.
diversity_scores = [
    test_result.metrics_data[0].score for test_result in eval_results.test_results
]
Diversity = statistics.mean(diversity_scores)
print(Diversity)

##### Calculating `"Empowerment"`

In [None]:
metric_name_list = ["Empowerment"]
eval_metrics = get_metrics(metric_name_list)
eval_results = await deepeval_answers(instances, answers, eval_metrics["deepeval_metrics"])

In [None]:
# Collect the first metric's score from each test result, then average them.
empowerment_scores = [
    test_result.metrics_data[0].score for test_result in eval_results.test_results
]
Empowerment = statistics.mean(empowerment_scores)
print(Empowerment)

##### Calculating `"Directness"`

In [None]:
metric_name_list = ["Directness"]
eval_metrics = get_metrics(metric_name_list)
eval_results = await deepeval_answers(instances, answers, eval_metrics["deepeval_metrics"])

In [None]:
# Collect the first metric's score from each test result, then average them.
directness_scores = [
    test_result.metrics_data[0].score for test_result in eval_results.test_results
]
Directness = statistics.mean(directness_scores)
print(Directness)

##### Calculating `"F1 Score"`

In [None]:
# Request the metric under the key "F1", the same key the F1 cells of the other
# context-provider sections pass to get_metrics ("F1 Score" is only the display name).
metric_name_list = ["F1"]
eval_metrics = get_metrics(metric_name_list)

In [None]:
eval_results = await deepeval_answers(instances, answers, eval_metrics["deepeval_metrics"])

In [None]:
# Collect the first metric's score from each test result, then average them.
f1_scores = [
    test_result.metrics_data[0].score for test_result in eval_results.test_results
]
F1_score = statistics.mean(f1_scores)
print(F1_score)

##### Calculating `"Exact Match (EM) Score"`

In [None]:
metric_name_list = ["EM"]
eval_metrics = get_metrics(metric_name_list)
eval_results = await deepeval_answers(instances, answers, eval_metrics["deepeval_metrics"])

In [None]:
# Collect the first metric's score from each test result, then average them.
em_scores = [
    test_result.metrics_data[0].score for test_result in eval_results.test_results
]
EM = statistics.mean(em_scores)
print(EM)

### "brute_force" as context provider


In [None]:
# Look up the context provider in `qa_context_providers` by name.
context_provider_name = "brute_force"
context_provider = qa_context_providers[context_provider_name]

#### Generate Answers for QA Instances

In [None]:
random.seed(42)
instances = dataset if not num_samples else random.sample(dataset, num_samples)

out_path = "out"
if not Path(out_path).exists():
    Path(out_path).mkdir()
contexts_filename = out_path / Path(
    f"contexts_{dataset_name_or_filename.split('.')[0]}_{context_provider_name}.json"
)

answers = []
for instance in tqdm(instances, desc="Getting answers"):
    answer = await answer_qa_instance(instance, context_provider, contexts_filename)
    answers.append(answer)

#### Define Metrics for Evaluation and Calculate Score

##### Calculating `"Correctness"`

In [None]:
metric_name_list = ["Correctness"]
eval_metrics = get_metrics(metric_name_list)
eval_results = await deepeval_answers(instances, answers, eval_metrics["deepeval_metrics"])

In [None]:
# Collect the first metric's score from each test result, then average them.
correctness_scores = [
    test_result.metrics_data[0].score for test_result in eval_results.test_results
]
Correctness = statistics.mean(correctness_scores)
print(Correctness)

##### Calculating `"Comprehensiveness"`

In [None]:
metric_name_list = ["Comprehensiveness"]
eval_metrics = get_metrics(metric_name_list)
eval_results = await deepeval_answers(instances, answers, eval_metrics["deepeval_metrics"])

In [None]:
# Collect the first metric's score from each test result, then average them.
comprehensiveness_scores = [
    test_result.metrics_data[0].score for test_result in eval_results.test_results
]
Comprehensiveness = statistics.mean(comprehensiveness_scores)
print(Comprehensiveness)

##### Calculating `"Diversity"`

In [None]:
metric_name_list = ["Diversity"]
eval_metrics = get_metrics(metric_name_list)
eval_results = await deepeval_answers(instances, answers, eval_metrics["deepeval_metrics"])

In [None]:
# Collect the first metric's score from each test result, then average them.
diversity_scores = [
    test_result.metrics_data[0].score for test_result in eval_results.test_results
]
Diversity = statistics.mean(diversity_scores)
print(Diversity)

##### Calculating `"Empowerment"`

In [None]:
metric_name_list = ["Empowerment"]
eval_metrics = get_metrics(metric_name_list)
eval_results = await deepeval_answers(instances, answers, eval_metrics["deepeval_metrics"])

In [None]:
# Collect the first metric's score from each test result, then average them.
empowerment_scores = [
    test_result.metrics_data[0].score for test_result in eval_results.test_results
]
Empowerment = statistics.mean(empowerment_scores)
print(Empowerment)

##### Calculating `"Directness"`

In [None]:
metric_name_list = ["Directness"]
eval_metrics = get_metrics(metric_name_list)
eval_results = await deepeval_answers(instances, answers, eval_metrics["deepeval_metrics"])

In [None]:
# Collect the first metric's score from each test result, then average them.
directness_scores = [
    test_result.metrics_data[0].score for test_result in eval_results.test_results
]
Directness = statistics.mean(directness_scores)
print(Directness)

##### Calculating `"F1 Score"`

In [None]:
# Build the "F1" metric; the evaluation itself runs in the next cell.
metric_name_list = ["F1"]
eval_metrics = get_metrics(metric_name_list)

In [None]:
eval_results = await deepeval_answers(instances, answers, eval_metrics["deepeval_metrics"])

In [None]:
# Collect the first metric's score from each test result, then average them.
f1_scores = [
    test_result.metrics_data[0].score for test_result in eval_results.test_results
]
F1_score = statistics.mean(f1_scores)
print(F1_score)

##### Calculating `"Exact Match (EM) Score"`

In [None]:
metric_name_list = ["EM"]
eval_metrics = get_metrics(metric_name_list)
eval_results = await deepeval_answers(instances, answers, eval_metrics["deepeval_metrics"])

In [None]:
# Collect the first metric's score from each test result, then average them.
em_scores = [
    test_result.metrics_data[0].score for test_result in eval_results.test_results
]
EM = statistics.mean(em_scores)
print(EM)