In [1]:
import pandas as pd
from ast import literal_eval
pd.set_option(
    'display.precision', 2,
    'display.max_colwidth', 200
)
from collections import namedtuple
from tqdm.notebook import tqdm
from deepeval.metrics import (
    ContextualPrecisionMetric,
    FaithfulnessMetric,
    ContextualRecallMetric,
    AnswerRelevancyMetric,
)
from deepeval.test_case import LLMTestCase
from deepeval.evaluate import evaluate

In [2]:
def run_evaluations(qa_set_df, metrics, nrows=5):
    """
    Score the leading rows of a Q/A dataframe with DeepEval.

    One LLMTestCase is built per row and the whole batch is handed to
    deepeval's evaluate in a single call, with the progress indicator
    and the printing of results switched off.
    :param qa_set_df: A dataframe with the columns 'query', 'answer',
    'ground_truth' and 'contexts'.
    :param metrics: A list of metrics to execute on each test case.
    :param nrows: How many rows, counted from the top of qa_set_df,
    are turned into test cases.
    :return: Whatever deepeval.evaluate returns for those test cases.
    """

    def _as_test_case(record):
        # NOTE(review): the 'contexts' cell is wrapped as-is into a
        # one-element list. If the CSV stores a stringified list of
        # passages, it reaches DeepEval as one single context -- confirm.
        return LLMTestCase(
            input=record['query'],
            actual_output=record['answer'],
            expected_output=record['ground_truth'],
            retrieval_context=[record['contexts']],
        )

    subset = qa_set_df.head(nrows)
    test_cases = [_as_test_case(record) for _, record in subset.iterrows()]

    evaluate_kwargs = dict(
        test_cases=test_cases,
        metrics=metrics,
        show_indicator=False,
        print_results=False,
    )
    return evaluate(**evaluate_kwargs)

In [3]:
def deepeval_on_test_sets(test_sets, metrics, nrows=5):
    """
    Runs DeepEval on each element of test_sets and returns, for every test
    set, a named tuple holding the name of the RAG pipeline and the DeepEval
    results obtained for it.

    The pipeline name is the file name of the test set without its final
    extension.
    :param test_sets: The list of paths to the CSV test sets containing the
    Q/A pairs, ground truth and context of a RAG pipeline.
    :param metrics: A list of metrics to execute on each test case.
    :param nrows: Number of rows of the test set to be used by DeepEval to
    evaluate a RAG pipeline.
    :return: A list of ModelEval named tuples (modelmix, results), one per
    test set and in the same order as test_sets.
    :raises OSError: (e.g. FileNotFoundError) when any test set cannot be
    opened; raised before any evaluation starts.
    """
    # Imported locally so the function does not depend on edits to the
    # notebook's import cell.
    from pathlib import Path

    # Fail fast: try opening every test set first so that a missing file
    # raises an exception before any evaluation is run.
    for path in test_sets:
        with open(path):
            pass

    ModelEval = namedtuple("ModelEval", "modelmix results")
    model_evals = []

    for tset in tqdm(test_sets):
        # Strip only the final extension. Splitting on the first '.' would
        # truncate names that contain dots, e.g. "...bge-base-en-v1.5.csv"
        # would become "...bge-base-en-v1".
        model_mix = Path(tset).stem
        print(f"Evaluating {model_mix}")
        qa_set_df = pd.read_csv(filepath_or_buffer=tset)
        results = run_evaluations(qa_set_df, metrics, nrows=nrows)
        model_evals.append(ModelEval(model_mix, results))

    return model_evals

In [4]:
def deepeval_to_dict(evals):
    """
    Converts a list of DeepEval results into a dictionary that can be used to
    create Pandas dataframes.

    For every metric found in the results, two columns are produced:
    "<metric name> Score" and "<metric name> Reason", each a list with one
    entry per test result. A dataset without any result maps to an empty
    dictionary.
    :param evals: A list of named tuples with the attributes 'modelmix' and
    'results', as returned by deepeval_on_test_sets.
    :return: A dictionary with the dataset name as key and, as value, a
    dictionary mapping column names to lists of scores / reasons.
    """
    res_summary = {}
    for dset in evals:
        columns = res_summary[dset.modelmix] = {}
        for result in dset.results:
            for metric in result.metrics:
                # Columns are created on first use, so an empty result list
                # does not fail and no column has to be pre-seeded from the
                # first result.
                columns.setdefault(
                    f"{metric.__name__} Score", []).append(metric.score)
                columns.setdefault(
                    f"{metric.__name__} Reason", []).append(metric.reason)
    return res_summary
    

In [5]:
# Locations of the inference results produced by each RAG variant;
# all of them were generated on the same Q/A pairs evaluation dataset.
PATH_BASE = "../../04-RAG_Variants"
SIMPLE_RAG_PATH = (
    PATH_BASE
    + "/01-Simple_Retrieval/"
    + "Standard_RAG_zephyr-7b-alpha_bge-base-en-v1.5.csv"
)
SENTENCE_WINDOW_RAG_PATH = (
    PATH_BASE
    + "/02-Sentence_Window_Retrieval/"
    + "Sentence_Window_RAG_zephyr-7b-alpha_bge-base-en-v1.5.csv"
)
AUTO_MERGE_RAG_PATH = (
    PATH_BASE
    + "/03-Auto_Merging_Retrieval/"
    + "Auto_Merging_RAG_zephyr-7b-alpha_bge-base-en-v1.5.csv"
)
test_sets = [SIMPLE_RAG_PATH, SENTENCE_WINDOW_RAG_PATH, AUTO_MERGE_RAG_PATH]

# Name of the OpenAI LLM that acts as the evaluator (judge)
JUDGE_LLM = "gpt-3.5-turbo-0125"

# Settings shared by every DeepEval metric, plus the number of
# rows of each test set to evaluate.
DECISION_THRESHOLD = 0.5
NUM_EVAL_SAMPLES = 30
eval_parameters = {"threshold": DECISION_THRESHOLD, "model": JUDGE_LLM}

# One instance of each DeepEval metric to apply on every test case,
# all built with the same evaluation parameters.
metrics = [
    metric_cls(**eval_parameters)
    for metric_cls in (
        ContextualPrecisionMetric,
        ContextualRecallMetric,
        AnswerRelevancyMetric,
        FaithfulnessMetric,
    )
]

In [6]:
# Score the first NUM_EVAL_SAMPLES rows of every test set with DeepEval
evals = deepeval_on_test_sets(test_sets, metrics, nrows=NUM_EVAL_SAMPLES)

  0%|          | 0/3 [00:00<?, ?it/s]

Evaluating Standard_RAG_zephyr-7b-alpha_bge-base-en-v1
Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating Sentence_Window_RAG_zephyr-7b-alpha_bge-base-en-v1
Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating Auto_Merging_RAG_zephyr-7b-alpha_bge-base-en-v1
Event loop is already running. Applying nest_asyncio patch to allow async execution...


### Display DeepEval summarized results

- Notice that __no RAG approach is an absolute winner__. Further tuning is required to find a configuration
that maximizes the metric(s) of interest.
- It may be (too) hard to make a single RAG approach achieve
the highest scores across the board.  

In [7]:
# Show the evaluation results as one table: the per-test-case scores of
# each RAG variant are condensed into descriptive statistics so the
# variants can be compared side by side.
print("DeepEval Results")
summary = deepeval_to_dict(evals)
labels = list(summary)
dframes = [
    pd.DataFrame.from_dict(scores).describe(percentiles=[])
    for scores in summary.values()
]
results_df = pd.concat(dframes, keys=labels)
display(results_df)

DeepEval Results


Unnamed: 0,Unnamed: 1,Contextual Precision Score,Contextual Recall Score,Answer Relevancy Score,Faithfulness Score
Standard_RAG_zephyr-7b-alpha_bge-base-en-v1,count,30.0,30.0,30.0,30.0
Standard_RAG_zephyr-7b-alpha_bge-base-en-v1,mean,0.76,0.8,0.76,0.67
Standard_RAG_zephyr-7b-alpha_bge-base-en-v1,std,0.42,0.33,0.26,0.26
Standard_RAG_zephyr-7b-alpha_bge-base-en-v1,min,0.0,0.0,0.11,0.0
Standard_RAG_zephyr-7b-alpha_bge-base-en-v1,50%,1.0,1.0,0.83,0.72
Standard_RAG_zephyr-7b-alpha_bge-base-en-v1,max,1.0,1.0,1.0,1.0
Sentence_Window_RAG_zephyr-7b-alpha_bge-base-en-v1,count,30.0,30.0,30.0,30.0
Sentence_Window_RAG_zephyr-7b-alpha_bge-base-en-v1,mean,0.88,0.68,0.9,0.68
Sentence_Window_RAG_zephyr-7b-alpha_bge-base-en-v1,std,0.3,0.38,0.19,0.31
Sentence_Window_RAG_zephyr-7b-alpha_bge-base-en-v1,min,0.0,0.0,0.33,0.0
