# RAG
## Evaluation!


In [1]:
from evaluation import test
import importlib
from tqdm import tqdm as tqdm

In [2]:
# Load the evaluation test cases through the project's `evaluation.test` module.
tests = test.load_tests()

In [3]:
# Number of loaded test cases (displayed as the cell output).
len(tests)

150

In [4]:
# Pick one test case and print its fields, one per line, to show what a test
# case carries.
example = tests[35]
for field_name in ("question", "category", "reference_answer", "keywords"):
    print(getattr(example, field_name))


Which product does Sarah Williams lead design for?
direct_fact
Sarah Williams leads design for the Homellm home insurance portal.
['Homellm', 'Sarah']


In [5]:
from collections import Counter

# Tally how many test cases fall into each category; the Counter is displayed
# as the cell output.
count = Counter(case.category for case in tests)
count

Counter({'direct_fact': 70,
         'temporal': 20,
         'spanning': 20,
         'comparative': 10,
         'numerical': 10,
         'relationship': 10,
         'holistic': 10})

In [6]:
import evaluation
import evaluation.eval

# importlib.reload() re-executes only the module object it is given; it does not
# recurse into submodules. Reloading just the `evaluation` package would leave a
# previously imported `evaluation.eval` untouched, so edits to eval.py would not
# be picked up by the `from ... import` below. Reload the submodule explicitly.
importlib.reload(evaluation)
importlib.reload(evaluation.eval)
from evaluation.eval import evaluate_retrieval, evaluate_answer

In [7]:
# Score retrieval for the sample test case; the returned object is displayed as
# the cell output.
evaluate_retrieval(example)

RetrievalEval(mrr=1.0, ndcg=0.9196711652255352, keywords_found=2, total_keywords=2, keyword_coverage=100.0)

In [8]:
# Evaluate the generated answer for the sample test case; the result is unpacked
# into three values.
# NOTE(review): binding the name `eval` shadows Python's built-in eval() for the
# rest of the kernel session. Consider renaming it, together with the later
# cells that read `eval`.
eval, answer, chunks = evaluate_answer(example)

In [9]:
# Display the answer-evaluation object obtained for the sample test case.
eval

AnswerEval(feedback='The answer correctly identifies the product Sarah Williams leads design for, matching the reference answer exactly. It is accurate, complete, and directly addresses the question without extraneous information.', accuracy=5.0, completeness=5.0, relevance=5.0)

In [10]:
# Print the evaluation's feedback text followed by its three scores, one per line.
for field_name in ("feedback", "accuracy", "completeness", "relevance"):
    print(getattr(eval, field_name))

The answer correctly identifies the product Sarah Williams leads design for, matching the reference answer exactly. It is accurate, complete, and directly addresses the question without extraneous information.
5.0
5.0
5.0


In [11]:
# Per-test retrieval metrics; each list receives one entry per test case.
mrr = []
ndcg = []
keyword_coverage = []
# Denominator used when averaging the metrics.
num_tests = len(tests)

def run_retriveal(test):
    """Evaluate retrieval for one test case and record its metrics.

    Calls ``evaluate_retrieval(test)`` and appends the result's ``mrr``,
    ``ndcg`` and ``keyword_coverage`` to the module-level lists of the same
    names.

    All three values are read before anything is appended, so if reading one
    of them raises, no list is modified and the lists stay the same length.
    (Appending as each value is read could leave a partial entry behind, and
    a caller that then pads every list on failure would put them out of sync.)

    Args:
        test: The test case passed through to ``evaluate_retrieval``.

    Raises:
        Whatever ``evaluate_retrieval`` or the attribute reads raise; nothing
        is caught here.
    """
    scores = evaluate_retrieval(test)
    values = (scores.mrr, scores.ndcg, scores.keyword_coverage)
    for bucket, value in zip((mrr, ndcg, keyword_coverage), values):
        bucket.append(value)

# Evaluate retrieval for every test case. A failing case is scored 0 on every
# metric so that each list keeps one entry per test.
# The loop variable is deliberately not named `test`: that name is the imported
# `evaluation.test` module, and rebinding it would break re-running
# `test.load_tests()` in the same kernel.
retrieval_failures = []
for index, test_case in enumerate(tqdm(tests)):
    try:
        run_retriveal(test_case)
    except Exception as exc:  # not a bare except: a kernel interrupt must still stop the loop
        print(f"Ran into exception on test {index}: {exc!r}")
        retrieval_failures.append(index)
        mrr.append(0)
        ndcg.append(0)
        keyword_coverage.append(0)

# Make silent zero-scores visible: they pull the averages down.
if retrieval_failures:
    print(
        f"{len(retrieval_failures)} of {len(tests)} retrieval evaluations failed "
        f"and were scored 0: {retrieval_failures}"
    )

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 150/150 [01:12<00:00,  2.07it/s]


In [12]:
# Report the mean of each retrieval metric over all test cases.
for label, values in (
    ("Average MRR: ", mrr),
    ("Average NDCG: ", ndcg),
    ("Average keyword_coverage: ", keyword_coverage),
):
    print(label, sum(values) / num_tests)

Average MRR:  0.8544604497354494
Average NDCG:  0.8411499273298114
Average keyword_coverage:  95.12222222222222


In [13]:
# Per-test answer-quality scores; each list receives one entry per test case.
accuracy = []
completeness = []
relevance = []
# Denominator used when averaging the scores.
num_tests = len(tests)

def run_evaluation(test):
    """Evaluate the generated answer for one test case and record its scores.

    Calls ``evaluate_answer(test)``, keeps the first of the three returned
    values, and appends its ``accuracy``, ``completeness`` and ``relevance``
    to the module-level lists of the same names.

    All three scores are read before anything is appended, so if reading one
    of them raises, no list is modified and the lists stay the same length.
    (Appending as each value is read could leave a partial entry behind, and
    a caller that then pads every list on failure would put them out of sync.)

    Args:
        test: The test case passed through to ``evaluate_answer``.

    Raises:
        Whatever ``evaluate_answer`` or the attribute reads raise; nothing is
        caught here.
    """
    scores, _, _ = evaluate_answer(test)
    values = (scores.accuracy, scores.completeness, scores.relevance)
    for bucket, value in zip((accuracy, completeness, relevance), values):
        bucket.append(value)

# Evaluate the generated answer for every test case. A failing case is scored 0
# on every metric so that each list keeps one entry per test.
# The loop variable is deliberately not named `test`: that name is the imported
# `evaluation.test` module, and rebinding it would break re-running
# `test.load_tests()` in the same kernel.
answer_failures = []
for index, test_case in enumerate(tqdm(tests)):
    try:
        run_evaluation(test_case)
    except Exception as exc:  # not a bare except: a kernel interrupt must still stop the loop
        print(f"Ran into exception on test {index}: {exc!r}")
        answer_failures.append(index)
        accuracy.append(0)
        completeness.append(0)
        relevance.append(0)

# Make silent zero-scores visible: they pull the averages down.
if answer_failures:
    print(
        f"{len(answer_failures)} of {len(tests)} answer evaluations failed "
        f"and were scored 0: {answer_failures}"
    )

 35%|█████████████████████████████████▏                                                            | 53/150 [01:58<03:33,  2.20s/it]

Ran into exception


 60%|████████████████████████████████████████████████████████▍                                     | 90/150 [03:19<02:13,  2.23s/it]

Ran into exception


 73%|███████████████████████████████████████████████████████████████████▌                         | 109/150 [04:04<01:40,  2.46s/it]

Ran into exception


 75%|██████████████████████████████████████████████████████████████████████                       | 113/150 [04:13<01:29,  2.43s/it]

Ran into exception


 77%|███████████████████████████████████████████████████████████████████████▎                     | 115/150 [04:18<01:22,  2.35s/it]

Ran into exception


 81%|███████████████████████████████████████████████████████████████████████████                  | 121/150 [04:32<01:05,  2.26s/it]

Ran into exception


 84%|██████████████████████████████████████████████████████████████████████████████               | 126/150 [04:42<00:53,  2.25s/it]

Ran into exception


100%|█████████████████████████████████████████████████████████████████████████████████████████████| 150/150 [05:40<00:00,  2.27s/it]


In [14]:
# Report the mean of each answer-quality score over all test cases.
for label, values in (
    ("Average Accuracy: ", accuracy),
    ("Average completeness: ", completeness),
    ("Average Relevance: ", relevance),
):
    print(label, sum(values) / num_tests)

Average Accuracy:  4.453333333333333
Average completeness:  4.08
Average Relevance:  4.566666666666666
