Evaluation of Synthetic Dataset
===

Now that we have generated a synthetic dataset and built a RAG pipeline, let's first evaluate how good the dataset is. We will then filter it down to a gold dataset and evaluate the RAG pipeline on that gold dataset.

In [2]:
import getpass
import json
import os

import dspy

In [3]:
# Work from the project root so the relative ./data and ./results paths used below resolve.
# Guarded so that re-running this cell does not climb one directory further each time.
if not os.path.isdir('./data/processed'):
    os.chdir('../')

In [4]:
# Location of the synthetic dataset JSON, relative to the current working directory.
DATASET_FPATH = './data/processed/dataset.json'

In [5]:
# Load the whole synthetic dataset into memory.
# NOTE(review): no explicit encoding is passed, so the platform default is used --
# confirm the file decodes correctly on every machine, or pass encoding='utf-8'.
with open(DATASET_FPATH) as dataset_file:
    dataset = json.loads(dataset_file.read())

In [6]:
# Show the top-level sections of the dataset.
dataset.keys()

dict_keys(['queries', 'answers', 'corpus', 'relevant_docs'])

In [7]:
# Show the first entry of every dataset section to get a feel for its shape.
for section, entries in dataset.items():
    print(f"{section}:")
    first_entry = next(iter(entries.items()), None)
    if first_entry is not None:
        entry_id, entry_value = first_entry
        print(f"\t{entry_id}: {entry_value}")
    print()



queries:
	d7b65e32-b094-465b-ad5d-9d1636707bff: Who directed the 2007 production of How to Curse?

answers:
	d7b65e32-b094-465b-ad5d-9d1636707bff: Josie Rourke

corpus:
	273a6e33-7a57-4b02-b704-9c13264f1769: = Robert Boulter = 
 Robert Boulter is an English film , television and theatre actor . He had a guest @-@ starring role on the television series The Bill in 2000 . This was followed by a starring role in the play Herons written by Simon Stephens , which was performed in 2001 at the Royal Court Theatre . He had a guest role in the television series Judge John Deed in 2002 . In 2004 Boulter landed a role as " Craig " in the episode " Teddy 's Story " of the television series The Long Firm ; he starred alongside actors Mark Strong and Derek Jacobi . He was cast in the 2005 theatre productions of the Philip Ridley play Mercury Fur , which was performed at the Drum Theatre in Plymouth and the Menier Chocolate Factory in London . He was directed by John Tiffany and starred alongside Ben

RAGAS
---

In [8]:
from ragas import evaluate

In [24]:
import pandas as pd

# Flatten the dataset into one row per (query, relevant document) pair, using the
# column names consumed by the RAGAS evaluation further down:
# question / ground_truths / answer / contexts.
# The synthetic answer is used both as the "answer" and as its ground truth.
data = [
    {
        "question": query_text,
        "ground_truths": [dataset['answers'].get(query_id)],
        "answer": dataset['answers'].get(query_id),
        "contexts": [dataset['corpus'].get(doc_id)],
    }
    for query_id, query_text in dataset['queries'].items()
    for doc_id in dataset['relevant_docs'].get(query_id, [])
]

df = pd.DataFrame(data)
df.head()

Unnamed: 0,question,ground_truths,answer,contexts
0,Who directed the 2007 production of How to Curse?,[Josie Rourke],Josie Rourke,[= Robert Boulter = \r\n Robert Boulter is an ...
1,"Who starred as ""Jason Tyler"" in the 2006 episo...",[Robert Boulter],Robert Boulter,[= = = 2006 – present = = = \r\n In 2006 Boult...
2,Who was Du Fu's paternal grandfather?,[Du Shenyan],Du Shenyan,[Since many of Du Fu 's poems feature morality...
3,When did Du Fu meet Li Bai for the first time?,[Autumn of 744],Autumn of 744,[Since many of Du Fu 's poems feature morality...
4,What was Du Fu's first official post in the ca...,[Registrar of the Right Commandant's office],Registrar of the Right Commandant's office,[Since many of Du Fu 's poems feature morality...


In [38]:
# Persist the flattened question/answer/context table; index=False keeps the row index out of the CSV.
df.to_csv('./data/processed/synthetic_dataset.csv', index=False)

In [26]:
from datasets import Dataset
# Convert the DataFrame into a `datasets.Dataset`, which is what `evaluate()` is given below.
ds = Dataset.from_pandas(df)

In [27]:
# SECURITY: an API key used to be hard-coded in this cell. Any key that was ever committed
# to a notebook must be treated as leaked and revoked.
# The key is now taken from the environment; only if it is missing do we prompt for it
# (getpass does not echo the value, so it never ends up in the saved notebook).
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass.getpass('OPENAI_API_KEY: ')

In [28]:
# SECURITY: the Langfuse key pair used to be hard-coded in this cell. Keys that were ever
# committed to a notebook must be treated as leaked and rotated.
# Both keys are now taken from the environment, with a non-echoing prompt as fallback.
for _langfuse_var in ('LANGFUSE_SECRET_KEY', 'LANGFUSE_PUBLIC_KEY'):
    if not os.environ.get(_langfuse_var):
        os.environ[_langfuse_var] = getpass.getpass(f'{_langfuse_var}: ')

# The host is not a secret, so it stays in the notebook.
os.environ["LANGFUSE_HOST"] = 'https://us.cloud.langfuse.com'

In [29]:
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_relevancy,
    context_recall,
    context_precision,
)

# Score the synthetic dataset with the RAGAS metrics.
# raise_exceptions=False is kept from the original run so that a failure while scoring a
# single sample does not abort the evaluation (presumably such samples are left without a
# score -- confirm against the ragas documentation).
try:
    result = evaluate(
        dataset=ds,
        metrics=[
            context_relevancy,
            context_precision,
            context_recall,
            faithfulness,
            answer_relevancy,
        ],
        raise_exceptions=False,
    )
except Exception as e:
    # Do not swallow the failure: if we only printed here, `result` would stay undefined
    # (or keep the value of an earlier run) and the cells below would either crash with a
    # confusing NameError or silently report stale numbers.
    print(f"RAGAS evaluation failed: {e}")
    raise



Evaluating:   0%|          | 0/3865 [00:00<?, ?it/s]

ERROR:ragas.executor:Runner in Executor raised an exception
Traceback (most recent call last):
  File "c:\Users\kuotz\anaconda3\envs\langchian2\lib\site-packages\ragas\executor.py", line 58, in _aresults
    r = await future
  File "c:\Users\kuotz\anaconda3\envs\langchian2\lib\asyncio\tasks.py", line 571, in _wait_for_one
    return f.result()  # May raise f.exception().
  File "c:\Users\kuotz\anaconda3\envs\langchian2\lib\site-packages\ragas\executor.py", line 91, in wrapped_callable_async
    return counter, await callable(*args, **kwargs)
  File "c:\Users\kuotz\anaconda3\envs\langchian2\lib\site-packages\ragas\metrics\base.py", line 91, in ascore
    raise e
  File "c:\Users\kuotz\anaconda3\envs\langchian2\lib\site-packages\ragas\metrics\base.py", line 87, in ascore
    score = await self._ascore(row=row, callbacks=group_cm, is_async=is_async)
  File "c:\Users\kuotz\anaconda3\envs\langchian2\lib\site-packages\ragas\metrics\_faithfulness.py", line 190, in _ascore
    assert isinstanc

In [22]:
# NOTE(review): the output recorded for this cell is
#   ValueError: Dataset feature "ground_truth" should be of type string
# and its execution count (22) predates the cell that builds `df` (24), so it looks like
# a leftover from an earlier attempt -- confirm whether it is still needed.
from ragas.metrics import (
    answer_relevancy,
    answer_similarity,
    context_precision,
    context_recall,
    context_relevancy,
    faithfulness,
)

# Same evaluation as above plus answer_similarity; any exception propagates.
extended_metrics = [
    context_precision,
    faithfulness,
    answer_relevancy,
    context_recall,
    answer_similarity,
    context_relevancy,
]

result = evaluate(dataset=ds, metrics=extended_metrics)

ValueError: Dataset feature "ground_truth" should be of type string

In [30]:
# Display the evaluation result (one aggregate score per metric).
result

{'context_relevancy': 0.7795, 'context_precision': 0.7685, 'context_recall': 0.7931, 'faithfulness': 0.7821, 'answer_relevancy': 0.7886}

In [31]:
# Per-sample view of the evaluation: the first rows of the result as a DataFrame.
result.to_pandas().head()

Unnamed: 0,question,ground_truths,answer,contexts,ground_truth,context_relevancy,context_precision,context_recall,faithfulness,answer_relevancy
0,Who directed the 2007 production of How to Curse?,[Josie Rourke],Josie Rourke,[= Robert Boulter = \r\n Robert Boulter is an ...,Josie Rourke,,,,,
1,"Who starred as ""Jason Tyler"" in the 2006 episo...",[Robert Boulter],Robert Boulter,[= = = 2006 – present = = = \r\n In 2006 Boult...,Robert Boulter,,,,,
2,Who was Du Fu's paternal grandfather?,[Du Shenyan],Du Shenyan,[Since many of Du Fu 's poems feature morality...,Du Shenyan,,,,,
3,When did Du Fu meet Li Bai for the first time?,[Autumn of 744],Autumn of 744,[Since many of Du Fu 's poems feature morality...,Autumn of 744,,,,,
4,What was Du Fu's first official post in the ca...,[Registrar of the Right Commandant's office],Registrar of the Right Commandant's office,[Since many of Du Fu 's poems feature morality...,Registrar of the Right Commandant's office,,,,,


In [32]:
# Use the save_result function to save the result to a csv file.
import time

def save_result(result, results_dir='results'):
    """Write an evaluation result to a timestamped CSV file.

    The file is named ``<results_dir>/eval_synthetic_data_<YYYYmmdd-HHMMSS>.csv``.
    The timestamp has one-second resolution, so two calls within the same second
    target the same file.

    Args:
        result: Any object exposing ``to_pandas()`` that returns a DataFrame,
            e.g. the value returned by ``evaluate``.
        results_dir: Directory to write into. It is created (including missing
            parents) when it does not exist yet. Defaults to ``'results'``.
    """
    exp_name = f"{results_dir}/eval_synthetic_data_{time.strftime('%Y%m%d-%H%M%S')}"
    print(f"Saving results to {exp_name}.csv")

    # exist_ok=True replaces the separate exists()/makedirs() pair, which could fail
    # if the directory appeared between the check and the creation.
    os.makedirs(results_dir, exist_ok=True)

    # Write to file (the DataFrame index is written as the first column).
    result.to_pandas().to_csv(f"{exp_name}.csv")

In [33]:
# Write the evaluation result to a timestamped CSV (comment this line out to skip saving).
save_result(result)

Saving results to results/eval_synthetic_data_20240911-232529.csv


In [35]:
# Name of this notebook, exported for wandb (presumably so runs can be attributed to it --
# see the wandb environment-variable documentation).
os.environ['WANDB_NOTEBOOK_NAME'] = '04_eval_synth_data.ipynb'

In [36]:
# SECURITY: a Weights & Biases API key used to be hard-coded in this cell. Any key that was
# ever committed to a notebook must be treated as leaked and revoked.
# The key is now taken from the environment, with a non-echoing prompt as fallback.
if not os.environ.get('WANDB_API_KEY'):
    os.environ['WANDB_API_KEY'] = getpass.getpass('WANDB_API_KEY: ')

In [37]:
# Logging to wandb

import wandb

# Start a new wandb run to track this evaluation. Using the run as a context manager
# closes it even when logging raises, so a failed cell does not leave an open run
# behind in the kernel.
with wandb.init(
    # wandb project where this run will be logged
    project="wikitext-rag-synthetic-eval",

    # hyperparameters and run metadata
    config={
        # NOTE(review): "chuck_size" / "sentence_chunck_overlap" look like typos of "chunk";
        # kept verbatim so this run stays comparable with runs already logged under these keys.
        # The values are hard-coded here -- confirm they match the pipeline that built the dataset.
        "chuck_size": 1024,
        "sentence_chunck_overlap": 200,
        # `ds` holds one row per (question, relevant document) pair, so this is the row count.
        "number_of_questions": len(ds),
        "comments": "Synthetic dataset where ground truth and the answer are the same.",
    },
) as run:
    # Log the aggregate metric scores of the evaluation.
    run.log(result)

wandb: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
wandb: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:wandb: Appending key for api.wandb.ai to your netrc file: C:\Users\kuotz\_netrc


VBox(children=(Label(value='0.015 MB of 0.025 MB uploaded\r'), FloatProgress(value=0.5948204782250206, max=1.0…

0,1
answer_relevancy,▁
context_precision,▁
context_recall,▁
context_relevancy,▁
faithfulness,▁

0,1
answer_relevancy,0.78857
context_precision,0.76852
context_recall,0.79314
context_relevancy,0.77951
faithfulness,0.78206


-----