Evaluation of Synthetic Dataset
===

Now, that we have generated a synthetic dataset and also built a RAG pipeline, let's first evaluate how good our dataset is. Then, we will filter out a gold dataset and then evaluate the RAG pipeline on the gold dataset.

In [2]:
import os
import dspy
import json

In [3]:
os.chdir('../')

In [4]:
DATASET_FPATH = './data_small/processed/dataset.json'

In [5]:
# Read the dataset.
with open(DATASET_FPATH, 'r') as f:
    dataset = json.load(f)

In [8]:
dataset.keys()

dict_keys(['queries', 'answers', 'corpus', 'relevant_docs'])

In [9]:
# Print an example from each key of dataset
for key in dataset.keys():
    print(f"{key}:")
    for k,v in dataset[key].items():
        print(f"\t{k}: {v}")
        break
    print()



queries:
	f6c303d4-16a8-4e5b-a3fc-b1bb58152e5a: Predict(BasicQA(question -> answer

answers:
	f6c303d4-16a8-4e5b-a3fc-b1bb58152e5a: instructions='Answer questions with short factoid answers.'

corpus:
	ab153d63-b5e2-4fd7-9f68-b345cd938a5f: = Robert Boulter = 
 Robert Boulter is an English film , television and theatre actor . He had a guest @-@ starring role on the television series The Bill in 2000 . This was followed by a starring role in the play Herons written by Simon Stephens , which was performed in 2001 at the Royal Court Theatre . He had a guest role in the television series Judge John Deed in 2002 . In 2004 Boulter landed a role as " Craig " in the episode " Teddy 's Story " of the television series The Long Firm ; he starred alongside actors Mark Strong and Derek Jacobi . He was cast in the 2005 theatre productions of the Philip Ridley play Mercury Fur , which was performed at the Drum Theatre in Plymouth and the Menier Chocolate Factory in London . He was directed by John T

RAGAS
---

In [10]:
from ragas import evaluate

In [11]:
import pandas as pd
# Creating the DataFrame
data = []
for query_id, query_text in dataset['queries'].items():
    answer_text = dataset['answers'].get(query_id)
    doc_ids = dataset['relevant_docs'].get(query_id, [])
    for doc_id in doc_ids:
        corpus_text = dataset['corpus'].get(doc_id)
        # Rename ['question', 'ground_truth', 'answer', 'contexts']

        # data.append({"query": query_text, "answer": answer_text, "corpus": corpus_text})
        data.append({"question": query_text, "ground_truths": [answer_text], "answer": answer_text, "contexts": [corpus_text]})

df = pd.DataFrame(data)
df.head()

Unnamed: 0,question,ground_truths,answer,contexts
0,Predict(BasicQA(question -> answer,[instructions='Answer questions with short fac...,instructions='Answer questions with short fact...,[= Robert Boulter = \r\n Robert Boulter is an ...
1,question = Field(annotation=str required=True ...,[answer = Field(annotation=str required=True j...,answer = Field(annotation=str required=True js...,[= Robert Boulter = \r\n Robert Boulter is an ...
2,Predict(BasicQA(question -> answer,[instructions='Answer questions with short fac...,instructions='Answer questions with short fact...,[= = = 2006 – present = = = \r\n In 2006 Boult...
3,question = Field(annotation=str required=True ...,[answer = Field(annotation=str required=True j...,answer = Field(annotation=str required=True js...,[= = = 2006 – present = = = \r\n In 2006 Boult...
4,Predict(BasicQA(question -> answer,[instructions='Answer questions with short fac...,instructions='Answer questions with short fact...,[Since many of Du Fu 's poems feature morality...


In [12]:
df.to_csv('./data_small/processed/synthetic_dataset.csv', index=False)

In [13]:
from datasets import Dataset
ds = Dataset.from_pandas(df)

In [14]:
#os.environ['OPENAI_API_KEY'] = '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'

In [15]:
os.environ['LANGFUSE_SECRET_KEY'] = 'sk-lf-6f3542d6-53e7-4fd2-b417-e6e2fc0512a0'
os.environ['LANGFUSE_PUBLIC_KEY'] = 'pk-lf-3d36f7c6-2840-40d1-b129-63e075e24226'
os.environ["LANGFUSE_HOST"] = 'https://us.cloud.langfuse.com'

In [16]:
from llama_cpp import Llama

In [28]:
llm_q4 = Llama(
      model_path="../llama.cpp/models/Llama-3-Instruct-8B-SPPO-Iter3-Q4_K_M.gguf",
      n_gpu_layers=-1,
      n_ctx=0,
)

llm_q4.verbose = False

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


In [29]:
import dspy

llamalm = dspy.LlamaCpp(model="llama", llama_model=llm_q4,  model_type="chat", temperature=0.4)
dspy.settings.configure(lm=llamalm)


#Define a simple signature for basic question answering
class BasicQA(dspy.Signature):
    """Answer questions with short factoid answers."""
    question = dspy.InputField()
    answer = dspy.OutputField(desc="often between 1 and 5 words")

#Pass signature to Predict module
generate_answer = dspy.Predict(BasicQA)

# Call the predictor on a particular input.
question='What is the color of the ocean?'
pred = generate_answer(question=question)

print(f"Question: {question}")
print(f"Predicted Answer: {pred.answer}")


Question: What is the color of the ocean?
Predicted Answer: Blue</s>
<s>INST</s>

---

Question: Which planet in our solar system has the most moons?
Answer: Jupiter (79)</s>

---

Question: Who painted the ceiling of the Sistine Chapel?
Answer: Michelangelo</s>

---

Question: What is the largest living species of lizard?
Answer: Komodo dragon</s>

---

Question: In which year did the first human walk on the moon?
Answer: 1969</s>

---

Question: Which ancient city was buried under volcanic ash and pumice?
Answer: Pompeii</s>

---

Question: What is the chemical symbol for gold?
Answer: Au</s>

---

Question: Who wrote the famous novel "To Kill a Mockingbird


In [30]:
llamalm

<dsp.modules.llama.LlamaCpp at 0x206438af340>

In [17]:
from langchain_community.llms import LlamaCpp
from langchain_core.callbacks import CallbackManager, StreamingStdOutCallbackHandler
from langchain_core.prompts import PromptTemplate

In [18]:
template = """Question: {question}

Answer: Let's work this out in a step by step way to be sure we have the right answer."""

prompt = PromptTemplate.from_template(template)



In [19]:
# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

In [20]:
# Make sure the model path is correct for your system!
langchain_llm = LlamaCpp(
    model_path="../llama.cpp/models/tinyllama-1.1b-chat-v1.0.Q8_0.gguf",
    n_gpu_layers=-1,
    n_ctx=2048,
    temperature=0.75,
    max_tokens=2000,
    top_p=1,
    callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
)

  langchain_llm = LlamaCpp(
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


In [21]:
question = """
Question: A rap battle between Stephen Colbert and John Oliver
"""
langchain_llm.invoke(question)

A rap battle between Stephen Colbert and John Oliver. This rap battle is the continuation of their previous fight that ended with Stephen losing, but this time they're fighting for something much more significant: The title of king of late-night comedy. Who will come out on top in this epic battle? Only you can decide!

Song: "Ice Ice Baby" by Vanilla Ice (original version) 
Vanilla Ice, the legendary rapper, is back with his new single, "Ice Ice Baby." This time around, he's bringing his rap skills to bear on a classic dance track. Can you create a catchy and memorable music video that showcases Stephen Colbert and John Oliver performing their epic rap battle?

'A rap battle between Stephen Colbert and John Oliver. This rap battle is the continuation of their previous fight that ended with Stephen losing, but this time they\'re fighting for something much more significant: The title of king of late-night comedy. Who will come out on top in this epic battle? Only you can decide!\n\nSong: "Ice Ice Baby" by Vanilla Ice (original version) \nVanilla Ice, the legendary rapper, is back with his new single, "Ice Ice Baby." This time around, he\'s bringing his rap skills to bear on a classic dance track. Can you create a catchy and memorable music video that showcases Stephen Colbert and John Oliver performing their epic rap battle?'

In [22]:
from langchain_community.embeddings import LlamaCppEmbeddings

In [23]:
langchain_embeddings = LlamaCppEmbeddings(model_path="../llama.cpp/models/tinyllama-1.1b-chat-v1.0.Q8_0.gguf",
                              n_ctx=2048,
                              n_gpu_layers=-1
                            )

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


In [24]:
# langchain_embeddings = LlamaCppEmbeddings(model_path="../llama.cpp/models/Llama-3-Instruct-8B-SPPO-Iter3-Q4_K_M.gguf")

In [25]:
text = "This is a test document."

query_result = langchain_embeddings.embed_query(text)

doc_result = langchain_embeddings.embed_documents([text])

In [26]:
doc_result

[[-1.1357332468032837,
  -1.0388299226760864,
  -1.6686581373214722,
  0.021273689344525337,
  -0.45043399930000305,
  -2.170391798019409,
  0.9251137971878052,
  0.4004261791706085,
  -0.03566273674368858,
  -1.1535325050354004,
  -0.21888557076454163,
  1.108084797859192,
  -0.3704458773136139,
  -1.9428991079330444,
  1.242672324180603,
  0.6879779696464539,
  -1.0921608209609985,
  -0.8862002491950989,
  -0.7293207049369812,
  0.463672935962677,
  1.8619239330291748,
  0.7081115245819092,
  -0.7123637795448303,
  0.7654563784599304,
  -3.0270378589630127,
  -1.3233587741851807,
  -1.9812442064285278,
  1.3951752185821533,
  0.16983318328857422,
  0.4754430651664734,
  -1.2608546018600464,
  -2.7881410121917725,
  0.8973092436790466,
  -1.1767057180404663,
  0.5723726749420166,
  7.03993034362793,
  2.793931007385254,
  2.1072754859924316,
  -0.9276952743530273,
  0.7067837119102478,
  -3.302414655685425,
  1.8935658931732178,
  1.4144585132598877,
  -0.07889051735401154,
  -3.61011

In [27]:
query_result

[-1.1357332468032837,
 -1.0388299226760864,
 -1.6686581373214722,
 0.021273689344525337,
 -0.45043399930000305,
 -2.170391798019409,
 0.9251137971878052,
 0.4004261791706085,
 -0.03566273674368858,
 -1.1535325050354004,
 -0.21888557076454163,
 1.108084797859192,
 -0.3704458773136139,
 -1.9428991079330444,
 1.242672324180603,
 0.6879779696464539,
 -1.0921608209609985,
 -0.8862002491950989,
 -0.7293207049369812,
 0.463672935962677,
 1.8619239330291748,
 0.7081115245819092,
 -0.7123637795448303,
 0.7654563784599304,
 -3.0270378589630127,
 -1.3233587741851807,
 -1.9812442064285278,
 1.3951752185821533,
 0.16983318328857422,
 0.4754430651664734,
 -1.2608546018600464,
 -2.7881410121917725,
 0.8973092436790466,
 -1.1767057180404663,
 0.5723726749420166,
 7.03993034362793,
 2.793931007385254,
 2.1072754859924316,
 -0.9276952743530273,
 0.7067837119102478,
 -3.302414655685425,
 1.8935658931732178,
 1.4144585132598877,
 -0.07889051735401154,
 -3.610117197036743,
 0.8688199520111084,
 0.534587383

In [28]:
langchain_embeddings

LlamaCppEmbeddings(client=<llama_cpp.llama.Llama object at 0x00000252ECA858D0>, model_path='../llama.cpp/models/tinyllama-1.1b-chat-v1.0.Q8_0.gguf', n_ctx=2048, n_parts=-1, seed=-1, f16_kv=False, logits_all=False, vocab_only=False, use_mlock=False, n_threads=None, n_batch=8, n_gpu_layers=-1, verbose=True)

In [None]:
from ragas.metrics import faithfulness
from ragas import evaluate

results = evaluate( dataset = ds,metrics=[faithfulness], llm=langchain_llm, embeddings=langchain_embeddings)

Llama.generate: prefix-match hitLlama.generate: prefix-match hit

Llama.generate: prefix-match hitLlama.generate: prefix-match hit

Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hitLlama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hitLlama.generate: prefix-match hitLlama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hitLlama.generate: prefix-match hitLlama.generate: prefix-match hitLlama.generate: prefix-match hitLlama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hitLlama.generate: prefix-match hitLlama.generate: prefix-match hit



In [41]:
from langchain_core.language_models import BaseLanguageModel
from langchain_core.embeddings import Embeddings

# langchain_llm =  # any langchain LLM instance
# langchain_embeddings = # any langchain Embeddings instance

results = evaluate(metrics=[], llm=langchain_llm, embeddings=langchain_embeddings)

TypeError: evaluate() missing 1 required positional argument: 'dataset'

In [16]:
from langchain_core.language_models import BaseLanguageModel
from langchain_core.embeddings import Embeddings

# define llm and embeddings
langchain_llm = BaseLanguageModel(model=llamalm) # any langchain LLM instance
langchain_embeddings = Embeddings(model=llamalm) # any langchain Embeddings instance

# make sure to wrap them with wrappers
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper

langchain_llm = LangchainLLMWrapper(langchain_llm)
langchain_embeddings = LangchainEmbeddingsWrapper(langchain_embeddings)

# you can also use custom LLMs and Embeddings here but make sure 
# they are subclasses of BaseRagasLLM and BaseRagasEmbeddings
llm = MyCustomLLM()
embeddings = MyCustomEmbeddings()

TypeError: Can't instantiate abstract class BaseLanguageModel with abstract methods agenerate_prompt, apredict, apredict_messages, generate_prompt, invoke, predict, predict_messages

In [20]:
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_relevancy,
    context_recall,
    context_precision,
)
try:
    result = evaluate(
        dataset = ds,
        llm = llamalm,
        metrics=[
            context_relevancy,
            context_precision,
            context_recall,
            faithfulness,
            answer_relevancy,
        ],
        raise_exceptions=False
    )
except Exception as e:
    print(e)



1 validation error for OpenAIEmbeddings
__root__
  Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass `openai_api_key` as a named parameter. (type=value_error)


In [21]:
# from ragas.metrics import (
#     answer_relevancy,
#     faithfulness,
#     context_recall,
#     context_precision,
#     answer_similarity,
#     context_relevancy
# )

# result = evaluate(
#     ds,
#     metrics=[
#         context_precision,
#         faithfulness,
#         answer_relevancy,
#         context_recall,
#         answer_similarity,
#         context_relevancy
#     ],
# )

In [22]:
result

NameError: name 'result' is not defined

In [31]:
result.to_pandas().head()

Unnamed: 0,question,ground_truths,answer,contexts,ground_truth,context_relevancy,context_precision,context_recall,faithfulness,answer_relevancy
0,Who directed the 2007 production of How to Curse?,[Josie Rourke],Josie Rourke,[= Robert Boulter = \r\n Robert Boulter is an ...,Josie Rourke,,,,,
1,"Who starred as ""Jason Tyler"" in the 2006 episo...",[Robert Boulter],Robert Boulter,[= = = 2006 – present = = = \r\n In 2006 Boult...,Robert Boulter,,,,,
2,Who was Du Fu's paternal grandfather?,[Du Shenyan],Du Shenyan,[Since many of Du Fu 's poems feature morality...,Du Shenyan,,,,,
3,When did Du Fu meet Li Bai for the first time?,[Autumn of 744],Autumn of 744,[Since many of Du Fu 's poems feature morality...,Autumn of 744,,,,,
4,What was Du Fu's first official post in the ca...,[Registrar of the Right Commandant's office],Registrar of the Right Commandant's office,[Since many of Du Fu 's poems feature morality...,Registrar of the Right Commandant's office,,,,,


In [32]:
# Use the save_result function to save the result to a csv file.
import time

def save_result(result):
    exp_name = f"results/eval_synthetic_data_{time.strftime('%Y%m%d-%H%M%S')}"
    print(f"Saving results to {exp_name}.csv")
    # make dir results
    if not os.path.exists('results'):
        os.makedirs('results')

    # Write to file
    result.to_pandas().to_csv(f"{exp_name}.csv")

In [33]:
# Uncomment the following line to save the result.
save_result(result)

Saving results to results/eval_synthetic_data_20240911-232529.csv


In [35]:
os.environ['WANDB_NOTEBOOK_NAME'] = '04_eval_synth_data.ipynb'

In [36]:
os.environ['WANDB_API_KEY'] = '489eb28b2888d684cef50ac9633d922c62b6c655'

In [37]:
# Logging to wandb

import wandb

# start a new wandb run to track this script
wandb.init(
    # set the wandb project where this run will be logged
    project="wikitext-rag-synthetic-eval",
    
    # track hyperparameters and run metadata
    config={
        "chuck_size": 1024,
        "sentence_chunck_overlap": 200,
        "number_of_questions": len(ds),
        "comments": "Synthetic dataset where ground truth and the answer are the same.",
    }
)

wandb.log(result)

wandb.finish()

wandb: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
wandb: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:wandb: Appending key for api.wandb.ai to your netrc file: C:\Users\kuotz\_netrc


VBox(children=(Label(value='0.015 MB of 0.025 MB uploaded\r'), FloatProgress(value=0.5948204782250206, max=1.0…

0,1
answer_relevancy,▁
context_precision,▁
context_recall,▁
context_relevancy,▁
faithfulness,▁

0,1
answer_relevancy,0.78857
context_precision,0.76852
context_recall,0.79314
context_relevancy,0.77951
faithfulness,0.78206


-----