# WIP # Chapter 2:

**Comprehensive Evaluation Strategies**

In [None]:
import os
import difflib
import Levenshtein
from nltk import word_tokenize
from nltk.translate import meteor
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from ranx import Qrels, Run, evaluate
from rouge import Rouge
from sklearn.metrics.pairwise import cosine_similarity
import wandb
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cdist
import pandas as pd
import json
import pathlib
from typing import List, Dict, Any
from tqdm import tqdm
import cohere
import nest_asyncio
nest_asyncio.apply()
import asyncio
from dotenv import load_dotenv
import numpy as np

load_dotenv()


## Building and improving an evaluation dataset

### Collecting data for evaluation
Get data from the docs website [FAQs](https://docs.wandb.ai/guides/technical-faq) to test the system.

In [None]:
# from datetime import datetime

# # TODO: Remove this once we move to the final project
# eval_artifact = wandb.Artifact(
#     name="eval_dataset",
#     type="dataset",
#     description="Evaluation dataset for RAG",
#     metadata={
#         "total_samples": 20,
#         "date_collected": datetime.now().strftime("%Y-%m-%d"),
#         "chapter": "Chapter 1",
#     },
# )
# eval_artifact.add_file("../data/eval/eval_dataset.jsonl")
# run.log_artifact(eval_artifact)

In [None]:
# W&B entity and project under which this chapter's run and artifacts live.
WANDB_ENTITY = "rag-course"
WANDB_PROJECT = "dev"

# Ask the wandb client for its "core" feature before any run is started.
wandb.require("core")

# Start the run used by the rest of this notebook; it is filed under the "Chapter 2" group.
run = wandb.init(
    entity=WANDB_ENTITY,
    project=WANDB_PROJECT,
    group="Chapter 2",
)

In [None]:
# Declare the latest evaluation dataset artifact as an input of this run and download it.
eval_artifact = run.use_artifact(
    f"{WANDB_ENTITY}/{WANDB_PROJECT}/eval_dataset:latest", type="dataset"
)
eval_dir = eval_artifact.download("../data/eval")
# The dataset is read as JSON Lines (one record per line).
eval_dataset = pd.read_json(
    f"{eval_dir}/eval_dataset.jsonl", lines=True, orient="records"
)
# List-of-dicts view of the same data; later cells read the "question", "source"
# and "answer" keys of each sample.
eval_samples = eval_dataset.to_dict(orient="records")
eval_dataset

### Evaluating the Retriever

This is a search problem, so it's easiest to start with traditional information retrieval metrics.


ref: https://weaviate.io/blog/retrieval-evaluation-metrics

**TODO** Add weave model and evals in this section

In [None]:
# Reload the data from Chapter 1
# The chunked documents are fetched from the "chunked_data" artifact of the same project.
chunked_artifact = run.use_artifact(
    f"{WANDB_ENTITY}/{WANDB_PROJECT}/chunked_data:latest", type="dataset"
)
artifact_dir = chunked_artifact.download()
chunked_data_file = pathlib.Path(f"{artifact_dir}/documents.jsonl")
# Each line of documents.jsonl is parsed as one JSON document; the Retriever below
# reads its "cleaned_content" and metadata "source" fields.
chunked_data = list(map(json.loads, chunked_data_file.read_text().splitlines()))
# Peek at the first two documents.
chunked_data[:2]

In [None]:
# Reuse the Retriever class from Chapter 1
class Retriever:
    """TF-IDF retriever that ranks indexed documents by cosine similarity to a query."""

    def __init__(self):
        self.vectorizer = TfidfVectorizer()
        self.index = None  # TF-IDF matrix of the indexed documents, set by index_data()
        self.data = None  # the indexed documents, in the same order as the rows of `index`

    def index_data(self, data):
        """
        Fit the vectorizer on the documents and build the TF-IDF index.

        :param data: List of documents; each must provide ``cleaned_content`` and
            ``metadata["source"]``
        """
        self.data = data
        docs = [doc["cleaned_content"] for doc in data]
        self.index = self.vectorizer.fit_transform(docs)

    #@weave.op
    def search(self, query, k=5):
        """
        Return the ``k`` indexed documents most similar to ``query``.

        :param query: The query text
        :param k: Maximum number of documents to return
        :return: List of dicts with ``source``, ``text`` and ``score`` (cosine
            similarity, i.e. 1 - cosine distance), best match first
        """
        query_vec = self.vectorizer.transform([query])
        # toarray() yields a plain ndarray (todense() yields the legacy np.matrix type).
        cosine_distances = cdist(
            query_vec.toarray(), self.index.toarray(), metric="cosine"
        )[0]
        # A query (or document) with no known terms is an all-zero vector, for which the
        # cosine distance is undefined (NaN). Treat it as "no similarity" so that scores
        # stay finite and such documents rank last.
        cosine_distances = np.where(np.isnan(cosine_distances), 1.0, cosine_distances)
        top_k_indices = cosine_distances.argsort()[:k]
        return [
            {
                "source": self.data[idx]["metadata"]["source"],
                "text": self.data[idx]["cleaned_content"],
                "score": float(1 - cosine_distances[idx]),
            }
            for idx in top_k_indices
        ]

In [None]:
# Build the TF-IDF index over the chunked documents once; the evaluations below all
# query this `retriever` instance.
retriever = Retriever()
retriever.index_data(chunked_data)

In [None]:
def calculate_hit_rate(retrieved_docs: List[str], actual_doc: str) -> float:
    """
    Report whether the relevant document shows up among the retrieved ones.

    :param retrieved_docs: Identifiers of the documents returned for one query
    :param actual_doc: Identifier of the single relevant document
    :return: 1 when ``actual_doc`` is among ``retrieved_docs``, otherwise 0
    """
    was_retrieved = actual_doc in retrieved_docs
    return int(was_retrieved)


In [None]:
# Compute the hit rate of every evaluation sample from the sources of its top-5 results.
hit_rates = []
for sample in eval_samples:
    query = sample["question"]
    expected_source = sample["source"]
    # Only the source of each retrieved chunk is compared against the expected source.
    search_results = [doc['source'] for doc in retriever.search(query, k=5)]
    hit_rate = calculate_hit_rate(search_results, expected_source)
    hit_rates.append({"query": query, "hit_rate": hit_rate})

# One row per query.
hit_rate_df = pd.DataFrame(hit_rates)
display(hit_rate_df)

In [None]:
# We need a single number to rate the retrieval system.
# The mean hit rate (the fraction of queries whose expected source was retrieved)
# summarises the retrieval system as a whole; the standard deviation shows the spread.

print(f"Mean Hit Rate: {hit_rate_df['hit_rate'].mean():.4f}")
print(f"Std-dev Hit Rate: {hit_rate_df['hit_rate'].std():.4f}")

In [None]:
# MRR (Mean Reciprocal Rank) measures how high the relevant document is ranked: each query
# scores 1 / (rank of the first relevant result), or 0 when it is not retrieved at all, and
# the MRR is the mean of these reciprocal ranks over all queries.
# Let's calculate the reciprocal rank of each query for our retrieval system

def calculate_mrr(retrieved_docs: List[str], actual_doc: str) -> float:
    """
    Calculate the reciprocal rank for a single query.

    :param retrieved_docs: Retrieved documents, best match first
    :param actual_doc: The single actual relevant document
    :return: 1 / rank of the first occurrence of ``actual_doc`` (ranks start at 1),
        or 0.0 if it was not retrieved
    """
    return next(
        (1 / rank for rank, doc in enumerate(retrieved_docs, 1) if doc == actual_doc),
        0.0,  # a float even on a miss, as the annotation promises
    )

In [None]:
# Compute the reciprocal rank of every evaluation sample from the sources of its top-5 results.
mrr_scores = []
for sample in eval_samples:
    query = sample["question"]
    expected_source = sample["source"]
    # The order of the retrieved sources is what determines the rank.
    search_results = [doc['source'] for doc in retriever.search(query, k=5)]
    mrr_score = calculate_mrr(search_results, expected_source)
    mrr_scores.append({"query": query, "mrr_score": mrr_score})

# One row per query.
mrr_scores_df = pd.DataFrame(mrr_scores)
display(mrr_scores_df)

In [None]:
# We need a single number to rate the retrieval system.
# The mean of the per-query reciprocal ranks is the MRR of the retrieval system.
print(f"Mean MRR Score: {mrr_scores_df['mrr_score'].mean():.4f}")

# Looking at the mean might not give us the complete picture. We can also look at the distribution of the MRR scores
print("MRR Score Statistics:")
# describe() output is transposed so the statistics read as one row.
display(pd.DataFrame(mrr_scores_df["mrr_score"].describe()).T)

#### Evaluating retrieval on other metrics

In [None]:
# Similarly, we can also evaluate the retrieval system on other metrics
# Writing these might be tedious, but we can use the `ranx` library to evaluate the retrieval system
# Metrics Include
# NDCG (Normalized Discounted Cumulative Gain)
# MAP (Mean Average Precision)
# Hit Rate
# Precision
# Recall
# F1 Score


RETRIEVAL_METRICS = ["ndcg@5", "map@5", "mrr", "hit_rate", "precision", "recall", "f1"]


def evaluate_retriever(retrieved_docs: List[Dict[str, Any]], actual_doc: str) -> Dict[str, Any]:
    """
    Score the results of a single query with the ``ranx`` metrics in RETRIEVAL_METRICS.

    :param retrieved_docs: Retrieved documents, each with a ``source`` and a ``score``
    :param actual_doc: Source of the single actual relevant document
    :return: Whatever ``ranx.evaluate`` returns for RETRIEVAL_METRICS
    """
    # Several retrieved chunks may share one source. Keep the best score per source so a
    # lower-ranked chunk cannot overwrite (and thereby demote) a higher-ranked one.
    best_scores: Dict[str, float] = {}
    for doc in retrieved_docs:
        source, score = doc["source"], doc["score"]
        if source not in best_scores or score > best_scores[source]:
            best_scores[source] = score

    qrels = Qrels({"query": {actual_doc: 1}})
    # Named `ranx_run` so it is not confused with the notebook-level wandb `run`.
    ranx_run = Run({"query": best_scores})
    return evaluate(qrels, ranx_run, metrics=RETRIEVAL_METRICS)


In [None]:
# Evaluate every sample with the ranx-based metrics; here the full search results
# (source and score) are passed on, not just the sources.
retrieval_scores = []
for sample in tqdm(eval_samples):
    query = sample["question"]
    expected_source = sample["source"]
    search_results = retriever.search(query, k=5)
    eval_scores = evaluate_retriever(search_results, expected_source)
    # One record per query: the query text plus one entry per metric.
    retrieval_scores.append({"query": query, **eval_scores})

retrieval_scores_df = pd.DataFrame(retrieval_scores)
display(retrieval_scores_df)

In [None]:
# Mean of every retrieval metric over all queries, shown as a single row.
print("\nMean Overall Retrieval Scores:")
display(pd.DataFrame(retrieval_scores_df[RETRIEVAL_METRICS].mean()).T)

# Distribution statistics of every retrieval metric, one row per metric.
print("\nOverall Retrieval Score Statistics:")
display(pd.DataFrame(retrieval_scores_df[RETRIEVAL_METRICS].describe()).T)

### Using an LLM as a Retrieval Judge

**ref: https://arxiv.org/pdf/2406.06519**

How do we evaluate if we don't have any ground truth? 

We can use a powerful LLM as a judge to evaluate the retriever. 


In [None]:

# Prompt template for the LLM retrieval judge. It is filled in with str.format, so
# {query} and {document} are placeholders and the doubled braces {{ }} render as
# literal braces in the JSON example at the end.
RETRIEVAL_EVAL_PROMPT ="""
Given a query and a document excerpt, you must provide a score on an integer scale of 0 to 3 with the following meanings:
    0 = represent that the excerpt has nothing to do with the query,
    1 = represents that the excerpt seems related to the query but does not help answer it,
    2 = represents that the excerpt has some answer for the query, but the answer may be a bit unclear, or hidden amongst extraneous information and
    3 = represents that the excerpt is dedicated to the query and contains the exact answer.

Important Instruction: Assign category 1 if the excerpt is somewhat related to the topic but not completely, category 2 if excerpt presents something very important related to the entire topic but also has some extra information and category 3 if the excerpt only and entirely refers to the topic. If none of the above satisfies give it category 0.

Query: {query}
Document: {document}

Split this problem into steps:
Consider the underlying intent of the query. Measure how well the content matches a likely intent of the query(M).
Measure how trustworthy the excerpt is (T).
Consider the aspects above and the relative importance of each, and decide on a final score (O). 
Final score must be an integer value only.
Do not provide any code in result. Provide each score in the following JSON format: 


{{"final_score": <integer score without providing any reasoning.>}}

"""

In [None]:
# Async Cohere client used by the LLM judge; the API key is read from the environment.
client = cohere.AsyncClient(api_key=os.environ["CO_API_KEY"])

async def evaluate_retriever_using_llm_judge(query: str, passage: str) -> str:
    """
    Ask the LLM judge how relevant ``passage`` is to ``query``.

    :param query: The user query
    :param passage: Text of one retrieved document
    :return: The raw text of the model's reply. The prompt asks for a JSON object with
        a ``final_score`` key, but the reply is returned unparsed.
    """
    judge_prompt = RETRIEVAL_EVAL_PROMPT.format(query=query, document=passage)
    response = await client.chat(
        message=judge_prompt,
        model="command-r-plus",
        temperature=0.0,
        max_tokens=2000,
    )
    return response.text


In [None]:
# Try the judge on the search results of a single sample first.
sample = eval_samples[0]
query = sample["question"]
search_results = retriever.search(query, k=5)


async def _judge_search_results(query, search_results):
    """Judge every retrieved passage for ``query`` concurrently; returns the raw replies in order."""
    tasks = [
        evaluate_retriever_using_llm_judge(query, result["text"])
        for result in search_results
    ]
    return await asyncio.gather(*tasks)


# asyncio.run() is documented to take a coroutine, so the gather call is wrapped in one
# instead of handing asyncio.run() the future that asyncio.gather() returns.
sample_scores = asyncio.run(_judge_search_results(query, search_results))
sample_scores

In [None]:
def _parse_retrieval_judgement(raw: str) -> Dict[str, Any]:
    """Decode the judge's JSON verdict, tolerating extra text around the JSON object."""
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        # The model may wrap the object in prose or a code fence; retry on the
        # outermost {...} span and re-raise the original error if there is none.
        start, end = raw.find("{"), raw.rfind("}")
        if start == -1 or end <= start:
            raise
        return json.loads(raw[start : end + 1])


async def run_retriever_evaluation_using_llm(eval_samples):
    """
    Let the LLM judge score the top-5 search results of every evaluation sample.

    :param eval_samples: Iterable of samples, each with a ``question`` key
    :return: List of ``{"query": ..., "scores": [...]}`` dicts, where ``scores`` holds the
        judge's ``final_score`` for each retrieved document in rank order
    :raises json.JSONDecodeError: If a reply contains no parseable JSON object
    """
    scores = []
    # Samples are processed one after another; only the passages of one sample are
    # judged concurrently.
    for sample in eval_samples:
        query = sample["question"]
        search_results = retriever.search(query, k=5)
        raw_verdicts = await asyncio.gather(
            *(
                evaluate_retriever_using_llm_judge(query, result["text"])
                for result in search_results
            )
        )
        sample_scores = [
            _parse_retrieval_judgement(raw)["final_score"] for raw in raw_verdicts
        ]
        scores.append({"query": query, "scores": sample_scores})
    return scores
    

In [None]:
# Run the LLM retrieval judge over the whole evaluation set (one judge call per retrieved passage).
llm_judge_retrieval_results = asyncio.run(run_retriever_evaluation_using_llm(eval_samples))

In [None]:
# We have the judge's scores for each retrieved document: one row per query, with the
# per-document scores collected in the "scores" column.
llm_judge_retrieval_results_df = pd.DataFrame(llm_judge_retrieval_results)

# Reciprocal rank of the first document that our LLM judge rated 3, i.e. fully relevant to the query.
def compute_rank_score(scores: List[int]) -> float:
    """
    Compute the reciprocal rank of the first top-rated document.

    :param scores: Judge scores of the retrieved documents, in rank order
    :return: 1 / position of the first score equal to 3 (positions start at 1),
        or 0 when no document was rated 3
    """
    top_rated_positions = (
        position for position, grade in enumerate(scores, 1) if grade == 3
    )
    first_top_rated = next(top_rated_positions, None)
    if first_top_rated is None:
        return 0
    return 1 / first_top_rated

# Derive one rank score per query from its list of judge scores.
llm_judge_retrieval_results_df["rank_score"] = llm_judge_retrieval_results_df["scores"].map(compute_rank_score)


display(llm_judge_retrieval_results_df)


# Summarise the judge-based rank score over all queries.
print(f"Mean Rank Score: {llm_judge_retrieval_results_df['rank_score'].mean():.4f}")
print(f"Std-dev Rank Score: {llm_judge_retrieval_results_df['rank_score'].std():.4f}")
    

## Evaluating the Response

In [None]:
# Lets reload the Response Generator and the RAGPipeline from the previous chapter
class ResponseGenerator:
    """Generates an answer to a query with a Cohere chat model, grounded in retrieved documents."""

    def __init__(self, model: str, prompt: str):
        """
        :param model: Name of the Cohere chat model to use
        :param prompt: Preamble sent with every chat request
        """
        self.client = cohere.Client(api_key=os.environ["CO_API_KEY"])
        self.model = model
        self.prompt = prompt

    # @weave.op()
    def generate_response(self, query: str, context: List[Dict[str, Any]]) -> str:
        """
        Answer ``query`` using the retrieved ``context``.

        :param query: The user query
        :param context: Retrieved documents; only the ``source`` and ``text`` of each
            item are forwarded to the model
        :return: The text of the model's reply
        """
        documents = [
            {"source": item["source"], "text": item["text"]} for item in context
        ]
        response = self.client.chat(
            preamble=self.prompt,
            message=query,
            model=self.model,
            documents=documents,
            temperature=0.1,
            max_tokens=2000,
        )
        return response.text
    

# Preamble handed to ResponseGenerator, plus the generator instance used by the pipeline below.
# NOTE(review): the prompt text contains typos ("Answer to", "an helful"). It is sent to the
# model verbatim, so correcting it may shift the evaluation scores; fix deliberately and re-run.
PROMPT = "Answer to the following question about W&B. Provide an helful and complete answer based only on the provided documents."
response_generator = ResponseGenerator(model="command-r", prompt=PROMPT)

class RAGPipeline:
    """Retrieve-then-generate pipeline: search for context, then answer from it."""

    def __init__(self, retriever: Retriever, response_generator: ResponseGenerator, top_k: int = 5):
        """
        :param retriever: Object whose ``search(query, k)`` returns the context documents
        :param response_generator: Object whose ``generate_response(query, context)``
            produces the answer
        :param top_k: Number of documents requested from the retriever per query
        """
        self.top_k = top_k
        self.response_generator = response_generator
        self.retriever = retriever

    # @weave.op
    def __call__(self, query: str) -> str:
        """Answer ``query`` from the ``top_k`` documents retrieved for it."""
        retrieved_context = self.retriever.search(query, self.top_k)
        answer = self.response_generator.generate_response(query, retrieved_context)
        return answer

In [None]:
# Simple string-level metrics: how similar is the response to the expected answer
# according to difflib and to the Levenshtein distance?

def calculate_diff_score(candidate, reference):
    """Return difflib's ``SequenceMatcher`` similarity ratio between candidate and reference."""
    matcher = difflib.SequenceMatcher(a=candidate, b=reference)
    return matcher.ratio()


def calculate_levenshtein_score(candidate, reference):
    """Return the similarity ratio of candidate and reference as computed by ``Levenshtein.ratio``."""
    ratio = Levenshtein.ratio(candidate, reference)
    return ratio



# Semantic Answer Similarity (SAS) - https://arxiv.org/abs/2108.06130
# The original approach uses a transformer-based cross-encoder to measure and classify this,
# for example one from https://sbert.net/docs/cross_encoder/usage/usage.html
# Here we approximate it with the cosine similarity of the two texts under our retriever's vectorizer.
def calculate_similarity(candidate, reference):
    """Return the cosine similarity of candidate and reference in the retriever's TF-IDF space."""
    tfidf_pair = retriever.vectorizer.transform([candidate, reference])
    pairwise = cosine_similarity(tfidf_pair)
    # pairwise is the 2x2 similarity matrix of the pair; [0][1] is candidate vs. reference.
    return pairwise[0][1]




# or we can use traditional metrics used to measure generation systems.
# ref: https://blog.paperspace.com/automated-metrics-for-evaluating-generated-text/

def calculate_rouge(candidate, reference):
    """
    Return the ROUGE-L F-score of ``candidate`` against ``reference``.

    :param candidate: The generated text (hypothesis)
    :param reference: The expected text
    :return: The ``rouge-l`` F-score, or 0.0 when either text is empty or only whitespace
    """
    # The rouge package raises on empty input; an empty answer simply shares nothing
    # with the reference, so score it 0.0 instead of aborting the evaluation loop.
    if not candidate.strip() or not reference.strip():
        return 0.0
    rouge = Rouge(metrics=["rouge-l"], stats="f")
    scores = rouge.get_scores(candidate, reference)
    return scores[0]["rouge-l"]["f"]


def calculate_bleu(candidate, reference):
    """Return the smoothed sentence-level BLEU score of candidate against the single reference."""
    reference_tokens = word_tokenize(reference)
    candidate_tokens = word_tokenize(candidate)
    # method2 of NLTK's SmoothingFunction is applied to the score.
    smoother = SmoothingFunction().method2
    return sentence_bleu(
        [reference_tokens], candidate_tokens, smoothing_function=smoother
    )


def calculate_meteor(candidate, reference):
    """
    Return the METEOR score of ``candidate`` against ``reference``.

    :param candidate: The generated text (hypothesis)
    :param reference: The expected text
    :return: The METEOR score computed by NLTK on the word-tokenized texts
    """
    reference_tokens = word_tokenize(reference)
    candidate_tokens = word_tokenize(candidate)
    # NLTK's signature is meteor(references, hypothesis). METEOR weighs precision and
    # recall differently, so passing the texts the other way round changes the score.
    return meteor([reference_tokens], candidate_tokens)


In [None]:
# Assemble the pipeline from the retriever and response generator created earlier.
rag_pipeline = RAGPipeline(retriever, response_generator)

# Generate an answer for every evaluation sample and score it against the expected
# answer with each of the response metrics.
response_scores = []
for sample in tqdm(eval_samples):
    query = sample['question']
    actual_answer = rag_pipeline(query)
    expected_answer = sample['answer']
    # Every metric takes (candidate, reference) in that order.
    diff_score = calculate_diff_score(actual_answer, expected_answer)
    levenshtein_score = calculate_levenshtein_score(actual_answer, expected_answer)
    rouge_score = calculate_rouge(actual_answer, expected_answer)
    bleu_score = calculate_bleu(actual_answer, expected_answer)
    meteor_score = calculate_meteor(actual_answer, expected_answer)
    similarity_score = calculate_similarity(actual_answer, expected_answer)

    # Keep the texts next to the scores so individual results can be inspected later.
    response_scores.append({
        "query": query,
        "expected_answer": expected_answer,
        "actual_answer": actual_answer,
        "diff_score": diff_score,
        "levenshtein_score": levenshtein_score,
        "rouge_score": rouge_score,
        "bleu_score": bleu_score,
        "meteor_score": meteor_score,
        "similarity_score": similarity_score
    })

In [None]:

# One row per query with the texts and all response metrics.
response_scores_df = pd.DataFrame(response_scores)
display(response_scores_df)

# The metric columns are picked by name: every column containing "score".
GENERATION_METRICS = [col for col in response_scores_df.columns if "score" in col]


# Mean of every generation metric over all queries, shown as a single row.
print("\nMean Overall Generation Scores:")
display(pd.DataFrame(response_scores_df[GENERATION_METRICS].mean()).T)

# Distribution statistics of every generation metric, one row per metric.
print("\nOverall Generation Score Statistics:")
display(pd.DataFrame(response_scores_df[GENERATION_METRICS].describe()).T)

### Using an LLM as a Response Judge

Some qualities of a response cannot be measured objectively. An LLM judge is particularly useful for such subjective or complex criteria.
We care about correctness, faithfulness, and relevance.

- **Answer Correctness** - Is the generated answer correct compared to the reference, and does it thoroughly answer the user's query?
- **Answer Relevancy** - Is the generated answer relevant and comprehensive?
- **Answer Faithfulness** - Is the generated answer factually consistent with the context document?


In [None]:

# Prompt template for the LLM correctness judge. It is filled in with str.format, so
# {query}, {reference_answer} and {generated_answer} are placeholders and the doubled
# braces {{ }} render as literal braces in the JSON examples.
# The judge is asked for a JSON object with "reason", "final_score" and "decision".
CORRECTNESS_EVAL_PROMPT ="""
You are a Weight & Biases support expert tasked with evaluating the correctness of answers to questions asked by users to a technical support chatbot. 
You are tasked with judging the correctness of a generated answer based on the user's query, and a reference answer.

You will be given the following information:

<query>
{query}
</query>

<reference_answer>
{reference_answer}
</reference_answer>

<generated_answer>
{generated_answer}
</generated_answer>

Important Instruction: To evaluate the generated answer, follow these steps:

1. Intent Analysis: Consider the underlying intent of the query.
2. Relevance: Check if the generated answer addresses all aspects of the question.
3. Accuracy: Compare the generated answer to the reference answer for completeness and correctness.
4. Trustworthiness: Measure how trustworthy the generated answer is when compared to the reference.

Assign a score on an integer scale of 0 to 3 with the following meanings:
- 0 = The generated answer is incorrect and does not satisfy any of the criteria.
- 1 = The generated answer is partially correct, contains mistakes or is not factually correct.
- 2 = The generated answer is correct but includes some extra information, is incomplete or misses some evaluation criteria.
- 3 = The generated answer is correct, completely answers the query, does not contain any mistakes, and is factually consistent with the reference answer.

After your analysis, provide your verdict in the following JSON format:

{{
    "reason": "<<Provide a brief explanation for your decision here>>",
    "final_score": <<Provide a score as per the above guidelines>>,
    "decision": "<<Provide your final decision here, either 'correct' or 'incorrect'>>"
}}

Here are some examples of correct output:

Example 1:
{{
    "reason": "The generated answer has the exact details as the reference answer and completely answers the user's query.",
    "final_score": 3,
    "decision": "correct"
}}

Example 2:
{{
    "reason": "The generated answer doesn't match the reference answer and deviates from the user's query.",
    "final_score": 0,
    "decision": "incorrect"
}}

Example 3:
{{
    "reason": "The generated answer follows the same steps as the reference answer. However, it includes assumptions about functions that are not requested in the user's query",
    "final_score": 2,
    "decision": "correct"
}}

Example 4:
{{
    "reason": "The generated answer is incorrect, irrelevant, and not factually correct and completely misses the user's intent.",
    "final_score": 0,
    "decision": "incorrect"
}}

Please provide your evaluation based on the given information and format your response according to the specified JSON structure.
"""

In [None]:
# Async Cohere client used by the LLM judge; the API key is read from the environment.
client = cohere.AsyncClient(api_key=os.environ["CO_API_KEY"])

async def evaluate_correctness_using_llm_judge(query: str, reference_answer: str, generated_answer: str) -> str:
    """
    Ask the LLM judge whether ``generated_answer`` is correct for ``query``.

    :param query: The user query
    :param reference_answer: The expected answer
    :param generated_answer: The answer produced by the pipeline
    :return: The raw text of the model's reply. The prompt asks for a JSON object with
        ``reason``, ``final_score`` and ``decision``, but the reply is returned unparsed.
    """
    judge_prompt = CORRECTNESS_EVAL_PROMPT.format(
        query=query,
        reference_answer=reference_answer,
        generated_answer=generated_answer,
    )
    response = await client.chat(
        message=judge_prompt,
        model="command-r-plus",
        temperature=0.0,
        max_tokens=2000,
    )
    return response.text


In [None]:
def _parse_correctness_judgement(raw: str) -> Dict[str, Any]:
    """Decode the judge's JSON verdict, tolerating extra text around the JSON object."""
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        # The model may wrap the object in prose or a code fence; retry on the
        # outermost {...} span and re-raise the original error if there is none.
        start, end = raw.find("{"), raw.rfind("}")
        if start == -1 or end <= start:
            raise
        return json.loads(raw[start : end + 1])


async def run_correctness_evaluation_using_llm(response_scores):
    """
    Let the LLM judge rate the correctness of every generated answer.

    :param response_scores: Iterable of rows with ``query``, ``expected_answer`` and
        ``actual_answer`` keys
    :return: The parsed judge verdicts, in the same order as ``response_scores``
    :raises json.JSONDecodeError: If a reply contains no parseable JSON object
    """
    # All rows are judged concurrently.
    tasks = [
        evaluate_correctness_using_llm_judge(
            row["query"], row["expected_answer"], row["actual_answer"]
        )
        for row in response_scores
    ]
    raw_verdicts = await asyncio.gather(*tasks)
    return [_parse_correctness_judgement(raw) for raw in raw_verdicts]
    

In [None]:
# Run the LLM correctness judge over all generated answers (one judge call per answer).
llm_judge_correctness_results = asyncio.run(run_correctness_evaluation_using_llm(response_scores))

In [None]:
# Put the judge verdicts next to the metric scores. Both frames were built from
# `response_scores`, so the column-wise concat is meant to pair each answer with its verdict.
correctness_eval_df = pd.DataFrame(llm_judge_correctness_results)
response_evals_df = pd.concat([response_scores_df, correctness_eval_df], axis=1)
response_evals_df

In [None]:
# Inspect one evaluated sample in full. Position 14 is hard-coded, so the frame must have
# at least 15 rows; presumably an arbitrary pick, confirm before relying on it.
response_evals_df.iloc[14].to_dict()

## Exercise

1. Implement the `Relevance` and `Faithfulness` evaluators and evaluate the pipeline on all the dimensions.
2. Generate and share a W&B report with the following sections in the form of tables and charts:
    
    - Summary of the evaluation
    - Retrieval Evaluations
        - IR Metrics
        - LLM As a Retrieval Judge Metric
    - Response Evaluations
        - Traditional NLP Metrics
        - LLM Judgement Metrics
    - Overall Evaluations
    - Conclusion
