# WIP # Chapter 2:

**Comprehensive Evaluation Strategies**

In [None]:
import os
import difflib
import Levenshtein
from nltk import word_tokenize
from nltk.translate import meteor
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from ranx import Qrels, Run, evaluate
from rouge import Rouge
from sklearn.metrics.pairwise import cosine_similarity
import wandb
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cdist
import pandas as pd
import json
import pathlib
from typing import List, Dict, Any
from tqdm import tqdm
import cohere
import nest_asyncio
nest_asyncio.apply()
import asyncio
from dotenv import load_dotenv
import numpy as np

load_dotenv()


## Building and improving an evaluation dataset

### Collecting data for evaluation
Get data from the docs website [FAQs](https://docs.wandb.ai/guides/technical-faq) to test the system.

In [None]:
# from datetime import datetime

# # TODO: Remove this once we move to the final project
# eval_artifact = wandb.Artifact(
#     name="eval_dataset",
#     type="dataset",
#     description="Evaluation dataset for RAG",
#     metadata={
#         "total_samples": 20,
#         "date_collected": datetime.now().strftime("%Y-%m-%d"),
#         "chapter": "Chapter 1",
#     },
# )
# eval_artifact.add_file("../data/eval/eval_dataset.jsonl")
# run.log_artifact(eval_artifact)

In [None]:
# W&B entity and project under which this chapter's run and artifacts are looked up.
WANDB_ENTITY = "rag-course"
WANDB_PROJECT = "dev"

# Request wandb's "core" feature before the run is started.
wandb.require("core")

# Start a run in the "Chapter 2" group; later cells use `run` to fetch artifacts.
run = wandb.init(
    entity=WANDB_ENTITY,
    project=WANDB_PROJECT,
    group="Chapter 2",
)

In [None]:
# Declare the latest `eval_dataset` artifact as an input of the active run.
eval_artifact = run.use_artifact(
    f"{WANDB_ENTITY}/{WANDB_PROJECT}/eval_dataset:latest", type="dataset"
)
# Download the artifact into ../data/eval and read its JSON-lines file into a DataFrame.
eval_dir = eval_artifact.download("../data/eval")
eval_dataset = pd.read_json(
    f"{eval_dir}/eval_dataset.jsonl", lines=True, orient="records"
)
# One dict per row; this list is passed as `dataset=` to the evaluations below.
eval_samples = eval_dataset.to_dict(orient="records")
# Last expression of the cell, so the notebook renders the DataFrame.
eval_dataset

### Evaluating the Retriever

This is a search problem, so it's easiest to start with traditional information retrieval metrics.


ref: https://weaviate.io/blog/retrieval-evaluation-metrics

**TODO** Add weave model and evals in this section

In [None]:
# Reload the chunked documents that were logged as an artifact in Chapter 1
chunked_artifact = run.use_artifact(
    f"{WANDB_ENTITY}/{WANDB_PROJECT}/chunked_data:latest", type="dataset"
)
artifact_dir = chunked_artifact.download()
chunked_data_file = pathlib.Path(f"{artifact_dir}/documents.jsonl")
# documents.jsonl is parsed one JSON object per line
chunked_data = [
    json.loads(line) for line in chunked_data_file.read_text().splitlines()
]
# Preview the first two chunks in the notebook output
chunked_data[:2]

In [None]:
import weave

# Initialise weave against the same entity/project that the wandb run uses.
weave.init(f"{WANDB_ENTITY}/{WANDB_PROJECT}")

In [None]:
# Reuse the Retriever class from Chapter 1.
# The weave ref pins one specific published version of the `Retriever` object; `.get()` fetches it.
retriever = weave.ref("weave:///rag-course/dev/object/Retriever:OvLVBKNX0eRiaaGOBCavdnlxzNQiC6SCMtOGRRhi0uM").get()

In [None]:
# Attach a new TF-IDF vectorizer to the fetched retriever and index the chunked documents.
retriever.vectorizer = TfidfVectorizer()
retriever.index_data(chunked_data)
# Sanity check with a sample query; the second argument is presumably k (number of results) — TODO confirm.
retriever.predict("what is wandb", 5)

In [None]:
@weave.op()
def compute_hit_rate(model_output: List[Dict[str, Any]], source: str) -> float:
    """
    Calculate the hit rate for a single query.

    :param model_output: Documents returned by the retriever; each item is a
        dict that must contain a ``source`` key
    :param source: Source identifier of the single relevant document
    :return: 1.0 if any retrieved document has the expected source, 0.0 otherwise
    """
    # float(...) keeps the return value in line with the declared return type.
    return float(any(doc['source'] == source for doc in model_output))


In [None]:
## Here's how we can evaluate the retrieval system normally in python

# hit_rates = []
# for sample in eval_samples:
#     query = sample["question"]
#     expected_source = sample["source"]
#     search_results = retriever.search(query, k=5)
#     hit_rate = compute_hit_rate(search_results, expected_source)
#     hit_rates.append({"query": query, "hit_rate": hit_rate})

# hit_rate_df = pd.DataFrame(hit_rates)
# display(hit_rate_df)


# # we need a single number to rate the retrieval system
# # the mean hit rate is a good metric to evaluate the retrieval system as a whole

# print(f"Mean Hit Rate: {hit_rate_df['hit_rate'].mean():.4f}")
# print(f"Std-dev Hit Rate: {hit_rate_df['hit_rate'].std():.4f}")

In [None]:
# The same evaluation can be done in weave

hit_rate_evaluation = weave.Evaluation(
    name="Retrieval_Hit_Score",
    dataset=eval_samples,
    scorers=[compute_hit_rate],
    # Map each dataset row to the model input: the row's question becomes `query`, with k fixed at 5.
    preprocess_model_input=lambda x: {"query": x["question"], "k":5}
)
# `evaluate` is a coroutine; run it to completion and keep what it returns.
hit_rate = asyncio.run(hit_rate_evaluation.evaluate(retriever))

#### Evaluating retrieval on other metrics

In [None]:
# MRR (Mean Reciprocal Rank)
@weave.op()
def compute_mrr(model_output: List[Dict[str, Any]], source: str) -> float:
    """
    Reciprocal rank of the first retrieved document whose source matches.

    :param model_output: Retrieved documents in rank order; each item is a
        dict with a ``source`` key
    :param source: Expected source of the relevant document
    :return: 1 / rank (1-based) of the first match, or 0 when nothing matches
    """
    matching_ranks = (
        position
        for position, doc in enumerate(model_output, start=1)
        if doc['source'] == source
    )
    # Only the first (best-ranked) match counts.
    first_hit = next(matching_ranks, None)
    if first_hit is None:
        return 0
    return 1 / first_hit


# NDCG (Normalized Discounted Cumulative Gain)
@weave.op()
def compute_ndcg(model_output: List[Dict[str, Any]], source: str) -> float:
    """
    Normalized Discounted Cumulative Gain for a single query, binary relevance.

    A retrieved document has gain 1 when its ``source`` equals the expected
    ``source`` and gain 0 otherwise. The ideal ranking places all matching
    documents at the top of the list.

    :param model_output: Retrieved documents in rank order; each item is a
        dict with a ``source`` key
    :param source: Expected source of the relevant document
    :return: DCG / IDCG in [0, 1]; 0.0 when no retrieved document matches
    """
    dcg = 0.0
    num_relevant = 0

    # Relevance comes from the ground truth, not from the retriever's own
    # score: using the score as the gain makes the "ideal" order identical to
    # the retrieved order, so the metric could never drop below 1.0 on a hit.
    for i, result in enumerate(model_output):
        if result['source'] == source:
            dcg += 1.0 / np.log2(i + 2)  # i + 2 so that rank 1 has discount log2(2) = 1
            num_relevant += 1

    # To avoid division by zero
    if num_relevant == 0:
        return 0.0

    # Ideal DCG: the same number of relevant documents packed into the top ranks.
    idcg = sum(1.0 / np.log2(i + 2) for i in range(num_relevant))
    return float(dcg / idcg)



# MAP (Mean Average Precision)
@weave.op()
def compute_map(model_output: List[Dict[str, Any]], source: str) -> float:
    """
    Average precision for a single query.

    Precision is sampled at every rank where a retrieved document's source
    matches ``source``, and the samples are averaged.

    :param model_output: Retrieved documents in rank order; each item is a
        dict with a ``source`` key
    :param source: Expected source of the relevant document
    :return: Mean of the precision values at each matching rank, or 0.0 when
        nothing matches
    """
    precisions_at_hits = []
    for position, doc in enumerate(model_output, start=1):
        if doc['source'] != source:
            continue
        # Matches seen so far (including this one) divided by the current rank.
        precisions_at_hits.append((len(precisions_at_hits) + 1) / position)

    if not precisions_at_hits:
        return 0.0

    return sum(precisions_at_hits, 0.0) / len(precisions_at_hits)


# Precision <- this is more discounted precision because we only have 1 reference 
@weave.op()
def compute_rank_precision(model_output: List[Dict[str, Any]], source: str) -> float:
    """
    Rank-discounted precision for a single query.

    Each matching document earns ``max(1 - 0.2 * index, 0)`` where ``index`` is
    its 0-based position, so credit falls by 0.2 per rank and reaches zero from
    index 5 onwards. The credits of all matching documents are averaged.

    :param model_output: Retrieved documents in rank order; each item is a
        dict with a ``source`` key
    :param source: Expected source of the relevant document
    :return: Average credit over matching documents, or 0.0 when nothing matches
    """
    credits = [
        max(1 - 0.2 * position, 0)
        for position, doc in enumerate(model_output)
        if doc['source'] == source
    ]
    if not credits:
        return 0.0
    return sum(credits, 0.0) / len(credits)

# Recall
@weave.op()
def compute_recall(model_output: List[Dict[str, Any]], source: str) -> float:
    """
    Recall for a single query with one reference source.

    The number of matching retrieved documents is used both as the retrieved
    count and as the total relevant count, so the result is 1.0 whenever at
    least one retrieved document matches and 0.0 otherwise.

    :param model_output: Retrieved documents; each item is a dict with a
        ``source`` key
    :param source: Expected source of the relevant document
    :return: 1.0 if any retrieved document matches, else 0.0
    """
    matching_docs = [doc for doc in model_output if doc['source'] == source]
    relevant_total = len(matching_docs)
    if relevant_total == 0:
        return 0.0
    relevant_retrieved = relevant_total
    return relevant_retrieved / relevant_total

# F1 Score
@weave.op()
def compute_f1(model_output: List[Dict[str, Any]], source: str) -> float:
    """
    Harmonic mean of ``compute_rank_precision`` and ``compute_recall``.

    :param model_output: Retrieved documents in rank order; each item is a
        dict with a ``source`` key
    :param source: Expected source of the relevant document
    :return: 2 * P * R / (P + R), or 0.0 when P + R is not positive
    """
    rank_precision = compute_rank_precision(model_output, source)
    recall_value = compute_recall(model_output, source)
    denominator = rank_precision + recall_value
    if denominator > 0:
        return 2 * (rank_precision * recall_value) / denominator
    return 0.0


In [None]:
# All retrieval scorers defined above, evaluated together in one weave Evaluation.
retrieval_scorers = [compute_mrr, compute_ndcg, compute_map, compute_hit_rate, compute_rank_precision, compute_recall, compute_f1]
retrieval_evaluation = weave.Evaluation(
    name="Retrieval_Evaluation",
    dataset=eval_samples,
    scorers=retrieval_scorers,
    # Map each dataset row to the model input: the row's question becomes `query`, with k fixed at 5.
    preprocess_model_input=lambda x: {"query": x["question"], "k":5}
)
# `evaluate` is a coroutine; run it to completion and keep what it returns.
retrieval_scores = asyncio.run(retrieval_evaluation.evaluate(retriever))


### Using an LLM as a Retrieval Judge

**ref: https://arxiv.org/pdf/2406.06519**

How do we evaluate if we don't have any ground truth? 

We can use a powerful LLM as a judge to evaluate the retriever. 


In [None]:

# Few-shot prompt for the LLM retrieval judge.
# It is filled in with str.format(query=..., document=...), so the literal braces
# of the JSON examples are doubled ({{ }}) to survive formatting.
RETRIEVAL_EVAL_PROMPT = """
Given a query and a document excerpt, you must provide a score on an integer scale of 0 to 2 with the following meanings:
    0 = represents that the excerpt is irrelevant to the query,
    1 = represents that the excerpt is somewhat relevant to the query,
    2 = represents that the excerpt is highly relevant to the query.
    

Important Instruction: Assign category 1 if the excerpt is somewhat related to the query but not completely, category 2 if the excerpt only and entirely refers to the query. If neither of these criteria satisfies the query, give it category 0.


Split this problem into steps:
Consider the underlying intent of the query. Measure how well the content matches a likely intent of the query(M).
Measure how trustworthy the excerpt is (T).
Consider the aspects above and the relative importance of each, and decide on a final score (O). 
Final score must be an integer value only.
Do not provide any code in result. Provide each score in the following JSON format: 
{{"final_score": <integer score without providing any reasoning.>}}

## Examples

Example 1: 
<Query>
How do I programmatically access the human-readable run name?
</Query>
<Document>
If you do not explicitly name your run, a random run name will be assigned to the run to help identify the run in the UI. For instance, random run names will look like "pleasant-flower-4" or "misunderstood-glade-2".

If you'd like to overwrite the run name (like snowy-owl-10) with the run ID (like qvlp96vk) you can use this snippet:

import wandb

wandb.init()
wandb.run.name = wandb.run.id
wandb.run.save()

</Document>
{{"final_score": 0}}

Example 2:
<Query>
What are Runs?
</Query>
<Document>
A single unit of computation logged by W&B is called a run. You can think of a W&B run as an atomic element of your whole project. You should initiate a new run when you:
 - Train a model
 - Change a hyperparameter
 - Use a different model
 - Log data or a model as a W&B Artifact
 - Download a W&B Artifact

For example, during a sweep, W&B explores a hyperparameter search space that you specify. Each new hyperparameter combination created by the sweep is implemented and recorded as a unique run. 
</Document>
{{"final_score": 2}}

Example 3:
<Query>
How do I use W&B with Keras ?
</Query>
<Document>
We have added three new callbacks for Keras and TensorFlow users, available from wandb v0.13.4. For the legacy WandbCallback scroll down.
These new callbacks,
 - Adhere to Keras design philosophy
 - Reduce the cognitive load of using a single callback (WandbCallback) for everything
 - Make it easy for Keras users to modify the callback by subclassing it to support their niche use case
</Document>
{{"final_score": 1}}

<Query>
{query}
</Query>

<Document>
{document}
</Document>

"""

In [None]:
# Async Cohere client used by the judge below; the API key is read from the environment.
client = cohere.AsyncClient(api_key=os.environ["CO_API_KEY"])

@weave.op()
async def evaluate_retriever_using_llm_judge(query: str, passage: str) -> str:
    """
    Ask the LLM judge how relevant ``passage`` is to ``query``.

    :param query: The user question
    :param passage: Text of one retrieved document
    :return: The raw text of the model's reply (not parsed here)
    """
    judge_prompt = RETRIEVAL_EVAL_PROMPT.format(query=query, document=passage)
    judge_reply = await client.chat(
        message=judge_prompt,
        model="command-r-plus",
        temperature=0.0,
        max_tokens=20,
    )
    return judge_reply.text


In [None]:
@weave.op()
async def run_retriever_evaluation_using_llm(eval_samples: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Score the retriever's top-5 results for every sample with the LLM judge.

    Samples are processed one after another; the judge calls for the results
    of a single sample are awaited together.

    :param eval_samples: Evaluation rows; each must contain a ``question`` key
    :return: One dict per sample with the ``query`` and the list of judge
        ``scores`` in retrieval order
    """
    judged = []
    for sample in eval_samples:
        query = sample["question"]
        judge_calls = [
            evaluate_retriever_using_llm_judge(query, hit["text"])
            for hit in retriever.search(query, k=5)
        ]
        raw_replies = await asyncio.gather(*judge_calls)
        # Each reply is parsed as JSON and its "final_score" field is kept.
        final_scores = [json.loads(reply)["final_score"] for reply in raw_replies]
        judged.append({"query": query, "scores": final_scores})
    return judged
    

In [None]:
# Run the LLM-judge retrieval evaluation over every evaluation sample and keep the per-query scores.
llm_judge_retrieval_results = asyncio.run(run_retriever_evaluation_using_llm(eval_samples))

In [None]:
# We have the judge's scores for each retrieved document, one row per query
# (columns: "query" and the list of "scores").
llm_judge_retrieval_results_df = pd.DataFrame(llm_judge_retrieval_results)

# we can compute the reciprocal rank of the first document that is relevant to the query i.e. rated as 2 by our llm judge.
def compute_rank_score(scores: List[int]) -> float:
    """
    Reciprocal rank of the first document the judge scored as 2.

    :param scores: Judge scores in retrieval order
    :return: 1 / rank (1-based) of the first score equal to 2, or 0 when no
        score equals 2
    """
    first_relevant_rank = next(
        (position for position, score in enumerate(scores, start=1) if score == 2),
        None,
    )
    return 0 if first_relevant_rank is None else 1 / first_relevant_rank

# Add one rank score per query, computed from that query's list of judge scores.
llm_judge_retrieval_results_df["rank_score"] = llm_judge_retrieval_results_df["scores"].map(compute_rank_score)


# `display` is not imported in this file; it is expected to be provided by the notebook environment.
display(llm_judge_retrieval_results_df)


# Summarise the per-query rank scores with their mean and standard deviation.
print(f"Mean Rank Score: {llm_judge_retrieval_results_df['rank_score'].mean():.4f}")
print(f"Std-dev Rank Score: {llm_judge_retrieval_results_df['rank_score'].std():.4f}")
    

## Evaluating the Response

In [None]:
# Fetch the pinned version of the ResponseGenerator object from weave.
response_generator = weave.ref("weave:///rag-course/dev/object/ResponseGenerator:YQiRpgHDhvJcrZEVpXJbvy2PuJyh5W3gXuhX06zZiYQ").get()
query = "What is w&b?"
# Retrieve context for the sample query; the second argument is presumably k — TODO confirm.
context = retriever.search(query, 5)

# Attach a synchronous Cohere client to the fetched generator, then answer the sample query.
response_generator.client = cohere.Client(api_key=os.environ["CO_API_KEY"])
response_generator.predict(query, context)



In [None]:
# Fetch the pinned version of the RAGPipeline object from weave.
rag_pipeline = weave.ref("weave:///rag-course/dev/object/RAGPipeline:njpUKmvYezO3X9cJJ5BXAujh0XSTHcpRyDsYswHbPRs").get()
# Wire in the retriever and response generator prepared in the cells above.
rag_pipeline.retriever = retriever
rag_pipeline.response_generator = response_generator
# Run the sample query through the pipeline.
rag_pipeline.predict(query)

In [None]:
from nltk.corpus import wordnet as wn
# Make sure the WordNet corpus is loaded now rather than on first use.
# NOTE(review): presumably done so the evaluation scorers below never trigger the load themselves — confirm.
wn.ensure_loaded()
# We can measure the similarity of the response to the expected answer using difflib and Levenshtein distance
# These are simple metrics.

@weave.op()
def compute_diff(model_output: str, answer: str) -> float:
    """
    Similarity ratio between the generated and reference answers via difflib.

    :param model_output: Generated answer
    :param answer: Reference answer
    :return: ``difflib.SequenceMatcher(...).ratio()`` for the two strings
    """
    matcher = difflib.SequenceMatcher(None, model_output, answer)
    return matcher.ratio()

@weave.op()
def compute_levenshtein(model_output: str, answer: str) -> float:
    """
    Levenshtein similarity ratio between the generated and reference answers.

    :param model_output: Generated answer
    :param answer: Reference answer
    :return: ``Levenshtein.ratio`` of the two strings
    """
    edit_ratio = Levenshtein.ratio(model_output, answer)
    return edit_ratio



# semantic answer similarity. (SAS) - https://arxiv.org/abs/2108.06130
# Originally, one should use a transformer based cross-encoder to measure and classify this. 
# For example, use something from https://sbert.net/docs/cross_encoder/usage/usage.html
# we can also calculate the cosine similarity between the candidate and the reference using our retriever's vectorizer
@weave.op()
def compute_similarity(model_output: str, answer: str) -> float:
    """
    Cosine similarity between the generated and reference answers.

    Both strings are vectorised with the retriever's already-fitted
    vectorizer, so the comparison uses the same vocabulary as retrieval.

    :param model_output: Generated answer
    :param answer: Reference answer
    :return: Cosine similarity of the two vectors
    """
    answer_vectors = retriever.vectorizer.transform([model_output, answer])
    pairwise = cosine_similarity(answer_vectors)
    # Row 0 is the generated answer, column 1 the reference answer.
    return pairwise[0][1]




# or we can use traditional metrics used to measure generation systems.
# ref: https://blog.paperspace.com/automated-metrics-for-evaluating-generated-text/

@weave.op()
def compute_rouge(model_output: str, answer: str) -> float:
    """
    ROUGE-L F-score of the generated answer against the reference answer.

    :param model_output: Generated answer (hypothesis)
    :param answer: Reference answer
    :return: The ROUGE-L ``f`` value, or 0.0 when either string is empty or
        whitespace-only
    """
    # The rouge package raises on an empty hypothesis or reference. A blank
    # answer shares nothing with the reference, so score it 0.0 instead of
    # aborting the whole evaluation.
    if not model_output.strip() or not answer.strip():
        return 0.0
    rouge = Rouge(metrics=["rouge-l"], stats=["f"])
    scores = rouge.get_scores(model_output, answer)
    return scores[0]["rouge-l"]["f"]


@weave.op()
def compute_bleu(model_output: str, answer: str) -> float:
    """
    Sentence-level BLEU of the generated answer against the reference answer.

    Both strings are tokenised with ``word_tokenize`` and scored with NLTK's
    ``sentence_bleu`` using ``SmoothingFunction().method2``.

    :param model_output: Generated answer (candidate)
    :param answer: Reference answer
    :return: The BLEU score
    """
    reference_tokens = word_tokenize(answer)
    candidate_tokens = word_tokenize(model_output)
    return sentence_bleu(
        [reference_tokens],
        candidate_tokens,
        smoothing_function=SmoothingFunction().method2,
    )


In [None]:
# String-similarity and n-gram scorers that compare the generated answer with the reference answer.
response_scorers = [compute_diff, compute_levenshtein, compute_similarity, compute_rouge, compute_bleu]
response_evaluations = weave.Evaluation(
    name="Response_Evaluation",
    dataset=eval_samples, 
    scorers=response_scorers, 
    # Only the question is forwarded to the pipeline, as `query`.
    preprocess_model_input=lambda x: {"query": x["question"]})
# `evaluate` is a coroutine; run it to completion against the full RAG pipeline.
response_scores = asyncio.run(response_evaluations.evaluate(rag_pipeline))


### Using an LLM as a Response Judge

Some metrics cannot be defined objectively and are particularly useful for more subjective or complex criteria.
We care about correctness, faithfulness, and relevance.

- **Answer Correctness** - Is the generated answer correct compared to the reference, and does it thoroughly answer the user's query?
- **Answer Relevancy** - Is the generated answer relevant and comprehensive?
- **Answer Faithfulness** - Is the generated answer factually consistent with the context document?


In [None]:

# Prompt for the LLM correctness judge.
# It is filled in with str.format(query=..., reference_answer=..., generated_answer=...),
# so the literal braces of the JSON examples are doubled ({{ }}) to survive formatting.
CORRECTNESS_EVAL_PROMPT = """
You are a Weights & Biases support expert tasked with evaluating the correctness of answers to questions asked by users to a technical support chatbot. 
You are tasked with judging the correctness of a generated answer based on the user's query, and a reference answer.

You will be given the following information:

<query>
{query}
</query>

<reference_answer>
{reference_answer}
</reference_answer>

<generated_answer>
{generated_answer}
</generated_answer>

Important Instruction: To evaluate the generated answer, follow these steps:

1. Intent Analysis: Consider the underlying intent of the query.
2. Relevance: Check if the generated answer addresses all aspects of the question.
3. Accuracy: Compare the generated answer to the reference answer for completeness and correctness.
4. Trustworthiness: Measure how trustworthy the generated answer is when compared to the reference.

Assign a score on an integer scale of 0 to 2 with the following meanings:
- 0 = The generated answer is incorrect and does not satisfy any of the criteria.
- 1 = The generated answer is partially correct, contains mistakes or is not factually correct.
- 2 = The generated answer is correct, completely answers the query, does not contain any mistakes, and is factually consistent with the reference answer.

After your analysis, provide your verdict in the following JSON format:

{{
    "reason": "<<Provide a brief explanation for your decision here>>",
    "final_score": <<Provide a score as per the above guidelines>>,
    "decision": "<<Provide your final decision here, either 'correct' or 'incorrect'>>"
}}

Here are some examples of correct output:

Example 1:
{{
    "reason": "The generated answer has the exact details as the reference answer and completely answers the user's query.",
    "final_score": 2,
    "decision": "correct"
}}

Example 2:
{{
    "reason": "The generated answer doesn't match the reference answer and deviates from the user's query.",
    "final_score": 0,
    "decision": "incorrect"
}}

Example 3:
{{
    "reason": "The generated answer follows the same steps as the reference answer. However, it significantly misses the user's intent.",
    "final_score": 1,
    "decision": "incorrect"
}}

Example 4:
{{
    "reason": "The generated answer is not factually correct and includes assumptions about code methods completely different from the reference answer",
    "final_score": 0,
    "decision": "incorrect"
}}

Please provide your evaluation based on the given information and format your response according to the specified JSON structure.
"""

In [None]:
# Async Cohere client used by the correctness judge; the API key is read from the environment.
client = cohere.AsyncClient(api_key=os.environ["CO_API_KEY"])

@weave.op()
async def evaluate_correctness_using_llm_judge(question: str, answer: str, model_output: str) -> Dict[str, Any]:
    """
    Ask the LLM judge whether ``model_output`` correctly answers ``question``.

    :param question: The user question
    :param answer: Reference answer
    :param model_output: Answer generated by the pipeline
    :return: The judge's reply parsed as JSON
    :raises json.JSONDecodeError: if the reply is not valid JSON
    """
    judge_prompt = CORRECTNESS_EVAL_PROMPT.format(
        query=question,
        reference_answer=answer,
        generated_answer=model_output,
    )
    judge_reply = await client.chat(
        message=judge_prompt,
        model="command-r-plus",
        temperature=0.0,
        max_tokens=150,
    )
    verdict = json.loads(judge_reply.text)
    return verdict


In [None]:
# NOTE: `response_scorers` and `response_scores` are rebound here, replacing the values
# assigned by the earlier Response_Evaluation cell.
response_scorers = [evaluate_correctness_using_llm_judge]
correctness_evaluations = weave.Evaluation(
    name="Correctness_Evaluation",
    dataset=eval_samples, 
    scorers=response_scorers, 
    # Only the question is forwarded to the pipeline, as `query`.
    preprocess_model_input=lambda x: {"query": x["question"]})
# `evaluate` is a coroutine; run it to completion against the full RAG pipeline.
response_scores = asyncio.run(correctness_evaluations.evaluate(rag_pipeline))


## Exercise

1. Implement the `Relevance` and `Faithfulness` evaluators and evaluate the pipeline on all the dimensions.
2. Generate and share a W&B report with the following sections in the form of tables and charts:
    
    - Summary of the evaluation
    - Retrieval Evaluations
        - IR Metrics
        - LLM As a Retrieval Judge Metric
    - Response Evaluations
        - Traditional NLP Metrics
        - LLM Judgement Metrics
    - Overall Evaluations
    - Conclusion
