# WIP # Chapter 2:

**Comprehensive Evaluation Strategies**

In [None]:
%load_ext autoreload
%autoreload 2


import json
import pathlib

import nest_asyncio
import pandas as pd

import wandb

# Patch asyncio so that `asyncio.run(...)` can be called from inside the
# notebook's already-running event loop; the evaluation cells rely on this.
nest_asyncio.apply()
import asyncio

import weave
from dotenv import load_dotenv

from scripts.utils import display_source

# Load environment variables from a local `.env` file, if one is present
# (presumably the API keys needed by the services used below; confirm for your setup).
load_dotenv()

## Building and improving an evaluation dataset

### Collecting data for evaluation
Get data from the docs website [FAQs](https://docs.wandb.ai/guides/technical-faq) to test the system.

In [None]:
# from datetime import datetime
# # TODO: Remove this once we move to the final project
# run = wandb.init(
#     entity="rag-course",
#     project="dev",
#     group="Chapter 2",
# )
# eval_artifact = wandb.Artifact(
#     name="eval_dataset",
#     type="dataset",
#     description="Evaluation dataset for RAG",
#     metadata={
#         "total_samples": {"easy_eval": 20, "hard_eval": 50, "test": 100},
#         "date_collected": datetime.now().strftime("%Y-%m-%d"),
#         "chapter": "Chapter 2",
#     },
# )
# eval_artifact.add_dir("../data/eval")
# run.log_artifact(eval_artifact)
# run.finish()

In [None]:
# Weights & Biases workspace that both the run and the Weave client point at.
WANDB_ENTITY = "rag-course"
WANDB_PROJECT = "dev"

# Opt in to wandb's "core" feature before any run is created
# (NOTE(review): confirm this is still required by the installed wandb version).
wandb.require("core")

# Start the run; later cells fetch artifacts through this `run` handle.
run = wandb.init(entity=WANDB_ENTITY, project=WANDB_PROJECT, group="Chapter 2")

# Initialise Weave against the same "<entity>/<project>" path.
weave_client = weave.init("/".join((WANDB_ENTITY, WANDB_PROJECT)))

In [None]:
# Resolve the latest evaluation-dataset artifact through the active run and
# download its files into the local data directory.
eval_artifact = run.use_artifact(
    f"{WANDB_ENTITY}/{WANDB_PROJECT}/eval_dataset:latest",
    type="dataset",
)
eval_dir = eval_artifact.download("../data/eval")

# The file holds one JSON record per line. Keep both views of it: a DataFrame
# for display, and a list of dicts that is passed to the evaluations as `dataset=`.
eval_dataset_path = f"{eval_dir}/eval_dataset.jsonl"
eval_dataset = pd.read_json(eval_dataset_path, orient="records", lines=True)
eval_samples = eval_dataset.to_dict(orient="records")
eval_dataset

### Evaluating the Retriever

This is a search problem, so it's easiest to start with traditional information retrieval metrics.


ref: https://weaviate.io/blog/retrieval-evaluation-metrics

**TODO** Add weave model and evals in this section

In [None]:
# Reload the data from Chapter 1
chunked_artifact = run.use_artifact(
    f"{WANDB_ENTITY}/{WANDB_PROJECT}/chunked_data:latest", type="dataset"
)
artifact_dir = chunked_artifact.download()
chunked_data_file = pathlib.Path(artifact_dir) / "documents.jsonl"

# Parse the JSONL file one physical line at a time by iterating the file handle.
# `str.splitlines()` would also split on Unicode line boundaries such as U+2028,
# U+2029 or form feed, which may appear unescaped inside a JSON string and would
# cut a record in half. Blank lines are skipped so they do not reach `json.loads`.
with chunked_data_file.open() as jsonl_file:
    chunked_data = [json.loads(line) for line in jsonl_file if line.strip()]

# Preview the first two records.
chunked_data[:2]

In [None]:
from scripts.retriever import TFIDFRetriever

# Show the retriever class inline (presumably renders its source code) so the
# reader can inspect the implementation before it is used.
display_source(TFIDFRetriever)

# Instantiate the retriever and index the chunked documents loaded earlier.
retriever = TFIDFRetriever()
retriever.index_data(chunked_data)

In [None]:
from scripts.retrieval_metrics import IR_METRICS, LLM_METRICS as RETRIEVAL_LLM_METRICS

# `display_source` is already imported in the notebook's first cell, so it is
# not imported a second time here.
# Show each traditional IR scorer (presumably its source code) before it is used.
for scorer in IR_METRICS:
    display_source(scorer)

#### Evaluating retrieval on other metrics

In [None]:
# Score the retriever with the traditional IR metrics. Each dataset row is
# mapped to the model's inputs: its "question" becomes `query`, with `k` fixed at 5.
retrieval_evaluation = weave.Evaluation(
    name="Retrieval_Evaluation",
    dataset=eval_samples,
    scorers=IR_METRICS,
    preprocess_model_input=lambda row: {"query": row["question"], "k": 5},
)

# `evaluate(...)` is awaited to completion synchronously via `asyncio.run`.
retrieval_scores = asyncio.run(
    retrieval_evaluation.evaluate(retriever)
)

### Using an LLM as a Retrieval Judge

**ref: https://arxiv.org/pdf/2406.06519**

How do we evaluate if we don't have any ground truth? 

We can use a powerful LLM as a judge to evaluate the retriever. 


In [None]:
# Show each LLM-based retrieval scorer inline (presumably renders its source code)
# before it is used in the evaluation.
for metric in RETRIEVAL_LLM_METRICS:
    display_source(metric)

In [None]:
# Re-run the retrieval evaluation, this time scored by the LLM-based metrics.
# Rows are mapped to the same inputs as before: "question" -> `query`, `k` = 5.
# Note that this rebinds `retrieval_evaluation` and `retrieval_scores`.
retrieval_evaluation = weave.Evaluation(
    name="LLM_Judge_Retrieval_Evaluation",
    dataset=eval_samples,
    scorers=RETRIEVAL_LLM_METRICS,
    preprocess_model_input=lambda row: {"query": row["question"], "k": 5},
)

# `evaluate(...)` is awaited to completion synchronously via `asyncio.run`.
retrieval_scores = asyncio.run(
    retrieval_evaluation.evaluate(retriever)
)

## Evaluating the Response

In [None]:
from scripts.rag_pipeline import SimpleRAGPipeline
from scripts.response_generator import SimpleResponseGenerator

# Read the system prompt through pathlib so the file handle is closed as soon
# as the text is read; a bare `open(...).read()` leaves the handle open until
# it happens to be garbage-collected.
INITIAL_PROMPT = pathlib.Path("prompts/initial_system.txt").read_text()

# Response generator configured with the "command-r" model and the initial system prompt.
response_generator = SimpleResponseGenerator(model="command-r", prompt=INITIAL_PROMPT)

# Combine the retriever and the generator into one pipeline
# (`top_k=5` presumably caps the number of retrieved chunks; confirm in SimpleRAGPipeline).
rag_pipeline = SimpleRAGPipeline(
    retriever=retriever, response_generator=response_generator, top_k=5
)

In [None]:
from scripts.response_metrics import NLP_METRICS, LLM_METRICS as RESPONSE_LLM_METRICS
# Show each NLP response scorer inline (presumably renders its source code)
# before it is used in the evaluation.
for scorer in NLP_METRICS:
    display_source(scorer)

In [None]:
# Score the full RAG pipeline's answers with the NLP metrics. Each dataset
# row is reduced to the pipeline's only input: its "question" as `query`.
response_evaluations = weave.Evaluation(
    name="Response_Evaluation",
    dataset=eval_samples,
    scorers=NLP_METRICS,
    preprocess_model_input=lambda row: {"query": row["question"]},
)

# `evaluate(...)` is awaited to completion synchronously via `asyncio.run`.
response_scores = asyncio.run(
    response_evaluations.evaluate(rag_pipeline)
)

### Using an LLM as a Response Judge

Some qualities of a response cannot be measured objectively; LLM judges are particularly useful for these more subjective or complex criteria.
We care about correctness, faithfulness, and relevance.

- **Answer Correctness** - Is the generated answer correct compared to the reference, and does it thoroughly answer the user's query?
- **Answer Relevancy** - Is the generated answer relevant and comprehensive?
- **Answer Faithfulness** - Is the generated answer factually consistent with the context document?


In [None]:
# Show each LLM-based response scorer inline (presumably renders its source code)
# before it is used in the evaluation.
for metric in RESPONSE_LLM_METRICS:
    display_source(metric)

In [None]:
# Score the RAG pipeline's answers with the LLM-based response metrics.
# Each dataset row is reduced to the pipeline's only input: "question" -> `query`.
# Note that this rebinds `response_scores` from the NLP-metric evaluation.
correctness_evaluations = weave.Evaluation(
    name="Correctness_Evaluation",
    dataset=eval_samples,
    scorers=RESPONSE_LLM_METRICS,
    preprocess_model_input=lambda row: {"query": row["question"]},
)

# `evaluate(...)` is awaited to completion synchronously via `asyncio.run`.
response_scores = asyncio.run(
    correctness_evaluations.evaluate(rag_pipeline)
)

## Exercise

1. Implement the `Relevance` and `Faithfulness` evaluators and evaluate the pipeline on all the dimensions.
2. Generate and share a W&B report with the following sections in the form of tables and charts:
    
    - Summary of the evaluation
    - Retrieval Evaluations
        - IR Metrics
        - LLM As a Retrieval Judge Metric
    - Response Evaluations
        - Traditional NLP Metrics
        - LLM Judgement Metrics
    - Overall Evaluations
    - Conclusion
