# Chapter 3 
## Data Ingestion and Preprocessing

One way to improve our RAG system is to improve our data ingestion and preprocessing.

In [None]:
import json
import os
import pathlib
from datetime import datetime
from typing import Dict, List

import dotenv
import numpy as np
import wandb
import cohere

dotenv.load_dotenv()

In [None]:
# W&B entity and project that the run and every artifact in this notebook are addressed by.
WANDB_ENTITY = "rag-course"
WANDB_PROJECT = "dev"

# NOTE(review): presumably opts in to wandb's "core" backend before the run starts —
# confirm against the installed wandb version.
wandb.require("core")

# Single run shared by the cells below for consuming and logging artifacts.
run = wandb.init(
    entity=WANDB_ENTITY,
    project=WANDB_PROJECT,
    group="Chapter 3",
)

In [None]:
# Re-use the raw dataset that the previous step logged as an artifact.
raw_artifact = run.use_artifact(
    f"{WANDB_ENTITY}/{WANDB_PROJECT}/raw_data:latest", type="dataset"
)
artifact_dir = raw_artifact.download()
raw_data_file = pathlib.Path(f"{artifact_dir}/documents.jsonl")
# The file is JSON Lines: one document per line.
raw_data = [json.loads(line) for line in raw_data_file.read_text().splitlines()]
raw_data[:2]

In [None]:
# Earlier we referred to words as tokens. Using a tokenizer lets us define tokens more precisely.
# We'll use the Cohere tokenizer for this example.

# The API key is read from the CO_API_KEY environment variable (KeyError if it is unset).
co = cohere.Client(api_key=os.environ["CO_API_KEY"])

In [None]:
def tokenize_text(text: str) -> List[str]:
    """Tokenize ``text`` for the "command-r" model using the module-level Cohere client ``co``.

    The call passes ``offline=True`` to ``co.tokenize``.

    NOTE(review): every caller in this notebook reads ``.tokens`` on the returned value,
    so the result appears to be a response object rather than the annotated ``List[str]``
    — confirm against the Cohere SDK and correct the annotation.
    """
    return co.tokenize(text=text, model="command-r", offline=True)

In [None]:
# URL of the tokenizer definition (JSON) for each Cohere model, keyed by model name.
# Used below to look up the special tokens that must be stripped from the documents.
tokenizers = {
    "command-r": "https://storage.googleapis.com/cohere-public/tokenizers/command-r.json",
    "command-r-plus": "https://storage.googleapis.com/cohere-public/tokenizers/command-r-plus.json"
}


In [None]:
# Rename the word count stored as "raw_tokens" to "words" and record the real token
# count of the raw content under "tokens".
for doc in raw_data:
    metadata = doc["metadata"]
    # Guard the rename so that re-running this cell does not raise KeyError once
    # "raw_tokens" has already been moved to "words".
    if "raw_tokens" in metadata:
        metadata["words"] = metadata.pop("raw_tokens")
    metadata["tokens"] = len(tokenize_text(doc["content"]).tokens)
raw_data[:2]

## Pre-processing

There is a lot of extra formatting information (markdown elements) that is not very useful to an LLM.

We can remove this information by converting the contents to text. We can also remove any special characters and extra whitespace. 

Special characters here are ones that are defined in the tokenizer and will vary depending on the model used.


In [None]:
# %load scripts/preprocess
import frontmatter
import markdown
from bs4 import BeautifulSoup
import requests
import re

def convert_contents_to_text(contents: str) -> str:
    """Convert a markdown document to plain text.

    The front matter is parsed off and discarded, the remaining markdown is rendered
    to HTML, and the HTML is reduced to text: hyperlinks keep only their text, images
    are removed, and paragraphs that look like JavaScript import statements are dropped.

    Args:
        contents: Raw markdown source of a single document.

    Returns:
        The text of the rendered document.
    """
    _, body = frontmatter.parse(contents)

    # Render the markdown to HTML with the extensions listed below.
    # NOTE: "pymdownx.pathconverter" is listed twice; the list is kept unchanged.
    html = markdown.markdown(
        body,
        extensions=[
            "toc",
            "pymdownx.extra",
            "pymdownx.blocks.admonition",
            "pymdownx.magiclink",
            "pymdownx.blocks.tab",
            "pymdownx.pathconverter",
            "pymdownx.saneheaders",
            "pymdownx.striphtml",
            "pymdownx.highlight",
            "pymdownx.pathconverter",
            "pymdownx.escapeall"
        ],
    )
    soup = BeautifulSoup(html, "html.parser")

    # Hyperlinks: keep the visible text, drop the tag itself.
    for anchor in soup.find_all('a'):
        anchor.replace_with(anchor.text)

    # Images are removed entirely.
    for image in soup.find_all('img'):
        image.decompose()

    # A paragraph that starts with "import" and contains ";" is treated as a
    # JavaScript import statement and removed.
    for paragraph in soup.find_all('p'):
        if paragraph.text.strip().startswith('import') and ';' in paragraph.text:
            paragraph.decompose()

    return soup.get_text()

def get_special_tokens_set(tokenizer_url):
    """Download a tokenizer definition and return the set of its special tokens.

    Args:
        tokenizer_url: URL of a tokenizer JSON file that has an "added_tokens" list
            whose entries carry a "content" field.

    Returns:
        A set with the "content" string of every entry in "added_tokens".

    Raises:
        requests.RequestException: If the request fails, times out, or the server
            answers with an HTTP error status.
    """
    # https://docs.cohere.com/docs/tokens-and-tokenizers
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(tokenizer_url, timeout=30)
    # Fail with a clear HTTP error instead of a confusing JSON/KeyError further down.
    response.raise_for_status()
    return {tok["content"] for tok in response.json()["added_tokens"]}

def make_text_tokenization_safe(content: str, special_tokens_set: set) -> str:
    """Remove every occurrence of the given special tokens from ``content``.

    Tokens are removed longest first (ties broken alphabetically). This makes the
    result deterministic and ensures that a token containing another token is removed
    as a whole, instead of depending on the iteration order of the set.

    Args:
        content: The text to clean.
        special_tokens_set: Special-token strings to strip from the text.

    Returns:
        The text with all special tokens removed.
    """
    ordered_tokens = sorted(special_tokens_set, key=lambda token: (-len(token), token))

    cleaned_content = content
    for token in ordered_tokens:
        cleaned_content = cleaned_content.replace(token, "")
    return cleaned_content


In [None]:
# Special tokens of the "command-r" tokenizer; they are stripped from every document.
special_tokens_set = get_special_tokens_set(tokenizers["command-r"])
parsed_data = []

for doc in raw_data:
    # dict.copy() is shallow: copy the nested metadata dict too, so that adding
    # "parsed_tokens" below does not silently modify the entries of raw_data.
    parsed_doc = {**doc, "metadata": dict(doc["metadata"])}
    content = convert_contents_to_text(doc["content"])
    parsed_doc["parsed_content"] = make_text_tokenization_safe(
        content, special_tokens_set=special_tokens_set
    )
    parsed_doc["metadata"]["parsed_tokens"] = len(
        tokenize_text(parsed_doc["parsed_content"]).tokens
    )
    parsed_data.append(parsed_doc)
parsed_data[:2]

In [None]:
# Corpus-level totals that are stored as metadata on the artifact.
total_words = sum(doc["metadata"]["words"] for doc in parsed_data)
total_raw_tokens = sum(doc["metadata"]["tokens"] for doc in raw_data)
total_parsed_tokens = sum(doc["metadata"]["parsed_tokens"] for doc in parsed_data)

preprocessed_artifact = wandb.Artifact(
    name="preprocessed_data",
    type="dataset",
    description="Preprocessed wandb documentation",
    metadata={
        "total_files": len(parsed_data),
        "date_preprocessed": datetime.now().strftime("%Y-%m-%d"),
        "total_words": total_words,
        "total_raw_tokens": total_raw_tokens,
        "total_parsed_tokens": total_parsed_tokens,
    },
)
# Write the parsed documents as JSON Lines into the artifact and log it on the run.
with preprocessed_artifact.new_file("documents.jsonl", mode="w") as f:
    f.writelines(json.dumps(item) + "\n" for item in parsed_data)
run.log_artifact(preprocessed_artifact)

## Data Chunking

1. First we split the text into sentences using the [BlingFire](https://github.com/microsoft/BlingFire) library.
2. Then we split the sentences into chunks of a maximum number of tokens.

In [None]:
# Load the preprocessed documents back from the artifact logged above.
preprocessed_artifact = run.use_artifact(
    f'{WANDB_ENTITY}/{WANDB_PROJECT}/preprocessed_data:latest', type='dataset'
)
artifact_dir = preprocessed_artifact.download()
preprocessed_data_file = pathlib.Path(f"{artifact_dir}/documents.jsonl")
# The file is JSON Lines: one document per line.
preprocessed_data = [
    json.loads(line) for line in preprocessed_data_file.read_text().splitlines()
]
preprocessed_data[:2]

In [None]:
# %load scripts/chunking
from blingfire import text_to_sentences
from typing import List, Callable

CHUNK_SIZE = 500


def split_into_chunks(
    text: str, tokenize_text: Callable[[str], List[str]], max_tokens: int = CHUNK_SIZE
) -> List[str]:
    """Split ``text`` into chunks of whole sentences within a token budget.

    The text is split into sentences with BlingFire. Sentences are accumulated into a
    chunk until adding the next one would exceed ``max_tokens``; the chunk is then
    closed and a new one is started. A sentence that is longer than ``max_tokens``
    on its own is dropped.

    Args:
        text: The text to split.
        tokenize_text: Tokenizer callable. Its result must expose a ``tokens``
            attribute, whose length is used as the token count.
        max_tokens: Maximum token budget of a chunk.

    Returns:
        The chunks, each a newline-joined group of sentences. Empty chunks are
        never returned.
    """
    # Split the text into sentences; blank entries (e.g. from empty input) are ignored
    sentences = [
        sentence for sentence in text_to_sentences(text).split("\n") if sentence.strip()
    ]

    # Token count per sentence, including the newline that joins it to the chunk
    n_tokens = [len(tokenize_text("\n" + sentence).tokens) for sentence in sentences]

    chunks = []
    tokens_so_far = 0
    chunk = []

    for sentence, token in zip(sentences, n_tokens):
        # Close the current chunk when this sentence would push it over the budget
        if tokens_so_far + token > max_tokens:
            # Only emit a chunk that has content: without this guard an oversized
            # sentence arriving while nothing is accumulated produced an empty chunk
            if chunk:
                chunks.append("\n".join(chunk))
            chunk = []
            tokens_so_far = 0

        # A sentence that exceeds the budget on its own is skipped
        if token > max_tokens:
            continue

        # Otherwise, add the sentence to the chunk and add its tokens to the total
        chunk.append(sentence)
        tokens_so_far += token + 1

    # Add any remaining chunk
    if chunk:
        chunks.append("\n".join(chunk))

    return chunks


In [None]:
# Flatten every document into its chunks, keeping the source and the chunk's token count.
chunked_data = []
for doc in preprocessed_data:
    source = doc["metadata"]["source"]
    doc_chunks = split_into_chunks(doc["parsed_content"], tokenize_text=tokenize_text)
    chunked_data.extend(
        {
            "cleaned_content": chunk,
            "metadata": {
                "source": source,
                "parsed_tokens": len(tokenize_text(chunk).tokens),
            },
        }
        for chunk in doc_chunks
    )

chunked_data[:2]

In [None]:
# Store the chunked data in an artifact as well, for future use and reproducibility.

total_cleaned_tokens = sum(item["metadata"]["parsed_tokens"] for item in chunked_data)

chunked_artifact = wandb.Artifact(
    name="chunked_data",
    type="dataset",
    description="Chunked wandb documentation",
    metadata={
        # Counts the entries of chunked_data, i.e. the number of chunks.
        "total_files": len(chunked_data),
        "date_processed": datetime.now().strftime("%Y-%m-%d"),
        "total_raw_tokens": total_raw_tokens,
        "total_cleaned_tokens": total_cleaned_tokens,
        "chunk_size": CHUNK_SIZE,
    },
)
# Write the chunks as JSON Lines into the artifact and log it on the run.
with chunked_artifact.new_file("documents.jsonl", mode="w") as f:
    f.writelines(json.dumps(item) + "\n" for item in chunked_data)
run.log_artifact(chunked_artifact)

In [None]:
# %load scripts/retriever
import weave
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cdist

class Retriever(weave.Model):
    """TF-IDF retriever that ranks indexed chunks by cosine similarity to a query."""

    # Vectorizer fitted on the indexed chunks and re-used to embed queries.
    vectorizer: TfidfVectorizer = TfidfVectorizer()
    # Result of fit_transform over the indexed chunks; None until index_data is called.
    index: list = None
    # The indexed records, kept so hits can be mapped back to source and text.
    data: list = None

    @weave.op()
    def index_data(self, data):
        """Fit the vectorizer on the ``cleaned_content`` of ``data`` and store the index."""
        self.data = data
        corpus = [record["cleaned_content"] for record in data]
        self.index = self.vectorizer.fit_transform(corpus)

    @weave.op()
    def search(self, query, k=5):
        """Return the ``k`` indexed records closest to ``query``.

        Each hit is a dict with the record's ``source`` and ``text`` and a ``score``
        equal to one minus the cosine distance to the query.
        """
        query_vec = self.vectorizer.transform([query])
        # cdist returns one row per query; there is a single query, so take row 0.
        distances = cdist(query_vec.todense(), self.index.todense(), metric="cosine")[0]
        # Ascending distance, i.e. most similar first.
        nearest = distances.argsort()[:k]
        return [
            {
                "source": self.data[idx]["metadata"]["source"],
                "text": self.data[idx]["cleaned_content"],
                "score": 1 - distances[idx],
            }
            for idx in nearest
        ]

    @weave.op()
    def predict(self, query: str, k: int):
        """Model entry point; delegates to :meth:`search`."""
        return self.search(query, k)


In [None]:
# Build the retriever, give it a fresh vectorizer, and index the chunked documents.
retriever = Retriever()
retriever.vectorizer = TfidfVectorizer()
retriever.index_data(chunked_data)


In [None]:
# %load scripts/response_generator
import os
from typing import List, Dict
import cohere
import weave

class ResponseGenerator(weave.Model):
    """Generates an answer to a query from retrieved context using the Cohere chat API."""

    # Name of the Cohere model passed to the chat call.
    model: str
    # Text sent as the chat preamble.
    prompt: str
    # Cohere client; always replaced in __init__.
    client: cohere.Client = None
    
    def __init__(self, **kwargs):
        """Initialise the model fields and create a client from the CO_API_KEY env variable."""
        super().__init__(**kwargs)
        self.client = cohere.Client(api_key=os.environ["CO_API_KEY"])

    @weave.op()
    def generate_context(self, context: List[Dict[str, any]]) -> str:
        """Reduce each context item to a dict holding only its "source" and "text".

        NOTE(review): this returns a list of dicts, not the annotated ``str``; also the
        builtin ``any`` is used where ``typing.Any`` was presumably intended.
        """
        return [{"source": item['source'], "text": item['text']} for item in context]
    
    @weave.op()
    def generate_response(self, query: str, context: List[Dict[str, any]]) -> str:
        """Send the query and the reduced context documents to the chat API; return the reply text."""
        contexts = self.generate_context(context)
        response = self.client.chat(
            preamble=self.prompt,
            message=query,
            model=self.model,
            documents=contexts,
            temperature=0.1,
            max_tokens=2000,
        )
        return response.text

    @weave.op()
    def predict(self, query: str, context: List[Dict[str, any]]):
        """Model entry point; delegates to :meth:`generate_response`."""
        return self.generate_response(query, context)


In [None]:
# %load prompts/initial_system
# System prompt (chat preamble) for the response generator.
INITIAL_PROMPT = """
Answer the following question about W&B. Provide a helpful and complete answer based only on the provided documents.
"""


In [None]:
# Response generator backed by the "command-r" model, with INITIAL_PROMPT as its preamble.
response_generator = ResponseGenerator(model="command-r", prompt=INITIAL_PROMPT)

In [None]:
# %load scripts/rag_pipeline
import weave
from typing import Optional, Union
from scripts.retriever import Retriever
from scripts.response_generator import ResponseGenerator


class RAGPipeline(weave.Model):
    """Chains a retriever and a response generator into a single model."""

    # Component whose predict(query, k) returns the context for a query.
    retriever: Union[weave.Model, Retriever] = None
    # Component whose predict(query, context) returns the final answer.
    response_generator: Union[weave.Model, ResponseGenerator] = None
    # Number of results requested from the retriever.
    top_k: int = 5

    @weave.op()
    def predict(self, query: str):
        """Retrieve ``top_k`` context items for ``query`` and generate the answer from them."""
        retrieved_context = self.retriever.predict(query, self.top_k)
        answer = self.response_generator.predict(query, retrieved_context)
        return answer


In [None]:
# Full pipeline: the TF-IDF retriever feeding the response generator (top_k keeps its default).
rag_pipeline = RAGPipeline(retriever=retriever, response_generator=response_generator)

## Evaluate the changes

In [None]:
import pandas as pd

In [None]:
# Download the evaluation dataset artifact and load it both as a DataFrame and as records.
eval_artifact = run.use_artifact(
    f"{WANDB_ENTITY}/{WANDB_PROJECT}/eval_dataset:latest", type="dataset"
)
eval_dir = eval_artifact.download("../data/eval")
eval_dataset_path = f"{eval_dir}/final_eval_dataset.jsonl"
eval_dataset = pd.read_json(eval_dataset_path, lines=True, orient="records")
eval_samples = eval_dataset.to_dict(orient="records")
eval_dataset

In [None]:
# %load scripts/retrieval_metrics.py
import weave
import numpy as np
from typing import List, Dict, Any


@weave.op()
def compute_hit_rate(
    model_output: List[Dict[str, Any]], contexts: List[Dict[str, Any]]
) -> float:
    r"""
    Calculate the hit rate (precision) for a single query.

    Args:
        model_output (List[Dict[str, Any]]): The list of retrieved documents from the model.
            Each dictionary contains:
                - 'source': A unique identifier for the document.
                - 'score': The relevance score of the document.
        contexts (List[Dict[str, Any]]): A list of dictionaries representing the annotated contexts.
            Each dictionary contains:
                - 'source': A unique identifier for the document.
                - 'relevance': The annotated relevance; 0 means not relevant.

    Returns:
        float: The hit rate (precision), or 0.0 when nothing was retrieved.

    The hit rate (precision) measures the proportion of retrieved documents that are relevant.
    It is calculated using the following formula:

    \[ \text{Hit Rate (Precision)} = \frac{\text{Number of Relevant Documents Retrieved}}{\text{Total Number of Documents Retrieved}} \]

    Every retrieved entry is counted, so several results from the same source each
    count as a separate hit.
    """
    search_results = [doc["source"] for doc in model_output]
    # A set gives constant-time membership tests below.
    relevant_sources = {
        context["source"] for context in contexts if context["relevance"] != 0
    }

    if not search_results:
        return 0.0

    # Calculate the number of relevant documents retrieved
    relevant_retrieved = sum(
        1 for source in search_results if source in relevant_sources
    )

    # Calculate the hit rate (precision)
    return relevant_retrieved / len(search_results)


@weave.op
def compute_mrr(
    model_output: List[Dict[str, Any]], contexts: List[Dict[str, Any]]
) -> float:
    r"""
    Calculate the Mean Reciprocal Rank (MRR) for a single query.

    Args:
        model_output (List[Dict[str, Any]]): The list of retrieved documents from the model.
            Each dictionary contains:
                - 'source': A unique identifier for the document.
                - 'score': The relevance score of the document.
        contexts (List[Dict[str, Any]]): A list of dictionaries representing the annotated contexts.
            Each dictionary contains:
                - 'source': A unique identifier for the document.
                - 'relevance': The annotated relevance; 0 means not relevant.

    Returns:
        float: The MRR score for the given query.

    MRR measures the rank of the first relevant document in the result list.
    It is calculated using the following formula:

    \[ \text{MRR} = \frac{1}{\text{rank of first relevant document}} \]

    If no relevant document is found, MRR is 0.

    This metric is useful for evaluating systems where there is typically one relevant document
    and the user is interested in finding that document quickly.
    """
    relevant_sources = {
        context["source"] for context in contexts if context["relevance"] != 0
    }

    # Ranks are 1-based; the first relevant hit decides the score.
    for rank, result in enumerate(model_output, 1):
        if result["source"] in relevant_sources:
            return 1 / rank
    return 0.0


# NDCG (Normalized Discounted Cumulative Gain)
@weave.op
def compute_ndcg(
    model_output: List[Dict[str, Any]], contexts: List[Dict[str, Any]]
) -> float:
    r"""
    Calculate the Normalized Discounted Cumulative Gain (NDCG) for a single query.

    Args:
        model_output (List[Dict[str, Any]]): The list of retrieved documents from the model,
            in ranked order. Each dictionary contains:
                - 'source': A unique identifier for the document.
        contexts (List[Dict[str, Any]]): A list of dictionaries representing the annotated contexts.
            Each dictionary contains:
                - 'source': A unique identifier for the document.
                - 'relevance': The annotated (graded) relevance; 0 means not relevant.

    Returns:
        float: The NDCG score for the given query, between 0 and 1.

    NDCG measures the ranking quality of the search results, taking into account the position of relevant documents.

    NDCG Formula:
    1. Calculate the Discounted Cumulative Gain (DCG):
       \[ \text{DCG}_p = \sum_{i=1}^p \frac{2^{rel_i} - 1}{\log_2(i + 1)} \]
       where \( rel_i \) is the annotated relevance of the document at position \( i \).

    2. Calculate the Ideal Discounted Cumulative Gain (IDCG), which is the DCG of the ideal ranking:
       \[ \text{IDCG}_p = \sum_{i=1}^p \frac{2^{rel_i} - 1}{\log_2(i + 1)} \]
       where the annotated documents are sorted by relevance in descending order.

    3. Normalize the DCG by dividing it by the IDCG to get NDCG:
       \[ \text{NDCG}_p = \frac{\text{DCG}_p}{\text{IDCG}_p} \]

    The gains come from the annotated relevance in ``contexts``, not from the
    retriever's own 'score'. Using the retriever's scores for both DCG and IDCG makes
    the two identical whenever the results are already sorted by score, so the metric
    would always be 1. A source that is retrieved more than once contributes only at
    its first position, which keeps the score at or below 1.
    """
    # Annotated relevance per source; the highest grade wins if a source repeats.
    relevance_by_source = {}
    for context in contexts:
        relevance = context["relevance"]
        if relevance > relevance_by_source.get(context["source"], 0):
            relevance_by_source[context["source"]] = relevance

    # Calculate DCG over the retrieved ranking
    dcg = 0.0
    seen_sources = set()
    for i, result in enumerate(model_output):
        source = result["source"]
        if source in seen_sources:
            continue
        seen_sources.add(source)
        relevance = relevance_by_source.get(source, 0)
        if relevance:
            dcg += (2 ** relevance - 1) / np.log2(
                i + 2
            )  # i+2 because positions are 1-based inside the log

    # Calculate IDCG: the best achievable ranking with the same number of results
    ideal_relevances = sorted(relevance_by_source.values(), reverse=True)
    idcg = 0.0
    for i, relevance in enumerate(ideal_relevances[: len(model_output)]):
        idcg += (2 ** relevance - 1) / np.log2(i + 2)

    # To avoid division by zero
    if idcg == 0:
        return 0.0

    # Calculate nDCG
    return dcg / idcg


# MAP (Mean Average Precision)
@weave.op()
def compute_map(
    model_output: List[Dict[str, Any]], contexts: List[Dict[str, Any]]
) -> float:
    r"""
    Calculate the Mean Average Precision (MAP) for a single query.

    Args:
        model_output (List[Dict[str, Any]]): The list of retrieved documents from the model.
            Each dictionary contains:
                - 'source': A unique identifier for the document.
                - 'score': The relevance score of the document.
        contexts (List[Dict[str, Any]]): A list of dictionaries representing the annotated contexts.
            Each dictionary contains:
                - 'source': A unique identifier for the document.
                - 'relevance': The annotated relevance; 0 means not relevant.

    Returns:
        float: The MAP score for the given query, between 0 and 1.

    MAP provides a single-figure measure of quality across recall levels.
    For a single query, it's equivalent to the Average Precision (AP).
    It's calculated using the following formula:

    \[ \text{MAP} = \frac{\sum_{k=1}^n P(k) \times \text{rel}(k)}{\text{number of relevant documents}} \]

    Where:
    - n is the number of retrieved documents
    - P(k) is the precision at cut-off k in the list
    - rel(k) is an indicator function: 1 if the item at rank k is relevant, 0 otherwise
    MAP considers both precision and recall, as well as the ranking of relevant documents.

    A relevant source counts as a hit only the first time it is retrieved. Otherwise
    several chunks of the same source would be counted repeatedly against a
    denominator of distinct relevant sources, and the score could exceed 1.
    """
    relevant_sources = {
        context["source"] for context in contexts if context["relevance"] != 0
    }

    num_relevant = 0
    sum_precision = 0.0
    counted_sources = set()

    for rank, result in enumerate(model_output, 1):
        source = result["source"]
        if source in relevant_sources and source not in counted_sources:
            counted_sources.add(source)
            num_relevant += 1
            # Precision at this rank, added only at ranks holding a new relevant source
            sum_precision += num_relevant / rank

    if num_relevant == 0:
        return 0.0

    return sum_precision / len(relevant_sources)


@weave.op()
def compute_precision(
    model_output: List[Dict[str, Any]], contexts: List[Dict[str, Any]]
) -> float:
    r"""
    Calculate the Precision for a single query.

    Args:
        model_output (List[Dict[str, Any]]): The list of retrieved documents from the model.
            Each dictionary contains:
                - 'source': A unique identifier for the document.
                - 'score': The relevance score of the document.
        contexts (List[Dict[str, Any]]): A list of dictionaries representing the annotated contexts.
            Each dictionary contains:
                - 'source': A unique identifier for the document.
                - 'relevance': The annotated relevance; 0 means not relevant.

    Returns:
        float: The Precision score for the given query, or 0.0 when nothing was retrieved.

    Precision measures the proportion of retrieved documents that are relevant.
    It is calculated using the following formula:

    \[ \text{Precision} = \frac{\text{Number of Relevant Documents Retrieved}}{\text{Total Number of Documents Retrieved}} \]

    Both sides are compared as sets of sources, so a source retrieved several times
    is counted once.
    """
    relevant_sources = {
        context["source"] for context in contexts if context["relevance"] != 0
    }
    retrieved_sources = {result["source"] for result in model_output}

    if not retrieved_sources:
        return 0.0

    relevant_retrieved = relevant_sources & retrieved_sources
    return len(relevant_retrieved) / len(retrieved_sources)


# Recall
@weave.op()
def compute_recall(
    model_output: List[Dict[str, Any]], contexts: List[Dict[str, Any]]
) -> float:
    r"""
    Calculate the Recall for a single query.

    Args:
        model_output (List[Dict[str, Any]]): The list of retrieved documents from the model.
            Each dictionary contains:
                - 'source': A unique identifier for the document.
                - 'score': The relevance score of the document.
        contexts (List[Dict[str, Any]]): A list of dictionaries representing the annotated contexts.
            Each dictionary contains:
                - 'source': A unique identifier for the document.
                - 'relevance': The annotated relevance; 0 means not relevant.

    Returns:
        float: The Recall score for the given query, or 0.0 when no context is relevant.

    Recall measures the proportion of relevant documents that are retrieved.
    It is calculated using the following formula:

    \[ \text{Recall} = \frac{\text{Number of Relevant Documents Retrieved}}{\text{Total Number of Relevant Documents}} \]

    Both sides are compared as sets of sources, so a source retrieved several times
    is counted once.
    """
    relevant_sources = {
        context["source"] for context in contexts if context["relevance"] != 0
    }
    retrieved_sources = {result["source"] for result in model_output}

    if not relevant_sources:
        return 0.0

    relevant_retrieved = relevant_sources & retrieved_sources
    return len(relevant_retrieved) / len(relevant_sources)


# F1 Score
@weave.op()
def compute_f1_score(
    model_output: List[Dict[str, Any]], contexts: List[Dict[str, Any]]
) -> float:
    r"""
    Calculate the F1-Score for a single query.

    Args:
        model_output (List[Dict[str, Any]]): The list of retrieved documents from the model.
            Each dictionary contains:
                - 'source': A unique identifier for the document.
                - 'score': The relevance score of the document.
        contexts (List[Dict[str, Any]]): A list of dictionaries representing the annotated contexts.
            Each dictionary contains:
                - 'source': A unique identifier for the document.
                - 'relevance': The annotated relevance; 0 means not relevant.

    Returns:
        float: The F1-Score for the given query, or 0.0 when precision and recall are both 0.

    F1-Score is the harmonic mean of Precision and Recall.
    It is calculated using the following formula:

    \[ \text{F1-Score} = 2 \times \frac{\text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}} \]
    """
    precision = compute_precision(model_output, contexts)
    recall = compute_recall(model_output, contexts)

    # The harmonic mean is undefined when both are zero; report 0.0 instead.
    if precision + recall == 0:
        return 0.0

    return 2 * (precision * recall) / (precision + recall)


In [None]:
import nest_asyncio
nest_asyncio.apply()
import asyncio

# Scorers applied to the retriever's output for every evaluation sample.
retrieval_scorers = [
    compute_mrr, compute_ndcg, compute_map,
    compute_hit_rate, compute_precision,
    compute_recall, compute_f1_score
]
# Each sample's "question" is passed to the model as `query`, with k fixed to 5.
retrieval_evaluation = weave.Evaluation(
    name="Retrieval_Evaluation",
    dataset=eval_dataset.to_dict(orient="records"),
    scorers=retrieval_scorers,
    preprocess_model_input=lambda x: {"query": x["question"], "k":5}
)
# Evaluate the retriever on its own (no response generation).
retrieval_scores = asyncio.run(retrieval_evaluation.evaluate(retriever))



In [None]:

# Prompt for the LLM judge that grades one retrieved excerpt against a query.
# It is filled in with str.format(query=..., document=...), which is why the literal
# JSON braces are doubled.
RETRIEVAL_EVAL_PROMPT = """
Given a query and a document excerpt, you must provide a score on an integer scale of 0 to 2 with the following meanings:
    0 = represents that the excerpt is irrelevant to the query,
    1 = represents that the excerpt is somewhat relevant to the query,
    2 = represents that the excerpt is highly relevant to the query.
    

Important Instruction: Assign category 1 if the excerpt is somewhat related to the query but not completely, category 2 if the excerpt only and entirely refers to the query. If neither of these criteria satisfies the query, give it category 0.


Split this problem into steps:
Consider the underlying intent of the query. Measure how well the content matches a likely intent of the query(M).
Measure how trustworthy the excerpt is (T).
Consider the aspects above and the relative importance of each, and decide on a final score (O). 
Final score must be an integer value only.
Do not provide any code in result. Provide each score in the following JSON format: 
{{"final_score": <integer score without providing any reasoning.>}}

## Examples

Example 1: 
<Query>
How do I programmatically access the human-readable run name?
</Query>
<Document>
If you do not explicitly name your run, a random run name will be assigned to the run to help identify the run in the UI. For instance, random run names will look like "pleasant-flower-4" or "misunderstood-glade-2".

If you'd like to overwrite the run name (like snowy-owl-10) with the run ID (like qvlp96vk) you can use this snippet:

import wandb

wandb.init()
wandb.run.name = wandb.run.id
wandb.run.save()

</Document>
{{"final_score": 0}}

Example 2:
<Query>
What are Runs?
</Query>
<Document>
A single unit of computation logged by W&B is called a run. You can think of a W&B run as an atomic element of your whole project. You should initiate a new run when you:
 - Train a model
 - Change a hyperparameter
 - Use a different model
 - Log data or a model as a W&B Artifact
 - Download a W&B Artifact

For example, during a sweep, W&B explores a hyperparameter search space that you specify. Each new hyperparameter combination created by the sweep is implemented and recorded as a unique run. 
</Document>
{{"final_score": 2}}

Example 3:
<Query>
How do I use W&B with Keras ?
</Query>
<Document>
We have added three new callbacks for Keras and TensorFlow users, available from wandb v0.13.4. For the legacy WandbCallback scroll down.
These new callbacks,
 - Adhere to Keras design philosophy
 - Reduce the cognitive load of using a single callback (WandbCallback) for everything
 - Make it easy for Keras users to modify the callback by subclassing it to support their niche use case
</Document>
{{"final_score": 1}}

<Query>
{query}
</Query>

<Document>
{document}
</Document>

"""

In [None]:
# Async Cohere client used by the LLM judge; the key is read from CO_API_KEY.
client = cohere.AsyncClient(api_key=os.environ["CO_API_KEY"])

@weave.op()
async def evaluate_retriever_using_llm_judge(query: str, passage: str) -> str:
    """Ask the "command-r-plus" model to grade how relevant ``passage`` is to ``query``.

    The query and passage are inserted into RETRIEVAL_EVAL_PROMPT.

    Returns:
        The raw text of the model's reply; it is not parsed here.
    """
    response = await client.chat(
        message=RETRIEVAL_EVAL_PROMPT.format(query=query, document=passage),
        model="command-r-plus",
        temperature=0.0,
        # The reply is expected to be a short JSON object, so few tokens are allowed.
        max_tokens=20,
    )
    return response.text


In [None]:
from typing import List, Dict, Any

@weave.op()
async def run_retriever_evaluation_using_llm(eval_samples: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Grade the retriever's top-5 results for every evaluation sample with the LLM judge.

    For each sample, the results for its "question" are judged concurrently, and each
    reply is parsed as JSON to read its "final_score".

    Returns:
        One dict per sample with the "query" and the list of judge "scores", in
        retrieval order.
    """
    results = []
    for sample in eval_samples:
        query = sample["question"]
        retrieved = retriever.search(query, k=5)
        # One judge call per retrieved passage, awaited together.
        judge_calls = [
            evaluate_retriever_using_llm_judge(query, hit["text"]) for hit in retrieved
        ]
        raw_verdicts = await asyncio.gather(*judge_calls)
        sample_scores = [json.loads(verdict)["final_score"] for verdict in raw_verdicts]
        results.append({"query": query, "scores": sample_scores})
    return results
    

In [None]:
# Run the LLM-judge evaluation over all evaluation samples.
llm_judge_retrieval_results = asyncio.run(run_retriever_evaluation_using_llm(eval_samples))

In [None]:
# We now have the judge's scores for each retrieved document: one row per query,
# with the list of scores in the "scores" column.
llm_judge_retrieval_results_df = pd.DataFrame(llm_judge_retrieval_results)

# we can compute the reciprocal rank of the first document that is relevant to the query i.e. rated as 2 by our llm judge.
def compute_rank_score(scores: List[int]) -> float:
    """Return the reciprocal rank of the first document the judge scored as 2.

    Args:
        scores: Judge scores of the retrieved documents, in retrieval order.

    Returns:
        ``1 / rank`` (1-based) of the first score equal to 2, or 0.0 if there is none.
    """
    for rank, score in enumerate(scores, 1):
        if score == 2:
            return 1 / rank
    # Always return a float, matching the annotation.
    return 0.0

# Reciprocal rank of the first highly relevant (score 2) result for every query.
llm_judge_retrieval_results_df["rank_score"] = llm_judge_retrieval_results_df["scores"].map(compute_rank_score)


display(llm_judge_retrieval_results_df)


# Summary over all queries.
print(f"Mean Rank Score: {llm_judge_retrieval_results_df['rank_score'].mean():.4f}")
print(f"Std-dev Rank Score: {llm_judge_retrieval_results_df['rank_score'].std():.4f}")
    

In [None]:
# %load scripts/response_eval
import difflib
import Levenshtein
from nltk import word_tokenize
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from rouge import Rouge
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate import meteor
from nltk.corpus import wordnet as wn
import weave
import re
import string

wn.ensure_loaded()


def normalize_text(text: str) -> str:
    """
    Lowercase the text, strip punctuation and collapse whitespace.

    Args:
        text (str): The input text to normalize.

    Returns:
        str: The normalized text.
    """
    lowered = text.lower()
    # Anything that is neither a word character nor whitespace becomes a space, so
    # tokens joined by punctuation are split apart (e.g. "3.14" -> "3 14").
    spaced = re.sub(r"[^\w\s\d]", " ", lowered)
    # Delete the remaining ASCII punctuation; after the substitution above this is
    # the underscore, which counts as a word character (e.g. "foo_bar" -> "foobar").
    without_punctuation = spaced.translate(str.maketrans("", "", string.punctuation))
    # Collapse whitespace runs into single spaces and trim both ends.
    return re.sub(r"\s+", " ", without_punctuation).strip()


@weave.op()
def compute_diff(model_output: str, answer: str) -> float:
    """
    Score how similar the model output is to the expected answer with difflib.

    Both texts are normalized before they are compared.

    Args:
        model_output (str): The output generated by the model.
        answer (str): The expected answer.

    Returns:
        float: The SequenceMatcher ratio of the two normalized texts.
    """
    matcher = difflib.SequenceMatcher(
        None, normalize_text(model_output), normalize_text(answer)
    )
    return matcher.ratio()


@weave.op()
def compute_levenshtein(model_output: str, answer: str) -> float:
    """
    Score how similar the model output is to the expected answer with the Levenshtein ratio.

    Both texts are normalized before they are compared.

    Args:
        model_output (str): The output generated by the model.
        answer (str): The expected answer.

    Returns:
        float: The Levenshtein ratio of the two normalized texts.
    """
    return Levenshtein.ratio(normalize_text(model_output), normalize_text(answer))


@weave.op()
def compute_rouge(model_output: str, answer: str) -> float:
    """
    Compute the ROUGE-L F1 score between the normalized model output and the reference answer.

    Args:
        model_output (str): The model's generated output.
        answer (str): The reference answer.

    Returns:
        float: The ROUGE-L F1 score, or 0.0 when either text is empty after normalization.
    """
    norm_output = normalize_text(model_output)
    norm_answer = normalize_text(answer)
    # With an empty side there is no overlap to measure; score it as 0.0 rather than
    # handing empty text to the scorer.
    if not norm_output or not norm_answer:
        return 0.0
    rouge = Rouge(metrics=["rouge-l"], stats="f")
    # get_scores takes the hypothesis first and the reference second.
    scores = rouge.get_scores(norm_output, norm_answer)
    return scores[0]["rouge-l"]["f"]


@weave.op()
def compute_bleu(model_output: str, answer: str) -> float:
    """
    Score the model output against the reference answer with sentence-level BLEU.

    Both strings are passed through `normalize_text` and tokenized with
    `word_tokenize`; `SmoothingFunction().method2` is used for smoothing.

    Args:
        model_output (str): The generated output from the model.
        answer (str): The reference answer.

    Returns:
        float: The BLEU score of the tokenized output against the tokenized answer.
    """
    candidate_text = normalize_text(model_output)
    reference_text = normalize_text(answer)

    # sentence_bleu expects a list of tokenized references and one tokenized hypothesis.
    references = [word_tokenize(reference_text)]
    hypothesis = word_tokenize(candidate_text)

    return sentence_bleu(
        references,
        hypothesis,
        smoothing_function=SmoothingFunction().method2,
    )


@weave.op()
def compute_meteor(model_output: str, answer: str) -> float:
    """
    Compute the METEOR score between the normalized model output and the reference answer.

    Both strings are passed through `normalize_text` and tokenized with
    `word_tokenize` before scoring.

    Args:
        model_output (str): The model's generated output.
        answer (str): The reference answer.

    Returns:
        float: The METEOR score rounded to 4 decimal places.
    """
    norm_output = normalize_text(model_output)
    norm_answer = normalize_text(answer)
    reference = word_tokenize(norm_answer)
    candidate = word_tokenize(norm_output)
    # Assuming `meteor` is NLTK's `meteor_score(references, hypothesis)`: the list of
    # tokenized references comes first and the tokenized model output second.
    # METEOR weights precision and recall differently, so the argument order matters.
    return round(meteor([reference], candidate), 4)


In [None]:
# String-similarity scorers applied to every generated response.
# NOTE: compute_meteor is defined in this file but is not part of this list.
response_scorers = [compute_diff, compute_levenshtein, compute_rouge, compute_bleu]

response_evaluations = weave.Evaluation(
    name="Response_Evaluation",
    dataset=eval_samples,
    scorers=response_scorers,
    # Map each sample's "question" field to the "query" key.
    preprocess_model_input=lambda sample: {"query": sample["question"]},
)
response_scores = asyncio.run(response_evaluations.evaluate(rag_pipeline))


In [None]:

# Prompt template for the LLM judge. It is rendered with str.format using the
# {query}, {reference_answer} and {generated_answer} placeholders; literal JSON
# braces are doubled ({{ }}) so that str.format leaves them in place.
# Every few-shot example must be valid JSON, because the judge's reply is parsed as JSON.
CORRECTNESS_EVAL_PROMPT = """
You are a Weights & Biases support expert tasked with evaluating the correctness of answers to questions asked by users to a technical support chatbot. 
You are tasked with judging the correctness of a generated answer based on the user's query, and a reference answer.

You will be given the following information:

<query>
{query}
</query>

<reference_answer>
{reference_answer}
</reference_answer>

<generated_answer>
{generated_answer}
</generated_answer>

Important Instruction: To evaluate the generated answer, follow these steps:

1. Intent Analysis: Consider the underlying intent of the query.
2. Relevance: Check if the generated answer addresses all aspects of the question.
3. Accuracy: Compare the generated answer to the reference answer for completeness and correctness.
4. Trustworthiness: Measure how trustworthy the generated answer is when compared to the reference.

Assign a score on an integer scale of 0 to 2 with the following meanings:
- 0 = The generated answer is incorrect and does not satisfy any of the criteria.
- 1 = The generated answer is partially correct, contains mistakes or is not factually correct.
- 2 = The generated answer is correct, completely answers the query, does not contain any mistakes, and is factually consistent with the reference answer.

After your analysis, provide your verdict in the following JSON format:

{{
    "reason": "<<Provide a brief explanation for your decision here>>",
    "final_score": <<Provide a score as per the above guidelines>>,
    "decision": "<<Provide your final decision here, either 'correct' or 'incorrect'>>"
}}

Here are some examples of correct output:

Example 1:
{{
    "reason": "The generated answer has the exact details as the reference answer and completely answers the user's query.",
    "final_score": 2,
    "decision": "correct"
}}

Example 2:
{{
    "reason": "The generated answer doesn't match the reference answer and deviates from the user's query.",
    "final_score": 0,
    "decision": "incorrect"
}}

Example 3:
{{
    "reason": "The generated answer follows the same steps as the reference answer. However, it significantly misses the user's intent",
    "final_score": 1,
    "decision": "incorrect"
}}

Example 4:
{{
    "reason": "The generated is not factually correct and includes assumptions about code methods completely different from the reference answer",
    "final_score": 0,
    "decision": "incorrect"
}}

Please provide your evaluation based on the given information and format your response according to the specified JSON structure.
"""

In [None]:
# Async Cohere client used by the LLM-judge scorer defined below.
client = cohere.AsyncClient(api_key=os.environ["CO_API_KEY"])


@weave.op()
async def evaluate_correctness_using_llm_judge(question: str, answer: str, model_output: str) -> Dict[str, Any]:
    """
    Ask the "command-r-plus" model to judge a generated answer against a reference.

    The question, reference answer and generated answer are inserted into
    `CORRECTNESS_EVAL_PROMPT`, and the model's reply is parsed as JSON.

    Args:
        question (str): The user's query.
        answer (str): The reference answer.
        model_output (str): The generated answer to be judged.

    Returns:
        Dict[str, Any]: The judge's reply decoded with `json.loads`.
    """
    judge_prompt = CORRECTNESS_EVAL_PROMPT.format(
        query=question,
        reference_answer=answer,
        generated_answer=model_output,
    )
    judge_reply = await client.chat(
        message=judge_prompt,
        model="command-r-plus",
        temperature=0.0,
        max_tokens=150,
    )
    # The reply text is parsed as-is, so json.loads raises if it is not a bare JSON document.
    verdict = json.loads(judge_reply.text)
    return verdict


In [None]:
# Re-bind the scorer list so this run uses only the LLM judge.
response_scorers = [evaluate_correctness_using_llm_judge]

correctness_evaluations = weave.Evaluation(
    name="Correctness_Evaluation",
    dataset=eval_samples,
    scorers=response_scorers,
    # Map each sample's "question" field to the "query" key.
    preprocess_model_input=lambda sample: {"query": sample["question"]},
)
response_scores = asyncio.run(correctness_evaluations.evaluate(rag_pipeline))
