In [None]:
# WIP Initial simple RAG example with eval

In [None]:
import difflib
import json
import os
import pathlib
from datetime import datetime
from typing import List, Dict, Any

import dotenv
import Levenshtein
import nest_asyncio
import numpy as np
import pandas as pd
import wandb
from IPython.display import Markdown
from nltk import word_tokenize
from nltk.translate import meteor
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from openai import AsyncOpenAI
from ranx import Qrels, Run, evaluate
from rouge import Rouge
from scipy.spatial.distance import cdist
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nest_asyncio.apply()
import asyncio

dotenv.load_dotenv()

In [None]:
# W&B entity and project under which the run is created; the same pair is used
# below to build the fully-qualified names of the artifacts that are consumed.
WANDB_ENTITY = "parambharat"
WANDB_PROJECT = "advanced_rag"

# Request wandb's "core" feature before the run is started.
wandb.require("core")

# Start the run; every `run.use_artifact` / `run.log_artifact` call in this
# notebook goes through this object.
run = wandb.init(
    entity=WANDB_ENTITY,
    project=WANDB_PROJECT,
    job_type="data_ingestion",
    group="initial_example",
)

## Data ingestion

### Loading the data

In [None]:
# Declare the latest version of the documentation dataset as an input of this run.
documents_artifact = run.use_artifact(
    f"{WANDB_ENTITY}/{WANDB_PROJECT}/documentation_data:latest", type="dataset"
)
# Target directory for the download, relative to the notebook's working directory.
data_dir = "../data/wandb_docs"

# download() returns the location of the downloaded files; it is wrapped in a
# pathlib.Path in the next cell.
docs_dir = documents_artifact.download(data_dir)

In [None]:
# Normalise the download location to a Path and collect every Markdown file
# below it (recursively), in sorted order.
docs_dir = pathlib.Path(docs_dir)
docs_files = sorted(docs_dir.rglob("*.md"))

# Sanity check: how many files were found and what the first few are.
print(f"Number of files: {len(docs_files)}\n")
print(
    "First 5 files:\n{files}".format(
        files="\n".join(str(path) for path in docs_files[:5])
    )
)

In [None]:
# Let's look at an example file.
# Decode explicitly as UTF-8 so the preview does not depend on the platform's
# default encoding (which is not UTF-8 on every system).
print(docs_files[0].read_text(encoding="utf-8"))

In [None]:
# We'll store the files as dictionaries with some content and metadata
data = []
for file in docs_files:
    # Decode explicitly as UTF-8: the platform default encoding can fail on, or
    # silently garble, non-ASCII characters in the documentation.
    content = file.read_text(encoding="utf-8")
    data.append(
        {
            "content": content,
            "metadata": {
                # Path of the file relative to the downloaded docs directory.
                "source": str(file.relative_to(docs_dir)),
                # Whitespace-separated word count (not model tokens).
                "raw_tokens": len(content.split()),
            },
        }
    )
data[:2]

In [None]:
# Add up the per-document word counts recorded while loading the files.
total_tokens = sum(doc["metadata"]["raw_tokens"] for doc in data)
print(f"Total Tokens in dataset: {total_tokens}")

In [None]:
# Let's store the raw data in an artifact for future use and reproducibility

raw_artifact = wandb.Artifact(
    name="raw_data",
    type="dataset",
    description="Raw wandb documentation",
    metadata={
        "total_files": len(data),
        "date_processed": datetime.now().strftime("%Y-%m-%d"),
        "total_raw_tokens": total_tokens,
    },
)
# One JSON document per line (JSON Lines).
with raw_artifact.new_file("documents.jsonl", mode="w") as f:
    for item in data:
        f.write(json.dumps(item) + "\n")
# log_artifact() uploads asynchronously. Wait for it to finish so that the
# `raw_data:latest` lookup in a later cell resolves to the version logged here
# rather than to a stale (or not yet existing) one.
run.log_artifact(raw_artifact).wait()

### Chunking the data

In [None]:
# These are hyperparameters of our ingestion pipeline

CHUNK_SIZE = 500
CHUNK_OVERLAP = 0


def split_into_chunks(
    text: str, chunk_size: int = CHUNK_SIZE, chunk_overlap: int = CHUNK_OVERLAP
) -> List[str]:
    """Split ``text`` into chunks of at most ``chunk_size`` words.

    "Words" are the whitespace-separated pieces returned by ``str.split()``;
    no model tokenizer is involved. Consecutive chunks share ``chunk_overlap``
    words, and the words of each chunk are re-joined with single spaces.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk. Must be positive.
        chunk_overlap: Number of words shared between consecutive chunks.
            Must satisfy ``0 <= chunk_overlap < chunk_size``.

    Returns:
        The chunks in document order; an empty list when ``text`` contains no
        words.

    Raises:
        ValueError: If ``chunk_size`` or ``chunk_overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= chunk_overlap < chunk_size:
        # An overlap >= chunk_size would never advance the window (endless
        # loop); a negative overlap would silently skip words between chunks.
        raise ValueError(
            "chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size, "
            f"got chunk_overlap={chunk_overlap}, chunk_size={chunk_size}"
        )

    tokens = text.split()
    step = chunk_size - chunk_overlap
    chunks = []
    for start in range(0, len(tokens), step):
        end = start + chunk_size
        chunks.append(" ".join(tokens[start:end]))
        if end >= len(tokens):
            # This chunk already reaches the end of the text; a further window
            # would only repeat words that are part of its overlap.
            break
    return chunks

In [None]:
# We'll re-use the raw dataset from the artifact in our previous step


# Resolve the latest version of the `raw_data` artifact and download its files.
raw_artifact = run.use_artifact(
    f"{WANDB_ENTITY}/{WANDB_PROJECT}/raw_data:latest", type="dataset"
)
artifact_dir = raw_artifact.download()
raw_data_file = pathlib.Path(f"{artifact_dir}/documents.jsonl")
# The file holds one JSON document per line, so each line is parsed on its own.
raw_data = list(map(json.loads, raw_data_file.read_text().splitlines()))
raw_data[:2]

In [None]:
# Split every raw document into chunks. Each chunk keeps the source path of the
# document it came from and records its own whitespace-separated word count.
chunked_data = [
    {
        "content": piece,
        "metadata": {
            "source": raw_doc["metadata"]["source"],
            "raw_tokens": len(piece.split()),
        },
    }
    for raw_doc in raw_data
    for piece in split_into_chunks(raw_doc["content"])
]

### Cleaning the data

In [None]:
# some of our examples have special tokens that we need to remove otherwise it will break the chat.completions api.


def make_text_tokenization_safe(content: str) -> str:
    """Strip special tokenizer marker strings out of ``content``.

    Every occurrence of each marker listed below is deleted; the rest of the
    text is returned unchanged.

    Args:
        content: The text to sanitise.

    Returns:
        ``content`` with all listed marker strings removed.
    """
    markers = (
        "<|endofprompt|>",
        "<|endoftext|>",
        "<|fim_middle|>",
        "<|fim_prefix|>",
        "<|fim_suffix|>",
    )

    sanitized = content
    for marker in markers:
        sanitized = sanitized.replace(marker, "")
    return sanitized

In [None]:
# Add a cleaned version of every chunk, plus its word count, without touching
# the entries of chunked_data.
cleaned_data = []
for doc in chunked_data:
    cleaned_content = make_text_tokenization_safe(doc["content"])
    cleaned_data.append(
        {
            **doc,
            "cleaned_content": cleaned_content,
            # Build a fresh metadata dict. A shallow doc.copy() shares the
            # nested "metadata" dict with the original, so adding
            # "cleaned_tokens" to it would also modify chunked_data.
            "metadata": {
                **doc["metadata"],
                "cleaned_tokens": len(cleaned_content.split()),
            },
        }
    )
cleaned_data[:2]

In [None]:
# Again, we'll store the cleaned data in an artifact for future use and reproducibility

total_raw_tokens = sum(doc["metadata"]["raw_tokens"] for doc in cleaned_data)
total_cleaned_tokens = sum(doc["metadata"]["cleaned_tokens"] for doc in cleaned_data)

chunked_artifact = wandb.Artifact(
    name="chunked_data",
    type="dataset",
    description="Chunked wandb documentation",
    metadata={
        # NOTE: cleaned_data holds one entry per chunk, so this is the number
        # of chunks rather than the number of source files.
        "total_files": len(cleaned_data),
        "date_processed": datetime.now().strftime("%Y-%m-%d"),
        "total_raw_tokens": total_raw_tokens,
        "total_cleaned_tokens": total_cleaned_tokens,
        "chunk_size": CHUNK_SIZE,
        "chunk_overlap": CHUNK_OVERLAP,
    },
)
# One JSON document per line (JSON Lines).
with chunked_artifact.new_file("documents.jsonl", mode="w") as f:
    for item in cleaned_data:
        f.write(json.dumps(item) + "\n")
# log_artifact() uploads asynchronously. Wait for it to finish so that the
# `chunked_data:latest` lookup in a later cell resolves to the version logged
# here rather than to a stale (or not yet existing) one.
run.log_artifact(chunked_artifact).wait()

## Vectorizing the data

In [None]:
# Now we can re-use the chunked data from the artifact in our previous step

# Resolve the latest version of the `chunked_data` artifact and download its files.
chunked_artifact = run.use_artifact(
    f"{WANDB_ENTITY}/{WANDB_PROJECT}/chunked_data:latest", type="dataset"
)
artifact_dir = chunked_artifact.download()
chunked_data_file = pathlib.Path(f"{artifact_dir}/documents.jsonl")
# The file holds one JSON document per line, so each line is parsed on its own.
# NOTE: this rebinds chunked_data to the records read back from the artifact.
chunked_data = list(map(json.loads, chunked_data_file.read_text().splitlines()))
chunked_data[:2]

In [None]:
# We'll create a simple retriever class to get the most relevant chunks of data for a given query.
# We'll use TF-IDF to vectorize the documents and cosine distance to measure the similarity between the query and the documents.
# Two methods: index_data and search
# index_data will take the data and vectorize it and store the index
# search will take a query and return the most relevant chunks from the index


class Retriever:
    """TF-IDF retriever over a list of chunk dictionaries.

    ``index_data`` fits a ``TfidfVectorizer`` on the chunks' ``cleaned_content``
    and ``search`` ranks the indexed chunks by cosine similarity to a query.
    """

    def __init__(self):
        self.vectorizer = TfidfVectorizer()
        self.index = None  # TF-IDF matrix, one row per indexed chunk
        self.data = None  # the chunk dicts, in the same order as the rows of `index`

    def index_data(self, data):
        """Fit the vectorizer on ``data`` and store the resulting TF-IDF matrix.

        Args:
            data: List of dicts. Each must provide ``"cleaned_content"``;
                ``search`` additionally reads ``["metadata"]["source"]``.
        """
        self.data = data
        docs = [doc["cleaned_content"] for doc in data]
        self.index = self.vectorizer.fit_transform(docs)

    # @weave.op()
    def search(self, query, k=5):
        """Return the ``k`` indexed chunks most similar to ``query``.

        Args:
            query: The query string.
            k: Maximum number of results to return.

        Returns:
            A list of dicts with keys ``"source"``, ``"text"`` and ``"score"``
            (cosine similarity), ordered from most to least similar.
        """
        query_vec = self.vectorizer.transform([query])
        # cosine_similarity works on the sparse matrices directly, so the whole
        # index is not converted to a dense array on every query. It also
        # scores an all-zero query vector (no term in the vocabulary) as 0,
        # where a cosine *distance* can come out as NaN.
        similarities = cosine_similarity(query_vec, self.index)[0]
        # Stable sort keeps ties in index order, so results are reproducible.
        top_k_indices = np.argsort(-similarities, kind="stable")[:k]
        return [
            {
                "source": self.data[idx]["metadata"]["source"],
                "text": self.data[idx]["cleaned_content"],
                "score": float(similarities[idx]),
            }
            for idx in top_k_indices
        ]

In [None]:
# Let's test with a simple query


# Build the TF-IDF index over the chunks loaded from the artifact, then run one
# query; the list of results is the cell's output.
# NOTE(review): the query repeats the word "get" - probably a typo; left as is
# because changing the query text changes what is retrieved.
retriever = Retriever()
retriever.index_data(chunked_data)
retriever.search("How do I get get started with wandb?")

## Generating a response

In [None]:
# Now we are ready to generate a response grounded on the documentation.

# Async OpenAI client; the API key is read from the environment.
client = AsyncOpenAI(api_key=os.environ["OPENAI_API_KEY"])


# @weave.op()
async def generate_response(query):
    """Answer ``query`` with the chat model, grounded on retrieved chunks.

    The module-level ``retriever`` supplies the context, which is placed in the
    system message; the question itself is sent as the user message.

    Args:
        query: The user's question.

    Returns:
        The message content of the first completion choice.
    """
    retrieved = retriever.search(query)
    context_text = "\n".join(
        f"Source: {item['source']}\nText: {item['text']}\n\n" for item in retrieved
    )

    messages = [
        {
            "role": "system",
            "content": "You are a helpful customer support assistant that can answer questions about W&B\n\n"
            "Your answers must be based only on the provided context.\n\n"
            f"<context>\n{context_text}\n</context>",
        },
        {"role": "user", "content": f"Question: {query}\n\nAnswer:"},
    ]

    completion = await client.chat.completions.create(
        model="gpt-3.5-turbo", messages=messages
    )
    return completion.choices[0].message.content

In [None]:
# Run the coroutine to completion and show the generated answer.
# nest_asyncio.apply() was called alongside the imports; presumably that is what
# allows asyncio.run() inside the notebook's already-running event loop.
response = asyncio.run(generate_response("How do I get get started with wandb?"))
print(response)

## Evaluating the RAG system

We use data collected from the docs website [FAQs](https://docs.wandb.ai/guides/technical-faq) to test the system.

### Collecting data for evaluation

In [None]:
# Declare the latest version of the evaluation dataset as an input of this run
# and download it next to the other data.
eval_artifact = run.use_artifact(
    f"{WANDB_ENTITY}/{WANDB_PROJECT}/eval_dataset:latest", type="dataset"
)
eval_dir = eval_artifact.download("../data/eval")
# The file is read as JSON Lines (one record per line).
eval_dataset = pd.read_json(
    f"{eval_dir}/eval_dataset.jsonl", lines=True, orient="records"
)
# List-of-dicts view of the same records; the evaluation code below reads the
# "question", "source" and "answer" keys of each record.
eval_samples = eval_dataset.to_dict(orient="records")
eval_dataset

### Evaluating the Retriever


ref: https://weaviate.io/blog/retrieval-evaluation-metrics

In [None]:
# MRR (Mean Reciprocal Rank) measures retrieval quality by averaging, over all queries, the reciprocal of the rank at which the first relevant document is retrieved (0 if it is not retrieved).
# Let's calculate the MRR score for our retrieval system


def calculate_mrr(
    retriever: "Retriever", eval_samples: List[Dict[str, Any]], k: int = 5
) -> pd.DataFrame:
    """Compute the reciprocal rank of the expected source for every sample.

    Args:
        retriever: Object with a ``search(query, k=...)`` method that returns a
            ranked list of dicts, each with a ``"source"`` key.
        eval_samples: Records with a ``"question"`` and the expected
            ``"source"``.
        k: Number of results requested from the retriever per query.

    Returns:
        A DataFrame with one row per sample and the columns ``"Query"``,
        ``"Expected Source"`` and ``"MRR Score"`` (float). The score is
        ``1 / rank`` of the first result whose source matches, or ``0.0`` when
        the expected source is not among the top ``k`` results. The frame has
        these columns even when ``eval_samples`` is empty.
    """
    columns = ["Query", "Expected Source", "MRR Score"]
    results = []
    for sample in eval_samples:
        query = sample["question"]
        expected_source = sample["source"]

        search_results = retriever.search(query, k=k)

        # Reciprocal of the 1-based rank of the first matching result;
        # 0.0 when the expected source is not in the top k.
        mrr_score = next(
            (
                1 / rank
                for rank, result in enumerate(search_results, 1)
                if result["source"] == expected_source
            ),
            0.0,
        )

        results.append(
            {"Query": query, "Expected Source": expected_source, "MRR Score": mrr_score}
        )

    # Passing the columns explicitly keeps the frame well-formed for an empty
    # eval_samples; otherwise the "MRR Score" lookup below raises KeyError.
    df = pd.DataFrame(results, columns=columns)
    df["MRR Score"] = df["MRR Score"].astype(float)  # Ensure MRR Score is float

    return df


# Evaluate: one row per evaluation sample with its reciprocal-rank score
results_df = calculate_mrr(retriever, eval_samples)
display(results_df)

In [None]:
# Calculate and print the overall MRR: the mean of the per-query scores
overall_mrr = results_df["MRR Score"].mean()
print(f"Mean MRR Score: {overall_mrr:.4f}")

# describe() output for the per-query scores, transposed into a single row
display(pd.DataFrame(results_df["MRR Score"].describe()).T)

#### Evaluating retrieval on other metrics

In [None]:
# Other metrics include:
# NDCG (Normalized Discounted Cumulative Gain)
# MAP (Mean Average Precision)
# Hit Rate
# Precision
# Recall
# F1 Score

RETRIEVAL_METRICS = ["ndcg@5", "map@5", "mrr", "hit_rate", "precision", "recall", "f1"]


def evaluate_retriever(retriever, eval_samples, k=5, metrics=RETRIEVAL_METRICS):
    """Score the retriever on ``eval_samples`` with ranx metrics.

    Args:
        retriever: Object with a ``search(query, k=...)`` method that returns
            dicts with ``"source"`` and ``"score"`` keys.
        eval_samples: Records with a ``"question"`` and the expected
            ``"source"``.
        k: Number of results requested from the retriever per query.
        metrics: Metric names passed to ``ranx.evaluate``.

    Returns:
        A DataFrame with the columns of ``eval_samples`` followed by one column
        of per-query scores for each metric.
    """
    # Zero-pad the query ids so that their string order equals their numeric
    # order ("q_02" < "q_10", whereas "q_10" < "q_2").
    # NOTE(review): ranx appears to order queries by id string; if so, unpadded
    # ids misalign the per-query scores with eval_samples in the concat below
    # once there are more than 10 samples - confirm against ranx.
    id_width = len(str(max(len(eval_samples) - 1, 0)))

    qrels_dict = {}
    run_dict = {}
    for i, sample in enumerate(eval_samples):
        query_id = f"q_{i:0{id_width}d}"

        # Assuming relevance of 1 for the correct source
        qrels_dict[query_id] = {sample["source"]: 1}

        # Several retrieved chunks can come from the same source. Keep the best
        # score per source; a plain dict comprehension would keep the last
        # (i.e. lowest-ranked) one and push the source down the ranking.
        best_scores = {}
        for result in retriever.search(sample["question"], k=k):
            source = result["source"]
            score = result["score"]
            if source not in best_scores or score > best_scores[source]:
                best_scores[source] = score
        run_dict[query_id] = best_scores

    # Create Qrels and Run objects (named so as not to shadow the wandb `run`)
    qrels = Qrels(qrels_dict)
    retrieval_run = Run(run_dict)

    # Compute per-query metrics
    score_dict = evaluate(qrels, retrieval_run, metrics, return_mean=False)

    # Combine eval_samples and scores into a DataFrame
    results_df = pd.concat(
        [pd.DataFrame(eval_samples), pd.DataFrame(score_dict)], axis=1
    )

    return results_df

In [None]:
# NOTE: this rebinds results_df, replacing the MRR-only frame from the cells above.
results_df = evaluate_retriever(retriever, eval_samples)
display(results_df)

# Mean of every retrieval metric over all evaluation queries, shown as one row.
print("\nMean Overall Retrieval Scores:")
display(pd.DataFrame(results_df[RETRIEVAL_METRICS].mean()).T)

# describe() output for every retrieval metric, one row per metric.
print("\nOverall Retrieval Score Statistics:")
display(pd.DataFrame(results_df[RETRIEVAL_METRICS].describe()).T)

### Evaluating the Response

In [None]:
# We can measure the similarity of the response to the expected answer using difflib and Levenshtein distance
# These are simple metrics.

# or we can use traditional metrics used to measure generation systems.
# ref: https://blog.paperspace.com/automated-metrics-for-evaluating-generated-text/


def calculate_diff_score(candidate, reference):
    """Return difflib's ``SequenceMatcher`` ratio between the two strings.

    No "junk" function is passed, so every character takes part in matching.
    """
    matcher = difflib.SequenceMatcher(None, candidate, reference)
    return matcher.ratio()


def calculate_levenshtein_score(candidate, reference):
    """Return the similarity of the two strings as given by ``Levenshtein.ratio``."""
    similarity = Levenshtein.ratio(candidate, reference)
    return similarity


def calculate_rouge(candidate, reference):
    """Return the ROUGE-L F-score of ``candidate`` against ``reference``.

    A single candidate/reference pair is scored, so only the first (and only)
    entry of the scorer's result list is read.
    """
    scorer = Rouge(metrics=["rouge-l"], stats="f")
    pair_scores = scorer.get_scores(candidate, reference)[0]
    return pair_scores["rouge-l"]["f"]


def calculate_bleu(candidate, reference):
    """Return the sentence-level BLEU score of ``candidate`` against ``reference``.

    Both strings are tokenised with ``word_tokenize``; the score is smoothed
    with ``SmoothingFunction().method2``.
    """
    reference_tokens = word_tokenize(reference)
    candidate_tokens = word_tokenize(candidate)
    return sentence_bleu(
        [reference_tokens],
        candidate_tokens,
        smoothing_function=SmoothingFunction().method2,
    )


def calculate_meteor(candidate, reference):
    """Return the METEOR score of ``candidate`` against ``reference``.

    Both strings are tokenised with ``word_tokenize`` before scoring.

    Args:
        candidate: The generated text (the hypothesis).
        reference: The expected text.

    Returns:
        The METEOR score as a float.
    """
    reference_tokens = word_tokenize(reference)
    candidate_tokens = word_tokenize(candidate)
    # nltk's meteor takes (references, hypothesis): the list of tokenised
    # references comes first and the tokenised candidate second, the same
    # order as sentence_bleu. METEOR weights recall more than precision, so
    # passing them the other way round changes the score.
    return meteor([reference_tokens], candidate_tokens)


# we can also calculate the cosine similarity between the candidate and the reference using our retriever's vectorizer


def calculate_similarity(candidate, reference):
    """Return the cosine similarity of the two texts in the retriever's TF-IDF space.

    Uses the module-level ``retriever``'s already-fitted vectorizer, so terms
    outside its vocabulary do not contribute.
    """
    pairwise = cosine_similarity(
        retriever.vectorizer.transform([candidate, reference])
    )
    # Row 0 is the candidate, column 1 the reference.
    return pairwise[0][1]



async def evaluate_response(eval_samples):
    """Generate an answer for every sample and score it against the expected answer.

    All answers are requested concurrently with ``asyncio.gather``; the results
    come back in the order of ``eval_samples``.

    Args:
        eval_samples: Records with ``"question"``, ``"source"`` and
            ``"answer"`` keys.

    Returns:
        A list with one dict per sample holding the query, source, expected and
        actual answers, and one entry per generation metric.
    """
    responses = await asyncio.gather(
        *(generate_response(sample["question"]) for sample in eval_samples)
    )

    def score_sample(sample, actual):
        # One result row: the sample's fields plus every metric for this answer.
        expected = sample["answer"]
        return {
            "Query": sample["question"],
            "Source": sample["source"],
            "Expected_Answer": expected,
            "Actual_Answer": actual,
            "Diff_Score": calculate_diff_score(actual, expected),
            "Levenshtein_Score": calculate_levenshtein_score(actual, expected),
            "Rouge(l)_Score": calculate_rouge(actual, expected),
            "BLEU_Score": calculate_bleu(actual, expected),
            "Meteor_Score": calculate_meteor(actual, expected),
            "Similarity_Score": calculate_similarity(actual, expected),
        }

    return [
        score_sample(sample, actual)
        for actual, sample in zip(responses, eval_samples)
    ]

In [None]:
# Generate an answer for every evaluation sample and score it against the
# expected answer; numeric columns are rounded to 4 decimals for display.
generation_eval = asyncio.run(evaluate_response(eval_samples))
generation_eval_df = pd.DataFrame(generation_eval).round(4)
display(generation_eval_df)


# Every column whose name contains "Score" is treated as a generation metric.
GENERATION_METRICS = [col for col in generation_eval_df.columns if "Score" in col]


# Mean of every generation metric over all evaluation samples, shown as one row.
print("\nMean Overall Generation Scores:")
display(pd.DataFrame(generation_eval_df[GENERATION_METRICS].mean()).T)

# describe() output for every generation metric, one row per metric.
print("\nOverall Generation Score Statistics:")
display(pd.DataFrame(generation_eval_df[GENERATION_METRICS].describe()).T)