# Chapter 3 
## Data Ingestion and Preprocessing

One way to improve our RAG system is to improve our data ingestion and preprocessing.

In [None]:
import json
import os
import pathlib
import re
from datetime import datetime
from typing import Any, Dict, List

import cohere
import dotenv
import markdown
import numpy as np
import requests
import wandb
from bs4 import BeautifulSoup


dotenv.load_dotenv()

In [None]:
# W&B entity and project under which this notebook's run and artifacts are tracked.
WANDB_ENTITY = "rag-course"
WANDB_PROJECT = "dev"

# Request the "core" feature of the wandb client before the run is started.
wandb.require("core")

# Start the run that the following cells use to consume and log artifacts.
run = wandb.init(
    entity=WANDB_ENTITY,
    project=WANDB_PROJECT,
    group="Chapter 3",
)

In [None]:
# We'll re-use the raw dataset from the artifact in our previous step


# Declare the raw dataset artifact as an input of this run and download its files.
raw_artifact = run.use_artifact(
    f"{WANDB_ENTITY}/{WANDB_PROJECT}/raw_data:latest", type="dataset"
)
artifact_dir = raw_artifact.download()
# The artifact holds one JSON document per line (JSONL); parse every line into a dict.
raw_data_file = pathlib.Path(f"{artifact_dir}/documents.jsonl")
raw_data = list(map(json.loads, raw_data_file.read_text().splitlines()))
# Preview the first two documents.
raw_data[:2]

In [None]:
# Earlier we referred to words as tokens. We can be more correct in defining tokens by using a tokenizer.
# We'll use the Cohere tokenizer for this example.

# Synchronous Cohere client; the API key is read from the CO_API_KEY environment variable
# (a missing variable raises KeyError here).
co = cohere.Client(api_key=os.environ["CO_API_KEY"])

In [None]:
def tokenize_text(text: str) -> "cohere.TokenizeResponse":
    """Tokenize ``text`` with the Cohere ``command-r`` tokenizer.

    Args:
        text: The text to tokenize.

    Returns:
        The response object returned by ``co.tokenize``. It is not a plain
        list: callers in this notebook read its ``.tokens`` attribute and take
        ``len()`` of that to obtain a token count.
    """
    # offline=True is forwarded to the SDK; per the Cohere SDK docs this
    # tokenizes with a locally cached tokenizer instead of calling the API.
    return co.tokenize(text=text, model="command-r", offline=True)

In [None]:
# URLs of the tokenizer definition files (JSON), keyed by Cohere model name.
# The "command-r" entry is fetched later to look up the tokenizer's special tokens.
tokenizers = {
    "command-r": "https://storage.googleapis.com/cohere-public/tokenizers/command-r.json",
    "command-r-plus": "https://storage.googleapis.com/cohere-public/tokenizers/command-r-plus.json"
}


In [None]:
# Rename the word count stored as "raw_tokens" to "words" and add a real
# tokenizer-based count under "tokens".
for doc in raw_data:
    doc_metadata = doc["metadata"]
    # Guarded so that re-running this cell does not raise KeyError once the
    # key has already been renamed.
    if "raw_tokens" in doc_metadata:
        doc_metadata["words"] = doc_metadata.pop("raw_tokens")
    doc_metadata["tokens"] = len(tokenize_text(doc["content"]).tokens)
raw_data[:2]

## Pre-processing

There is a lot of extra formatting information (markdown elements) that is not very useful to an LLM.

We can remove this information by converting the contents to text. We can also remove any special characters and extra whitespace. 

Special characters here are ones that are defined in the tokenizer and will vary depending on the model used.


In [None]:
import frontmatter

# Markdown extensions used to render a document to HTML before the text is
# extracted. Each extension is listed exactly once.
_MARKDOWN_EXTENSIONS = [
    "toc",
    "pymdownx.extra",
    "pymdownx.blocks.admonition",
    "pymdownx.magiclink",
    "pymdownx.blocks.tab",
    "pymdownx.pathconverter",
    "pymdownx.saneheaders",
    "pymdownx.striphtml",
    "pymdownx.highlight",
    "pymdownx.escapeall",
]


def _strip_links_and_images(soup):
    """Replace every hyperlink with its visible text and remove every image."""
    for a_tag in soup.find_all("a"):
        a_tag.replace_with(a_tag.text)
    for img_tag in soup.find_all("img"):
        img_tag.decompose()
    return soup


def _drop_javascript_import_statements(soup):
    """Remove paragraphs that look like JavaScript ``import ...;`` statements."""
    for p in soup.find_all("p"):
        text = p.text.strip()
        # Require "import" as a whole word so prose paragraphs that start with
        # e.g. "important" or "imports" are kept.
        if re.match(r"import\b", text) and ";" in p.text:
            p.decompose()
    return soup


def convert_contents_to_text(contents: str) -> str:
    """Convert a markdown document (optionally with front matter) to plain text.

    The front matter is discarded, the markdown body is rendered to HTML, links
    are reduced to their text, images and JavaScript import paragraphs are
    removed, and the remaining text is returned.

    Args:
        contents: Raw markdown source of one document.

    Returns:
        The text content of the rendered document.
    """
    _, content = frontmatter.parse(contents)
    html_document = markdown.markdown(content, extensions=_MARKDOWN_EXTENSIONS)
    soup = BeautifulSoup(html_document, "html.parser")
    soup = _strip_links_and_images(soup)
    soup = _drop_javascript_import_statements(soup)
    return soup.get_text()

def get_special_tokens_set(tokenizer_url, timeout=30):
    """Download a tokenizer definition and return its special tokens.

    Args:
        tokenizer_url: URL of the tokenizer JSON file.
        timeout: Seconds to wait for the server before giving up, so a stalled
            download cannot hang the notebook indefinitely.

    Returns:
        A set with the ``content`` string of every entry in the file's
        ``added_tokens`` list.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # https://docs.cohere.com/docs/tokens-and-tokenizers
    response = requests.get(tokenizer_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    response.raise_for_status()
    return {tok["content"] for tok in response.json()["added_tokens"]}

# Special tokens of the command-r tokenizer, fetched once and used as the
# default set to strip from document text.
special_tokens_set = get_special_tokens_set(tokenizers["command-r"])


def make_text_tokenization_safe(content: str, special_tokens_set: set=special_tokens_set) -> str:
    """Remove every tokenizer special token from ``content``.

    Args:
        content: The text to clean.
        special_tokens_set: Literal token strings to remove. Defaults to the
            special tokens fetched above.

    Returns:
        ``content`` with all occurrences of the special tokens removed. The
        result is guaranteed not to contain any of the tokens.
    """
    # Longest tokens first (ties broken alphabetically) so the outcome does not
    # depend on set iteration order when one token contains another.
    ordered_tokens = sorted(
        (token for token in special_tokens_set if token),
        key=lambda token: (-len(token), token),
    )

    cleaned_content = content
    # Repeat until nothing changes: deleting a token can join the surrounding
    # text into a new occurrence (e.g. "<P<PAD>AD>" -> "<PAD>"). Each pass only
    # ever shortens the text, so the loop terminates.
    while True:
        stripped = cleaned_content
        for token in ordered_tokens:
            stripped = stripped.replace(token, "")
        if stripped == cleaned_content:
            return cleaned_content
        cleaned_content = stripped

In [None]:
# Build the cleaned dataset: one entry per raw document, with the plain-text
# content and its token count added.
parsed_data = []

for doc in raw_data:
    # dict.copy() is shallow, so the nested "metadata" dict must be copied as
    # well; otherwise "parsed_tokens" would also be written into raw_data.
    parsed_doc = {**doc, "metadata": dict(doc["metadata"])}
    content = convert_contents_to_text(doc["content"])
    parsed_doc["parsed_content"] = make_text_tokenization_safe(content)
    parsed_doc["metadata"]["parsed_tokens"] = len(tokenize_text(parsed_doc["parsed_content"]).tokens)
    parsed_data.append(parsed_doc)
parsed_data[:2]

In [None]:
# Corpus-level totals recorded as artifact metadata: word counts and parsed-token
# counts come from parsed_data, raw token counts from raw_data.
total_words = sum(map(lambda x: x["metadata"]["words"], parsed_data))
total_raw_tokens = sum(map(lambda x: x["metadata"]["tokens"], raw_data))
total_parsed_tokens = sum(map(lambda x: x["metadata"]["parsed_tokens"], parsed_data))

# Dataset artifact describing the preprocessed documents.
preprocessed_artifact = wandb.Artifact(name="preprocessed_data", type="dataset",
description="Preprocessed wandb documentation", metadata={
    "total_files": len(parsed_data),
    "date_preprocessed": datetime.now().strftime("%Y-%m-%d"),
    "total_words": total_words,
    "total_raw_tokens": total_raw_tokens,
    "total_parsed_tokens": total_parsed_tokens,
    }
)
# Write the documents as JSONL (one JSON object per line) into the artifact.
with preprocessed_artifact.new_file("documents.jsonl", mode="w") as f:
    for item in parsed_data:
        f.write(json.dumps(item) + "\n")
# Log the artifact as an output of the current run.
run.log_artifact(preprocessed_artifact)

## Data Chunking

1. First, we split the text into sentences using the [BlingFire](https://github.com/microsoft/BlingFire) library.
2. Then, we group consecutive sentences into chunks that each stay within a maximum number of tokens.

In [None]:
from blingfire import text_to_sentences

In [None]:
# Declare the preprocessed dataset as an input of this run and download it.
# NOTE(review): this resolves the ":latest" alias right after the artifact was logged
# in a previous cell; presumably the upload has finished by then — confirm.
preprocessed_artifact = run.use_artifact(f'{WANDB_ENTITY}/{WANDB_PROJECT}/preprocessed_data:latest', type='dataset')
artifact_dir = preprocessed_artifact.download()
# Parse the JSONL file into a list of document dicts.
preprocessed_data_file = pathlib.Path(f"{artifact_dir}/documents.jsonl")
preprocessed_data = list(map(json.loads, preprocessed_data_file.read_text().splitlines()))
# Preview the first two documents.
preprocessed_data[:2]

In [None]:
#ref: https://platform.openai.com/docs/tutorials/web-qa-embeddings

# Default token budget per chunk.
CHUNK_SIZE=500


def split_into_chunks(text, max_tokens = CHUNK_SIZE):
    """Split ``text`` into chunks of whole sentences within a token budget.

    The text is split into sentences, and consecutive sentences are packed
    into a chunk until adding the next one would exceed ``max_tokens``.

    Args:
        text: The text to split.
        max_tokens: Token budget for one chunk.

    Returns:
        A list of non-empty chunk strings, each made of sentences joined with
        newlines. A single sentence that is longer than ``max_tokens`` on its
        own is dropped and does not appear in any chunk.
    """
    # Split the text into sentences, ignoring blank ones (e.g. for empty text).
    sentences = [
        sentence for sentence in text_to_sentences(text).split("\n") if sentence.strip()
    ]

    # Token count of each sentence, including the newline used to join sentences.
    n_tokens = [len(tokenize_text("\n" + sentence).tokens) for sentence in sentences]

    chunks = []
    tokens_so_far = 0
    chunk = []

    for sentence, token in zip(sentences, n_tokens):
        # The next sentence does not fit: close the current chunk and start a
        # new one. An empty chunk is never emitted.
        if tokens_so_far + token > max_tokens:
            if chunk:
                chunks.append("\n".join(chunk))
            chunk = []
            tokens_so_far = 0

        # A sentence that exceeds the budget on its own cannot be placed in
        # any chunk; skip it.
        if token > max_tokens:
            continue

        chunk.append(sentence)
        # The extra 1 keeps a little headroom per sentence so the joined chunk
        # stays within the budget.
        tokens_so_far += token + 1

    # Flush the sentences collected after the last split; without this the
    # tail of every document would be lost.
    if chunk:
        chunks.append("\n".join(chunk))

    return chunks

In [None]:
# Flatten the corpus into chunk records: every chunk keeps the source of the
# document it came from and gets its own token count.
chunked_data = []
for doc in preprocessed_data:
    chunks = split_into_chunks(doc["parsed_content"])
    for chunk in chunks:
        chunked_data.append(
            {
                "parsed_content" : chunk,
                "metadata": {
                    "source": doc["metadata"]["source"],
                    "parsed_tokens": len(tokenize_text(chunk).tokens)
            }})

# Preview the first two chunks.
chunked_data[:2]

In [None]:
# Again, we'll store the chunked data in an artifact for future use and reproducibility

# Total number of tokens across all chunks.
total_cleaned_tokens = sum(map(lambda x: x["metadata"]["parsed_tokens"], chunked_data))

# NOTE: "total_files" here is the number of chunks (len(chunked_data)), not the
# number of source documents.
chunked_artifact = wandb.Artifact(
    name="chunked_data",
    type="dataset",
    description="Chunked wandb documentation",
    metadata={
        "total_files": len(chunked_data),
        "date_processed": datetime.now().strftime("%Y-%m-%d"),
        "total_raw_tokens": total_raw_tokens,
        "total_cleaned_tokens": total_cleaned_tokens,
        "chunk_size": CHUNK_SIZE,
    },
)
# Write the chunks as JSONL (one JSON object per line) into the artifact.
with chunked_artifact.new_file("documents.jsonl", mode="w") as f:
    for item in chunked_data:
        f.write(json.dumps(item) + "\n")
# Log the artifact as an output of the current run.
run.log_artifact(chunked_artifact)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cdist

class Retriever:
    """TF-IDF retriever that ranks indexed documents by cosine similarity."""

    def __init__(self):
        self.vectorizer = TfidfVectorizer()
        # Sparse TF-IDF matrix of the indexed documents; set by index_data().
        self.index = None
        # The indexed documents themselves, in the same order as the rows of
        # ``index``; set by index_data().
        self.data = None

    def index_data(self, data):
        """Fit the vectorizer on ``data`` and build the TF-IDF index.

        Args:
            data: List of dicts; each must provide the text under
                ``"parsed_content"`` (``search`` additionally reads
                ``["metadata"]["source"]``).
        """
        self.data = data
        docs = [doc["parsed_content"] for doc in data]
        self.index = self.vectorizer.fit_transform(docs)

    def search(self, query, k=5):
        """Return the ``k`` indexed documents most similar to ``query``.

        Args:
            query: The query text.
            k: Maximum number of results.

        Returns:
            A list of dicts with keys ``"source"``, ``"text"`` and ``"score"``
            (cosine similarity, i.e. 1 - cosine distance), best match first.

        Raises:
            RuntimeError: If called before ``index_data``.
        """
        if self.index is None or self.data is None:
            raise RuntimeError("Retriever.search() called before index_data()")

        query_vec = self.vectorizer.transform([query])
        if query_vec.nnz == 0:
            # The query shares no term with the vocabulary: the cosine distance
            # is undefined, so treat every document as maximally dissimilar.
            cosine_distances = np.ones(self.index.shape[0])
        else:
            # toarray() yields plain ndarrays (todense() returns np.matrix).
            cosine_distances = cdist(
                query_vec.toarray(), self.index.toarray(), metric="cosine"
            )[0]
            # Documents with an all-zero vector produce NaN; score them as 0.
            cosine_distances = np.nan_to_num(cosine_distances, nan=1.0)

        top_k_indices = cosine_distances.argsort()[:k]
        return [
            {
                "source": self.data[idx]["metadata"]["source"],
                "text": self.data[idx]["parsed_content"],
                "score": 1 - cosine_distances[idx],
            }
            for idx in top_k_indices
        ]

In [None]:
# Let's test with a simple query


# Build the TF-IDF index over all chunks.
retriever = Retriever()
retriever.index_data(chunked_data)

# Retrieve with the default k and print each result dict (source, text, score).
query = "How do I use W&B to log metrics in my training script?"
search_results = retriever.search(query)
for result in search_results:
    print(result)

In [None]:
# Now we are ready to generate a response grounded on the documentation.


class ResponseGenerator:
    """Generates an answer to a query from retrieved documents via Cohere chat."""

    def __init__(self, model: str, prompt: str):
        """Create the generator.

        Args:
            model: Name of the Cohere chat model to call.
            prompt: Text sent as the chat preamble on every request.
        """
        # The API key is read from the CO_API_KEY environment variable.
        self.client = cohere.Client(api_key=os.environ["CO_API_KEY"])
        self.model = model
        self.prompt = prompt

    # @weave.op()
    def generate_response(self, query: str, context: List[Dict[str, Any]]) -> str:
        """Answer ``query`` using ``context`` as the grounding documents.

        Args:
            query: The user question.
            context: Retrieved items; each must have ``"source"`` and
                ``"text"`` keys. Any other keys (e.g. ``"score"``) are not
                forwarded to the model.

        Returns:
            The text of the chat response.
        """
        documents = [{"source": item["source"], "text": item["text"]} for item in context]
        response = self.client.chat(
            preamble=self.prompt,
            message=query,
            model=self.model,
            documents=documents,
            temperature=0.1,
            max_tokens=2000,
        )
        return response.text

In [None]:
# Preamble sent to the chat model with every question.
PROMPT = "Answer the following question about W&B. Provide a helpful and complete answer based only on the provided documents."

In [None]:
# Generate an answer for the test query, grounded on the chunks retrieved above.
response_generator = ResponseGenerator(model="command-r", prompt=PROMPT)
answer = response_generator.generate_response(query, search_results)
print(answer)

In [None]:
class RAGPipeline:
    """Retrieve-then-generate pipeline: looks up context for a query and answers it."""

    def __init__(self, retriever: Retriever, response_generator: ResponseGenerator, top_k: int = 5):
        """Store the two components and the number of documents to retrieve per query."""
        self.top_k = top_k
        self.response_generator = response_generator
        self.retriever = retriever

    def __call__(self, query: str):
        """Retrieve ``top_k`` documents for ``query`` and return the generated answer."""
        retrieved = self.retriever.search(query, self.top_k)
        generated = self.response_generator.generate_response(query, retrieved)
        return generated

In [None]:
# Run the full pipeline end to end, retrieving 10 chunks as context.
rag_pipeline = RAGPipeline(retriever, response_generator, top_k=10)
response = rag_pipeline(query="Where do I find my API Key?")
print(response)

## Eval the changes

In [None]:
import pandas as pd

In [None]:
# Declare the evaluation dataset as an input of this run and download it to ../data/eval.
eval_artifact = run.use_artifact(
    f"{WANDB_ENTITY}/{WANDB_PROJECT}/eval_dataset:latest", type="dataset"
)
eval_dir = eval_artifact.download("../data/eval")
# Load the JSONL file into a DataFrame (one row per line).
eval_dataset = pd.read_json(
    f"{eval_dir}/eval_dataset.jsonl", lines=True, orient="records"
)
# Also keep the samples as a list of dicts for the evaluation loops below.
eval_samples = eval_dataset.to_dict(orient="records")
eval_dataset

In [None]:
from ranx import Qrels, Run, evaluate
from typing import Any
from tqdm import tqdm

# ranx metric names computed for every evaluated query.
RETRIEVAL_METRICS = ["ndcg@10", "map@10", "mrr", "hit_rate", "precision", "recall", "f1"]


def evaluate_retriever(retrieved_docs: List[Dict[str, Any]], actual_doc: str) -> Dict[str, Any]:
    """Score one query's retrieval results against the single expected source.

    Args:
        retrieved_docs: Retrieved items, each with ``"source"`` and ``"score"``.
        actual_doc: The source that is considered relevant for the query.

    Returns:
        The result of ranx ``evaluate`` for ``RETRIEVAL_METRICS``.
    """
    qrels = Qrels({"query": {actual_doc: 1}})

    # Several chunks can come from the same source. Keep the best score per
    # source; a plain dict comprehension would keep the last one seen, which
    # is the lowest when results arrive best-first and so deflates the rank.
    best_scores: Dict[str, Any] = {}
    for doc in retrieved_docs:
        source, score = doc["source"], doc["score"]
        if source not in best_scores or score > best_scores[source]:
            best_scores[source] = score

    ranked_run = Run({"query": best_scores})
    return evaluate(qrels, ranked_run, metrics=RETRIEVAL_METRICS)


# Evaluate the retriever on every eval sample: retrieve 10 chunks for the question
# and score them against the sample's expected source.
retrieval_scores = []
for sample in tqdm(eval_samples):
    query = sample["question"]
    expected_source = sample["source"]
    search_results = retriever.search(query, k=10)
    eval_scores = evaluate_retriever(search_results, expected_source)
    retrieval_scores.append({"query": query, **eval_scores})

# One row per query, one column per metric.
retrieval_scores_df = pd.DataFrame(retrieval_scores)
display(retrieval_scores_df)

# Mean of every metric over all queries.
print("\nMean Overall Retrieval Scores:")
display(pd.DataFrame(retrieval_scores_df[RETRIEVAL_METRICS].mean()).T)

# Summary statistics of every metric, as produced by DataFrame.describe().
print("\nOverall Retrieval Score Statistics:")
display(pd.DataFrame(retrieval_scores_df[RETRIEVAL_METRICS].describe()).T)

In [None]:

# Prompt template for the LLM judge that rates how relevant a retrieved excerpt is
# to a query on a 0-3 scale. It is filled with str.format(query=..., document=...);
# the doubled braces {{ }} are escapes that render as literal braces in the JSON example.
RETRIEVAL_EVAL_PROMPT ="""
Given a query and a document excerpt, you must provide a score on an integer scale of 0 to 3 with the following meanings:
    0 = represent that the excerpt has nothing to do with the query,
    1 = represents that the excerpt seems related to the query but does not help answer it,
    2 = represents that the excerpt has some answer for the query, but the answer may be a bit unclear, or hidden amongst extraneous information and
    3 = represents that the excerpt is dedicated to the query and contains the exact answer.

Important Instruction: Assign category 1 if the excerpt is somewhat related to the topic but not completely, category 2 if excerpt presents something very important related to the entire topic but also has some extra information and category 3 if the excerpt only and entirely refers to the topic. If none of the above satisfies give it category 0.

Query: {query}
Document: {document}

Split this problem into steps:
Consider the underlying intent of the query. Measure how well the content matches a likely intent of the query(M).
Measure how trustworthy the excerpt is (T).
Consider the aspects above and the relative importance of each, and decide on a final score (O). 
Final score must be an integer value only.
Do not provide any code in result. Provide each score in the following JSON format: 


{{"final_score": <integer score without providing any reasoning.>}}

"""

In [None]:
# Asynchronous Cohere client used by the LLM judge, so that several excerpts can
# be rated concurrently. The API key is read from the CO_API_KEY environment variable.
client = cohere.AsyncClient(api_key=os.environ["CO_API_KEY"])


async def evaluate_retriever_using_llm_judge(query: str, passage: str) -> str:
    """Ask the LLM judge how relevant ``passage`` is to ``query``.

    Args:
        query: The user question.
        passage: The retrieved excerpt to rate.

    Returns:
        The raw text of the model's reply. The prompt asks for a JSON object
        with a ``final_score`` field; parsing is left to the caller.
    """
    response = await client.chat(
        message=RETRIEVAL_EVAL_PROMPT.format(query=query, document=passage),
        model="command-r-plus",
        temperature=0.0,
        max_tokens=2000,
    )
    return response.text


In [None]:
import nest_asyncio
nest_asyncio.apply()
import asyncio
import json

async def _judge_search_results(query, search_results):
    """Rate every retrieved excerpt for ``query`` concurrently with the LLM judge."""
    return await asyncio.gather(
        *(evaluate_retriever_using_llm_judge(query, result["text"]) for result in search_results)
    )


# Try the judge on the first eval sample.
sample = eval_samples[0]
query = sample["question"]
search_results = retriever.search(query, k=5)
# asyncio.run() is documented to take a coroutine, while asyncio.gather()
# returns a future; wrapping the gather in a coroutine function works with
# and without the nest_asyncio patch.
sample_scores = asyncio.run(_judge_search_results(query, search_results))
sample_scores

In [None]:
def _parse_retrieval_verdict(raw: str) -> int:
    """Extract the integer ``final_score`` from one judge reply.

    Models often wrap the requested JSON in a code fence or surrounding prose,
    so when the reply is not valid JSON as a whole, the span from the first
    ``{`` to the last ``}`` is parsed instead.

    Raises:
        json.JSONDecodeError: If no JSON object can be parsed from the reply.
        KeyError: If the object has no ``final_score`` field.
    """
    try:
        verdict = json.loads(raw)
    except json.JSONDecodeError:
        start, end = raw.find("{"), raw.rfind("}")
        if start == -1 or end < start:
            raise
        verdict = json.loads(raw[start : end + 1])
    # Coerce so that a score returned as "3" or 3.0 still compares equal to 3.
    return int(verdict["final_score"])


async def run_retriever_evaluation_using_llm(eval_samples):
    """Rate the top-5 retrieved excerpts of every eval sample with the LLM judge.

    Samples are processed one after another; the excerpts of one sample are
    rated concurrently.

    Args:
        eval_samples: Iterable of dicts with a ``"question"`` key.

    Returns:
        A list of ``{"query": str, "scores": list[int]}`` dicts, with the
        scores in retrieval rank order.
    """
    scores = []
    for sample in eval_samples:
        query = sample["question"]
        search_results = retriever.search(query, k=5)
        replies = await asyncio.gather(
            *(evaluate_retriever_using_llm_judge(query, result["text"]) for result in search_results)
        )
        sample_scores = [_parse_retrieval_verdict(reply) for reply in replies]
        scores.append({"query": query, "scores": sample_scores})
    return scores
    

In [None]:
# Run the LLM-judge retrieval evaluation over all eval samples.
llm_judge_retrieval_results = asyncio.run(run_retriever_evaluation_using_llm(eval_samples))

In [None]:
# we have the scores for each document
# One row per query: the query text and the list of judge scores in rank order.
llm_judge_retrieval_results_df = pd.DataFrame(llm_judge_retrieval_results)

# The rank score of a query is the reciprocal rank of the first retrieved document
# that the LLM judge rated 3 (fully relevant).
def compute_rank_score(scores: List[int]) -> float:
    """Return ``1 / rank`` of the first score equal to 3, or 0 if there is none.

    Ranks are 1-based and follow the order of ``scores``.
    """
    first_hit = next(
        (position for position, score in enumerate(scores, 1) if score == 3),
        None,
    )
    if first_hit is None:
        return 0
    return 1 / first_hit

# Add the reciprocal-rank score of every query as a new column.
llm_judge_retrieval_results_df["rank_score"] = llm_judge_retrieval_results_df["scores"].map(compute_rank_score)


display(llm_judge_retrieval_results_df)


# Mean and standard deviation of the rank score over all queries.
print(f"Mean Rank Score: {llm_judge_retrieval_results_df['rank_score'].mean():.4f}")
print(f"Std-dev Rank Score: {llm_judge_retrieval_results_df['rank_score'].std():.4f}")


In [None]:
import difflib
import Levenshtein
from sklearn.metrics.pairwise import cosine_similarity
from rouge import Rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate import meteor
from nltk.tokenize import word_tokenize

# We can measure the similarity of the response to the expected answer using difflib and Levenshtein distance
# These are simple metrics.

def calculate_diff_score(candidate, reference):
    """Return the difflib similarity ratio between ``candidate`` and ``reference``.

    The ratio is a float in [0, 1]; 1.0 means the two sequences are identical.
    """
    matcher = difflib.SequenceMatcher(a=candidate, b=reference)
    return matcher.ratio()


def calculate_levenshtein_score(candidate, reference):
    """Return ``Levenshtein.ratio`` of the generated answer and the reference answer."""
    return Levenshtein.ratio(candidate, reference)



# semantic answer similarity. (SAS) - https://arxiv.org/abs/2108.06130
# Originally, one should use a transformer based cross-encoder to measure and classify this. 
# For example, use something from https://sbert.net/docs/cross_encoder/usage/usage.html
# we can also calculate the cosine similarity between the candidate and the reference using our retriever's vectorizer
def calculate_similarity(candidate, reference):
    """Return the cosine similarity of the two texts in the retriever's TF-IDF space.

    Both texts are embedded with the already fitted ``retriever.vectorizer``.
    """
    tfidf_pair = retriever.vectorizer.transform([candidate, reference])
    pairwise = cosine_similarity(tfidf_pair)
    # Row 0 is the candidate, column 1 the reference.
    return pairwise[0][1]




# or we can use traditional metrics used to measure generation systems.
# ref: https://blog.paperspace.com/automated-metrics-for-evaluating-generated-text/

def calculate_rouge(candidate, reference):
    """Return the ROUGE-L F-score of ``candidate`` against ``reference``."""
    scorer = Rouge(metrics=["rouge-l"], stats="f")
    first_pair = scorer.get_scores(candidate, reference)[0]
    return first_pair["rouge-l"]["f"]


def calculate_bleu(candidate, reference):
    """Return the smoothed sentence-level BLEU score of ``candidate`` against ``reference``.

    Both texts are word-tokenized first; smoothing method 2 of NLTK's
    ``SmoothingFunction`` is applied.
    """
    reference_tokens = word_tokenize(reference)
    candidate_tokens = word_tokenize(candidate)
    return sentence_bleu(
        [reference_tokens],
        candidate_tokens,
        smoothing_function=SmoothingFunction().method2,
    )


def calculate_meteor(candidate, reference):
    """Return the METEOR score of ``candidate`` against ``reference``.

    Args:
        candidate: The generated answer.
        reference: The expected answer.

    Returns:
        The METEOR score as a float.
    """
    reference_tokens = word_tokenize(reference)
    candidate_tokens = word_tokenize(candidate)
    # NLTK's signature is meteor_score(references, hypothesis): the list of
    # tokenized references comes first, the tokenized candidate second.
    # METEOR is not symmetric, so the order matters.
    return meteor([reference_tokens], candidate_tokens)


In [None]:
# Re-create the pipeline with the default top_k for the generation evaluation.
rag_pipeline = RAGPipeline(retriever, response_generator)

# For every eval sample, generate an answer and compare it with the expected
# answer using each of the similarity metrics defined above.
response_scores = []
for sample in tqdm(eval_samples):
    query = sample['question']
    actual_answer = rag_pipeline(query)
    expected_answer = sample['answer']
    diff_score = calculate_diff_score(actual_answer, expected_answer)
    levenshtein_score = calculate_levenshtein_score(actual_answer, expected_answer)
    rouge_score = calculate_rouge(actual_answer, expected_answer)
    bleu_score = calculate_bleu(actual_answer, expected_answer)
    meteor_score = calculate_meteor(actual_answer, expected_answer)
    similarity_score = calculate_similarity(actual_answer, expected_answer)

    # Keep the texts next to the scores; the LLM correctness judge reads them later.
    response_scores.append({
        "query": query,
        "expected_answer": expected_answer,
        "actual_answer": actual_answer,
        "diff_score": diff_score,
        "levenshtein_score": levenshtein_score,
        "rouge_score": rouge_score,
        "bleu_score": bleu_score,
        "meteor_score": meteor_score,
        "similarity_score": similarity_score
    })

In [None]:

# One row per query with the texts and all generation metrics.
response_scores_df = pd.DataFrame(response_scores)
display(response_scores_df)

# Metric columns are recognised by having "score" in their name.
GENERATION_METRICS = [col for col in response_scores_df.columns if "score" in col]


# Mean of every metric over all queries.
print("\nMean Overall Generation Scores:")
display(pd.DataFrame(response_scores_df[GENERATION_METRICS].mean()).T)

# Summary statistics of every metric, as produced by DataFrame.describe().
print("\nOverall Generation Score Statistics:")
display(pd.DataFrame(response_scores_df[GENERATION_METRICS].describe()).T)

In [None]:

# Prompt template for the LLM judge that grades a generated answer against a
# reference answer. It is filled with
# str.format(query=..., reference_answer=..., generated_answer=...); the doubled
# braces {{ }} are escapes that render as literal braces in the JSON examples.
CORRECTNESS_EVAL_PROMPT ="""
You are a Weights & Biases support expert tasked with evaluating the correctness of answers to questions asked by users to a technical support chatbot. 
You are tasked with judging the correctness of a generated answer based on the user's query, and a reference answer.

You will be given the following information:

<query>
{query}
</query>

<reference_answer>
{reference_answer}
</reference_answer>

<generated_answer>
{generated_answer}
</generated_answer>

Important Instruction: To evaluate the generated answer, follow these steps:

1. Intent Analysis: Consider the underlying intent of the query.
2. Relevance: Check if the generated answer addresses all aspects of the question.
3. Accuracy: Compare the generated answer to the reference answer for completeness and correctness.
4. Trustworthiness: Measure how trustworthy the generated answer is when compared to the reference.

Assign a score on an integer scale of 0 to 3 with the following meanings:
- 0 = The generated answer is incorrect and does not satisfy any of the criteria.
- 1 = The generated answer is partially correct, contains mistakes or is not factually correct.
- 2 = The generated answer is correct but includes some extra information, is incomplete or misses some evaluation criteria.
- 3 = The generated answer is correct, completely answers the query, does not contain any mistakes, and is factually consistent with the reference answer.

After your analysis, provide your verdict in the following JSON format:

{{
    "reason": "<<Provide a brief explanation for your decision here>>",
    "final_score": <<Provide a score as per the above guidelines>>,
    "decision": "<<Provide your final decision here, either 'correct' or 'incorrect'>>"
}}

Here are some examples of correct output:

Example 1:
{{
    "reason": "The generated answer has the exact details as the reference answer and completely answers the user's query.",
    "final_score": 3,
    "decision": "correct"
}}

Example 2:
{{
    "reason": "The generated answer doesn't match the reference answer and deviates from the user's query.",
    "final_score": 0,
    "decision": "incorrect"
}}

Example 3:
{{
    "reason": "The generated answer follows the same steps as the reference answer. However, it includes assumptions about functions that are not requested in the user's query",
    "final_score": 2,
    "decision": "correct"
}}

Example 4:
{{
    "reason": "The generated answer is incorrect, irrelevant, and not factually correct and completely misses the user's intent.",
    "final_score": 0,
    "decision": "incorrect"
}}

Please provide your evaluation based on the given information and format your response according to the specified JSON structure.
"""

In [None]:
# Asynchronous Cohere client used by the correctness judge, so that several
# answers can be graded concurrently. The API key is read from CO_API_KEY.
client = cohere.AsyncClient(api_key=os.environ["CO_API_KEY"])


async def evaluate_correctness_using_llm_judge(query: str, reference_answer: str, generated_answer: str) -> str:
    """Ask the LLM judge to grade ``generated_answer`` against ``reference_answer``.

    Args:
        query: The user question.
        reference_answer: The expected answer.
        generated_answer: The answer produced by the RAG pipeline.

    Returns:
        The raw text of the model's reply. The prompt asks for a JSON object
        with ``reason``, ``final_score`` and ``decision`` fields; parsing is
        left to the caller.
    """
    response = await client.chat(
        message=CORRECTNESS_EVAL_PROMPT.format(query=query, reference_answer=reference_answer, generated_answer=generated_answer),
        model="command-r-plus",
        temperature=0.0,
        max_tokens=2000,
    )
    return response.text


In [None]:
def _parse_correctness_verdict(raw: str):
    """Parse one judge reply into the JSON object it contains.

    Models often wrap the requested JSON in a code fence or surrounding prose,
    so when the reply is not valid JSON as a whole, the span from the first
    ``{`` to the last ``}`` is parsed instead.

    Raises:
        json.JSONDecodeError: If no JSON object can be parsed from the reply.
    """
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        start, end = raw.find("{"), raw.rfind("}")
        if start == -1 or end < start:
            raise
        return json.loads(raw[start : end + 1])


async def run_correctness_evaluation_using_llm(response_scores):
    """Grade every generated answer against its expected answer with the LLM judge.

    All rows are graded concurrently.

    Args:
        response_scores: Iterable of dicts with ``"query"``,
            ``"expected_answer"`` and ``"actual_answer"`` keys.

    Returns:
        A list with one parsed verdict per row, in the same order as the input.
    """
    tasks = [
        evaluate_correctness_using_llm_judge(
            row["query"], row["expected_answer"], row["actual_answer"]
        )
        for row in response_scores
    ]
    replies = await asyncio.gather(*tasks)
    return [_parse_correctness_verdict(reply) for reply in replies]
    

In [None]:
# Run the LLM-judge correctness evaluation over all generated answers.
llm_judge_correctness_results = asyncio.run(run_correctness_evaluation_using_llm(response_scores))

In [None]:
# Put the judge verdicts next to the metric scores. The column-wise concat lines rows
# up by index, so both frames must list the samples in the same order.
correctness_eval_df = pd.DataFrame(llm_judge_correctness_results)
response_evals_df = pd.concat([response_scores_df, correctness_eval_df], axis=1)
response_evals_df

In [None]:
# Accuracy = share of answers whose judge decision is exactly "correct".
llm_judge_response_accuracy = (response_evals_df["decision"] == "correct").sum()/len(response_evals_df)
print(f"LLM Judge Response Accuracy: {llm_judge_response_accuracy:.4f}")
# Bar chart of how often each final score was assigned.
response_evals_df['final_score'].value_counts().plot(kind="bar");