# Development of a RAG Model for RFP Question Similarity

## Notebook setup

In [None]:
import pandas as pd

In [None]:
# Install/upgrade the LangChain packages (-q: quiet output, -U: upgrade).
# NOTE(review): other packages imported later in this notebook (python-dotenv,
# tabulate, plotly, scikit-learn, ragas, datasets) are not installed by this
# cell - presumably they are expected to be present in the environment.
%pip install -qU langchain langchain-openai langchain-cohere

In [None]:
import os

import dotenv

# Load variables from a local .env file (if present) into the environment.
dotenv.load_dotenv()

# Fail fast with a specific error type. An unset *or empty* key is rejected
# here rather than at the first call to the OpenAI API.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError("OPENAI_API_KEY not found")

In [None]:
import textwrap
from IPython.display import HTML, display
from tabulate import tabulate


def _format_cell_text(text, width=50):
    """Private function to format a cell's text."""
    return "\n".join([textwrap.fill(line, width=width) for line in text.split("\n")])


def _format_dataframe_for_tabulate(df):
    """Return a copy of ``df`` whose object-dtype columns are text-wrapped.

    The input frame is not modified; every column whose dtype is ``object``
    has ``_format_cell_text`` applied to each of its cells.
    """
    wrapped = df.copy()

    # Object dtype is used as the marker for "likely holds strings".
    text_columns = [name for name in wrapped.columns if wrapped[name].dtype == object]
    for name in text_columns:
        wrapped[name] = wrapped[name].apply(_format_cell_text)
    return wrapped


def _dataframe_to_html_table(df):
    """Render ``df`` as an HTML table string using ``tabulate``."""
    return tabulate(
        df.values.tolist(),
        headers=df.columns.tolist(),
        tablefmt="html",
    )


def display_nice(df, num_rows=None):
    """Display ``df`` as a wrapped HTML table.

    When ``num_rows`` is given, only the first ``num_rows`` rows are shown.
    """
    subset = df if num_rows is None else df.head(num_rows)
    wrapped = _format_dataframe_for_tabulate(subset)
    display(HTML(_dataframe_to_html_table(wrapped)))

In [None]:
def print_dict_keys(data, indent=0):
    """Print the keys of ``data``, one per line, depth-first.

    Keys of nested dictionaries are printed right after their parent key,
    indented by four more spaces per nesting level. ``indent`` is the number
    of leading spaces for the top level.
    """
    # Stack of (items iterator, indentation); the top entry is the dict
    # currently being walked.
    pending = [(iter(data.items()), indent)]
    while pending:
        entries, depth = pending[-1]
        for key, value in entries:
            print(' ' * depth + str(key))
            if isinstance(value, dict):
                # Descend now; the parent iterator resumes after the child is done.
                pending.append((iter(value.items()), depth + 4))
                break
        else:
            pending.pop()

## Data preparation

### Load RFPs

In [None]:
# CSV files holding the existing RFP questions
existing_rfp_paths = [
    "datasets/rag/rfp_existing_questions_client_2.csv",
]

# Read every file and stack the rows into one DataFrame with a fresh index
existing_rfp_df = pd.concat(
    [pd.read_csv(csv_path) for csv_path in existing_rfp_paths],
    ignore_index=True,
)

In [None]:
existing_rfp_df

### Create Documents from CSV Files

In [None]:
from langchain_community.document_loaders.csv_loader import CSVLoader

# Collect the documents produced from every CSV file; the "Area" column is
# requested as document metadata.
documents = []
for csv_path in existing_rfp_paths:
    rows_as_documents = CSVLoader(file_path=csv_path, metadata_columns=["Area"]).load()
    documents.extend(rows_as_documents)

When using `CSVLoader`, each document represents a single row and includes its respective contents:

In [None]:
# Preview the first few loaded documents (full representation)
number_of_documents = 5

for position, loaded_document in enumerate(documents[:number_of_documents], start=1):
    print(f"Document {position}: {loaded_document}")

Accessing the page content of each document:

In [None]:
number_of_documents = 2

for position, loaded_document in enumerate(documents[:number_of_documents], start=1):
    # Header, content and a blank separator line in a single call
    print(f"Page content for document {position}:", loaded_document.page_content, "", sep="\n")

Note that when adding metadata, it is appended to the default metadata, which consists of the `row` number and the `source`: 

In [None]:
# How many documents to show
number_of_documents = 5

# Print the metadata dict of each of the first documents (numbered from 1).
# The loop variables `i` and `document` remain defined in the notebook
# namespace after this cell has run.
for i, document in enumerate(documents[:number_of_documents]):
    print(f"Metadata for document {i + 1}: {document.metadata}")

## Split the documents into chunks

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configuration: chunk size 500, overlap 10, and a start index
# requested for every chunk.
splitter_settings = {
    "chunk_size": 500,
    "chunk_overlap": 10,
    "add_start_index": True,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)
chunks = text_splitter.split_documents(documents)

Get some general information about the chunks:

In [None]:
chunk_count = len(chunks)
print(f"Number of chunks: {chunk_count}")

Check the lengths of the longest and shortest chunks, as well as the mean chunk length:

In [None]:
# Measure every chunk once, then summarise
page_lengths = [len(chunk.page_content) for chunk in chunks]

max_chunk_length = max(page_lengths)
min_chunk_length = min(page_lengths)
mean_chunk_length = sum(page_lengths) / len(chunks)

for label, value in (
    ("Maximum", max_chunk_length),
    ("Minimum", min_chunk_length),
    ("Mean", mean_chunk_length),
):
    print(f"{label} chunk length: {value}")

Plot the distribution of chunks: 

In [None]:
import plotly.express as px

# Length (in characters) of each chunk's page_content
chunk_lengths = [len(chunk.page_content) for chunk in chunks]

# Histogram of chunk lengths
fig = px.histogram(chunk_lengths, nbins=50, title="Distribution of Chunk Lengths")
fig.update_layout(
    xaxis_title="Chunk Length",
    yaxis_title="Count",
    bargap=0.2,
    showlegend=False,
)

# Summary statistics shown on the plot. The annotation needs an explicit
# `text`; without it the statistics are never rendered.
summary_text = (
    f"min: {min(chunk_lengths)}<br>"
    f"max: {max(chunk_lengths)}<br>"
    f"mean: {sum(chunk_lengths) / len(chunk_lengths):.1f}"
)
fig.add_annotation(
    x=max(chunk_lengths),
    y=0,
    text=summary_text,
    showarrow=False,
    # Anchor the multi-line label so it sits above the x axis and to the left
    # of the maximum instead of being centred on that point.
    xanchor="right",
    yanchor="bottom",
    yshift=10,
)

# Show the plot
fig.show()

Inspect the chunks: 

In [None]:
number_of_chunks = 5

# Slice by the configured count, not by a loop index left over from an
# earlier cell.
for index, chunk in enumerate(chunks[:number_of_chunks]):
    print(f"Chunk {index + 1}: {chunk}")

See the page content of each chunk:

In [None]:
number_of_chunks = 5

for position, chunk in enumerate(chunks[:number_of_chunks], start=1):
    # Header, content and a blank separator line in a single call
    print(f"Page content for chunk {position}:", chunk.page_content, "", sep="\n")

See the metadata for individual chunks:

In [None]:
number_of_chunks = 5

for position, chunk in enumerate(chunks[:number_of_chunks], start=1):
    chunk_metadata = chunk.metadata
    print(f"Metadata for chunk {position}: {chunk_metadata}")



Access the source of each chunk:

In [None]:
number_of_chunks = 5

for position, chunk in enumerate(chunks[:number_of_chunks], start=1):
    source = chunk.metadata['source']
    print(f"Source for chunk {position}: {source}")

## Store chunks into a vectorstore

In [None]:
# Chroma is imported from langchain_community, the package this notebook
# already uses for CSVLoader, rather than the legacy `langchain.vectorstores`
# path.
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Embedding model used for the stored chunks (and reused later for the
# question/context/answer embeddings)
embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small")

# Embed every chunk and index it in a Chroma vector store
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings_model,
)

## Create evaluation dataset

In [None]:
# New RFP questions that the RAG pipeline will be evaluated on
rag_evaluation_df = pd.read_csv("datasets/rag/rag_evaluation_dataset_01.csv")

# Row count, reused later as the upper bound on how many questions to process
NUM_OF_NEW_RFP_QUESTIONS = rag_evaluation_df.shape[0]

print(f"Number of New RFP Questions: {NUM_OF_NEW_RFP_QUESTIONS}")

In [None]:
rag_evaluation_df.head()

## Create a Retriever 

In [None]:
from langchain_openai import ChatOpenAI

# Chat model (temperature 0) and a retriever configured with k=5
llm = ChatOpenAI(model="gpt-4o", temperature=0.0)

retriever_search_kwargs = {"k": 5}
retriever = vectorstore.as_retriever(search_kwargs=retriever_search_kwargs)

## Create the Question-Answer Prompt

In [None]:
from langchain.prompts import ChatPromptTemplate

# Prompt text with two placeholders, {context} and {question}. The model is
# instructed to answer only from the supplied context and to reply
# 'I don't know' when the context is not sufficient.
template = """Answer the question based only on the following context. 
If you cannot answer the question with the context, please respond with 'I don't know':

### CONTEXT
{context}

### QUESTION
Question: {question}
"""

# Build the chat prompt object from the template string
prompt = ChatPromptTemplate.from_template(template)

## Build the RAG Chain

1. Get the question from the user
2. Pass the question to the retriever 
3. Get the context from the retriever 
4. Combine the context and the question to format the prompt
5. Pass the prompt to the LLM to get the answer

In [None]:
from operator import itemgetter

from langchain_openai import ChatOpenAI
from langchain.schema.runnable import RunnablePassthrough

llm = ChatOpenAI(model_name="gpt-4o", temperature=0)

# Stage 1: from the input dict, feed "question" to the retriever to obtain
# "context", and carry "question" along unchanged.
retrieval_stage = {
    "context": itemgetter("question") | retriever,
    "question": itemgetter("question"),
}

# Stage 2: pass everything through, (re)assigning "context" from the previous stage.
carry_context_stage = RunnablePassthrough.assign(context=itemgetter("context"))

# Stage 3: format the prompt from "context" and "question", send it to the LLM
# and store the result under "answer"; "context" is returned alongside it.
generation_stage = {
    "answer": prompt | llm,
    "context": itemgetter("context"),
}

rag_chain = retrieval_stage | carry_context_stage | generation_stage

Ask a question to test the chain:

In [None]:
question = "Find a similar question as this one: 'What is your experience in developing AI-based applications?'"

# The chain expects a dict with a "question" key
chain_input = {"question": question}
response = rag_chain.invoke(chain_input)
print(response)

As defined in the RAG chain above, the response includes two fields: `answer` and `context`:

In [None]:
# List the keys of the response dict (dict values are expanded with extra indentation)
print_dict_keys(response)

Inspecting the answer, we see that the `rag_chain` is functioning correctly and identifies the most similar question in the `vectorstore`:

In [None]:
# Question block followed by a blank line, then the answer block
print("Question:", question, "", sep="\n")
print("Answer:", response["answer"].content, sep="\n")

Next, we inspect the `context` that was retrieved for the `question` and used to produce the `answer`. The context should contain the `k` chunks that are most relevant to the question (recall that we set `k` when creating the `retriever`). These `k` chunks are pasted into the prompt as text, so that the LLM grounds its answer in the content that lies closest to the question in the embedding space.

In [None]:
number_of_chunks = 5

retrieved_chunks = response["context"][:number_of_chunks]
for position, retrieved in enumerate(retrieved_chunks, start=1):  # numbering starts at 1
    print(f"Content for chunk {position}:", retrieved.page_content, "", sep="\n")

We now inspect the `response_metadata` object to understand its contents and identify what could be useful to incorporate in our RAG evaluation dataset:

In [None]:
# Raw metadata attached to the answer message
print(response["answer"].response_metadata)

In [None]:
# Same metadata, reduced to its (nested) key names
print_dict_keys(response["answer"].response_metadata)

Extracting the LLM used:

In [None]:
# The model name is read from the "model_name" entry of the response metadata
print(f"Model: {response['answer'].response_metadata['model_name']}")

As we showed earlier, we can also extract some token usage statistics that can help us understand and optimize our interactions with the language model for cost-effectiveness and efficiency.

- **Prompt tokens**: tokens that form the input text sent to the language model. This includes all the text provided to the LLM to generate a response.
- **Completion tokens**: number of tokens in the generated text or output from the model.
- **Total tokens**: total number of tokens processed by the model. It is the sum of both `prompt_tokens` and `completion_tokens`. 

In [None]:
# Look the usage dict up once, then report each counter
token_usage = response['answer'].response_metadata['token_usage']

for label, usage_key in (
    ("Completion tokens", "completion_tokens"),
    ("Prompt tokens", "prompt_tokens"),
    ("Total tokens", "total_tokens"),
):
    print(f"{label}: {token_usage[usage_key]}")

## Generate LLM Predictions

We will now expand our evaluation dataset to capture some metadata generated by the LLM, which will be used later when validating our RAG pipeline. We will add the following additional columns to our dataframe: `context`, `model`, `completion_tokens`, `prompt_tokens`, and `total_tokens`, together with columns for the embeddings, the similarity scores, and the `response_time`.

In [None]:
# Placeholder columns, initialised to empty strings and filled in row by row
# by the prediction loop.
new_columns = [
    'context',
    # embeddings
    'question_embeddings',
    'answer_embeddings',
    'context_embeddings',
    # similarity scores
    'similarity_score_question_vs_context',
    'similarity_score_question_vs_answer',
    'similarity_score_context_vs_answer',
    # LLM metadata
    'model',
    'completion_tokens',
    'prompt_tokens',
    'total_tokens',
    # timing
    'response_time',
]

for column_name in new_columns:
    rag_evaluation_df[column_name] = ''

We would also like to compute a few similarity metrics between embeddings, such as cosine similarity or Euclidean distance:

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def cosine_similarity_score(embedding1, embedding2):
    """
    Return the cosine similarity of two embedding vectors.

    Parameters:
    - embedding1 (array-like): First embedding.
    - embedding2 (array-like): Second embedding.

    Returns:
    - float: Cosine similarity between the two embeddings.

    Note: Cosine similarity is symmetric, so the argument order is irrelevant.
    """
    # sklearn's cosine_similarity operates on 2D inputs, so present each
    # embedding as a single-row matrix.
    row1 = np.asarray(embedding1).reshape(1, -1)
    row2 = np.asarray(embedding2).reshape(1, -1)

    # The result is a 1x1 matrix; return its only entry.
    pairwise = cosine_similarity(row1, row2)
    return pairwise[0][0]

In [None]:
import numpy as np

def euclidean_distance(embedding1, embedding2):
    """
    Computes the Euclidean distance between two embeddings.

    Parameters:
    - embedding1 (array-like): First embedding vector.
    - embedding2 (array-like): Second embedding vector.

    Returns:
    - float: Euclidean distance between the two embeddings.
    """
    # Cast to float so that integer inputs - in particular unsigned dtypes,
    # whose subtraction wraps around - yield the true difference.
    vector1 = np.asarray(embedding1, dtype=float)
    vector2 = np.asarray(embedding2, dtype=float)

    # Norm of the element-wise difference
    return np.linalg.norm(vector1 - vector2)

In [None]:
import time


# Number of questions to process by the RAG model
number_of_rows_to_process = NUM_OF_NEW_RFP_QUESTIONS

# If every answer is missing, the column is read as float NaN; keep it as
# object dtype so the generated text can be stored in it.
rag_evaluation_df["answer"] = rag_evaluation_df["answer"].astype(object)

for i, (index, row) in enumerate(rag_evaluation_df.iloc[:number_of_rows_to_process].iterrows()):
    print(f"Processing row {i}...")

    # A question still needs an answer when the cell holds the literal string
    # 'None' OR is missing: newer pandas versions treat the text "None" in a
    # CSV as a missing value by default, so it arrives here as NaN.
    existing_answer = row["answer"]
    if not (pd.isna(existing_answer) or existing_answer == "None"):
        continue

    print(f"Answer is 'None' for question ID {index}. Invoking RAG model...")

    question_text = row["question_to_llm"]

    # Time the call with a monotonic clock (unaffected by system clock changes)
    started = time.perf_counter()
    response = rag_chain.invoke({"question": question_text})
    elapsed = time.perf_counter() - started

    # Store the response time in seconds, rounded to one decimal
    rag_evaluation_df.at[index, 'response_time'] = round(elapsed, 1)

    # Store whatever response comes from the LLM
    answer_message = response["answer"]
    answer_text = answer_message.content
    rag_evaluation_df.at[index, "answer"] = answer_text
    print(f"Question ID {index} answer updated with the response from the RAG model.")

    # Store the context included in the prompt (retrieved chunks joined by blank lines)
    context = "\n\n".join(chunk.page_content for chunk in response["context"])
    rag_evaluation_df.at[index, "context"] = context

    # Compute and store embeddings for the question, context and answer
    print("Computing embeddings for the question...")
    question_embeddings = np.array(embeddings_model.embed_query(question_text))
    rag_evaluation_df.at[index, 'question_embeddings'] = question_embeddings

    print("Computing embeddings for the context...")
    context_embeddings = np.array(embeddings_model.embed_query(context))
    rag_evaluation_df.at[index, 'context_embeddings'] = context_embeddings

    print("Computing embeddings for the answer...")
    answer_embeddings = np.array(embeddings_model.embed_query(answer_text))
    rag_evaluation_df.at[index, 'answer_embeddings'] = answer_embeddings

    # Compute similarity measures between embeddings
    print("Computing cosine similarity between question and context...")
    rag_evaluation_df.at[index, 'similarity_score_question_vs_context'] = cosine_similarity_score(question_embeddings, context_embeddings)

    print("Computing cosine similarity between question and answer...")
    rag_evaluation_df.at[index, 'similarity_score_question_vs_answer'] = cosine_similarity_score(question_embeddings, answer_embeddings)

    print("Computing cosine similarity between context and answer...")
    rag_evaluation_df.at[index, 'similarity_score_context_vs_answer'] = cosine_similarity_score(context_embeddings, answer_embeddings)

    # Store some metadata such as model name and token statistics
    response_metadata = answer_message.response_metadata
    token_usage = response_metadata["token_usage"]
    rag_evaluation_df.at[index, "model"] = response_metadata["model_name"]
    rag_evaluation_df.at[index, "completion_tokens"] = token_usage["completion_tokens"]
    rag_evaluation_df.at[index, "prompt_tokens"] = token_usage["prompt_tokens"]
    rag_evaluation_df.at[index, "total_tokens"] = token_usage["total_tokens"]

print("Processing complete.")

First, check if all responses have been generated by the RAG pipeline or if there are any `None` values in the answers column. If there are any rows with `None` answers, remove these before they are passed to the RAGAS metrics.

In [None]:
# Keep only rows that have an answer: drop the literal 'None' placeholder as
# well as missing values (NaN compares unequal to 'None', so it needs its own check).
has_answer = rag_evaluation_df['answer'].notna() & (rag_evaluation_df['answer'] != 'None')
rag_evaluation_df = rag_evaluation_df[has_answer]
rag_evaluation_df

We save the results in a CSV file for convenience to avoid having to execute the entire RAG pipeline every time we want to test our RAG evaluation metrics:

In [None]:
import json

# The embedding cells hold NumPy arrays. Written as-is they would be saved via
# their string form, which NumPy abbreviates with "..." for long arrays, so
# the vectors could not be recovered from the file. Serialise them as JSON
# lists in an export copy instead; the in-memory DataFrame keeps the arrays.
embedding_columns = ['question_embeddings', 'answer_embeddings', 'context_embeddings']

export_df = rag_evaluation_df.copy()
for column in embedding_columns:
    if column in export_df.columns:
        export_df[column] = export_df[column].apply(
            lambda value: json.dumps(value.tolist()) if isinstance(value, np.ndarray) else value
        )

# Save to CSV
export_df.to_csv('datasets/rag/rag_evaluation_dataset_02.csv', index=False)

## RAG Evaluation with RAGAS

RETRIEVAL
- **Context Precision**: how relevant the `context` is to the `question`.
- **Context Recall**: given the `ground truth`, whether the retriever is able to retrieve all the relevant `context`.

GENERATION
- **Answer Relevancy**: how relevant the generated `answer` is to the `question`.
- **Faithfulness**: whether the `answer` can be fact-checked against the context. It is the number of correct statements from the given `contexts` divided by the total number of statements in the generated answer.

We now proceed to evaluate our RAG pipeline using RAGAS metrics from the `ragas` package. The `evaluate()` function expects a Dataset with specific column names: `question`, `contexts`, `ground_truth`, and `answer`. We will now rename these columns to conform to the expected column names in RAGAS.

In [None]:
# Work on an independent copy so the RAG results stay untouched
ragas_results_df = rag_evaluation_df.copy()

# Column names expected by RAGAS
ragas_column_names = {
    "question_to_llm": "question",
    "context": "contexts",
}
ragas_results_df = ragas_results_df.rename(columns=ragas_column_names)

# Turn each row's context string into a one-element list of strings
ragas_results_df['contexts'] = ragas_results_df['contexts'].apply(
    lambda context_text: [context_text]
)

ragas_results_df.info()

In [None]:
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    context_relevancy,
    answer_correctness,
    answer_similarity
)

from ragas import evaluate

def evaluate_ragas_dataset(ragas_dataset):
    """Evaluate ``ragas_dataset`` with the RAGAS metrics imported above.

    Returns whatever ``ragas.evaluate`` returns for the seven metrics listed
    below.
    """
    selected_metrics = [
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
        context_relevancy,
        answer_correctness,
        answer_similarity,
    ]
    return evaluate(ragas_dataset, metrics=selected_metrics)

Now, we apply the RAGAS evaluation metrics row by row, adding the results to corresponding columns for each metric in our evaluation dataset. We first initialize the columns where the evaluation metrics will be stored:

In [None]:
# One empty placeholder column per RAGAS metric
metric_columns = (
    'context_precision',
    'faithfulness',
    'answer_relevancy',
    'context_recall',
    'context_relevancy',
    'answer_correctness',
    'answer_similarity',
)

for metric_column in metric_columns:
    ragas_results_df[metric_column] = ''


In [None]:
from datasets import Dataset

# Columns passed to RAGAS for every evaluated row
required_fields = ["question", "answer", "contexts", "ground_truth"]

# Metric name -> metric object (the objects come from the ragas.metrics import)
metrics_functions = {
    "context_precision": context_precision,
    "faithfulness": faithfulness,
    "answer_relevancy": answer_relevancy,
    "context_recall": context_recall,
    "context_relevancy": context_relevancy,
    "answer_correctness": answer_correctness,
    "answer_similarity": answer_similarity,
}

# Metric names, in the same order as the mapping above
metrics = list(metrics_functions)

# Never process more rows than exist, nor more than NUM_OF_NEW_RFP_QUESTIONS
number_of_rows_to_process = min(len(ragas_results_df), NUM_OF_NEW_RFP_QUESTIONS)

In [None]:
ragas_results_df

In [None]:

# Evaluate each row on its own with the configured RAGAS metrics and write the
# scores back into ragas_results_df.
#
# Rows are read from ragas_results_df by position and written back by index
# *label*: unanswered rows were filtered out earlier without resetting the
# index, so positions and labels need not coincide. Writing with a positional
# counter as the label could target the wrong row or create a new one.
selected_metrics = [metrics_functions[metric] for metric in metrics if metric in metrics_functions]
row_labels = list(ragas_results_df.index[:number_of_rows_to_process])

for i, index in enumerate(row_labels):
    row = ragas_results_df.iloc[i]
    print(f"Processing RFP question {i+1}...")
    print(f"Question: {row['question']}")
    print(f"Answer: {row['answer']}")

    # Create a temporary single-row Dataset for the current row
    ragas_dataset = Dataset.from_pandas(ragas_results_df.iloc[i: i + 1][required_fields])

    # Evaluate using RAGAS metrics
    evaluation_result = evaluate(ragas_dataset, selected_metrics)
    print("Evaluation completed.")

    # Store evaluation results back into the DataFrame, on the row's own label
    for metric in metrics:
        if metric in evaluation_result:
            ragas_results_df.at[index, metric] = evaluation_result[metric]
            print(f"{metric}: {evaluation_result[metric]}")

print("All RFP questions processed.")


In [None]:
ragas_results_df.info()

In [None]:
ragas_results_df

In [None]:
import json

# The embedding cells hold NumPy arrays. Written as-is they would be saved via
# their string form, which NumPy abbreviates with "..." for long arrays, so
# the vectors could not be recovered from the file. Serialise them as JSON
# lists in an export copy instead; the in-memory DataFrame keeps the arrays.
embedding_columns = ['question_embeddings', 'answer_embeddings', 'context_embeddings']

export_df = ragas_results_df.copy()
for column in embedding_columns:
    if column in export_df.columns:
        export_df[column] = export_df[column].apply(
            lambda value: json.dumps(value.tolist()) if isinstance(value, np.ndarray) else value
        )

# Save to CSV
export_df.to_csv('datasets/rag/rag_evaluation_results.csv', index=False)