# RAG for Question Similarity in RFPs

## Notebook setup

In [None]:
import pandas as pd

In [None]:
%pip install -qU langchain langchain-openai langchain-cohere

In [None]:
%pip install -qU qdrant-client lark

In [None]:
import os

import dotenv

# Load settings through python-dotenv before inspecting the environment.
dotenv.load_dotenv()

# Fail fast when the OpenAI key is not available to the process.
if "OPENAI_API_KEY" not in os.environ:
    raise Exception("OPENAI_API_KEY not found")

In [None]:
import textwrap
from IPython.display import HTML, display
from tabulate import tabulate


def _format_cell_text(text, width=50):
    """Wrap a cell's text so that no line is longer than ``width`` characters.

    Existing line breaks are preserved: the text is split on newlines and each
    line is wrapped independently with ``textwrap.fill``.

    Non-string values (for example ``None`` or ``NaN`` found in an object
    column) are returned unchanged, because they have no text to wrap and
    calling ``.split`` on them would raise.
    """
    if not isinstance(text, str):
        return text
    return "\n".join(textwrap.fill(line, width=width) for line in text.split("\n"))


def _format_dataframe_for_tabulate(df):
    """Return a copy of ``df`` whose object-dtype columns have wrapped text.

    The input DataFrame is not modified. Columns with any other dtype are
    copied through as they are.
    """
    wrapped = df.copy()

    for name in wrapped.columns:
        series = wrapped[name]
        # Only object columns are treated as text; everything else is skipped
        if series.dtype != object:
            continue
        wrapped[name] = series.apply(_format_cell_text)

    return wrapped


def _dataframe_to_html_table(df):
    """Pass the DataFrame's rows and column names to ``tabulate`` with the "html" format."""
    return tabulate(
        df.values.tolist(),
        headers=df.columns.tolist(),
        tablefmt="html",
    )


def display_nice(df, num_rows=None):
    """Show a DataFrame in the notebook as an HTML table with wrapped text.

    When ``num_rows`` is given, only the first ``num_rows`` rows are shown;
    otherwise the whole DataFrame is rendered.
    """
    shown = df if num_rows is None else df.head(num_rows)
    wrapped = _format_dataframe_for_tabulate(shown)
    display(HTML(_dataframe_to_html_table(wrapped)))

## Data preparation

### Load existing RFPs

In [None]:
# CSV files containing the existing RFP questions
existing_rfp_paths = [
    "datasets/rag/rfp_existing_questions_client_2.csv",
]

# Read every file and stack the rows into one DataFrame with a fresh index
existing_rfp_df = pd.concat(
    [pd.read_csv(csv_path) for csv_path in existing_rfp_paths],
    ignore_index=True,
)

In [None]:
# Bare expression: the notebook renders the combined DataFrame as the cell output
existing_rfp_df

In [None]:
from langchain_community.document_loaders.csv_loader import CSVLoader

# Documents loaded from all CSV files, accumulated in file order
documents = []

for file_path in existing_rfp_paths:
    # The "Area" column is attached to each document as metadata
    loader = CSVLoader(file_path=file_path, metadata_columns=["Area"])
    documents += loader.load()

When using `CSVLoader`, each document represents a single row and includes its respective contents:

In [None]:
number_of_documents = 5

# Print the first few loaded documents, numbered from 1
first_documents = documents[:number_of_documents]
for i, document in enumerate(first_documents):
    print(f"Document {i + 1}: {document}")

Accessing the page content of each document:

In [None]:
number_of_documents = 2

# For each previewed document: a header line, its page content, then a blank line
for i, document in enumerate(documents[:number_of_documents]):
    print(f"Page content for document {i + 1}:", document.page_content, "", sep="\n")

Note that when adding metadata, it is appended to the default metadata, which consists of the row number and the source: 

In [None]:
number_of_documents = 5

# Print the metadata dict of the first few documents, numbered from 1
first_documents = documents[:number_of_documents]
for i, document in enumerate(first_documents):
    print(f"Metadata for document {i + 1}: {document.metadata}")

## Split the documents into chunks

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter settings: chunk size 500, overlap 10, and start indexes requested
splitter_options = {
    "chunk_size": 500,
    "chunk_overlap": 10,
    "add_start_index": True,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Split every loaded document into chunks
chunks = text_splitter.split_documents(documents)

Get some general information about the chunks:

In [None]:
# Report how many chunks the splitter produced
print(f"Number of chunks: {len(chunks)}")

See the lengths of the largest and smallest chunks, and the mean chunk length:

In [None]:
# Measure each chunk's page content once and derive all three statistics from it
lengths = [len(chunk.page_content) for chunk in chunks]

max_chunk_length = max(lengths)
min_chunk_length = min(lengths)
mean_chunk_length = sum(lengths) / len(chunks)

print(f"Maximum chunk length: {max_chunk_length}")
print(f"Minimum chunk length: {min_chunk_length}")
print(f"Mean chunk length: {mean_chunk_length}")

Plot the distribution of chunk lengths:

In [None]:
import plotly.express as px

# Calculate lengths of each chunk's page_content
chunk_lengths = [len(chunk.page_content) for chunk in chunks]

# Histogram of chunk lengths
fig = px.histogram(chunk_lengths, nbins=50, title="Distribution of Chunk Lengths")
fig.update_layout(
    xaxis_title="Chunk Length",
    yaxis_title="Count",
    bargap=0.2,
    showlegend=False,
)

# Add summary statistics as text on the plot.
# The annotation needs an explicit `text`; without it no statistics are shown.
longest = max(chunk_lengths)
summary_text = (
    f"min: {min(chunk_lengths)} | max: {longest} | "
    f"mean: {sum(chunk_lengths) / len(chunk_lengths):.1f}"
)
fig.add_annotation(
    x=longest,
    y=0,
    text=summary_text,
    showarrow=False,
    xanchor="right",  # keep the label inside the plot, ending at the largest length
    yshift=10,
)

# Show the plot
fig.show()

Inspect the chunks: 

In [None]:
number_of_chunks = 5

# Slice with number_of_chunks so the preview size does not depend on a loop
# variable left over from another cell.
for index, chunk in enumerate(chunks[:number_of_chunks]):
    print(f"Chunk {index + 1}: {chunk}")

See the page content of each chunk:

In [None]:
number_of_chunks = 5

# For each previewed chunk: a header line, its page content, then a blank line
for i, document in enumerate(chunks[:number_of_chunks]):
    print(f"Page content for chunk {i + 1}:", document.page_content, "", sep="\n")

See the metadata for individual chunks:

In [None]:
number_of_chunks = 5

# Print the metadata dict of the first few chunks, numbered from 1
first_chunks = chunks[:number_of_chunks]
for i, chunk in enumerate(first_chunks):
    print(f"Metadata for chunk {i + 1}: {chunk.metadata}")



Access the source of each chunk:

In [None]:
number_of_chunks = 5

# Print the "source" metadata entry of the first few chunks, numbered from 1
first_chunks = chunks[:number_of_chunks]
for i, chunk in enumerate(first_chunks):
    print(f"Source for chunk {i + 1}: {chunk.metadata['source']}")

## Store chunks into a vectorstore

In [None]:
# Chroma is imported from langchain_community, the same package the CSV loader
# in this notebook comes from, rather than the legacy `langchain.vectorstores` path.
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Embedding model used to vectorise the chunks
embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small")

# Build the vector store from all chunks
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings_model,
)

## Create evaluation dataset

In [None]:
# Load the RAG evaluation dataset and preview its first two rows
evaluation_csv_path = "datasets/rag/rag_evaluation_dataset_v1.csv"
rag_evaluation_df = pd.read_csv(evaluation_csv_path)

display_nice(rag_evaluation_df, num_rows=2)


In [None]:
from langchain_openai import ChatOpenAI

# Retriever over the vector store, configured with k=20 results per query
retriever = vectorstore.as_retriever(search_kwargs=dict(k=20))

# Chat model used for answering, with temperature 0
llm = ChatOpenAI(model="gpt-4-turbo", temperature=0.0)

In [None]:
from langchain.prompts import ChatPromptTemplate

# Prompt for the RAG chain. The model is told to answer from the supplied
# context only and to reply 'I don't know' otherwise. The {context} and
# {question} placeholders are filled in when the prompt is formatted.
template = """Answer the question based only on the following context. 
If you cannot answer the question with the context, please respond with 'I don't know':

### CONTEXT
{context}

### QUESTION
Question: {question}
"""

# Chat prompt template built from the text above
prompt = ChatPromptTemplate.from_template(template)

In [None]:
from operator import itemgetter

from langchain_openai import ChatOpenAI
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough

# Chat model used to generate the answer
llm = ChatOpenAI(model_name="gpt-4-turbo", temperature=0)

# The chain takes a dict with a "question" key and returns a dict with
# "response" (the model output) and "context" (the retrieved documents).

# Stage 1: keep "question" and fill "context" by piping the question into the retriever
retrieval_stage = {
    "context": itemgetter("question") | retriever,
    "question": itemgetter("question"),
}

# Stage 2: re-assign "context" to its own value; the payload is passed on unchanged
passthrough_stage = RunnablePassthrough.assign(context=itemgetter("context"))

# Stage 3: format the prompt from "context" and "question" and send it to the LLM
# ("response"), while forwarding the retrieved "context" alongside it
answer_stage = {
    "response": prompt | llm,
    "context": itemgetter("context"),
}

rag_chain = retrieval_stage | passthrough_stage | answer_stage

Ask a question to test the chain:

In [None]:
question = "Find a similar question as this one: 'What is your experience in developing AI-based applications?'"

# rag_chain reads its input with itemgetter("question"), so it needs a dict
response = rag_chain.invoke({"question": question})

# The chain returns the model's message under "response"; .content is its text
print(response["response"].content)

In [None]:
# rag_chain returns the retrieved documents under the "context" key
print(response["context"])

In [None]:
#if 'source_documents' not in rag_evaluation_df.columns:
#    rag_evaluation_df['source_documents'] = pd.Series([[] for _ in range(len(rag_evaluation_df))], index=rag_evaluation_df.index)


# Fill in missing answers by running the RAG chain on each unanswered question.
# NOTE(review): the check below only matches the literal string "None". If the
# CSV loader parsed such cells as NaN they will be skipped; confirm against the data.
for i, row in rag_evaluation_df.iterrows():
    print(f"Processing row {i}...")

    # Check if the 'answer' field is 'None' (as a string) for the current row
    if row["answer"] == "None":
        print(f"Answer is 'None' for question ID {i}. Invoking RAG model...")

        # rag_chain reads its input with itemgetter("question"), so pass a dict
        response = rag_chain.invoke({"question": row["question_to_llm"]})

        # The chain returns the model's message under "response"; store its text
        rag_evaluation_df.at[i, "answer"] = response["response"].content
        print(f"Question ID {i} answer updated with the response from the RAG model.")

        # Hash each retrieved document (the chain returns them under "context").
        # stable_hash_meta takes the document itself and reads its .metadata.
        # NOTE: stable_hash_meta is defined in a later cell; run that cell first.
        source_hashes = [stable_hash_meta(source_document) for source_document in response["context"]]

        # Check if source_hashes is a list before assignment, throw error if not
        #if not isinstance(source_hashes, list):
        #    print(f"Expected a list for source_hashes but got {type(source_hashes)} instead.")
        #    raise TypeError(f"source_hashes must be a list, but was {type(source_hashes)}")

        # Assign the list of source document hashes directly to the DataFrame cell
        #rag_evaluation_df.at[i, "source_documents"] = source_hashes
        #print(f"Question ID {i} source documents updated with hashed metadata.")

print("Processing complete.")


In [None]:
# Preview the first five rows of the evaluation DataFrame
display_nice(rag_evaluation_df, num_rows=5)

In [None]:
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    context_relevancy,
    answer_correctness,
    answer_similarity
)

from ragas.metrics.critique import harmfulness
from ragas import evaluate

def create_ragas_dataset(rag_pipeline, eval_dataset):
    """Run the pipeline on every evaluation row and collect the results as a dataset.

    Each row of ``eval_dataset`` must provide "question" and "ground_truth".
    The pipeline is invoked with ``{"question": ...}`` and must return a
    mapping with "response" (an object with ``.content``) and "context"
    (objects with ``.page_content``).

    Returns the result of ``Dataset.from_pandas`` over one record per row with
    the keys "question", "answer", "contexts" and "ground_truths".

    NOTE(review): ``tqdm`` and ``Dataset`` are not imported in the cells shown
    here; they must be in scope before this function is called.
    """
    records = []
    for example in tqdm(eval_dataset):
        output = rag_pipeline.invoke({"question": example["question"]})
        records.append(
            {
                "question": example["question"],
                "answer": output["response"].content,
                "contexts": [doc.page_content for doc in output["context"]],
                "ground_truths": [example["ground_truth"]],
            }
        )

    return Dataset.from_pandas(pd.DataFrame(records))

def evaluate_ragas_dataset(ragas_dataset):
    """Evaluate ``ragas_dataset`` with a fixed list of ragas metrics.

    Returns whatever ``ragas.evaluate`` returns for the dataset and metrics.
    """
    selected_metrics = [
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
        context_relevancy,
        answer_correctness,
        answer_similarity,
    ]
    return evaluate(ragas_dataset, metrics=selected_metrics)

In [None]:
# NOTE(review): `basic_qa_ragas_dataset` is not defined in any cell shown here;
# presumably it should be built with create_ragas_dataset(...) first. Confirm before running.
basic_qa_result = evaluate_ragas_dataset(basic_qa_ragas_dataset)

In [None]:
# Second retriever over the same vector store, configured with k=10 results per query
naive_retriever = vectorstore.as_retriever(search_kwargs={"k" : 10})

In [None]:
from langchain_core.prompts import ChatPromptTemplate

# Alternative prompt text: the model is asked to answer from the supplied
# context and to say it does not know when unsure. The {question} and
# {context} placeholders are filled in when the prompt is formatted.
RAG_TEMPLATE = """\
You are a helpful and kind assistant. Use the context provided below to answer the question.

If you do not know the answer, or are unsure, say you don't know.

Query:
{question}

Context:
{context}
"""

# Chat prompt template built from RAG_TEMPLATE
rag_prompt = ChatPromptTemplate.from_template(RAG_TEMPLATE)

In [None]:
# Alternative chain that takes the question as a plain string and returns
# "source_documents", "question" and "answer". Disabled by default; enabling it
# rebinds `rag_chain` to a chain with a different input and output shape.
run = False
if run:

    from typing import List

    # Imported here because format_docs' annotation needs Document when it is defined
    from langchain_core.documents import Document
    from langchain_core.runnables import RunnableParallel, RunnablePassthrough
    from langchain_core.output_parsers import StrOutputParser

    def format_docs(docs: List[Document]) -> str:
        """Format documents into one string.

        Each document contributes a "Content: ...\\nSource: ..." entry and the
        entries are separated by a blank line.
        """
        return "\n\n".join(
            f"Content: {doc.page_content}\nSource: {doc.metadata['source']}" for doc in docs
        )

    # Answering sub-chain. The formatted documents are stored under "context"
    # because `prompt` has the placeholders {context} and {question}; the raw
    # "source_documents" entry is left in place.
    rag_chain_from_docs = (
        RunnablePassthrough.assign(
            context=(lambda x: format_docs(x["source_documents"]))
        )
        | prompt  # Fills the template with the context and question.
        | llm     # Generates the answer.
        | StrOutputParser()  # Converts the model output to a plain string.
    )

    # The input is sent both to the retriever ("source_documents") and through
    # unchanged ("question"); "answer" is then added by the sub-chain above.
    rag_chain = RunnableParallel(
        {
            "source_documents": retriever,
            "question": RunnablePassthrough(),  # Passes the question through without modification.
        }
    ).assign(answer=rag_chain_from_docs)

In [None]:
import hashlib
import json
from langchain_core.documents import Document

def stable_hash_meta(doc: Document) -> str:
    """Return a stable SHA-1 hex digest of a document's metadata.

    The metadata is serialised to JSON with sorted keys, so two documents whose
    metadata has the same keys and values hash identically regardless of key
    order.

    Raises:
        ValueError: if ``doc`` has no ``metadata`` attribute. The original
            ``AttributeError`` is attached as the cause.
    """
    # Keep the try body minimal: only the attribute access is expected to fail
    try:
        metadata = doc.metadata
    except AttributeError as err:
        raise ValueError("Document does not have metadata.") from err

    metadata_json = json.dumps(metadata, sort_keys=True)
    return hashlib.sha1(metadata_json.encode()).hexdigest()

In [None]:
#splits_ids = [{"doc": split, "id": stable_hash_meta(split)} for split in splits]

#existing_ids = vectorstore.get()["ids"]

#new_splits_ids = [split for split in splits_ids if split["id"] not in existing_ids]