# RAG for RFP Q&A Demo

## Notebook setup

In [1]:
import pandas as pd

In [2]:
%pip install -qU langchain langchain-openai langchain-cohere


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
%pip install -qU qdrant-client lark


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [4]:
import os

import dotenv

# Load variables from a local .env file (if present) into the environment.
dotenv.load_dotenv()

# Fail fast when the key is missing *or* blank: an empty `OPENAI_API_KEY=`
# entry would otherwise pass an `is None` check and only fail later, on the
# first API call.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError("OPENAI_API_KEY not found")

In [5]:
import textwrap
from IPython.display import HTML, display
from tabulate import tabulate


def _format_cell_text(text, width=50):
    """Private function to format a cell's text."""
    return "\n".join([textwrap.fill(line, width=width) for line in text.split("\n")])


def _format_dataframe_for_tabulate(df):
    """Private function to format the entire DataFrame for tabulation."""
    df_out = df.copy()

    # Format all string columns
    for column in df_out.columns:
        # Check if column is of type object (likely strings)
        if df_out[column].dtype == object:
            df_out[column] = df_out[column].apply(_format_cell_text)
    return df_out


def _dataframe_to_html_table(df):
    """Render ``df`` as an HTML table string via ``tabulate``.

    Column labels become the header row and the frame's values become the body.
    """
    return tabulate(
        df.values.tolist(),
        headers=df.columns.tolist(),
        tablefmt="html",
    )


def display_nice(df, num_rows=None):
    """Display ``df`` as a text-wrapped HTML table in the notebook.

    Args:
        df: The DataFrame to show.
        num_rows: When given, only the first ``num_rows`` rows are shown;
            ``None`` shows the whole frame.
    """
    subset = df if num_rows is None else df.head(num_rows)
    wrapped = _format_dataframe_for_tabulate(subset)
    display(HTML(_dataframe_to_html_table(wrapped)))

## Data preparation

### Load existing RFPs

In [6]:
# CSV files holding previously answered RFP questions
existing_rfp_paths = [
    "datasets/rag/rfp_existing_questions_client_2.csv",
]

# Read every file and stack the results into one frame with a fresh 0..n-1 index
existing_rfp_df = pd.concat(
    (pd.read_csv(csv_path) for csv_path in existing_rfp_paths),
    ignore_index=True,
)

In [7]:
from langchain_community.document_loaders.csv_loader import CSVLoader

documents = []

# Load each CSV; the "Area" column is kept as document metadata instead of
# being written into the page content.
for file_path in existing_rfp_paths:
    loader = CSVLoader(file_path=file_path, metadata_columns=["Area"])
    documents.extend(loader.load())

See the documents loaded: 

In [8]:
# Preview the first five loaded documents (page content + metadata)
documents[:5]

[Document(page_content='Project_Title: AI-Powered Risk Assessment Model Development for Loan Processing\nRFP_Question_ID: 1\nRFP_Question: Can you discuss your expertise in creating AI-driven applications and share examples of your successful implementations?\nRFP_Answer: Our company has 15 years of experience in developing AI-based applications, with a strong portfolio in sectors such as healthcare, finance, and education. For instance, our project MediAI Insight for the healthcare industry demonstrated significant achievements in patient data analysis, resulting in a 30% reduction in diagnostic errors and a 40% improvement in treatment personalization. Our platform has engaged over 200 healthcare facilities, achieving a user satisfaction rate of 95%.\nLast_Accessed_At: 18/12/2022\nRequester: Bank B\nStatus: Awarded', metadata={'source': 'datasets/rag/rfp_existing_questions_client_2.csv', 'row': 0, 'Area': 'General'}),
 Document(page_content='Project_Title: AI-Powered Risk Assessment 

Access the metadata:

In [9]:
# Metadata of the first document: source file, row number and the "Area" column
documents[0].metadata

{'source': 'datasets/rag/rfp_existing_questions_client_2.csv',
 'row': 0,
 'Area': 'General'}

## Split the documents into chunks

In [10]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Split every loaded document into chunks (chunk_size=500, chunk_overlap=10).
# add_start_index=True stores each chunk's offset within its parent document
# under the "start_index" metadata key, which later makes chunk metadata unique.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500, chunk_overlap=10, add_start_index=True
)
splits = text_splitter.split_documents(documents)

In [23]:
# Preview the first ten chunks produced by the splitter
splits[:10]

[Document(page_content='Project_Title: AI-Powered Risk Assessment Model Development for Loan Processing\nRFP_Question_ID: 1\nRFP_Question: Can you discuss your expertise in creating AI-driven applications and share examples of your successful implementations?', metadata={'source': 'datasets/rag/rfp_existing_questions_client_2.csv', 'row': 0, 'Area': 'General', 'start_index': 0}),
 Document(page_content='RFP_Answer: Our company has 15 years of experience in developing AI-based applications, with a strong portfolio in sectors such as healthcare, finance, and education. For instance, our project MediAI Insight for the healthcare industry demonstrated significant achievements in patient data analysis, resulting in a 30% reduction in diagnostic errors and a 40% improvement in treatment personalization. Our platform has engaged over 200 healthcare facilities, achieving a user satisfaction rate of', metadata={'source': 'datasets/rag/rfp_existing_questions_client_2.csv', 'row': 0, 'Area': 'Gen

## Store chunks into a vectorstore

In [12]:
from langchain.vectorstores.chroma import Chroma
from langchain_openai import OpenAIEmbeddings

# Embedding model used by the vector store
embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small")

# Chroma collection "docs_store", configured with the local "docs-db" directory.
# NOTE(review): this only opens/creates the collection; no documents are added here.
vectorstore = Chroma(
    collection_name="docs_store",
    embedding_function=embeddings_model,
    persist_directory="docs-db",
)

## Create new chunk IDs based on chunk metadata

Expand the existing index to include the metadata fields. To do this, we will define a hash function that creates a unique identifier for each split from its metadata (source, row, area and start index).

In [27]:
# Show the metadata of the first five chunks; "start_index" distinguishes
# chunks that come from the same CSV row.
for split in splits[:5]:  
    print(split.metadata)

{'source': 'datasets/rag/rfp_existing_questions_client_2.csv', 'row': 0, 'Area': 'General', 'start_index': 0}
{'source': 'datasets/rag/rfp_existing_questions_client_2.csv', 'row': 0, 'Area': 'General', 'start_index': 234}
{'source': 'datasets/rag/rfp_existing_questions_client_2.csv', 'row': 0, 'Area': 'General', 'start_index': 723}
{'source': 'datasets/rag/rfp_existing_questions_client_2.csv', 'row': 0, 'Area': 'General', 'start_index': 736}
{'source': 'datasets/rag/rfp_existing_questions_client_2.csv', 'row': 1, 'Area': 'General', 'start_index': 0}


Extract the source, for example:

In [26]:
# Print only the "source" metadata field (the originating CSV path) of the first five chunks
for split in splits[:5]:  
    print(split.metadata["source"])

datasets/rag/rfp_existing_questions_client_2.csv
datasets/rag/rfp_existing_questions_client_2.csv
datasets/rag/rfp_existing_questions_client_2.csv
datasets/rag/rfp_existing_questions_client_2.csv
datasets/rag/rfp_existing_questions_client_2.csv


In [14]:
import hashlib
import json
from langchain_core.documents import Document

def stable_hash_meta(doc: Document) -> str:
    """
    Return a stable SHA-1 hex digest of a document's metadata.

    The metadata is serialised to JSON with sorted keys, so the digest does not
    depend on the order in which the keys were inserted.

    Args:
        doc: A ``Document`` (any object exposing a ``metadata`` attribute) or
            the metadata ``dict`` itself. Both forms yield the same digest for
            the same metadata.

    Returns:
        The hexadecimal SHA-1 digest of the serialised metadata.

    Raises:
        ValueError: If ``doc`` is neither a dict nor an object with a
            ``metadata`` attribute.
        TypeError: If the metadata holds values ``json.dumps`` cannot encode.
    """
    if isinstance(doc, dict):
        # Callers sometimes pass ``document.metadata`` directly.
        metadata = doc
    else:
        try:
            metadata = doc.metadata
        except AttributeError as exc:
            raise ValueError("Document does not have metadata.") from exc
    metadata_json = json.dumps(metadata, sort_keys=True)
    return hashlib.sha1(metadata_json.encode()).hexdigest()

In [15]:
# Pair every chunk with a deterministic ID derived from its metadata, so that
# re-running the notebook produces the same IDs for the same chunks.
splits_ids = [{"doc": split, "id": stable_hash_meta(split)} for split in splits]

# IDs already stored in the collection (e.g. from a previous run).
existing_ids = vectorstore.get()["ids"]
known_ids = set(existing_ids)  # set membership instead of scanning a list per chunk

new_splits_ids = [split for split in splits_ids if split["id"] not in known_ids]

# Embed and store only the chunks that are not in the collection yet. Without
# this step the collection stays empty on a fresh store and the retriever has
# nothing to return.
if new_splits_ids:
    vectorstore.add_documents(
        documents=[entry["doc"] for entry in new_splits_ids],
        ids=[entry["id"] for entry in new_splits_ids],
    )


## Create evaluation dataset

In [16]:
# Load the RAG evaluation dataset (new RFP questions with their ground-truth matches)

rag_evaluation_df = pd.read_csv("datasets/rag/rag_evaluation_dataset_v1.csv")

# Preview the first five rows as a text-wrapped HTML table
display_nice(rag_evaluation_df, num_rows=5)


id,new_rfp,new_question,question_to_llm,answer,ground_truth,existing_rfp,llm
1,rfp_new_questions_client_100.csv,"What is your experience in developing AI-based applications, and can you provide examples of successful projects?","What is the most similar question to: ""What is your experience in developing AI-based applications, and can you provide examples of successful projects?""",,Can you discuss your expertise in creating AI- driven applications and share examples of your successful implementations?,rfp_exisiting_questions_client_2.csv,gpt-4-turbo
2,rfp_new_questions_client_100.csv,How do you ensure your AI-based apps remain up-to- date with the latest AI advancements and technologies?,"What is the most similar question to: ""How do you ensure your AI-based apps remain up-to-date with the latest AI advancements and technologies?""",,How do you keep your AI applications current with ongoing advancements in artificial intelligence?,rfp_exisiting_questions_client_2.csv,gpt-4-turbo
3,rfp_new_questions_client_100.csv,Can your AI-based applications be customized to meet specific user or business needs?,"What is the most similar question to: ""Can your AI-based applications be customized to meet specific user or business needs?""",,Are your AI applications adaptable to specific requirements of users or businesses?,rfp_exisiting_questions_client_2.csv,gpt-4-turbo
4,rfp_new_questions_client_100.csv,What measures do you take to ensure user privacy and data security in your AI-based apps?,"What is the most similar question to: ""What measures do you take to ensure user privacy and data security in your AI-based apps?""",,What steps do you undertake to protect user privacy and secure data within your AI applications?,rfp_exisiting_questions_client_2.csv,gpt-4-turbo
5,rfp_new_questions_client_100.csv,How do you approach user interface and experience design in AI-based apps to ensure ease of use and engagement?,"What is the most similar question to: ""How do you approach user interface and experience design in AI-based apps to ensure ease of use and engagement?""",,What strategies do you employ to design user interfaces and experiences in AI applications to maximize usability and user engagement?,rfp_exisiting_questions_client_2.csv,gpt-4-turbo


In [17]:
from langchain_openai import ChatOpenAI

# Chat model used to generate answers; temperature 0.0 is requested
llm = ChatOpenAI(model="gpt-4-turbo", temperature=0.0)
# Retriever over the vector store, configured with k=20 results per query
retriever = vectorstore.as_retriever(search_kwargs={"k": 20})

In [25]:
from langchain.prompts import ChatPromptTemplate

# Prompt with two placeholders: {context} (retrieved text) and {question}.
# Any chain feeding this prompt must supply both keys.
template = """Answer the question based only on the following context. 
If you cannot answer the question with the context, please respond with 'I don't know':

### CONTEXT
{context}

### QUESTION
{question}
"""

prompt = ChatPromptTemplate.from_template(template)

In [19]:
from typing import List

from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser


def format_docs(docs: List[Document]) -> str:
    """Render documents as one text block for use as prompt context.

    Each document becomes a ``Content: ...`` line followed by a
    ``Source: ...`` line (taken from ``metadata['source']``); documents are
    separated by a blank line. An empty list yields an empty string.
    """
    rendered = []
    for document in docs:
        origin = document.metadata["source"]
        rendered.append(f"Content: {document.page_content}\nSource: {origin}")
    return "\n\n".join(rendered)


# Answer-generation sub-chain. It receives {"source_documents": [...], "question": ...},
# formats the retrieved documents into the `context` variable that the prompt
# template expects, then runs prompt -> LLM -> plain string.
# (Assigning the formatted text to `source_documents` instead would leave the
# prompt's {context} placeholder unfilled.)
rag_chain_from_docs = (
    RunnablePassthrough.assign(
        context=(lambda x: format_docs(x["source_documents"]))
    )
    | prompt
    | llm
    | StrOutputParser()
)

# Top-level chain: retrieve documents for the question, pass the question
# through unchanged, then add the generated text under "answer". The result is
# a dict with "source_documents" (the retrieved Documents), "question" and "answer".
rag_chain = RunnableParallel(
    {
        "source_documents": retriever,
        "question": RunnablePassthrough(),
    }
).assign(answer=rag_chain_from_docs)

In [None]:
# Fill in every unanswered row of the evaluation set with the RAG chain's answer.

# Hold answers as Python objects: when every cell is empty pandas parses the
# column as float NaN, and writing strings into a float column is deprecated.
rag_evaluation_df["answer"] = rag_evaluation_df["answer"].astype(object)

# Iterate over the DataFrame rows
for i, row in rag_evaluation_df.iterrows():
    print(f"Processing row {i}...")

    # A row is unanswered when the cell is missing (an empty CSV cell is read
    # as NaN) or holds the literal string "None".
    if pd.isna(row["answer"]) or row["answer"] == "None":
        print(f"Answer is 'None' for question ID {i}. Invoking RAG model...")

        # Invoke the RAG model with the question from the current row
        response = rag_chain.invoke(row["question_to_llm"])

        # Store whatever response comes from the LLM
        rag_evaluation_df.at[i, "answer"] = response["answer"]
        print(f"Question ID {i} answer updated with the response from the RAG model.")

        # Compute the hashed metadata for each source document. The Document
        # itself is passed: stable_hash_meta reads `.metadata` from it.
        source_hashes = [
            stable_hash_meta(source_document)
            for source_document in response["source_documents"]
        ]

        # TODO: persist `source_hashes` in a "source_documents" column. That
        # column must be created with object dtype first, and the display
        # helpers must tolerate list-valued cells.

print("Processing complete.")


In [None]:
# Show the first five rows again, now including the generated answers
display_nice(rag_evaluation_df, num_rows=5)

In [None]:
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    context_relevancy,
    answer_correctness,
    answer_similarity
)

from ragas.metrics.critique import harmfulness
from ragas import evaluate

def create_ragas_dataset(rag_pipeline, eval_dataset):
    """Run ``rag_pipeline`` over ``eval_dataset`` and collect a ragas-style dataset.

    For every row the pipeline is invoked with ``{"question": row["question"]}``
    and one record is produced with the keys ``question``, ``answer``,
    ``contexts`` and ``ground_truths``.

    Args:
        rag_pipeline: Object with an ``invoke`` method whose result exposes
            ``["response"].content`` and an iterable ``["context"]`` of items
            with ``page_content``.
        eval_dataset: Iterable of rows indexable by ``"question"`` and
            ``"ground_truth"``.

    Returns:
        The result of ``Dataset.from_pandas`` on the collected records.

    NOTE(review): ``tqdm`` and ``Dataset`` are not imported anywhere in the
    visible notebook - confirm they are in scope before calling this.
    NOTE(review): the ``rag_chain`` built earlier in this notebook returns the
    keys "answer" and "source_documents", not "response" and "context".
    """

    def _to_record(row):
        # One pipeline call per row; the reply is flattened into plain values.
        reply = rag_pipeline.invoke({"question": row["question"]})
        return {
            "question": row["question"],
            "answer": reply["response"].content,
            "contexts": [chunk.page_content for chunk in reply["context"]],
            "ground_truths": [row["ground_truth"]],
        }

    records = [_to_record(row) for row in tqdm(eval_dataset)]
    return Dataset.from_pandas(pd.DataFrame(records))

def evaluate_ragas_dataset(ragas_dataset):
    """Score ``ragas_dataset`` with ragas' ``evaluate`` and return its result.

    Seven metrics are requested: context precision, faithfulness, answer
    relevancy, context recall, context relevancy, answer correctness and
    answer similarity.
    """
    selected_metrics = [
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
        context_relevancy,
        answer_correctness,
        answer_similarity,
    ]
    return evaluate(ragas_dataset, metrics=selected_metrics)

In [None]:
# NOTE(review): `basic_qa_ragas_dataset` is not defined anywhere in the visible
# notebook - presumably it should come from create_ragas_dataset(...); define
# it before running this cell.
basic_qa_result = evaluate_ragas_dataset(basic_qa_ragas_dataset)

In [None]:
# Second retriever over the same vector store, configured with k=10
# (the earlier `retriever` uses k=20)
naive_retriever = vectorstore.as_retriever(search_kwargs={"k" : 10})

In [None]:
from langchain_core.prompts import ChatPromptTemplate

# Alternative prompt: the question comes first, then the retrieved context.
# Placeholders: {question} and {context}. The leading backslash keeps the
# rendered prompt from starting with an empty line.
RAG_TEMPLATE = """\
You are a helpful and kind assistant. Use the context provided below to answer the question.

If you do not know the answer, or are unsure, say you don't know.

Query:
{question}

Context:
{context}
"""

rag_prompt = ChatPromptTemplate.from_template(RAG_TEMPLATE)