# RAG for RFP Q&A Demo

## Notebook setup

In [None]:
import pandas as pd

In [None]:
%pip install -qU langchain langchain-openai langchain-cohere

In [None]:
%pip install -qU qdrant-client lark

In [None]:
import os

import dotenv

# Load variables from a local .env file (if one exists) into the environment.
dotenv.load_dotenv()

# Fail fast when the key is unset *or* empty (e.g. a blank `OPENAI_API_KEY=`
# line in .env); the OpenAI clients created later in the notebook need it.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError("OPENAI_API_KEY not found")

## Data preparation

### Load documents 

In [None]:
from langchain_community.document_loaders.csv_loader import CSVLoader

# CSV exports to ingest, one file per vendor RFP
rfp_file_paths = [
    "datasets/rfp_genai_app_vendor_ID1.csv",
    "datasets/rfp_genai_app_vendor_ID2.csv",
    "datasets/rfp_genai_app_vendor_ID3.csv",
    "datasets/rfp_genai_app_vendor_ID4.csv",
    "datasets/rfp_genai_app_vendor_ID5.csv",
]

# Accumulates the documents produced from every file
documents = []

for file_path in rfp_file_paths:
    # The listed columns are handed to the loader as metadata_columns;
    # whatever load() returns for this file is appended to the shared list.
    documents.extend(
        CSVLoader(
            file_path=file_path,
            metadata_columns=[
                "Area",
                "Project_Title",
                "Last_Accessed_At",
                "Requester",
                "Status",
            ],
        ).load()
    )

Preview the loaded documents:

In [None]:
# Show the first five loaded documents as the cell output.
documents[:5]

Access the metadata:

In [None]:
# Show the metadata attached to the first loaded document.
documents[0].metadata

## Split the documents into chunks

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Splitter configured with chunk_size=500 and chunk_overlap=10;
# add_start_index=True asks it to record where each chunk starts.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=10,
    add_start_index=True,
)

# Break every loaded document into chunks
splits = text_splitter.split_documents(documents)

In [None]:
# Show the first ten chunks as the cell output.
splits[:10]

## Store chunks into a vectorstore

In [None]:
# Import Chroma from langchain_community, the package this notebook already
# uses for CSVLoader; the legacy `langchain.vectorstores` path is deprecated.
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Embedding model handed to the vector store
embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small")

# Chroma collection "docs_store", configured with the "docs-db" persist directory
vectorstore = Chroma(
    collection_name="docs_store",
    embedding_function=embeddings_model,
    persist_directory="docs-db",
)

## Create new chunk IDs based on chunk metadata

Extend the existing index to include the metadata fields. To do this, we define a hash function that creates a unique, stable identifier for each split from its metadata.

In [None]:
# Show the metadata of the first chunk; this is what the ID hash is computed from.
splits[0].metadata

In [None]:
import hashlib
import json
from langchain_core.documents import Document

def stable_hash_meta(doc: Document) -> str:
    """Return a stable SHA-1 hex digest of a document's metadata.

    The metadata is serialized to JSON with sorted keys, so documents with
    equal metadata get the same ID regardless of key insertion order. Only
    the metadata is hashed; the document's content is not part of the ID.

    Args:
        doc: Object exposing a ``metadata`` mapping with JSON-serializable
            values.

    Returns:
        The 40-character hexadecimal SHA-1 digest of the serialized metadata.

    Raises:
        ValueError: If ``doc`` has no ``metadata`` attribute.
        TypeError: If the metadata holds values ``json.dumps`` cannot
            serialize.
    """
    # Guard only the attribute access, so an AttributeError raised anywhere
    # else is not misreported as "missing metadata".
    try:
        metadata = doc.metadata
    except AttributeError as err:
        raise ValueError("Document does not have metadata.") from err

    # sort_keys makes the serialized form independent of dict ordering
    metadata_json = json.dumps(metadata, sort_keys=True)
    return hashlib.sha1(metadata_json.encode()).hexdigest()

In [None]:
# Pair each chunk with a deterministic ID derived from its metadata.
splits_ids = [{"doc": split, "id": stable_hash_meta(split)} for split in splits]

# IDs currently stored in the collection.
existing_ids = vectorstore.get()["ids"]

# Build the lookup set once so each membership test is O(1) instead of
# scanning the whole ID list for every chunk.
existing_id_lookup = set(existing_ids)
new_splits_ids = [split for split in splits_ids if split["id"] not in existing_id_lookup]

# NOTE(review): new_splits_ids is computed here, but no visible cell adds
# these chunks to `vectorstore` — confirm ingestion happens elsewhere,
# otherwise a fresh "docs-db" stays empty and retrieval returns nothing.


## Create evaluation dataset

In [None]:
# Read each RFP CSV into its own DataFrame
all_rpfs = list(map(pd.read_csv, rfp_file_paths))

# Stack the per-file frames under a fresh index and keep only the columns
# that identify each question
df = pd.concat(all_rpfs, ignore_index=True)[
    ["Project_Title", "RFP_Question_ID", "RFP_Question"]
]
df.head()


In [None]:
# Retriever view over the vector store; search_kwargs requests k=10 results per query.
naive_retriever = vectorstore.as_retriever(search_kwargs={"k" : 10})

In [None]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt text with two placeholders: {question} (the user query) and
# {context} (the retrieved material). The trailing backslash after the opening
# quotes keeps the template from starting with a blank line.
RAG_TEMPLATE = """\
You are a helpful and kind assistant. Use the context provided below to answer the question.

If you do not know the answer, or are unsure, say you don't know.

Query:
{question}

Context:
{context}
"""

# Chat prompt built from the template above; it is piped into the chat model by the chain.
rag_prompt = ChatPromptTemplate.from_template(RAG_TEMPLATE)

In [None]:
from langchain_openai import ChatOpenAI

# Chat model created with library defaults; no model name or temperature is pinned here.
chat_model = ChatOpenAI()

In [None]:
from langchain_core.runnables import RunnablePassthrough
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser

# Invoke with {"question": "<user question>"}. The result is a dict with
# "response" (the chat model's output) and "context" (what the retriever returned).
naive_retrieval_chain = (
    # Step 1: build {"context", "question"} from the input dict.
    #   "context"  : the "question" value piped into naive_retriever
    #   "question" : the "question" value passed through unchanged
    {"context": itemgetter("question") | naive_retriever, "question": itemgetter("question")}
    # Step 2: assign "context" to its own current value; other keys pass through.
    #   NOTE(review): this looks like a no-op — confirm before removing it.
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # Step 3: produce the final output dict.
    #   "response" : rag_prompt formatted from the step-2 dict, then piped into chat_model
    #   "context"  : the "context" value carried over from step 2
    | {"response": rag_prompt | chat_model, "context": itemgetter("context")}
)

In [None]:
# Run the chain on a sample question and show the text content of the model's response.
naive_retrieval_chain.invoke({"question" : "List the questions related to NIST?"})["response"].content