# RAG Pipeline using LlamaIndex

## Pre-requisites

In [1]:
%pip install llama-index llama-index-embeddings-huggingface llama-index-vector-stores-chroma -q -U

Note: you may need to restart the kernel to use updated packages.


In [3]:
# load openai api key
import os

from dotenv import load_dotenv
load_dotenv()

if not 'OPENAI_API_KEY' in os.environ:
    raise ValueError('OPENAI_API_KEY is not set')

## Dataset Loader

In [2]:
# Import the sample dataset from the library
from validmind.datasets.llm.rag import rfp

raw_df = rfp.load_data()
train_df, test_df = rfp.preprocess(raw_df)

In [4]:
train_df.head()

Unnamed: 0,Project_Title,question,ground_truth,Area,Requester,Status,id
27,Automated Document Processing System Using AI ...,How is user interface and experience considere...,Our design philosophy centers on simplicity an...,General,Bank D,Awarded,0b7db970-f0d0-4199-afe4-7d1fa99a1720
1,Generative AI Solutions for Fraud Detection an...,How do you maintain your AI applications with ...,We maintain a dedicated R&D team focused on in...,General,Bank E,Under Review,96515a41-d3d4-4247-afc8-7bfdfd7e2647
66,Gen AI-Driven Financial Advisory System,Describe how your AI solutions manage and miti...,We implement and maintain robust risk manageme...,AI Regulation,Bank A,Under Review,6cd03a66-ce41-45a5-8d20-67bc4183298f
78,Implementation of AI Chatbots for Enhanced Cus...,How are your LLMs updated to reflect new data ...,We implement advanced continuous learning mech...,Large Language Models,Bank C,Awarded,45050954-b200-48a6-b6cc-75f658376df2
112,AI-Powered Risk Assessment Model Development f...,Explain how you manage and mitigate AI risks i...,We implement and maintain robust risk manageme...,AI Regulation,Bank B,Awarded,4ff9c1c7-cf3d-4742-a4cd-c25ec37335c9


In [5]:
test_df.head()

Unnamed: 0,Project_Title,question,ground_truth,Area,Requester,Status,id
70,Implementation of AI Chatbots for Enhanced Cus...,What strategies do you employ to update your A...,We maintain a dedicated R&D team focused on in...,General,Bank C,Awarded,d4aa5b1f-c3a8-4f7c-8b9d-46cc6dd92f8d
82,Implementation of AI Chatbots for Enhanced Cus...,What is your strategy for integrating LLMs smo...,Our approach involves conducting a thorough an...,Large Language Models,Bank C,Awarded,65a9775f-a524-4363-a6e3-98eca7ebc897
110,AI-Powered Risk Assessment Model Development f...,What steps do you implement to ensure AI decis...,We prioritize transparency by incorporating ex...,AI Regulation,Bank B,Awarded,735e9255-ed08-4dbd-a0b4-3bb096690c2c
79,Implementation of AI Chatbots for Enhanced Cus...,What approaches do you utilize to ensure the d...,We prioritize transparency and explainability ...,Large Language Models,Bank C,Awarded,2562adce-969f-4aac-a617-60de0abdf012
62,Gen AI-Driven Financial Advisory System,Can you describe the governance structures you...,We have established an AI Risk Council that pl...,AI Regulation,Bank A,Under Review,3766d848-2265-4f5f-bd60-98d161fb8985


## Embedding Model Selection

First, we test both the `sentence-transformers` and `openai` embedding models using their native interfaces. 

In [6]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-mpnet-base-v2")
embeddings = embed_model.get_text_embedding("Hello World!")

print(len(embeddings))
print(embeddings[:5])

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

768
[0.01917368918657303, 0.028736494481563568, -0.012354077771306038, 0.015822110697627068, 0.07908996194601059]


In [7]:
from openai import OpenAI
client = OpenAI()

embeddings = client.embeddings.create(
    input="Hello World!", 
    model="text-embedding-3-small"
).data[0].embedding

print(len(embeddings))
print(embeddings[:5])

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


1536
[-0.0030460702255368233, -0.056743934750556946, 0.02944534458220005, 0.04296409338712692, -0.04081733897328377]


Create `validmind` embedding models.

In [10]:
from validmind.models import FunctionModel

def embed(input):
    """Returns a text embedding for the given text"""
    
    embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-mpnet-base-v2")
    return embed_model.get_text_embedding(input["question"])


vm_embedder_st = FunctionModel(input_id="embedding_model_st", predict_fn=embed)

In [11]:
def embed(question):
    """Returns a text embedding for the given text"""

    return (
        client.embeddings.create(
            input=question,
            model="text-embedding-3-small",
        )
        .data[0]
        .embedding
    )


vm_embedder_openai = FunctionModel(
    input_id="embedding_model_openai", 
    predict_fn=embed
)

Create `validmind` datasets.

In [17]:
import validmind as vm

vm_train_ds = vm.init_dataset(train_df, text_column="question", __log=False)
vm_test_ds = vm.init_dataset(test_df, text_column="question", __log=False)

2024-05-07 15:37:23,480 - INFO(validmind.client): Pandas dataset detected. Initializing VM Dataset instance...
2024-05-07 15:37:23,482 - INFO(validmind.client): Pandas dataset detected. Initializing VM Dataset instance...


Generate embeddings from the text in the `question` column.

In [19]:
vm_test_ds.assign_predictions(vm_embedder_st)
vm_test_ds.assign_predictions(vm_embedder_openai)

2024-05-07 15:38:15,461 - INFO(validmind.vm_models.dataset.utils): Running predict_proba()... This may take a while
2024-05-07 15:38:15,461 - INFO(validmind.vm_models.dataset.utils): Not running predict_proba() for unsupported models.
2024-05-07 15:38:15,462 - INFO(validmind.vm_models.dataset.utils): Running predict()... This may take a while
2024-05-07 15:38:33,688 - INFO(validmind.vm_models.dataset.utils): Done running predict()
2024-05-07 15:38:33,689 - INFO(validmind.vm_models.dataset.utils): Running predict_proba()... This may take a while
2024-05-07 15:38:33,690 - INFO(validmind.vm_models.dataset.utils): Not running predict_proba() for unsupported models.
2024-05-07 15:38:33,690 - INFO(validmind.vm_models.dataset.utils): Running predict()... This may take a while
2024-05-07 15:38:40,568 - INFO(validmind.vm_models.dataset.utils): Done running predict()


Run an embedding test for both models to ensure that the embedding models function properly.

In [20]:
from validmind.tests import run_test

run = True
if run:
    result = run_test(
        "validmind.model_validation.embeddings.StabilityAnalysisRandomNoise:HFEmbeddingModel",
        inputs={"model": vm_embedder_st, "dataset": vm_test_ds},
        params={"probability": 0.3},
    )

VBox(children=(HTML(value='\n            <h1>Stability Analysis Random Noise HF Embedding Model ✅</h1>\n      …

In [21]:
run = True
if run:
    result = run_test(
        "validmind.model_validation.embeddings.StabilityAnalysisRandomNoise:OpenAIEmbeddingModel",
        inputs={"model": vm_embedder_openai, "dataset": vm_test_ds},
        params={"probability": 0.3},
    )

VBox(children=(HTML(value='\n            <h1>Stability Analysis Random Noise Open AI Embedding Model ✅</h1>\n …

## Setup Vector Store

#### Generate embeddings for the questions

In [23]:
train_df["embedding"] = vm_embedder_st.predict(train_df)
train_df.head()

Unnamed: 0,Project_Title,question,ground_truth,Area,Requester,Status,id,embedding
27,Automated Document Processing System Using AI ...,How is user interface and experience considere...,Our design philosophy centers on simplicity an...,General,Bank D,Awarded,0b7db970-f0d0-4199-afe4-7d1fa99a1720,"[-0.006611819379031658, -0.00952684972435236, ..."
1,Generative AI Solutions for Fraud Detection an...,How do you maintain your AI applications with ...,We maintain a dedicated R&D team focused on in...,General,Bank E,Under Review,96515a41-d3d4-4247-afc8-7bfdfd7e2647,"[0.0026103463023900986, 0.05872994288802147, -..."
66,Gen AI-Driven Financial Advisory System,Describe how your AI solutions manage and miti...,We implement and maintain robust risk manageme...,AI Regulation,Bank A,Under Review,6cd03a66-ce41-45a5-8d20-67bc4183298f,"[-0.024822114035487175, -0.0037610677536576986..."
78,Implementation of AI Chatbots for Enhanced Cus...,How are your LLMs updated to reflect new data ...,We implement advanced continuous learning mech...,Large Language Models,Bank C,Awarded,45050954-b200-48a6-b6cc-75f658376df2,"[0.023994842544198036, -0.0015319223748520017,..."
112,AI-Powered Risk Assessment Model Development f...,Explain how you manage and mitigate AI risks i...,We implement and maintain robust risk manageme...,AI Regulation,Bank B,Awarded,4ff9c1c7-cf3d-4742-a4cd-c25ec37335c9,"[-0.01566511206328869, 0.024723073467612267, -..."


#### Insert embeddings and questions into Vector DB

In [24]:
import chromadb
import uuid
chroma_client = chromadb.Client()

collection = chroma_client.get_or_create_collection(name="rfp_rag_collection")

# Initialize lists to store data for batch addition
all_embeddings = []
all_metadatas = []
all_documents = []
all_ids = []

# Loop through the DataFrame rows
for index, row in train_df.iterrows():

    all_embeddings.append(row["embedding"])
    all_metadatas.append({
        'ground_truth': row['ground_truth'],
        'hnsw:space': 'cosine'
    })
    all_documents.append(row['question'])
    all_ids.append(str(uuid.uuid4()))

# Add all data to the collection in a single operation
collection.add(
    ids=all_ids, 
    documents=all_documents,
    embeddings=all_embeddings,
    metadatas=all_metadatas,
)

We test the retriever by directly querying using the pre-computed embedding corresponding to the question. We expect the vector store to return the top k most similar questions, along with the metadata associated with each of these questions.

In [37]:
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.vector_stores.types import VectorStoreQuery

chroma_vector_store = ChromaVectorStore(chroma_collection=collection)
query = VectorStoreQuery(query_embedding=list(vm_test_ds.y_pred(vm_embedder_st)[0]), similarity_top_k=10)
result = chroma_vector_store.query(query)

In [38]:
for node, similarity, id_ in zip(result.nodes, result.similarities, result.ids):
    print("Node ID:", id_)
    print("Question:", node.text)
    print("Answer:", node.metadata['ground_truth'])
    print("Similarity:", similarity)
    print()

Node ID: 175f4048-1ff5-4f92-8667-803cea47a9fb
Question: How do you maintain your AI applications with the newest AI technologies and advancements?
Answer: We maintain a dedicated R&D team focused on integrating the latest AI advancements into our applications. This includes regular updates and feature enhancements based on cutting-edge technologies such as GPT (Generative Pre-trained Transformer) for natural language understanding, CNNs (Convolutional Neural Networks) for advanced image recognition tasks, and DQN (Deep Q-Networks) for decision-making processes in complex environments. Our commitment to these AI methodologies ensures that our applications remain innovative, with capabilities that adapt to evolving market demands and client needs. This approach has enabled us to enhance the predictive accuracy of our financial forecasting tools by 25% and improve the efficiency of our educational content personalization by 40%
Similarity: 0.8464225273161045

Node ID: c6021a31-1152-4227-8

## Setup Retrieval Model

In [39]:
def retrieve(input):

    contexts = []
    
    query = VectorStoreQuery(query_embedding=input["embedding_model_st"], similarity_top_k=10)

    result = chroma_vector_store.query(query)

    for node, similarity, id_ in zip(result.nodes, result.similarities, result.ids):

        context = f"Node ID: {id_}\n"
        context = f"Question: {node.text}\n"
        context += f"Answer: {node.metadata['ground_truth']}\n"
        context += f"Similarity: {similarity}\n"

        contexts.append(context)

    return contexts

vm_retriever = FunctionModel(input_id="retrieval_model", predict_fn=retrieve)

In [42]:
from validmind.models import PipelineModel

embed_retrieve_pipeline = PipelineModel(vm_embedder_st | vm_retriever, input_id="embed_retrieve_pipeline")

In [43]:
vm_test_ds.assign_predictions(embed_retrieve_pipeline)
vm_test_ds.df.head()

2024-05-07 15:50:06,897 - INFO(validmind.vm_models.dataset.utils): Running predict_proba()... This may take a while
2024-05-07 15:50:06,898 - INFO(validmind.vm_models.dataset.utils): Not running predict_proba() for unsupported models.
2024-05-07 15:50:06,898 - INFO(validmind.vm_models.dataset.utils): Running predict()... This may take a while
2024-05-07 15:50:24,790 - INFO(validmind.vm_models.dataset.utils): Done running predict()


Unnamed: 0,Project_Title,question,ground_truth,Area,Requester,Status,id,embedding_model_st_prediction,embedding_model_openai_prediction,embed_retrieve_pipeline_prediction
70,Implementation of AI Chatbots for Enhanced Cus...,What strategies do you employ to update your A...,We maintain a dedicated R&D team focused on in...,General,Bank C,Awarded,d4aa5b1f-c3a8-4f7c-8b9d-46cc6dd92f8d,"[0.013661187142133713, 0.041973263025283813, -...","[0.012765799649059772, 0.0003663294482976198, ...",[Question: How do you maintain your AI applica...
82,Implementation of AI Chatbots for Enhanced Cus...,What is your strategy for integrating LLMs smo...,Our approach involves conducting a thorough an...,Large Language Models,Bank C,Awarded,65a9775f-a524-4363-a6e3-98eca7ebc897,"[0.0031792251393198967, 0.0010557029163464904,...","[0.012765799649059772, 0.0003663294482976198, ...",[Question: What is your approach to integratin...
110,AI-Powered Risk Assessment Model Development f...,What steps do you implement to ensure AI decis...,We prioritize transparency by incorporating ex...,AI Regulation,Bank B,Awarded,735e9255-ed08-4dbd-a0b4-3bb096690c2c,"[-0.01779736764729023, 0.05280052870512009, -0...","[0.01272196788340807, 0.0003462419263087213, 0...",[Question: What measures do you take to ensure...
79,Implementation of AI Chatbots for Enhanced Cus...,What approaches do you utilize to ensure the d...,We prioritize transparency and explainability ...,Large Language Models,Bank C,Awarded,2562adce-969f-4aac-a617-60de0abdf012,"[0.01752305217087269, -0.017317142337560654, -...","[0.012765799649059772, 0.0003663294482976198, ...",[Question: What actions do you take to make th...
62,Gen AI-Driven Financial Advisory System,Can you describe the governance structures you...,We have established an AI Risk Council that pl...,AI Regulation,Bank A,Under Review,3766d848-2265-4f5f-bd60-98d161fb8985,"[0.017403364181518555, 0.010175510309636593, -...","[0.012765799649059772, 0.0003663294482976198, ...",[Question: Can you discuss your governance fra...


## Setup Generation Model

In [44]:
from llama_index.core import Prompt

template = """
Answer the question based only on the following context. 
If you cannot answer the question with the context, please respond with 'I don't know':

### CONTEXT
{context}

### QUESTION
Question: {question}
"""
prompt = Prompt(template)

In [47]:
formatted_prompt = prompt.format(
    context=vm_test_ds.df.iloc[0]['embed_retrieve_pipeline_prediction'][0], 
    question=vm_test_ds.df.iloc[0]['question'],
)
print(formatted_prompt)


Answer the question based only on the following context. 
If you cannot answer the question with the context, please respond with 'I don't know':

### CONTEXT
Question: How do you maintain your AI applications with the newest AI technologies and advancements?
Answer: We maintain a dedicated R&D team focused on integrating the latest AI advancements into our applications. This includes regular updates and feature enhancements based on cutting-edge technologies such as GPT (Generative Pre-trained Transformer) for natural language understanding, CNNs (Convolutional Neural Networks) for advanced image recognition tasks, and DQN (Deep Q-Networks) for decision-making processes in complex environments. Our commitment to these AI methodologies ensures that our applications remain innovative, with capabilities that adapt to evolving market demands and client needs. This approach has enabled us to enhance the predictive accuracy of our financial forecasting tools by 25% and improve the efficien

In [52]:
from openai import OpenAI


client = OpenAI()

def generate(input):

    formatted_prompt = prompt.format(
        context=input[vm_retriever.input_id], 
        question=input["question"],
    )
    
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": formatted_prompt},
        ],
    )

    return response.choices[0].message.content

vm_generator = FunctionModel(input_id="generation_model", predict_fn=generate)

## Setup a ValidMind RAG Model

In [53]:
vm_rag_model = PipelineModel(vm_embedder_st | vm_retriever | vm_generator, input_id="rag_model")

In [54]:
vm_test_ds.assign_predictions(vm_rag_model)
vm_test_ds.df.head()

2024-05-07 15:54:08,599 - INFO(validmind.vm_models.dataset.utils): Running predict_proba()... This may take a while
2024-05-07 15:54:08,600 - INFO(validmind.vm_models.dataset.utils): Not running predict_proba() for unsupported models.
2024-05-07 15:54:08,600 - INFO(validmind.vm_models.dataset.utils): Running predict()... This may take a while
2024-05-07 15:55:19,762 - INFO(validmind.vm_models.dataset.utils): Done running predict()


Unnamed: 0,Project_Title,question,ground_truth,Area,Requester,Status,id,embedding_model_st_prediction,embedding_model_openai_prediction,embed_retrieve_pipeline_prediction,rag_model_prediction
70,Implementation of AI Chatbots for Enhanced Cus...,What strategies do you employ to update your A...,We maintain a dedicated R&D team focused on in...,General,Bank C,Awarded,d4aa5b1f-c3a8-4f7c-8b9d-46cc6dd92f8d,"[0.013661187142133713, 0.041973263025283813, -...","[0.012765799649059772, 0.0003663294482976198, ...",[Question: How do you maintain your AI applica...,Answer: We maintain a dedicated R&D team focus...
82,Implementation of AI Chatbots for Enhanced Cus...,What is your strategy for integrating LLMs smo...,Our approach involves conducting a thorough an...,Large Language Models,Bank C,Awarded,65a9775f-a524-4363-a6e3-98eca7ebc897,"[0.0031792251393198967, 0.0010557029163464904,...","[0.012765799649059772, 0.0003663294482976198, ...",[Question: What is your approach to integratin...,I don't know
110,AI-Powered Risk Assessment Model Development f...,What steps do you implement to ensure AI decis...,We prioritize transparency by incorporating ex...,AI Regulation,Bank B,Awarded,735e9255-ed08-4dbd-a0b4-3bb096690c2c,"[-0.01779736764729023, 0.05280052870512009, -0...","[0.01272196788340807, 0.0003462419263087213, 0...",[Question: What measures do you take to ensure...,Answer: The steps implemented to ensure AI dec...
79,Implementation of AI Chatbots for Enhanced Cus...,What approaches do you utilize to ensure the d...,We prioritize transparency and explainability ...,Large Language Models,Bank C,Awarded,2562adce-969f-4aac-a617-60de0abdf012,"[0.01752305217087269, -0.017317142337560654, -...","[0.012765799649059772, 0.0003663294482976198, ...",[Question: What actions do you take to make th...,We prioritize transparency and explainability ...
62,Gen AI-Driven Financial Advisory System,Can you describe the governance structures you...,We have established an AI Risk Council that pl...,AI Regulation,Bank A,Under Review,3766d848-2265-4f5f-bd60-98d161fb8985,"[0.017403364181518555, 0.010175510309636593, -...","[0.012765799649059772, 0.0003663294482976198, ...",[Question: Can you discuss your governance fra...,Answer: We have established an AI Risk Council...


In [None]:
# vm_ragas_ds = vm.init_dataset(result_df, __log=False)

In [None]:
import plotly.express as px

def plot_distribution(scores):
    # plot distribution of scores (0-1) from ragas metric
    # scores is a list of floats
    fig = px.histogram(x=scores, nbins=10)
    fig.show()

In [None]:
import warnings

warnings.filterwarnings("ignore")

In [None]:
# result = run_test(
#     "validmind.model_validation.ragas.AnswerSimilarity",
#     inputs={"dataset": vm_ragas_ds},
#     show=False,
# )
# plot_distribution(result.metric.summary.results[0].data)