# RAG Pipeline using LlamaIndex

## Prerequisites

In [None]:
%pip install llama-index llama-index-embeddings-huggingface llama-index-vector-stores-chroma -q -U

In [None]:
# load openai api key
import os

from dotenv import load_dotenv
load_dotenv()

# Fail fast if the key is still missing from the environment after load_dotenv().
if 'OPENAI_API_KEY' not in os.environ:
    raise ValueError('OPENAI_API_KEY is not set')

## Dataset Loader

In [None]:
import pandas as pd

TRAIN_FILES = [
    "datasets/rag/rfp_existing_questions_client_1.csv",
    "datasets/rag/rfp_existing_questions_client_2.csv",
    "datasets/rag/rfp_existing_questions_client_3.csv",
]

TEST_FILES = [
    "datasets/rag/rfp_existing_questions_client_4.csv",
    "datasets/rag/rfp_existing_questions_client_5.csv",
]

def load(file_paths):
    """Read the given CSV files into one DataFrame with standardized column names.

    Args:
        file_paths: Iterable of CSV file paths to read.

    Returns:
        A single DataFrame holding the rows of all files in the order given,
        with a fresh 0..n-1 index and the columns ``RFP_Question`` /
        ``RFP_Answer`` renamed to ``question`` / ``ground_truth``.
    """
    column_map = {"RFP_Question": "question", "RFP_Answer": "ground_truth"}

    frames = []
    for path in file_paths:
        frames.append(pd.read_csv(path))

    combined = pd.concat(frames, ignore_index=True)
    return combined.rename(columns=column_map)

# Load and concatenate training and testing data
train_df = load(TRAIN_FILES)
test_df = load(TEST_FILES)

In [None]:
train_df.head()

In [None]:
test_df.head()

## Embedding Model Selection

First, we test both the `sentence-transformers` and `openai` embedding models using their native interfaces. 

In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Smoke test: embed a fixed string with the sentence-transformers model through LlamaIndex.
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-mpnet-base-v2")
embeddings = embed_model.get_text_embedding("Hello World!")

# Show the vector length and its first five values.
print(len(embeddings))
print(embeddings[:5])

In [None]:
from openai import OpenAI
# OpenAI client; also used by the OpenAI `embed` function defined further down.
client = OpenAI()

# Smoke test: embed the same fixed string with the OpenAI embeddings endpoint
# and keep the vector of the first (only) returned item.
embeddings = client.embeddings.create(
    input="Hello World!", 
    model="text-embedding-3-small"
).data[0].embedding

# Show the vector length and its first five values.
print(len(embeddings))
print(embeddings[:5])

Create `validmind` embedding models.

In [None]:
from validmind.models import EmbeddingModel

# Load the sentence-transformers model once. Instantiating it inside `embed`
# would reload the model for every single row that gets embedded.
st_embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-mpnet-base-v2")


def embed(question):
    """Return the sentence-transformers embedding for the given text.

    Args:
        question: Text to embed.

    Returns:
        The embedding produced by ``st_embed_model.get_text_embedding``.
    """
    return st_embed_model.get_text_embedding(question)


vm_embedder_st = EmbeddingModel(input_id="embedding_model_st", predict_fn=embed)

In [None]:
def embed(question):
    """Return the OpenAI ``text-embedding-3-small`` embedding for the given text."""
    response = client.embeddings.create(
        model="text-embedding-3-small",
        input=question,
    )
    # A single input was sent, so the vector is on the first returned item.
    first_item = response.data[0]
    return first_item.embedding


vm_embedder_openai = EmbeddingModel(input_id="embedding_model_openai", predict_fn=embed)

Generate embeddings from the text in the `question` column.

In [None]:
test_df[vm_embedder_openai.output_column] = vm_embedder_openai.predict(test_df)
test_df.head()

In [None]:
test_df[vm_embedder_st.output_column] = vm_embedder_st.predict(test_df)
test_df.head()

Create `validmind` datasets.

In [None]:
import validmind as vm

vm_train_ds = vm.init_dataset(train_df, text_column="question", __log=False)
vm_test_ds = vm.init_dataset(test_df, text_column="question", __log=False)


Run an embedding test for both models to ensure that the embedding models function properly.

In [None]:
from validmind.tests import run_test

# Manual toggle: set `run` to False to skip this test when re-running the notebook.
run = True
if run:
    # Stability test against the sentence-transformers embedder on the test dataset.
    # The suffix after ":" in the test ID distinguishes this run from the OpenAI one.
    result = run_test(
        "validmind.model_validation.embeddings.StabilityAnalysisRandomNoise:HFEmbeddingModel",
        inputs={"model": vm_embedder_st, "dataset": vm_test_ds},
        params={"probability": 0.3},
    )

In [None]:
# Manual toggle: set `run` to False to skip this test when re-running the notebook.
run = True
if run:
    # Same stability test, this time against the OpenAI embedder.
    result = run_test(
        "validmind.model_validation.embeddings.StabilityAnalysisRandomNoise:OpenAIEmbeddingModel",
        inputs={"model": vm_embedder_openai, "dataset": vm_test_ds},
        params={"probability": 0.3},
    )

## Setup Vector Store

#### Generate embeddings for the training questions

In [None]:
train_df[vm_embedder_st.output_column] = vm_embedder_st.predict(train_df)
train_df.head()

#### Insert embeddings and questions into Vector DB

In [None]:
import chromadb
import uuid
chroma_client = chromadb.Client()

# The distance function is a collection-level setting in Chroma, so it is
# passed as collection metadata. Putting 'hnsw:space' into each document's
# metadata only stores it as a plain field and does not configure the index.
collection = chroma_client.get_or_create_collection(
    name="rfp_rag_collection",
    metadata={"hnsw:space": "cosine"},
)

# train_df was embedded with vm_embedder_st, so read that model's output column.
all_embeddings = train_df[vm_embedder_st.output_column].tolist()
all_documents = train_df["question"].tolist()
# Keep the reference answer next to each question so it can be returned on retrieval.
all_metadatas = [
    {"ground_truth": answer} for answer in train_df["ground_truth"].tolist()
]
# NOTE: IDs are random, so re-running this cell against the same collection
# adds every row again under new IDs.
all_ids = [str(uuid.uuid4()) for _ in range(len(train_df))]

# Add all data to the collection in a single operation
collection.add(
    ids=all_ids,
    documents=all_documents,
    embeddings=all_embeddings,
    metadatas=all_metadatas,
)

We test the retriever by directly querying using the pre-computed embedding corresponding to the question. We expect the vector store to return the top k most similar questions, along with the metadata associated with each of these questions.

In [None]:
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.vector_stores.types import VectorStoreQuery

# Wrap the Chroma collection so it can be queried through LlamaIndex.
chroma_vector_store = ChromaVectorStore(chroma_collection=collection)
# Query with the first test row's precomputed embedding, asking for the 10 nearest entries.
query = VectorStoreQuery(query_embedding=test_df['embedding'][0], similarity_top_k=10)
result = chroma_vector_store.query(query)

In [None]:
# Print each hit: ID, stored question, stored answer and similarity, then a blank line.
for id_, node, similarity in zip(result.ids, result.nodes, result.similarities):
    print(f"Node ID: {id_}")
    print(f"Question: {node.text}")
    print(f"Answer: {node.metadata['ground_truth']}")
    print(f"Similarity: {similarity}")
    print()

## Setup Retrieval Model

In [None]:
from validmind.models import RetrievalModel

def retrieve(embedding):
    """Return formatted context strings for the stored questions nearest to ``embedding``.

    Args:
        embedding: Query embedding vector passed to the vector store.

    Returns:
        A list with one string per retrieved node (at most 10 are requested),
        each containing the node ID, the stored question, its ground-truth
        answer and the similarity.
    """
    query = VectorStoreQuery(query_embedding=embedding, similarity_top_k=10)
    result = chroma_vector_store.query(query)

    contexts = []
    for node, similarity, id_ in zip(result.nodes, result.similarities, result.ids):
        # Build the whole entry in one expression. Previously the "Node ID"
        # line was overwritten by a plain `=` on the next line and was lost.
        contexts.append(
            f"Node ID: {id_}\n"
            f"Question: {node.text}\n"
            f"Answer: {node.metadata['ground_truth']}\n"
            f"Similarity: {similarity}\n"
        )

    return contexts

vm_retriever = RetrievalModel(input_id="retrieval_model", predict_fn=retrieve)

In [None]:
test_df[vm_retriever.output_column] = vm_retriever.predict(test_df)
test_df.head()

## Setup Generation Model

In [None]:
from llama_index.core import Prompt

template = """
Answer the question based only on the following context. 
If you cannot answer the question with the context, please respond with 'I don't know':

### CONTEXT
{context}

### QUESTION
Question: {question}
"""
prompt = Prompt(template)

In [None]:
formatted_prompt = prompt.format(
    context=test_df[vm_retriever.output_column][0], 
    question=test_df['question'][0]
)
print(formatted_prompt)

In [None]:
from openai import OpenAI
from validmind.models import GenerationModel


client = OpenAI()

def generate(question, contexts):
    """Answer ``question`` with the chat model, using only the retrieved contexts.

    Args:
        question: The question text to answer.
        contexts: Retrieved context entries; a list/tuple of strings (as
            returned by ``retrieve``) or an already formatted string.

    Returns:
        The message content of the first completion choice.
    """
    # Formatting a list straight into the template would insert its Python
    # repr (brackets, quotes, escaped "\n"). Join the entries into plain text.
    if isinstance(contexts, (list, tuple)):
        context_text = "\n".join(str(entry) for entry in contexts)
    else:
        context_text = contexts

    formatted_prompt = prompt.format(
        context=context_text,
        question=question,
    )

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": formatted_prompt},
        ],
    )

    return response.choices[0].message.content

vm_generator = GenerationModel(input_id="generation_model", predict_fn=generate)

In [None]:
test_df[vm_generator.output_column] = vm_generator.predict(test_df)
test_df.head()

## Setup a ValidMind RAG Model

In [None]:
from validmind.models import RAGModel

# Query embeddings must come from the same model that produced the indexed
# embeddings. train_df was embedded with vm_embedder_st before being inserted
# into the vector store, so the pipeline uses that embedder as well.
vm_rag_model = RAGModel(
    embedder=vm_embedder_st,
    retriever=vm_retriever,
    generator=vm_generator,
    input_id="rag_pipeline",
)

In [None]:
result_df = vm_rag_model.predict(test_df)
result_df.head()

In [None]:
vm_ragas_ds = vm.init_dataset(result_df, __log=False)

In [None]:
import plotly.express as px

def plot_distribution(scores, title=None):
    """Show a 10-bin histogram of metric scores.

    Args:
        scores: Sequence of numeric scores (ragas metric scores, expected in 0-1).
        title: Optional figure title; no title is drawn when omitted.
    """
    fig = px.histogram(x=scores, nbins=10, title=title)
    # Label the axes so the figure can be read without the surrounding code.
    fig.update_layout(xaxis_title="score", yaxis_title="count")
    fig.show()

In [None]:
import warnings

warnings.filterwarnings("ignore")

In [None]:
# Run the ragas AnswerSimilarity test on the RAG results dataset.
# NOTE(review): show=False presumably suppresses the default rendering so that
# only the custom histogram below is displayed — confirm against validmind docs.
result = run_test(
    "validmind.model_validation.ragas.AnswerSimilarity",
    inputs={"dataset": vm_ragas_ds},
    show=False,
)
# NOTE(review): assumes results[0].data is a flat list of scores, as
# plot_distribution expects — confirm the shape of the summary data.
plot_distribution(result.metric.summary.results[0].data)