# WIP: RAG Pipeline using LlamaIndex

## Pre-requisites

In [None]:
%pip install llama-index llama-index-embeddings-huggingface llama-index-vector-stores-chroma -q -U

In [None]:
# load openai api key
import os

from dotenv import load_dotenv
load_dotenv()

# Fail fast when the key is missing *or blank*: an empty value would pass a plain
# membership test but is unusable for the OpenAI clients created later in the notebook.
if not os.environ.get('OPENAI_API_KEY'):
    raise ValueError('OPENAI_API_KEY is not set')

In [None]:
os.environ['TOKENIZERS_PARALLELISM'] = "false"

In [None]:
import csv

def extract_metadata_from_csv(file_path, metadata_columns):
    """Collect the distinct values found in selected columns of a CSV file.

    Args:
        file_path: Path of a UTF-8 encoded CSV file whose first row is the header.
        metadata_columns: Iterable of column names to collect values for.

    Returns:
        A dict mapping each requested column that exists in the file (and for which
        at least one data row was read) to a list of its distinct values, in order
        of first appearance. Requested columns that are absent from the file are
        omitted from the result.
    """
    # A dict per column is used as an insertion-ordered set, so the returned lists
    # are deterministic (a plain set would yield a different order on every run).
    unique_values = {}
    # newline='' is what the csv module expects: it lets the reader itself handle
    # line endings, including newlines embedded inside quoted fields.
    with open(file_path, mode='r', encoding='utf-8', newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            for column in metadata_columns:
                if column in row:
                    unique_values.setdefault(column, {})[row[column]] = None
    # Convert to plain lists for JSON serializability
    return {column: list(values) for column, values in unique_values.items()}

## Data exploration

### Vendor contracts dataset

Explore the contracts by first loading them into a pandas DataFrame.

In [None]:
import pandas as pd

CONTRACT_FILES = [
    "datasets/rag/vendor_contracts_001_020.csv",
    "datasets/rag/vendor_contracts_021_040.csv",
    "datasets/rag/vendor_contracts_041_060.csv",
]

# Concatenate all DataFrames into a single DataFrame
contracts_df = pd.concat(
    [pd.read_csv(file) for file in CONTRACT_FILES], 
    ignore_index=True
)

contracts_df.head()

### Explore the questions and answers dataset

Now we explore the questions and answers dataset. 

In [None]:
QUESTION_FILES = [
    "datasets/rag/vendor_contracts_questions.csv",
]

# Concatenate all DataFrames into a single DataFrame
questions_df = pd.concat(
    [pd.read_csv(file) for file in QUESTION_FILES], 
    ignore_index=True
)

questions_df.head()

The `train_questions_df`, which contains questions and answers, will be stored in the vector store. This will simulate questions that have already been answered and are stored in the database. The `test_questions_df`, on the other hand, will act as a set of new questions posed by the user. The answers in this dataset will be used as ground truth for evaluation.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

questions_df = pd.read_csv("datasets/rag/vendor_contracts_questions.csv")
train_questions_df, test_questions_df = train_test_split(questions_df, test_size=0.20, random_state=42)

# train_test_split keeps the original row labels, now shuffled and with gaps.
# Reset them to 0..n-1 so that later integer lookups such as
# df['question'][question_id-1] always address an existing row of the split.
# Rename columns
train_questions_df = train_questions_df.reset_index(drop=True).rename(columns={
    'Question #': 'question_id',
    'Question': 'question',
    'Answer': 'answer'})
# In the test split the answers serve as the ground truth for evaluation
test_questions_df = test_questions_df.reset_index(drop=True).rename(columns={
    'Question #': 'question_id',
    'Question': 'question',
    'Answer': 'ground_truth'})

In [None]:
import validmind as vm

vm_train_questions_ds = vm.init_dataset(
    input_id="train_questions",
    dataset=train_questions_df,
    text_column="question",
    __log=False,
)

vm_test_questions_ds = vm.init_dataset(
    input_id="test_questions",
    dataset=test_questions_df,
    text_column="question",
    __log=False,
)

In [None]:
vm_train_questions_ds.df.head()

In [None]:
vm_test_questions_ds.df

### Evaluation

In [None]:
vm.tests.run_test(
    test_id="validmind.data_validation.Duplicates",
    inputs={
        "dataset": vm_train_questions_ds
    }
)

In [None]:
vm.tests.run_test(
    test_id="validmind.data_validation.nlp.StopWords",
    inputs={
        "dataset": vm_train_questions_ds
    }
)

In [None]:
vm.tests.run_test(
    test_id="validmind.data_validation.nlp.Punctuations",
    inputs={
        "dataset": vm_train_questions_ds
    }
)

In [None]:
vm.tests.run_test(
    test_id="validmind.data_validation.nlp.CommonWords",
    inputs={
        "dataset": vm_train_questions_ds
    }
)

In [None]:
vm.tests.run_test(
    test_id="validmind.data_validation.nlp.LanguageDetection",
    inputs={
        "dataset": vm_train_questions_ds
    }
)

In [None]:
vm.tests.run_test(
    "validmind.data_validation.nlp.Toxicity",
    inputs={
        "dataset": vm_train_questions_ds
    }
)

In [None]:
vm.tests.run_test(
    "validmind.data_validation.nlp.PolarityAndSubjectivity",
    inputs={
        "dataset": vm_train_questions_ds
    }
)

In [None]:
vm.tests.run_test(
    "validmind.data_validation.nlp.Sentiment",
    inputs={
        "dataset": vm_train_questions_ds
    }
)

## Document loaders

### Explore `CSVLoader` from LangChain

Now we will try the `CSVLoader` from `langchain` and check the document properties.

In [None]:
from langchain_community.document_loaders.csv_loader import CSVLoader

lc_documents = [] 

# Iterate through each file path in the list
for file_path in CONTRACT_FILES:
    loader = CSVLoader(
        file_path=file_path,
    )

    # Load a document from the current CSV file
    doc = loader.load()
    
    # Append documents
    lc_documents.extend(doc)

In [None]:
number_to_print = 2

for index, doc in enumerate(lc_documents[:number_to_print]):
    print(f"Document {index + 1}:")
    print("Page Content:")
    print(doc.page_content)
    print("Metadata:")
    for key, value in doc.metadata.items():
        print(f"{key}: {value}")
    print()

Next, we control which columns are stored as metadata and which are stored as page content; the page content is what will be converted into embeddings.

In [None]:
lc_documents = [] 

# Iterate through each file path in the list
for file_path in CONTRACT_FILES:
    loader = CSVLoader(
        file_path=file_path,
        metadata_columns=["Contract ID","Supported BSLs", "Engagement Terms"]
    )

    # Load a document from the current CSV file
    doc = loader.load()
    
    # Append documents
    lc_documents.extend(doc)

In [None]:
number_to_print = 2

for index, doc in enumerate(lc_documents[:number_to_print]):
    print(f"Document {index + 1}:")
    print("Page Content:")
    print(doc.page_content)
    print("Metadata:")
    for key, value in doc.metadata.items():
        print(f"{key}: {value}")
    print()

### Explore `CSVReader` from LlamaIndex

We will test the `CSVReader` from `llama_index` and review the properties of the documents it loads.

In [None]:
from llama_index.readers.file import CSVReader
from pathlib import Path

# NOTE(review): concat_rows=False presumably yields one document per CSV row
# (header row included) rather than one document per file — confirm against CSVReader.
reader = CSVReader(concat_rows=False)
llama_documents = []
# Every column of the contracts DataFrame is treated as a metadata column
metadata_columns = contracts_df.columns.tolist()
print(f"Metadata columns: {metadata_columns}")

# Iterate over each file path in CONTRACT_FILES
for file_path in CONTRACT_FILES:
    
    # Convert string file path to Path object
    path_obj = Path(file_path)
    
    # Load the documents for the current file
    documents = reader.load_data(
        file=path_obj,
    )

    # Remove the first document, which corresponds to the header row
    documents = documents[1:]

    # Accumulate the remaining documents across all files
    llama_documents.extend(documents)

print(f"Loaded {len(llama_documents)} documents")

Insert metadata into each document created by `CSVReader`.

In [None]:
# Documents and contract rows are paired purely by position, so the two counts
# must agree; otherwise metadata would be attached to the wrong contract.
if len(llama_documents) != len(contracts_df):
    raise ValueError(
        f"Cannot align metadata: {len(llama_documents)} documents "
        f"but {len(contracts_df)} contract rows"
    )

for i, doc in enumerate(llama_documents):
    # Check if metadata already exists and is a dictionary; if not, initialize it
    if not isinstance(getattr(doc, 'metadata', None), dict):
        doc.metadata = {}

    # Fetch the matching contract row once per document rather than once per column
    contract_row = contracts_df.iloc[i]

    # Existing metadata is preserved and new keys are added or updated
    doc.metadata.update({
        column: contract_row[column] for column in metadata_columns if column in contracts_df.columns
    })

In [None]:
number_to_print = 2

for i, doc in enumerate(llama_documents[:number_to_print]):
    print("Document ID:", doc.id_)
    print("Text Content:", doc.text)
    print("Metadata:")
    for key, value in doc.metadata.items():
        print(f"{key}: {value}")
    print()

## Split documents into chunks

We use `SentenceSplitter`, which aims to keep sentences and paragraphs together. This reduces the likelihood of hanging sentences or sentence fragments at the end of the node chunk.

Split the llama documents into chunk nodes:

In [None]:
from llama_index.core.node_parser import SentenceSplitter

splitter = SentenceSplitter()
nodes = splitter.get_nodes_from_documents(llama_documents)
nodes[:2]

Inspect the chunk nodes:

In [None]:
num_to_print = 2

for index, node in enumerate(nodes[:num_to_print]):
    print(f"Node ID: {node.id_}")
    print(f"Text Content: {node.text}")
    print(f"Start Char IDX: {node.start_char_idx}")
    print(f"End Char IDX: {node.end_char_idx}")
    print("Metadata:")
    for key, value in node.metadata.items():
        print(f"{key}: {value}")
    print(f"Embeddings: {node.embedding}")
    print() 

Convert the chunk nodes into a dataframe:

In [None]:
data = []
for index, node in enumerate(nodes):
    # Start with non-metadata fields
    node_data = {
        "Node ID": node.id_,
        "Text Content": node.text
    }
    
    # Add metadata fields dynamically if they exist in the node's metadata
    node_data.update({
        key: node.metadata[key]  # Use keys directly without removing spaces
        for key in metadata_columns
        if key in node.metadata
    })
    
    data.append(node_data)

# Convert the list of dictionaries to a DataFrame
contract_chunk_nodes_df = pd.DataFrame(data)
contract_chunk_nodes_df.head()

In [None]:
vm_contracts_ds = vm.init_dataset(
    input_id="contracts_nodes",
    dataset=contract_chunk_nodes_df,
    text_column="Text Content", 
    __log=False,
)

## Compute embeddings on contract chunks

Define the embed function:

In [None]:
from llama_index.embeddings.openai import OpenAIEmbedding
from validmind.models import FunctionModel

client = OpenAIEmbedding()

def embed_contracts(input):
    """Return the OpenAI embedding of a contract chunk.

    Args:
        input: Row-like mapping; only its "Text Content" entry is read.

    Returns:
        Whatever `OpenAIEmbedding.get_text_embedding` returns for that text.

    A new `text-embedding-3-small` embedder is constructed on every call.
    """
    embedder = OpenAIEmbedding(model_name="text-embedding-3-small")
    chunk_text = input["Text Content"]
    return embedder.get_text_embedding(chunk_text)

Create an embedding model using the ValidMind `FunctionModel`:

In [None]:
vm_embedder_contracts = FunctionModel(input_id="contracts_openai_embedding", predict_fn=embed_contracts)

Compute embeddings by assigning the predictions of the `vm_embedder_contracts` model, which embeds the `Text Content` column, to the `vm_contracts_ds` dataset:

In [None]:
vm_contracts_ds.assign_predictions(vm_embedder_contracts)

Verify that the embeddings corresponding to the `Text Content` column have been correctly assigned to the dataset in a column named `<input_id>_prediction`.

In [None]:
vm_contracts_ds.df.head()

### Evaluation

In [None]:
from validmind.tests import run_test

In [None]:
run = False
if run:
    
    test= run_test(
        "validmind.model_validation.embeddings.CosineSimilarityHeatmap",
        inputs = {
            "dataset": vm_contracts_ds,
            "model": vm_embedder_contracts,
        }
    )

In [None]:
run = False
if run:
    
    test= run_test(
        "validmind.model_validation.embeddings.EuclideanDistanceHeatmap",
        inputs = {
            "dataset": vm_contracts_ds,
            "model": vm_embedder_contracts,
        },
    )

In [None]:
run = False
if run:

    test= run_test(
        "validmind.model_validation.embeddings.PCAComponentsPairwisePlots",
        inputs = {
            "dataset": vm_contracts_ds,
            "model": vm_embedder_contracts,
        },
        params = {
            "n_components": 3
        }
    )

In [None]:
run = False
if run:

    test= run_test(
        "validmind.model_validation.embeddings.TSNEComponentsPairwisePlots",
        inputs = {
            "dataset": vm_contracts_ds,
            "model": vm_embedder_contracts,
        },
        params = {
            "n_components": 3
        }
    )

## Insert contract embeddings into the vectorstore

In [None]:
import chromadb
import uuid

# Create or get a ChromaDB collection.
# The distance function is a collection-level setting: Chroma reads "hnsw:space"
# from the collection metadata, so it is configured here. Putting it into each
# record's metadata has no effect on how distances are computed.
chroma_client = chromadb.Client()
collection = chroma_client.get_or_create_collection(
    name="contracts_collection",
    metadata={"hnsw:space": "cosine"},
)

# Column that holds the embeddings assigned by vm_embedder_contracts
embedding_column = vm_embedder_contracts.input_id + '_prediction'

# Initialize lists to store data for batch addition
all_embeddings = []
all_metadatas = []
all_documents = []
all_ids = []

# Loop through the DataFrame rows
for index, row in vm_contracts_ds.df.iterrows():

    # Append document-specific data
    all_embeddings.append(row[embedding_column])
    all_documents.append(row['Text Content'])
    # Ids are freshly generated, so re-running this cell against the same
    # collection adds the same rows again under new ids.
    all_ids.append(str(uuid.uuid4()))

    # Per-record metadata: the contract columns that are present in this row
    metadata = {
        key: row[key] for key in metadata_columns if key in row
    }

    all_metadatas.append(metadata)

# Add all data to the collection in a single operation
collection.add(
    ids=all_ids,
    documents=all_documents,
    embeddings=all_embeddings,
    metadatas=all_metadatas,
)


### Compute embeddings for questions

In [None]:
def embed_question(input):
    """Return the OpenAI embedding of a question.

    Args:
        input: Row-like mapping; only its "question" entry is read.

    Returns:
        Whatever `OpenAIEmbedding.get_text_embedding` returns for that text.

    A new `text-embedding-3-small` embedder is constructed on every call.
    """
    embedder = OpenAIEmbedding(model_name="text-embedding-3-small")
    question_text = input["question"]
    return embedder.get_text_embedding(question_text)

vm_embedder_question = FunctionModel(input_id="question_openai_embedding", predict_fn=embed_question)

In [None]:
vm_train_questions_ds.assign_predictions(vm_embedder_question)

In [None]:
vm_train_questions_ds.df.head()

In [None]:
vm_test_questions_ds.df

## Query the vectorstore

In [None]:
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.vector_stores.types import VectorStoreQuery

question_id = 4

print(f"Question: {vm_train_questions_ds.df['question'][question_id-1]}")
question_embedding = vm_train_questions_ds.df[vm_embedder_question.input_id + '_prediction'][question_id-1]

chroma_vector_store = ChromaVectorStore(chroma_collection=collection)
query = VectorStoreQuery(query_embedding=question_embedding, similarity_top_k=10)
result = chroma_vector_store.query(query)

In [None]:
num_to_print = 2  

for node, similarity, id_ in zip(result.nodes[:num_to_print], result.similarities[:num_to_print], result.ids[:num_to_print]):
    print("Node ID:", id_)
    print("Text Content:")
    print(node.text)
    print("Metadata:")
    for key, value in node.metadata.items():
        print(f"{key}: {value}")
    print("Similarity:", similarity)
    print()

Now, we create a dataframe to evaluate the relevance of the retrieved context. This dataframe will contain the query question and the retrieved text content for each node, allowing us to check the similarities between the question and the context.

In [None]:
import pandas as pd

# Parallel lists: one entry per retrieved node, later used as DataFrame columns
questions = []
contexts = []
similarities = []

# The question selected by `question_id` in the query cell; it is repeated on every row
constant_question = vm_train_questions_ds.df['question'][question_id-1]

# `result` is the vector store query result produced in the query cell
for node, similarity in zip(result.nodes, result.similarities):
    # Append the same question to the list for each node
    questions.append(constant_question)
    # Append the text content of each node to the contexts list
    contexts.append(node.text)
    # Append similarity score
    similarities.append(similarity)

# Create a DataFrame with one row per retrieved node
question_context_df = pd.DataFrame({
    'Question': questions,
    'Retrieved Context': contexts,
    'Retrieved Similarity': similarities
})

question_context_df


## Compute embeddings for question and retrieved context

First, we rename the columns to enable the use of the predefined embedding functions for questions and contracts.

In [None]:
question_context_df = question_context_df.rename(columns={
    'Question': 'question', 
    'Retrieved Context': 'Text Content'})

Now, we convert this dataframe into a ValidMind dataset to enable the computation of embeddings using assigned predictions.

In [None]:
vm_question_context_ds = vm.init_dataset(
    input_id="question_context",
    dataset=question_context_df,
    __log=False,
)

In [None]:
vm_question_context_ds.assign_predictions(vm_embedder_question)

In [None]:
vm_question_context_ds.assign_predictions(vm_embedder_contracts)

In [None]:
vm_question_context_ds.df

### Evaluation

In [None]:
run = True
if run:
    
    test= run_test(
        "validmind.model_validation.embeddings.CosineSimilarityComparison",
        inputs = {
            "dataset": vm_question_context_ds,
            "models": [vm_embedder_contracts,vm_embedder_question],
        }
    )

In [None]:
run = True
if run:
    
    test= run_test(
        "validmind.model_validation.embeddings.EuclideanDistanceComparison",
        inputs = {
            "dataset": vm_question_context_ds,
            "models": [vm_embedder_contracts,vm_embedder_question],
        }
    )

## Setup retrieval model

In [None]:
vm_test_questions_ds.df

In [None]:
vm_test_questions_ds.assign_predictions(vm_embedder_question)
vm_test_questions_ds.df

In [None]:
vm_embedder_question.input_id

In [None]:
def retrieve(input):
    """Fetch the ten closest contract chunks for an embedded question.

    Args:
        input: Row-like mapping; its "question_openai_embedding" entry is used
            as the query embedding.

    Returns:
        A list with one formatted string per hit, in the order returned by the
        module-level `chroma_vector_store`. Each string contains the hit's id,
        its text, every metadata key/value pair and the similarity score
        rounded to two decimals, one item per line.
    """
    top_k_query = VectorStoreQuery(query_embedding=input["question_openai_embedding"], similarity_top_k=10)
    hits = chroma_vector_store.query(top_k_query)

    def _format_hit(node, similarity, id_):
        # Header lines: the id reported by the vector store, then the chunk text
        parts = [f"Node ID: {id_}\n", f"Contract: {node.text}\n"]
        # One line per metadata entry
        parts.extend(f"{key}: {value}\n" for key, value in node.metadata.items())
        # Similarity score, formatted to two decimal places
        parts.append(f"Similarity: {similarity:.2f}\n")
        return "".join(parts)

    return [
        _format_hit(node, similarity, id_)
        for node, similarity, id_ in zip(hits.nodes, hits.similarities, hits.ids)
    ]


vm_retriever = FunctionModel(input_id="retrieval_model", predict_fn=retrieve)

In [None]:
from validmind.models import PipelineModel

embed_retrieve_pipeline = PipelineModel(vm_embedder_question | vm_retriever, input_id="embed_retrieve_pipeline")

In [None]:
vm_test_questions_ds.assign_predictions(embed_retrieve_pipeline)
vm_test_questions_ds.df

## Setup generation model

In [None]:
from llama_index.core import Prompt

template = """
Answer the question based only on the following context. 
If you cannot answer the question with the context, please respond with 'I don't know':

### CONTEXT
{context}

### QUESTION
Question: {question}
"""
prompt = Prompt(template)

In [None]:
formatted_prompt = prompt.format(
    context=vm_test_questions_ds.df.iloc[0]['embed_retrieve_pipeline_prediction'][0], 
    question=vm_test_questions_ds.df.iloc[0]['question'],
)
print(formatted_prompt)

In [None]:
from openai import OpenAI


client = OpenAI()

def generate(input):
    """Answer a question with `gpt-3.5-turbo`, using the retrieved contexts.

    Args:
        input: Row-like mapping providing "question" and, under the key
            `vm_retriever.input_id`, the retriever output (a list of context
            strings, or a single string).

    Returns:
        The text content of the first choice in the chat completion response.
    """
    retrieved = input[vm_retriever.input_id]

    # The retriever returns a list of context strings. Join them so the prompt
    # contains readable text rather than the list's Python repr (brackets,
    # quotes and escaped newlines).
    if isinstance(retrieved, str):
        context = retrieved
    else:
        context = "\n".join(str(item) for item in retrieved)

    formatted_prompt = prompt.format(
        context=context,
        question=input["question"],
    )

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": formatted_prompt},
        ],
    )

    return response.choices[0].message.content

vm_generator = FunctionModel(input_id="generation_model", predict_fn=generate)

## Setup a ValidMind RAG Model

In [None]:
vm_rag_model = PipelineModel(vm_embedder_question | vm_retriever | vm_generator, input_id="rag_model")

In [None]:
vm_test_questions_ds.assign_predictions(vm_rag_model)
vm_test_questions_ds.df.head()

### Evaluation

In [None]:
import plotly.express as px

def plot_distribution(scores):
    """Display a 10-bin histogram of metric scores.

    Args:
        scores: List of floats; intended for ragas metric scores in the 0-1 range.
    """
    histogram = px.histogram(x=scores, nbins=10)
    histogram.show()

In [None]:
import warnings

warnings.filterwarnings("ignore")

In [None]:
ragas_params= {
    "question_column":"question",
    "answer_column":"rag_model_prediction",
    "ground_truth_column":"ground_truth",
    "contexts_column":"embed_retrieve_pipeline_prediction"
}

In [None]:
result = vm.tests.run_test(
    "validmind.model_validation.ragas.AnswerSimilarity",
    inputs={"dataset": vm_test_questions_ds},
    params= ragas_params,
    show=False,
)
plot_distribution(result.metric.summary.results[0].data)

In [None]:
result = run_test(
    "validmind.model_validation.ragas.ContextEntityRecall",
    inputs={"dataset": vm_test_questions_ds},
    params=ragas_params,
    show=False,
)
plot_distribution(result.metric.summary.results[0].data)

In [None]:
result = run_test(
    "validmind.model_validation.ragas.ContextPrecision",
    inputs={"dataset": vm_test_questions_ds},
    params=ragas_params,
    show=False,
)
plot_distribution(result.metric.summary.results[0].data)

In [None]:
result = run_test(
    "validmind.model_validation.ragas.ContextRelevancy",
    inputs={"dataset": vm_test_questions_ds},
    params=ragas_params,
    show=False,
)
plot_distribution(result.metric.summary.results[0].data)