# WIP: RAG Pipeline using LlamaIndex

## Pre-requisites

In [None]:
# Install LlamaIndex plus its HuggingFace-embeddings and Chroma vector-store
# integrations; -q keeps pip's output quiet and -U upgrades installed packages.
%pip install llama-index llama-index-embeddings-huggingface llama-index-vector-stores-chroma -q -U

In [None]:
# Make sure the OpenAI API key is available in the environment
import os

from dotenv import load_dotenv

# Pull variables from a local .env file (if one is found) into os.environ.
load_dotenv()

# Fail fast: a later cell instantiates OpenAI embeddings, which presumably
# require this key.
if 'OPENAI_API_KEY' not in os.environ:
    raise ValueError('OPENAI_API_KEY is not set')

In [None]:
# NOTE(review): presumably turns off parallelism in the HuggingFace tokenizers
# library for this kernel — confirm it is still needed.
os.environ.update({'TOKENIZERS_PARALLELISM': "false"})

## Load data

### Explore contracts datasets

Explore the contracts by first loading them into a pandas DataFrame.

In [None]:
import pandas as pd

# Vendor-contract CSV shards, in the order they are stacked below.
CONTRACT_FILES = [
    "datasets/rag/vendor_contracts_001_020.csv",
    "datasets/rag/vendor_contracts_021_040.csv",
    "datasets/rag/vendor_contracts_041_060.csv",
]

# Read each shard and stack them into one frame with a fresh 0..n-1 index.
contracts_df = pd.concat(map(pd.read_csv, CONTRACT_FILES), ignore_index=True)

contracts_df.head()

### Explore questions and answers datasets

Now we explore the questions and answers dataset. 

In [None]:
# Question/answer CSV files (currently a single file).
QUESTION_FILES = ["datasets/rag/vendor_contracts_questions.csv"]

# Read each file and stack the results into one frame with a fresh index.
questions_df = pd.concat(map(pd.read_csv, QUESTION_FILES), ignore_index=True)

questions_df.head()

The `train_questions_df`, which contains questions and answers, will be stored in the vector store. This will simulate questions that have already been answered and are stored in the database. The `test_questions_df`, on the other hand, will act as a set of new questions posed by the user. The answers in this dataset will be used as ground truth for evaluation.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Fixed seed: without it the 80/20 partition is reshuffled on every execution,
# so results would not be reproducible across runs.
SPLIT_SEED = 42

questions_df = pd.read_csv("datasets/rag/vendor_contracts_questions.csv")
train_questions_df, test_questions_df = train_test_split(
    questions_df,
    test_size=0.20,
    random_state=SPLIT_SEED,
)

# Train split: the answer column keeps the name 'answer'.
train_questions_df = train_questions_df.rename(columns={
    'Question #': 'question_id',
    'Question': 'question',
    'Answer': 'answer'})
# Test split: the answer column becomes the evaluation 'ground_truth'.
test_questions_df = test_questions_df.rename(columns={
    'Question #': 'question_id',
    'Question': 'question',
    'Answer': 'ground_truth'})

In [None]:
# Preview the first rows of the training split (renamed columns).
train_questions_df.head()

In [None]:
# Preview the first rows of the test split (answers exposed as 'ground_truth').
test_questions_df.head()

## Document loaders

### Explore different document loaders for contracts

We will test the `CSVReader` from `llama_index` and review the properties of the documents it loads.

In [None]:
from llama_index.readers.file import CSVReader
from pathlib import Path

# NOTE(review): concat_rows=False presumably yields one document per CSV row
# rather than one per file — confirm against the reader's documentation.
reader = CSVReader(concat_rows=False)
llama_documents = []

# Read every contract CSV and gather the loaded documents in one flat list.
for file_path in CONTRACT_FILES:
    # The reader is handed a Path object rather than the raw string path.
    llama_documents += reader.load_data(Path(file_path))

In [None]:
number_to_print = 5

# Show id, source file name and raw text for the first few loaded documents.
for doc in llama_documents[:number_to_print]:
    labelled_fields = (
        ("Document ID:", doc.id_),
        ("File Name:", doc.metadata['filename']),
        ("Text Content:\n", doc.text),
    )
    for label, value in labelled_fields:
        print(label, value)
    print()

Now we will try the `CSVLoader` from `langchain` and check the properties of the documents it loads.

In [None]:
from langchain_community.document_loaders.csv_loader import CSVLoader

lc_documents = []

# One default-configured loader per contract CSV; load() returns a collection
# of documents, which is appended to the running list.
for file_path in CONTRACT_FILES:
    lc_documents += CSVLoader(file_path=file_path).load()

In [None]:
number_to_print = 2

# Dump page content and metadata of the first few LangChain documents.
for position, lc_doc in enumerate(lc_documents[:number_to_print], start=1):
    print(f"Document {position}:", "Page Content:", lc_doc.page_content, "Metadata:", sep="\n")
    for field, field_value in lc_doc.metadata.items():
        print(f"{field}: {field_value}")
    print()

Next, we control which columns are stored as metadata and which are kept as page content; the page content is what will be converted into embeddings.

In [None]:
lc_documents = []

# Columns routed to each document's metadata instead of its page content.
metadata_columns = ["Contract ID", "Supported BSLs", "Engagement Terms"]

# Reload every contract CSV, this time with the metadata columns split out.
for file_path in CONTRACT_FILES:
    lc_documents += CSVLoader(
        file_path=file_path,
        metadata_columns=metadata_columns,
    ).load()

In [None]:
number_to_print = 2

# Dump page content and metadata of the first few reloaded documents.
for position, lc_doc in enumerate(lc_documents[:number_to_print], start=1):
    print(f"Document {position}:", "Page Content:", lc_doc.page_content, "Metadata:", sep="\n")
    for field, field_value in lc_doc.metadata.items():
        print(f"{field}: {field_value}")
    print()

## Insert contract documents into vectorstore

### Split documents into chunks

In [None]:
from llama_index.core.node_parser import SentenceSplitter

# Chunking settings: chunk size of 1024 with an overlap of 20 between chunks.
splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=20)

# Split the LlamaIndex documents into nodes and peek at the first five.
nodes = splitter.get_nodes_from_documents(llama_documents)
nodes[:5]

In [None]:
num_to_print = 2

# Show metadata, text and the (not yet computed here) embedding of the first nodes.
for position, node in enumerate(nodes[:num_to_print], start=1):
    print(f"Node {position}:", "Metadata:", sep="\n")
    for key, value in node.metadata.items():
        print(f"{key}: {value}")
    print("Text Content:", node.text, sep="\n")
    print(f"Embeddings: {node.embedding}")
    print()

Convert the nodes into a DataFrame.

In [None]:
# One record per node: a 1-based id plus the node's text.
data = [
    {"Node ID": position, "Text Content": node.text}
    for position, node in enumerate(nodes, start=1)
]
contracts_nodes_df = pd.DataFrame(data)

In [None]:
import validmind as vm

# Wrap the node DataFrame in a ValidMind dataset registered under the input id
# "contracts_nodes", declaring "Text Content" as its text column.
vm_contracts_ds = vm.init_dataset(
    input_id="contracts_nodes",
    dataset=contracts_nodes_df,
    text_column="Text Content", 
    # NOTE(review): private-looking flag; presumably disables logging of this
    # dataset to the ValidMind platform — confirm.
    __log=False,
)

# Preview the DataFrame exposed by the dataset wrapper.
vm_contracts_ds.df.head()

### Compute embeddings for each node

In [None]:
from llama_index.embeddings.openai import OpenAIEmbedding
from validmind.models import FunctionModel

# Single source of truth for the model name: it configures the embedding
# client and doubles as the ValidMind input id.
EMBEDDING_MODEL_NAME = "text-embedding-3-small"

# Build the embedding client once and reuse it; previously a default client
# was created but never used while `embed` built a new one on every call.
client = OpenAIEmbedding(model_name=EMBEDDING_MODEL_NAME)


def embed(input):
    """Return the embedding of one record's "Text Content" value.

    Args:
        input: Mapping-like record exposing a "Text Content" key
            (presumably one dataset row supplied by ValidMind — confirm).

    Returns:
        The result of ``client.get_text_embedding`` for that text.
    """
    return client.get_text_embedding(input["Text Content"])


vm_embedder = FunctionModel(input_id=EMBEDDING_MODEL_NAME, predict_fn=embed)

In [None]:
# Link the embedder's outputs to the contracts dataset.
# NOTE(review): presumably invokes `embed` per row and stores the result as a
# new column — confirm against the ValidMind documentation.
vm_contracts_ds.assign_predictions(vm_embedder)

In [None]:
# Inspect the dataset's DataFrame again after predictions were assigned.
vm_contracts_ds.df.head()

In [None]:
from validmind.tests import run_test

# Run the CosineSimilarityHeatmap embeddings test on the contracts dataset
# with the embedder as the model input.
test = run_test(
    "validmind.model_validation.embeddings.CosineSimilarityHeatmap",
    inputs={"dataset": vm_contracts_ds, "model": vm_embedder},
)

In [None]:
# Run the EuclideanDistanceHeatmap embeddings test on the same dataset/model pair.
test = run_test(
    "validmind.model_validation.embeddings.EuclideanDistanceHeatmap",
    inputs={"dataset": vm_contracts_ds, "model": vm_embedder},
)

In [None]:
# Run the PCAComponentsPairwisePlots embeddings test with 3 components.
test = run_test(
    "validmind.model_validation.embeddings.PCAComponentsPairwisePlots",
    inputs={"dataset": vm_contracts_ds, "model": vm_embedder},
    params={"n_components": 3},
)

In [None]:
# Run the TSNEComponentsPairwisePlots embeddings test with 3 components.
test = run_test(
    "validmind.model_validation.embeddings.TSNEComponentsPairwisePlots",
    inputs={"dataset": vm_contracts_ds, "model": vm_embedder},
    params={"n_components": 3},
)

The `train_questions_df`, which contains questions and answers, will be stored in the vector store. This will simulate questions that have already been answered and are stored in the database. The `test_questions_df`, on the other hand, will act as a set of new questions posed by the user. The answers in this dataset will be used as ground truth for evaluation.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Fixed seed: an unseeded split here would silently replace any previously
# computed train/test partition with a different random one on each run.
SPLIT_SEED = 42

questions_df = pd.read_csv("datasets/rag/vendor_contracts_questions.csv")
train_questions_df, test_questions_df = train_test_split(
    questions_df,
    test_size=0.20,
    random_state=SPLIT_SEED,
)

# Train split: the answer column keeps the name 'answer'.
train_questions_df = train_questions_df.rename(columns={
    'Question #': 'question_id',
    'Question': 'question',
    'Answer': 'answer'})
# Test split: the answer column becomes the evaluation 'ground_truth'.
test_questions_df = test_questions_df.rename(columns={
    'Question #': 'question_id',
    'Question': 'question',
    'Answer': 'ground_truth'})

In [None]:
# Preview the first rows of the training split (renamed columns).
train_questions_df.head()

In [None]:
# Preview the first rows of the test split (answers exposed as 'ground_truth').
test_questions_df.head()