# RAG Pipeline using LlamaIndex

## Pre-requisites

In [1]:
# Install LlamaIndex plus its HuggingFace-embeddings and Chroma vector-store
# integrations (-q: quiet output, -U: upgrade already-installed packages).
# NOTE(review): later cells also import dotenv, sklearn, langchain_community,
# validmind, chromadb, openai and plotly — confirm they are available in the
# kernel environment, since this cell does not install them explicitly.
%pip install llama-index llama-index-embeddings-huggingface llama-index-vector-stores-chroma -q -U


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Pull variables (including the OpenAI API key) into the process environment
import os

from dotenv import load_dotenv

load_dotenv()

# Stop here when the key is missing instead of failing on the first OpenAI call
if 'OPENAI_API_KEY' not in os.environ:
    raise ValueError('OPENAI_API_KEY is not set')

In [3]:
# Set TOKENIZERS_PARALLELISM to "false" for this process, before the
# HuggingFace embedding model is loaded further down.
# NOTE(review): presumably this silences the HuggingFace tokenizers
# parallelism/fork warning — confirm.
os.environ['TOKENIZERS_PARALLELISM'] = "false"

## Load data

### Explore contracts datasets

Explore the contracts by first loading them into a pandas DataFrame.

In [4]:
import pandas as pd

# CSV files that together hold the vendor contracts
CONTRACT_FILES = [
    "datasets/rag/vendor_contracts_001_020.csv",
    "datasets/rag/vendor_contracts_021_040.csv",
    "datasets/rag/vendor_contracts_041_060.csv",
]

# Read every file and stack the frames under one fresh 0..n-1 index
contracts_df = pd.concat(map(pd.read_csv, CONTRACT_FILES), ignore_index=True)

contracts_df.head()

Unnamed: 0,Contract ID,Vendor Name,Start Date,End Date,Services Provided,Annual Spend,Features,Software Provided,Category,Preferred Vendor,Historical Spend,Exclusivity,Engagement Terms,Supported BSLs
0,C001,SirionLabs,2023-01-01,2025-12-31,Contract management for legal teams,"$20,000","AI analytics, Contract lifecycle",ContractHub,Contract Management,Yes,"$40,000",No,Full lifecycle management,"Legal, Finance"
1,C002,Evisort,2023-02-01,2024-01-31,AI-driven contract analysis,"$18,000","Data extraction, Compliance tracking",Evisort Analytics,Data Analytics,No,"$36,000",Yes,Data-driven contract management,Legal
2,C003,DocuSign,2023-03-01,2025-02-28,Digital signature and workflow management,"$12,000","Secure signature, Integration",DocuSign Platform,Document Management,Yes,"$24,000",No,Digital signing and workflow,All Departments
3,C004,Icertis,2022-07-01,2024-06-30,Enterprise contract management,"$25,000","Compliance, Contract authoring",Icertis Management,Contract Management,Yes,"$50,000",No,Enterprise-wide contract management,"Operations, Legal"
4,C005,Concord,2023-05-01,2024-04-30,End-to-end contract lifecycle management,"$30,000","Automated workflows, Collaboration",Concord Suite,Contract Management,No,"$60,000",No,Comprehensive contract management,Business


### Explore questions and answers datasets

Now we explore the questions and answers dataset. 

In [5]:
# Question/answer CSV files (kept as a list so more files can be added)
QUESTION_FILES = [
    "datasets/rag/vendor_contracts_questions.csv",
]

# Read every file and stack the frames under one fresh 0..n-1 index
questions_df = pd.concat(map(pd.read_csv, QUESTION_FILES), ignore_index=True)

questions_df.head()

Unnamed: 0,Question #,Question,Answer
0,1,What software do we have available for data an...,"Evisort Analytics, Kira QuickSearch, Seal Disc..."
1,2,What taxonomy category is Tableau in?,Not listed. The dataset doesn't include Tableau.
2,3,Who are the BSLs that support Asset Servicing?,Not specifically mentioned. BSLs include Legal...
3,4,Is Cisco a preferred vendor?,Cisco is not listed as a vendor in the provide...
4,5,Which active vendors are covering comparable c...,HighQ might offer collaborative features simil...


The `train_questions_df`, which contains questions and answers, will be stored in the vector store. This will simulate questions that have already been answered and are stored in the database. The `test_questions_df`, on the other hand, will act as a set of new questions posed by the user. The answers in this dataset will be used as ground truth for evaluation.

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Fixed seed: without it the train/test membership (and therefore every
# downstream embedding, retrieval and evaluation output) changes on each run.
SPLIT_SEED = 42

questions_df = pd.read_csv("datasets/rag/vendor_contracts_questions.csv")

# Hold out 20% of the questions to play the role of new user questions
train_questions_df, test_questions_df = train_test_split(
    questions_df,
    test_size=0.20,
    random_state=SPLIT_SEED,
)

# Rename columns: the answer column becomes 'answer' for the stored (train)
# questions and 'ground_truth' for the held-out (test) questions.
train_questions_df = train_questions_df.rename(columns={
    'Question #': 'question_id',
    'Question': 'question',
    'Answer': 'answer'})
test_questions_df = test_questions_df.rename(columns={
    'Question #': 'question_id',
    'Question': 'question',
    'Answer': 'ground_truth'})

In [7]:
train_questions_df.head()

Unnamed: 0,question_id,question,answer
0,1,What software do we have available for data an...,"Evisort Analytics, Kira QuickSearch, Seal Disc..."
1,2,What taxonomy category is Tableau in?,Not listed. The dataset doesn't include Tableau.
23,24,"Which contracts are valued at over $20,000 but...","SirionLabs, Concord, Agiloft, Zycus, Seal Soft..."
17,18,How much was spent on legal management softwar...,"$112,000 (Sum of annual spends for CobbleStone..."
38,39,List the vendors that offer Electronic Signatu...,"DocuSign, PandaDoc"


In [8]:
test_questions_df.head()

Unnamed: 0,question_id,question,ground_truth
20,21,Which contract has the most diverse set of fea...,"Mitratech, offering comprehensive legal operat..."
14,15,How many contracts support the IT department?,1 (Agiloft)
48,49,Which contracts include machine learning and c...,None of the contracts include both features to...
44,45,What is the average duration of contracts focu...,"2.14 years (Average duration for CobbleStone, ..."
26,27,How many contracts provide automated workflows...,"5 (SirionLabs, Concord, Onit, Mitratech, Preci..."


## Document loaders

### Explore different document loaders for contracts

We will test the `CSVReader` from `llama_index` and review the properties of the documents it loads.

In [9]:
from llama_index.readers.file import CSVReader
from pathlib import Path

# concat_rows=False: the output below shows one document per CSV line.
# NOTE(review): that includes each file's header line as its own document —
# confirm whether header documents should be filtered before indexing.
reader = CSVReader(concat_rows=False)

# Load every contract file and collect all documents in llama_documents
llama_documents = [
    document
    for csv_path in CONTRACT_FILES
    for document in reader.load_data(Path(csv_path))
]

In [23]:
number_to_print = 5

# Show the id, source file and raw text of the first few loaded documents
for document in llama_documents[:number_to_print]:
    print("Document ID:", document.id_)
    print("File Name:", document.metadata['filename'])
    print("Text Content:\n", document.text)
    print()

Document ID: 0b731406-bb47-457c-b4f4-44764993d9ea
File Name: vendor_contracts_001_020.csv
Text Content:
 Contract ID, Vendor Name, Start Date, End Date, Services Provided, Annual Spend, Features, Software Provided, Category, Preferred Vendor, Historical Spend, Exclusivity, Engagement Terms, Supported BSLs

Document ID: 2d72c5ba-b660-41d4-8a65-265f4664b515
File Name: vendor_contracts_001_020.csv
Text Content:
 C001, SirionLabs, 2023-01-01, 2025-12-31, Contract management for legal teams, $20,000, AI analytics, Contract lifecycle, ContractHub, Contract Management, Yes, $40,000, No, Full lifecycle management, Legal, Finance

Document ID: 04f852d2-5fd7-4563-862a-658291fa4339
File Name: vendor_contracts_001_020.csv
Text Content:
 C002, Evisort, 2023-02-01, 2024-01-31, AI-driven contract analysis, $18,000, Data extraction, Compliance tracking, Evisort Analytics, Data Analytics, No, $36,000, Yes, Data-driven contract management, Legal

Document ID: 62bbf011-5e49-41fd-a808-b5e700ae8291
File Na

Now we will try the `CSVLoader` from `langchain` and check the properties of the documents it loads.

In [11]:
from langchain_community.document_loaders.csv_loader import CSVLoader

# Collect the documents produced by CSVLoader for every contract file
lc_documents = []
for csv_path in CONTRACT_FILES:
    lc_documents.extend(CSVLoader(file_path=csv_path).load())

In [12]:
number_to_print = 2

# Print page content and metadata of the first few LangChain documents
for position, lc_doc in enumerate(lc_documents[:number_to_print], start=1):
    print(f"Document {position}:")
    print("Page Content:")
    print(lc_doc.page_content)
    print("Metadata:")
    for meta_key, meta_value in lc_doc.metadata.items():
        print(f"{meta_key}: {meta_value}")
    print()

Document 1:
Page Content:
Contract ID: C001
Vendor Name: SirionLabs
Start Date: 2023-01-01
End Date: 2025-12-31
Services Provided: Contract management for legal teams
Annual Spend: $20,000
Features: AI analytics, Contract lifecycle
Software Provided: ContractHub
Category: Contract Management
Preferred Vendor: Yes
Historical Spend: $40,000
Exclusivity: No
Engagement Terms: Full lifecycle management
Supported BSLs: Legal, Finance
Metadata:
source: datasets/rag/vendor_contracts_001_020.csv
row: 0

Document 2:
Page Content:
Contract ID: C002
Vendor Name: Evisort
Start Date: 2023-02-01
End Date: 2024-01-31
Services Provided: AI-driven contract analysis
Annual Spend: $18,000
Features: Data extraction, Compliance tracking
Software Provided: Evisort Analytics
Category: Data Analytics
Preferred Vendor: No
Historical Spend: $36,000
Exclusivity: Yes
Engagement Terms: Data-driven contract management
Supported BSLs: Legal
Metadata:
source: datasets/rag/vendor_contracts_001_020.csv
row: 1



Next, we control which columns are stored as metadata and which remain in the page content, the part that will be converted into embeddings.

In [13]:
# Reload the contracts, this time moving selected columns into metadata
lc_documents = []
for csv_path in CONTRACT_FILES:
    # Columns listed here end up in doc.metadata instead of doc.page_content
    metadata_loader = CSVLoader(
        file_path=csv_path,
        metadata_columns=["Contract ID","Supported BSLs", "Engagement Terms"],
    )
    lc_documents += metadata_loader.load()

In [14]:
number_to_print = 2

# Same inspection as before, now showing the extra metadata columns
for position, lc_doc in enumerate(lc_documents[:number_to_print], start=1):
    print(f"Document {position}:")
    print("Page Content:")
    print(lc_doc.page_content)
    print("Metadata:")
    for meta_key, meta_value in lc_doc.metadata.items():
        print(f"{meta_key}: {meta_value}")
    print()

Document 1:
Page Content:
Vendor Name: SirionLabs
Start Date: 2023-01-01
End Date: 2025-12-31
Services Provided: Contract management for legal teams
Annual Spend: $20,000
Features: AI analytics, Contract lifecycle
Software Provided: ContractHub
Category: Contract Management
Preferred Vendor: Yes
Historical Spend: $40,000
Exclusivity: No
Metadata:
source: datasets/rag/vendor_contracts_001_020.csv
row: 0
Contract ID: C001
Supported BSLs: Legal, Finance
Engagement Terms: Full lifecycle management

Document 2:
Page Content:
Vendor Name: Evisort
Start Date: 2023-02-01
End Date: 2024-01-31
Services Provided: AI-driven contract analysis
Annual Spend: $18,000
Features: Data extraction, Compliance tracking
Software Provided: Evisort Analytics
Category: Data Analytics
Preferred Vendor: No
Historical Spend: $36,000
Exclusivity: Yes
Metadata:
source: datasets/rag/vendor_contracts_001_020.csv
row: 1
Contract ID: C002
Supported BSLs: Legal
Engagement Terms: Data-driven contract management



## Insert contract documents into vectorstore

### Split documents into chunks

In [15]:
from llama_index.core.node_parser import SentenceSplitter

# Splitter configured with chunk_size=1024 and chunk_overlap=20
splitter = SentenceSplitter(chunk_overlap=20, chunk_size=1024)

# Turn the loaded LlamaIndex documents into nodes and preview the first five
nodes = splitter.get_nodes_from_documents(llama_documents)
nodes[:5]

[TextNode(id_='2eb390b6-5397-4a6c-9951-6393145d885d', embedding=None, metadata={'filename': 'vendor_contracts_001_020.csv', 'extension': '.csv'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='0b731406-bb47-457c-b4f4-44764993d9ea', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'filename': 'vendor_contracts_001_020.csv', 'extension': '.csv'}, hash='056619adc42f470bce860c49a55866399590bc2e7aa6326f34af4a770b0f7ea7')}, text='Contract ID, Vendor Name, Start Date, End Date, Services Provided, Annual Spend, Features, Software Provided, Category, Preferred Vendor, Historical Spend, Exclusivity, Engagement Terms, Supported BSLs', start_char_idx=0, end_char_idx=201, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'),
 TextNode(id_='0de47516-f3cc-46d3-840a-1477c82e18cb', embedding=None, metadata={'filename': 'vendor_contracts_001_020.csv', 'extension': 

In [16]:
num_to_print = 2

# Inspect metadata, text and the (not yet computed) embedding of the first nodes
for position, text_node in enumerate(nodes[:num_to_print], start=1):
    print(f"Node {position}:")
    print("Metadata:")
    for meta_key, meta_value in text_node.metadata.items():
        print(f"{meta_key}: {meta_value}")
    print("Text Content:")
    print(text_node.text)
    print(f"Embeddings: {text_node.embedding}")
    print()

Node 1:
Metadata:
filename: vendor_contracts_001_020.csv
extension: .csv
Text Content:
Contract ID, Vendor Name, Start Date, End Date, Services Provided, Annual Spend, Features, Software Provided, Category, Preferred Vendor, Historical Spend, Exclusivity, Engagement Terms, Supported BSLs
Embeddings: None

Node 2:
Metadata:
filename: vendor_contracts_001_020.csv
extension: .csv
Text Content:
C001, SirionLabs, 2023-01-01, 2025-12-31, Contract management for legal teams, $20,000, AI analytics, Contract lifecycle, ContractHub, Contract Management, Yes, $40,000, No, Full lifecycle management, Legal, Finance
Embeddings: None



Convert the nodes into a pandas DataFrame.

In [17]:
# One record per node: a 1-based node number and the node's text
data = [
    {"Node ID": node_number, "Text Content": text_node.text}
    for node_number, text_node in enumerate(nodes, start=1)
]
contracts_nodes_df = pd.DataFrame(data)

In [18]:
import validmind as vm

# Wrap the nodes frame as a ValidMind dataset, with "Text Content" as its text column
vm_contracts_ds = vm.init_dataset(
    dataset=contracts_nodes_df,
    input_id="contracts_nodes",
    text_column="Text Content",
    __log=False,
)

vm_contracts_ds.df.head()

2024-05-08 17:14:38,545 - INFO(validmind.client): Pandas dataset detected. Initializing VM Dataset instance...


Unnamed: 0,Node ID,Text Content
0,1,"Contract ID, Vendor Name, Start Date, End Date..."
1,2,"C001, SirionLabs, 2023-01-01, 2025-12-31, Cont..."
2,3,"C002, Evisort, 2023-02-01, 2024-01-31, AI-driv..."
3,4,"C003, DocuSign, 2023-03-01, 2025-02-28, Digita..."
4,5,"C004, Icertis, 2022-07-01, 2024-06-30, Enterpr..."


### Compute embeddings for each node

In [33]:
from llama_index.embeddings.openai import OpenAIEmbedding
from validmind.models import FunctionModel

# NOTE(review): this default-configured instance is not used by `embed` below,
# and `client` is rebound to an `OpenAI()` client in a later cell — confirm
# whether this line is still needed.
client = OpenAIEmbedding()

from functools import lru_cache


@lru_cache(maxsize=1)
def _get_openai_embedding_model():
    """Build the `text-embedding-3-small` embedding model once and reuse it."""
    return OpenAIEmbedding(model_name="text-embedding-3-small")


def embed(input):
    """Embed the text of one dataset row with `text-embedding-3-small`.

    Args:
        input: Mapping for a single row; its "Text Content" value is embedded.

    Returns:
        The value returned by `get_text_embedding` for that text.
    """
    # The model object is cached so it is not rebuilt for every row.
    # `input` shadows the builtin but is kept, since the parameter name is part
    # of the signature handed to FunctionModel.
    return _get_openai_embedding_model().get_text_embedding(input["Text Content"])

vm_embedder = FunctionModel(input_id="text-embedding-3-small", predict_fn=embed)

In [35]:
vm_contracts_ds.assign_predictions(vm_embedder)

2024-05-08 20:16:43,316 - INFO(validmind.vm_models.dataset.utils): Running predict_proba()... This may take a while
2024-05-08 20:16:43,316 - INFO(validmind.vm_models.dataset.utils): Not running predict_proba() for unsupported models.
2024-05-08 20:16:43,317 - INFO(validmind.vm_models.dataset.utils): Running predict()... This may take a while
2024-05-08 20:18:00,216 - INFO(validmind.vm_models.dataset.utils): Done running predict()


In [36]:
vm_contracts_ds.df.head()

Unnamed: 0,Node ID,Text Content,embedding_model_prediction,text-embedding-3-small_prediction
0,1,"Contract ID, Vendor Name, Start Date, End Date...","[-0.0532541498541832, -0.01645655557513237, 0....","[-0.0532541498541832, -0.01645655557513237, 0...."
1,2,"C001, SirionLabs, 2023-01-01, 2025-12-31, Cont...","[-0.020622380077838898, 0.010668843053281307, ...","[-0.020622380077838898, 0.010668843053281307, ..."
2,3,"C002, Evisort, 2023-02-01, 2024-01-31, AI-driv...","[-0.019044026732444763, 0.011319427751004696, ...","[-0.019044026732444763, 0.011319427751004696, ..."
3,4,"C003, DocuSign, 2023-03-01, 2025-02-28, Digita...","[-0.011594770476222038, 0.008915002457797527, ...","[-0.011594770476222038, 0.008915002457797527, ..."
4,5,"C004, Icertis, 2022-07-01, 2024-06-30, Enterpr...","[0.000986601458862424, 0.0023517629597336054, ...","[0.000986601458862424, 0.0023517629597336054, ..."


In [37]:
from validmind.tests import run_test

# Cosine-similarity heatmap over the node embeddings produced by vm_embedder
test = run_test(
    "validmind.model_validation.embeddings.CosineSimilarityHeatmap",
    inputs={"dataset": vm_contracts_ds, "model": vm_embedder},
)

VBox(children=(HTML(value='<h1>Cosine Similarity Heatmap</h1>'), HTML(value='<p>Plots an interactive heatmap o…

In [38]:
# Euclidean-distance heatmap over the same dataset and embedding model
test = run_test(
    "validmind.model_validation.embeddings.EuclideanDistanceHeatmap",
    inputs={"dataset": vm_contracts_ds, "model": vm_embedder},
)

VBox(children=(HTML(value='<h1>Euclidean Distance Heatmap</h1>'), HTML(value='<p>Plots an interactive heatmap …

In [44]:
# Pairwise scatter plots of the first three PCA components of the embeddings
test = run_test(
    "validmind.model_validation.embeddings.PCAComponentsPairwisePlots",
    inputs={"dataset": vm_contracts_ds, "model": vm_embedder},
    params={"n_components": 3},
)

VBox(children=(HTML(value='<h1>PCA Components Pairwise Plots</h1>'), HTML(value="<p>Plots individual scatter p…

In [47]:
# Pairwise scatter plots of three t-SNE components of the embeddings
test = run_test(
    "validmind.model_validation.embeddings.TSNEComponentsPairwisePlots",
    inputs={"dataset": vm_contracts_ds, "model": vm_embedder},
    params={"n_components": 3},
)

VBox(children=(HTML(value='<h1>TSNE Components Pairwise Plots</h1>'), HTML(value="<p>Plots individual scatter …

The `train_questions_df`, which contains questions and answers, will be stored in the vector store. This will simulate questions that have already been answered and are stored in the database. The `test_questions_df`, on the other hand, will act as a set of new questions posed by the user. The answers in this dataset will be used as ground truth for evaluation.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Fixed seed: without it the train/test membership (and therefore every
# downstream embedding, retrieval and evaluation output) changes on each run.
SPLIT_SEED = 42

questions_df = pd.read_csv("datasets/rag/vendor_contracts_questions.csv")

# Hold out 20% of the questions to play the role of new user questions
train_questions_df, test_questions_df = train_test_split(
    questions_df,
    test_size=0.20,
    random_state=SPLIT_SEED,
)

# Rename columns: the answer column becomes 'answer' for the stored (train)
# questions and 'ground_truth' for the held-out (test) questions.
train_questions_df = train_questions_df.rename(columns={
    'Question #': 'question_id',
    'Question': 'question',
    'Answer': 'answer'})
test_questions_df = test_questions_df.rename(columns={
    'Question #': 'question_id',
    'Question': 'question',
    'Answer': 'ground_truth'})

# NOTE(review): this overwrites the `contracts_df` built from CONTRACT_FILES at
# the top of the notebook with a single file that is not in that list —
# confirm the file exists and that the overwrite is intended.
contracts_df = pd.read_csv("datasets/rag/vendor_contracts.csv")

In [None]:
train_questions_df.head()

In [None]:
test_questions_df.head()

In [None]:
contracts_df.head()

## Store data into the vectorstore

### 1. Store vendor contracts

In [None]:
import chromadb
import uuid

chroma_client = chromadb.Client()

# The distance metric is a collection-level setting in Chroma, so it is given
# as collection metadata at creation time. Putting 'hnsw:space' on every
# record's metadata (as before) does not change the metric.
collection = chroma_client.get_or_create_collection(
    name="chroma_collection",
    metadata={"hnsw:space": "cosine"},
)

# NOTE(review): `train_df` is not defined in the visible notebook (the split
# produces `train_questions_df`, whose answer column is named 'answer'), and
# `vm_embedder_openai` is only created in a later cell. This section is titled
# "Store vendor contracts" but the code stores questions — confirm the
# intended source frame and cell order.

# Build the batch column-wise instead of row by row
all_embeddings = train_df[vm_embedder_openai.predict_col].tolist()
all_metadatas = [{'ground_truth': answer} for answer in train_df['ground_truth']]
all_documents = train_df['question'].tolist()
all_ids = [str(uuid.uuid4()) for _ in range(len(train_df))]

# Add all data to the collection in a single operation
collection.add(
    ids=all_ids,
    documents=all_documents,
    embeddings=all_embeddings,
    metadatas=all_metadatas,
)

## Embedding Model Selection

First, we test both the `sentence-transformers` and `openai` embedding models using their native interfaces. 

In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-mpnet-base-v2")

# Smoke test: embed the first training question and report the vector length
text = train_questions_df['question'].iloc[0]
embeddings = embed_model.get_text_embedding(text)

print(f"Question: {text}\nDimension: {len(embeddings)}")

In [None]:
from openai import OpenAI

client = OpenAI()

# Embed the same question through the OpenAI client and report the vector length
embeddings = client.embeddings.create(
    model="text-embedding-3-small",
    input=text,
).data[0].embedding

print(f"Question: {text}\nDimension: {len(embeddings)}")

Create `validmind` embedding models.

In [None]:
from validmind.models import EmbeddingModel

from functools import lru_cache


@lru_cache(maxsize=1)
def _get_st_embedding_model():
    """Load the all-mpnet-base-v2 sentence-transformers model once and reuse it."""
    return HuggingFaceEmbedding(model_name="sentence-transformers/all-mpnet-base-v2")


def embed(question):
    """Returns an embedding vector for the given text.

    Args:
        question: Text to embed.

    Returns:
        The value returned by `get_text_embedding` for that text.
    """
    # The model is cached; previously it was re-instantiated on every call.
    return _get_st_embedding_model().get_text_embedding(question)


vm_embedder_st = EmbeddingModel(
    input_id="embedding_model_st",
    predict_col="embedding_st",
    predict_fn=embed
)

In [None]:
def embed(question):
    """Embed `question` with OpenAI's text-embedding-3-small and return the vector.

    Relies on the module-level OpenAI `client`.
    """
    response = client.embeddings.create(
        model="text-embedding-3-small",
        input=question,
    )
    # A single input was sent, so the first entry is read
    first_item = response.data[0]
    return first_item.embedding


vm_embedder_openai = EmbeddingModel(
    input_id="embedding_model_openai", 
    predict_col="embedding_openai",
    predict_fn=embed
)

Generate embeddings from the text in the `question` column.

In [None]:
#test_df[vm_embedder_openai.predict_col] = vm_embedder_openai.predict(test_df)
#test_df[vm_embedder_st.predict_col] = vm_embedder_st.predict(test_df)
#test_df.head()

In [None]:
#train_df[vm_embedder_openai.predict_col] = vm_embedder_openai.predict(train_df)
#train_df[vm_embedder_st.predict_col] = vm_embedder_st.predict(train_df)
#train_df.head()

Create `validmind` datasets.

In [None]:
import validmind as vm

#vm_train_ds = vm.init_dataset(train_df, text_column="question", __log=False)
#vm_test_ds = vm.init_dataset(test_df, text_column="question", __log=False)


Run an embedding test for both models to ensure that the embedding models function properly.

In [None]:
from validmind.tests import run_test

# Disabled by default: set `run = True` to execute the random-noise stability
# test for the sentence-transformers embedder.
# NOTE(review): `vm_test_ds` is only created in a commented-out line above, so
# enabling this requires restoring that dataset first.
run = False
if run:
    result = run_test(
        "validmind.model_validation.embeddings.StabilityAnalysisRandomNoise:HFEmbeddingModel",
        inputs={"model": vm_embedder_st, "dataset": vm_test_ds},
        params={"probability": 0.3},
    )

In [None]:
# Disabled by default: set `run = True` to execute the random-noise stability
# test for the OpenAI embedder.
# NOTE(review): `vm_test_ds` is only created in a commented-out line above, so
# enabling this requires restoring that dataset first.
run = False
if run:
    result = run_test(
        "validmind.model_validation.embeddings.StabilityAnalysisRandomNoise:OpenAIEmbeddingModel",
        inputs={"model": vm_embedder_openai, "dataset": vm_test_ds},
        params={"probability": 0.3},
    )

In [None]:
from validmind.tests import run_test

# Compares the two embedding columns (sentence-transformers vs. OpenAI) on the
# test dataset.
# NOTE(review): unlike the two cells above this one is enabled (`run = True`),
# but `vm_test_ds` is only created in a commented-out line, so on a fresh
# kernel this is expected to fail with a NameError — confirm and restore the
# dataset creation or disable this cell.
run = True
if run:
    result = run_test(
        "validmind.model_validation.embeddings.EmbeddingsDistances",
        inputs={"dataset": vm_test_ds},
        params={
            "embedding_col_A": vm_embedder_st.predict_col,
            "embedding_col_B": vm_embedder_openai.predict_col
        },
    )

## Setup Vector Store

### Insert embeddings and questions into the vector DB

In [None]:
import chromadb
import uuid

chroma_client = chromadb.Client()

# The distance metric is a collection-level setting in Chroma, so it is given
# as collection metadata at creation time. Putting 'hnsw:space' on every
# record's metadata (as before) does not change the metric.
collection = chroma_client.get_or_create_collection(
    name="rfp_rag_collection",
    metadata={"hnsw:space": "cosine"},
)

# NOTE(review): `train_df` is not defined in the visible notebook (the split
# produces `train_questions_df`, whose answer column is named 'answer', and
# the cells that would add the embedding columns are commented out) — confirm
# the intended source frame.

# Build the batch column-wise instead of row by row
all_embeddings = train_df[vm_embedder_openai.predict_col].tolist()
all_metadatas = [{'ground_truth': answer} for answer in train_df['ground_truth']]
all_documents = train_df['question'].tolist()
all_ids = [str(uuid.uuid4()) for _ in range(len(train_df))]

# Add all data to the collection in a single operation
collection.add(
    ids=all_ids,
    documents=all_documents,
    embeddings=all_embeddings,
    metadatas=all_metadatas,
)

Check the dimensionality of the collection by examining the dimensions of any embedding within it.

In [None]:
# Fetch the stored records together with their embedding vectors
res = collection.get(include=['embeddings'])
# Length of the first stored embedding, i.e. the dimensionality of the collection
len(res['embeddings'][0])

We test the retriever by directly querying using the pre-computed embedding corresponding to the question. We expect the vector store to return the top k most similar questions, along with the metadata associated with each of these questions.

In [None]:
test_df.head()

In [None]:
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.vector_stores.types import VectorStoreQuery

# Wrap the Chroma collection so it can be queried through LlamaIndex
chroma_vector_store = ChromaVectorStore(chroma_collection=collection)
# Ask for the 10 nearest stored questions to the embedding of the row labelled 0.
# NOTE(review): `[0]` is a label lookup; if `train_df` keeps the shuffled index
# of a train/test split, label 0 may be absent and this raises KeyError. The
# next cell prints the question for the same label, so change both together
# (e.g. to `.iloc[0]`) if positional access is intended.
query = VectorStoreQuery(query_embedding=train_df['embedding_openai'][0], similarity_top_k=10)
result = chroma_vector_store.query(query)

In [None]:
# The query question first, then every match with its stored answer and score
print(f"Question: {train_df['question'][0]}\n")

for retrieved_node, score, node_id in zip(result.nodes, result.similarities, result.ids):
    print("Node ID:", node_id)
    print("Question:", retrieved_node.text)
    print("Answer:", retrieved_node.metadata['ground_truth'])
    print("Similarity:", score)
    print()

## Setup Retrieval Model

In [None]:
from validmind.models import RetrievalModel

def retrieve(embedding):
    """Return the stored question/answer pairs most similar to a query embedding.

    Args:
        embedding: Query embedding vector passed to the vector store.

    Returns:
        A list of context strings, one per retrieved node, each containing the
        node id, the stored question, its answer and the similarity score.
    """
    query = VectorStoreQuery(query_embedding=embedding, similarity_top_k=10)
    result = chroma_vector_store.query(query)

    # Each context is built in one expression. The previous version reassigned
    # `context` on its second line, which silently dropped the "Node ID" line.
    return [
        f"Node ID: {id_}\n"
        f"Question: {node.text}\n"
        f"Answer: {node.metadata['ground_truth']}\n"
        f"Similarity: {similarity}\n"
        for node, similarity, id_ in zip(result.nodes, result.similarities, result.ids)
    ]

vm_retriever = RetrievalModel(input_id="retrieval_model", predict_fn=retrieve)

In [None]:
test_df[vm_retriever.predict_col] = vm_retriever.predict(test_df)
test_df.head()

## Setup Generation Model

In [None]:
from llama_index.core import Prompt

# Prompt text with two placeholders, {context} and {question}. It instructs the
# model to answer only from the context and to reply 'I don't know' otherwise.
template = """
Answer the question based only on the following context. 
If you cannot answer the question with the context, please respond with 'I don't know':

### CONTEXT
{context}

### QUESTION
Question: {question}
"""
# Template object whose .format(context=..., question=...) is called in later cells
prompt = Prompt(template)

In [None]:
# Preview the prompt for the first test row.
# `.iloc[0]` selects by position: `[0]` is a label lookup and raises KeyError
# when the frame keeps the shuffled labels of a train/test split and label 0
# is not among them.
# NOTE(review): `test_df` is not defined in the visible notebook — confirm.
formatted_prompt = prompt.format(
    context=test_df[vm_retriever.predict_col].iloc[0],
    question=test_df['question'].iloc[0],
)
print(formatted_prompt)

In [None]:
from openai import OpenAI
from validmind.models import GenerationModel


client = OpenAI()

def generate(question, contexts):
    """Answer a question with gpt-3.5-turbo using the retrieved contexts.

    Args:
        question: The user question inserted into the prompt.
        contexts: Retrieved context. A list or tuple is joined with newlines
            into plain text; any other value is inserted as-is.

    Returns:
        The message content of the first completion choice.
    """
    # Formatting a list directly would put its Python repr (brackets, quotes,
    # escaped newlines) into the prompt, so flatten it to plain text first.
    if isinstance(contexts, (list, tuple)):
        context_text = "\n".join(str(item) for item in contexts)
    else:
        context_text = contexts

    formatted_prompt = prompt.format(
        context=context_text,
        question=question
    )

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": formatted_prompt},
        ],
    )

    return response.choices[0].message.content

vm_generator = GenerationModel(input_id="generation_model", predict_fn=generate)

In [None]:
test_df[vm_generator.predict_col] = vm_generator.predict(test_df)
test_df.head()

## Setup a ValidMind RAG Model

In [None]:
from validmind.models import RAGModel

vm_rag_model = RAGModel(
    embedder=vm_embedder_openai,
    retriever=vm_retriever,
    generator=vm_generator,
    input_id="rag_pipeline",
)

In [None]:
result_df = vm_rag_model.predict(test_df)
result_df.head()

In [None]:
vm_ragas_ds = vm.init_dataset(result_df, __log=False)

In [None]:
import plotly.express as px

def plot_distribution(scores, nbins=10, title=None):
    """Show a histogram of metric scores.

    Args:
        scores: List of floats to plot (ragas metric scores in the 0-1 range).
        nbins: Value passed to `px.histogram` as `nbins`; defaults to 10, the
            previously hard-coded value.
        title: Optional figure title; defaults to no title.

    Returns:
        None. The figure is displayed via `fig.show()`.
    """
    # Label the x axis so the figure is readable on its own
    fig = px.histogram(x=scores, nbins=nbins, title=title, labels={"x": "score"})
    fig.show()

In [None]:
import warnings

warnings.filterwarnings("ignore")

In [None]:
# Run the ragas AnswerSimilarity test on the RAG results dataset.
# NOTE(review): `show=False` presumably suppresses the default rendering so
# only the histogram below is displayed — confirm.
result = run_test(
    "validmind.model_validation.ragas.AnswerSimilarity",
    inputs={"dataset": vm_ragas_ds},
    show=False,
)
# Plot the data of the first summary result as a score histogram
plot_distribution(result.metric.summary.results[0].data)