# Initial Framework RAG Model Support

## Pre-requisites

In [1]:
# Install the Qdrant client; it backs the in-memory vector store created later in this notebook.
%pip install -q qdrant-client

Note: you may need to restart the kernel to use updated packages.


In [2]:
# load openai api key
import os

from dotenv import load_dotenv
# Load variables from a .env file (via python-dotenv) into the process environment.
load_dotenv()

# Fail fast when the key is missing *or* set to an empty string: an empty
# value is just as unusable as an absent one.
if not os.environ.get("OPENAI_API_KEY"):
    raise ValueError("OPENAI_API_KEY is not set")

## Dataset Loader

In [3]:
# load documents
import os
from csv import DictReader
from uuid import uuid4

import pandas as pd


# Rename map used by load_documents(): raw CSV header -> column name used in this notebook.
column_map = dict(
    RFP_Question="question",
    RFP_Answer="answer",
)


def load_documents(prefix, root_dir="datasets/rag/"):
    """Load every matching RFP CSV file into a single DataFrame.

    Args:
        prefix: Only files in ``root_dir`` whose name starts with this prefix
            and ends with ``.csv`` are read.
        root_dir: Directory to scan. Defaults to ``datasets/rag/``.

    Returns:
        A DataFrame with one row per CSV record. It keeps the ``id``,
        ``RFP_Question`` and ``RFP_Answer`` columns, the latter two renamed
        through ``column_map``. ``id`` is a freshly generated UUID4 string,
        so ids differ between runs.

    Raises:
        ValueError: If no rows were found for ``prefix`` in ``root_dir``.
        KeyError: If the rows lack the ``RFP_Question``/``RFP_Answer`` columns.
    """
    documents = []
    # os.listdir() returns entries in arbitrary order; sort so the row order
    # is the same on every run and every machine.
    for file in sorted(os.listdir(root_dir)):
        if not (file.startswith(prefix) and file.endswith(".csv")):
            continue
        # newline="" is what the csv module asks for (quoted fields may span
        # lines); utf-8-sig reads UTF-8 with or without a BOM, so a BOM never
        # ends up glued to the first header name.
        with open(
            os.path.join(root_dir, file), newline="", encoding="utf-8-sig"
        ) as f:
            for row in DictReader(f):
                # add a unique id to the row
                row["id"] = str(uuid4())
                documents.append(row)

    if not documents:
        # Without this guard the column selection below fails with an
        # unhelpful KeyError on an empty frame.
        raise ValueError(
            f"No rows found in CSV files starting with {prefix!r} in {root_dir!r}"
        )

    df = pd.DataFrame(documents)
    # Select, then rename on the resulting copy instead of mutating in place.
    return df[["id", "RFP_Question", "RFP_Answer"]].rename(columns=column_map)

def load_test_dataset():
    """Load the CSV files whose names start with ``rfp_new_questions``."""
    new_questions = load_documents("rfp_new_questions")
    return new_questions

def load_train_dataset():
    """Load the CSV files whose names start with ``rfp_existing_questions``."""
    existing_questions = load_documents("rfp_existing_questions")
    return existing_questions

## Embedding Model Selection

First, let's set up our embedding model and run some tests to make sure it's working well.

In [4]:
from openai import OpenAI

from validmind.models import EmbeddingModel

# Shared OpenAI client, used by both embed() and generate() in this notebook.
# NOTE(review): no key is passed explicitly — presumably it is picked up from the
# OPENAI_API_KEY environment variable checked at the top of the notebook; confirm.
client = OpenAI()


def embed(question):
    """Embed ``question`` with OpenAI's ``text-embedding-3-small`` model.

    Sends the text to the embeddings endpoint through the shared ``client``
    and returns the ``embedding`` of the first item in the response.
    """
    response = client.embeddings.create(
        input=question,
        model="text-embedding-3-small",
    )
    first_item = response.data[0]
    return first_item.embedding


# Wrap embed() as a ValidMind EmbeddingModel so it can be handed to ValidMind
# tests and used as the embedder stage of the RAG pipeline.
vm_embedder = EmbeddingModel(input_id="embedding_model", predict_fn=embed)

In [5]:
import validmind as vm

# Load the new RFP questions and wrap them in a ValidMind dataset whose text
# column is `question`.
test_df = load_test_dataset()
# NOTE(review): `__log=False` presumably keeps the dataset from being logged to
# the ValidMind platform — confirm against the validmind documentation.
vm_test_ds = vm.init_dataset(test_df, text_column="question", __log=False)

test_df.head()

2024-04-30 23:30:39,274 - INFO(validmind.client): Pandas dataset detected. Initializing VM Dataset instance...


Unnamed: 0,id,question,answer
0,cf5705d3-739f-46c8-b856-637403898c4c,What is your experience in developing AI-based...,
1,079fad3e-3719-46d6-9ce3-ac885dc3ec14,How do you ensure your AI-based apps remain up...,
2,f03cafe2-9a0d-49a0-90ad-5412a22ab87e,Can your AI-based applications be customized t...,
3,77931a6b-1b4c-404b-bff8-f481c1794bda,What measures do you take to ensure user priva...,
4,f33e6690-4488-457a-bd91-22263fa048fb,How do you approach user interface and experie...,


In [6]:
# Run the embedding model over the test rows and store the result under the
# embedder's output column (the `embedding` column in the preview).
test_df[vm_embedder.output_column] = vm_embedder.predict(test_df)
test_df.head()

Unnamed: 0,id,question,answer,embedding
0,cf5705d3-739f-46c8-b856-637403898c4c,What is your experience in developing AI-based...,,"[0.009233377873897552, -0.030979381874203682, ..."
1,079fad3e-3719-46d6-9ce3-ac885dc3ec14,How do you ensure your AI-based apps remain up...,,"[-0.015234623104333878, 0.0011848123976960778,..."
2,f03cafe2-9a0d-49a0-90ad-5412a22ab87e,Can your AI-based applications be customized t...,,"[-0.012747124768793583, 0.0019352560630068183,..."
3,77931a6b-1b4c-404b-bff8-f481c1794bda,What measures do you take to ensure user priva...,,"[-0.0071608577854931355, -0.016291096806526184..."
4,f33e6690-4488-457a-bd91-22263fa048fb,How do you approach user interface and experie...,,"[-0.014469269663095474, 0.007049784995615482, ..."


In [7]:
from validmind.tests import run_test

# Run ValidMind's StabilityAnalysisRandomNoise test against the embedding
# model, using the test questions as the dataset.
# NOTE(review): `probability` presumably controls how much random noise is
# injected into the text — confirm against the test's documentation.
result = run_test(
    "validmind.model_validation.embeddings.StabilityAnalysisRandomNoise",
    inputs={"model": vm_embedder, "dataset": vm_test_ds},
    params={"probability": 0.3},
)

VBox(children=(HTML(value='\n            <h1>Stability Analysis Random Noise ✅</h1>\n            <p>Evaluate r…

## Setup Vector Store

#### Load RFP Question/Answer Dataset

In [8]:
# Existing RFP question/answer pairs; these rows are what gets inserted into
# the vector store further down.
train_df = load_train_dataset()
train_df.head()

Unnamed: 0,id,question,answer
0,7c000bef-c0bd-425b-97f2-6d89d8846e25,Please share your experience with developing A...,Our company has 15 years of experience in deve...
1,23630480-aaaf-4b5a-be4b-cf82d8eee011,How do you maintain your AI applications with ...,We maintain a dedicated R&D team focused on in...
2,5a8d43f9-a49e-413f-a2d2-17e569bb7193,Can your AI applications be tailored to meet u...,"Absolutely, customization is a core aspect of ..."
3,eb6e6cb0-aaee-4faf-a738-c19af00fc298,What actions do you undertake to secure user d...,User privacy and data security are paramount. ...
4,0071c2ba-bf7e-42a9-b74f-1494864405a6,What considerations do you take into account f...,Our design philosophy centers on simplicity an...


#### Generate embeddings for the questions

In [9]:
# Embed the existing rows with the same embedding model used for the test
# questions, storing the vectors under the embedder's output column.
train_df[vm_embedder.output_column] = vm_embedder.predict(train_df)
train_df.head()

Unnamed: 0,id,question,answer,embedding
0,7c000bef-c0bd-425b-97f2-6d89d8846e25,Please share your experience with developing A...,Our company has 15 years of experience in deve...,"[0.006856707856059074, -0.04714655876159668, 0..."
1,23630480-aaaf-4b5a-be4b-cf82d8eee011,How do you maintain your AI applications with ...,We maintain a dedicated R&D team focused on in...,"[0.011783392168581486, 0.010354681871831417, 0..."
2,5a8d43f9-a49e-413f-a2d2-17e569bb7193,Can your AI applications be tailored to meet u...,"Absolutely, customization is a core aspect of ...","[-0.008340949192643166, 0.011337274685502052, ..."
3,eb6e6cb0-aaee-4faf-a738-c19af00fc298,What actions do you undertake to secure user d...,User privacy and data security are paramount. ...,"[0.007698851637542248, 0.007591660600155592, 0..."
4,0071c2ba-bf7e-42a9-b74f-1494864405a6,What considerations do you take into account f...,Our design philosophy centers on simplicity an...,"[-0.0029329643584787846, -0.003287967294454574..."


#### Insert embeddings and questions into Vector DB

In [10]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, PointStruct, VectorParams

# Fresh in-memory Qdrant instance: nothing is persisted, so the collection
# cannot exist yet and a plain create_collection() is sufficient
# (recreate_collection() is deprecated in recent qdrant-client releases).
qdrant = QdrantClient(":memory:")

# Take the vector size from the data instead of hard-coding 1536, so this cell
# keeps working if the embedding model (and hence its dimension) changes.
embedding_size = len(train_df[vm_embedder.output_column].iloc[0])

qdrant.create_collection(
    "rfp_rag_collection",
    vectors_config=VectorParams(size=embedding_size, distance=Distance.COSINE),
)
# One point per existing RFP row: the row's UUID is the point id, the question
# embedding is the vector, and the question/answer text travels as payload so
# it can be returned at retrieval time.
qdrant.upsert(
    "rfp_rag_collection",
    points=[
        PointStruct(
            id=row["id"],
            vector=row[vm_embedder.output_column],
            payload={"question": row["question"], "answer": row["answer"]},
        )
        for _, row in train_df.iterrows()
    ],
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

## Setup Retrieval Model

In [11]:
from validmind.models import RetrievalModel

def retrieve(embedding):
    """Return the stored question/answer pairs closest to ``embedding``.

    Args:
        embedding: Query vector to search ``rfp_rag_collection`` with.

    Returns:
        A list of at most 10 strings, one per hit, each made of a
        ``Q: <question>`` line followed by an ``A: <answer>`` line.
    """
    # `search` is deprecated in recent qdrant-client releases in favour of
    # `query_points`; use the new API when the installed client has it and
    # fall back to `search` on older clients.
    if hasattr(qdrant, "query_points"):
        hits = qdrant.query_points(
            collection_name="rfp_rag_collection",
            query=embedding,
            limit=10,
        ).points
    else:
        hits = qdrant.search(
            "rfp_rag_collection",
            query_vector=embedding,
            limit=10,
        )

    return [
        f"Q: {hit.payload['question']}\nA: {hit.payload['answer']}\n"
        for hit in hits
    ]

# Wrap retrieve() as a ValidMind RetrievalModel; it becomes the retriever
# stage of the RAG pipeline.
vm_retriever = RetrievalModel(input_id="retrieval_model", predict_fn=retrieve)

In [12]:
# Retrieve contexts for every test row and store them under the retriever's
# output column (the `contexts` column in the preview).
# NOTE(review): presumably ValidMind passes the `embedding` column to
# retrieve()'s `embedding` parameter by name — confirm.
test_df[vm_retriever.output_column] = vm_retriever.predict(test_df)
test_df.head()

Unnamed: 0,id,question,answer,embedding,contexts
0,cf5705d3-739f-46c8-b856-637403898c4c,What is your experience in developing AI-based...,,"[0.009233377873897552, -0.030979381874203682, ...",[Q: What is your experience in developing AI-b...
1,079fad3e-3719-46d6-9ce3-ac885dc3ec14,How do you ensure your AI-based apps remain up...,,"[-0.015234623104333878, 0.0011848123976960778,...",[Q: How do you ensure your AI-based apps remai...
2,f03cafe2-9a0d-49a0-90ad-5412a22ab87e,Can your AI-based applications be customized t...,,"[-0.012747124768793583, 0.0019352560630068183,...",[Q: Can your AI-based applications be customiz...
3,77931a6b-1b4c-404b-bff8-f481c1794bda,What measures do you take to ensure user priva...,,"[-0.0071608577854931355, -0.016291096806526184...",[Q: What measures do you take to ensure user p...
4,f33e6690-4488-457a-bd91-22263fa048fb,How do you approach user interface and experie...,,"[-0.014469269663095474, 0.007049784995615482, ...",[Q: How do you approach user interface and exp...


## Setup Generation Model

In [13]:
from validmind.models import GenerationModel

system_prompt = """
You are an expert RFP AI assistant.
You are tasked with answering new RFP questions based on existing RFP questions and answers.
You will be provided with the existing RFP questions and answer pairs that are the most relevant to the new RFP question.
After that you will be provided with a new RFP question.
You will generate an answer and respond only with the answer.
Ignore your pre-existing knowledge and answer the question based on the provided context.
""".strip()


def generate(question, contexts):
    """Answer ``question`` with ``gpt-3.5-turbo`` using the retrieved ``contexts``.

    The chat request carries three messages, in order: the system prompt, a
    user message holding all contexts joined by blank lines, and a user
    message holding the question. The content of the first returned choice
    is the result.
    """
    context_block = "\n\n".join(contexts)
    chat_messages = [
        dict(role="system", content=system_prompt),
        dict(role="user", content=context_block),
        dict(role="user", content=question),
    ]

    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=chat_messages,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

# Wrap generate() as a ValidMind GenerationModel; it becomes the generator
# stage of the RAG pipeline.
vm_generator = GenerationModel(input_id="generation_model", predict_fn=generate)

In [14]:
# Generate an answer for every test row and store it under the generator's
# output column (the previously empty `answer` column in the preview).
# NOTE(review): presumably ValidMind passes the `question` and `contexts`
# columns to generate() by parameter name — confirm.
test_df[vm_generator.output_column] = vm_generator.predict(test_df)
test_df

Unnamed: 0,id,question,answer,embedding,contexts
0,cf5705d3-739f-46c8-b856-637403898c4c,What is your experience in developing AI-based...,Our company has 15 years of experience in deve...,"[0.009233377873897552, -0.030979381874203682, ...",[Q: What is your experience in developing AI-b...
1,079fad3e-3719-46d6-9ce3-ac885dc3ec14,How do you ensure your AI-based apps remain up...,We maintain a dedicated R&D team focused on in...,"[-0.015234623104333878, 0.0011848123976960778,...",[Q: How do you ensure your AI-based apps remai...
2,f03cafe2-9a0d-49a0-90ad-5412a22ab87e,Can your AI-based applications be customized t...,"Yes, our AI-based applications can be customiz...","[-0.012747124768793583, 0.0019352560630068183,...",[Q: Can your AI-based applications be customiz...
3,77931a6b-1b4c-404b-bff8-f481c1794bda,What measures do you take to ensure user priva...,User privacy and data security are paramount. ...,"[-0.0071608577854931355, -0.016291096806526184...",[Q: What measures do you take to ensure user p...
4,f33e6690-4488-457a-bd91-22263fa048fb,How do you approach user interface and experie...,Our design philosophy centers on simplicity an...,"[-0.014469269663095474, 0.007049784995615482, ...",[Q: How do you approach user interface and exp...
5,2b01c410-0150-4605-b4d3-cb7c2970b003,Describe your support and maintenance services...,"Post-launch, we offer comprehensive support an...","[-0.01282538566738367, 0.027432076632976532, 0...",[Q: Describe your support and maintenance serv...
6,1568a1d4-f768-4c35-b4c1-f864c0bfd265,How do you measure the success and impact of y...,Success measurement is tailored to each projec...,"[0.008592551574110985, 0.01133714523166418, 0....",[Q: How do you measure the success and impact ...
7,c85476db-9742-4028-ad96-403b36fb7013,How do you ensure the ethical use of LLMs in y...,We adhere to ethical AI practices by implement...,"[0.04022055119276047, 0.02517019584774971, 0.0...",[Q: How do you ensure the ethical use of LLMs ...
8,53770109-e518-4e28-9b38-3c10617fb095,Can you describe the process of training your ...,Our LLM training process begins with the metic...,"[-0.01286401879042387, 0.027707116678357124, 0...",[Q: Can you describe the process of training y...
9,c134a612-d4b1-4215-a8ed-ed3260796148,How do you handle the continuous learning and ...,We implement advanced continuous learning mech...,"[0.004479296039789915, 0.02997581847012043, 0....",[Q: How do you handle the continuous learning ...


## Setup RAG Model (Pipeline of "Component" Models)

In [15]:
from validmind.models import RAGModel

# Combine the three component models (embedder, retriever, generator) into a
# single RAG pipeline model.
vm_rag_model = RAGModel(
    embedder=vm_embedder,
    retriever=vm_retriever,
    generator=vm_generator,
    input_id="rag_pipeline",
)

In [16]:
# Run the whole pipeline on the test questions in one call.
# NOTE(review): some embedding values in this preview differ slightly from the
# earlier cells, which suggests the pipeline re-runs the embedder rather than
# reusing the existing `embedding` column — confirm.
result_df = vm_rag_model.predict(test_df)
result_df.head()

Unnamed: 0,id,question,answer,embedding,contexts
0,cf5705d3-739f-46c8-b856-637403898c4c,What is your experience in developing AI-based...,Our company has 15 years of experience in deve...,"[0.009233377873897552, -0.030979381874203682, ...",[Q: What is your experience in developing AI-b...
1,079fad3e-3719-46d6-9ce3-ac885dc3ec14,How do you ensure your AI-based apps remain up...,We maintain a dedicated R&D team focused on in...,"[-0.015557997860014439, 0.001083239447325468, ...",[Q: How do you ensure your AI-based apps remai...
2,f03cafe2-9a0d-49a0-90ad-5412a22ab87e,Can your AI-based applications be customized t...,"Yes, our AI-based applications can be customiz...","[-0.012713181786239147, 0.0019399791490286589,...",[Q: Can your AI-based applications be customiz...
3,77931a6b-1b4c-404b-bff8-f481c1794bda,What measures do you take to ensure user priva...,User privacy and data security are paramount. ...,"[-0.007135083898901939, -0.01626216620206833, ...",[Q: What measures do you take to ensure user p...
4,f33e6690-4488-457a-bd91-22263fa048fb,How do you approach user interface and experie...,Our design philosophy centers on simplicity an...,"[-0.014495181851089, 0.007043612655252218, 0.0...",[Q: How do you approach user interface and exp...
5,2b01c410-0150-4605-b4d3-cb7c2970b003,Describe your support and maintenance services...,"Post-launch, we offer comprehensive support an...","[-0.01282538566738367, 0.027432076632976532, 0...",[Q: Describe your support and maintenance serv...
6,1568a1d4-f768-4c35-b4c1-f864c0bfd265,How do you measure the success and impact of y...,Success measurement is tailored to each projec...,"[0.008592551574110985, 0.01133714523166418, 0....",[Q: How do you measure the success and impact ...
7,c85476db-9742-4028-ad96-403b36fb7013,How do you ensure the ethical use of LLMs in y...,We adhere to ethical AI practices by implement...,"[0.04022055119276047, 0.02517019584774971, 0.0...",[Q: How do you ensure the ethical use of LLMs ...
8,53770109-e518-4e28-9b38-3c10617fb095,Can you describe the process of training your ...,Our LLM training process begins with the metic...,"[-0.012864182703197002, 0.027756337076425552, ...",[Q: Can you describe the process of training y...
9,c134a612-d4b1-4215-a8ed-ed3260796148,How do you handle the continuous learning and ...,We implement advanced continuous learning mech...,"[0.004479296039789915, 0.02997581847012043, 0....",[Q: How do you handle the continuous learning ...
