# Vendor Contract QA Agent Documentation POC

## Prerequisites

In [1]:
# Extra packages for the agent's web-search tool (the `search_online` output later in this
# notebook shows requests to DuckDuckGo). NOTE(review): beautifulsoup4 is presumably used by
# utils to parse fetched pages — confirm; versions are not pinned here.
%pip install duckduckgo-search beautifulsoup4 --quiet

Note: you may need to restart the kernel to use updated packages.


In [2]:
# load openai api key
import os

from dotenv import load_dotenv
load_dotenv()

# Fail fast when the key is missing *or* blank (e.g. an empty `OPENAI_API_KEY=` line in .env),
# so the problem is reported here rather than at the first API call.
if not os.environ.get('OPENAI_API_KEY'):
    raise ValueError('OPENAI_API_KEY is not set')

In [3]:
import sys

# Put the current working directory on the import path so the local `utils` module imported below resolves.
sys.path.append(os.getcwd())

from utils import (
    client,
    init_db,
    get_schema_description,
    get_tools_spec,
)

In [4]:
# Set up the database the agent queries.
# NOTE(review): presumably creates/seeds the `vendors` and `contracts` tables — confirm in utils.init_db.
init_db()

# Create Agent

### Load Tools and DB Schema Spec

In [5]:
# Tool (function-calling) definitions that are handed to the assistant when it is created.
# The listing shown in this cell's output appears to be printed by the helper itself.
tools_spec = get_tools_spec()

Loaded 2 tool definitions
[
      {
            "type": "function",
            "function": {
                  "name": "query_database",
                  "description": "Query the contract database to retrieve information about contracts with a particular vendor",
                  "parameters": {
                        "type": "object",
                        "properties": {
                              "query": {
                                    "type": "string",
                                    "description": "SQL query to execute against the database"
                              }
                        },
                        "required": [
                              "query"
                        ]
                  }
            }
      },
      {
            "type": "function",
            "function": {
                  "name": "search_online",
                  "description": "Search the web for information",
                  "parameters": {
             

In [6]:
# Show the table/column listing that is later embedded in the agent's system prompt.
print(get_schema_description())

Table: vendors
  vendor_id: TEXT
  vendor_name: TEXT
  category: TEXT
  total_spend: INTEGER
  contracts: TEXT
Table: contracts
  contract_id: TEXT
  vendor_id: TEXT
  start_date: TEXT
  end_date: TEXT
  contract_value: INTEGER
  terms_conditions: TEXT
  products_description: TEXT



### Create an Assistant

In [7]:
# System prompt (instructions) for the assistant. The DB schema description is interpolated at the
# end of the prompt, and surrounding whitespace is stripped.
# NOTE(review): the prompt refers to "semantic search on the documents" and a "document retrieval
# system", but the tool spec loaded earlier reports only two function tools (query_database,
# search_online) — confirm whether a retrieval tool is meant to be attached.
AGENT_SYSTEM_PROMPT = f"""
# Mission:
You are an AI Agent that helps employees answer questions they might have about everything related to software vendors.
You will be asked questions such as "Do we have a vendor for cloud storage?" or "I need a tool for project management".
You should use the tools available to you as well as semantic search on the documents you have access to to answer these questions.

# Guidelines:
For "Do we have a vendor for cloud storage?", you could use the `query_database` to query the contracts database for cloud storage vendors.
Then you could search your document repository for information on the vendors you found.
If none are found, then you might search online using the `search_online` tool to discover new vendors.
Or, for the question "I need a tool for project management", if you cannot find a relevant vendor in the database,
  you could use the `search_online` tool to find out if any existing vendors provide project management tools.

# Constraints:
You should always try and find relevant information from the database.
You can search online to find new information or confirm information that you already know.
You should only fall back to your existing knowledge of vendors to help you come up with good search queries or when you want to enrich your answers.
  - For example, if the user is asking for a certain product and you find a vendor in the db that doesn't mention that product but you know they offer it, you can share that with the user.
You should only use your document retrieval system to find extra information related to vendors found in the database or online - essentially to enrich your knowledge before answering.
Don't make assumptions about what values to plug into functions. Ask for clarification if a user request is ambiguous.

# DB Schema:
{get_schema_description()}
""".strip()

AGENT_NAME = "Vendor Contract Q/A Assistant"

# Reuse an assistant that already carries this name so re-running the notebook does not create
# duplicates. `next(..., None)` stops at the first match and yields None when nothing matches.
# NOTE(review): a reused assistant keeps whatever instructions/tools it was created with; edits to
# AGENT_SYSTEM_PROMPT or tools_spec are not pushed to it here.
AGENT_ID = next(
    (
        assistant.id
        for assistant in client.beta.assistants.list()
        if assistant.name == AGENT_NAME
    ),
    None,
)

if AGENT_ID is not None:
    # Only announce reuse when a matching assistant was actually found.
    print("Using existing agent...")
else:
    # Create an agent using the OpenAI Assistants API
    print("Creating agent...")
    agent = client.beta.assistants.create(
        name=AGENT_NAME,
        instructions=AGENT_SYSTEM_PROMPT,
        model="gpt-4o",
        tools=tools_spec,
    )
    AGENT_ID = agent.id

print(f"Assistant ID: {AGENT_ID}")

Using existing agent...
Assistant ID: asst_bKIFBsFBdhzcJBySqCXaypeN


# Use the Agent

In [8]:
from utils import AgentEventHandler

# Set DEBUG to "0" in the process environment before the agent is used.
# NOTE(review): presumably read by utils / AgentEventHandler to control verbose output — confirm.
os.environ["DEBUG"] = "0"

def single_pass_agent(input):
    """Send one question to the assistant on a fresh thread and wait for the run to finish.

    Args:
        input: Mapping with a "question" entry. The same object is handed to
            `AgentEventHandler`; the printed results in this notebook suggest the handler
            records `tool_calls`, `contexts` and `messages` on it while the run streams.

    Returns:
        The `input` object that was passed in.
    """
    # A new thread per call keeps every question independent of earlier conversations.
    conversation = client.beta.threads.create()
    conversation_id = conversation.id

    client.beta.threads.messages.create(
        role="user",
        content=input["question"],
        thread_id=conversation_id,
    )

    handler = AgentEventHandler(input)
    run_stream_ctx = client.beta.threads.runs.stream(
        assistant_id=AGENT_ID,
        thread_id=conversation_id,
        event_handler=handler,
    )
    with run_stream_ctx as run_stream:
        # Block until the streamed run has completed.
        run_stream.until_done()

    return input

In [9]:
import json

# Named `sample_input` rather than `input` so the built-in `input()` is not shadowed
# for the rest of the kernel session.
sample_input = {"question": "Do we have contracts with microsoft?"}

single_pass_agent(sample_input)

# Print the same dict that was passed to the agent to show what the run recorded on it.
print(json.dumps(sample_input, indent=5))

> Querying database with:  {'query': "SELECT * FROM contracts WHERE vendor_id IN (SELECT vendor_id FROM vendors WHERE vendor_name = 'Microsoft')"}
> Result:  [["C014", "V011", "2022-09-01", "2025-08-31", 210000, "Annual subscription for office productivity suite.", "Microsoft Office 365 suite including Word, Excel, PowerPoint, and Teams."], ["C015", "V011", "2023-10-01", "2026-09-30", 240000, "Azure compute and storage services. Pay-as-you-go pricing.", "Microsoft Azure services including virtual machines, storage accounts, and networking capabilities."]]


> Receiving message from agent:


HTML(value='\n<div id="message_container">\n\n</div>\n<style>\n#message_container {\n    padding: 10px;\n    b…

{
     "question": "Do we have contracts with microsoft?",
     "tool_calls": [
          {
               "function": "query_database",
               "arguments": {
                    "query": "SELECT * FROM contracts WHERE vendor_id IN (SELECT vendor_id FROM vendors WHERE vendor_name = 'Microsoft')"
               }
          }
     ],
     "contexts": [
          "[[\"C014\", \"V011\", \"2022-09-01\", \"2025-08-31\", 210000, \"Annual subscription for office productivity suite.\", \"Microsoft Office 365 suite including Word, Excel, PowerPoint, and Teams.\"], [\"C015\", \"V011\", \"2023-10-01\", \"2026-09-30\", 240000, \"Azure compute and storage services. Pay-as-you-go pricing.\", \"Microsoft Azure services including virtual machines, storage accounts, and networking capabilities.\"]]"
     ],
     "messages": [
          "Yes, we have active contracts with Microsoft:\n\n1. **Contract ID:** C014\n   - **Start Date:** 2022-09-01\n   - **End Date:** 2025-08-31\n   - **Contract Value:

# Use the Agent with ValidMind

In [10]:
import pandas as pd
import validmind as vm

# Credentials are read from the environment (the .env file loaded by `load_dotenv()` at the top of
# the notebook works) instead of being hardcoded in the notebook.
# Required: VM_API_KEY, VM_API_SECRET. Optional overrides: VM_API_HOST, VM_API_PROJECT.
# NOTE(review): the key/secret that were previously committed in this cell should be treated as
# leaked and rotated.
vm.init(
    api_host=os.environ.get("VM_API_HOST", "http://localhost:3000/api/v1/tracking"),
    api_key=os.environ["VM_API_KEY"],
    api_secret=os.environ["VM_API_SECRET"],
    project=os.environ.get("VM_API_PROJECT", "clw0v92xv0008eyryki8uax3o"),
)

# Wrap the agent function as a ValidMind model and smoke-test it on a single question.
vm_model = vm.init_model(predict_fn=single_pass_agent, input_id="vendor_qa_agent")
vm_model.predict(pd.DataFrame({"question": ["Do we have contracts with microsoft?"]}))

2024-05-13 22:17:59,532 - INFO(validmind.api_client): Connected to ValidMind. Project: RAG Demo - Initial Validation (clw0v92xv0008eyryki8uax3o)


> Querying database with:  {'query': "SELECT * FROM vendors WHERE vendor_name = 'Microsoft';"}
> Result:  [["V011", "Microsoft", "Productivity Software", 400000, "C014,C015"]]


> Querying database with:  {'query': "SELECT * FROM contracts WHERE vendor_id = 'V011';"}
> Result:  [["C014", "V011", "2022-09-01", "2025-08-31", 210000, "Annual subscription for office productivity suite.", "Microsoft Office 365 suite including Word, Excel, PowerPoint, and Teams."], ["C015", "V011", "2023-10-01", "2026-09-30", 240000, "Azure compute and storage services. Pay-as-you-go pricing.", "Microsoft Azure services including virtual machines, storage accounts, and networking capabilities."]]


> Receiving message from agent:


HTML(value='\n<div id="message_container">\n\n</div>\n<style>\n#message_container {\n    padding: 10px;\n    b…

[{'question': 'Do we have contracts with microsoft?',
  'tool_calls': [{'function': 'query_database',
    'arguments': {'query': "SELECT * FROM vendors WHERE vendor_name = 'Microsoft';"}},
   {'function': 'query_database',
    'arguments': {'query': "SELECT * FROM contracts WHERE vendor_id = 'V011';"}}],
  'contexts': ['[["V011", "Microsoft", "Productivity Software", 400000, "C014,C015"]]',
   '[["C014", "V011", "2022-09-01", "2025-08-31", 210000, "Annual subscription for office productivity suite.", "Microsoft Office 365 suite including Word, Excel, PowerPoint, and Teams."], ["C015", "V011", "2023-10-01", "2026-09-30", 240000, "Azure compute and storage services. Pay-as-you-go pricing.", "Microsoft Azure services including virtual machines, storage accounts, and networking capabilities."]]'],
  'messages': ['Yes, we have contracts with Microsoft. Here are the details:\n\n1. **Contract ID:** C014\n   - **Start Date:** 2022-09-01\n   - **End Date:** 2025-08-31\n   - **Contract Value:** 

In [11]:
# Evaluation set: each entry pairs a question with the answer we expect from the agent.
# NOTE(review): in the recorded run, the query for vendor_name = 'Amazon' returned no rows while
# the ground truth below expects Amazon contracts (the cloud-storage answer calls the vendor
# "Amazon Web Services") — confirm the expected answers against the seeded data.
_qa_cases = [
    (
        "Do we have contracts with microsoft?",
        "Yes, we have 2 contracts with Microsoft: C014 and C015.",
    ),
    (
        "Do we have contracts with google?",
        "No, we do not have contracts with Google.",
    ),
    (
        "Do we have contracts with amazon?",
        "Yes, we have 2 contracts with Amazon: C007 and C008.",
    ),
    (
        "What vendors do we have that offer cloud storage?",
        "We have multiple vendors who have a cloud storage offering: Amazon Web Services (Vendor ID: V005), Microsoft (Vendor ID: V011), IBM (Vendor ID: V012) and Oracle (Vendor ID: V014)",
    ),
    (
        "Do we have relationships with server hardware vendors?",
        "We have relationships with the following server hardware vendors: Dell (Vendor ID: V013), HP (Vendor ID: V015) and Cisco (Vendor ID: V001).",
    ),
    (
        "How much is our total spend on project management software?",
        "We don't have any existing contracts for project management software. So the total spend is $0.",
    ),
    (
        "I need an ERP system for our company. Can you help me find one?",
        "We have an existing relationship with SAP for ERP software (Vendor ID: V004). The following two contracts are in place: C005 and C006.",
    ),
]

test_df = pd.DataFrame(_qa_cases, columns=["question", "ground_truth"])

# Wrap the DataFrame as a ValidMind dataset, registered under the given `input_id`.
vm_test_dataset = vm.init_dataset(test_df, input_id="vendor_qa_test_dataset")

2024-05-13 22:18:07,620 - INFO(validmind.client): Pandas dataset detected. Initializing VM Dataset instance...


In [12]:
# Run the agent on every question in the dataset and attach the results as a prediction column
# (it shows up later as `vendor_qa_agent_prediction`). This makes live assistant and tool calls,
# so it takes a while.
vm_test_dataset.assign_predictions(vm_model)

2024-05-13 22:18:07,701 - INFO(validmind.vm_models.dataset.utils): Running predict_proba()... This may take a while
2024-05-13 22:18:07,701 - INFO(validmind.vm_models.dataset.utils): Not running predict_proba() for unsupported models.
2024-05-13 22:18:07,702 - INFO(validmind.vm_models.dataset.utils): Running predict()... This may take a while


> Querying database with:  {'query': "SELECT * FROM vendors WHERE vendor_name = 'Microsoft'"}
> Result:  [["V011", "Microsoft", "Productivity Software", 400000, "C014,C015"]]


> Querying database with:  {'query': "SELECT * FROM contracts WHERE contract_id IN ('C014', 'C015')"}
> Result:  [["C014", "V011", "2022-09-01", "2025-08-31", 210000, "Annual subscription for office productivity suite.", "Microsoft Office 365 suite including Word, Excel, PowerPoint, and Teams."], ["C015", "V011", "2023-10-01", "2026-09-30", 240000, "Azure compute and storage services. Pay-as-you-go pricing.", "Microsoft Azure services including virtual machines, storage accounts, and networking capabilities."]]


> Receiving message from agent:


HTML(value='\n<div id="message_container">\n\n</div>\n<style>\n#message_container {\n    padding: 10px;\n    b…

> Querying database with:  {'query': "SELECT * FROM contracts WHERE vendor_id IN (SELECT vendor_id FROM vendors WHERE vendor_name = 'Google')"}
> Result:  []


> Receiving message from agent:


HTML(value='\n<div id="message_container">\n\n</div>\n<style>\n#message_container {\n    padding: 10px;\n    b…

> Querying database with:  {'query': "SELECT * FROM contracts WHERE vendor_id IN (SELECT vendor_id FROM vendors WHERE vendor_name = 'Amazon')"}
> Result:  []


> Receiving message from agent:


HTML(value='\n<div id="message_container">\n\n</div>\n<style>\n#message_container {\n    padding: 10px;\n    b…

> Querying database with:  {'query': "SELECT vendor_name, vendor_id FROM vendors WHERE category='cloud storage'"}
> Result:  []


> Receiving message from agent:


HTML(value='\n<div id="message_container">\n\n</div>\n<style>\n#message_container {\n    padding: 10px;\n    b…

> Querying database with:  {'query': "SELECT * FROM vendors WHERE category = 'server hardware'"}
> Result:  []


> Receiving message from agent:


HTML(value='\n<div id="message_container">\n\n</div>\n<style>\n#message_container {\n    padding: 10px;\n    b…

> Querying database with:  {'query': "SELECT SUM(total_spend) as total_project_management_spend FROM vendors WHERE category = 'project management'"}
> Result:  [[null]]


> Receiving message from agent:


HTML(value='\n<div id="message_container">\n\n</div>\n<style>\n#message_container {\n    padding: 10px;\n    b…

> Querying database with:  {'query': "SELECT vendor_name, category FROM vendors WHERE category='ERP'"}
> Result:  []


> Searching online with:  {'search_type': 'browse', 'query': 'ERP system vendors'}
> Result:  Error calling tool: search_online: https://links.duckduckgo.com/d.js?q=ERP+system+vendors&kl=wt-wt&l=wt-wt&p=&s=0&df=&vqd=4-245467545902323179755033259344786483365&ex=-1 202 Ratelimit


> Searching online with:  {'search_type': 'browse', 'query': 'top ERP system vendors 2023'}
> Result:  Error calling tool: search_online: https://links.duckduckgo.com/d.js?q=top+ERP+system+vendors+2023&kl=wt-wt&l=wt-wt&p=&s=0&df=&vqd=4-178284893945079144533161432034867526753&ex=-1 202 Ratelimit


> Receiving message from agent:


HTML(value='\n<div id="message_container">\n\n</div>\n<style>\n#message_container {\n    padding: 10px;\n    b…

2024-05-13 22:18:44,175 - INFO(validmind.vm_models.dataset.utils): Done running predict()


In [13]:
import json

# Name of the dataset column that holds this model's predictions.
pred_column = vm_test_dataset.prediction_column(vm_model)

# Pretty-print the prediction stored for the first row (the first question).
print(json.dumps(vm_test_dataset.df[pred_column][0], indent=5))

{
     "question": "Do we have contracts with microsoft?",
     "ground_truth": "Yes, we have 2 contracts with Microsoft: C014 and C015.",
     "tool_calls": [
          {
               "function": "query_database",
               "arguments": {
                    "query": "SELECT * FROM vendors WHERE vendor_name = 'Microsoft'"
               }
          },
          {
               "function": "query_database",
               "arguments": {
                    "query": "SELECT * FROM contracts WHERE contract_id IN ('C014', 'C015')"
               }
          }
     ],
     "contexts": [
          "[[\"V011\", \"Microsoft\", \"Productivity Software\", 400000, \"C014,C015\"]]",
          "[[\"C014\", \"V011\", \"2022-09-01\", \"2025-08-31\", 210000, \"Annual subscription for office productivity suite.\", \"Microsoft Office 365 suite including Word, Excel, PowerPoint, and Teams.\"], [\"C015\", \"V011\", \"2023-10-01\", \"2026-09-30\", 240000, \"Azure compute and storage services. Pay-

In [14]:
# List the available tests matching the "rag" filter, with their required inputs and default params.
vm.tests.list_tests(filter="rag")

ID,Name,Test Type,Description,Required Inputs,Params
validmind.model_validation.ragas.ContextRecall,Context Recall,Metric,Context recall measures the extent to which the retrieved context aligns with the...,['dataset'],"{'question_column': 'question', 'contexts_column': 'contexts', 'ground_truth_column': 'ground_truth'}"
validmind.model_validation.ragas.AspectCritique,Aspect Critique,Metric,"Evaluates generations against the following aspects: harmfulness, maliciousness,...",['dataset'],"{'question_column': 'question', 'answer_column': 'answer', 'contexts_column': 'contexts', 'aspects': ['coherence', 'conciseness', 'correctness', 'harmfulness', 'maliciousness'], 'additional_aspects': []}"
validmind.model_validation.ragas.ContextPrecision,Context Precision,Metric,Context Precision is a metric that evaluates whether all of the ground-truth...,['dataset'],"{'question_column': 'question', 'contexts_column': 'contexts', 'ground_truth_column': 'ground_truth'}"
validmind.model_validation.ragas.AnswerSimilarity,Answer Similarity,Metric,Calculates the semantic similarity between generated answers and ground truths...,['dataset'],"{'answer_column': 'answer', 'ground_truth_column': 'ground_truth'}"
validmind.model_validation.ragas.ContextEntityRecall,Context Entity Recall,Metric,Evaluates the context entity recall for dataset entries and visualizes the results....,['dataset'],"{'contexts_column': 'contexts', 'ground_truth_column': 'ground_truth'}"
validmind.model_validation.ragas.Faithfulness,Faithfulness,Metric,Evaluates the faithfulness of the generated answers with respect to retrieved contexts....,['dataset'],"{'answer_column': 'answer', 'contexts_column': 'contexts'}"
validmind.model_validation.ragas.AnswerRelevance,Answer Relevance,Metric,Assesses how pertinent the generated answer is to the given prompt....,['dataset'],"{'question_column': 'question', 'contexts_column': 'contexts', 'answer_column': 'answer'}"
validmind.model_validation.ragas.AnswerCorrectness,Answer Correctness,Metric,Evaluates the correctness of answers in a dataset with respect to the provided ground...,['dataset'],"{'question_column': 'question', 'answer_column': 'answer', 'ground_truth_column': 'ground_truth'}"
validmind.model_validation.ragas.ContextRelevancy,Context Relevancy,Metric,Evaluates the context relevancy metric for entries in a dataset and visualizes the...,['dataset'],"{'question_column': 'question', 'contexts_column': 'contexts'}"


In [15]:
# Show the description of every RAG test, in sorted order.
# NOTE(review): `pretty=False` presumably returns plain test IDs rather than the formatted table — confirm.
for test_id in sorted(vm.tests.list_tests(filter="rag", pretty=False)):
    vm.tests.describe_test(test_id)

HTML(value='\n<div>\n  <h2>Answer Correctness</h2>\n  <p>Evaluates the correctness of answers in a dataset wit…

HTML(value='\n<div>\n  <h2>Answer Relevance</h2>\n  <p>Assesses how pertinent the generated answer is to the g…

HTML(value='\n<div>\n  <h2>Answer Similarity</h2>\n  <p>Calculates the semantic similarity between generated a…

HTML(value='\n<div>\n  <h2>Aspect Critique</h2>\n  <p>Evaluates generations against the following aspects: har…

HTML(value='\n<div>\n  <h2>Context Entity Recall</h2>\n  <p>Evaluates the context entity recall for dataset en…

HTML(value='\n<div>\n  <h2>Context Precision</h2>\n  <p>Context Precision is a metric that evaluates whether a…

HTML(value='\n<div>\n  <h2>Context Recall</h2>\n  <p>Context recall measures the extent to which the retrieved…

HTML(value='\n<div>\n  <h2>Context Relevancy</h2>\n  <p>Evaluates the context relevancy metric for entries in …

HTML(value='\n<div>\n  <h2>Faithfulness</h2>\n  <p>Evaluates the faithfulness of the generated answers with re…

In [16]:
from validmind.tests import run_test

# Every RAG test below receives the same dataset input.
rag_tests_inputs = {"dataset": vm_test_dataset}

# The retrieved contexts and the generated answer live inside the nested prediction dict, so the
# column parameters are mappings into it rather than plain column names:
#   - contexts_column: dotted path to the `contexts` entry of the prediction column
#   - answer_column: callable that joins the row's agent messages (separated by a blank line)
#     into a single answer string
rag_tests_params = {
    "contexts_column": f"{pred_column}.contexts",
    "answer_column": lambda row: "\n\n".join(row[pred_column]["messages"]),
}

In [17]:
# Preview the first rows of the dataset, including the agent's prediction column.
vm_test_dataset.df.head()

Unnamed: 0,question,ground_truth,vendor_qa_agent_prediction
0,Do we have contracts with microsoft?,"Yes, we have 2 contracts with Microsoft: C014 ...",{'question': 'Do we have contracts with micros...
1,Do we have contracts with google?,"No, we do not have contracts with Google.",{'question': 'Do we have contracts with google...
2,Do we have contracts with amazon?,"Yes, we have 2 contracts with Amazon: C007 and...",{'question': 'Do we have contracts with amazon...
3,What vendors do we have that offer cloud storage?,We have multiple vendors who have a cloud stor...,{'question': 'What vendors do we have that off...
4,Do we have relationships with server hardware ...,We have relationships with the following serve...,{'question': 'Do we have relationships with se...


In [18]:
# Answer Correctness: evaluates the generated answers against the provided ground truth
# (uses the question, answer and ground_truth columns), then logs the result.
result = run_test(
    "validmind.model_validation.ragas.AnswerCorrectness",
    inputs=rag_tests_inputs,
    params=rag_tests_params,
)
result.log()

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

VBox(children=(HTML(value='<h1>Answer Correctness</h1>'), HTML(value='<p>Evaluates the correctness of answers …

In [19]:
# Answer Relevance: assesses how pertinent the generated answer is to the question
# (uses the question, contexts and answer columns), then logs the result.
result = run_test(
    "validmind.model_validation.ragas.AnswerRelevance",
    inputs=rag_tests_inputs,
    params=rag_tests_params,
)
result.log()

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

VBox(children=(HTML(value='<h1>Answer Relevance</h1>'), HTML(value='<p>Assesses how pertinent the generated an…

In [20]:
import warnings
# Optional: `warnings` is imported only for the disabled lines below. Uncomment the filter to
# silence the FutureWarning whose message is quoted in it; `resetwarnings()` discards any
# previously installed filters.
# warnings.filterwarnings("ignore", category=FutureWarning, message="promote has been superseded by promote_options='default'.")
# warnings.resetwarnings()

# Answer Similarity: semantic similarity between the generated answers and the ground truths
# (uses the answer and ground_truth columns), then logs the result.
result = run_test(
    "validmind.model_validation.ragas.AnswerSimilarity",
    inputs=rag_tests_inputs,
    params=rag_tests_params,
)
result.log()

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

VBox(children=(HTML(value='<h1>Answer Similarity</h1>'), HTML(value='<p>Calculates the semantic similarity bet…

In [21]:
# Aspect Critique: evaluates the generations against the default aspects (coherence, conciseness,
# correctness, harmfulness, maliciousness) plus a custom "professionalism" aspect given as a
# (name, description) pair, then logs the result.
result = run_test(
    "validmind.model_validation.ragas.AspectCritique",
    inputs=rag_tests_inputs,
    params={
        **rag_tests_params,
        "additional_aspects": [
            ("professionalism", "Ensure the response is professional and appropriate for a business setting."),
        ],
    },
)
result.log()

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

VBox(children=(HTML(value='<h1>Aspect Critique</h1>'), HTML(value='<p>Evaluates generations against the follow…

In [22]:
# Context Entity Recall: evaluates entity recall of the retrieved contexts against the ground truth
# (uses the contexts and ground_truth columns), then logs the result.
result = run_test(
    "validmind.model_validation.ragas.ContextEntityRecall",
    inputs=rag_tests_inputs,
    params=rag_tests_params,
)
result.log()

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

VBox(children=(HTML(value='<h1>Context Entity Recall</h1>'), HTML(value='<p>Evaluates the context entity recal…

In [23]:
# Context Precision: run on the question, contexts and ground_truth columns, then log the result.
result = run_test(
    "validmind.model_validation.ragas.ContextPrecision",
    inputs=rag_tests_inputs,
    params=rag_tests_params,
)
result.log()

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

VBox(children=(HTML(value='<h1>Context Precision</h1>'), HTML(value='<p>Context Precision is a metric that eva…

In [24]:
# Context Recall: measures the extent to which the retrieved contexts align with the ground truth
# (uses the question, contexts and ground_truth columns), then logs the result.
result = run_test(
    "validmind.model_validation.ragas.ContextRecall",
    inputs=rag_tests_inputs,
    params=rag_tests_params,
)
result.log()

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

VBox(children=(HTML(value='<h1>Context Recall</h1>'), HTML(value='<p>Context recall measures the extent to whi…

In [25]:
# Context Relevancy: evaluates how relevant the retrieved contexts are to each question
# (uses the question and contexts columns), then logs the result.
result = run_test(
    "validmind.model_validation.ragas.ContextRelevancy",
    inputs=rag_tests_inputs,
    params=rag_tests_params,
)
result.log()

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

VBox(children=(HTML(value='<h1>Context Relevancy</h1>'), HTML(value='<p>Evaluates the context relevancy metric…

In [26]:
# Faithfulness: evaluates the faithfulness of the generated answers with respect to the retrieved
# contexts (uses the answer and contexts columns).
result = run_test(
    "validmind.model_validation.ragas.Faithfulness",
    inputs=rag_tests_inputs,
    params=rag_tests_params,
)
# Log the result like the other RAG test cells do, so Faithfulness is not the one test whose
# result is left unlogged.
result.log()

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

No statements were generated from the answer.


VBox(children=(HTML(value='<h1>Faithfulness</h1>'), HTML(value='<p>Evaluates the faithfulness of the generated…