# NVInfo Bot onboarding

Background: NVInfo (URL: https://nvbot-sandbox.nvidia.com/bot/nvinfo) is based on NV Graph. Bot configuration is at: config_manager/.../nvinfo.json and config_manager/.../nvinfo_mixtral_agent.json.


Steps:

- 1) Create an evaluation project (if one does not already exist). A bot configuration can optionally be added; if it is not provided, the Model and System fields are used to fetch the bot config.

- 2) prepare dataset - queries, categories, reference answer

- 3) generate answer based on NVplatform backend 

- 4) launch evaluation job


In [None]:
# ! pip3 install huggingface_hub

In [None]:

import sys
import os

# Resolve the directory two levels above the notebook's working directory;
# the project packages imported below are looked up from there.
_notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(_notebook_dir, "../../"))

# Register the root on the import path only if it is not already listed,
# so re-running the cell does not add duplicates.
if not any(entry == project_root for entry in sys.path):
    sys.path.append(project_root)

from service_library.constants import *
import requests
import huggingface_hub as hh
import os
import json
import pprint
import ipywidgets as widgets
from IPython.display import display, HTML

import sys

sys.path.append(os.getcwd())

# Base URL of the NVBot evaluation API; the evaluation-service requests in
# this notebook are built from it.
NVBOT_EVALUATION_URL: str = "https://devbot-api.nvidia.com/evaluation"

# Headers sent with the JSON project-creation and verification requests.
HEADER = {
    'accept': 'application/json',
    'Content-Type': 'application/json'
}

# Show a clickable link to the service's /docs page in the cell output.
display(HTML(f'<a href="{NVBOT_EVALUATION_URL}/docs" target="_blank">Swagger API Docs</a>'))

## 1) create evaluation project

require: system, model

In [None]:
# ✍️  Define each field

project_name = "nvinfo_mixtral_agent"
project_description = "nvinfo"
status = ""
model = "mixtral_agent"
system = "nvinfo"

email_subscription = "nvinfo-evaluation@exchange.nvidia.com"  # example: ""
# `"" or X` evaluates to X, so leaving the string empty falls back to NT_ACCOUNT_ID
# (presumably provided by the wildcard import from service_library.constants — confirm).
nt_account = "" or NT_ACCOUNT_ID


In [None]:

# Assemble the request body for the project-creation call from the fields
# defined in the previous cell.
project_info = dict(
    ProjectName=project_name,
    Description=project_description,
    Status=status,
    Model=model,
    System=system,
    EmailSubscription=email_subscription,
    NtAccount=nt_account,
)

# Echo the payload so it can be reviewed before it is sent.
print(project_info)

In [None]:
# Manual checkpoint: pretty-print the payload once more so it can be reviewed
# before the POST request in the following cell is executed.
print("Verify evaluation project:")
pprint.pprint(project_info)

print("\n When you have confirmed the above info, execute next shell")

In [None]:
# Submit the project definition to the evaluation service.
create_project_url = f"{NVBOT_EVALUATION_URL}/evaluations_project"
response = requests.post(create_project_url, headers=HEADER, json=project_info)

# Stays None unless the service answers with HTTP 200.
project_id_created = None
if response.status_code != 200:
    print(f'failed with status code {response.status_code}.')
    print(response.text)
else:
    print('request success.')
    # The JSON body of the successful response is kept as the new project's id.
    project_id_created = response.json()
    print(f"Project created in database, id: {project_id_created}.")


In [None]:
# Verify: read the project created above back from the evaluation service.
assert project_id_created is not None

project_creation_verification_response = requests.get(f"{NVBOT_EVALUATION_URL}/evaluations_project",
                                                      headers=HEADER,
                                                      params={
                                                          'is_active': 'true',
                                                          'project_id': project_id_created
                                                      })

if project_creation_verification_response.status_code == 200:
    # NOTE(review): this rebinds `project_info` from the POST payload (a dict)
    # to the GET response (indexed as a list); re-running the creation cell
    # afterwards would post the response instead of the payload.
    project_info = project_creation_verification_response.json()
    if project_info:
        print('project creation success.')
        print("project info: ")
        pprint.pprint(project_info[0])
    else:
        # An empty result would otherwise surface as an IndexError on [0].
        print(f"no active project returned for id {project_id_created}.")
else:
    # Report the failure instead of finishing silently.
    print(f'failed with status code {project_creation_verification_response.status_code}.')
    print(project_creation_verification_response.text)




## 2) create dataset

require:
1. An Excel file with "Query" and "Answer" columns. Optional additional columns can be provided as variables that are injected into the judge prompt. Example columns: "Query", "Answer", "Required Citations".
2. A judge prompt module (the "prompt_module") added to the directory `eval_prompt_library/metrics_eval_prompt/`. For example, a Python class named YourEvaluationPrompt; this class should define "output_format" and "template".
output_format: specifies the LLM-as-a-judge evaluation output format.
template: the judge prompt used for evaluation, into which the variables from item 1 can be inserted in snake case. For example, "Required Citations" is inserted as "{required_citations}".

Judge prompts can also be provided as JSON with "prompt_template" and "output_format" fields:
```
    judge_template = {
        "name": "single-ref-v1",
        "type": "single",
        "prompt_template": "",
        "description": "for general LLM response evaluation",
        "category": "general",
        "output_format": "",
        "system_prompt": "You are a helpful assistant."
    }
```

3. project_id
4. dataset_name (optional)


Loading Excel into current directory ... 

In [None]:
import pandas as pd
import os

# Upper bound on how many columns are printed in the previews below.
max_column_count_to_view = 10

filename = ""  # "nvinfo_queries.xlsx"
# Show the path together with the existence check; the boolean alone does not
# say which file was looked up.
print(f"🎈File to be loaded: {filename!r} (exists: {os.path.exists(filename)})")
df = pd.read_excel(filename)
print(f"Columns: \n{df.columns[:max_column_count_to_view]}\n")
# Preview the first row, limited to the first `max_column_count_to_view`
# columns. Slices are used on both axes: the previous scalar lookup
# `iloc[1, max_column_count_to_view]` read one cell and raised IndexError
# for sheets with fewer than 11 columns or fewer than 2 rows.
print(f"Sample: \n{df.iloc[:1, :max_column_count_to_view]}\n")

In [ ]:
# ✍️  Define each field
question_column_name = "Query"  # (will convert to snake case in prompt variable)
reference_column_name = "Correct Answer"  # (will convert to snake case in prompt variable)
additional_column_names_to_be_added_in_judge_prompt = [
    "Correct Answer"]  # (will convert to snake case in prompt as variable)

prompt_module_class_path_in_directory_eval_prompt_library = "eval_prompt_library.metrics_eval_prompt.MetricsEvaluationPrompt"
# prompt_module also needs to have the constants "output_format" and "template"

judgement_prompt_in_prompt_module = "eval_template"
# additional columns are expected to be embedded in the prompt in snake case with curly braces {}
# for example, the column "Require Citations" should be referenced as "{require_citations}"

# LLM evaluation post-processing: the names below are the keys expected as key-value pairs in the
# judge's response; please verify on build.nvidia.com with the provided judge prompts.
parsing_scores = ["Correctness Answer"]
# example: [
#         "Correctness Answer",
#         "Helpfulness",
#         "Empathy",
#         "Conciseness"
#     ]
parsing_text = ["Explanation"]  # example: ["Explanation"]

In [None]:
print(
    "If you are intended to add this dataset to an existing evaluation project, or created evaluation project in above 1) section, verify project_id below.")

# Prefer the id returned by the creation request in section 1); otherwise keep
# a project_id that was assigned manually for an existing project.
# globals().get() is used so that skipping section 1), or never assigning
# project_id, produces the explicit error below instead of a bare NameError.
if globals().get("project_id_created") is not None:
    project_id = project_id_created
elif globals().get("project_id") is None:
    raise NameError(
        "project_id is not set: create a project in section 1) or assign "
        "project_id to the id of an existing evaluation project."
    )

print("project_id:", project_id)

In [None]:
# Maps the logical dataset roles to the Excel column names chosen above.
column_map = dict(
    question=question_column_name,
    reference=reference_column_name,
    additional_params=additional_column_names_to_be_added_in_judge_prompt,
)

# Names the prompt module, the attributes to read from it, and the keys to
# parse out of the judge's response.
judge_config = dict(
    prompt_module=prompt_module_class_path_in_directory_eval_prompt_library,
    output_format="output_format",
    template=judgement_prompt_in_prompt_module,
    scorers=parsing_scores,
    parse_keys=parsing_text,
)


In [ ]:
# Verify columns existed in file
df_columns = list(df.columns)
required_columns = [
    question_column_name,
    reference_column_name,
    *additional_column_names_to_be_added_in_judge_prompt,
]
# Computed outside the f-string: a replacement field that spans several lines
# inside a single-quoted f-string is a SyntaxError before Python 3.12.
missing_columns = [name for name in required_columns if name not in df_columns]
print(f"All columns defined above is in the file {filename}: {not missing_columns}")
if missing_columns:
    # Name the offenders so the mapping can be corrected without guessing.
    print(f"Missing columns: {missing_columns}")


In [None]:
# Sanity Checking Required fields
# Serialise both configs; the upload handler sends them as JSON strings.
column_map_str, judge_config_str = json.dumps(column_map), json.dumps(judge_config)

print(column_map_str, judge_config_str, sep="\n")

# Round-trip each string to confirm that it decodes back to a JSON object.
assert isinstance(json.loads(column_map_str), dict), "column map should be parsable to a dict"
assert isinstance(json.loads(judge_config_str), dict), "judge config should be parsable to a dict"

In [None]:
# prepare excel dataset

# Check against every column of the sheet rather than only the first
# `max_column_count_to_view` ones, so a mapped column that sits further to the
# right is not wrongly reported as missing.
all_columns = list(df.columns)
print(
    f"1. Verify column_map is correct, columns exist in your excel dataset (case matters): {all(col in all_columns for col in [question_column_name, reference_column_name, *additional_column_names_to_be_added_in_judge_prompt])}\n")

print(
    f"2. check judge_config is located in current path, such as `eval_prompt_library.metrics_eval_prompt` and constants `output_format` and `template` has been defined:\n{judge_config}\n")

print(f"3. Check project_id to be located in previous creation request, or define an existed project: {project_id}\n")



In [None]:
import requests
from ipywidgets import FileUpload

# File picker restricted to a single file; its change handler is attached further down in this cell.
upload_widget = FileUpload(multiple=False, description="Select a dataset file")
# Set by handle_file_upload once the service accepts an upload.
dataset_upload_response = None


# Function to handle the file upload and send a POST request
def handle_file_upload(change):
    """Upload the file selected in the widget to the evaluation service.

    Registered as an observer on the upload widget's ``value`` trait. On
    success the parsed JSON response is stored in the module-level
    ``dataset_upload_response``.

    Args:
        change: Change notification from the widget. ``change["new"]`` is
            indexed as a sequence of file entries, each providing ``"name"``
            and ``"content"``.

    Raises:
        Exception: If the service answers with a non-success status.
    """
    global dataset_upload_response
    uploads = change["new"]
    if not uploads:
        # An empty value (e.g. presumably after the selection is cleared) has
        # no file to send; indexing it would raise IndexError.
        print("no file selected")
        return

    uploaded_file = uploads[0]
    print("uploading file....")
    file_content = uploaded_file["content"]

    file = (uploaded_file["name"], file_content)
    params = {
        "column_map_str": column_map_str,
        "judge_config": judge_config_str,
        "eval_type": "llm_as_a_judge",
        "project_id": project_id,
    }
    api_url = f"{NVBOT_EVALUATION_URL}/dataset"
    response = requests.post(api_url, files={"dataset_file": file}, params=params)

    print(response.status_code)
    if not response.ok:
        # Error bodies are not guaranteed to be JSON, so print the raw text
        # rather than calling response.json() before the status is known.
        print(response.text)
        raise Exception("error uploading")

    dataset_upload_response = response.json()
    print(dataset_upload_response)
    print("successfully uploaded")


# Attach the file upload handler to the widget
# (it is called for changes of the widget's "value" trait).
upload_widget.observe(handle_file_upload, names="value")

# Display the widget
print("CLICK The Button to upload your dataset to evaluation server")
display(upload_widget)

In [None]:
# Guard: the upload handler assigns dataset_upload_response only after a successful upload.
assert dataset_upload_response is not None, "You did not upload anything. Click the button above"

In [None]:
# Show the raw upload response, then keep only its "dataset_config" section
# for the cells that follow.
print("Copy response for dataset creation response")
pprint.pprint(dataset_upload_response)
dataset_response = dataset_upload_response.get('dataset_config')

## verify dataset

In [None]:
import requests
from IPython.display import display, HTML

# Ask the evaluation service for the dataset that was just uploaded.
dataset_id = dataset_response.get('id')
url = f"{NVBOT_EVALUATION_URL}/datasets/download/{dataset_id}"
headers = {
    'accept': 'application/json'
}

download_response = requests.get(url, headers=headers)
download_url = None
if download_response.status_code == 200:
    download_url = download_response.url
    print(f"Download dataset request completed: {download_url}")
    # Only render the link on success; previously a failed request still
    # produced a link pointing at href="None".
    display(HTML(f'<a href="{download_url}" target="_blank">Download Dataset</a>'))
else:
    print('Failed to download dataset:', download_response.status_code, download_response.text)


In [None]:
# verify dataset structure

import requests
import pprint

# `dataset_response` already holds the "dataset_config" section of the upload
# response, so the id is read from it directly (as the download cell does);
# indexing "dataset_config" a second time fails.
dataset_id = dataset_response.get("id")
url = f"https://datastore.stg.llm.ngc.nvidia.com/v1/datasets/{dataset_id}"
headers = {'accept': 'application/json'}

response = requests.get(url, headers=headers)

if response.status_code == 200:
    print("Response:")
    pprint.pprint(response.json())
else:
    print('Failed to get dataset:', response.status_code, response.text)



## 3) generate evaluation schema

### 3.1 LLM-as-a-judge: 
LLM can be evaluated by using another LLM as a judge (Generally, an LLM regarded as a high quality model should be used as the judge).


In [None]:
# ✍️ Define each field

# Recipients passed to the schema's Notification; replace the placeholder address.
email_subscription_list = ["yourname@nvidia.com"]
# Deliberately fails until the placeholder above has been replaced.
assert email_subscription_list[0] != "yourname@nvidia.com", "enter a real email address"
env = "dev"  # example: dev, stg, prd
model_name = "mistralai/mixtral-8x22b-instruct-v0.1"  # please select available model in build.nvidia.com, e.g: https://build.nvidia.com/mistralai/mixtral-8x22b-instruct


In [None]:
# A project id is required: it is embedded in the payload's `extra_body`.
assert project_id

# Identifies the project, environment and bot (system/model) for inference.
bot_routing = {
    "project_id": project_id,
    "env": env,
    "system": system,
    "model": model
}

# Generation settings applied to the judge model.
judge_sampling = {
    "top_p": 0.1,
    "top_k": 40,
    "temperature": 0,
    "stop": [],
    "tokens_to_generate": 250
}

llm_as_a_judge_evaluator_payload = {
    "eval_type": "llm_as_a_judge",
    "eval_subtype": "mtbench",
    "mode": "single",
    "judge_inference_params": judge_sampling,
    # Start from the shared defaults and add the routing information.
    "inference_params": {**EVAL_INFERENCE_DEFAULT, "extra_body": bot_routing}
}


In [None]:
print("Verify below content for dataset config, which fills the template based on dataset creation response above")

# Describe the uploaded dataset using the fields returned when it was created.
dataset_config = dict(
    Engine="Datastore",
    Name=dataset_response.get("name"),
    DatasetId=dataset_response.get("id"),
    DatasetFolder=dataset_response.get("name"),
    Files=dataset_response.get("files"),
)

# The single LLM-as-a-judge evaluator entry: judge model, payload and judge config.
llm_evaluator_entry = dict(
    name="llm_evaluation",
    model=dict(llm_name=model_name),
    evaluator_payload=llm_as_a_judge_evaluator_payload,
    judge_config=judge_config,
)

nemo_evaluator = dict(DatasetConfig=dataset_config, Evaluators=[llm_evaluator_entry])

In [None]:
# Pretty-print the assembled payload for a final manual review.
print("Verify the NemoEvaluator payload for LLM-as-a-judge: \n")
pprint.pprint(nemo_evaluator, width=120)


In [None]:
from data_models.api.run_maker import EvaluationSchema, Notification, NemoEvaluator

# Build the two parts of the schema from the values prepared above, then
# combine them into the EvaluationSchema object.
notification = Notification(EmailRecipients=email_subscription_list)
parsed_nemo_evaluator = NemoEvaluator.parse_obj(nemo_evaluator)

evaluation_schema = EvaluationSchema(
    Notification=notification,
    NemoEvaluator=parsed_nemo_evaluator,
)

In [None]:
import json


def create_json_in_codebase(data, file_name):
    """Write ``data`` as indented JSON to ``file_name``.

    Args:
        data: JSON-serialisable object to store.
        file_name: Destination path; missing parent directories are created.
    """
    parent_dir = os.path.dirname(file_name)
    # dirname() is "" for a bare file name and os.makedirs("") raises
    # FileNotFoundError, so only create directories when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_name, 'w', encoding="utf-8") as json_file:
        json.dump(data, json_file, indent=4)


# Store the schema under the top-level "EvaluationSchema" key in ../../asset/<project_name>.json.
create_json_in_codebase({"EvaluationSchema": evaluation_schema.dict()}, f"../../asset/{project_name}.json")

In [None]:
# Final hint for the user; the emoji was mis-encoded in the message and is restored here.
print(f"Great job 👏! Now please check on /asset/{project_name}.json, and commit the MR.")

### 3.2 Custom Evaluation: 
Compares the LLM-generated response with a ground-truth response and produces scores from the selected scorers.
Scorers available: ["accuracy", "bleu", "rouge", "em", "f1", "bert"]

In [1]:
# ✍️  Define each field
# Recipients used as "EmailRecipients" in the schema below; replace the placeholder address.
email_subscription_list = ["yourname@nvidia.com"]

# Deliberately fails until the placeholder above has been replaced.
assert email_subscription_list[0] != "yourname@nvidia.com", "enter a real email address"


In [2]:
# ✍️  Define each field
question_column_name = "Query"  # (will convert to snake case in prompt variable)
reference_column_name = "Correct Answer"  # (will convert to snake case in prompt variable)
# Build the map from the variables above so that a column name only has to be
# edited in one place (the map previously repeated the literals).
column_map = {
    "question": question_column_name,
    "reference": reference_column_name,
    "answer": ""
}

In [3]:
# ✍️  Define each field
# Scorers to apply in the custom evaluation; the trailing comment lists the available ones.
scorers = ["bleu", "rouge", "bert"]  #  ["accuracy", "bleu", "rouge", "em", "f1", "bert"]

In [5]:
import os

# Resolve the directory two levels above the notebook's working directory.
_notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(_notebook_dir, "../../"))

# Register the root on the import path only if it is not already listed,
# so re-running the cell does not add duplicates.
if not any(entry == project_root for entry in sys.path):
    sys.path.append(project_root)

In [49]:
from service_library.handler.s3_dataset_handler import S3DatasetHandler
import requests
from ipywidgets import FileUpload

# File picker restricted to a single file; its change handler is attached further down in this cell.
upload_widget = FileUpload(multiple=False, description="Select a dataset file")
# Set by handle_file_upload once the service accepts an upload.
dataset_upload_response = None

max_column_count_to_view = 10

# Prerequisites from section 1); uncomment an assignment to set a value here.
# system = ""
assert system, "require system to be defined above"

# model = ""
# The message previously named "system" although the check is on `model`.
assert model, "require model to be defined above"

# project_name = ""
assert project_name, "require project_name to be defined above"

# Function to handle the file upload and send a POST request
def handle_file_upload(change):
    """Upload the selected file to the evaluation service for custom evaluation.

    Registered as an observer on the upload widget's ``value`` trait. On
    success the parsed JSON response is stored in the module-level
    ``dataset_upload_response``.

    Args:
        change: Change notification from the widget. ``change["new"]`` is
            indexed as a sequence of file entries, each providing ``"name"``
            and ``"content"``.

    Raises:
        Exception: If the service answers with a non-success status.
    """
    global dataset_upload_response
    uploads = change["new"]
    if not uploads:
        # An empty value (e.g. presumably after the selection is cleared) has
        # no file to send; indexing it would raise IndexError.
        print("no file selected")
        return

    uploaded_file = uploads[0]
    print("uploading file....")
    file_content = uploaded_file["content"]

    file = (uploaded_file["name"], file_content)
    params = {
        "eval_type": "custom_evaluation",
        "project_id": project_id,
    }
    api_url = f"{NVBOT_EVALUATION_URL}/dataset"
    response = requests.post(api_url, files={"dataset_file": file}, params=params)

    print(response.status_code)
    if not response.ok:
        # Error bodies are not guaranteed to be JSON, so print the raw text
        # rather than calling response.json() before the status is known.
        print(response.text)
        raise Exception("error uploading")

    dataset_upload_response = response.json()
    print(dataset_upload_response)
    print("successfully uploaded")

# Attach the file upload handler to the widget
# (it is called for changes of the widget's "value" trait).
upload_widget.observe(handle_file_upload, names="value")

# Display the widget
print("CLICK The Button to upload your dataset to evaluation server")
display(upload_widget)

ConnectionError: HTTPConnectionPool(host='localhost', port=8200): Max retries exceeded with url: /v1/auth/approle/nvbot/login (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x118621720>: Failed to establish a new connection: [Errno 61] Connection refused'))

In [ ]:
# Show the response stored by the upload handler (still None if no upload has succeeded).
dataset_upload_response

To define which fields are extracted from the bot response (and later logged as part of the results), we suggest:

1. Execute the command to print out the bot response.
2. Find a helper method to extract the response, and add functions to `service_library.run.parser_library.response_parser` if you have specific needs that are not covered yet.
3. Verify that the functions you defined can be executed sequentially, and that the extracted answer has the expected key-value pairs.

Define columns to compare against. 
In general, we expect a column for 'Ground truth' and a column for 'Bot response' to compare. We also support selecting columns to compare intermediate steps, such as 'Rephrased Query' or 'SQL query', against a column for 'Query Ground truth'.


In [ ]:
# Template of the run configuration for the custom evaluation. The dict is
# only evaluated as the cell's last expression; this cell neither assigns it
# to a name nor writes it to a file.
{
    # Evaluation part: filled from the variables defined in the cells above.
    "EvaluationSchema": {
        "Notification": {
            "EmailRecipients": email_subscription_list
        },
        "NemoEvaluator": {
            "DatasetConfig": {
                "Engine": "local"
            },
            "Evaluators": [
                {
                    "name": "custom_evaluation",
                    "column_map": column_map,
                    "model": {
                        "llm_name": "mistralai/mixtral-8x22b-instruct-v0.1"
                    },
                    "evaluator_payload": {
                        "eval_type": "automatic",
                        "eval_subtype": "custom_eval",
                        "input_file": "",
                        "inference_configs": [
                            {
                                "run_inference": False,
                                "inference_params": {
                                    "tokens_to_generate": 600,
                                    "temperature": 0,
                                    "top_k": 1
                                }
                            }
                        ],
                        "num_of_samples": -1,
                        "scorers": scorers
                    }
                }
            ]
        }
    },
    # Regression part: dataset location plus the input/output mapping of a run.
    "RegressionSchema": {
        "Notification": {
        },
        # NOTE(review): folder and name refer to "nvbot_for_nvhelp_mixtral_agent",
        # not to this notebook's project_name — confirm this is intended.
        "DatasetConfig": {
            "Engine": "s3",
            "DatasetFolder": "nvbot_for_nvhelp_mixtral_agent",
            "Name": "nvbot_for_nvhelp_mixtral_agent",
            "RunFile": "dataset/default_evaluation.xlsx",
            "DatasetPath": "dataset/default_evaluation.xlsx"
        },
        "RunConfig": {
            # Inputs: the first entry is an empty placeholder; the second names
            # a parser function and the columns passed to it as args.
            "Inputs": [
                {
                    "name": "",
                    "type": "Attribute",
                    "value": ""
                },
                {
                    "name": "",
                    "type": "Function",
                    "value": "service_library.run.parser_library.input_parser.generate_fulfillment_request",
                    "args": [
                        "Query",
                        "Category"
                    ]
                }
            ],
            # Outputs: each entry maps an output name to a dotted attribute path
            # (presumably resolved against the bot response — confirm).
            "Outputs": [
                {
                    "name": "Text",
                    "type": "Attribute",
                    "value": "Response.Json.Text.text"
                },
                {
                    "name": "AgentAction",
                    "type": "Attribute",
                    "value": "Response.CleanedText"
                },
                {
                    "name": "Json",
                    "type": "Attribute",
                    "value": "Response.Json"
                },
                {
                    "name": "IRResults",
                    "type": "Attribute",
                    "value": "CustomData.IRResults"
                }
            ]
        }
    }
}