# Bot onboarding

Requirements:
- Bot definition: system, model
- [Suggested] Evaluation dataset (Excel): question, ground truth
- LLM-as-a-judge: judge prompt and judge parameters (if a question needs extra parameters, such as "Require Citation", add them as columns in the dataset Excel file)
    
Steps:


- 1) Create the evaluation project (if it does not exist). A bot configuration can optionally be added; if it is not provided, Model and System are used to fetch the bot config.

- 2) prepare dataset - queries, categories, reference answer

- 3) generate answer based on NVplatform backend 

- 4) launch evaluation job


In [None]:
# ! pip3 install ipywidgets
# ! pip3 install huggingface_hub

In [None]:
from service_library.constants import *
import requests
import huggingface_hub as hh
import os
import json

import ipywidgets as widgets
from IPython.display import display

import sys

sys.path.append(os.getcwd())

# Base URL of the evaluation service; the request cells below append their
# endpoint paths (e.g. /evaluations_project, /datasets/download/<id>) to it.
NVBOT_EVALUATION_URL: str = "https://devbot-api.nvidia.com/evaluation"
# Headers sent with the JSON POST request below: the body is JSON and a JSON
# reply is requested.
HEADER = {
    'accept': 'application/json',
    'Content-Type': 'application/json'
}


## 1) create evaluation project

require: system, model

In [ ]:
# ✍️ Define each field
# These values are copied verbatim into the project-creation payload
# (`project_info`) built in the next cell.
project_name = ""  # example: "scout_mixtral_agent"

project_description = ""  # example: "scout mixtral agent"
status = ""  # example: "healthy"
model = ""  # example: "mixtral_agent"
system = ""  # example: "scout"

email_subscription = ""  # example: ""
# An empty string is falsy, so leaving "" here falls back to NT_ACCOUNT_ID.
# NOTE(review): NT_ACCOUNT_ID is not defined in this notebook; presumably it
# comes from the `service_library.constants` star import - confirm.
nt_account = "" or NT_ACCOUNT_ID


In [None]:

# Construct the request body for the project-creation POST from the fields
# defined in the previous cell. The keys are the field names sent to the
# /evaluations_project endpoint.
project_info = {
    "ProjectName": project_name,
    "Description": project_description,
    "Status": status,
    "Model": model,
    "System": system,
    "EmailSubscription": email_subscription,
    "NtAccount": nt_account
}

# Print the dictionary so it can be reviewed before it is sent
print(project_info)

In [None]:
# Show the payload one more time so it can be checked by eye before the
# POST request in the next cell is executed.
print("Verify evaluation project:", project_info, sep="\n")
print("\nIf confirm on above info, execute next shell")

In [None]:

# Create the evaluation project by POSTing `project_info` as JSON.
response = requests.post(
    f"{NVBOT_EVALUATION_URL}/evaluations_project",
    headers=HEADER,
    data=json.dumps(project_info),
)

# Stays None unless the service confirms creation; the dataset cell below
# reads this name to pre-fill `project_id`.
project_id_created = None

# Check if the request was successful (status code 200)
if response.status_code == 200:
    print('evaluations_project post request successful.')
    # A requests.Response has no .get(); the JSON body must be decoded first.
    try:
        response_body = response.json()
    except ValueError:
        # Body was not valid JSON: keep project_id_created as None and show
        # the raw text so the problem is visible.
        response_body = None
        print(response.text)
    if isinstance(response_body, dict):
        project_id_created = response_body.get("id")
    print(f"Project created: {project_id_created}")
else:
    print(f'failed with status code {response.status_code}.')
    print(response.text)


## 2) create dataset

Requirements:
1. An Excel file with "Query" and "Answer" columns. Optional extra columns can be provided; their values are passed as variables into the judge prompt. Example columns: "Query", "Answer", "Required Citations".
2. A judge prompt module (the `prompt_module`) added under the directory `eval_prompt_library/metrics_eval_prompt/`. For example, a Python class named `YourEvaluationPrompt`; the class should define "output_format" and "template".
output_format: specifies the output format of the LLM-as-a-judge evaluation.
template: the judge prompt used for evaluation, into which the variables from item 1 are inserted in snake case. For example, "Required Citations" is inserted as "{required_citations}".

Judge prompts can also be provided as JSON containing "prompt_template" and "output_format", for example:
```
    judge_template = {
        "name": "single-ref-v1",
        "type": "single",
        "prompt_template": "",
        "description": "for general LLM response evaluation",
        "category": "general",
        "output_format": "",
        "system_prompt": "You are a helpful assistant."
    }
```

3. project_id
4. dataset_name (optional)



In [None]:
# ✍️ Define each field
# Names of the Excel columns holding the question and the reference answer
# (case matters - they must match the spreadsheet exactly).
question_column_name = ""  # example: "Query"
reference_column_name = ""  # example: "Answer"
# Extra Excel columns to expose as variables in the judge prompt.
additional_column_names_to_be_added_in_judge_prompt = []

prompt_module_class_path_in_directory_eval_prompt_library = ""  # example: "eval_prompt_library.metrics_eval_prompt.MetricsEvaluationPrompt"
# prompt_module also needs to define the constants "output_format" and "template"

judgement_prompt_in_prompt_module = "eval_template_v2"
# additional columns are expected to be embedded in the prompt in snake case, wrapped in curly braces {}
# for example, the column "Require Citations" should be referenced as "{require_citations}"

# LLM evaluation post-processing: the names below are the keys of the key-value pairs
# expected in the judgement response; please verify them on build.nvidia.com with the provided judge prompts.
parsing_scores = []
# example: [
#         "Correctness Answer",
#         "Helpfulness",
#         "Empathy",
#         "Conciseness"
#     ]
parsing_text = []  # example: ["Explanation"]
project_id = None  # example: 1
# Reuse the id returned by the project-creation cell when it produced one.
# NOTE: `project_id_created` is assigned in that cell; if it has not been run
# in this session, the next line raises NameError.
if project_id_created is not None:
    project_id = project_id_created


In [ ]:
# Maps the roles used for the dataset (question, reference answer, extra
# judge-prompt parameters) to the Excel column names chosen above.
column_map = {
    "question": question_column_name,
    "reference": reference_column_name,
    "additional_params": additional_column_names_to_be_added_in_judge_prompt}

# Judge configuration: which prompt module to load, which of its constants
# hold the output format and the prompt template, and which keys to parse
# out of the judgement response.
judge_config = {
    "prompt_module": prompt_module_class_path_in_directory_eval_prompt_library,
    # Name of the constant inside the prompt module, not the format itself.
    "output_format": "output_format",
    "template": judgement_prompt_in_prompt_module,
    "scorers": parsing_scores,
    "parse_keys": parsing_text
}


In [ ]:
# prepare excel dataset

# Echo the three inputs of the dataset-creation request, each preceded by a
# short reminder of what to double-check.
print(
    "Verify column_map is correct, columns exist in your excel dataset (case matters):\n",
    column_map,
    sep="\n",
)
print(
    "Check judge_config is located in current path, such as `eval_prompt_library.metrics_eval_prompt` and constants `output_format` and `template` has been defined:\n",
    judge_config,
    sep="\n",
)
print(
    "Check project_id to be located in previous creation request, or define an existed project: \n",
    project_id,
    sep="\n",
)

# Final instruction: the dataset itself is uploaded through the API, not here.
print("\nIf confirm on above info (please double check with dataset provided (example screenshot below), execute POST request https://devbot-api.nvidia.com/evaluation/dataset.")


![dataset_upload](./images/dataset_upload.png){: width="100px"}

In [None]:
print("Copy response for dataset creation response")
# ✍️ Paste the response returned after the dataset was created, as a dict.
# The placeholder is an empty dict (not a string) so the `.get` lookup below
# works even before a real response has been pasted in.
dataset_response = {}

# example response:
#
# dataset_response = {
#   "Datastore": {
#     "id": "dataset-Fqus9cyeLrMZGmUWgpCNxu",
#     "path": "nvidia/nvhelp_groundtruth-llm_as_a_judge-0708-1547-Dlk",
#     "name": "nvhelp_groundtruth-llm_as_a_judge-0708-1547-Dlk",
#     "files": [
#       "judge_prompt_parameters/correct_answer.json",
#       "judge_prompt_parameters/empathy_expected.json",
#       "judge_prompt_parameters/helpfulness_criteria.json",
#       "judge_prompt_parameters/required_citations.json",
#       "judge_prompt_parameters/short_answer_expected.json",
#       "judge_prompts.jsonl",
#       "question.jsonl",
#       "reference_answer/reference.jsonl"
#     ]
#   }
# }

# The "Datastore" entry of the response; None while the placeholder above is
# still empty.
datastore = dataset_response.get("Datastore")

#### (optional) Download and verify dataset

In [None]:
import requests
from IPython.display import HTML

from IPython.display import display, HTML

# Ask the evaluation service for the dataset that was just created, using the
# id found under "Datastore" in the creation response.
dataset_id = dataset_response["Datastore"].get("id")
url = f"{NVBOT_EVALUATION_URL}/datasets/download/{dataset_id}"
headers = {
    'accept': 'application/json'
}

download_response = requests.get(url, headers=headers)

# Stays None when the request fails, so later cells can tell the download
# link is unavailable.
download_url = None
if download_response.status_code == 200:
    download_url = download_response.url
    print(f"Download dataset request completed: {download_url}")
    # Render the link only on success; otherwise it would point at "None".
    display(HTML(f'<a href="{download_url}" target="_blank">Download Dataset</a>'))
else:
    # Report the failure the same way the project-creation cell does.
    print(f'failed with status code {download_response.status_code}.')
    print(download_response.text)

In [None]:
# verify dataset structure



## 3) generate evaluation schema


In [None]:
# ✍️ Define each field

email_subscription_list = []
env = ""  # example: dev, stg, prd
model_name = ""  # models available here: https://build.nvidia.com/search?term=Chat, example: "mistralai/mixtral-8x22b-instruct-v0.1"

# Payload for the LLM-as-a-judge evaluator.
# NOTE: `project_id` is read from the dataset-fields cell above, which must
# have been run first.
llm_as_a_judge_evaluator_payload = {
    "eval_type":
        "llm_as_a_judge",
    "eval_subtype": "mtbench",
    "mode": "single",
    # Generation settings for the judge model.
    "judge_inference_params": {
        "top_p": 0.1,
        "top_k": 40,
        "temperature": 0,
        "stop": [],
        "tokens_to_generate": 250
    },
    # Settings for the model under evaluation: the shared defaults plus the
    # project id and environment passed through as `extra_body`.
    # NOTE(review): EVAL_INFERENCE_DEFAULT is not defined in this notebook;
    # presumably it comes from the `service_library.constants` star import - confirm.
    "inference_params": {
        **EVAL_INFERENCE_DEFAULT,
        "extra_body": {
            "project_id": project_id,
            "env": env
        }
    }
}


In [None]:
print("Verify below content for dataset config, which fills the template based on dataset creation response above")

# In the dataset-creation response the dataset fields ("id", "name", "files")
# are nested under the "Datastore" key, so they must be read from there.
# Reading them from the top level returns None for every field. If only the
# inner object was pasted (no "Datastore" key), fall back to the top level so
# that shape keeps working.
_datastore_info = dataset_response.get("Datastore", dataset_response)

dataset_config = {
    "Engine": "Datastore",
    "Name": _datastore_info.get("name"),
    "DatasetId": _datastore_info.get("id"),
    # The dataset name is reused as the folder name.
    "DatasetFolder": _datastore_info.get("name"),
    "Files": _datastore_info.get("files"),
}

In [ ]:
print(f"Verify the NemoEvaluator payload for LLM-as-a-judge")

# Combine the dataset config with a single evaluator entry. The entry bundles
# the model to run, the evaluator payload and the judge configuration, all
# defined in the earlier cells.
nemo_evaluator = {
    "DatasetConfig": dataset_config,
    "Evaluators": [
        {
            "name": "llm_evaluation",
            "model": {
                "llm_name": model_name
            },
            "evaluator_payload": llm_as_a_judge_evaluator_payload,
            "judge_config": judge_config
        }
    ]

}

In [ ]:
from data_models.api.run_maker import EvaluationSchema, Notification, NemoEvaluator

# Build the evaluation schema: the notification recipients plus the
# NemoEvaluator section, which is validated from the plain `nemo_evaluator`
# dict assembled above.
evaluation_schema = EvaluationSchema(
    Notification = Notification(
        EmailRecipients = email_subscription_list
    ),
    NemoEvaluator = NemoEvaluator.model_validate(nemo_evaluator)        
)

In [ ]:
import json
import os
# Base name (without extension) of the JSON file written under ../../asset/.
# NOTE(review): this rebinds the `project_name` set in the project-creation
# section to a hard-coded value - confirm it is intended and adjust it for
# your project.
project_name = "eval_nvinfo_mixtral_agent_prd"
def create_json_in_codebase(data, file_name):
    """Write *data* as indented JSON to *file_name*, creating parent folders.

    Args:
        data: JSON-serializable object to write.
        file_name: Destination path. Missing parent directories are created.
            A bare file name (no directory part) is written to the current
            working directory.
    """
    parent_dir = os.path.dirname(file_name)
    # os.makedirs("") raises FileNotFoundError, so skip directory creation
    # when the path has no directory component.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # Explicit encoding keeps the output independent of the platform default.
    with open(file_name, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, indent=4)
        
# Export the schema to ../../asset/<project_name>.json. `model_dump()` is the
# pydantic v2 replacement for the deprecated `.dict()`; the schema classes
# already use the v2 API (`model_validate`).
create_json_in_codebase({"EvaluationSchema": evaluation_schema.model_dump()}, f"../../asset/{project_name}.json")

In [ ]:
# Closing hint: tell the user where the generated file is and what to do next.
# The emoji was mis-encoded in the original message and is restored here.
print(f"Great job 👏! Now please check on /asset/{project_name}.json, and commit the MR.")