In [None]:
%cd ../
%load_ext autoreload
%autoreload 2

import numpy as np
from loguru import logger
from datetime import datetime
import sys

# Make modules under src/ importable before importing `evaluator`.
# The relative path is resolved against the current working directory, which
# the `%cd ../` at the top of this cell has just changed -- presumably to the
# repository root; confirm when running from a different location.
sys.path.append("src/")
import evaluator
import toml

In [None]:
# Load launch parameters and per-model endpoints/credentials from the
# Streamlit secrets file (path is relative to the current working directory).
secrets = toml.load(".streamlit/secrets.toml")


def _build_model_dict(model_name):
    """Return the connection settings for ``model_name``.

    The result has the keys ``model``, ``api_base`` and ``api_key``; the
    latter two are looked up in ``secrets["api_bases"]`` and
    ``secrets["api_keys"]`` under the model's own name. Raises ``KeyError``
    if the model has no entry in either table.
    """
    return {
        "model": model_name,
        "api_base": secrets["api_bases"][model_name],
        "api_key": secrets["api_keys"][model_name],
    }


# Every role goes through the same helper so each dict is guaranteed to carry
# its *own* model name. (The evaluator and provocator dicts used to be built by
# hand and mistakenly reused the analyzer's model name.)
chatbot_model = secrets["launch_parameters"]["chatbot_model"]
chatbot_model_dict = _build_model_dict(chatbot_model)

analyzer_model = secrets["launch_parameters"]["analyzer_model"]
analyzer_model_dict = _build_model_dict(analyzer_model)

evaluator_model = secrets["launch_parameters"]["evaluator_model"]
evaluator_model_dict = _build_model_dict(evaluator_model)

provocator_model = secrets["launch_parameters"]["provocator_model"]
provocator_model_dict = _build_model_dict(provocator_model)

# Order matters to the consumer: chatbot, analyzer, evaluator. The provocator
# dict is intentionally left out here and only added when it is enabled.
model_dicts = [chatbot_model_dict, analyzer_model_dict, evaluator_model_dict]

In [None]:
# Example personas for the provocator. This list is not referenced by the
# code in this cell; presumably it documents the values accepted for
# `provocator_mode` in the secrets file -- TODO confirm.
provocator_modes = [
    "hacker that wants to jailbreak the chatbot",
    "a person that types with a lot of typos and grammar mistakes",
    "a person who wants to make fun out of the chatbot",
]
use_provocator = secrets["launch_parameters"]["use_provocator"]
provocator_mode = secrets["launch_parameters"]["provocator_mode"]

# Keep this cell idempotent: first drop any provocator entry left behind by a
# previous run of the cell, then add it back only if it is enabled. Without
# this, every re-run appended another copy to `model_dicts`.
# The comparison is by identity (`is`), not equality, so a different role that
# happens to have identical settings is never removed by accident.
model_dicts[:] = [
    model_dict for model_dict in model_dicts if model_dict is not provocator_model_dict
]
if use_provocator:
    model_dicts.append(provocator_model_dict)

In [None]:
def _redact_api_key(model_dict):
    """Return a copy of ``model_dict`` with the ``api_key`` value masked.

    Key order and all other values are preserved; a dict without an
    ``api_key`` entry is copied unchanged. Used so credentials never end up
    in the plain-text log files.
    """
    return {
        key: ("***" if key == "api_key" else value)
        for key, value in model_dict.items()
    }


# Scores accumulated across scenarios; the running lists (with mean and std)
# are appended to each scenario's log file after it finishes.
factual_correctness_list = []
appropriateness_list = []
for i in range(1, 20):  # scenario ids 1..19
    # One log file per scenario, named by start time and scenario id.
    # NOTE(review): the ':' characters in the timestamp are not valid in
    # Windows file names.
    formatted_now = datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
    logfile = f"logs/{formatted_now}_{i}.txt"

    # Header: which model configuration each role used for this run.
    header_entries = [
        ("evaluator_model_dict", evaluator_model_dict),
        ("chatbot_model_dict", chatbot_model_dict),
        ("analyzer_model_dict", analyzer_model_dict),
    ]
    if use_provocator:
        header_entries.append(("provocator_model_dict", provocator_model_dict))
    with open(logfile, "w") as fout:
        for label, model_dict in header_entries:
            # API keys are masked: log files must not contain credentials.
            fout.write(label + " " + str(_redact_api_key(model_dict)) + "\n")

    # Attach a loguru sink for this scenario only and detach exactly that
    # sink afterwards. A bare `logger.remove()` would also drop every other
    # sink (including the default stderr one), and without `finally` a failing
    # scenario would leave its file sink attached for later runs.
    handler_id = logger.add(logfile)
    try:
        factual_correctness, appropriateness = evaluator.evaluate_scenario(
            i,
            model_dicts,
            provocator_mode,
        )
    finally:
        logger.remove(handler_id)
    factual_correctness_list.append(factual_correctness)
    appropriateness_list.append(appropriateness)

    # Footer: cumulative scores so far, followed by their mean and std.
    with open(logfile, "a") as fout:
        for scores in (factual_correctness_list, appropriateness_list):
            scores_array = np.array(scores)
            print(scores, np.mean(scores_array), np.std(scores_array), file=fout)
        print("#" * 20, file=fout)