In [1]:
# Make src/ the working directory. The cells below rely on it: they import
# `chatbot` and open the relative paths "prompts/evaluator_prompt.txt" and
# "../evaluator_scenarios/...".
# NOTE(review): presumably chatbot.py lives in src/ - confirm.
%cd ../src

/Users/mikhail_zybin/food-order-chat-bot/src


In [2]:
import os

import openai
from loguru import logger

import chatbot

In [3]:
def read_scenario(scenario_id: int) -> list[dict[str, str]]:
    """Load a scripted dialogue and convert it into chat messages.

    Reads ``../evaluator_scenarios/scenario{scenario_id}.txt`` relative to the
    current working directory. Everything before the first line that starts
    with ``Chatbot`` is treated as commentary and skipped. From there on, a
    line starting with ``User`` or ``Chatbot`` (after stripping leading
    whitespace) opens a new turn; any other line continues the current turn.

    Args:
        scenario_id: Number substituted into the scenario file name.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts. The first entry is
        always a synthetic user message, "Please help me to order food".
        Chatbot turns get the role ``"assistant"`` and user turns the role
        ``"user"``. The last turn in the file is not included, because a turn
        is only emitted when the next speaker prefix is encountered.

    Raises:
        ValueError: If no line in the file starts with ``Chatbot``.
    """
    scenario_path = f"../evaluator_scenarios/scenario{scenario_id}.txt"
    with open(scenario_path, "r", encoding="utf-8") as fin:
        scenario = fin.readlines()

    # Skip the commentary that precedes the first chatbot turn.
    first_turn = next(
        (index for index, line in enumerate(scenario) if line.startswith("Chatbot")),
        None,
    )
    if first_turn is None:
        raise ValueError(f"No line starting with 'Chatbot' found in {scenario_path}")

    messages = []
    # Synthetic opener: it becomes the user message preceding the first chatbot turn.
    current_replica = "Please help me to order food"
    for line in scenario[first_turn:]:
        line = line.lstrip()
        if line.startswith("User"):
            # A user turn begins, so the text collected so far was said by the chatbot.
            messages.append({"role": "assistant", "content": current_replica.strip()})
            # Skip the first 5 characters (presumably the "User:" prefix).
            current_replica = line[5:]
        elif line.startswith("Chatbot"):
            # A chatbot turn begins, so the text collected so far was said by the user.
            messages.append({"role": "user", "content": current_replica.strip()})
            # Skip the first 9 characters (presumably the "Chatbot: " prefix).
            current_replica = line[9:]
        else:
            # Continuation line of a multi-line turn.
            current_replica += line

    # NOTE(review): the turn still held in current_replica is dropped here.
    # Confirm whether a scenario ending with a chatbot turn should have that
    # final turn included.
    return messages


def evaluate_ai_reply(
    template: str,
    messages: list[dict[str, str]],
    predicted_message,
    ground_truth,
    chatbot_data,
    model: str,
    client: openai.OpenAI,
    temperature: float = 0.0,
) -> str:
    """Ask the evaluator model to judge one predicted chatbot reply.

    Args:
        template: Format string whose positional fields are filled, in order,
            with ``chatbot_data``, ``str(messages)``, ``predicted_message`` and
            ``ground_truth``.
        messages: Conversation history; it is inserted as its ``str()`` form.
        predicted_message: Reply to be judged.
        ground_truth: Reference reply.
        chatbot_data: Background data placed into the first template field.
        model: Model name passed to the chat-completions call.
        client: Client whose ``chat.completions.create`` method is called.
        temperature: Sampling temperature passed to the call.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    prompt_fields = (chatbot_data, str(messages), predicted_message, ground_truth)
    # The whole filled-in template is sent as a single user message.
    request_messages = [{"role": "user", "content": template.format(*prompt_fields)}]
    completion = client.chat.completions.create(
        model=model,
        messages=request_messages,
        temperature=temperature,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


def evaluate_scenario(
    scenario_id: int,
    api_base: str = "https://llama3-1-8b-api.llm.lab.epam.com/v1",
    model: str = "meta-llama/Meta-Llama-3.1-8B-Instruct",
) -> tuple[float, float]:
    """Replay one scripted scenario against the chatbot and score its replies.

    For every evaluated position, the chatbot is asked to produce the next
    message from the conversation prefix. The same model is then asked, via
    the evaluator prompt, to compare that prediction with the scripted reply.

    Args:
        scenario_id: Scenario number passed to ``read_scenario``.
        api_base: Base URL of the OpenAI-compatible endpoint.
        model: Model name used both for the chatbot and for the evaluator.

    Returns:
        ``(factual_correctness, appropriateness)``: the arithmetic means of the
        per-reply scores returned by the evaluator.

    Raises:
        RuntimeError: If the ``OPENAI_API_KEY`` environment variable is not set.
        ValueError: If the scenario has no reply to evaluate (fewer than five
            messages including the system prompt).

    Side effects:
        Calls ``logger.disable("chatbot")`` and never re-enables it, so log
        records from the ``chatbot`` module stay suppressed after the call.
    """
    # The credential comes from the environment so it never ends up in the notebook.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Set the OPENAI_API_KEY environment variable to the API key of the LLM endpoint"
        )
    client = openai.OpenAI(api_key=api_key, base_url=api_base)
    with open("prompts/evaluator_prompt.txt", encoding="utf-8") as fin:
        evaluator_prompt = fin.read()
    descriptions, menus_string = chatbot.initialize_menus_string()

    chatbot_data = descriptions + "\n" + menus_string
    logger.info(f"Evaluating scenario {scenario_id}")
    system_prompt = chatbot.initialize_system_prompt()
    messages = [{"role": "system", "content": system_prompt}] + read_scenario(scenario_id)
    logger.disable("chatbot")

    # Index layout, assuming the scenario strictly alternates speakers:
    #   0 system prompt, 1 synthetic user opener, 2 first scripted chatbot turn,
    #   3 first scripted user turn, 4 second scripted chatbot turn, ...
    # Evaluation therefore starts at index 4 and visits every second message;
    # the first scripted chatbot turn (index 2) is not evaluated.
    first_evaluated = 4
    if len(messages) <= first_evaluated:
        raise ValueError(
            f"Scenario {scenario_id} has no chatbot reply to evaluate "
            f"({len(messages)} messages including the system prompt)"
        )

    factual_correctness_list = []
    appropriateness_list = []
    for i in range(first_evaluated, len(messages), 2):
        predicted_message = chatbot.get_next_ai_message(messages[:i], model, client)
        ground_truth = messages[i]["content"]
        logger.info("-"*20 + "predicted_message" + "-"*20)
        logger.info(predicted_message)
        logger.info("-"*20 + "ground_truth" + "-"*20)
        logger.info(ground_truth)
        # The evaluator sees the history without the system prompt (messages[1:i]).
        evaluation = evaluate_ai_reply(evaluator_prompt, messages[1:i], predicted_message, ground_truth, chatbot_data, model, client)
        # NOTE(review): assumes the parsed reply is a dict holding numeric
        # "factual_correctness" and "appropriateness" values - confirm against
        # prompts/evaluator_prompt.txt and chatbot.parse_llm_json.
        evaluation = chatbot.parse_llm_json(evaluation)
        factual_correctness_list.append(evaluation["factual_correctness"])
        appropriateness_list.append(evaluation["appropriateness"])
        logger.info(str(evaluation))
    factual_correctness = sum(factual_correctness_list) / len(factual_correctness_list)
    appropriateness = sum(appropriateness_list) / len(appropriateness_list)
    return factual_correctness, appropriateness

In [4]:
# Score scenarios 1-7, keeping each scenario's mean scores in two parallel lists.
factual_correctness_list, appropriateness_list = [], []
for scenario_id in range(1, 8):
    scenario_scores = evaluate_scenario(scenario_id)
    factual_correctness_list.append(scenario_scores[0])
    appropriateness_list.append(scenario_scores[1])

[32m2024-11-27 15:01:39.754[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate_scenario[0m:[36m58[0m - [1mEvaluating scenario 1[0m
[32m2024-11-27 15:01:53.963[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate_scenario[0m:[36m67[0m - [1m--------------------predicted_message--------------------[0m
[32m2024-11-27 15:01:53.967[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate_scenario[0m:[36m68[0m - [1mPasta sounds delicious! Unfortunately, I don't see any pasta dishes in the list of available restaurants. However, I can suggest some restaurants that might have pasta dishes.

Based on your interest in pasta, I recommend checking out Roman Holiday or Nippon. Roman Holiday has a Spaghetti Carbonara and Lasagna al Forno, while Nippon has a Ramen dish that might be similar to a pasta dish.

Would you like to choose one of these restaurants or would you like to explore other options?[0m
[32m2024-11-27 15:01:53.971[0m | [1mINFO    [0m | [36m__main__[0m

In [6]:
# Display the per-scenario mean scores as a tuple:
# (factual correctness list, appropriateness list), one entry per evaluated scenario.
factual_correctness_list, appropriateness_list

([0.5875,
  0.9333333333333332,
  0.8000000000000002,
  0.775,
  0.9199999999999999,
  0.95,
  0.5875],
 [0.6125, 0.7333333333333334, 0.6, 0.625, 0.76, 0.75, 0.42500000000000004])