In [1]:
%cd ../src

/Users/mikhail_zybin/food-order-chat-bot/src


In [2]:
import os

import openai
from loguru import logger

import chatbot

In [3]:
def read_scenario(scenario_id: int) -> list[dict[str, str]]:
    """Parse a scripted dialogue file into a list of chat messages.

    The file ``../evaluator_scenarios/scenario{scenario_id}.txt`` (relative to the
    current working directory) may begin with free-form commentary. Everything
    before the first line that starts with ``"Chatbot"`` is discarded.

    After that, a line starting with ``"Chatbot"`` or ``"User"`` (leading whitespace
    ignored) opens a new replica; any other line is appended to the current replica.
    Chatbot replicas are emitted with role ``"assistant"``, user replicas with role
    ``"user"``. The conversation is seeded with the synthetic user message
    ``"Please help me to order food"`` so that it always opens with a user turn.

    NOTE: a replica is flushed only when the *next* speaker line is seen, so the
    last replica in the file is not included in the result. ``evaluate_scenario``
    indexes into the returned list, so this behaviour is kept as-is.

    Args:
        scenario_id: Numeric suffix of the scenario file to read.

    Returns:
        Messages as ``{"role": ..., "content": ...}`` dicts, in dialogue order.

    Raises:
        FileNotFoundError: If the scenario file does not exist.
        ValueError: If the file contains no line starting with ``"Chatbot"``.
    """
    scenario_path = f"../evaluator_scenarios/scenario{scenario_id}.txt"
    with open(scenario_path, "r") as fin:
        scenario = fin.readlines()

    # Locate the first chatbot line in one pass; everything before it is commentary.
    first_turn = next(
        (idx for idx, line in enumerate(scenario) if line.startswith("Chatbot")),
        None,
    )
    if first_turn is None:
        raise ValueError(
            f"{scenario_path} contains no line starting with 'Chatbot'; "
            "cannot locate the start of the dialogue"
        )

    messages = []
    current_replica = "Please help me to order food"
    for line in scenario[first_turn:]:
        line = line.lstrip()
        if line.startswith("User"):
            # A user line closes the chatbot replica collected so far.
            messages.append({"role": "assistant", "content": current_replica.strip()})
            current_replica = line[5:]  # drop the 5-character speaker prefix
        elif line.startswith("Chatbot"):
            # A chatbot line closes the user replica collected so far.
            messages.append({"role": "user", "content": current_replica.strip()})
            current_replica = line[9:]  # drop the 9-character speaker prefix
        else:
            # Continuation of a multi-line replica.
            current_replica += line

    return messages


def evaluate_ai_reply(
    template: str,
    messages: list[dict[str, str]],
    predicted_message, 
    ground_truth,
    chatbot_data,
    model: str,
    client: openai.OpenAI,    
    temperature: float = 0.0,
) -> str:
    """Ask an LLM to grade a predicted chatbot reply against the scripted one.

    The evaluator prompt is built by filling the four positional ``{}`` slots of
    ``template`` with, in order: ``chatbot_data``, the stringified ``messages``,
    ``predicted_message`` and ``ground_truth``. It is sent as a single user message.

    Args:
        template: Prompt template consumed via ``str.format`` with four positional fields.
        messages: Conversation history shown to the evaluator (passed through ``str()``).
        predicted_message: Reply produced by the chatbot under evaluation.
        ground_truth: Reference reply from the scenario.
        chatbot_data: Background data placed in the first template slot.
        model: Model name forwarded to the completion call.
        client: Client exposing ``chat.completions.create``.
        temperature: Sampling temperature forwarded to the completion call.

    Returns:
        The ``content`` of the first choice in the completion response.
    """
    prompt_fields = (chatbot_data, str(messages), predicted_message, ground_truth)
    request_messages = [{"role": "user", "content": template.format(*prompt_fields)}]

    response = client.chat.completions.create(
        model=model,
        messages=request_messages,
        temperature=temperature,
    )
    first_choice = response.choices[0]
    return first_choice.message.content


def evaluate_scenario(scenario_id: int) -> tuple[float, float]:
    """Replay one scripted scenario through the chatbot and score each reply.

    The scenario is loaded with ``read_scenario`` and prefixed with the chatbot's
    system prompt. For every scripted assistant turn from index 4 onward (the first
    assistant turn, at index 2, is not evaluated), the chatbot is asked to produce
    the next reply from the conversation prefix. That reply is then graded against
    the scripted one by ``evaluate_ai_reply``, and the parsed
    ``factual_correctness`` and ``appropriateness`` values are collected.

    The API key is read from the ``OPENAI_API_KEY`` environment variable so that no
    credential is stored in the notebook.

    Side effects:
        Emits progress via ``logger.info`` and calls ``logger.disable("chatbot")``,
        which is not re-enabled afterwards.

    Args:
        scenario_id: Numeric id of the scenario file to evaluate.

    Returns:
        ``(mean factual_correctness, mean appropriateness)`` over the evaluated turns.

    Raises:
        RuntimeError: If ``OPENAI_API_KEY`` is not set.
        ZeroDivisionError: If the scenario is too short to contain any evaluated
            turn (fewer than five messages including the system prompt).
    """
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "OPENAI_API_KEY is not set; export it before running the evaluation"
        )

    api_base = "https://llama3-1-8b-api.llm.lab.epam.com/v1"
    model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
    client = openai.OpenAI(api_key=api_key, base_url=api_base)
    with open("prompts/evaluator_prompt.txt") as fin:
        evaluator_prompt = fin.read()
    descriptions, menus_string = chatbot.initialize_menus_string()

    chatbot_data = descriptions + "\n" + menus_string
    logger.info(f"Evaluating scenario {scenario_id}")
    system_prompt = chatbot.initialize_system_prompt()
    messages = [{"role": "system", "content": system_prompt}] + read_scenario(scenario_id)
    logger.disable("chatbot")
    factual_correctness_list = []
    appropriateness_list = []
    # Threaded from one predicted turn into the next call of the chatbot.
    confirmation_requested = False
    # messages[0] is the system prompt; user/assistant turns then alternate, so
    # even indices >= 2 are assistant turns. Evaluation starts at index 4.
    for i in range(4, len(messages), 2):
        predicted_message, confirmation_requested, _is_finished = chatbot.get_next_ai_message(
            messages[:i],
            confirmation_requested,
            model,
            client,
        )
        ground_truth = messages[i]["content"]
        logger.info("-"*20 + "predicted_message" + "-"*20)
        logger.info(predicted_message)
        logger.info("-"*20 + "ground_truth" + "-"*20)
        logger.info(ground_truth)
        # The evaluator sees the history without the system prompt (messages[1:i]).
        evaluation = evaluate_ai_reply(
            evaluator_prompt,
            messages[1:i],
            predicted_message,
            ground_truth,
            chatbot_data,
            model,
            client,
        )
        evaluation = chatbot.parse_llm_json(evaluation)
        factual_correctness_list.append(evaluation["factual_correctness"])
        appropriateness_list.append(evaluation["appropriateness"])
        logger.info(str(evaluation))
    factual_correctness = sum(factual_correctness_list) / len(factual_correctness_list)
    appropriateness = sum(appropriateness_list) / len(appropriateness_list)
    return factual_correctness, appropriateness

In [4]:
# Evaluate scenarios 1 through 10 and collect one pair of mean scores per scenario.
# Entry k of each list corresponds to scenario k + 1. Both lists are read by the
# next cell, so their names must stay as they are.
factual_correctness_list = []
appropriateness_list = []
for i in range(1, 11):
    # Each call replays the whole scenario against the remote model.
    factual_correctness, appropriateness = evaluate_scenario(i)
    factual_correctness_list.append(factual_correctness)
    appropriateness_list.append(appropriateness)

[32m2024-12-04 14:16:52.756[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate_scenario[0m:[36m58[0m - [1mEvaluating scenario 1[0m
[32m2024-12-04 14:17:03.252[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate_scenario[0m:[36m73[0m - [1m--------------------predicted_message--------------------[0m
[32m2024-12-04 14:17:03.255[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate_scenario[0m:[36m74[0m - [1mPasta sounds delicious! Unfortunately, I don't see any pasta dishes in the available restaurants. However, I can suggest some restaurants that might have similar dishes.

Based on your interest in pasta, I recommend checking out Roman Holiday, which serves Italian cuisine. They have a variety of pasta dishes on their menu.

Would you like to order from Roman Holiday or would you like to explore other options?[0m
[32m2024-12-04 14:17:03.256[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate_scenario[0m:[36m75[0m - [1m--------------------ground_

Your order of 10 portions of Khinkali, 3 portions of Churchkhela from Gagimarjos was successfully received and will be delivered to you by 12:00


[32m2024-12-04 14:27:41.373[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate_scenario[0m:[36m81[0m - [1m{'factual_correctness': 1.0, 'factual_correctness_explanation': "The chatbot's response accurately reflects the user's order, including the restaurant and the dishes.", 'appropriateness': 0.8, 'appropriateness_explanation': "The chatbot's response is mostly appropriate, but it includes unnecessary details (the quantity of Khinkali) that are not present in the ground truth response."}[0m
[32m2024-12-04 14:27:48.747[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate_scenario[0m:[36m73[0m - [1m--------------------predicted_message--------------------[0m
[32m2024-12-04 14:27:48.749[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate_scenario[0m:[36m74[0m - [1mYou have chosen to order 10 portions of Khinkali, 3 portions of Churchkhela, 1 portion of Lobio from Gagimarjos by 12:00. Is that correct?[0m
[32m2024-12-04 14:27:48.750[0m | [1mINFO    [0m |

Your order of 10 portions of Khinkali, 3 portions of Churchkhela, 1 portion of Lobio from Gagimarjos was successfully received and will be delivered to you by 12:00


[32m2024-12-04 14:27:59.677[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate_scenario[0m:[36m81[0m - [1m{'factual_correctness': 0.0, 'factual_correctness_explanation': "The chatbot's response includes Khinkali in the order, which the user had previously removed.", 'appropriateness': 0.0, 'appropriateness_explanation': "The chatbot's response does not reflect the user's updated order."}[0m
[32m2024-12-04 14:27:59.722[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate_scenario[0m:[36m58[0m - [1mEvaluating scenario 7[0m
[32m2024-12-04 14:28:14.250[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate_scenario[0m:[36m73[0m - [1m--------------------predicted_message--------------------[0m
[32m2024-12-04 14:28:14.252[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate_scenario[0m:[36m74[0m - [1mSounds like you're in the mood for something hearty!

Based on your preference for a big dinner with a lot of meat, I would recommend the following restaur

In [5]:
# Show the per-scenario means as (factual correctness, appropriateness); index k is scenario k + 1.
factual_correctness_list, appropriateness_list

([0.8500000000000001,
  0.8666666666666667,
  0.8666666666666667,
  0.8250000000000001,
  0.85,
  0.675,
  0.45,
  0.7833333333333332,
  0.77,
  1.0],
 [0.44999999999999996,
  0.6666666666666666,
  0.5666666666666668,
  0.625,
  0.7,
  0.55,
  0.15,
  0.5,
  0.6599999999999999,
  1.0])