In [8]:
# Enable IPython's autoreload extension in mode 2, so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [9]:
import os
from typing import List
import sys
import uuid
import pandas as pd

from llm_app import BaseApp, InputFormat, OutputFormat, GptBaseApp
from evaluator import TestCase, EvalProperty, PropertyResult, Evaluator
from eval_properties import evaluate_property_with_llm, cosine_similarity, get_embedding

import openai
import instructor

# The API key is read from a file two directories above the working directory,
# so the secret itself never appears in the notebook.
openai.api_key_path = "../../openai_key"
# NOTE(review): presumably patches the OpenAI client so completions can return
# structured responses — confirm against the installed instructor version.
instructor.patch()

In [10]:
# Define some tests.
# Each (question, reference answer) pair becomes one TestCase with a fresh random hex id.
test_cases = [
    TestCase(
        test_id=uuid.uuid4().hex,
        test_input={"question": question},
        reference_output={"answer": answer},
    )
    for question, answer in (
        (
            "Why should a victim go to the doctor after a Heimlich manoeuvre?",
            "Because the Heimlich manoeuvre may have caused internal bleeding.",
        ),
        (
            "What are the four steps of first aid?",
            "1. Ensure safety, 2. Assess the victim's condition, 3. Notify emergency services if necessary, 4. Provide further first aid.",
        ),
        (
            "What should you do if the victim is not breathing?",
            "Call the emergencies and start CPR.",
        ),
    )
]

In [11]:
# Define properties.
def factually_consistent(test_case: TestCase, llm_app_result: OutputFormat) -> PropertyResult:
    """Have GPT-4 judge whether the app's answer agrees with the reference answer.

    Args:
        test_case: Test case whose ``reference_output.answer`` is the expected answer.
        llm_app_result: App output whose ``answer`` is being judged.

    Returns:
        The result produced by ``evaluate_property_with_llm`` for this pair.
    """
    candidate_answer = llm_app_result.answer
    expected_answer = test_case.reference_output.answer

    grader_instructions = (
        "Evaluate the answer. The answer should be factually consistent with the reference answer. If not, explain why."
    )
    grader_input = f"Answer: {candidate_answer}\nReference Answer: {expected_answer}"

    return evaluate_property_with_llm(
        model="gpt-4",
        system_message=grader_instructions,
        user_message=grader_input,
    )

def output_similarity(test_case: TestCase, llm_app_result: OutputFormat) -> PropertyResult:
    """Score the app's answer by cosine similarity to the reference answer's embedding.

    Args:
        test_case: Test case whose ``reference_output.answer`` is embedded as the target.
        llm_app_result: App output whose ``answer`` is embedded and compared.

    Returns:
        A ``PropertyResult`` with empty feedback and the cosine similarity as score.
    """
    answer_vector = get_embedding(llm_app_result.answer)
    reference_vector = get_embedding(test_case.reference_output.answer)
    similarity = cosine_similarity(answer_vector, reference_vector)
    return PropertyResult(feedback="", score=similarity)


def output_verbosity(test_case: TestCase, llm_app_result: OutputFormat) -> PropertyResult:
    """Score verbosity as ``len(answer) / len(reference answer)``.

    A score of 1.0 means both answers have the same length; larger values mean
    the app's answer is longer than the reference.

    Args:
        test_case: Test case whose ``reference_output.answer`` sets the baseline length.
        llm_app_result: App output whose ``answer`` length is measured.

    Returns:
        A ``PropertyResult`` with empty feedback and the length ratio as score.

    Raises:
        ZeroDivisionError: If the reference answer has length 0.
    """
    answer_length = len(llm_app_result.answer)
    reference_length = len(test_case.reference_output.answer)
    return PropertyResult(feedback="", score=answer_length / reference_length)

# One EvalProperty per (name, description, scoring function) row, in reporting order.
properties = [
    EvalProperty(property_name=name, description=description, eval_func=eval_func)
    for name, description, eval_func in (
        (
            "FactuallyConsistent",
            "The answer is factually consistent with the reference answer.",
            factually_consistent,
        ),
        (
            "CosineSimilarity",
            "The answer is similar to the reference answer.",
            output_similarity,
        ),
        (
            "Verbosity",
            "The answer is not too verbose.",
            output_verbosity,
        ),
    )
]

In [12]:
# Define LLM app versions.
# Each (model, system prompt) pair below configures one GptBaseApp variant to compare.
llm_apps = [
    GptBaseApp({"gpt_version": gpt_version, "system_prompt": system_prompt})
    for gpt_version, system_prompt in (
        ("gpt-3.5-turbo-0613", "Answer the question."),
        ("gpt-3.5-turbo-0613", "You are a first-aid expert. Answer the question. Be accurate and concise."),
        ("gpt-4", "You are a first-aid expert. Answer the question. Be accurate and concise."),
    )
]

In [14]:
# Evaluate the LLM apps on the test set by using the properties.
ev = Evaluator(test_set=test_cases, properties=properties, results_dir="data/eval_results")

# The evaluation makes many slow LLM calls, so settle the experiment name first:
# a blank or whitespace-only entry falls back to a generated name instead of
# starting the run under an empty name.
exp_name = input("Experiment name: ").strip() or f"experiment-{uuid.uuid4().hex[:8]}"
print(f"Running experiment: {exp_name}")

results_df = ev.evaluate(llm_apps, exp_name)

Evaluating test cases: 100%|██████████| 3/3 [00:23<00:00,  7.75s/test case]
Evaluating test cases: 100%|██████████| 3/3 [00:20<00:00,  6.79s/test case]
Evaluating test cases: 100%|██████████| 3/3 [00:29<00:00,  9.91s/test case]


In [15]:
# Display the results table returned by `ev.evaluate`.
results_df

Unnamed: 0,gpt_version,system_prompt,CosineSimilarity.score,FactuallyConsistent.score,Verbosity.score,latency
0,gpt-3.5-turbo-0613,Answer the question.,0.904431,0.666667,8.37079,2.814149
0,gpt-3.5-turbo-0613,You are a first-aid expert. Answer the questio...,0.903006,0.666667,2.03113,1.684336
0,gpt-4,You are a first-aid expert. Answer the questio...,0.907844,1.0,5.116838,5.495382


In [None]:
# Start the MLflow dashboard.
# NOTE: `mlflow ui` runs a server in the foreground, so this cell keeps running
# until it is interrupted.
!mlflow ui