## GPT4 Evaluation of LLM Performance to Compare Different Prompt Tip Versions or Different Models against Ground Truths

In [None]:
import json
# from cloudgpt_aoai import  *
import random
import os
from openai import OpenAI

In [None]:
# Create the OpenAI client used by every request in this notebook.
# The key is read from the OPENAI_API_KEY environment variable rather than
# from an undefined `API_Key` name, so no secret is hard-coded here.
# Set OPENAI_API_KEY in the environment before running this cell.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

### Functions Used

In [None]:
def get_chat_completion(messages):
    """Send `messages` to the chat model and return the reply text.

    The request asks for a JSON-object response via `response_format`.

    Args:
        messages: The chat messages, passed unchanged to
            `client.chat.completions.create`.

    Returns:
        The `content` of the first choice's message.
    """
    request_options = {
        "model": "gpt-4-turbo",
        "messages": messages,
        "temperature": 0.2,
        "top_p": 0.5,
        "frequency_penalty": 0,
        "stop": None,
        "response_format": {"type": "json_object"},
    }
    completion = client.chat.completions.create(**request_options)
    return completion.choices[0].message.content

In [None]:
def convert_to_dict(output):
    """Return `output` as a dictionary, parsing it as JSON when it is a string.

    Args:
        output: A JSON string or an already-parsed dictionary.

    Returns:
        The resulting dictionary. An empty dictionary is returned (and a
        message is printed) when the string is not valid JSON, when the
        decoded JSON is not an object, or when `output` is neither a string
        nor a dictionary.
    """
    if isinstance(output, dict):
        return output
    if not isinstance(output, str):
        print("Output is neither a string nor a dictionary.")
        return {}
    try:
        parsed = json.loads(output)
    except json.JSONDecodeError as e:
        print("Failed to decode JSON: ", e)
        return {}
    if not isinstance(parsed, dict):
        # Valid JSON such as a list or a number would break callers that
        # index the result by key, so it is rejected like any other bad input.
        print("Decoded JSON is not a dictionary.")
        return {}
    return parsed

### Load two competing LLM response transcripts

In [None]:
# In this case, we compare two transcripts in which the coach responses are generated by the same LLM model
# with 2 different system prompt tips to compare which prompt is more effective.

# We can also compare two transcripts in which the coach responses are generated by 2 different LLMs with the
# same prompt tips to compare model capabilities.

# Read both files explicitly as UTF-8 so the result does not depend on the
# platform's default locale encoding.
with open('prompt1_LLM_response.json', 'r', encoding='utf-8') as file:
    prompt1_response = json.load(file)

with open('prompt2_LLM_response.json', 'r', encoding='utf-8') as file:
    prompt2_response = json.load(file)

### Evaluation prompt tips for GPT4

In [None]:
# System message for the GPT-4 judge. It is sent verbatim (it is NOT passed
# through str.format), so the JSON example uses single braces; doubled braces
# would reach the model literally.
system_prompt = """
You are an insightful and meticulous evaluator tasked with assessing the performance of leadership coaching AI systems.
We would like to invite you to provide feedback on the performance of two AI assistants in coaching a leader with the <Question>, compared with the <Human Coach Answer>.
These two AI assistants answers are in <Answer1> and <Answer2>, respectively. You should not focus on the length of the answer or the details of the answer, a longer answer or answer containing more details does NOT necessarily mean the answer is better.
Please first provide a comprehensive explanation of your evaluation, avoiding any potential bias on answer length and details and ensuring that the order in which the responses were presented does not affect your judgment. Then, output your decision indicating your preference on <Answer1> and <Answer2>.
You are allowed to choose one of the following options: "Answer1", "Answer2", or "Tie". "Answer1" means you prefer the first answer, "Answer2" means you prefer the second answer, and "Tie" means you have no preference between the two answers.

There are some Evaluation Metrics you can follow to compare "Answer1" and "Answer2":
.....


Your output should be in the following format in JSON:
{
   "evaluation_evidence": "your evaluation explanation here",
   "evaluation_decision": "Answer1, or Answer2, or Tie"
}
"""

# User message template, filled in with str.format for each interaction.
# The reference answer is tagged <Human Coach Answer> so that it matches the
# tag the system message tells the judge to compare against.
prompt= """
Here is the user's question <Question>: {question}
The human coach's answer <Human Coach Answer>: {answer_gt}
The first answer <Answer1> is: {answer1}
The second answer <Answer2> is: {answer2}
"""

### Load & Organize User Queries and Ground Truth Responses (Human Coach)

In [None]:
# Fix the RNG seed so later calls to `random` in this notebook are reproducible.
random.seed(0)

all_lines_q, all_lines_a = [], []

# The human coach transcript serves as the ground truth answer.
with open('Testing Transcripts.json', 'r', encoding='utf-8') as file:
    conversations = json.load(file)

# Separate user queries and human coach responses into separate lists:
# entries with a 'client' key become queries (with a trailing "coach:" cue),
# the remaining entries with a 'coach' key become ground-truth answers.
for turn in conversations:
    if 'client' in turn:
        all_lines_q.append(turn['client'] + '\ncoach:\n')
        continue
    if 'coach' in turn:
        all_lines_a.append(turn['coach'])

### Generate & Store Evaluation Results

In [None]:
# Ask the GPT-4 judge to compare the two candidate responses for every
# interaction, tally the votes, and save one record per interaction.
result_all = []
tie = 0
p2 = 0
p1 = 0

# Fail fast, before spending any API calls, if the inputs are not aligned:
# every ground-truth answer needs a query and a response from each candidate.
num_items = len(all_lines_a)
if min(len(all_lines_q), len(prompt1_response), len(prompt2_response)) < num_items:
    raise ValueError(
        "Transcripts are not aligned: "
        f"{num_items} ground-truth answers, {len(all_lines_q)} queries, "
        f"{len(prompt1_response)} prompt1 responses, "
        f"{len(prompt2_response)} prompt2 responses."
    )

for i, gt_sl in enumerate(all_lines_a):
    print(i)
    problem = all_lines_q[i]
    prompt1_sl = prompt1_response[i]['response']
    prompt2_sl = prompt2_response[i]['response']

    # Randomise which candidate is presented as <Answer1> to mitigate the
    # judge's position bias.
    # NOTE: `shuffle` is True when prompt1 is shown first (Answer1 = prompt1)
    # and False when the order is swapped (Answer1 = prompt2).
    shuffle = random.random() > 0.5
    if shuffle:
        first_answer, second_answer = prompt1_sl, prompt2_sl
        label_of = {'answer1': 'prompt1', 'answer2': 'prompt2'}
    else:
        first_answer, second_answer = prompt2_sl, prompt1_sl
        label_of = {'answer1': 'prompt2', 'answer2': 'prompt1'}
    modified_text = prompt.format(question=problem, answer_gt=gt_sl,
                                  answer1=first_answer, answer2=second_answer)

    # generate evaluation results
    test_chat_message = [{"role": "system", "content": system_prompt},
                         {"role": "user", "content": modified_text}]
    rre = get_chat_completion(test_chat_message)

    # store evaluation results for downstream analysis
    temp_ = {
        'problem': problem,
        'ground truth': gt_sl,
        'prompt1': prompt1_sl,
        'prompt2': prompt2_sl,
        'shuffle': shuffle,
    }

    result = convert_to_dict(rre)
    if not isinstance(result, dict):
        result = {}

    # Use .get so that one malformed judge reply does not abort the run and
    # discard every result collected so far.
    temp_['explanation'] = result.get('evaluation_evidence', '')
    decision = result.get('evaluation_decision')

    if isinstance(decision, str):
        # Map the positional verdict back to the prompt it refers to; any
        # other verdict text is treated as a tie.
        preference = label_of.get(decision.strip().lower(), 'Tie')
    else:
        # No usable verdict: record it, but do not count it as a vote.
        print(f"No usable evaluation_decision for item {i}; not counted.")
        preference = 'Invalid'

    if preference == 'prompt1':
        p1 += 1
    elif preference == 'prompt2':
        p2 += 1
    elif preference == 'Tie':
        tie += 1
    temp_['preference'] = preference
    result_all.append(temp_)

with open('evaluation_prompt1_vs_prompt2.json', 'w', encoding='utf-8') as file:
    json.dump(result_all, file, indent=4)

### GPT4 Voting

In [None]:
# Vote counts, one per line: prompt1 wins, prompt2 wins, ties.
for vote_count in (p1, p2, tie):
    print(vote_count)

4
3
4


#### **Important Note: in order to compare three responses (LLM response from prompt1, LLM response from prompt2, and ground truth response), these three responses need to be generated to answer the SAME user query for each interaction. Therefore, for the process above to work effectively, the imported transcripts should follow the same format and include the same user queries in the right order, each followed by one type of response for each interaction. That way, we can compare the responses in parallel.**