In [1]:
import os
import json
from openai import OpenAI

# OpenAI api key.
# Prefer exporting OPENAI_API_KEY in the environment before starting the kernel;
# only paste a key here for throwaway local runs and never commit it.
key = ''
# Only override the environment when a key was actually provided, so the empty
# placeholder above does not clobber an already-exported OPENAI_API_KEY with ''.
if key:
    os.environ['OPENAI_API_KEY'] = key
# the client is created without arguments and relies on the env variable above
client = OpenAI()

In [2]:
def generate_qa_pairs(system_eval_path):
    """
    Build the grading prompts from the system eval json file.

    The file is loaded as a dict keyed by user query. Each value is a dict
    that must contain a 'context' entry plus one entry per model answer.

    NOTE(review): the model keys are taken positionally -- everything after
    the first three keys of the FIRST query's dict (the original comment
    names those three as 'query type', 'truth' and 'context'). This relies
    on key order in the file and on every query having the same model keys;
    confirm against the code that writes the file.

    Args:
        system_eval_path: path to the system eval json file.

    Returns:
        A list with one inner list per model; each inner list holds one
        'Question: ..., Context: ..., Answer: ...' string per query.
        An empty file (no queries) yields an empty list.

    Raises:
        KeyError: if a query has no 'context' entry or lacks an answer for
            one of the models found on the first query.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default encoding
    with open(system_eval_path, 'r', encoding='utf-8') as file:
        system_eval = json.load(file)

    # nothing to grade -- avoid the IndexError on queries[0] below
    if not system_eval:
        return []

    # get the user queries/outer keys
    queries = list(system_eval.keys())
    # get the models/inner keys by skipping the first three (metadata) keys
    inner_keys = list(system_eval[queries[0]].keys())[3:]

    # generate qa pairs: one list of prompt strings per model
    qa_pairs = []
    for model in inner_keys:
        model_responses = []
        for query in queries:
            entry = system_eval[query]
            # the answer of this specific model for this query
            response = entry[model]
            # the background context shared by all models for this query
            context = entry['context']
            model_responses.append(f'Question: {query}, Context: {context}, Answer: {response}')
        qa_pairs.append(model_responses)

    return qa_pairs

def generate_grade(qa_pair, model_name='gpt-3.5-turbo', system_prompt=
                   """You are GPT-4, a large language model created by OpenAI. 
                    You are a precise grader and will be provided a question, some background context, and an answer to the question. 
                    Your task is to grade how good the answer to the question is based on the background context, on a 1-10 scale. 
                    Just answer with a grade as a single number, e.g. 1, no further explanation is needed. 
                    Please do a good job as my your work is very important to my career."""):
    """
    Get the grade for a specific qa pair.

    Sends the system prompt and the qa pair to the chat completions endpoint
    of the module-level ``client`` and parses the reply as a number.

    Args:
        qa_pair: the user message, a string holding question, context and answer.
        model_name: name of the chat model used as the grader.
        system_prompt: grading instructions sent as the system message.

    Returns:
        float: the grade parsed from the reply. If the reply cannot be parsed
        as a number (extra text, empty or missing content) a message is
        printed and ``float('nan')`` is returned so that one bad reply does
        not abort a whole evaluation run; check with ``math.isnan``.
    """
    completion = client.chat.completions.create(
        model=model_name,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": qa_pair},
        ],
    )

    # get the answer from the llm and cast to float
    reply = completion.choices[0].message.content
    try:
        return float(reply)
    except (TypeError, ValueError):
        # TypeError covers a missing (None) content, ValueError a non-numeric reply.
        # Previously this path fell through to `return grade` with `grade` unbound.
        print('Could not cast grade to float.')
        return float('nan')

def llm_evaluation(qa_pairs, model_info_path):
    """
    Grade every qa pair of every model and label the grades with model names.

    qa_pairs is the list of lists produced by generate_qa_pairs (one inner
    list of prompt strings per model). The model info json file is iterated
    in file order and paired positionally with those inner lists; the 'name'
    field of each model info entry becomes the key of the result.

    Returns a dict mapping model name to its list of grades.
    """
    with open(model_info_path, 'r') as info_file:
        model_info = json.load(info_file)

    # grade model by model, question by question
    model_grades = []
    for prompts_of_model in qa_pairs:
        grades_of_model = []
        for prompt in prompts_of_model:
            grades_of_model.append(generate_grade(prompt))
        model_grades.append(grades_of_model)

    # pair the i-th model info entry with the i-th list of grades
    results = {}
    for model_key, grades_of_model in zip(model_info, model_grades):
        display_name = model_info[model_key]['name']
        results[display_name] = grades_of_model

    return results

In [3]:
# Input json files, given as relative paths (resolved against the kernel's working directory).
sys_eval_path = 'system_eval.json'
model_info_path = 'model_info.json'
# Build one list of 'Question/Context/Answer' prompt strings per model.
qas = generate_qa_pairs(sys_eval_path)
# Run the evaluation: one grader call per prompt; the result maps model name -> list of grades.
model_grades = llm_evaluation(qas, model_info_path)

In [4]:
# Display the grades per model (dict: model name -> list of grades).
model_grades

{'mistralai/Mistral-7B-Instruct-v0.2': [10.0]}