In [None]:
import os
import pandas as pd
import json
from tqdm import tqdm
from glob import glob
from octotools.engine.openai import ChatOpenAI

# Label of the experiment to grade; it is interpolated into the results
# directory path (`../results/{exp_code}`). Replace the placeholder before running.
exp_code = 'YOUR_EXPERIMENT_LABEL'

# LLM used as the grader: called with a full prompt string and its return value
# is stored as the verdict for each item.
local_llm_engine = ChatOpenAI(model_string="gpt-3.5-turbo", is_multimodal=False, enable_cache=True)

# Grading instructions prepended to every question/response/answer triple.
# The grader is asked to reply with exactly CORRECT or INCORRECT.
# NOTE(review): the text contains the typo "THe"; it is left untouched here
# because editing the literal changes the prompt that is sent to the model.
demo_prompt = """
You are grading responses to questions about radiology. You will be given the question, the response, and the correct answer and you will respond with CORRECT or INCORRECT only with NO ADDITIONAL EXPLANATION. THe response is generally much longer than the correct answer and it is up to you to determine if the response encompasses the correct answer. If the response encompasses or contains the correct answer (synonyms and rephrasings are allowed), you should respond with CORRECT. If the response does not contain the correct answer, you should respond with INCORRECT. If the response refuses or is unable to answer the question and does not contain the correct answer, you should respond with INCORRECT.
"""

def create_test_prompt(demo_prompt, question, response, answer):
    """Assemble the full grading prompt for one item.

    Args:
        demo_prompt: Grading instructions; surrounding whitespace is stripped.
        question: The question that was asked.
        response: The response being graded.
        answer: The reference (correct) answer.

    Returns:
        A single string made of the instructions, the labelled question,
        response and correct answer, and a trailing ``Correctness:`` cue,
        each separated by a blank line.
    """
    sections = (
        demo_prompt.strip(),
        f"Question:\n{question}",
        f"Response:\n{response}",
        f"Correct Answer:\n{answer}",
        "Correctness:",
    )
    # A blank line between every section reproduces the prompt layout.
    return "\n\n".join(sections)

In [None]:
# Agent exps: grade every per-item JSON result of the chosen experiment.
results_dir = f'../results/{exp_code}'
# glob returns paths in arbitrary order; sort them so the graded rows come out
# in the same order on every run.
outputs = sorted(glob(f'{results_dir}/*.json'))

# Key in each result JSON that holds the response to grade.
use_col = 'direct_output'

items = []
for output_path in tqdm(outputs):
    # The file name without its extension is used as the item id.
    item_id = os.path.splitext(os.path.basename(output_path))[0]
    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(output_path, 'r', encoding='utf-8') as f:
        out = json.load(f)

    # Kept separate from the path variable so the loop variable is never rebound.
    response = out[use_col]

    # The question may be stored under either 'query' or 'question'.
    question = out.get('query', out.get('question', ''))
    full_prompt = create_test_prompt(demo_prompt, question, response, out['answer'])
    # The grader's raw reply is stored as-is.
    extraction = local_llm_engine(full_prompt)

    items.append({
        'id': item_id,
        'question': question,
        'correct': out['answer'],
        'extracted_choice': extraction,
        'response': response,
        'use_col': use_col
    })

In [3]:
# Collect the graded items into a table and report the fraction judged CORRECT.
df = pd.DataFrame(items)
if df.empty:
    # With no graded items the frame has no 'extracted_choice' column at all.
    print('No graded items found; check exp_code and the results directory.')
else:
    # Normalise the grader's reply before comparing: replies such as "CORRECT.",
    # "Correct" or " CORRECT\n" would otherwise be counted as wrong. An exact
    # match is still required, since "INCORRECT" contains "CORRECT".
    verdicts = (
        df['extracted_choice']
        .astype(str)
        .str.upper()
        .str.strip(" \t\r\n.!\"'*")
    )
    print((verdicts == 'CORRECT').mean())