In [None]:
import os
import pandas as pd
import json
from tqdm import tqdm
from glob import glob
from octotools.engine.openai import ChatOpenAI

# Placeholder experiment identifier; it is interpolated into the results path
# (`../results/{exp_code}`) when the result files are loaded.
exp_code = 'YOUR_EXPERIMENT_CODE'

# LLM engine that is called with the full extraction prompt whenever a response
# is not already a bare single-character choice.
# NOTE(review): enable_cache=True presumably reuses completions for identical
# prompts -- confirm against the octotools ChatOpenAI implementation.
local_llm_engine = ChatOpenAI(model_string="gpt-3.5-turbo", is_multimodal=False, enable_cache=True)

# Instruction text placed (after stripping) at the top of every extraction prompt.
# NOTE(review): the prompt lists lettered choices (A-D) but asks the model to
# reply with "THE NUMBER"; process_choice converts numeric replies to letters,
# so both reply styles are graded, but the wording may deserve a second look.
demo_prompt = """
You are looking at answers to multiple choice questions. You will be given a question and a response. Each question will have 4 choices (A, B, C, and D). Find the choice chosen by the response. YOU ARE NOT ANSWERING THE QUESTION YOURSELF. You are simply extracting the choice from the provided response. The response likely is organized in sections and the choice is generally located in the conclusion or final answer section. RESPOND ONLY WITH THE NUMBER CORRESPONDING TO THE CHOICE THAT THE ANSWER MADE. If the response did not choose an answer, respond with '-'.
"""

def create_test_prompt(demo_prompt, query, response):
    """Assemble the full choice-extraction prompt.

    Args:
        demo_prompt: Instruction text; surrounding whitespace is stripped.
        query: The question that was asked.
        response: The response from which the choice should be extracted.

    Returns:
        A single string made of the instructions, the question/response
        section and a trailing ``Choice:`` cue, separated by blank lines.
    """
    sections = [
        demo_prompt.strip(),
        f"Question:\n{query}\n\nResponse:\n{response}",
        "Choice:",
    ]
    return "\n\n".join(sections)

def safe_equal(prediction, answer):
    """
    Compare a prediction with the expected answer without ever raising.

    Returns True only when ``prediction == answer`` evaluates as truthy.
    Values of different types are compared with plain ``==`` (no coercion).
    If the comparison or its truth test raises, the exception is printed
    and False is returned.
    """
    try:
        matched = bool(prediction == answer)
    except Exception as err:
        print(err)
        matched = False
    return matched

In [None]:
# Directory holding one JSON result file per evaluated item of this experiment.
results_dir = f'../results/{exp_code}'
# Sorted so the row order of `items` is reproducible regardless of the
# filesystem's directory listing order.
outputs = sorted(glob(f'{results_dir}/*.json'))

# Key in each result file that holds the model response to grade.
use_col = 'direct_output'

items = []
for output_path in tqdm(outputs):
    # The file name without its extension serves as the item identifier.
    item_id = os.path.splitext(os.path.basename(output_path))[0]
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which differs between operating systems.
    with open(output_path, 'r', encoding='utf-8') as f:
        out = json.load(f)

    # Kept in its own name so the file path above is not overwritten.
    response = out[use_col]
    question = out.get('query', out.get('question', ''))

    # Fast path: a response that is a single character once periods and
    # surrounding whitespace are removed is taken as the choice itself,
    # avoiding an LLM call.
    bare_choice = response.replace('.', '').strip()
    if len(bare_choice) == 1:
        extraction = bare_choice
    else:
        full_prompt = create_test_prompt(demo_prompt, question, response)
        extraction = local_llm_engine(full_prompt)

    item = {
        'id': item_id,
        'question': question,
        'correct': out['answer'],
        'extracted_choice': extraction,
        'response': response,
        'use_col': use_col
    }
    items.append(item)

In [8]:
# One row per collected record in `items` (id, question, correct answer,
# extracted choice, raw response and the source column name).
df = pd.DataFrame(items)

def process_choice(x):
    """Normalize an extracted choice so it can be compared with the answer key.

    Only the text before the first period is kept (so ``"B. because ..."``
    becomes ``"B"``) and surrounding whitespace is removed. A decimal number
    between 1 and 26 is converted to the corresponding letter (1 -> ``"A"``,
    2 -> ``"B"``, ...). Anything else, including ``"-"`` for "no choice", is
    returned as the cleaned text.

    Args:
        x: The raw extracted choice. Non-string values are returned unchanged
            instead of raising, so a missing extraction simply fails to match.

    Returns:
        The normalized choice, or ``x`` itself when it is not a string.
    """
    if not isinstance(x, str):
        return x
    # Whitespace is stripped so replies such as " B" or "B\n" still match.
    x = x.split('.')[0].strip()
    # isdecimal() (unlike isnumeric()) only accepts characters int() can parse,
    # so inputs such as superscript digits no longer raise ValueError.
    if x.isdecimal():
        index = int(x)
        # Only 1..26 correspond to a letter; other numbers are left untouched
        # rather than being mapped to non-letter characters.
        if 1 <= index <= 26:
            return chr(ord('A') + index - 1)
    return x

# Accuracy: the mean over rows of whether the normalized extracted choice
# equals the value in the 'correct' column.
df.apply(lambda row: safe_equal(process_choice(row['extracted_choice']), row['correct']), axis=1).mean()