In [1]:
import json
import os
import re

import matplotlib.pyplot as plt
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI

from tools import tools
# Load environment variables
load_dotenv()

# Load the GAIA dataset locally.
# The encoding is pinned so the platform default cannot garble non-ASCII question
# text (assumes the metadata file is UTF-8 - confirm against the dataset), and
# blank lines are skipped because json.loads would raise on them.
with open('2023/validation/metadata.jsonl', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

# Set up OpenAI API client
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


In [2]:

def get_agent_answer(question, context="", additional_prompt="", model="gpt-4o-mini"):
    """Ask the chat model a single question and return its raw text reply.

    Args:
        question: The question text inserted after ``Question:`` in the prompt.
        context: Optional text inserted after ``Context:`` (e.g. file contents).
        additional_prompt: Optional extra guidance appended after the question.
        model: Name of the chat model to query; defaults to ``"gpt-4o-mini"``.

    Returns:
        The content of the first choice in the response, or the string
        ``"No response received from the model."`` when the response has no
        choices or the first choice carries no text content.
    """
    no_response = "No response received from the model."

    prompt = f"""You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma-separated list, apply the above rules depending on whether the element is a number or a string. Answer the following question as best you can, speaking as a general AI assistant. You have access to the following tools:

Search: useful for when you need to answer questions about current events or the current state of the world
ReadImage: useful for reading image files. Input should be the file path to the image.
ReadExcel: useful for reading Excel files. Input should be the file path to the Excel file.
ReadCSV: useful for reading CSV files. Input should be the file path to the CSV file.
ReadZIP: useful for reading ZIP files. Input should be the file path to the ZIP file.
ReadPDF: useful for reading PDF files. Input should be the file path to the PDF file.
ReadJSON: useful for reading JSON files. Input should be the file path to the JSON file.
ReadPython: useful for reading Python files. Input should be the file path to the Python file.
ReadDOCX: useful for reading DOCX files. Input should be the file path to the DOCX file.
ReadPPTX: useful for reading PPTX files. Input should be the file path to the PPTX file.
ReadAudio: useful for reading audio files (MP3, WAV). Input should be the file path to the audio file.
ReadPDB: useful for reading PDB files. Input should be the file path to the PDB file.
ReadJSONLD: useful for reading JSON-LD files. Input should be the file path to the JSON-LD file.
ReadTXT: useful for reading TXT files. Input should be the file path to the TXT file.

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of the tool names
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

Begin Remember to speak as a general AI assistant when giving your final answer.

Context: {context}

Question: {question}

{additional_prompt}
"""
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )

    if not response.choices:
        return no_response

    # The message content may be None; callers do string operations on the
    # result, so hand back the same fallback text instead of None.
    content = response.choices[0].message.content
    return content if content is not None else no_response


In [3]:

def select_test_case():
    """List every loaded test case and return the one the user picks.

    Prints a 1-based menu built from the module-level ``data`` list, then keeps
    prompting until the user types a number inside the valid range.

    Returns:
        The element of ``data`` corresponding to the chosen menu number.
    """
    print("Available test cases:")
    for number, entry in enumerate(data, start=1):
        print(f"{number}. {entry['task_id']}: {entry['Question']}")

    while True:
        try:
            # The menu is 1-based; convert to a 0-based list index.
            index = int(input("Enter the number of the test case you want to evaluate: ")) - 1
        except ValueError:
            print("Please enter a valid number.")
            continue

        if index < 0 or index >= len(data):
            print("Invalid selection. Please try again.")
            continue

        return data[index]

def compare_answers(test_case, agent_full_answer):
    """Extract the model's final answer and compare it with the expected one.

    Args:
        test_case: Mapping holding the expected answer under ``'Final answer'``.
        agent_full_answer: The complete text returned by the model.

    Returns:
        A tuple ``(agent_full_answer, final_answer, is_correct)`` where
        ``final_answer`` is the text after the last "final answer:" marker
        (or the whole reply when no marker is present) and ``is_correct`` is
        the case-insensitive, whitespace-trimmed equality with the expected
        answer.
    """
    print("\nComparing Answers:")
    print("LLM Full Answer:")
    print(agent_full_answer)

    # The prompt requests "FINAL ANSWER:" in its template but also shows
    # "Final Answer:" in its format example, so accept any capitalisation.
    parts = re.split(r"final answer:", agent_full_answer, flags=re.IGNORECASE)
    final_answer = parts[-1].strip() if len(parts) > 1 else agent_full_answer

    print("\nLLM Final Answer:", final_answer)
    print("Correct Answer:", test_case['Final answer'])

    is_correct = final_answer.lower().strip() == test_case['Final answer'].lower().strip()
    print("Is the answer correct?", "Yes" if is_correct else "No")

    return agent_full_answer, final_answer, is_correct

# Lower-cased file extension -> name of the reader tool that handles it.
_TOOL_BY_EXTENSION = {
    '.png': "ReadImage",
    '.jpg': "ReadImage",
    '.jpeg': "ReadImage",
    '.gif': "ReadImage",
    '.bmp': "ReadImage",
    '.xlsx': "ReadExcel",
    '.xls': "ReadExcel",
    '.csv': "ReadCSV",
    '.zip': "ReadZIP",
    '.pdf': "ReadPDF",
    '.json': "ReadJSON",
    '.py': "ReadPython",
    '.docx': "ReadDOCX",
    '.pptx': "ReadPPTX",
    '.mp3': "ReadAudio",
    '.wav': "ReadAudio",
    '.pdb': "ReadPDB",
    '.jsonld': "ReadJSONLD",
    '.txt': "ReadTXT",
}


def handle_file_reading(test_case):
    """Return the textual context for the file attached to a test case.

    The attachment is looked up under ``2023/validation``. If its extension
    maps to a reader tool that is registered in ``tools``, that tool is called
    with the file path; otherwise the file is read as UTF-8 text.

    Args:
        test_case: Mapping that may carry the attachment name under
            ``'file_name'``.

    Returns:
        The tool output or file text, or ``""`` when the test case has no
        attachment, the file does not exist, or the file has no reader tool
        and cannot be decoded as UTF-8.
    """
    file_name = test_case.get('file_name')
    if not file_name:
        return ""

    file_path = os.path.join('2023', 'validation', file_name)
    if not os.path.exists(file_path):
        print(f"Dependent file {file_name} not found.")
        return ""

    ext = os.path.splitext(file_path)[1].lower()
    tool = _TOOL_BY_EXTENSION.get(ext)

    if tool and tool in tools:
        context = tools[tool](file_path)
        if tool == "ReadAudio":
            # Presumably the audio tool returns a JSON string that may include a
            # 'transcription' field - verify against the tools module.
            try:
                audio_data = json.loads(context)
                transcription = audio_data.get('transcription', '')
                context += f"\nTranscription: {transcription}"
            except json.JSONDecodeError:
                print("Failed to decode audio metadata.")
        return context

    # No usable tool: fall back to plain text. Binary attachments would raise
    # UnicodeDecodeError here, so report it and continue without context
    # rather than aborting the whole evaluation.
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except UnicodeDecodeError:
        print(f"Dependent file {file_name} could not be read as UTF-8 text.")
        return ""

def _run_evaluation(test_case, additional_prompt=""):
    """Query the model for one test case and package the graded result."""
    context = handle_file_reading(test_case)
    agent_full_answer = get_agent_answer(test_case['Question'], context, additional_prompt)
    llm_full_answer, llm_final_answer, is_correct = compare_answers(test_case, agent_full_answer)

    return {
        'task_id': test_case['task_id'],
        'question': test_case['Question'],
        'llm_full_answer': llm_full_answer,
        'llm_final_answer': llm_final_answer,
        'correct_answer': test_case['Final answer'],
        'is_correct': is_correct
    }


def evaluate(test_case):
    """Run a test case through the model and grade the answer.

    Args:
        test_case: Mapping with ``'task_id'``, ``'Question'`` and
            ``'Final answer'`` entries (and optionally an attached file).

    Returns:
        A dict with the keys ``task_id``, ``question``, ``llm_full_answer``,
        ``llm_final_answer``, ``correct_answer`` and ``is_correct``.
    """
    print("Evaluating...")
    return _run_evaluation(test_case)


def re_evaluate(test_case, additional_prompt):
    """Run a test case again with extra guidance appended to the prompt.

    Args:
        test_case: Same mapping as accepted by ``evaluate``.
        additional_prompt: Extra instructions forwarded to the model.

    Returns:
        A result dict with the same keys as the one produced by ``evaluate``.
    """
    print("Re-evaluating...")
    return _run_evaluation(test_case, additional_prompt)

def generate_report(feedback):
    """Print overall accuracy, draw a correct/incorrect bar chart and list results.

    Args:
        feedback: List of result dicts; each must provide ``task_id``,
            ``question``, ``llm_final_answer``, ``correct_answer`` and
            ``is_correct``.

    Returns:
        None. When ``feedback`` is empty only a notice is printed.
    """
    print("\nEvaluation Report")
    if not feedback:
        print("No evaluations have been performed yet.")
        return

    df = pd.DataFrame(feedback)
    accuracy = df['is_correct'].mean()
    print(f"Overall Accuracy: {accuracy:.2%}")

    # value_counts() orders rows by frequency, not by value, so positional
    # colors/labels would be swapped whenever correct answers dominate.
    # Sort by index (False before True) and derive both from the value itself.
    counts = df['is_correct'].value_counts().sort_index()
    labels = ['Correct' if flag else 'Incorrect' for flag in counts.index]
    colors = ['green' if flag else 'red' for flag in counts.index]

    fig, ax = plt.subplots()
    ax.bar(labels, counts.tolist(), color=colors, width=0.5)
    ax.set_xlabel("Correctness")
    ax.set_ylabel("Count")
    ax.set_title("Correct vs Incorrect Answers")
    plt.show()

    print("\nDetailed Feedback:")
    print(df[['task_id', 'question', 'llm_final_answer', 'correct_answer', 'is_correct']])


In [4]:

def main():
    """Run the interactive menu loop for evaluating test cases.

    Option 1 evaluates a user-selected test case and, when the answer is
    wrong, offers one retry with extra guidance whose result replaces the
    first attempt. Option 2 prints the report for everything collected so
    far. Option 3 leaves the loop.
    """
    results = []
    menu_lines = (
        "\nLLM Evaluation Tool",
        "1. Evaluate a test case",
        "2. Generate report",
        "3. Exit",
    )

    while True:
        for line in menu_lines:
            print(line)

        selection = input("Enter your choice (1-3): ")

        if selection == '3':
            print("Exiting the program.")
            break

        if selection == '2':
            generate_report(results)
            continue

        if selection != '1':
            print("Invalid choice. Please try again.")
            continue

        case = select_test_case()
        outcome = evaluate(case)
        results.append(outcome)

        if outcome['is_correct']:
            continue

        # A wrong answer gets one optional retry; the retry overwrites the
        # entry just recorded so each selection counts once in the report.
        hint = input("Enter additional guidance for the LLM (or press Enter to skip): ")
        if hint:
            results[-1] = re_evaluate(case, hint)

# Start the interactive menu only when this code is executed directly
# (the guard keeps it from running if the code is imported as a module).
if __name__ == "__main__":
    main()


LLM Evaluation Tool
1. Evaluate a test case
2. Generate report
3. Exit
Invalid choice. Please try again.

LLM Evaluation Tool
1. Evaluate a test case
2. Generate report
3. Exit
Invalid choice. Please try again.

LLM Evaluation Tool
1. Evaluate a test case
2. Generate report
3. Exit
Available test cases:
1. c61d22de-5f6c-4958-a7f6-5e9707bd3466: A paper about AI regulation that was originally submitted to arXiv.org in June 2022 shows a figure with three axes, where each axis has a label word at both ends. Which of these words is used to describe a type of society in a Physics and Society article submitted to arXiv.org on August 11, 2016?
2. 17b5a6a3-bc87-42e8-b0fb-6ab0781ef2cc: I’m researching species that became invasive after people who kept them as pets released them. There’s a certain species of fish that was popularized as a pet by being the main character of the movie Finding Nemo. According to the USGS, where was this fish found as a nonnative species, before the year 2020? I nee