In [None]:
import json
import random
from PIL import Image
import pandas as pd
import time
import traceback
import base64
from openai import OpenAI
import os
import mimetypes
from typing import List, Dict, Any, Tuple

In [None]:
# Shared OpenAI client used by query_gpt_with_retry.
# Prefer the OPENAI_API_KEY environment variable so the secret never has to be
# written into (and accidentally committed with) this file; the literal
# placeholder is kept as the fallback for users who paste their key in here.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", 'Add your API key here'))

In [None]:
# Experiment-wide settings.
NUM_RUNS = 10  # repetitions of the full question set per randomization condition (run_experiment)
MAX_RETRIES = 3  # attempts per API request before giving up (query_gpt_with_retry)
RETRY_DELAY = 10  # seconds to sleep between failed attempts
TEMPERATURE = 0  # default `temperature` passed to the chat completion request
MAX_TOKENS = 300  # default `max_tokens` passed to the chat completion request

In [None]:
# Instruction text for the VLAT run; it is appended after the question and its
# numbered options when the final prompt is built. It tells the model to reply
# with "Answer:" / "Explanation:" sections and to select "Omit" instead of guessing.
VLAT_PROMPT = """I am about to show you an image and ask you a multiple choice question about that image. 
Please structure your response in the following format:
Answer: [Enter the exact text of your chosen option]
Explanation: [Provide your reasoning]
Select the BEST answer, based only on the chart and not external knowledge. DO NOT GUESS.
If you are not sure about your answer or your answer is based on a guess, select "Omit".
Choose your answer ONLY from the provided options."""

# Instruction text for the CALVI run. Same response format as VLAT_PROMPT, but it
# allows more than one chosen option and has no "Omit" / "DO NOT GUESS" instruction.
CALVI_PROMPT = """I am about to show you an image and ask you a multiple choice question about that image. 
Please structure your response in the following format:
Answer: [Enter the exact text of your chosen option(s)]
Explanation: [Provide your reasoning]
Select the BEST answer(s), based only on the chart and not external knowledge.
Choose your answer(s) ONLY from the provided options."""

In [None]:
def load_questions(file_path):
    """Load the question list from a JSON test file.

    Args:
        file_path: Path to a JSON file whose top-level object has a
            ``"questions"`` key.

    Returns:
        The value stored under ``"questions"``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
        KeyError: If the top-level object has no ``"questions"`` key.
    """
    # Pin the encoding: without it open() uses the platform locale, so
    # non-ASCII question/option text could be decoded differently (or fail)
    # depending on the machine the experiment runs on.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data['questions']

def get_image_mime_type(image_path):
    """Return the MIME type guessed from *image_path*.

    Falls back to ``'image/png'`` when ``mimetypes.guess_type`` cannot
    determine a type.
    """
    guessed = mimetypes.guess_type(image_path)[0]
    return guessed or 'image/png'

def encode_image(image_path):
    """Read the file at *image_path* and return its bytes as Base64 text."""
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

def query_gpt_with_retry(prompt, image_path, temperature=TEMPERATURE, max_tokens=MAX_TOKENS):
    """Send *prompt* plus the image to the "gpt-4o" chat model, retrying on failure.

    The image file is read and Base64-encoded once, then embedded in the
    request as a ``data:`` URL. The request is attempted up to MAX_RETRIES
    times, sleeping RETRY_DELAY seconds after each failed attempt except
    the last.

    Args:
        prompt: Text part of the user message.
        image_path: Path of the image file attached to the message.
        temperature: Value forwarded as the request's ``temperature``.
        max_tokens: Value forwarded as the request's ``max_tokens``.

    Returns:
        The stripped text of the first choice, or the string
        ``"Error: Max retries reached"`` once every attempt has failed.
        (If MAX_RETRIES is not positive the loop never runs and the
        function falls through, returning None.)
    """
    base64_image = encode_image(image_path)
    mime_type = get_image_mime_type(image_path)

    # The payload does not change between attempts, so build it once.
    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": prompt},
            {
                "type": "image_url",
                "image_url": {"url": f"data:{mime_type};base64,{base64_image}"},
            },
        ],
    }
    final_attempt = MAX_RETRIES - 1

    for attempt in range(MAX_RETRIES):
        try:
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[user_message],
                max_tokens=max_tokens,
                temperature=temperature,
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            print(f"Error occurred (Attempt {attempt + 1}/{MAX_RETRIES}): {e!s}")
            if attempt >= final_attempt:
                print("Max retries reached. Skipping this question.")
                return "Error: Max retries reached"
            print(f"Retrying in {RETRY_DELAY} seconds...")
            time.sleep(RETRY_DELAY)

# NOTE(review): a second function with this same name is defined further down
# in this file. Python binds the name to the most recent definition, so once
# that later definition has executed, calls to extract_answer_and_explanation
# no longer reach this version.
def extract_answer_and_explanation(gpt_answer, options):
    """Split a response on 'Why:' and map the answer part onto *options*.

    Args:
        gpt_answer: Response text; everything before the first 'Why:' is
            treated as the answer, everything after it as the explanation.
        options: Candidate option strings.

    Returns:
        A ``(answers, explanation)`` tuple. ``answers`` is the list of
        matched options, or a one-element list holding the raw answer text
        when nothing matched. ``explanation`` is "No explanation provided"
        when the response contains no 'Why:'.
    """
    parts = gpt_answer.split('Why:', 1)
    chosen_option = parts[0].strip()
    explanation = parts[1].strip() if len(parts) > 1 else "No explanation provided"
    
    # Drop leading/trailing asterisks (e.g. markdown bold) around the answer.
    chosen_option = chosen_option.strip('*').strip()
    
    # A purely numeric answer is read as a 1-based index into options.
    if chosen_option.isdigit():
        index = int(chosen_option) - 1
        if 0 <= index < len(options):
            return [options[index]], explanation
    
    # Case-insensitive matching: an option is kept when its full text occurs
    # in the answer, or when any single whitespace-separated word of the
    # option occurs in the answer.
    matches = []
    for opt in options:
        if opt.lower() in chosen_option.lower():
            matches.append(opt)
        elif any(word.lower() in chosen_option.lower() for word in opt.split()):
            matches.append(opt)
    
    # The conditional binds tighter than the comma: this returns the tuple
    # (matches or [chosen_option], explanation).
    return matches if matches else [chosen_option], explanation

def evaluate_answer(correct_answer, gpt_answer):
    """Return True when any extracted answer matches the answer key.

    Comparison is case-insensitive and ignores surrounding whitespace.

    Args:
        correct_answer: Answer key as a string; several accepted answers
            may be separated by commas.
        gpt_answer: Iterable of answer strings extracted from the model's
            response.

    Returns:
        True if at least one element of *gpt_answer* equals one of the
        comma-separated accepted answers or the whole answer key.
    """
    def _normalize(text):
        return text.strip().lower()

    correct_answers = {_normalize(part) for part in correct_answer.split(',')}
    # An option's own text may contain a comma (e.g. "10,000"). Splitting
    # alone would turn it into fragments that can never equal the option, so
    # the unsplit key is accepted as well.
    correct_answers.add(_normalize(correct_answer))
    # Empty fragments (trailing comma, blank key) must not count as a match.
    correct_answers.discard('')

    gpt_answers = {_normalize(answer) for answer in gpt_answer}
    gpt_answers.discard('')

    return bool(correct_answers & gpt_answers)

def extract_answer_and_explanation(gpt_answer: str, options: List[str]) -> Tuple[List[str], str]:
    """Extract the chosen option and the explanation from a model response.

    The response is searched case-insensitively for an answer marker
    (e.g. "Answer:") and an explanation marker ("Explanation:" or "Why:").
    The answer text is then cleaned up and mapped onto *options*.

    Args:
        gpt_answer: Raw response text. May be empty or None.
        options: Option strings that were offered to the model.

    Returns:
        A ``(answers, explanation)`` tuple where ``answers`` is:

        * ``[]`` when the response is empty (explanation is then
          "No response provided");
        * a single matched option (exact match first, then substring match);
        * ``["Omit"]`` when no option matched but the answer mentions "omit";
        * the cleaned answer text itself when nothing matched and it is
          longer than one character;
        * ``["No valid answer extracted"]`` otherwise.

        ``explanation`` is the text belonging to the explanation marker, or
        an empty string when no answer marker or no explanation marker is
        present.
    """
    if not gpt_answer:
        return [], "No response provided"

    # Markers are compared against the lowercased response, so one lowercase
    # spelling of each is enough. List order is the priority order.
    answer_markers = [
        "answer:",
        "the best answer based on the information provided in the image is:",
        "the best answer based on the chart is:",
        "the best answer is:",
    ]
    explanation_markers = ["explanation:", "why:"]

    lower_response = gpt_answer.lower()

    # Locate the answer section.
    answer_marker_pos = -1
    answer_start = -1
    for marker in answer_markers:
        position = lower_response.find(marker)
        if position >= 0:
            answer_marker_pos = position
            answer_start = position + len(marker)
            break

    chosen_option = ""
    explanation = ""

    if answer_start >= 0:
        answer_end = len(gpt_answer)
        # Look for the explanation AFTER the answer first. Searching the whole
        # response could pick up a marker that precedes the answer, which
        # would make the answer slice empty.
        for marker in explanation_markers:
            position = lower_response.find(marker, answer_start)
            if position >= 0:
                answer_end = position
                explanation = gpt_answer[position + len(marker):].strip()
                break
        else:
            # No explanation after the answer: accept one written before it.
            for marker in explanation_markers:
                position = lower_response.find(marker, 0, answer_marker_pos)
                if position >= 0:
                    explanation = gpt_answer[position + len(marker):answer_marker_pos].strip()
                    break
        chosen_option = gpt_answer[answer_start:answer_end].strip()
    else:
        # Fallback: use the first line as the answer unless it is long enough
        # to look like an explanation.
        first_line = gpt_answer.split('\n')[0].strip()
        if len(first_line) < 100:
            chosen_option = first_line

    chosen_option = chosen_option.strip()

    # Remove option numbering ("3) Answer" -> "Answer"). Only a purely numeric
    # prefix counts, so answers such as "2010 (est.) value" stay intact.
    numbering, separator, remainder = chosen_option.partition(') ')
    if separator and numbering.isdigit():
        chosen_option = remainder.strip()

    # Remove surrounding quotes.
    chosen_option = chosen_option.strip('"\'')

    chosen_lower = chosen_option.lower()
    normalized_options = [(opt, opt.lower().strip()) for opt in options]

    # Exact match (case-insensitive) wins outright.
    exact_target = chosen_lower.strip()
    for opt, key in normalized_options:
        if key == exact_target:
            return [opt], explanation

    # Otherwise accept options contained in the answer text. Blank options are
    # skipped because the empty string is contained in everything.
    partial_matches = [
        (opt, key) for opt, key in normalized_options if key and key in chosen_lower
    ]
    matched_keys = {key for _, key in partial_matches}
    for opt, key in partial_matches:
        # Skip an option that only matched because it is a fragment of another
        # matched option (e.g. "10" inside "100"); of the remaining matches the
        # first in option order is returned.
        if not any(key != other and key in other for other in matched_keys):
            return [opt], explanation

    # Handle the "Omit" case for VLAT.
    if "omit" in chosen_lower:
        return ["Omit"], explanation

    # No option matched: hand back the raw answer if there is a usable one.
    if chosen_option and len(chosen_option) > 1:
        return [chosen_option], explanation

    return ["No valid answer extracted"], explanation

def generate_structured_prompt(question: Dict, test_type: str, base_prompt: str) -> str:
    """Build the full prompt text for one question.

    The result is the question, a 1-based numbered list of its options, a
    blank line, and then *base_prompt*.

    Args:
        question: Mapping with a ``'question'`` string and an ``'options'`` list.
        test_type: Accepted for the callers' benefit; not used when building
            the text.
        base_prompt: Instruction text placed after the options.

    Returns:
        The assembled prompt string.
    """
    choices = question['options'].copy()
    header = f"Question: {question['question']}\n\nOptions:\n"
    numbered_lines = [f"{number}) {choice}\n" for number, choice in enumerate(choices, 1)]
    return header + "".join(numbered_lines) + f"\n{base_prompt}"

def evaluate_visualization_literacy(test_name: str, questions: List[Dict], prompt: str,
                                 randomize_options: bool = False,
                                 randomize_questions: bool = False) -> List[Dict]:
    """Ask the model every question once and score each response.

    For each question the prompt is built, the model is queried with the
    question's image, the answer is extracted and compared with the answer
    key, and progress is printed. The function sleeps one second between
    questions.

    Args:
        test_name: Label stored in each result and used in progress output.
        questions: Question dicts; each must provide 'question', 'options',
            'image_path' and 'correct_answer'. 'Task', 'Chart_type',
            'Misleader' and 'wrong_due_to_misleader' are optional.
        prompt: Instruction text appended to every question prompt.
        randomize_options: Shuffle a copy of each question's options.
        randomize_questions: Shuffle a copy of the question list.

    Returns:
        One result dict per question, in the order the questions were asked.
    """
    question_queue = questions.copy()
    if randomize_questions:
        random.shuffle(question_queue)

    total = len(question_queue)
    option_mode = 'Randomized' if randomize_options else 'Not Randomized'
    question_mode = 'Randomized' if randomize_questions else 'Not Randomized'

    records = []
    for position, item in enumerate(question_queue, 1):
        print(f"\nProcessing {test_name} question {position}/{total}")
        print(f"Conditions: Options {option_mode}, Questions {question_mode}")

        # Work on a copy so shuffling never touches the caller's data.
        choices = item['options'].copy()
        if randomize_options:
            random.shuffle(choices)

        full_prompt = generate_structured_prompt(
            question={"question": item['question'], "options": choices},
            test_type=test_name,
            base_prompt=prompt,
        )

        started = time.time()
        raw_reply = query_gpt_with_retry(full_prompt, item['image_path'])
        elapsed = time.time() - started

        print(f"Time taken: {elapsed:.2f} seconds")
        print(f"Raw GPT response: {raw_reply}")

        selected, reasoning = extract_answer_and_explanation(raw_reply, choices)
        correct = evaluate_answer(item['correct_answer'], selected)

        choices_text = ', '.join(choices)
        selected_text = ', '.join(selected)

        records.append({
            'test_name': test_name,
            'question': item['question'],
            'options': choices_text,
            'correct_answer': item['correct_answer'],
            'gpt_answer': selected_text,
            'explanation': reasoning,
            'raw_response': raw_reply,
            'Task': item.get('Task', ''),
            'Chart_type': item.get('Chart_type', ''),
            'Misleader': item.get('Misleader', ''),
            'wrong_due_to_misleader': item.get('wrong_due_to_misleader', ''),
            'is_correct': correct,
            'randomized_options': randomize_options,
            'randomized_questions': randomize_questions,
            'image_path': item['image_path'],
            'time_taken': elapsed,
        })

        print(f"Question: {item['question']}")
        print(f"Options: {choices_text}")
        print(f"GPT's answer: {selected_text}")
        print(f"GPT's explanation: {reasoning}")
        print(f"Correct answer: {item['correct_answer']}")
        print(f"Result: {'Correct' if correct else 'Incorrect'}")

        # Short pause between consecutive requests.
        time.sleep(1)

    return records

def run_experiment(test_name: str, file_path: str, prompt: str):
    """Run one test under all four randomization conditions and report results.

    Each condition is repeated NUM_RUNS times. Every run is written to its
    own CSV file, and all runs together are written to
    ``gpt_<test_name lowercased>_all_results.csv``. Accuracy and timing
    summaries are printed per condition, followed by accuracy tables grouped
    by task, chart type and misleader.

    Args:
        test_name: Label used in output and in the CSV file names.
        file_path: JSON file holding the questions.
        prompt: Instruction text appended to every question prompt.
    """
    print(f"\nStarting {test_name} experiment...")
    questions = load_questions(file_path)
    file_prefix = f'gpt_{test_name.lower()}'

    # (randomize_options, randomize_questions, condition label)
    conditions = (
        (False, False, "No_Randomization"),
        (True, False, "Randomized_Options"),
        (False, True, "Randomized_Questions"),
        (True, True, "Both_Randomized"),
    )

    all_results = []

    for randomize_options, randomize_questions, condition_name in conditions:
        print(f"\n=== Running {test_name} - {condition_name} ===")

        for run in range(1, NUM_RUNS + 1):
            print(f"\n--- Run {run}/{NUM_RUNS} ---")
            run_results = evaluate_visualization_literacy(
                test_name,
                questions,
                prompt,
                randomize_options=randomize_options,
                randomize_questions=randomize_questions,
            )

            # Tag every row with the condition and run it came from.
            for row in run_results:
                row.update(condition=condition_name, run=run)

            all_results.extend(run_results)

            # Persist this run on its own.
            df_run = pd.DataFrame(run_results)
            df_run.to_csv(f'{file_prefix}_{condition_name}_run_{run}.csv', index=False)

            score = (df_run['is_correct'].sum() / len(df_run)) * 100
            print(f"\nScore for {condition_name} Run {run}: {score:.2f}%")

    combined_df = pd.DataFrame(all_results)

    print("\n=== Overall Results ===")
    for condition in combined_df['condition'].unique():
        condition_df = combined_df[combined_df['condition'] == condition]
        per_question = condition_df.groupby('question')['is_correct'].mean()
        durations = condition_df['time_taken']

        print(f"\n{condition}:")
        print(f"Mean accuracy: {condition_df['is_correct'].mean() * 100:.2f}%")
        print(f"Best question accuracy: {per_question.max() * 100:.2f}%")
        print(f"Worst question accuracy: {per_question.min() * 100:.2f}%")
        print(f"Average time per question: {durations.mean():.2f} seconds")
        print(f"Fastest question: {durations.min():.2f} seconds")
        print(f"Slowest question: {durations.max():.2f} seconds")

    def show_breakdown(heading, column):
        # Mean accuracy per value of `column`, one table column per condition.
        print(heading)
        print(combined_df.groupby([column, 'condition'])['is_correct'].mean().unstack())

    show_breakdown("\nAverage Statistics by Task:", 'Task')
    show_breakdown("\nAverage Statistics by Chart Type:", 'Chart_type')
    if 'Misleader' in combined_df.columns:
        show_breakdown("\nAverage Statistics by Misleader Type:", 'Misleader')

    combined_path = f'{file_prefix}_all_results.csv'
    combined_df.to_csv(combined_path, index=False)
    print(f"\nCombined results saved to {combined_path}")

In [None]:
# Script entry point: run the VLAT experiment, then the CALVI experiment.
# Any exception stops the remaining work and is reported with its traceback.
if __name__ == "__main__":
    try:
        planned_experiments = (
            ("VLAT", "vlat_skip.json", VLAT_PROMPT),
            ("CALVI", "calvi.json", CALVI_PROMPT),
        )
        for experiment_name, questions_file, base_prompt in planned_experiments:
            run_experiment(experiment_name, questions_file, base_prompt)
    except Exception as e:
        print(f"Fatal error: {e!s}")
        traceback.print_exc()