In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported modules
# (e.g. the local order_dependency_problem package) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [22]:
import collections
import copy
import json
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt

import order_dependency_problem
from order_dependency_problem.dataset import ArcDataset
from order_dependency_problem.quesiton_answering import (
    answer_question, answer_multiple_questions
)
from order_dependency_problem.evaluation import (
    calculate_answer_prevalence, calculate_accuracy, calculate_answer_recall
)

In [3]:
# Models compared in every experiment below.
MODEL_NAMES = ["gpt-4o-mini", "gpt-3.5-turbo"]

# The ARC file is located relative to the package: three parent directories up
# from the package's __file__, then data/arc/.
ARC_DATASET_PATH = (
    Path(order_dependency_problem.__file__).parents[2]
    / "data"
    / "arc"
    / "ARC-Challenge-Test.jsonl"
)

# Collects the questions, answers and metrics of each experiment; the last
# cell of the notebook dumps it to report.json.
output_json = {}

# Load dataset
We evaluate on a sample of 100 questions drawn from the ARC-Challenge test set.

In [8]:
# Load a 100-question sample from the ARC file. The fixed seed presumably keeps
# the sample identical across runs -- TODO confirm in ArcDataset.load_from_file.
arc_dataset = ArcDataset.load_from_file(ARC_DATASET_PATH, num_samples=100, seed=1000)

In [9]:
# Question id -> text of the choice flagged as correct. If a question flags
# more than one choice, the last flagged choice is the one that is kept.
correct_answers = {
    question.id: choice.text
    for question in arc_dataset.questions
    for choice in question.choices
    if choice.is_correct_answer
}

# Explore the original dataset

In [10]:
output_json["original dataset"] = {
    "questions": [question.dict() for question in arc_dataset.questions],
    "results": {},
}

ground_truth_counts = collections.defaultdict(lambda: 0)
for question in arc_dataset.questions:
    for choice in question.choices:
        if choice.is_correct_answer:
            ground_truth_counts[choice.label] += 1

option_level_data = []
overall_metrics = []
for model_name in MODEL_NAMES:
    output_json["original dataset"]["results"][model_name] = {}
    responses = await answer_multiple_questions(
        arc_dataset.questions,
        model_name=model_name,
        batch_size=10,
        label_removed=False,
        verbose=True,
    )
    answers = [response.content for response in responses]
    output_json["original dataset"]["results"][model_name]["answers"] = answers

    # Option level metrics
    answer_prevalences = calculate_answer_prevalence(
        arc_dataset.questions, answers, label_removed=False
    )
    
    answer_recalls = calculate_answer_recall(
        arc_dataset.questions, answers, label_removed=False
    )
    option_level_data.extend(
        [(model_name, key, ground_truth_count, answer_prevalences.get(key, 0), answer_recalls.get(key))
         for key, ground_truth_count in ground_truth_counts.items()]
    )
    
    # Overall metrics
    accuracy = calculate_accuracy(
        questions=arc_dataset.questions,
        answers=answers,
    )
    recall_std = np.array(list(answer_recalls.values())).std()
    overall_metrics.append((model_name, accuracy, recall_std))
    output_json["original dataset"]["results"][model_name]["accuracy"] = accuracy
    output_json["original dataset"]["results"][model_name]["recall_std"] = recall_std


option_level_metrics_df = pd.DataFrame(
    option_level_data,
    columns=["model", "label", "ground_truth_count", "answer_prevalence", "answer_recall"]
).sort_values(by=["model", "label"])
                           
overall_metrics_df = pd.DataFrame(
    overall_metrics,
    columns=["model", "accuracy", "recall_std"],
).sort_values(by="model")

display(option_level_metrics_df)
display(overall_metrics_df)


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:13<00:00,  1.38s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:09<00:00,  1.08it/s]


Unnamed: 0,model,label,ground_truth_count,answer_prevalence,answer_recall
7,gpt-3.5-turbo,A,19,0.18,0.894737
4,gpt-3.5-turbo,B,30,0.32,0.833333
6,gpt-3.5-turbo,C,24,0.25,0.791667
5,gpt-3.5-turbo,D,27,0.25,0.703704
3,gpt-4o-mini,A,19,0.23,1.0
0,gpt-4o-mini,B,30,0.31,0.9
2,gpt-4o-mini,C,24,0.24,0.875
1,gpt-4o-mini,D,27,0.22,0.777778


Unnamed: 0,model,accuracy,recall_std
1,gpt-3.5-turbo,0.8,0.069446
0,gpt-4o-mini,0.88,0.079066


Exploratory analysis shows:
* Answer prevalence has a strong correlation with ground truth prevalence. We will not use it in the following analysis.
* `gpt-4o-mini` has higher accuracy and a higher recall standard deviation than `gpt-3.5-turbo`, which suggests that `gpt-4o-mini` may have a more severe order dependency problem.
* Judging by the answer recalls, both `gpt-3.5-turbo` and `gpt-4o-mini` appear to prefer "Option A" and dislike "Option D".

# Answer-moving attack
We will experiment with moving all ground truths to a specific position (A/B/C/D) and check how this impacts answer accuracy. A large accuracy fluctuation across positions indicates a strong order dependency problem (ODP).


In [11]:
data = {
    "model": MODEL_NAMES,
    "original": [
        overall_metrics_df[overall_metrics_df.model == model_name].iloc[0]["accuracy"]
        for model_name in MODEL_NAMES
    ],
}

output_json["answer-moving attack"] = {}

for gt_label in "ABCD":
    output_json["answer-moving attack"][gt_label] = {}
    questions = arc_dataset.move_ground_truth_to_option(gt_label)
    output_json["answer-moving attack"][gt_label]["questions"] = [question.dict() for question in questions]
    output_json["answer-moving attack"][gt_label]["results"] = {}
    data[gt_label] = []
    for model_name in MODEL_NAMES:
        output_json["answer-moving attack"][gt_label]["results"][model_name] = {}
        responses = await answer_multiple_questions(
            questions=questions,
            model_name=model_name,
            batch_size=10,
            label_removed=False,
            verbose=True,
        )
        answers = [response.content for response in responses]
        accuracy = calculate_accuracy(questions, answers, label_removed=False)
        output_json["answer-moving attack"][gt_label]["results"][model_name]["answers"] = answers
        output_json["answer-moving attack"][gt_label]["results"][model_name]["accuracy"] = accuracy
        data[gt_label].append(accuracy)

answer_moving_attack_df = pd.DataFrame(data)
display(answer_moving_attack_df)

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:13<00:00,  1.36s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:09<00:00,  1.10it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:13<00:00,  1.34s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:13<00:00,  1.34s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:13<00:00,  1.37s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:10<00:00,  1.06s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:12<00:00,  1.27s/it]
100%|████████████████████████████████████████████████████████████████

Unnamed: 0,model,original,A,B,C,D
0,gpt-4o-mini,0.88,0.95,0.92,0.88,0.84
1,gpt-3.5-turbo,0.8,0.86,0.82,0.81,0.75


The accuracy fluctuations in answer-moving attacks show:
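* Both models show accuracy fluctuations of a similar size (about 0.11 between the best and the worst position) and the same ranking of positions (A > B > C > D).
* For both models, moving the ground truths to A gives the largest accuracy increase and moving them to D gives the largest accuracy decrease. This is consistent with the answer recall distributions shown in the previous step.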

# Shuffle option contents
Now let's experiment with shuffling the option contents while keeping the option IDs in their original order.

In [16]:
data = {
    "model": [],
    "accuracy": [],
    "recall_std": [],
}

recalls = []

questions = arc_dataset.generate_samples(
    shuffle_contents=True,
    shuffle_labels=False,
    seed=1000,
)

output_json["shuffle option contents"] = {
    "questions": [question.dict() for question in questions],
    "results": {}
}

for model_name in MODEL_NAMES:
    data["model"].append(model_name)
    output_json["shuffle option contents"]["results"][model_name] = {}
    responses = await answer_multiple_questions(
        questions=questions,
        model_name=model_name,
        batch_size=10,
        label_removed=False,
        verbose=True,
    )
    answers = [response.content for response in responses]
    output_json["shuffle option contents"]["results"][model_name]["answers"] = answers
    
    accuracy = calculate_accuracy(questions, answers, label_removed=False)
    output_json["shuffle option contents"]["results"][model_name]["accuracy"] = accuracy
    data["accuracy"].append(accuracy)
    
    answer_recalls = calculate_answer_recall(
        questions, answers, label_removed=False
    )
    output_json["shuffle option contents"]["results"][model_name]["recalls"] = answer_recalls
    recalls.extend([
        (model_name, label, recall)
        for label, recall in answer_recalls.items()
    ])
    recall_std = np.array(list(answer_recalls.values())).std()
    output_json["shuffle option contents"]["results"][model_name]["recall_std"] = recall_std
    data["recall_std"].append(recall_std)

df = pd.DataFrame(data).sort_values(by="model")
display(df)

recall_df = pd.DataFrame(recalls, columns=["model", "label", "recall"]).sort_values(by=["model", "label"])
display(recall_df)
    

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:11<00:00,  1.10s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:13<00:00,  1.39s/it]


Unnamed: 0,model,accuracy,recall_std
1,gpt-3.5-turbo,0.82,0.047416
0,gpt-4o-mini,0.91,0.050111


Unnamed: 0,model,label,recall
4,gpt-3.5-turbo,A,0.828571
6,gpt-3.5-turbo,B,0.736842
7,gpt-3.5-turbo,C,0.833333
5,gpt-3.5-turbo,D,0.863636
0,gpt-4o-mini,A,0.971429
2,gpt-4o-mini,B,0.842105
1,gpt-4o-mini,C,0.916667
3,gpt-4o-mini,D,0.863636


Both `gpt-4o-mini` and `gpt-3.5-turbo` have higher accuracy and lower recall_std than on the original dataset. If this is repeatable on a larger dataset, it will mean that shuffling option contents could help mitigate the order dependency problem and hence improve answer accuracy.
One thing to notice is that "Option D" now has the highest recall for `gpt-3.5-turbo`.

# Shuffle option IDs
Now let's experiment with shuffling the option IDs while keeping the option contents in their original order.


In [17]:
data = {
    "model": [],
    "accuracy": [],
    "recall_std": [],
}

recalls = []

questions = arc_dataset.generate_samples(
    shuffle_contents=False,
    shuffle_labels=True,
    seed=1000,
)

output_json["shuffle option ids"] = {
    "questions": [question.dict() for question in questions],
    "results": {}
}

for model_name in MODEL_NAMES:
    data["model"].append(model_name)
    output_json["shuffle option ids"]["results"][model_name] = {}
    responses = await answer_multiple_questions(
        questions=questions,
        model_name=model_name,
        batch_size=10,
        label_removed=False,
        verbose=True,
    )
    answers = [response.content for response in responses]
    output_json["shuffle option ids"]["results"][model_name]["answers"] = answers
    
    accuracy = calculate_accuracy(questions, answers, label_removed=False)
    output_json["shuffle option ids"]["results"][model_name]["accuracy"] = accuracy
    data["accuracy"].append(accuracy)
    
    answer_recalls = calculate_answer_recall(
        questions, answers, label_removed=False
    )
    output_json["shuffle option ids"]["results"][model_name]["recalls"] = answer_recalls
    recalls.extend([
        (model_name, label, recall)
        for label, recall in answer_recalls.items()
    ])
    recall_std = np.array(list(answer_recalls.values())).std()
    output_json["shuffle option ids"]["results"][model_name]["recall_std"] = recall_std
    data["recall_std"].append(recall_std)

df = pd.DataFrame(data).sort_values(by="model")
display(df)

recall_df = pd.DataFrame(recalls, columns=["model", "label", "recall"]).sort_values(by=["model", "label"])
display(recall_df)
    

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:14<00:00,  1.44s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:08<00:00,  1.12it/s]


Unnamed: 0,model,accuracy,recall_std
1,gpt-3.5-turbo,0.77,0.047958
0,gpt-4o-mini,0.87,0.042005


Unnamed: 0,model,label,recall
4,gpt-3.5-turbo,A,0.804878
5,gpt-3.5-turbo,B,0.791667
6,gpt-3.5-turbo,C,0.681818
7,gpt-3.5-turbo,D,0.769231
0,gpt-4o-mini,A,0.926829
1,gpt-4o-mini,B,0.833333
2,gpt-4o-mini,C,0.818182
3,gpt-4o-mini,D,0.846154


* As expected, both models have lower accuracy than on the original dataset when option IDs are shuffled (unnatural option ID order).
* Both models have lower recall_std than on the original dataset. If this is repeated on a larger dataset, it may indicate that position bias is also important.

# Remove option IDs
Now let's experiment with removing the option IDs while keeping the options in their original order.

In [25]:
data = {
    "model": [],
    "accuracy": [],
    "recall_std": [],
}

recalls = []

questions = copy.deepcopy(arc_dataset.questions)
for question in questions:
    for choice in question.choices:
        choice.label = None

output_json["remove option ids"] = {
    "questions": [question.dict() for question in questions],
    "results": {}
}

for model_name in MODEL_NAMES:
    data["model"].append(model_name)
    output_json["remove option ids"]["results"][model_name] = {}
    responses = await answer_multiple_questions(
        questions=questions,
        model_name=model_name,
        batch_size=10,
        label_removed=True,
        verbose=True,
    )
    answers = [response.content for response in responses]
    output_json["remove option ids"]["results"][model_name]["answers"] = answers
    
    accuracy = calculate_accuracy(questions, answers, label_removed=True)
    output_json["remove option ids"]["results"][model_name]["accuracy"] = accuracy
    data["accuracy"].append(accuracy)
    
    answer_recalls = calculate_answer_recall(
        questions, answers, label_removed=True
    )
    output_json["remove option ids"]["results"][model_name]["recalls"] = answer_recalls
    recalls.extend([
        (model_name, label, recall)
        for label, recall in answer_recalls.items()
    ])
    recall_std = np.array(list(answer_recalls.values())).std()
    output_json["remove option ids"]["results"][model_name]["recall_std"] = recall_std
    data["recall_std"].append(recall_std)

df = pd.DataFrame(data).sort_values(by="model")
display(df)

recall_df = pd.DataFrame(recalls, columns=["model", "position_idx", "recall"]).sort_values(by=["model", "position_idx"])
display(recall_df)

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:14<00:00,  1.45s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:11<00:00,  1.12s/it]


Unnamed: 0,model,accuracy,recall_std
1,gpt-3.5-turbo,0.83,0.049251
0,gpt-4o-mini,0.91,0.050518


Unnamed: 0,model,position_idx,recall
7,gpt-3.5-turbo,0,0.894737
4,gpt-3.5-turbo,1,0.866667
6,gpt-3.5-turbo,2,0.791667
5,gpt-3.5-turbo,3,0.777778
3,gpt-4o-mini,0,1.0
0,gpt-4o-mini,1,0.866667
2,gpt-4o-mini,2,0.916667
1,gpt-4o-mini,3,0.888889


* Unlike in the shuffled-ID experiment, both models have higher accuracy than on the original dataset when option IDs are removed (0.83 vs. 0.80 for `gpt-3.5-turbo`, 0.91 vs. 0.88 for `gpt-4o-mini`).
* Both models have lower recall_std than on the original dataset, but the drop is not as large as the one reported in the paper. If this is repeated on a larger dataset, it may indicate that token bias plays a weaker role in ODP.

# Export result

In [24]:
# Serialize before opening the file: opening with "w" truncates report.json
# immediately, so a serialization error raised by json.dump() midway would
# leave a partial report behind. Building the string first means the file is
# only touched once the whole report is known to be serializable.
report_text = json.dumps(output_json, indent=2)
with open("report.json", "w", encoding="utf-8") as f:
    f.write(report_text)