# Investigate GPT-4 Annotations

In [None]:
import pandas as pd
import numpy as np
from create_reward_dataset import LABELS

In [3]:
# Label columns compared between the two annotation sources: the first entry of
# LABELS is replaced by the derived "correct" column.
labels = ["correct"] + LABELS[1:]

gpt4_evals = pd.read_csv("data/annotated/feedback_test_single_subset_annotated_bin_gpt-4.csv")
manual_evals = pd.read_csv("data/annotated/feedback_test_single_subset_annotated_bin_manual.csv")
# Keep only as many GPT-4 rows as there are manual rows. The explicit copy makes
# the column assignments below write to an independent frame instead of to a
# slice of the frame returned by read_csv (which triggers SettingWithCopyWarning).
gpt4_evals = gpt4_evals.iloc[:len(manual_evals)].copy()

for df in [gpt4_evals, manual_evals]:
    # "correct" is the complement of the stored "incorrect" flag.
    df["correct"] = 1 - df["incorrect"]
    # The stored "reveal" flag is flipped in place (1 becomes 0 and vice versa).
    df["reveal"] = 1 - df["reveal"]
    # Score is 0 whenever "correct" is 0; otherwise it is the sum of the LABELS columns.
    # NOTE(review): the sum runs over LABELS (including its first entry), not over
    # `labels` -- confirm that this is intended.
    df["score"] = df["correct"] * sum(df[label] for label in LABELS)

In [4]:
# Keep only the rows whose manual "incorrect" value is present, applying the
# same boolean mask to both frames so they stay row-aligned.
has_labels = manual_evals["incorrect"].notna()
gpt4_evals, manual_evals = [
    frame.loc[has_labels] for frame in (gpt4_evals, manual_evals)
]

In [5]:
def _print_counts(label_map):
    """Print one "<label> <number of True rows>" line per entry of `label_map`."""
    for name, flags in label_map.items():
        print(name, flags.sum())


def _differs_only_on(target):
    """Mask of rows where the two frames differ on `target` and match on every other label."""
    mask = gpt4_evals[target] != manual_evals[target]
    for other in labels:
        if other != target:
            mask = mask & (gpt4_evals[other] == manual_evals[other])
    return mask


# Rows where GPT-4 and the manual annotation disagree on a label.
print("Diff")
label_to_diff = {}
for label in labels:
    label_to_diff[label] = gpt4_evals[label] != manual_evals[label]
_print_counts(label_to_diff)

# Rows where the disagreement is confined to exactly that one label.
print("\nOnly Diff")
label_to_only_diff = {label: _differs_only_on(label) for label in labels}
_print_counts(label_to_only_diff)

# Boolean views of every label column, shared by the two agreement tables below.
gpt4_flags = {label: gpt4_evals[label].astype(bool) for label in labels}
manual_flags = {label: manual_evals[label].astype(bool) for label in labels}

# Rows where both sources set the label.
print("\nBoth True")
label_to_both_true = {label: gpt4_flags[label] & manual_flags[label] for label in labels}
_print_counts(label_to_both_true)

# Rows where neither source sets the label.
print("\nBoth False")
label_to_both_false = {label: ~(gpt4_flags[label] | manual_flags[label]) for label in labels}
_print_counts(label_to_both_false)

Diff
correct 18
reveal 3
suggestions 20
misconceptions 25
positive 14

Only Diff
correct 3
reveal 1
suggestions 7
misconceptions 5
positive 7

Both True
correct 39
reveal 69
suggestions 26
misconceptions 28
positive 7

Both False
correct 23
reveal 8
suggestions 34
misconceptions 27
positive 59


In [6]:
def show_diffs_for_label(label, label_map, gpt4_df=None, manual_df=None):
    """Print every sample selected by ``label_map[label]`` with both annotations.

    Args:
        label: Name of the label column to report on.
        label_map: Mapping from label name to a boolean row mask that can index
            both frames.
        gpt4_df: Frame holding the GPT-4 annotations. Defaults to the notebook's
            ``gpt4_evals``.
        manual_df: Frame holding the manual annotations. Defaults to the
            notebook's ``manual_evals``.

    The output starts with the label and the number of selected rows. Each
    selected sample then prints its question, correct answer, distractor,
    feedback and method, the two values of ``label``, and the ``response``
    field of the GPT-4 row.
    """
    # Fall back to the notebook-level frames so existing two-argument calls keep working.
    if gpt4_df is None:
        gpt4_df = gpt4_evals
    if manual_df is None:
        manual_df = manual_evals

    selected = label_map[label]
    print(label, selected.sum())
    print("")
    # Both frames are filtered with the same mask, so the rows are paired positionally.
    for (_, gpt4_sample), (_, manual_sample) in zip(
        gpt4_df[selected].iterrows(),
        manual_df[selected].iterrows()
    ):
        print(gpt4_sample["question"])
        print(gpt4_sample["correct_answer"])
        print(gpt4_sample["distractor"])
        print(gpt4_sample["feedback"])
        print(gpt4_sample["method"])
        print("GPT-4:", gpt4_sample[label], "Manual:", manual_sample[label])
        print(gpt4_sample["response"])
        print("")

In [None]:
# List the samples that both GPT-4 and the manual annotation mark as "correct".
show_diffs_for_label("correct", label_to_both_true)

# Investigate Generations

In [96]:
# One evaluated-results CSV per feedback source. The paths are listed in the
# same order as the names on the left-hand side of the assignment.
zs, sft, dpo_abl, dpo, human, gpt4 = [
    pd.read_csv(results_path)
    for results_path in (
        "results/feedback_gen_results_meta-llama-Llama-2-7b-chat-hf_rubric_greedy_eval_llm.csv",  # zs
        "results/feedback_gen_results_feedback-gen-sft-llama-chat-rubric_greedy_eval_llm.csv",  # sft
        "results/feedback_gen_results_feedback-gen-dpo-llama-chat-rubric-nomm_greedy_eval_llm.csv",  # dpo_abl
        "results/feedback_gen_results_feedback-gen-dpo-llama-chat-rubric_greedy_eval_llm.csv",  # dpo
        "results/eedi_expanded_test_eval_llm.csv",  # human
        "results/feedback_test_zs_gpt-4_sol_rubric_eval_llm.csv",  # gpt4
    )
]

In [None]:
# Boolean mask of rows where DPO reaches a score of 1, strictly beats both the
# zero-shot and SFT scores, and is at least as good as the DPO ablation.
desirable_idxs = (
    (zs["score"] < dpo["score"])
    & (sft["score"] < dpo["score"])
    & (dpo_abl["score"] <= dpo["score"])
    & (dpo["score"] == 1)
)
# Alternative selection criteria, kept for reference:
# desirable_idxs = (zs["score"] < dpo_abl["score"]) & (sft["score"] < dpo_abl["score"]) & (dpo_abl["score"] == 1) & (human["score"] == 1)
# desirable_idxs = (zs["score"] < dpo["score"]) & (sft["score"] < dpo["score"]) & (dpo["score"] == 1) & (human["score"] == 1)

# Positions of the selected rows; nonzero() returns a one-element tuple of arrays.
selected_positions = desirable_idxs.to_numpy().nonzero()
print(desirable_idxs.sum())
print(selected_positions)

for i in selected_positions[0]:
    print(i)
    # The question context is read from the human-feedback frame.
    human_row = human.iloc[i]
    for field in ("question", "correct_answer", "distractor"):
        print(human_row[field])
    # One "<source> (<score>): <feedback>" line per feedback source.
    for tag, frame in (
        ("Human", human),
        ("DPO", dpo),
        ("DPO abl", dpo_abl),
        ("SFT", sft),
        ("ZS", zs),
        ("GPT-4", gpt4),
    ):
        source_row = frame.iloc[i]
        print(f"{tag} ({source_row['score']}):", source_row["feedback"])
    print("")