In [17]:
from sklearn.metrics import accuracy_score
import json, os, numpy as np, pandas as pd

# Evaluation set: a JSON list of sample dicts; each one is read below through
# its "id" and "sample_type" keys.
eval_fn = "data/finetune_PR_test.json"

with open(eval_fn, encoding="utf-8") as f:
    data = json.load(f)

# Index the samples by id so predictions can be attached in place. The dicts
# are shared with `data`, so every "pred_<model>" key added here is also
# visible through `data`, `data_pairwise` and `data_reward`.
id2data = {d["id"]: d for d in data}

models = set()
for fn in os.listdir("data/preds"):
    path = os.path.join("data/preds", fn)
    # Only regular *.jsonl files are prediction files. Anything else living in
    # the directory (e.g. a checkpoint folder or an OS metadata file) would
    # otherwise crash open() or json.loads().
    if not fn.endswith(".jsonl") or not os.path.isfile(path):
        continue
    # "preds_<model>.jsonl" -> "<model>"
    model_name = fn.replace(".jsonl", "").replace("preds_", "")
    models.add(model_name)
    with open(path, encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            d = json.loads(line)
            # An id that is not in the evaluation set raises KeyError on
            # purpose: it means predictions and eval data are out of sync.
            id2data[d["id"]]["pred_" + model_name] = d["output"]

# Split the samples by task type.
data_pairwise = [d for d in data if d["sample_type"] == "pairwise"]
data_reward = [d for d in data if d["sample_type"] == "reward"]

print(len(data), len(data_pairwise), len(data_reward))

def extract_preference(d, pred_key):
    """Return the predicted preference of sample *d* as an int.

    Reads ``d[pred_key]["preference"]`` and converts it with ``int()``.

    Args:
        d: Sample dict, possibly carrying a prediction under *pred_key*.
        pred_key: Key of the prediction to read (e.g. ``"pred_<model>"``).

    Returns:
        The preference as an int, or 0 when the prediction is missing or
        malformed (no such key, prediction is not a mapping, or the value
        cannot be converted to an int).
    """
    try:
        return int(d[pred_key]["preference"])
    # Only the errors that missing/malformed predictions produce are treated
    # as "no prediction"; anything else (including KeyboardInterrupt) is a
    # real problem and must propagate instead of silently becoming 0.
    except (KeyError, IndexError, TypeError, ValueError, OverflowError):
        return 0

# 
def extract_score(d, pred_key):
    """Return the predicted score of sample *d*.

    Reads ``d[pred_key]["score"]`` and returns it as stored, without any
    type conversion.

    Args:
        d: Sample dict, possibly carrying a prediction under *pred_key*.
        pred_key: Key of the prediction to read (e.g. ``"pred_<model>"``).

    Returns:
        The stored score, or 0 when the prediction is missing or malformed
        (no such key, or the prediction is not a mapping).
    """
    try:
        return d[pred_key]["score"]
    # Only lookup failures caused by missing/malformed predictions fall back
    # to 0; unrelated errors (including KeyboardInterrupt) must propagate.
    except (KeyError, IndexError, TypeError):
        return 0

# Per-model evaluation: preference accuracy on the pairwise samples, absolute
# error and Pearson correlation on the reward samples.
# Fixing the column list lets the table be built and sorted even when no
# prediction files were found (empty `results`).
result_columns = ["model", "N", "Pref_acc", "Pref (Choice = 1)", "Rew_abs_err", "Rew_corr"]

results = []
for model in models:
    pred_key = "pred_" + model
    # Number of evaluation samples that have a prediction from this model.
    N = sum(1 for d in data_pairwise + data_reward if pred_key in d)

    # Preference data: accuracy against the reference preference and the
    # percentage of samples where the model chose option 1. Missing or
    # malformed predictions come back as 0 from extract_preference.
    pref_true = [int(d["reference_preference"]) for d in data_pairwise]
    pref_pred = [extract_preference(d, pred_key) for d in data_pairwise]
    if pref_pred:
        pref1 = 100.0 * sum(1 for p in pref_pred if p == 1) / len(pref_pred)
        acc = 100.0 * accuracy_score(pref_true, pref_pred)
    else:
        # No pairwise samples: report NaN instead of dividing by zero.
        pref1 = acc = np.nan

    # Reward data: mean absolute error and correlation against "zscore".
    y_true = [d["zscore"] for d in data_reward]
    y_pred = [extract_score(d, pred_key) for d in data_reward]

    # Debug aid: print(y_true[:5], y_pred[:5])

    abs_err = np.abs(np.array(y_true) - np.array(y_pred))
    mean_abs_err = abs_err.mean() if abs_err.size else np.nan

    # Pearson correlation is undefined when either side is constant (e.g. a
    # model whose predictions all fell back to 0) or there are fewer than two
    # samples. np.corrcoef would return NaN there too, but only after emitting
    # divide-by-zero RuntimeWarnings, so return NaN explicitly.
    if len(y_true) > 1 and np.std(y_true) > 0 and np.std(y_pred) > 0:
        corr = np.corrcoef(y_true, y_pred)[0, 1]
    else:
        corr = np.nan

    results.append({
        "model": model,
        "N": N,
        "Pref_acc": acc,
        "Pref (Choice = 1)": pref1,
        "Rew_abs_err": mean_abs_err,
        "Rew_corr": corr,
    })

# Last expression of the cell, so the notebook displays the table; best
# preference accuracy first.
pd.DataFrame(results, columns=result_columns).sort_values(by="Pref_acc", ascending=False).set_index("model")

836 404 432
[5, 7, 3, 1, 6] [0, 0, 0, 0, 0]
[5, 7, 3, 1, 6] [9, 8, 8, 9, 8]
[5, 7, 3, 1, 6] [8, 8, 7, 8, 8]
[5, 7, 3, 1, 6] [9, 9, 8, 9, 8]
[5, 7, 3, 1, 6] [9, 8, 8, 9, 9]
[5, 7, 3, 1, 6] [5, 9, 8, 4, 8]


  c /= stddev[:, None]
  c /= stddev[None, :]


Unnamed: 0,model,N,Pref_acc,Rew_abs_err,Rew_corr
5,lamp-gem-1p5-flash-p,836,0.492574,2.680556,-0.063487
2,gemini-1.5-pro,836,0.222772,2.703704,-0.148995
3,gpt-4o,836,0.163366,3.446759,-0.175364
1,gpt-4o-mini,836,0.121287,3.4375,-0.151879
4,gemini-1.5-flash,836,0.113861,3.576389,-0.252054
0,lamp-gem-1p5-flash-pr,836,0.066832,4.979167,
