In [4]:
from utils_eval import compute_pairwise_metrics, extract_score
import json, os, numpy as np, pandas as pd
from IPython.display import display
import matplotlib.pyplot as plt

def compute_results(eval_fn, preds_dir="data/preds"):
    """Score every model's predictions against the samples stored in ``eval_fn``.

    The evaluation samples are loaded from ``eval_fn``; each ``.jsonl`` file in
    ``preds_dir`` contributes one model's predictions (only lines whose
    ``input_fn`` equals ``eval_fn`` are used). Predictions are attached to the
    loaded sample dicts under the key ``"pred_<model>"`` and metrics are then
    computed per model and per sample type.

    Args:
        eval_fn: Path to a JSON file containing a list of sample dicts. Every
            sample needs ``id`` and ``sample_type``; samples whose type does
            not start with ``"pairwise"`` also need ``zscore``.
        preds_dir: Directory holding the prediction files. The model name is
            the file name with ``".jsonl"`` and ``"preds_"`` removed.

    Returns:
        A 4-tuple ``(results, gold_mappings, silver_mappings, N_samples)``:

        * ``results``: one dict per model with ``"model"`` plus, for each
          pairwise sample type, the second value returned by
          ``compute_pairwise_metrics`` (called ``acc`` here), and for every
          other type the keys ``<type>_MAE_R``, ``<type>_Corr_R`` and
          ``<type>_Avg_R``.
        * ``gold_mappings`` / ``silver_mappings``: ``model -> score`` for the
          ``"pairwise-gold"`` / ``"pairwise-silver"`` sample types. A model
          without predictions for that type is simply absent.
        * ``N_samples``: a ``"Total"`` row followed by one row per model with
          the number of scored samples per sample type.

    Raises:
        KeyError: if a prediction line for ``eval_fn`` references a sample id
            that is not present in ``eval_fn``.
    """
    with open(eval_fn) as f:
        data = json.load(f)

    id2data = {d["id"]: d for d in data}

    models = set()
    for fn in sorted(os.listdir(preds_dir)):
        # Ignore anything that is not a prediction file (checkpoint folders,
        # OS metadata files, ...); opening those would crash the whole load.
        if not fn.endswith(".jsonl"):
            continue
        model_name = fn.replace(".jsonl", "").replace("preds_", "")
        models.add(model_name)
        with open(os.path.join(preds_dir, fn)) as f:
            for line in f:
                if not line.strip():
                    continue  # tolerate blank / trailing lines in the JSONL file
                d = json.loads(line)
                if d["input_fn"] != eval_fn:
                    continue
                # The sample dicts in `data` are mutated in place here.
                id2data[d["id"]]["pred_" + model_name] = d["output"]

    sample_types = {}
    for d in data:
        sample_types.setdefault(d["sample_type"], []).append(d)

    total_N_samples = {"model": "Total"}
    for sample_type in sorted(sample_types):
        total_N_samples[sample_type] = len(sample_types[sample_type])
    results, N_samples = [], [total_N_samples]

    # Keep only models with at least one prediction for this eval file; sort so
    # that the printed list and the row order are deterministic.
    models = sorted(model for model in models if any("pred_" + model in d for d in data))

    print(models)

    gold_mappings, silver_mappings = {}, {}
    for model in models:
        pred_key = "pred_" + model
        N_samples_row = {"model": model}
        result_row = {"model": model}
        for sample_type in sorted(sample_types):
            model_samples = [d for d in sample_types[sample_type] if pred_key in d]
            if len(model_samples) == 0:
                continue
            N_samples_row[sample_type] = len(model_samples)
            if sample_type.startswith("pairwise"):
                _, acc, _ = compute_pairwise_metrics(model_samples, model)
                result_row[sample_type] = acc
            else:
                y_true = [d["zscore"] for d in model_samples]
                # extract_score returns (prediction, error); only the prediction
                # is used. NOTE(review): assumes the prediction is always
                # numeric, even when an error is reported -- confirm in utils_eval.
                y_pred = [extract_score(d, pred_key)[0] for d in model_samples]
                abs_err = np.abs(np.array(y_true) - np.array(y_pred))
                corr = np.corrcoef(y_true, y_pred)[0, 1]
                avg_R = np.mean(y_pred)
                result_row[sample_type+"_MAE_R"] = abs_err.mean()
                result_row[sample_type+"_Corr_R"] = corr
                result_row[sample_type+"_Avg_R"] = avg_R
        N_samples.append(N_samples_row)
        # Not every eval file / model has gold or silver pairs; indexing
        # unconditionally used to raise KeyError in that case.
        if "pairwise-gold" in result_row:
            gold_mappings[model] = result_row["pairwise-gold"]
        if "pairwise-silver" in result_row:
            silver_mappings[model] = result_row["pairwise-silver"]

        results.append(result_row)
    return results, gold_mappings, silver_mappings, N_samples

def visualize_results(eval_fn, sort_by="pairwise"):
    """Display the metric table and the sample-count table for ``eval_fn``.

    Args:
        eval_fn: Path of the evaluation JSON file, passed to ``compute_results``.
        sort_by: Metric column used to order the models (descending). When the
            column does not exist for this eval file the rows are shown in the
            order returned by ``compute_results`` instead of raising KeyError.

    Returns:
        None. A header line is printed and the tables are rendered with
        ``display``.
    """
    results, _gold_mappings, _silver_mappings, N_samples = compute_results(eval_fn)

    # Header line with the eval file name, centred and padded with dashes.
    print(eval_fn.center(80, "-"))

    results_df = pd.DataFrame(results)
    if results_df.empty:
        # No model has predictions: there is no "model" column to index by.
        print("No models with predictions for this eval file.")
    else:
        if sort_by in results_df.columns:
            results_df = results_df.sort_values(by=sort_by, ascending=False)
        display(results_df.set_index("model").round(2))
    display(pd.DataFrame(N_samples).set_index("model").round(2))

# Render the metric table and the sample-count table for data/lamp_PRGSH_test.json.
visualize_results(eval_fn="data/lamp_PRGSH_test.json")
# visualize_results(eval_fn="data/lamp_PR_editor_test.json")

['lamp-4o-pr-eval', 'lamp-4o-r-eval', 'lamp-4o-p', 'lamp-4o-r-eval-rmode']
---------------------------data/lamp_PRGSH_test.json----------------------------


Unnamed: 0_level_0,pairwise,pairwise-P1,pairwise-P2,pairwise-P3,pairwise-P4,pairwise-P5,pairwise-P6,pairwise-P7,pairwise-gold,pairwise-h,pairwise-silver,reward_MAE_R,reward_Corr_R,reward_Avg_R
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
lamp-4o-pr-eval,100.0,87.91,98.14,99.04,99.5,100.0,100.0,100.0,74.54,65.79,100.0,1.23,0.52,4.96
lamp-4o-p,100.0,88.84,98.6,99.04,99.5,100.0,100.0,100.0,73.96,14.66,99.73,1.45,0.42,5.78
lamp-4o-r-eval,92.82,58.14,60.93,62.68,67.84,75.41,76.73,76.09,69.65,75.19,97.14,1.33,0.48,5.06
lamp-4o-r-eval-rmode,86.88,47.91,66.05,75.6,72.36,81.97,77.36,77.54,60.2,30.08,83.39,1.29,0.5,5.14


Unnamed: 0_level_0,pairwise,pairwise-P1,pairwise-P2,pairwise-P3,pairwise-P4,pairwise-P5,pairwise-P6,pairwise-P7,pairwise-gold,pairwise-h,pairwise-silver,reward
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Total,404,215,215,209,199,183,159,138,1206,266,1120,430
lamp-4o-pr-eval,404,215,215,209,199,183,159,138,1206,266,1120,430
lamp-4o-r-eval,404,215,215,209,199,183,159,138,1206,266,1120,430
lamp-4o-p,404,215,215,209,199,183,159,138,1206,266,1120,430
lamp-4o-r-eval-rmode,404,215,215,209,199,183,159,138,1206,266,1120,430


In [13]:
# For the r-mode prediction files, see how often each preference label
# (para1 / para2 / tie) is predicted on the chosen eval file.
from collections import Counter

eval_fn = "data/lamp_PRGSH_test.json"

# Maps the numeric "preference" field of a prediction to a readable label.
prefs2label = {1: "para1", 2: "para2", 0: "tie"}

label_counts = Counter()
for fn in os.listdir("data/preds"):
    if "rmode" not in fn:
        continue
    with open(f"data/preds/{fn}", "r") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank / trailing lines in the JSONL file
            d = json.loads(line)
            output = d["output"]
            # Only dict outputs can carry a "preference" field; without the
            # isinstance guard a string output would be substring-matched and a
            # numeric output would raise TypeError.
            if (
                d["input_fn"] != eval_fn
                or not isinstance(output, dict)
                or "preference" not in output
            ):
                continue
            label_counts[prefs2label[output["preference"]]] += 1

# Compute the denominator once instead of once per label.
total_labels = sum(label_counts.values())
lab_percs = {k: 100.0 * v / total_labels for k, v in label_counts.items()}
for k, v in sorted(lab_percs.items(), key=lambda x: x[1], reverse=True):
    print(f"{k}: {v:.2f}%")


para1: 42.00%
para2: 41.19%
tie: 16.81%


In [None]:
def plot_scaling(eval_fn, model_prefixes, ylim=(50, 80)):
    """Draw one bar chart of the ``pairwise-gold`` score per model family.

    A family is every model from ``compute_results(eval_fn)`` whose name
    contains the given prefix string; its models are plotted in sorted order.

    Args:
        eval_fn: Path of the evaluation JSON file, passed to ``compute_results``.
        model_prefixes: Iterable of substrings, one figure per entry.
        ylim: ``(bottom, top)`` limits of the y-axis, or ``None`` to let
            matplotlib choose. Bars outside the range are cut off, so widen it
            when scores fall outside the default window.

    Returns:
        None. Figures are shown with ``plt.show()`` and then closed.
    """
    # Only the gold mapping is plotted; the other return values are not needed.
    _, gold_mappings, _, _ = compute_results(eval_fn)
    for model_prefix in model_prefixes:
        model_family = sorted(model for model in gold_mappings if model_prefix in model)
        if not model_family:
            # Nothing to draw; avoid emitting an empty figure.
            print(f"No models matching {model_prefix!r}; skipping plot.")
            continue
        gold_scores = [gold_mappings[model] for model in model_family]

        fig, ax = plt.subplots()
        # Gold series as yellow bars (a bar chart, not a line).
        ax.bar(model_family, gold_scores, label=model_prefix, color='gold')
        # Value label in black just above each bar; with string categories the
        # bars sit at x = 0, 1, 2, ...
        for i, score in enumerate(gold_scores):
            ax.text(i, score, f"{score:.2f}", ha='center', va='bottom', color='black')
        ax.tick_params(axis='x', labelrotation=90)
        if ylim is not None:
            ax.set_ylim(*ylim)
        ax.set_xlabel("Model")
        ax.set_ylabel("Gold Mapping")
        ax.set_title(f"{model_prefix} Scaling")
        plt.show()
        # Release the figure so plotting many families does not accumulate memory.
        plt.close(fig)

# plot_scaling(eval_fn="data/lamp_PRGS_test.json", model_prefixes=["lamp-gem-1p5-flash-s", "lamp-4o-s"])
