In [209]:
%matplotlib widget

import json
import re

import pandas as pd
import wandb
import datasets

import direct.openai_ranking


In [None]:
# This notebook loads the saved evaluation data from a set of W&B experiment runs and re-runs the
# evaluation, this time comparing each model completion against the human-written summary from the
# original dataset.

In [212]:
# Test split of the TL;DR summarisation dataset; "label" holds the human-written summary for each "prompt".
data = datasets.load_dataset("CarperAI/openai_summarize_tldr", split="test")

# Map each prompt (with every space character removed, so that lookups are insensitive to
# spacing differences) to its human-written summary.
human_response_lookup = dict(
    zip(
        (prompt_text.replace(" ", "") for prompt_text in data['prompt']),
        data['label'],
    )
)

def get_eval_data(project="bbnn/wm-apl-tldr", tag="tldr-exp5-final1", root="/tmp"):
    """Load the post-training evaluation files of all tagged W&B runs into one DataFrame.

    Every run in ``project`` that carries ``tag`` is scanned for files whose name starts with
    ``evaluation_m<digits>_post_training_T0.25.json``. Each such file is downloaded into ``root``,
    parsed as JSON and turned into a DataFrame, to which these columns are added:

    * ``human_response`` -- the entry of the module-level ``human_response_lookup`` for the
      prompt with all spaces removed (``None`` when the prompt is not in the lookup),
    * ``acquire_pairs_function`` and ``seed`` -- taken from the run config,
    * ``m`` -- the integer that follows ``evaluation_m`` in the file name,
    * ``run_name`` -- the W&B run name.

    Leading whitespace is stripped from ``completions`` and ``vs_completions``.

    Args:
        project: W&B project path passed to ``wandb.Api().runs``.
        tag: only runs whose tags contain this value are read.
        root: directory the evaluation files are downloaded to (existing files are replaced).

    Returns:
        The concatenation of all per-file DataFrames. The original per-file row index is kept,
        so index values repeat across files.

    Raises:
        ValueError: if no evaluation file was found for any tagged run.
    """
    # The capture group yields m directly from the file name.
    eval_file_re = re.compile(r"evaluation_m([0-9]+)_post_training_T0\.25\.json")

    dfs = []
    for r in wandb.Api().runs(project):
        if tag not in r.tags:
            continue
        cfg = json.loads(r.json_config)
        for f in r.files():
            name_match = eval_file_re.match(f.name)
            if name_match is None:
                continue
            print("loading", f.name, " from ", r.name)
            # Files of different runs share the same name, so each one is read right after
            # its download, before the next run overwrites it.
            f.download(root, replace=True)
            path = f"{root}/{f.name}"
            with open(path) as fh:
                d = json.load(fh)
            eval_df = pd.DataFrame(d)
            eval_df["human_response"] = eval_df["prompts"].apply(
                lambda p: human_response_lookup.get(p.replace(" ", ""))
            )
            # A prompt without a reference would later be judged against ``None``; make that visible.
            n_missing = int(eval_df["human_response"].isna().sum())
            if n_missing:
                print(f"warning: {n_missing} prompts in {f.name} ({r.name}) have no human response")
            eval_df["completions"] = eval_df["completions"].str.lstrip()
            eval_df["vs_completions"] = eval_df["vs_completions"].str.lstrip()
            eval_df["acquire_pairs_function"] = cfg["exp5"]["value"]["acquire_pairs_function"]
            eval_df["seed"] = int(cfg["seed"]["value"])
            eval_df["m"] = int(name_match.group(1))
            eval_df["run_name"] = r.name
            dfs.append(eval_df)

    if not dfs:
        # pd.concat([]) would raise an uninformative "No objects to concatenate".
        raise ValueError(f"no evaluation files found for runs tagged {tag!r} in project {project!r}")

    return pd.concat(dfs)

def get_winrate_vs_human(df, model="gpt-4", num_threads=1, provider="openai"):
    """Return the fraction of rows in which the model completion is preferred over the human one.

    For every row a comparison is built with the model output (``completions``) as
    ``completion_a`` and the human reference (``human_response``) as ``completion_b``. All
    comparisons are sent to ``direct.openai_ranking.get_preference_batch`` with
    ``task_name="tldr"``; a response whose ``"preferred"`` entry equals ``0`` is counted as a
    win for the model completion.

    The distinct ``acquire_pairs_function`` and ``m`` values of ``df`` are printed first, as a
    progress indicator.

    Args:
        df: DataFrame with the columns ``prompts``, ``completions``, ``human_response``,
            ``acquire_pairs_function`` and ``m``.
        model: judge model name forwarded to the oracle.
        num_threads: number of threads forwarded to the oracle.
        provider: API provider forwarded to the oracle.

    Returns:
        Number of wins divided by ``len(df)``.

    Raises:
        ValueError: if ``df`` has no rows (checked before any oracle request is made).
    """
    if len(df) == 0:
        raise ValueError("cannot compute a win rate for an empty DataFrame")

    print(set(df["acquire_pairs_function"]), set(df["m"]))

    batch = [
        dict(prompt=p, completion_a=r_a, completion_b=r_b)
        for p, r_a, r_b in zip(df["prompts"], df["completions"], df["human_response"])
    ]
    resps = direct.openai_ranking.get_preference_batch(
        batch,
        model=model,
        request_logger=None,
        num_threads=num_threads,
        task_name="tldr",
        provider=provider,
    )
    # "preferred" == 0 refers to completion_a, i.e. the model completion.
    win_count = sum(1 for resp in resps if resp["preferred"] == 0)
    return win_count / len(df)

Found cached dataset parquet (/home/will/.cache/huggingface/datasets/CarperAI___parquet/CarperAI--openai_summarize_tldr-536d9955f5e6f921/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


In [213]:
# Download and concatenate the evaluation results of every tagged W&B run
# (get_eval_data prints one "loading ..." line per file it reads).
df = get_eval_data()

loading evaluation_m128_post_training_T0.25.json  from  bright-music-233
loading evaluation_m256_post_training_T0.25.json  from  bright-music-233
loading evaluation_m512_post_training_T0.25.json  from  bright-music-233
loading evaluation_m768_post_training_T0.25.json  from  bright-music-233
loading evaluation_m128_post_training_T0.25.json  from  drawn-wave-232
loading evaluation_m256_post_training_T0.25.json  from  drawn-wave-232
loading evaluation_m512_post_training_T0.25.json  from  drawn-wave-232
loading evaluation_m768_post_training_T0.25.json  from  drawn-wave-232
loading evaluation_m128_post_training_T0.25.json  from  iconic-salad-231
loading evaluation_m256_post_training_T0.25.json  from  iconic-salad-231
loading evaluation_m512_post_training_T0.25.json  from  iconic-salad-231
loading evaluation_m768_post_training_T0.25.json  from  iconic-salad-231
loading evaluation_m128_post_training_T0.25.json  from  quiet-dawn-224
loading evaluation_m256_post_training_T0.25.json  from  quiet

In [214]:
# Sanity check: number of prompts loaded for each (acquisition function, m, seed) combination.
df.groupby(["acquire_pairs_function", "m", "seed"])["prompts"].count()

acquire_pairs_function  m    seed 
CERTAINTY               128  42       512
                             29716    512
                             41697    512
                        256  42       512
                             29716    512
                             41697    512
                        512  42       512
                             29716    512
                             41697    512
                        768  42       512
                             29716    512
                             41697    512
ENTROPY                 128  42       512
                             29716    512
                             41697    512
                        256  42       512
                             29716    512
                             41697    512
                        512  42       512
                             29716    512
                             41697    512
                        768  42       512
                             29716    512

In [None]:
# Number of evaluation rows expected for every (acquisition function, m, seed) combination.
N = 512
rows = []

# Iterate in sorted order: the iteration order of a set of strings depends on Python's
# per-process hash randomisation, so plain set(...) would give a different row order in the
# result table after every kernel restart.
for acq in sorted(set(df["acquire_pairs_function"])):
    for m in sorted(set(df["m"])):
        for seed in sorted(set(df["seed"])):
            sub_df = df[(df["acquire_pairs_function"] == acq) & (df["m"] == m) & (df["seed"] == seed)]
            # Every combination must be present and complete before oracle requests are spent on it.
            assert len(sub_df) == N, f"expected {N} rows for {(acq, m, seed)}, found {len(sub_df)}"
            w = get_winrate_vs_human(sub_df)
            rows.append([acq, m, seed, N, w])
            print(rows[-1])

results_df3 = pd.DataFrame(rows, columns=["acq", "m", "seed", "N", "winrate"])

# A very expensive dataframe: :)
# (get_winrate_vs_human submits every row of every subset to the preference oracle.)
results_df3

In [225]:

# Persist the win-rate table. The path is relative to the current working directory and the
# "data/" directory must already exist (to_csv does not create it). The DataFrame index is
# written as the first, unnamed column.
results_df3.to_csv("data/tldr_winrate_vs_human.csv")