In [1]:
import sys

In [2]:
import wandb
import json
import pandas as pd
import glob

sys.path.append("../.")
import direct.openai_ranking

In [3]:
# Experiment selection. Exactly one configuration should be active; the
# commented-out block below is the alternative IMDB configuration.
# dataset = "imdb"
# project = "wm-debug-imdb"
# tag = "xmas-lora"
# m_s = [256, 512, 768]
# n = 512
# #n = 64

# `dataset` is forwarded to the oracle call and embedded in the result CSV name.
dataset = "tldr"
# `project` / `tag` select the W&B runs whose evaluation files are loaded;
# `tag` is also embedded in the result CSV name.
project = "wm-debug-tldr"
tag = "xmas-sweep2"
# Values of m to evaluate; each one selects the run file
# "evaluation_m{m}_post_training_T0.25.json".
#m_s = [512]
m_s = [128,256, 384, 512]
# m_s = [768]
# Number of rows sampled per (acquire_pairs_function, seed) group before the
# sample is sent to the oracle.
n = 1024
#n = 16

In [4]:
def get_eval_data(project, tag, filename, root="/tmp"):
    """Load one evaluation JSON file from every W&B run carrying ``tag``.

    For each run of ``project`` whose tags include ``tag``, every attached file
    named exactly ``filename`` is downloaded into ``root``, parsed as JSON and
    converted to a DataFrame. Three columns are then added from the run itself:
    ``acquire_pairs_function`` (read from the ``exp5`` config entry), ``seed``
    (config value cast to ``int``) and ``run_name``.

    Args:
        project: W&B project passed to ``wandb.Api().runs``.
        tag: Run tag used to select runs.
        filename: Exact name of the run file to load.
        root: Directory the file is downloaded into. Files with the same name
            are overwritten on every download.

    Returns:
        The per-run DataFrames concatenated with ``pd.concat``. Row indices of
        the individual frames are kept, so index values repeat across runs.

    Raises:
        ValueError: If no selected run has a file called ``filename``.
    """
    frames = []
    for run in wandb.Api().runs(project, filters=dict(tags=tag)):
        # Re-check the tag locally in addition to the filter sent to the API.
        if tag not in run.tags:
            continue

        cfg = json.loads(run.json_config)
        for f in run.files():
            if f.name != filename:
                continue

            print("loading", f.name, "from", run.name)
            downloaded = f.download(root, replace=True)
            # wandb documents File.download as returning an open file object;
            # close it so one handle per run is not left dangling.
            if hasattr(downloaded, "close"):
                downloaded.close()

            # Context manager guarantees the file is closed after parsing.
            with open(f"{root}/{f.name}") as fh:
                eval_data = json.load(fh)

            df = pd.DataFrame(eval_data)
            df["acquire_pairs_function"] = cfg["exp5"]["value"]["acquire_pairs_function"]
            #df["og"] = int(cfg["exp5"]["value"]["over_generate_factor"])
            df["seed"] = int(cfg["seed"]["value"])
            df["run_name"] = run.name
            frames.append(df)

    if not frames:
        # pd.concat([]) would raise the same exception type with an opaque message.
        raise ValueError(
            f"No file named {filename!r} found in runs tagged {tag!r} of project {project!r}"
        )

    return pd.concat(frames)

In [5]:
def submit_to_oracle(
    m,
    acquire_pairs_function="HIGH_ENTROPY_AND_CERTAINTY",
    exclude_seeds=(12029, 22151, 22371, 23767, 29799),
):
    """Sample evaluation rows for one value of ``m`` and have the oracle judge them.

    Loads ``evaluation_m{m}_post_training_T0.25.json`` from the runs selected by
    the notebook-level ``project`` / ``tag``, keeps one acquisition function,
    drops the excluded seeds, samples the notebook-level ``n`` rows per
    (acquire_pairs_function, seed) group and sends them to
    ``direct.openai_ranking.get_preference_batch``.

    Args:
        m: Value substituted into the evaluation file name.
        acquire_pairs_function: Acquisition function whose rows are kept.
            Ignored when ``m == 0``, where only ``"RANDOM"`` rows are kept.
        exclude_seeds: Seeds whose rows are dropped before sampling.

    Returns:
        The sampled rows with an added boolean ``win`` column, true where the
        oracle response has ``preferred == 0`` (the ``completions`` column is
        submitted as ``completion_a``).

    Raises:
        ValueError: If no rows remain after filtering, or (raised by pandas)
            if a group holds fewer than ``n`` rows.
    """
    df = get_eval_data(project, tag, f"evaluation_m{m}_post_training_T0.25.json")

    if m == 0:
        # No point paying to eval other variants since these are sampled from the ref model!
        # Decided before filtering: filtering for RANDOM after already keeping
        # only another acquisition function would always leave an empty frame.
        acquire_pairs_function = "RANDOM"

    df = df[df.acquire_pairs_function == acquire_pairs_function]
    df = df[~df.seed.isin(list(exclude_seeds))]

    print(set(df.seed))
    print(set(df.acquire_pairs_function))

    if df.empty:
        # Fail before any paid oracle request is issued.
        raise ValueError(
            f"No evaluation rows left for m={m} with "
            f"acquire_pairs_function={acquire_pairs_function!r}"
        )

    # GroupBy.sample draws n rows per group and keeps every column, so the
    # result does not depend on how groupby.apply treats the grouping columns.
    sample_df = df.groupby(["acquire_pairs_function", "seed"]).sample(n=n)
    batch = (
        sample_df[["prompts", "completions", "vs_completions"]]
        .rename(columns={"completions": "completion_a", "vs_completions": "completion_b", "prompts": "prompt"})
        .to_dict("records")
    )

    oracle_response = direct.openai_ranking.get_preference_batch(batch, "gpt-4-1106-preview", None, 10, dataset, provider="openai")
    cost = sum(r['cost'] for r in oracle_response)
    print(f"That cost ~ {cost} USD")
    # Responses are matched to sampled rows by position.
    sample_df["win"] = [r['preferred'] == 0 for r in oracle_response]
    return sample_df


In [6]:
import time, os

def to_csv(df, filename):
    """Write ``df`` to ``filename`` as CSV without overwriting an existing file.

    If ``filename`` already exists, a warning is printed and the data is
    written to ``filename`` suffixed with ``"."`` and the current
    ``time.time()`` value instead.

    Args:
        df: DataFrame to write (written without its index).
        filename: Desired output path.

    Returns:
        The path that was actually written.
    """
    if os.path.exists(filename):
        print(f"Warning - {filename} exists")
        filename = filename + "." + str(time.time())
        print(f"Writing to {filename}")

    # Write the frame that was passed in, not whatever global happens to exist.
    df.to_csv(filename, index=False)
    return filename

In [11]:
# For every m in m_s: sample evaluation rows, have the oracle judge them and
# save the judged sample under ../results/.
# Each submit_to_oracle call issues oracle requests and prints their cost in
# USD, so re-running this cell incurs that cost again.
for m in m_s:
    s = submit_to_oracle(m)
    to_csv(s, f"../results/post-eval-winrate-{dataset}-m{m}-{tag}.csv")

loading evaluation_m128_post_training_T0.25.json from devout-galaxy-102
loading evaluation_m128_post_training_T0.25.json from logical-glitter-101
loading evaluation_m128_post_training_T0.25.json from vocal-hill-99
loading evaluation_m128_post_training_T0.25.json from cosmic-brook-99
loading evaluation_m128_post_training_T0.25.json from vague-darkness-98


KeyboardInterrupt: 

In [8]:
# for m in [128, 256, 512, 768]:
#     s = submit_to_oracle(m)
#     s.to_csv(f"../results/post-eval-winrate-{dataset}-m{m}-{tag}.csv", index=False)

In [9]:
# TODO: how can I evaluate only the seeds that have not been evaluated yet (e.g. for the high-entropy TL;DR runs)?

# for m in m_s:
#     s = submit_to_oracle(m)
#     s.to_csv(f"../results/post-eval-winrate-{dataset}-m{m}-{tag}-ENTROPY.csv", index=False)

In [10]:
!ls ../results/*lora*


../results/post-eval-winrate-imdb-m256-xmas-lora.csv
../results/post-eval-winrate-imdb-m512-xmas-lora.csv
../results/post-eval-winrate-imdb-m768-xmas-lora.csv
