In [1]:
import sys

In [2]:
import wandb
import json
import pandas as pd
import glob

sys.path.append("../.")
import direct.openai_ranking

In [3]:
# Experiment selection for this notebook. Only one configuration is active at a
# time; the commented-out alternatives are kept for reference.

# --- imdb sweep (inactive) ---
# dataset = "imdb"
# project = "wm-debug-imdb"
# tag = "xmas-sweep2"
# m_s = [128, 256, 512, 768]
# phase_s = [0, 1, 3, 5]

# --- tldr sweep (active) ---
# dataset: forwarded to the oracle call and embedded in the output CSV names.
# project: wandb project whose runs are queried.
# tag:     wandb run tag used to select runs; also embedded in the output CSV names.
# m_s / phase_s: zipped pairwise to build the evaluation_m{m}_phase{p}_... file
#          names, so the two lists must have the same length and ordering.
dataset = "tldr"
project = "wm-debug-tldr"
tag = "xmas-sweep2"
m_s = [128, 256, 384, 512]
phase_s = [1, 2, 3, 4]


# Number of rows sampled from every (acquire_pairs_function, seed) group before
# the batch is sent to the oracle.
n = 1024
# n = 64

# --- tldr lora sweep (inactive) ---
# dataset = "tldr"
# project = "wm-debug-tldr"
# tag = "lora-sweep2"
#m_s = [512]
# m_s = [128,256, 384, 512, 640, 768]
# m_s = [512]
# m_s = [768]
#n = 1024
# n = 512

In [4]:
def get_eval_data(project, tag, filename):
    """Collect one evaluation file from every tagged wandb run into a DataFrame.

    For each run of ``project`` carrying ``tag``, every run file whose name
    equals ``filename`` is downloaded to ``/tmp`` (overwriting any previous
    copy), parsed as JSON and turned into a DataFrame. Three columns are added
    from the run itself:

    * ``acquire_pairs_function`` - ``cfg["exp5"]["value"]["acquire_pairs_function"]``
    * ``seed`` - ``int(cfg["seed"]["value"])``
    * ``run_name`` - the wandb run name

    Args:
        project: wandb project path passed to ``wandb.Api().runs``.
        tag: run tag used both as the server-side filter and as a local check.
        filename: exact name of the run file to load.

    Returns:
        The per-run frames concatenated with ``pd.concat`` (original row
        indices are kept, so index labels repeat across runs).

    Raises:
        ValueError: if no tagged run contains a file called ``filename``.
    """
    root = "/tmp"
    dfs = []
    for r in wandb.Api().runs(project, filters=dict(tags=tag)):
        # Defensive re-check of the tag on top of the server-side filter.
        if tag not in r.tags:
            continue

        cfg = json.loads(r.json_config)
        for f in r.files():
            if f.name != filename:
                continue

            print("loading", f.name, "from", r.name)
            f.download(root, replace=True)
            path = f"{root}/{f.name}"
            # Context manager so the handle is closed deterministically.
            with open(path) as fh:
                eval_data = json.load(fh)

            df = pd.DataFrame(eval_data)
            df["acquire_pairs_function"] = cfg["exp5"]["value"]["acquire_pairs_function"]
            #df["og"] = int(cfg["exp5"]["value"]["over_generate_factor"])
            df["seed"] = int(cfg["seed"]["value"])
            df["run_name"] = r.name
            dfs.append(df)

    if not dfs:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(
            f"No file named {filename!r} found in runs tagged {tag!r} of project {project!r}"
        )

    return pd.concat(dfs)

In [5]:
def submit_to_oracle(eval_filename, m=None):
    """Sample evaluation rows and ask the oracle which completion it prefers.

    Loads ``eval_filename`` from every tagged run (via ``get_eval_data`` with
    the notebook-level ``project`` and ``tag``), keeps a single acquisition
    function, draws ``n`` rows from every (acquire_pairs_function, seed) group
    and sends them to ``direct.openai_ranking.get_preference_batch``.

    Args:
        eval_filename: name of the evaluation JSON file stored with each run.
        m: value used for the ``m == 0`` check. When omitted it is parsed from
            the ``_m<digits>_`` part of ``eval_filename`` instead of being read
            from a notebook global, which other cells may have rebound. If it
            cannot be determined the run is treated as ``m != 0``.

    Returns:
        The sampled DataFrame with an extra boolean ``win`` column that is True
        where the oracle response has ``preferred == 0`` (presumably meaning
        ``completion_a``, i.e. the ``completions`` column, won - confirm
        against ``get_preference_batch``).

    Raises:
        ValueError: if no rows remain after filtering, or (from pandas) if a
            group holds fewer than ``n`` rows.
    """
    import re  # local import: only needed to recover m from the file name

    if m is None:
        found = re.search(r"_m(\d+)_", eval_filename)
        m = int(found.group(1)) if found else None

    df = get_eval_data(project, tag, eval_filename)
    #df = df[~df.seed.isin([12029, 22151, 22371, 23767, 29799])]

    # At m == 0 there is no point paying to eval other variants since these are
    # sampled from the ref model, so only RANDOM is kept. The two filters must
    # be exclusive: applying both leaves an empty frame.
    wanted_function = "RANDOM" if m == 0 else "HIGH_ENTROPY"
    df = df[df.acquire_pairs_function == wanted_function]
    if df.empty:
        raise ValueError(
            f"No {wanted_function} rows found for {eval_filename!r}; nothing to send to the oracle"
        )

    print(set(df.seed))
    print(set(df.acquire_pairs_function))

    # GroupBy.sample keeps the grouping columns in the result on every pandas
    # version, unlike groupby.apply(lambda x: x.sample(n)).
    sample_df = df.groupby(["acquire_pairs_function", "seed"]).sample(n=n)
    batch = sample_df[["prompts", "completions", "vs_completions"]].rename(
        columns={"completions": "completion_a", "vs_completions": "completion_b", "prompts": "prompt"}
    ).to_dict("records")

    # Positional arguments are passed through unchanged; the progress output
    # suggests the 10 is a thread count - confirm against get_preference_batch.
    oracle_response = direct.openai_ranking.get_preference_batch(batch, "gpt-4-1106-preview", None, 10, dataset, provider="openai")
    cost = sum(r['cost'] for r in oracle_response)
    print(f"That cost ~ {cost} USD")

    # Responses are matched to rows by position, so batch order must be preserved.
    sample_df = sample_df.assign(win=[r['preferred'] == 0 for r in oracle_response])
    return sample_df


In [6]:
import time, os

def to_csv(df, filename):
    """Write ``df`` to ``filename`` as CSV (no index) without clobbering existing files.

    If ``filename`` already exists, a warning is printed and the data is
    written to ``<filename>.<time.time()>`` instead.
    """
    already_present = os.path.exists(filename)
    destination = f"{filename}.{time.time()}" if already_present else filename

    if already_present:
        print(f"Warning - {filename} exists")
        print(f"Writing to {destination}")

    df.to_csv(destination, index=False)

In [7]:
# Evaluate every (m, phase) checkpoint pair with the oracle and save the
# per-sample results. m_s and phase_s are paired positionally; zip() silently
# stops at the shorter list, so keep them the same length.
# NOTE: each iteration spends money on oracle API calls (submit_to_oracle
# prints the cost), so avoid re-running this cell unnecessarily.
# The loop variables are notebook-level globals; do not rename `m` without
# checking for code that reads it as a global.
for m, p in zip(m_s, phase_s):
    eval_filename = f"evaluation_m{m}_phase{p}_post_training_T0.25.json"
    s = submit_to_oracle(eval_filename)
    # to_csv never overwrites: if the target exists it writes to a
    # timestamp-suffixed file name instead.
    to_csv(s, f"../results/post-eval-winrate-{dataset}-m{m}-{tag}.csv")

loading evaluation_m128_phase1_post_training_T0.25.json from sparkling-haze-260
loading evaluation_m128_phase1_post_training_T0.25.json from distinctive-star-259
loading evaluation_m128_phase1_post_training_T0.25.json from faithful-breeze-258
loading evaluation_m128_phase1_post_training_T0.25.json from classic-grass-257
loading evaluation_m128_phase1_post_training_T0.25.json from balmy-dragon-256
loading evaluation_m128_phase1_post_training_T0.25.json from floral-plasma-255
{6242, 9242, 7242, 8242, 4242, 5242}
{'HIGH_ENTROPY'}


Getting preferences of batch of 6144 using 10 threads: 100%|██████████| 6144/6144 [28:55<00:00,  3.54it/s]  


That cost ~ 34.985630000000036 USD
Writing to ../results/post-eval-winrate-tldr-m128-xmas-sweep2.csv.1717145431.1338885
loading evaluation_m256_phase2_post_training_T0.25.json from sparkling-haze-260
loading evaluation_m256_phase2_post_training_T0.25.json from distinctive-star-259
loading evaluation_m256_phase2_post_training_T0.25.json from faithful-breeze-258
loading evaluation_m256_phase2_post_training_T0.25.json from classic-grass-257
loading evaluation_m256_phase2_post_training_T0.25.json from balmy-dragon-256
loading evaluation_m256_phase2_post_training_T0.25.json from floral-plasma-255
{6242, 9242, 7242, 8242, 4242, 5242}
{'HIGH_ENTROPY'}


Getting preferences of batch of 6144 using 10 threads: 100%|██████████| 6144/6144 [30:47<00:00,  3.33it/s]  


That cost ~ 35.36103 USD
Writing to ../results/post-eval-winrate-tldr-m256-xmas-sweep2.csv.1717147291.832279
loading evaluation_m384_phase3_post_training_T0.25.json from sparkling-haze-260
loading evaluation_m384_phase3_post_training_T0.25.json from distinctive-star-259
loading evaluation_m384_phase3_post_training_T0.25.json from faithful-breeze-258
loading evaluation_m384_phase3_post_training_T0.25.json from classic-grass-257
loading evaluation_m384_phase3_post_training_T0.25.json from balmy-dragon-256
loading evaluation_m384_phase3_post_training_T0.25.json from floral-plasma-255
{6242, 9242, 7242, 8242, 4242, 5242}
{'HIGH_ENTROPY'}


Getting preferences of batch of 6144 using 10 threads:  68%|██████▊   | 4149/6144 [20:46<08:58,  3.70it/s]  Traceback (most recent call last):
  File "/home/will/code/active-preference-learning/venv/lib/python3.10/site-packages/openai/api_requestor.py", line 753, in _interpret_response_line
    data = json.loads(rbody)
  File "/home/will/miniconda3/lib/python3.10/json/__init__.py", line 346, in loads
    return _default_decoder.decode(s)
  File "/home/will/miniconda3/lib/python3.10/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/home/will/miniconda3/lib/python3.10/json/decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/will/code/active-preference-learning/direct/openai_ranking.py", line 172,

openai error raised HTTP code 502 from API (<html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>cloudflare</center>
</body>
</html>
) - sleeping for 51.86881350489081 s...


Getting preferences of batch of 6144 using 10 threads: 100%|██████████| 6144/6144 [30:41<00:00,  3.34it/s]  


That cost ~ 35.18677999999996 USD
Writing to ../results/post-eval-winrate-tldr-m384-xmas-sweep2.csv.1717149145.5653675
loading evaluation_m512_phase4_post_training_T0.25.json from sparkling-haze-260
loading evaluation_m512_phase4_post_training_T0.25.json from distinctive-star-259
loading evaluation_m512_phase4_post_training_T0.25.json from faithful-breeze-258
loading evaluation_m512_phase4_post_training_T0.25.json from classic-grass-257
loading evaluation_m512_phase4_post_training_T0.25.json from balmy-dragon-256
loading evaluation_m512_phase4_post_training_T0.25.json from floral-plasma-255
{6242, 9242, 7242, 8242, 4242, 5242}
{'HIGH_ENTROPY'}


Getting preferences of batch of 6144 using 10 threads:  68%|██████▊   | 4148/6144 [20:27<09:04,  3.67it/s]  Traceback (most recent call last):
  File "/home/will/code/active-preference-learning/direct/openai_ranking.py", line 172, in get_preference
    resp = openai.ChatCompletion.create(
  File "/home/will/code/active-preference-learning/venv/lib/python3.10/site-packages/openai/api_resources/chat_completion.py", line 25, in create
    return super().create(*args, **kwargs)
  File "/home/will/code/active-preference-learning/venv/lib/python3.10/site-packages/openai/api_resources/abstract/engine_api_resource.py", line 153, in create
    response, _, api_key = requestor.request(
  File "/home/will/code/active-preference-learning/venv/lib/python3.10/site-packages/openai/api_requestor.py", line 298, in request
    resp, got_stream = self._interpret_response(result, stream)
  File "/home/will/code/active-preference-learning/venv/lib/python3.10/site-packages/openai/api_requestor.py", line 700

openai error raised The server had an error while processing your request. Sorry about that! {
  "error": {
    "message": "The server had an error while processing your request. Sorry about that!",
    "type": null,
    "param": null,
    "code": null
  }
} 500 {'error': {'message': 'The server had an error while processing your request. Sorry about that!', 'type': None, 'param': None, 'code': None}} {'Date': 'Fri, 31 May 2024 10:13:16 GMT', 'Content-Type': 'application/json', 'Content-Length': '165', 'Connection': 'keep-alive', 'openai-organization': 'ucl-ai-centre', 'openai-processing-ms': '30853', 'openai-version': '2020-10-01', 'strict-transport-security': 'max-age=15724800; includeSubDomains', 'x-ratelimit-limit-requests': '10000', 'x-ratelimit-limit-tokens': '2000000', 'x-ratelimit-remaining-requests': '9999', 'x-ratelimit-remaining-tokens': '1999469', 'x-ratelimit-reset-requests': '6ms', 'x-ratelimit-reset-tokens': '15ms', 'x-request-id': 'req_93662be9bf0a30e42d1987d75111ca1c',

Getting preferences of batch of 6144 using 10 threads: 100%|██████████| 6144/6144 [30:52<00:00,  3.32it/s]  

That cost ~ 35.28268000000006 USD
Writing to ../results/post-eval-winrate-tldr-m512-xmas-sweep2.csv.1717151010.3497448





In [None]:
# NOTE: stale snippet - submit_to_oracle now takes an evaluation file name, not m.
# for m in [128, 256, 512, 768]:
#     s = submit_to_oracle(m)
#     s.to_csv(f"../results/post-eval-winrate-{dataset}-m{m}-{tag}.csv", index=False)

In [None]:
# TODO: how can I evaluate only the seeds that have not been evaluated yet (e.g. HIGH_ENTROPY on tldr)?

# for m in m_s:
#     s = submit_to_oracle(m)
#     s.to_csv(f"../results/post-eval-winrate-{dataset}-m{m}-{tag}-ENTROPY.csv", index=False)

In [None]:
!ls ../results/*xmas-lora.csv*


In [None]:
# Load every saved win-rate CSV matching `pattern`, tag each row with the m
# encoded in its file name, and summarise the win rate per (m, acquisition fn).
# NOTE(review): the evaluation loop in this notebook writes files named
# ...-{tag}.csv with tag = "xmas-sweep2", which this pattern does not match -
# confirm that "xmas-lora" is the result set intended here.
pattern = "../results/*xmas-lora.csv*"

dfs = []
# Loop names are deliberately not `m` / `p`, so the notebook globals of the
# same name set by the evaluation loop are not rebound.
for csv_path in sorted(glob.glob(pattern)):
    # File names look like post-eval-winrate-<dataset>-m<M>-<tag>.csv[.<timestamp>].
    # Parse the basename only, so hyphens in the directory path cannot shift
    # the position of the m<M> token.
    m_tokens = [
        token
        for token in os.path.basename(csv_path).split("-")
        if token[:1] == "m" and token[1:].isdigit()
    ]
    if not m_tokens:
        raise ValueError(f"Cannot find an m<digits> token in file name {csv_path!r}")

    results_part = pd.read_csv(csv_path)
    results_part["m"] = int(m_tokens[0][1:])
    dfs.append(results_part)

if not dfs:
    # pd.concat([]) would raise an opaque "No objects to concatenate".
    raise FileNotFoundError(f"No result files match {pattern!r}")

df = pd.concat(dfs)
# `count` is the number of judged samples, `mean` the fraction with win == True.
df.groupby(["m", "acquire_pairs_function"]).win.agg(["count", "mean"])