In [1]:
%matplotlib inline

import json
import os
import tempfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import wandb


In [9]:
def _oracle_model_agreement(row):
    """Score one row's agreement between the oracle ranking and the model's r_hats.

    Returns 1 when the r_hat of the oracle's first-ranked item is strictly
    greater than that of the second-ranked item, 0.5 when r_hats[0] equals
    r_hats[1], and 0 otherwise.
    """
    r_hats, rank = row["r_hats"], row["rank"]
    if r_hats[rank[0]] > r_hats[rank[1]]:
        return 1
    return 0.5 if r_hats[0] == r_hats[1] else 0


def _add_preference_columns(df):
    """Add the derived preference columns to `df` in place and return it.

    Reads the `r_hats` and `rank` columns and writes `oracle_model_agree`,
    `r_margin`, `abs_r_margin`, `p_y0_winner`, `y0_winner` and
    `p_y0_winner_bucket`.
    """
    df["oracle_model_agree"] = df.apply(_oracle_model_agreement, axis=1)

    # Reward of the oracle's first-ranked item minus reward of its second-ranked item.
    df["r_margin"] = df.apply(lambda x: x["r_hats"][x["rank"][0]] - x["r_hats"][x["rank"][1]], axis=1)
    df["abs_r_margin"] = df["r_margin"].abs()

    # Logistic of (r_hat[0] - r_hat[1]): the model's implied probability that y0 wins.
    r_hat_0 = df["r_hats"].apply(lambda r: r[0])
    r_hat_1 = df["r_hats"].apply(lambda r: r[1])
    df["p_y0_winner"] = 1 / (1 + np.exp(r_hat_1 - r_hat_0))
    df["y0_winner"] = df["rank"].apply(lambda r: r[0] == 0)

    # Five buckets numbered 1..5; scaling by 4.9999 rather than 5 keeps
    # p == 1.0 inside bucket 5 instead of spilling into a sixth bucket.
    df["p_y0_winner_bucket"] = (np.floor((df["p_y0_winner"] * 4.9999)) + 1).astype(int)
    return df


def get_training_data(project, tag, filename):
    """Load `filename` from every run in `project` carrying `tag` into one DataFrame.

    For each matching run the JSON-lines file is downloaded, read with pandas
    and annotated with run metadata (`acquire_pairs_function`, `seed`, `m`,
    `run_name`) plus the derived preference columns added by
    `_add_preference_columns`.

    Args:
        project: wandb project path passed to `wandb.Api().runs`.
        tag: only runs whose `tags` contain this value are considered.
        filename: exact name of the run file to load. The `m` column is parsed
            from it, so it must look like `<a>_<b>_m<digits>.<ext>`.

    Returns:
        The concatenation of the per-run DataFrames.

    Raises:
        KeyError: a downloaded file lacks the `r_hats` or `rank` column.
        ValueError: no run with `tag` in `project` has a file named `filename`.
    """
    # The file name is the same for every match, so parse `m` once up front.
    m = int(filename.split("_")[2][1:].split(".")[0])

    dfs = []
    for r in wandb.Api().runs(project):
        if tag not in r.tags:
            continue
        cfg = json.loads(r.json_config)
        for f in r.files():
            if f.name != filename:
                continue
            print("loading", f.name, " from ", r.name)

            # A private scratch directory avoids clobbering a shared /tmp file
            # and is removed as soon as the frame has been read.
            with tempfile.TemporaryDirectory() as root:
                handle = f.download(root, replace=True)
                # wandb documents download() as returning an open file handle;
                # close it if so, to avoid leaking one descriptor per run.
                if hasattr(handle, "close"):
                    handle.close()
                df = pd.read_json(os.path.join(root, f.name), lines=True)

            missing = [col for col in ("r_hats", "rank") if col not in df.columns]
            if missing:
                raise KeyError(
                    f"{f.name} from run {r.name} is missing required column(s) {missing}; "
                    f"found {list(df.columns)}"
                )

            df["acquire_pairs_function"] = cfg["exp5"]["value"]["acquire_pairs_function"]
            df["seed"] = int(cfg["seed"]["value"])
            df["m"] = m
            df["run_name"] = r.name
            dfs.append(_add_preference_columns(df))

    if not dfs:
        raise ValueError(
            f"no run tagged {tag!r} in project {project!r} has a file named {filename!r}"
        )
    return pd.concat(dfs)


In [10]:
# Both datasets are loaded from runs carrying the same sweep tag.
SWEEP_TAG = "xmas-sweep2"

df_imdb = get_training_data(
    project="bbnn/wm-debug-imdb",
    tag=SWEEP_TAG,
    filename="training_data_m768.jsonl",
)
df_tldr = get_training_data(
    project="bbnn/wm-debug-tldr",
    tag=SWEEP_TAG,
    filename="training_data_m512.jsonl",
)

loading training_data_m768.jsonl  from  daily-pyramid-235


KeyError: 'r_hats'

In [4]:
def key_to_nice_name(key):
    """Return the display label for an acquisition-function key.

    Raises:
        KeyError: `key` is not one of the known acquisition-function keys.
    """
    labels = dict(
        ENTROPY="Entropy",
        CERTAINTY="Pref cert",
        UNCERTAINTY="Active-R-Uncertainty",
        OFFLINE="Offline-Random",
        RANDOM="Random",
        HIGH_ENTROPY_AND_CERTAINTY="Ent + Pref",
    )
    return labels[key]

In [6]:

def plot_confidence_histos(acq_fns, target_ms, df, y_max, dataset="imdb"):
    """Plot a grid of confidence/correctness histograms and save it as a PNG.

    One subplot is drawn per (acquisition function, target_m) pair: rows follow
    `acq_fns`, columns follow `target_ms`. Each datapoint is mapped to a single
    "conf" value that combines how confident the model was with whether it
    agreed with the oracle:

    * `p_y0_winner` > 0.5 means the model preferred y0, < 0.5 means it preferred
      y1; values near 0 or 1 are confident predictions.
    * If the model agreed with the oracle, conf = max(p, 1 - p), so confidently
      right predictions sit near 1.
    * If it disagreed, conf = min(p, 1 - p), so confidently wrong predictions
      sit near 0.
    * Unconfident predictions sit near 0.5 either way.

    Agreeing rows are drawn in green and disagreeing rows in red.

    Args:
        acq_fns: acquisition-function keys, one subplot row each.
        target_ms: `target_m` values, one subplot column each.
        df: frame with `target_m`, `acquire_pairs_function`,
            `oracle_model_agree` and `p_y0_winner` columns.
        y_max: upper y-axis limit shared by all subplots.
        dataset: label used in the figure title and the output file name.
    """
    sns.set_style("whitegrid")  # Set plot style

    # squeeze=False keeps `axes` two-dimensional even with a single row or
    # column, so axes[i, j] always works. The size is (12, 9) because the
    # original per-histogram figsize argument resized the figure to that anyway.
    fig, axes = plt.subplots(len(acq_fns), len(target_ms), figsize=(12, 9), squeeze=False)
    fig.suptitle(f'{dataset} histograms of preference predictions', fontsize=20)

    for i, acq_fn in enumerate(acq_fns):
        for j, target_m in enumerate(target_ms):
            ax = axes[i, j]
            selected = df[(df.target_m == target_m) & (df.acquire_pairs_function == acq_fn)]

            # Any truthy agreement score (including the 0.5 tie score) counts as agreement.
            agree = selected["oracle_model_agree"].astype(bool)
            p_y0 = selected["p_y0_winner"]
            conf = np.maximum(p_y0, 1 - p_y0).where(agree, np.minimum(p_y0, 1 - p_y0))

            # Split on agreement rather than on conf >= / <= 0.5 so that rows with
            # conf exactly 0.5 are drawn once instead of in both histograms.
            conf[agree].plot.hist(
                bins=10, ax=ax, xlim=(0, 1), ylim=(0, y_max),
                title=f"{key_to_nice_name(acq_fn)}/{target_m}", color="green")
            conf[~agree].plot.hist(
                bins=10, ax=ax, xlim=(0, 1), ylim=(0, y_max), color="red")

            # Style this subplot's ticks; plt.xticks/plt.yticks would only touch
            # the figure's current axes.
            ax.tick_params(axis="both", labelsize=16)

    fig.savefig(f"paper_{dataset}_histogram.png", dpi=300)


In [7]:
# This first table shows how well calibrated the model's implied p(y0 > y1 | x) becomes in each phase. This
# probability is from eq 6 in the DPO paper and is derived from the r_hat values of the two candidates y0 and y1.

# We bucket it into 5 buckets (0-0.2, 0.2-0.4, etc.) giving the implied probability that y0 is preferred over y1. Then we
# tabulate the actual win rate for y0 for the datapoints in each bucket. You might expect training to improve their agreement.

# Count and observed y0 win rate per (target_m, bucket), one column group per acquisition function.
calibration_imdb = (
    df_imdb
    .groupby(["target_m", "acquire_pairs_function", "p_y0_winner_bucket"])["y0_winner"]
    .agg(["count", "mean"])
    .unstack(level=1)
    .fillna("")
)
display(calibration_imdb)

ACQ_FNS = ["RANDOM", "CERTAINTY", "HIGH_ENTROPY_AND_CERTAINTY"]
plot_confidence_histos(
    acq_fns=ACQ_FNS,
    target_ms=[256, 768],
    df=df_imdb,
    y_max=300,
    dataset="IMDB",
)



NameError: name 'df_imdb' is not defined

In [8]:
# Count and observed y0 win rate per (target_m, bucket), one column group per acquisition function.
calibration_tldr = (
    df_tldr
    .groupby(["target_m", "acquire_pairs_function", "p_y0_winner_bucket"])["y0_winner"]
    .agg(["count", "mean"])
    .unstack(level=1)
    .fillna("")
)
display(calibration_tldr)

ACQ_FNS = ["RANDOM", "CERTAINTY", "HIGH_ENTROPY_AND_CERTAINTY"]
plot_confidence_histos(
    acq_fns=ACQ_FNS,
    target_ms=[256, 512],
    df=df_tldr,
    y_max=150,
    dataset="TLDR",
)

NameError: name 'df_tldr' is not defined