In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os


from scipy.stats import pearsonr, spearmanr

# Display configuration: wrap printed frames at 80 characters and never
# truncate the contents of a column (prompts / generations are long strings).
pd.options.display.width = 80
pd.options.display.max_colwidth = None

In [None]:
# A: load all data files and concat them
# NOTE(review): the reward_df / nll_df built in this section are reassigned by
# section B below before anything reads them - confirm which of the two
# sections is meant to feed the merge.
data_files = os.listdir("data")
reward_df_files = sorted(
    f
    for f in data_files
    if "scoredreward_humanassistant_includeprompt" in f and "t1.0" in f
)
nll_df_files = sorted(
    f
    for f in data_files
    if "scorednll" in f and "includeprompt" not in f and "t1.0" in f
)

# Reward and NLL files are paired purely by their position in the sorted
# lists, so both lists must be non-empty and of equal length; zip() below would
# otherwise silently ignore the unpaired files.
assert reward_df_files, "No reward score files matched in data/"
assert len(reward_df_files) == len(nll_df_files), (
    f"Found {len(reward_df_files)} reward files but {len(nll_df_files)} NLL files"
)

raw_reward_dfs = [pd.read_csv(f"data/{f}") for f in reward_df_files]
raw_nll_dfs = [pd.read_csv(f"data/{f}") for f in nll_df_files]

# Every reward file must list exactly the same (prompt, generated_text) rows,
# in the same order, as the NLL file it is paired with.
for reward_file, reward_part, nll_file, nll_part in zip(
    reward_df_files, raw_reward_dfs, nll_df_files, raw_nll_dfs
):
    assert reward_part[["prompt", "generated_text"]].equals(
        nll_part[["prompt", "generated_text"]]
    ), f"Rows differ between {reward_file} and {nll_file}"

reward_df = pd.concat(raw_reward_dfs)
nll_df = pd.concat(raw_nll_dfs)
assert reward_df[["prompt", "generated_text"]].equals(
    nll_df[["prompt", "generated_text"]]
), "Concatenated reward and NLL frames are not row-aligned"

# B: load specific files
# One reward-scored CSV and one NLL-scored CSV, read from ./data.
reward_csv_path = "./data/ethz-spylab-rlhf-7b-harmless_l256_promptseed42_numprompt1000_numgenerations2_top_p095_t1.0_humanassistant_scoredreward_humanassistant_includeprompt.csv"
nll_csv_path = "./data/ethz-spylab-rlhf-7b-harmless_l256_promptseed42_numprompt1000_numgenerations2_top_p095_t1.0_humanassistant_scorednll.csv"

reward_df = pd.read_csv(reward_csv_path)
nll_df = pd.read_csv(nll_csv_path)

# Both files must contain the same (prompt, generated_text) rows in the same
# order before their scores are joined.
key_columns = ["prompt", "generated_text"]
assert reward_df[key_columns].equals(nll_df[key_columns])

# Merge
# Join the reward scores and the NLL scores on the (prompt, generated_text)
# pair, then expose the log-probability as the negated NLL.
# NOTE(review): if a (prompt, generated_text) pair occurs more than once, the
# merge yields one row per combination; drop_duplicates() below only collapses
# those whose remaining columns are identical as well.
df = reward_df.merge(nll_df, on=["prompt", "generated_text"])
df["log_probability"] = -df["negative_log_probability"]

# Keep only rows that actually have generated text. pd.read_csv parses empty
# fields as NaN by default, so a missing value is treated as "no text" here
# instead of calling len() on a float, and the check is vectorised rather than
# a row-wise apply.
generated_text = df["generated_text"]
has_generated_text = generated_text.notna() & (
    generated_text.astype(str).str.len() > 0
)
df = df[has_generated_text].drop_duplicates()
df

In [None]:
# Summary statistics of the two columns compared in the rest of the notebook.
summary_columns = ["score", "log_probability"]
df[summary_columns].describe()

In [None]:
# Number of resampled corpuses drawn for the corpus-level statistics.
num_corpuses = 1_000

In [None]:
# 1. Sample level statistics
# Rank and linear correlation between reward score and log-probability over
# the individual generations.
print("===== Sample level statistics =====")
sample_spearman = spearmanr(df["score"], df["log_probability"])
print(f"Spearman: {sample_spearman}")
sample_pearson = pearsonr(df["score"], df["log_probability"])
print(f"Pearson: {sample_pearson}")
df.plot.scatter(x="log_probability", y="score", title="Samples")
plt.show()
plt.close()

# 2. Corpus level
print("===== Corpus level statistics =====")

# 2.1 Sample corpuses
# Each corpus is a resample with replacement of 10 * len(df) rows, seeded by
# its own key so the draw is reproducible.
# NOTE(review): all num_corpuses resampled frames are kept in memory at once.
corpus_size = 10 * len(df)
corpuses = dict(
    (corpus_seed, df.sample(n=corpus_size, replace=True, random_state=corpus_seed))
    for corpus_seed in range(num_corpuses)
)

# 2.2. Compute means and plot
# One row per corpus seed, one column per averaged quantity.
stat_columns = ["score", "log_probability"]
mean_corpuses_df = pd.DataFrame(
    {
        corpus_seed: corpus[stat_columns].mean(axis=0)
        for corpus_seed, corpus in corpuses.items()
    }
).transpose()

corpus_spearman = spearmanr(
    mean_corpuses_df["score"], mean_corpuses_df["log_probability"]
)
print(f"Spearman: {corpus_spearman}")
corpus_pearson = pearsonr(
    mean_corpuses_df["score"], mean_corpuses_df["log_probability"]
)
print(f"Pearson: {corpus_pearson}")

mean_corpuses_df.plot.scatter(
    x="log_probability",
    y="score",
    title="Means by corpus",
)
plt.show()
plt.close()