In [39]:
import torch
import numpy as np
import math
import os
import pandas as pd

from tqdm import tqdm

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
from matplotlib import style
plt.style.use('seaborn-v0_8')

In [40]:
bj_bounds = dict()

In [41]:
def binedges_equalmass(x, n_bins):
    n = len(x)
    return np.interp(np.linspace(0, n, n_bins + 1),
                     np.arange(n),
                     np.sort(x))

def find_bin_edges_equal_mass_src(source_data, n_bins, clf):
    ## compute iw using training set
    w_list_train = []
    X_source = source_data[0]
    w_source = 1/clf.predict_proba(X_source)[:, 1]-1

    bin_edges = binedges_equalmass(list(w_source), n_bins)
    bin_edges[0] = 0.0
    bin_edges[-1] = np.inf
    return bin_edges

def bci_clopper_pearson(k, n, alpha, two_side=True, use_R=False):
    if two_side:
        if use_R: # R is numerically better when alpha is small
            from rpy2.robjects.packages import importr
            stats = importr('stats')

            lo = stats.qbeta(alpha/2, int(k), int(n-k+1))[0]
            hi = stats.qbeta(1 - alpha/2, int(k+1), int(n-k))[0]
        else:
            from scipy import stats

            lo = stats.beta.ppf(alpha/2, k, n-k+1)
            hi = stats.beta.ppf(1 - alpha/2, k+1, n-k)
        
            lo = 0.0 if math.isnan(lo) else lo
            hi = 1.0 if math.isnan(hi) else hi
    
        return lo, hi
    else:
        if use_R: # R is numerically better when alpha is small
            from rpy2.robjects.packages import importr
            stats = importr('stats')

            hi = stats.qbeta(1 - alpha, int(k+1), int(n-k))[0]
        else:
            from scipy import stats

            hi = stats.beta.ppf(1 - alpha, k+1, n-k)
            hi = 1.0 if math.isnan(hi) else hi
    
        return hi

In [97]:
chat_root = "full"
filepath = "../llm_output/{}_chat/google-flan-t5-xxl_predictions.csv".format(chat_root)
source_df = pd.read_csv(filepath)

test_hyp = None
for h in source_df["hypothesis"].unique():
    rows = source_df[source_df["hypothesis"] == h]
    if len(rows) == 8000:
        test_hyp = h
        break
source_df = source_df[source_df["hypothesis"] == test_hyp]
source_df = source_df.sort_values(["task_id"])
source_tasks = source_df["task_id"].tolist()

source_emb_df = pd.read_csv("../llm_output/{}_chat/sentence-transformers-multi-qa-mpnet-base-dot-v1_embeddings.csv".format(chat_root))
source_emb_df = source_emb_df.sort_values(["task_id"])
source_emb_tasks = source_emb_df["task_id"].tolist()

assert source_tasks == source_emb_tasks

  source_df = pd.read_csv(filepath)


In [101]:
X_source = np.array([eval(s) for s in source_emb_df["embedding"].tolist()])
z_source = np.array(source_df["toxicity"].tolist())

X_source.shape, z_source.shape

((8000, 768), (8000,))

In [None]:
chat_root = "red_team"
filepath = "../llm_output/{}_chat/google-flan-t5-xxl_predictions.csv".format(chat_root)
target_df = pd.read_csv(filepath)

target_df = target_df[target_df["hypothesis"] == test_hyp]
target_df = target_df.sort_values(["task_id"])
target_tasks = target_df["task_id"].tolist()

target_emb_df = pd.read_csv("../llm_output/{}_chat/sentence-transformers-multi-qa-mpnet-base-dot-v1_embeddings.csv".format(chat_root))
target_emb_df = target_emb_df.sort_values(["task_id"])
target_emb_tasks = target_emb_df["task_id"].tolist()

assert target_tasks == target_emb_tasks

In [None]:
X_target = np.array([eval(s) for s in target_emb_df["embedding"].tolist()])
z_target = np.array(target_df["toxicity"].tolist())

X_target.shape, z_target.shape

In [6]:
output_dir = '../llm_output'

chat_root = "full"
chat_dir = os.path.join(output_dir, "{}_chat".format(chat_root))

chat_df = pd.read_csv(os.path.join(chat_dir, 'google-flan-t5-xxl_predictions.csv'))
chat_df = chat_df.sort_values(["task_id"])
chat_tasks = sorted(chat_df["task_id"].unique())

source_df = pd.read_csv(os.path.join(chat_dir, 'sentence-transformers-multi-qa-mpnet-base-dot-v1_embeddings.csv'))
source_df = source_df.sort_values(["task_id"])
source_tasks = sorted(source_df["task_id"].unique())

assert chat_tasks == source_tasks

X_source = []
z_source = []

last_task = -1
for task in source_tasks:
    assert task > last_task
    emb = eval(source_df[source_df["task_id"] == task]["embedding"].tolist()[0])
    X_source.append(emb)

    score = chat_
    
    last_task = task
    
X_source = np.array(X_source)
X_source.shape

  chat_df = pd.read_csv(os.path.join(chat_dir, 'google-flan-t5-xxl_predictions.csv'))
  chat_df = pd.read_csv(os.path.join(chat_dir, 'google-flan-t5-xxl_predictions.csv'))


In [8]:
chat_root = "red_team"
chat_dir = os.path.join(output_dir, "{}_chat".format(chat_root))

chat_df = pd.read_csv(os.path.join(chat_dir, 'google-flan-t5-xxl_predictions.csv'))
chat_df = chat_df.sort_values(["task_id"])
chat_tasks = sorted(chat_df["task_id"].unique())

target_df = pd.read_csv(os.path.join(chat_dir, 'sentence-transformers-multi-qa-mpnet-base-dot-v1_embeddings.csv'))
target_df = target_df.sort_values(["task_id"])
target_tasks = sorted(target_df["task_id"].unique())

assert chat_tasks == target_tasks

X_target = []

last_task = -1
for task in target_tasks:
    assert task > last_task
    emb = eval(target_df[target_df["task_id"] == task]["embedding"].tolist()[0])
    X_target.append(emb)
    last_task = task
    
X_target = np.array(X_target)
X_target.shape

(8000, 768)

In [3]:
# Create dists for source and target
# Train classifier
# Calculate importance weights
# Get bin boundaries
# Run below code

In [17]:
def unison_shuffled_copies(a, b, c):
    assert len(a) == len(b) and len(b) == len(c)
    p = np.random.permutation(len(a))
    return a[p], b[p], c[p]

torch.manual_seed(0)
np.random.seed(0)

# n_source = X_source.shape[0]
# n_target = X_target.shape[0]

n_source = 10000
n_target = 10000

X_source = np.random.normal(0.5,0.25,(n_source, 768))
X_target = np.random.normal(0.5,0.25,(n_target, 768))

y_source = np.ones(n_source)
y_target = np.zeros(n_target)

z_source = np.clip(np.random.normal(0.25,0.25,n_source), 0, 1)
z_target = np.clip(np.random.normal(0.75,0.25,n_target), 0, 1)

# plt.scatter(X_source[:, 0], X_source[:, 1], alpha=0.5, label="source")
# plt.scatter(X_target[:, 0], X_target[:, 1], alpha=0.5, label="target")
# plt.scatter(target_data[:, 0], target_data[:, 1], alpha=0.5, label="target")
# plt.show()

X = np.concatenate([X_source, X_target])
y = np.concatenate([y_source, y_target])
z = np.concatenate([z_source, z_target])

n_train = int(n_source*0.2)
X, y, z = unison_shuffled_copies(X, y, z)
X_train, X_test = X[:n_train], X[n_train:]
y_train, y_test = y[:n_train], y[n_train:]
z_train, z_test = z[:n_train], z[n_train:]

print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)
print(z_train.shape, z_test.shape)

(2000, 768) (18000, 768)
(2000,) (18000,)
(2000,) (18000,)


In [18]:
# X_train, X_test, y_train, y_test = train_test_split(
#     X, 
#     y, 
#     train_size = 0.25,
#     stratify=y,
#     random_state=1
# )

clf = MLPClassifier(random_state=1, max_iter=50, verbose=True).fit(X_train, y_train)

score = clf.score(X_test, y_test)
print("score", score)
# print(X_train.shape, X_test.shape)

Iteration 1, loss = 0.78838618
Iteration 2, loss = 0.71264593
Iteration 3, loss = 0.70625763
Iteration 4, loss = 0.69497843
Iteration 5, loss = 0.69161089
Iteration 6, loss = 0.68753725
Iteration 7, loss = 0.68517506
Iteration 8, loss = 0.68147992
Iteration 9, loss = 0.67749718
Iteration 10, loss = 0.67745259
Iteration 11, loss = 0.67090195
Iteration 12, loss = 0.66687047
Iteration 13, loss = 0.66573716
Iteration 14, loss = 0.65951243
Iteration 15, loss = 0.65500136
Iteration 16, loss = 0.64872327
Iteration 17, loss = 0.64372259
Iteration 18, loss = 0.64322293
Iteration 19, loss = 0.64805833
Iteration 20, loss = 0.64273659
Iteration 21, loss = 0.63686194
Iteration 22, loss = 0.62464118
Iteration 23, loss = 0.61537552
Iteration 24, loss = 0.61026349
Iteration 25, loss = 0.60576389
Iteration 26, loss = 0.59770156
Iteration 27, loss = 0.59391218
Iteration 28, loss = 0.59001136
Iteration 29, loss = 0.59560978
Iteration 30, loss = 0.58792023
Iteration 31, loss = 0.57852375
Iteration 32, los



In [19]:
source_data = (X_test[y_test == 1], y_test[y_test == 1], z_test[y_test == 1])
target_data = (X_test[y_test == 0], y_test[y_test == 0], z_test[y_test == 0])

# clf = MLPClassifier(random_state=1, max_iter=50, verbose=True).fit(X, y)

# score = clf.score(X, y)
# print("score", score)
# print(X.shape, X.shape)

# source_data = (X[y == 1], y[y == 1])
# target_data = (X[y == 0], y[y == 0])

print(source_data[0].shape, source_data[1].shape, source_data[2].shape)
print(target_data[0].shape, target_data[1].shape, target_data[2].shape)

(9002, 768) (9002,) (9002,)
(8998, 768) (8998,) (8998,)


In [20]:
X_val = np.concatenate([source_data[0], target_data[0]])
y_val = np.concatenate([source_data[1], target_data[1]])
z_val = np.concatenate([source_data[2], target_data[2]])
X_val.shape, y_val.shape, z_val.shape

((18000, 768), (18000,), (18000,))

In [21]:
n_bins = 5
delta = 0.05
E = 1e-3

In [22]:
print(f'## histogram binning with n_bins = {n_bins}, delta = {delta:e}, and E = {E}')

bin_edges = find_bin_edges_equal_mass_src(source_data, n_bins, clf)
assert(len(bin_edges) == n_bins+1)

print('[bin edges]', bin_edges)

iw = (1/clf.predict_proba(X_val)[:, 1])-1
# iw = clf.predict_proba(X_val)[:, 1]
iw.shape

## histogram binning with n_bins = 5, delta = 5.000000e-02, and E = 0.001
[bin edges] [0.         0.69459888 1.18068726 1.87631833 3.14481509        inf]


(18000,)

In [23]:
n_src_list, n_tar_list, iw_est = [], [], []
for i, (l, u) in enumerate(zip(bin_edges[:-1], bin_edges[1:])):
    if i == len(bin_edges)-2:
        idx = (iw>=l) & (iw<=u)
    else:
        idx = (iw>=l) & (iw<u)
    label_i = y_val[idx]
    n_src = np.sum(label_i == 1)
    n_tar = np.sum(label_i == 0)

    print(f'bin_id = {i+1}, n_src = {n_src}, n_tar = {n_tar}')

    iw_est.append((l+u)/2.0)
    n_src_list.append(n_src)
    n_tar_list.append(n_tar)

bin_id = 1, n_src = 1801, n_tar = 1820
bin_id = 2, n_src = 1800, n_tar = 1789
bin_id = 3, n_src = 1801, n_tar = 1780
bin_id = 4, n_src = 1800, n_tar = 1776
bin_id = 5, n_src = 1800, n_tar = 1833


In [24]:
iw_est, n_src_list, n_tar_list = np.array(iw_est), np.array(n_src_list), np.array(n_tar_list)
n_src_all, n_tar_all = np.sum(n_src_list), np.sum(n_tar_list)
print('[src]', n_src_list, n_src_all)
print('[tar]', n_tar_list, n_tar_all)

## estimate CP intervals
itv_rate_src = [bci_clopper_pearson(k, n_src_all, delta / n_bins / 2.0) for k in n_src_list]
itv_rate_tar = [bci_clopper_pearson(k, n_tar_all, delta / n_bins / 2.0) for k in n_tar_list]

print('[itv_rate_src]', itv_rate_src)
print('[itv_rate_tar]', itv_rate_tar)
print()

## compute iw lower/upper/mean. Note that add a small value to avoid numerical error
iw_lower = np.array([max(0, n_tar[0] - E)/(n_src[1] + E + 1e-16) for n_src, n_tar in zip(itv_rate_src, itv_rate_tar)]) 
iw_upper = np.array([(n_tar[1] + E)/(max(0, n_src[0] - E) + 1e-16) for n_src, n_tar in zip(itv_rate_src, itv_rate_tar)])
iw_mean = (iw_lower + iw_upper) / 2.0

print('[lower]', iw_lower)
print('[upper]', iw_upper)
print('[mean]', iw_mean)
print('[iw_max]', np.max(iw_upper))
print()

w_hat = np.zeros_like(y_val)

for i, (l, u) in enumerate(zip(bin_edges[:-1], bin_edges[1:])):
    if i == len(bin_edges)-2:
        idx = (iw>=l) & (iw<=u)
    else:
        idx = (iw>=l) & (iw<u)
    w_hat[idx] = iw_mean[i]

print("w hat", w_hat)

print("diffs", iw_upper - iw_lower)

epsilon = np.max(iw_upper - iw_lower)
print("epsilon", epsilon, epsilon/(1-epsilon))

[src] [1801 1800 1801 1800 1800] 9002
[tar] [1820 1789 1780 1776 1833] 8998
[itv_rate_src] [(0.18836488709195212, 0.21214300544007075), (0.18825633367210096, 0.2120295252482702), (0.18836488709195212, 0.21214300544007075), (0.18825633367210096, 0.2120295252482702), (0.18825633367210096, 0.2120295252482702)]
[itv_rate_tar] [(0.19051294023366067, 0.2143933973179572), (0.18714609525217568, 0.21087414232963356), (0.18616888155148315, 0.20985216587396957), (0.1857346018787794, 0.20939791661151091), (0.19192524751325887, 0.21586880962460678)]

[lower] [0.88913516 0.87380421 0.8687542  0.8671784  0.89623843]
[upper] [1.14959319 1.13146583 1.12535582 1.12358238 1.15813871]
[mean] [1.01936417 1.00263502 0.99705501 0.99538039 1.02718857]
[iw_max] 1.15813871483973

w hat [1.00263502 1.02718857 1.00263502 ... 1.02718857 1.01936417 1.02718857]
diffs [0.26045803 0.25766162 0.25660162 0.25640398 0.26190028]
epsilon 0.2619002836140608 0.35483048943094003


In [25]:
w_hat_source = w_hat[y_test == 1]
w_hat_target = w_hat[y_test == 0]

w_hat_max = np.max(w_hat_source)

w_hat_source.shape, w_hat_target.shape, w_hat_max

((9002,), (8998,), 1.0271885730326995)

In [26]:
V = np.random.uniform(0,1,w_hat_source.shape[0])

In [27]:
z_target = target_data[2]

In [28]:
z_source = source_data[2][(w_hat_source/w_hat_max) > V]
z_source.shape

(8837,)

In [328]:
from prompt_risk.bounds import *
from prompt_risk.utils import *

In [None]:
n_cal = z_source.shape[0]
if n_cal not in bj_bounds:
    b = berk_jones(n_cal, 0.05)
    bj_bounds[n_cal] = b
else:
    b = bj_bounds[n_cal]

In [None]:
inflation = epsilon/(1-epsilon)
plt.plot(b, np.sort(z_source), label="unshifted bound")
plt.plot(b+inflation, np.sort(z_source), label="shift bound")
plt.plot(np.arange(z_target)/len(z_target), np.sort(z_target), label="empirical")