In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm.notebook import tqdm
from pac_utils import pac_labeling, zero_one_loss, squared_loss
from plotting_utils import pac_plot

### Load data

In [None]:
# Name of the benchmark to run; the data is read from '<dataset>.csv'.
dataset = 'alphafold' # one of 'stance', 'misinfo', 'bias', 'imagenet', 'imagenetv2', 'sentiment', 'alphafold'

# Resolve the prediction column and the loss before touching the file, so an
# unsupported dataset name fails immediately with a clear message instead of
# leaving `Yhat` / `loss` undefined and surfacing later as a NameError.
if dataset in ['stance', 'misinfo', 'bias', 'sentiment']:
    yhat_column = "Yhat (GPT4o)"
elif dataset in ['imagenet', 'imagenetv2', 'alphafold']:
    yhat_column = "Yhat"
else:
    raise ValueError(f"Unsupported dataset {dataset!r}: no prediction column is configured for it")

if dataset in ['stance', 'misinfo', 'bias', 'imagenet', 'imagenetv2']:
    loss = zero_one_loss
elif dataset in ['sentiment', 'alphafold']:
    loss = squared_loss
else:
    raise ValueError(f"Unsupported dataset {dataset!r}: no loss function is configured for it")

data = pd.read_csv(dataset + '.csv')
Y = data["Y"].to_numpy()                    # values of the "Y" column
Yhat = data[yhat_column].to_numpy()         # values of the dataset's prediction column
confidence = data["confidence"].to_numpy()  # values of the "confidence" column
n = len(Y)                                  # number of rows in the dataset

### Set parameters for PAC labeling

In [None]:
# alpha and epsilon are forwarded to pac_labeling; alpha is also the level used
# for the reported error quantile (1 - alpha).
# NOTE(review): presumably epsilon is the error tolerance and alpha the allowed
# failure probability of the PAC guarantee -- confirm against pac_utils.
alpha = 0.05
epsilon = 0.05
num_trials = 1000  # number of repetitions of the labeling procedure

# Per-trial results, one entry per repetition; filled in by the trial loop.
errs = np.zeros(num_trials)
percent_saved = np.zeros(num_trials)

# All-ones vector with one entry per data point, passed to pac_labeling as `pi`.
pi = np.ones(len(Y))

# Dataset-specific settings passed to pac_labeling (`asymptotic` flag and `K`).
if dataset in ['stance', 'misinfo', 'bias']:
    asymptotic = False
    K = 500
elif dataset in ['imagenet', 'imagenetv2']:
    asymptotic = False
    K = n // 10
elif dataset in ['sentiment', 'alphafold']:
    asymptotic = True
    K = 1000
else:
    # Fail loudly rather than leaving `asymptotic` and `K` undefined.
    raise ValueError(f"Unsupported dataset {dataset!r}: no PAC labeling settings are configured for it")

### Run PAC labeling

In [None]:
# Seed NumPy's global random generator so re-running this cell (or the whole
# notebook) reproduces the same trials and the same reported numbers.
# NOTE(review): this also fixes any randomness inside pac_labeling only if it
# draws from np.random's global state -- confirm against pac_utils.
seed = 0
np.random.seed(seed)

for i in tqdm(range(num_trials)):
    # Uncertainty score is 1 - confidence, with tiny Gaussian noise to break ties.
    uncertainty = 1 - confidence + 1e-5*np.random.normal(size=n) # break ties
    Y_tilde, labeled_inds, _ = pac_labeling(Y, Yhat, loss, epsilon, alpha, uncertainty, pi, K, asymptotic=asymptotic)
    # Loss of the labels returned by pac_labeling, measured against Y.
    errs[i] = loss(Y,Y_tilde)
    # Percentage of entries of labeled_inds equal to 0.
    # NOTE(review): presumably the points that did not require a collected label -- confirm.
    percent_saved[i] = np.mean(labeled_inds==0.0)*100
# Report the (1 - alpha) quantile of the per-trial errors and mean +/- std of the savings.
print('Error:', np.quantile(errs, 1-alpha), 'Budget save:(', np.mean(percent_saved), '+/-', np.std(percent_saved),')')

### Plot results

In [None]:
# Plot the per-trial errors and budget savings for the selected dataset,
# restricting the x-axis to [0, 0.4] and enabling the `plot_naive` option.
pac_plot(
    errs,
    percent_saved,
    epsilon,
    Y,
    Yhat,
    confidence,
    loss,
    dataset,
    num_trials,
    xlim=[0, 0.4],
    plot_naive=True,
)