# Load model evaluation results and compute accuracies, confidence intervals

In [None]:
from datasets import Dataset
import os

import sys
import pickle as pkl

import numpy as np
from tqdm import tqdm

# add to path
sys.path.append('evaluation')

import benchmarks

In [2]:
from scipy.stats import bootstrap
from sklearn import metrics

def accuracy(labels, predictions, verbose=True, confidence_level=0.9):
    """Compute the accuracy together with a bootstrap confidence interval.

    Args:
        labels: Ground-truth labels, one per sample.
        predictions: Predicted labels, aligned element-wise with ``labels``.
        verbose: If true, print the accuracy, the confidence interval and the
            bootstrap standard error.
        confidence_level: Confidence level of the interval as a fraction
            (``0.9`` requests a 90% interval).

    Returns:
        A tuple ``(acc, res)``: ``acc`` is the accuracy on the full sample and
        ``res`` is the object returned by ``scipy.stats.bootstrap``, from which
        ``confidence_interval`` and ``standard_error`` are read.
    """
    acc = metrics.accuracy_score(labels, predictions)
    res = bootstrap(
        (np.array(labels), np.array(predictions)),
        metrics.accuracy_score,
        vectorized=False,
        # labels and predictions belong together, so they are resampled as pairs
        paired=True,
        confidence_level=confidence_level,
        batch=min(len(labels), 5000),
    )
    if verbose:
        interval = res.confidence_interval
        # `:g` prints the level compactly (0.9 -> "90", 0.975 -> "97.5"); the
        # previous `:-2f` spec rendered it as "90.000000".
        print(
            f"Accuracy: {acc:.3f}, {confidence_level * 100:g}%-Confidence Interval: "
            f"({interval.low:.3f}, {interval.high:.3f}), "
            f"Standard error: {res.standard_error:.3f}"
        )
    return acc, res

In [None]:
# collect the per-step result files of one experiment
experiment_folder = "results/124M_8x"


def _step_of(filename):
    """Return the integer found between the first '=' and the following '.' of the name."""
    return int(filename.split("=")[1].split(".")[0])


# keep only the parquet files whose name mentions a step
step_files = [
    name
    for name in os.listdir(experiment_folder)
    if name.endswith(".parquet") and "step" in name
]
if not step_files:
    # fail with a clear message instead of the opaque unpacking error that
    # `zip(*[])` would otherwise raise below
    raise FileNotFoundError(
        f"no step parquet files found in {experiment_folder!r}"
    )

# parse every step once and sort once by (step, filename);
# afterwards steps[i] is the step of files[i]
steps, files = zip(*sorted((_step_of(name), name) for name in step_files))

print(files)
print(steps)

In [None]:
# load all contamination splits and order them with benchmarks.sort_length
contamination_ds = benchmarks.sort_length(
    benchmarks.load_benchmark('all-contamination-splits')
)

In [None]:
# results[step][split_id]       -> (accuracy, bootstrap result of the accuracy)
# results[step][100 + split_id] -> (mean cross-entropy loss, bootstrap result of the loss)
results = {}

# for each step, load the respective file
for step, filename in zip(steps, files):
    results[step] = {}
    print(f'file: {filename}')
    # load the parquet file with the evaluation results of this step
    ds = Dataset.from_parquet(os.path.join(experiment_folder, filename), keep_in_memory=True)
    # sort dataset by option length (required for match with contamination_ds below)
    ds = benchmarks.sort_length(ds)
    # continue with a pandas DataFrame
    ds = ds.to_pandas()
    # attach the split-id and benchmark columns to the DataFrame, row by row
    ds['split-id'] = contamination_ds['split-id']
    ds['benchmark'] = contamination_ds['benchmark']
    # sanity check that we have matched the right split-id: the first option
    # at every index must be identical in both datasets
    ds_options = ds['options'].tolist()
    for row in tqdm(range(len(contamination_ds))):
        if ds_options[row][0] != contamination_ds[row]['options'][0]:
            raise ValueError(
                f"{filename}: row {row} does not match the contamination dataset"
            )
    # evaluate every split separately
    for split_id in np.unique(ds["split-id"]):
        # select the rows that belong to this split
        split_ds = ds[ds["split-id"] == split_id]
        # print the number of samples in the split
        print(f"Split {split_id}: {len(split_ds)} samples")
        # report the accuracy with its 90% bootstrap confidence interval
        split_acc, acc_bootstrap = accuracy(np.array(split_ds['label'].values), split_ds['prediction'].values, confidence_level=0.90)
        results[step][split_id] = (split_acc, acc_bootstrap)
        # cross-entropy loss of the labelled option of every sample
        # (nan-mean, so NaN entries are ignored)
        ce_losses = [
            np.nanmean(option_losses[label])
            for option_losses, label in zip(split_ds['ce_loss'], split_ds['label'])
        ]
        split_ce_loss = np.nanmean(ce_losses)
        # the 90% confidence interval of the cross-entropy loss
        ce = bootstrap(
            (np.array(ce_losses),),
            np.nanmean,
            vectorized=False,
            paired=False,
            confidence_level=0.90,
        )
        # print ce loss and 90% confidence interval
        print(
            f"Cross-Entropy Loss: {split_ce_loss:.3f}, 90%-Confidence Interval: ({ce.confidence_interval.low:.3f}, {ce.confidence_interval.high:.3f}), Standard error: {ce.standard_error:.3f}"
        )
        # loss results share the dict with the accuracies, offset by 100
        # NOTE(review): presumably split ids are integers below 100 - confirm
        results[step][100 + split_id] = (split_ce_loss, ce)

In [6]:
# save the results as a pickle named after the experiment folder
cache_path = f'results/cache/{os.path.basename(experiment_folder)}.pkl'
# create the cache directory first so a missing folder cannot discard the
# results computed above
os.makedirs(os.path.dirname(cache_path), exist_ok=True)
with open(cache_path, 'wb') as f:
    pkl.dump(results, f)