In [1]:
import time

import matplotlib.pyplot as plt
import numpy as np
import pickle
# import torch

from collections import Counter
from gap_statistic import OptimalK
from scipy import stats, cluster
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer

from utils.clustering_utils import *
from utils.conformal_utils import *

%load_ext autoreload
%autoreload 2

## Testing the null hypothesis that there is one cluster

In [2]:
from sklearn import metrics 

def _get_cluster_fit(true_class_scores, labels, num_classes, num_clusters):
    '''
    Cluster the classes by their score quantiles and measure how far apart
    the resulting clusters are.

    Steps:
      1. Embed each class i in range(num_classes) by calling
         quantile_embedding on the scores whose label equals i
         (q = [0.5, 0.6, 0.7, 0.8, 0.9]).
      2. Run k-means (random_state=0, n_init=10) on the class embeddings.
      3. Pool the original scores by cluster and return the average pairwise
         L1 distance between the clusters' quantile vectors.

    Input:
        true_class_scores: array of scores, indexed with boolean masks built
            from `labels` (expected to be 1-D with one score per example)
        labels: array aligned with true_class_scores; labels[j] is the class
            of example j
        num_classes: number of classes; classes are assumed to be 0..num_classes-1
        num_clusters: number of k-means clusters

    Output:
        The value returned by compute_avg_distance_between_quantiles for the
        per-cluster score groups (a larger value means the clusters' quantiles
        are further apart).

    NOTE: compute_avg_distance_between_quantiles is defined in a later cell of
    this notebook; that cell must be run before this function is called.

    Two alternative metrics were tried previously (k-means inertia and the
    silhouette score). The silhouette score used to be computed here and then
    overwritten; it is no longer computed because its value was never returned.
    '''
    # Compute embeddings
    q = [0.5, 0.6, 0.7, 0.8, 0.9]
    embeddings = np.zeros((num_classes, len(q)))
    for i in range(num_classes):
        class_i_scores = true_class_scores[labels == i]
        embeddings[i, :] = quantile_embedding(class_i_scores, q=q)

    kmeans = KMeans(n_clusters=num_clusters, random_state=0, n_init=10).fit(embeddings)
    cluster_labels = kmeans.labels_

    # Group the original scores by the cluster their class was assigned to
    list_of_cluster_scores = []
    for i in range(num_clusters):
        # Row index into `embeddings` == class id
        clusteri_classes = np.flatnonzero(cluster_labels == i)
        # np.isin replaces the deprecated np.in1d
        clusteri_scores = true_class_scores[np.isin(labels, clusteri_classes)]
        list_of_cluster_scores.append(clusteri_scores)

    # Average L1 distance between the quantiles of the per-cluster scores
    return compute_avg_distance_between_quantiles(list_of_cluster_scores, q=q)


def test_one_cluster_null(scores, labels, num_classes, num_clusters=2, num_trials=100, seed=0):
    '''
    Permutation test of the null hypothesis that all classes form one cluster.

    The clustering metric from _get_cluster_fit is computed once with the real
    labels and then num_trials times with randomly shuffled labels. The
    function prints the observed metric, the metrics under the null, and the
    fraction of null metrics that are strictly smaller than the observed one.

    Input:
        scores: if it has more than one dimension it is passed, together with
            labels, to get_true_class_conformal_score; otherwise it is used
            directly as the per-example true-class scores
        labels: class label of each example
        num_classes: forwarded to _get_cluster_fit
        num_clusters: forwarded to _get_cluster_fit
        num_trials: number of label permutations
        seed: seed for NumPy's global random state (np.random.seed)

    Output:
        Fraction of permutation trials whose metric is strictly smaller than
        the observed metric. Whether a large or a small fraction counts as
        evidence against the null depends on the orientation of the metric
        returned by _get_cluster_fit.
    '''
    np.random.seed(seed)

    # Multi-dimensional scores are reduced to the true-class score first
    needs_reduction = len(scores.shape) > 1
    true_class_scores = get_true_class_conformal_score(scores, labels) if needs_reduction else scores

    # Metric obtained with the real class labels
    observed_metric = _get_cluster_fit(true_class_scores, labels, num_classes, num_clusters)

    # Metrics obtained after shuffling the labels. The working copy is
    # reshuffled in place each trial so the caller's labels are untouched.
    shuffled_labels = np.copy(labels)
    null_metrics = np.zeros((num_trials,))
    for trial in range(num_trials):
        np.random.shuffle(shuffled_labels)
        null_metrics[trial] = _get_cluster_fit(true_class_scores, shuffled_labels, num_classes, num_clusters)

    # Count the null trials that fall strictly below the observed metric
    num_better = np.sum(null_metrics < observed_metric)
    p_value = num_better / num_trials

    print('Observed metric:', observed_metric)
    print('Metric under null:', null_metrics)
    print(f'Probability of observing a smaller metric under null hypothesis of one cluster: {p_value}',
          f'({num_better} out of {num_trials} trials)')

    return p_value

In [3]:
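# Version 2: operate directly on scores rather than embeddings.
# "[DOES NOT WORK]" presumably refers to the commented-out `test_one_cluster_null_v2` further down;
# `compute_avg_distance_between_quantiles` below is live code and is called by `_get_cluster_fit`.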

def compute_avg_distance_between_quantiles(list_of_arrs, q=[0.5, 0.6, 0.7, 0.8, 0.9]):
    '''
    Computes the L1 distance between the quantiles q of each pair of groups
    in list_of_arrs and then takes the average across all pairs.

    Input:
        list_of_arrs: length-n list of arrays (n >= 2). list_of_arrs[i]
            contains samples from group i
        q: quantile levels passed to np.quantile (the default list is never
            mutated)

    Output:
        Average over all pairs i < j of sum(|quantiles_i - quantiles_j|)

    Raises:
        ValueError: if fewer than two groups are given, since there is no
            pair to compare
    '''
    n_groups = len(list_of_arrs)
    if n_groups < 2:
        raise ValueError(
            f'Need at least two groups to compare quantiles, got {n_groups}')

    # Compute each group's quantile vector once instead of once per pair
    group_quantiles = [np.quantile(arr, q) for arr in list_of_arrs]

    dists = []
    for i in range(n_groups):
        for j in range(i + 1, n_groups):
            dist_ij = np.sum(np.abs(group_quantiles[i] - group_quantiles[j]))
            dists.append(dist_ij)

    avg_dist = sum(dists) / len(dists)
    return avg_dist

# def cluster_and_group_scores_by_cluster(true_class_scores, labels, num_clusters):
    
#     # Compute embeddings
#     q = [0.5, 0.6, 0.7, 0.8, 0.9]
#     embeddings = np.zeros((num_classes, len(q)))
#     for i in range(num_classes):
#         class_i_scores = true_class_scores[labels==i]
#         embeddings[i,:] = quantile_embedding(class_i_scores, q=q)

#     # Cluster
#     kmeans = KMeans(n_clusters=num_clusters, random_state=0, n_init=10).fit(embeddings)
#     cluster_labels = kmeans.labels_
    
#     # Group scores by cluster
#     list_of_cluster_scores = []
#     for i in range(num_clusters):
#         clusteri_classes = np.argwhere(cluster_labels == i)
#         clusteri_scores = true_class_scores[np.in1d(labels, clusteri_classes)]
#         list_of_cluster_scores.append(clusteri_scores)
        
#     return list_of_cluster_scores 

# def test_one_cluster_null_v2(scores, labels, num_classes, num_clusters=2, num_trials=100, seed=0):
#     np.random.seed(seed)
#     q=[0.5, 0.6, 0.7, 0.8, 0.9]
    
#     if len(scores.shape) > 1:
#         true_class_scores = get_true_class_conformal_score(scores, labels)
#     else:
#         true_class_scores = scores
    
#     # Compute metric using true class labels
#     list_of_cluster_scores = cluster_and_group_scores_by_cluster(true_class_scores, labels, num_clusters)
#     observed_metric = compute_avg_distance_between_quantiles(list_of_cluster_scores, q=q)
    
# #     print('OBS', list_of_cluster_scores)
    
#     metrics_under_null = np.zeros((num_trials,))
#     for i in range(num_trials):
#         # Split scores into num_clusters randomly sized chunks
#         # - Randomly select a probability vector p
#         unifs = np.random.rand(num_clusters)
#         p = unifs / unifs.sum()
        
#         # - Assign classes based on p
#         rand_clusters = np.random.choice(num_clusters, size=len(labels), p=p)
#         list_of_cluster_scores = [true_class_scores[rand_clusters==i] for i in range(num_clusters)]
        
# #         print('SIM', list_of_cluster_scores)
        
#         # Compute metric for each random permutation 
#         metrics_under_null[i] = compute_avg_distance_between_quantiles(list_of_cluster_scores, q=q)
        
# #         assert False
        
#     # Compute fraction of results under null that yield a better clustering metric 
#     # than the observed value 
#     num_better = np.sum(metrics_under_null < observed_metric) # Lower inertia = better clustering
#     p_value = num_better / num_trials
    
#     print('Observed metric:', observed_metric)
#     print('Metric under null:', metrics_under_null)
    
#     print(f'Probability of observing a smaller metric under null hypothesis of one cluster: {p_value}',
#           f'({num_better} out of {num_trials} trials)')
    
#     return p_value

### Test on Enron

In [24]:
# Load data
# alpha is not used in this cell; it is passed to clustered_conformal in later cells
# (presumably the target miscoverage level — confirm against utils.conformal_utils)
alpha = .1
n_totalcal = 20 # Total number of calibration points (= # clustering examples + # conformal calibration examples)
# NOTE(review): n_totalcal is passed to split_X_and_y together with num_classes, so it is
# presumably a per-class count — confirm against split_X_and_y

# Enron - BERT
# Paths are relative to the notebook's working directory
softmax_path = "../class-conditional-conformal-datasets/notebooks/.cache/email_softmax_bert_ntrain=500.npy"
labels_path = "../class-conditional-conformal-datasets/notebooks/.cache/email_labels_bert_ntrain=500.npy"

softmax_scores = np.load(softmax_path)
labels = np.load(labels_path)

# Assumes labels are integer class ids starting at 0, so max + 1 is the class count — TODO confirm
num_classes = labels.max() + 1

In [35]:
# Hyperparameters under test: amount of data used for clustering and number of clusters
n_clustering = 5
num_clusters = 3

score_function = 'softmax'


# Only 'softmax' is handled here; for any other value scores_all is left as whatever
# an earlier cell assigned (or is undefined on a fresh kernel)
if score_function == 'softmax':
    scores_all = 1 - softmax_scores
    
# Split into clustering+calibration data and validation data
totalcal_scores_all, totalcal_labels, val_scores_all, val_labels = split_X_and_y(scores_all, labels, n_totalcal, num_classes=num_classes, seed=7)

# 0) Split data between clustering and calibration
# (scores1_all/labels1 are used for the clustering test below; scores2_all/labels2 are not used in this cell)
scores1_all, labels1, scores2_all, labels2 = split_X_and_y(totalcal_scores_all, 
                                                       totalcal_labels, 
                                                       n_clustering, 
                                                       num_classes=num_classes, 
                                                       seed=0)

# 1) Test k chosen using ad-hoc heuristic
# Last expression of the cell: the returned fraction is displayed as the cell output
test_one_cluster_null(scores1_all, labels1, num_classes, num_clusters=num_clusters, num_trials=100, seed=0)

Observed metric: 0.7757373115323073
Metric under null: [0.45151132 0.32432147 0.73205709 0.49116746 0.23389451 0.43058253
 0.39253624 0.46617533 0.30909889 0.80397174 0.67749834 0.82024545
 0.70272406 0.51445724 0.29029255 0.35221266 0.57012928 0.90003647
 0.44544204 0.27693843 0.3671603  0.45065448 1.53850399 0.41744018
 0.30732166 0.39344402 0.44805636 0.41908243 0.69618777 1.26120738
 0.33152527 0.76047233 0.36360908 0.62614049 1.09884185 1.90902774
 0.30035567 1.11888252 0.72788612 0.42156505 0.35764796 1.29340075
 0.34450692 0.98043791 0.45037955 1.12890606 0.27056832 0.36193183
 0.21872629 0.29598139 0.54984367 0.41075872 0.72651938 0.34303703
 0.36823209 0.64185557 0.29999708 0.42316498 0.61647996 0.51725537
 0.32094597 0.39102757 0.61793278 0.40789829 0.32464459 0.57148852
 0.41716768 0.4598939  0.60976683 0.54144907 0.29602576 0.98729637
 0.46410569 0.36812143 0.31888791 0.40146241 0.38875357 0.44211959
 0.78028015 0.32953237 0.33911119 0.44698013 0.54953297 0.25002406
 0.3171

0.85

### Synthetic test

In [41]:
# Generate synthetic data (2 clusters of classes. One is Beta(1,1) and the other is Beta(3,.5))
num_classes = 100
n_clustering = 5

# Seed the data generation so that re-running this cell produces the same synthetic
# dataset. Without this, the data depended on whatever state the global RNG was in,
# so results varied from run to run independently of the `seed` passed to the test below.
data_seed = 0
np.random.seed(data_seed)

true_class_scores = np.zeros((num_classes * n_clustering,))
labels = np.zeros((num_classes * n_clustering,))
for i in range(num_classes):
    # Each class is in Cluster 0 or Cluster 1 with equal probability 
    if np.random.rand() > 0.5:
#         samples = np.random.normal(0,1, size=n_clustering)
        samples = np.random.beta(1,1, size=n_clustering)
    else:
#         samples = np.random.normal(0,1, size=n_clustering)
        samples = np.random.beta(3,.5, size=n_clustering)

    # Examples of class i occupy one contiguous block of n_clustering entries
    true_class_scores[i*n_clustering:(i+1)*n_clustering] = samples 

    labels[i*n_clustering:(i+1)*n_clustering] = i

# Test k=2 vs. k=1
# test_one_cluster_null re-seeds the global RNG with its own `seed` for the permutations.
# Last expression of the cell: the returned fraction is displayed as the cell output.
test_one_cluster_null(true_class_scores, labels, num_classes, num_clusters=2, num_trials=100, seed=0)

Observed metric: 1.8425164414927768
Metric under null: [0.86187347 0.87179594 0.9484072  0.97205673 0.79611811 1.02954443
 0.89338773 0.71414115 0.98365402 1.04118377 0.99117361 0.78590661
 1.01256282 1.15610226 0.81320829 0.95973893 0.8134263  1.00787498
 0.95751718 0.85664703 0.87029901 1.17421971 1.30395335 0.93433334
 1.00179653 0.94041709 0.92147795 1.04283627 0.8346762  0.8681899
 0.90155756 1.05764202 1.00417968 0.92311172 0.98390793 0.76538461
 0.84297992 0.90925041 1.2775974  0.90477097 0.92095417 0.9894978
 1.06825953 0.90673281 0.98330046 1.21902753 0.94558885 1.03282373
 0.79267238 1.12088129 0.96915478 0.84036883 1.10097703 0.94188114
 1.00316437 1.40257299 0.86751182 0.96285074 0.96662509 1.217582
 0.86403219 0.8022638  1.32255051 1.05810676 0.81474648 0.71010339
 1.31229166 0.88728205 1.01444249 0.91783796 0.90669439 1.00289909
 0.93147397 1.06774802 0.92460432 1.18133069 0.958657   1.13707261
 1.02927718 1.05152956 0.94845884 0.79241753 0.84040538 0.82922434
 0.94035698

1.0

Is the p-value correct? E.g., seed=5 gives p = .11, but other seeds give p < .03. We should probably increase `num_trials` beyond 100 to get more stable estimates.

[silhouette] With 1000 trials and 100 classes, Beta (1,1), Beta(3, .5). Higher score = better cluster

P(lower) is 
* 0.986
* 1.0
* 0.963
* 0.992

[inertia] With 1000 trials and 100 classes, Beta(1,1), Beta(3, .5). Lower score = better clustering. However, inertia does not account for the fact that randomizing the labels results in embeddings that are closer together overall.

P(lower) is 
* 0.998 but we actually want this to be close to 0!!!


[inertia] With 100 trials and 100 classes, Normal(0,1), Normal(10,1). 
P(lower) is
* 0.0


In conclusion, silhouette is better for our use case because it is standardized. 

### Test on ImageNet - softmax

In [48]:
# Load data
# alpha is not used in this cell; it is passed to clustered_conformal in later cells
alpha = .1
n_totalcal = 10 # Total number of calibration points (= # clustering examples + # conformal calibration examples)

# ImageNet
# NOTE(review): absolute, machine-specific paths; this cell only runs where these files exist
softmax_path = '/home/tding/data/finetuned_imagenet/imagenet_train_subset_softmax.npy'
labels_path = '/home/tding/data/finetuned_imagenet/imagenet_train_subset_labels.npy'
softmax_scores = np.load(softmax_path)
labels = np.load(labels_path)

# Assumes labels are integer class ids starting at 0, so max + 1 is the class count — TODO confirm
num_classes = labels.max() + 1

In [49]:
score_function = 'softmax'


# Only 'softmax' is handled here; for any other value scores_all is left as whatever
# an earlier cell assigned (or is undefined on a fresh kernel)
if score_function == 'softmax':
    scores_all = 1 - softmax_scores
    
# Split into clustering+calibration data and validation data
# (uses softmax_scores, labels, n_totalcal and num_classes from the preceding load cell)
totalcal_scores_all, totalcal_labels, val_scores_all, val_labels = split_X_and_y(scores_all, labels, n_totalcal, num_classes=num_classes, seed=7)

In [52]:
# Hyperparameters under test: amount of data used for clustering and number of clusters
n_clustering = 9
num_clusters = 50

# 0) Split data between clustering and calibration
# (scores1_all/labels1 are used for the clustering test below; scores2_all/labels2 are not used in this cell)
scores1_all, labels1, scores2_all, labels2 = split_X_and_y(totalcal_scores_all, 
                                                       totalcal_labels, 
                                                       n_clustering, 
                                                       num_classes=num_classes, 
                                                       seed=0)

# 1) Test k chosen using ad-hoc heuristic
# Last expression of the cell: the returned fraction is displayed as the cell output
test_one_cluster_null(scores1_all, labels1, num_classes, num_clusters=num_clusters, num_trials=100, seed=0)

Observed metric: 1.5014748257909503
Metric under null: [1.34337364 1.34823567 1.35910352 1.33588652 1.30077732 1.35194303
 1.34619584 1.38380463 1.40494304 1.34793517 1.2848282  1.35615271
 1.33192835 1.36943094 1.31527481 1.37436384 1.34828231 1.32104032
 1.36042272 1.37184055 1.34199021 1.31849631 1.37028395 1.327778
 1.3274847  1.35686893 1.33207267 1.36747525 1.33284109 1.34138328
 1.3834615  1.31482878 1.39302892 1.31821949 1.35974126 1.34644293
 1.3061125  1.33753944 1.31653484 1.36780583 1.30645252 1.34614379
 1.34176885 1.28651441 1.35002671 1.28216354 1.34035147 1.32556597
 1.31701139 1.33371759 1.35105503 1.35966238 1.28096273 1.35130207
 1.32273301 1.37076126 1.30632771 1.39565656 1.30777287 1.30715527
 1.36965516 1.36329009 1.33515404 1.36878446 1.36936872 1.34348023
 1.32248072 1.34989239 1.3394125  1.36238889 1.33879649 1.32879162
 1.34444194 1.29470788 1.31599475 1.38324171 1.37345336 1.38477182
 1.35027722 1.32980355 1.32493293 1.40223242 1.34168502 1.36346118
 1.342081

1.0

### Test on ImageNet - APS

In [54]:
# Load data
# alpha is not used in this cell; it is passed to clustered_conformal in later cells
alpha = .1
n_totalcal = 10 # Total number of calibration points (= # clustering examples + # conformal calibration examples)

# ImageNet
# NOTE(review): absolute, machine-specific paths; this cell only runs where these files exist
# The APS file is used directly as the score array by the following cell
APS_path = '/home/tding/data/finetuned_imagenet/imagenet_train_subset_APS.npy'
labels_path = '/home/tding/data/finetuned_imagenet/imagenet_train_subset_labels.npy'
APS_scores = np.load(APS_path)
labels = np.load(labels_path)

# Assumes labels are integer class ids starting at 0, so max + 1 is the class count — TODO confirm
num_classes = labels.max() + 1

In [56]:
# The loaded APS scores are used as-is (no transformation, unlike the 1 - softmax case)
scores_all = APS_scores

# Split into clustering+calibration data and validation data
totalcal_scores_all, totalcal_labels, val_scores_all, val_labels = split_X_and_y(scores_all, labels, n_totalcal, num_classes=num_classes, seed=7)

In [62]:
# Hyperparameters under test: amount of data used for clustering and number of clusters
n_clustering = 5
num_clusters = 5

# 0) Split data between clustering and calibration
# (scores1_all/labels1 are used for the clustering test below; scores2_all/labels2 are not used in this cell)
scores1_all, labels1, scores2_all, labels2 = split_X_and_y(totalcal_scores_all, 
                                                       totalcal_labels, 
                                                       n_clustering, 
                                                       num_classes=num_classes, 
                                                       seed=0)

# 1) Test k chosen using ad-hoc heuristic
# `seed` here only seeds the label permutations inside test_one_cluster_null; the data split above uses seed=0.
# Last expression of the cell: the returned fraction is displayed as the cell output
test_one_cluster_null(scores1_all, labels1, num_classes, num_clusters=num_clusters, num_trials=1000, seed=2)

Observed metric: 1.2852707739201041
Metric under null: [1.14475778 1.08360021 1.10488874 1.08136564 1.05255599 1.13209695
 1.07458268 1.08542867 1.24344913 1.08505069 1.14587061 1.19985392
 1.26749752 1.08098534 1.07252618 1.30788533 1.17926092 1.15843491
 1.13979329 1.11182036 1.25768946 1.07926848 1.11760908 1.21430416
 1.10692022 1.13669431 1.11049796 1.16183259 1.14150135 1.11540434
 1.14976526 1.09699924 1.12444508 1.12418124 1.24842408 1.14833131
 1.09286472 1.22571067 1.11569564 1.13573816 1.29479786 1.1317263
 1.11881283 1.10208476 1.1365222  1.21136059 1.08514501 1.13615582
 1.14465668 1.15464564 1.18747472 1.13685776 1.23081858 1.12058058
 1.11648811 1.12578767 1.15818624 1.092296   1.2192506  1.0831188
 1.09661343 1.28469823 1.11335996 1.07396858 1.09910141 1.09966898
 1.23482948 1.10342615 1.10705648 1.12493029 1.20657005 1.16475902
 1.23133083 1.07055279 1.05491229 1.30033174 1.09313835 1.07284801
 1.10334683 1.28130213 1.26155882 1.13898708 1.14907569 1.29483329
 1.070818

0.954

n_clustering = 5
num_clusters = 5

seed=0
Probability of observing a smaller metric under null hypothesis of one cluster: 0.949 (949 out of 1000 trials)
seed=1
Probability of observing a smaller metric under null hypothesis of one cluster: 0.949 (949 out of 1000 trials)
seed=2
Probability of observing a smaller metric under null hypothesis of one cluster: 0.954 (954 out of 1000 trials)

## Test

In [33]:
# alpha is passed to clustered_conformal / _clustered_conformal in the cells below
alpha = .1
n_totalcal = 20 # Total number of calibration points (= # clustering examples + # conformal calibration examples)


# Enron - BERT
# Paths are relative to the notebook's working directory; the files are loaded in the next cell
softmax_path = "../class-conditional-conformal-datasets/notebooks/.cache/email_softmax_bert.npy"
labels_path = "../class-conditional-conformal-datasets/notebooks/.cache/email_labels_bert.npy"

In [7]:
## 1. Get data ============================
# Reads the arrays at softmax_path / labels_path set in the preceding cell
print('Loading softmax scores and labels...')

softmax_scores = np.load(softmax_path)
labels = np.load(labels_path)

# Assumes labels are integer class ids starting at 0, so max + 1 is the class count — TODO confirm
num_classes = labels.max() + 1

Loading softmax scores and labels...


In [26]:
# # for score_function in ['softmax', 'APS', 'RAPS']:
# n_clustering_list = (np.array([.3, .5, .7, .9]) * n_totalcal).astype(np.int32)


# for score_function in ['softmax', 'APS']:
    
#     print(f'====== score_function={score_function} ======')
    
#     print('Computing conformal score...')
#     if score_function == 'softmax':
#         scores_all = 1 - softmax_scores
#     elif score_function == 'APS':
#         scores_all = get_APS_scores_all(softmax_scores, randomize=True)
#     elif score_function == 'RAPS': 
        
#         # RAPS hyperparameters (currently using ImageNet defaults)
#         lmbda = .01 
#         kreg = 5
        
#         scores_all = get_RAPS_scores_all(softmax_scores, lmbda, kreg, randomize=True)
#     else:
#         raise Exception('Undefined score function')


#     print('Splitting data...')
#     # Split into clustering+calibration data and validation data
#     totalcal_scores_all, totalcal_labels, val_scores_all, val_labels = split_X_and_y(scores_all, labels, n_totalcal, num_classes=num_classes, seed=0)

#     for n_clustering in n_clustering_list:
            
#         # 0) Split data 
#         scores1_all, labels1, scores2_all, labels2 = split_X_and_y(totalcal_scores_all, 
#                                                            totalcal_labels, 
#                                                            n_clustering, 
#                                                            num_classes=num_classes, 
#                                                            seed=0)

#         # 1) Compute embedding for each class
#         embeddings = embed_all_classes(scores1_all, labels1, q=[0.5, 0.6, 0.7, 0.8, 0.9])

#         # 2) Do k-means with different k's
#         for num_clusters in np.arange(1,13):
#             kmeans = KMeans(n_clusters=num_clusters, random_state=0, n_init=10).fit(embeddings)
#             cluster_assignments = kmeans.labels_  

#             # Print cluster sizes
#             print(f'[n_clustering={n_clustering}, num_clusters={num_clusters}] Cluster sizes:', [x[1] for x in Counter(cluster_assignments).most_common()])


Computing conformal score...
Splitting data...
[n_clustering=3, num_clusters=1] Cluster sizes: [109]
[n_clustering=3, num_clusters=2] Cluster sizes: [87, 22]
[n_clustering=3, num_clusters=3] Cluster sizes: [74, 28, 7]
[n_clustering=3, num_clusters=4] Cluster sizes: [62, 26, 17, 4]
[n_clustering=3, num_clusters=5] Cluster sizes: [61, 26, 14, 6, 2]
[n_clustering=3, num_clusters=6] Cluster sizes: [59, 22, 17, 5, 4, 2]
[n_clustering=3, num_clusters=7] Cluster sizes: [59, 16, 13, 13, 4, 2, 2]
[n_clustering=3, num_clusters=8] Cluster sizes: [59, 16, 13, 13, 3, 2, 2, 1]
[n_clustering=3, num_clusters=9] Cluster sizes: [59, 16, 12, 11, 3, 3, 2, 2, 1]
[n_clustering=3, num_clusters=10] Cluster sizes: [36, 23, 16, 12, 11, 3, 3, 2, 2, 1]
[n_clustering=3, num_clusters=11] Cluster sizes: [31, 28, 16, 11, 8, 4, 3, 3, 2, 2, 1]
[n_clustering=3, num_clusters=12] Cluster sizes: [36, 23, 15, 12, 7, 3, 3, 3, 2, 2, 2, 1]
[n_clustering=5, num_clusters=1] Cluster sizes: [109]
[n_clustering=5, num_clusters=2] C

In [31]:
# Run clustered conformal prediction with automatic parameter tuning for each score function.
# for score_function in ['softmax', 'APS', 'RAPS']:
for score_function in ['softmax', 'APS']:
    
    print(f'====== score_function={score_function} ======')
    
    print('Computing conformal score...')
    if score_function == 'softmax':
        scores_all = 1 - softmax_scores
    elif score_function == 'APS':
        # NOTE(review): randomize=True presumably draws random numbers and no seed is set
        # in this cell — confirm whether results are reproducible
        scores_all = get_APS_scores_all(softmax_scores, randomize=True)
    elif score_function == 'RAPS': 
        # Not reached with the current list above; kept for when 'RAPS' is re-enabled
        
        # RAPS hyperparameters (currently using ImageNet defaults)
        lmbda = .01 
        kreg = 5
        
        scores_all = get_RAPS_scores_all(softmax_scores, lmbda, kreg, randomize=True)
    else:
        raise Exception('Undefined score function')


    print('Splitting data...')
    # Split into clustering+calibration data and validation data
    totalcal_scores_all, totalcal_labels, val_scores_all, val_labels = split_X_and_y(scores_all, labels, n_totalcal, num_classes=num_classes, seed=0)


    # n_clustering and num_clusters are passed as None together with tune_parameters=True;
    # presumably clustered_conformal then selects both itself — confirm against utils.conformal_utils
    qhats, preds, class_cov_gap, set_size_metrics = clustered_conformal(totalcal_scores_all, totalcal_labels,
                                                                        alpha,
                                                                        tune_parameters=True,
                                                                        n_clustering=None, num_clusters=None,
                                                                        val_scores=val_scores_all, val_labels=val_labels)
    print('[Clustered conformal] Class coverage gap:', class_cov_gap)
    print('[Clustered conformal] Set size metrics', set_size_metrics)

Computing conformal score...
Splitting data...
Best n_clustering: 5
Best num_clusters: 3
Cluster sizes: [88, 15, 6]
[Clustered conformal] Class coverage gap: 3.131593054758153
[Clustered conformal] Set size metrics {'mean': 38.618291323892166, '[.25, .5, .75, .9] quantiles': array([30., 41., 49., 55.])}
Computing conformal score...
Splitting data...
Best n_clustering: 9
Best num_clusters: 5
Cluster sizes: [30, 23, 23, 18, 15]
[Clustered conformal] Class coverage gap: 6.701743518999062
[Clustered conformal] Set size metrics {'mean': 66.72654921553159, '[.25, .5, .75, .9] quantiles': array([62., 71., 76., 80.])}


# Test ad-hoc heuristic

For n_totalcal=10, num_classes=108 our heuristic says 5 points for clustering, 3 clusters.



This is a hard task since we only have 1,080 total calibration points

In [43]:
# Fixed clustering hyperparameters passed to _clustered_conformal in the next cell
n_clustering = 5
num_clusters = 5

In [44]:
# Run clustered conformal prediction with the fixed n_clustering / num_clusters for each score function.
# for score_function in ['softmax', 'APS', 'RAPS']:
for score_function in ['softmax', 'APS']:
    
    print(f'====== score_function={score_function} ======')
    
    print('Computing conformal score...')
    if score_function == 'softmax':
        scores_all = 1 - softmax_scores
    elif score_function == 'APS':
        # NOTE(review): randomize=True presumably draws random numbers and no seed is set
        # in this cell — confirm whether results are reproducible
        scores_all = get_APS_scores_all(softmax_scores, randomize=True)
    elif score_function == 'RAPS': 
        # Not reached with the current list above; kept for when 'RAPS' is re-enabled
        
        # RAPS hyperparameters (currently using ImageNet defaults)
        lmbda = .01 
        kreg = 5
        
        scores_all = get_RAPS_scores_all(softmax_scores, lmbda, kreg, randomize=True)
    else:
        raise Exception('Undefined score function')


    print('Splitting data...')
    # Split into clustering+calibration data and validation data
    totalcal_scores_all, totalcal_labels, val_scores_all, val_labels = split_X_and_y(scores_all, labels, n_totalcal, num_classes=num_classes, seed=0)


    # NOTE(review): _clustered_conformal is underscore-prefixed; `from module import *` only
    # brings in such names when the module's __all__ lists them — confirm this name resolves
    # on a fresh kernel
    qhats, preds, class_cov_gap, set_size_metrics = _clustered_conformal(totalcal_scores_all, totalcal_labels,
                                                                        alpha,
                                                                        n_clustering=n_clustering, num_clusters=num_clusters,
                                                                        val_scores=val_scores_all, val_labels=val_labels)
    print('[Clustered conformal] Class coverage gap:', class_cov_gap)
    print('[Clustered conformal] Set size metrics', set_size_metrics)

Computing conformal score...
Splitting data...
Cluster sizes: [81, 19, 7, 1, 1]
[Clustered conformal] Class coverage gap: 2.1080014658611397
[Clustered conformal] Set size metrics {'mean': 37.021779144955744, '[.25, .5, .75, .9] quantiles': array([29., 40., 47., 53.])}
Computing conformal score...
Splitting data...
Cluster sizes: [29, 27, 27, 20, 6]
[Clustered conformal] Class coverage gap: 2.602582134356466
[Clustered conformal] Set size metrics {'mean': 39.76418858685417, '[.25, .5, .75, .9] quantiles': array([32., 43., 51., 56.])}


In [47]:
# Scratch
# Lists the cached .pkl files for Enron (n=10, softmax); the path is relative to the
# notebook's working directory

import glob

# Sorted lexicographically by path
file_names = sorted(glob.glob('.cache/enron_n=10/softmax/*.pkl'))
# Last expression of the cell: the list is displayed as the cell output
file_names

['.cache/enron_n=10/softmax/seed=0_allmetrics.pkl',
 '.cache/enron_n=10/softmax/seed=1_allmetrics.pkl',
 '.cache/enron_n=10/softmax/seed=2_allmetrics.pkl',
 '.cache/enron_n=10/softmax/seed=3_allmetrics.pkl',
 '.cache/enron_n=10/softmax/seed=4_allmetrics.pkl']