In [18]:
import glob # For getting file names
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
# import seaborn as sns
# import torch

from collections import Counter
# from gap_statistic import OptimalK
from scipy import stats, cluster
from sklearn.cluster import KMeans
# from yellowbrick.cluster import KElbowVisualizer

from utils.clustering_utils import *
from utils.conformal_utils import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [31]:
def run_experiment(softmax_scores, labels,
                  save_folder,
                  alpha=.1,
                  n_totalcal_list=[10, 30],
                  score_function_list = ['softmax', 'APS'],
                  seeds = [0,1,2,3,4]):
    
    num_classes = softmax_scores.shape[1]
    
    for n_totalcal in n_totalcal_list:
        for score_function in score_function_list:
            curr_folder = os.path.join(save_folder, f'n_totalcal={n_totalcal}/score={score_function}')
            os.makedirs(curr_folder, exist_ok=True)
            
            print(f'====== score_function={score_function} ======')
    
            print('Computing conformal score...')
            if score_function == 'softmax':
                scores_all = 1 - softmax_scores
            elif score_function == 'APS':
                scores_all = get_APS_scores_all(softmax_scores, randomize=True)
            elif score_function == 'RAPS': 

                # RAPS hyperparameters (currently using ImageNet defaults)
                lmbda = .01 
                kreg = 5

                scores_all = get_RAPS_scores_all(softmax_scores, lmbda, kreg, randomize=True)
            else:
                raise Exception('Undefined score function')

            for seed in seeds:
                print(f'\nseed={seed}')
                save_to = os.path.join(curr_folder, f'seed={seed}_allresults.pkl')
                
                # Split data
                totalcal_scores_all, totalcal_labels, val_scores_all, val_labels = split_X_and_y(scores_all, labels, n_totalcal, num_classes=num_classes, seed=seed)
    
                # 1) Compute baselines
                # Standard conformal
                standard_qhat = compute_qhat(totalcal_scores_all, totalcal_labels, alpha=alpha)
                standard_preds = create_prediction_sets(val_scores_all, standard_qhat)
                
                coverage_metrics, set_size_metrics = compute_all_metrics(val_labels, standard_preds, alpha)
                standard_results = (standard_qhat, standard_preds, coverage_metrics, set_size_metrics)
                
                # Class-wise conformal
                classwise_qhats = compute_class_specific_qhats(totalcal_scores_all, totalcal_labels, alpha=alpha, default_qhat=np.inf)
                classwise_preds = create_cb_prediction_sets(val_scores_all, classwise_qhats)
                
                coverage_metrics, set_size_metrics = compute_all_metrics(val_labels, classwise_preds, alpha)
                classwise_results = (classwise_qhats, classwise_preds, coverage_metrics, set_size_metrics)

                # 2) Always Cluster
                # results contain qhats, preds, coverage_metrics, set_size_metrics
                always_cluster_results = automatic_clustered_conformal(totalcal_scores_all, totalcal_labels,
                                                                                alpha,
                                                                                val_scores_all, val_labels, 
                                                                                cluster='smart')

                # 3) Smart Cluster
                smart_cluster_results = automatic_clustered_conformal(totalcal_scores_all, totalcal_labels,
                                                                                alpha,
                                                                                val_scores_all, val_labels, 
                                                                                cluster='smart')

                # Save results 
                all_results = {'standard': standard_results,
                               'classwise': classwise_results,
                               'always_cluster': always_cluster_results,
                               'smart_cluster': smart_cluster_results}
                with open(save_to,'wb') as f:
                    pickle.dump(all_results, f)
                    print(f'Saved results to {save_to}')

# Run

In [32]:
# # Enron - BERT (n_train=500)
# softmax_path = "../class-conditional-conformal-datasets/notebooks/.cache/email_softmax_bert_ntrain=500.npy"
# labels_path = "../class-conditional-conformal-datasets/notebooks/.cache/email_labels_bert_ntrain=500.npy"
save_folder = '.cache/paper/enron'

# ImageNet
softmax_path = '/home/tding/data/finetuned_imagenet/imagenet_train_subset_softmax.npy'
labels_path = '/home/tding/data/finetuned_imagenet/imagenet_train_subset_labels.npy'
save_folder = '.cache/paper/imagenet'

 
# SETTINGS
alpha = .1
n_totalcal_list = [10, 30]
score_function_list = ['softmax', 'APS']

softmax_scores = np.load(softmax_path)
labels = np.load(labels_path)

run_experiment(softmax_scores, labels,
                  save_folder,
                  alpha=alpha,
                  n_totalcal_list=n_totalcal_list,
                  score_function_list = score_function_list,
                  seeds=[0,1,2,3,4])

Computing conformal score...

seed=0
Saved results to .cache/paper/enron/n_totalcal=10/score=softmax/seed=0_allresults.pkl

seed=1
Observed metric: 2.2787582983573276
Metric under null: [1.71548601 1.81439317 1.78283058 1.75036928 1.73521396 1.74453273
 1.80299342 1.8050169  1.77976559 1.80535449 1.74695889 1.80753427
 1.80672232 1.83681467 1.71653533 1.79047636 1.70705528 1.75509069
 1.78128633 1.75919851 1.76320062 1.75631003 1.76284693 1.73501168
 1.74255262 1.74009411 1.78837851 1.72816339 1.79684196 1.78058726
 1.80758665 1.80019431 1.75939081 1.82256104 1.75979739 1.75964126
 1.78228093 1.75038596 1.74825921 1.79017163 1.7786065  1.78098357
 1.81940285 1.73595766 1.77699688 1.74835783 1.73282943 1.76268643
 1.77410272 1.70873085 1.78547989 1.73849691 1.75208108 1.74211685
 1.82404202 1.78000318 1.80512929 1.75395764 1.77130748 1.81182376
 1.7162326  1.82805222 1.85072266 1.8174911  1.78841712 1.70751904
 1.81715033 1.83117004 1.85860861 1.76212543 1.84507155 1.80196003
 1.7097005

Cluster sizes: [323, 266, 226, 185]
Saved results to .cache/paper/enron/n_totalcal=10/score=softmax/seed=3_allresults.pkl

seed=4
Observed metric: 2.2747718085845308
Metric under null: [1.75931075 1.79727675 1.79269644 1.74123777 1.82774249 1.79980238
 1.76986474 1.86163385 1.77223456 1.80054678 1.78012839 1.78626743
 1.82358938 1.8405719  1.7736718  1.81937829 1.81666962 1.76560365
 1.80131967 1.8380861  1.81843958 1.7645899  1.7740636  1.7761078
 1.84037858 1.7383383  1.82968998 1.86739444 1.84001171 1.86167561
 1.7918053  1.80159925 1.85075525 1.81294678 1.81410784 1.84033741
 1.74737162 1.8551724  1.7727929  1.69618251 1.74953917 1.81910587
 1.78338691 1.79409153 1.84441034 1.8513022  1.84268632 1.86251704
 1.79582692 1.74449507 1.82481879 1.81693196 1.79030555 1.81826063
 1.81000987 1.79917075 1.81306372 1.77233804 1.84528826 1.81788976
 1.73616078 1.81331624 1.74294264 1.75514124 1.77324135 1.78148106
 1.78717527 1.80636856 1.83403333 1.78959585 1.79372154 1.78682917
 1.78736389 

Saved results to .cache/paper/enron/n_totalcal=10/score=APS/seed=1_allresults.pkl

seed=2
Observed metric: 1.0475494443397204
Metric under null: [0.95011777 0.92781227 0.91013402 0.92951369 0.93706931 0.89242783
 0.91961699 0.96061717 1.01049016 0.92802458 0.90790753 0.95018733
 0.97864754 0.94607669 0.95488765 0.99105408 1.0220664  0.89924299
 0.97764784 1.02133757 0.93402851 0.95385022 0.97647271 0.9531324
 0.98500438 0.94361867 0.90219701 0.94405868 0.95912437 1.00556697
 0.95011314 0.98323619 0.99177778 0.9108977  0.94459388 1.0363726
 0.9690927  0.96206875 0.94796788 0.92010759 0.97781148 0.93601271
 0.92883382 0.94521719 0.94858226 0.99915999 0.9962079  0.92843374
 0.95161259 0.88601719 0.95771869 0.95641685 0.984441   0.92390783
 0.93056282 0.86474917 0.89432304 1.00377428 0.95646433 0.90940665
 0.98086143 0.90254038 0.87978519 1.06662283 0.94486019 0.94495828
 0.95506121 0.96524769 0.92935201 0.93451201 0.93334861 0.91025738
 0.95851298 0.96473472 0.95220785 0.95979503 0.988381

Saved results to .cache/paper/enron/n_totalcal=10/score=APS/seed=4_allresults.pkl
Computing conformal score...

seed=0
Observed metric: 1.6242735442442768
Metric under null: [0.99660944 0.96814101 0.98320062 0.95566999 0.971788   1.06039651
 1.01939803 1.02055912 1.04339107 1.08629773 1.03448813 1.07698743
 0.97083932 1.08108637 0.99825588 1.13032736 1.10948055 0.99966935
 0.95678551 1.04286828 1.04428002 1.02092602 1.0296535  0.95906284
 0.99685708 1.01013129 0.96897257 0.96304965 1.04025067 0.94992232
 0.9483076  1.00478473 0.98792485 0.97147729 1.0835635  0.98534561
 0.9612388  1.022255   1.03116486 0.94290379 0.99910859 0.95318437
 1.01771861 0.94349384 1.01440871 1.04772087 1.06469245 1.03110061
 0.95986818 0.99265218 1.09084883 1.03085055 0.98031649 0.99256266
 0.97858008 1.00353638 1.04355841 1.03397856 0.98714754 0.99333672
 1.02077288 0.95858816 0.97767641 0.89211912 1.01134063 1.02391512
 1.06486796 1.14283825 0.9982085  1.02100748 1.06049615 1.03823126
 1.0154119  0.92093787

Cluster sizes: [106, 105, 95, 89, 80, 78, 73, 69, 64, 62, 62, 59, 58]
Saved results to .cache/paper/enron/n_totalcal=30/score=softmax/seed=2_allresults.pkl

seed=3
Observed metric: 1.5975904805537986
Metric under null: [0.9843091  0.97898047 1.0115245  1.01764471 1.0507332  1.05919117
 1.01961833 1.01531829 0.9457263  1.00424791 1.02629685 0.94959843
 1.05362717 0.97452994 0.99677226 1.05457214 0.97403596 1.08230396
 0.98029065 1.02994441 1.0558452  1.02137037 0.96521884 0.96743178
 0.9496808  0.96059731 0.9143142  0.94827618 0.9993758  0.92742423
 0.9708458  1.05521835 1.06755462 1.04525867 0.98392128 1.00512701
 1.03530211 1.03703091 0.97746843 0.9863736  1.02223849 0.94100449
 0.95083215 1.00900452 0.98595245 0.90425977 0.99265979 1.04116158
 1.03161182 1.06885165 0.98505549 1.09476566 1.00220573 0.95774699
 1.07751088 1.07126967 0.97251486 1.11617177 0.96054573 0.99130028
 1.09234317 0.97130791 1.06929702 0.95821883 0.98551347 0.95269851
 1.0318425  0.99899307 1.01483724 0.94082111

Saved results to .cache/paper/enron/n_totalcal=30/score=APS/seed=0_allresults.pkl

seed=1
Observed metric: 0.6310557751819967
Metric under null: [0.49787932 0.54667096 0.51985858 0.5094465  0.48769079 0.54230943
 0.53138179 0.54950182 0.54338785 0.54181368 0.50316769 0.4963636
 0.53629325 0.53787823 0.58440005 0.49107577 0.52016769 0.50715192
 0.54077299 0.55007237 0.55691631 0.5001813  0.5302423  0.55464158
 0.52035252 0.51490384 0.52646344 0.51046759 0.56919576 0.5091751
 0.51408924 0.48778758 0.5245787  0.53899687 0.49983047 0.54936522
 0.51836465 0.51187206 0.50759963 0.48589515 0.53479961 0.55114932
 0.49975592 0.57205563 0.53647241 0.54949552 0.51806719 0.50980055
 0.52584396 0.52936669 0.5476109  0.5359042  0.546516   0.51298284
 0.51028474 0.54980116 0.52989408 0.5776543  0.54591468 0.49606785
 0.52464961 0.51034801 0.53857598 0.53686967 0.53335924 0.53374432
 0.52300289 0.47653032 0.51782189 0.51016347 0.49974148 0.54943852
 0.52383947 0.51252666 0.51629696 0.50208268 0.537805

Cluster sizes: [116, 110, 105, 93, 91, 87, 84, 76, 62, 59, 56, 42, 19]
Saved results to .cache/paper/enron/n_totalcal=30/score=APS/seed=3_allresults.pkl

seed=4
Observed metric: 0.5960192355190465
Metric under null: [0.48384502 0.5100109  0.55337607 0.51498589 0.5216585  0.52861881
 0.51169776 0.5546665  0.51091746 0.51288308 0.5735918  0.56593477
 0.50337408 0.51961834 0.51287011 0.53340269 0.49468073 0.49040549
 0.4995283  0.54219594 0.48911121 0.52106003 0.54336615 0.52365882
 0.5504858  0.50146675 0.54339715 0.49232261 0.50445669 0.47319215
 0.52161889 0.51928597 0.50521046 0.4774047  0.49237532 0.48917609
 0.50345498 0.5340785  0.49342769 0.546858   0.50953237 0.50737095
 0.51002115 0.53517822 0.499127   0.50535284 0.56076268 0.55772397
 0.49646348 0.494063   0.55021508 0.50671716 0.48131137 0.52560465
 0.5207383  0.52642828 0.50296008 0.52803274 0.50901336 0.5148541
 0.54193406 0.50381723 0.49538805 0.52135564 0.48194022 0.52923977
 0.49990649 0.52418121 0.54639963 0.54028249 0.5

In [33]:
# Produce table of results

def initialize_metrics_dict(methods):
    
    
    metrics = {}
    for method in methods:
        metrics[method] = {'class_cov_gap': [],
                           'avg_set_size': []} # Could also retrieve other metrics
        
    return metrics

def average_results_across_seeds(folder):
    
    methods = ['standard', 'classwise', 'smart_cluster', 'always_cluster']
    
    file_names = sorted(glob.glob(os.path.join(folder, '*.pkl')))
    num_seeds = len(file_names)
    print('Number of seeds found:', num_seeds)
    
    metrics = initialize_metrics_dict(methods)
    
    for pth in file_names:
        with open(pth, 'rb') as f:
            results = pickle.load(f)
            
        for method in methods:
            metrics[method]['class_cov_gap'].append(results[method][2]['mean_class_cov_gap'])
            metrics[method]['avg_set_size'].append(results[method][3]['mean'])
            
    cov_means = []
    cov_ses = []
    set_size_means = []
    set_size_ses = []
    
    
    for method in methods:
#         print(metrics[method]['class_cov_gap'])
        cov_means.append(np.mean(metrics[method]['class_cov_gap']))
        cov_ses.append(np.std(metrics[method]['class_cov_gap'], ddof=1))
        
        set_size_means.append(np.mean(metrics[method]['avg_set_size']))
        set_size_ses.append(np.std(metrics[method]['avg_set_size'], ddof=1))
        
    df = pd.DataFrame({'method': methods,
                      'class_cov_gap_mean': cov_means,
                      'class_cov_gap_se': cov_ses,
                      'avg_set_size_mean': set_size_means,
                      'avg_set_size_se': set_size_ses})
    
    display(df)
    return df
        

In [34]:
for n_totalcal in n_totalcal_list:
    for score in score_function_list:
        print(f'===== n_totalcal={n_totalcal}, score={score} =====')
        folder = f'.cache/paper/enron/n_totalcal={n_totalcal}/score={score}/'
        average_results_across_seeds(folder)

===== n_totalcal=10, score=softmax =====
Number of seeds found: 5


Unnamed: 0,method,class_cov_gap_mean,class_cov_gap_se,avg_set_size_mean,avg_set_size_se
0,standard,0.052153,0.001015,1.946379,0.067398
1,classwise,0.065195,0.001991,26.176198,4.200345
2,smart_cluster,0.045512,0.002663,3.188571,1.030001
3,always_cluster,0.045512,0.002663,3.188571,1.030001


===== n_totalcal=10, score=APS =====
Number of seeds found: 5


Unnamed: 0,method,class_cov_gap_mean,class_cov_gap_se,avg_set_size_mean,avg_set_size_se
0,standard,0.025781,0.000292,25.872699,0.572171
1,classwise,0.065905,0.00127,78.931868,5.413114
2,smart_cluster,0.025781,0.000292,25.872699,0.572171
3,always_cluster,0.025781,0.000292,25.872699,0.572171


===== n_totalcal=30, score=softmax =====
Number of seeds found: 5


Unnamed: 0,method,class_cov_gap_mean,class_cov_gap_se,avg_set_size_mean,avg_set_size_se
0,standard,0.051958,0.000548,1.961016,0.039351
1,classwise,0.041756,0.001027,5.93584,0.344422
2,smart_cluster,0.034037,0.001734,2.689713,0.15381
3,always_cluster,0.034037,0.001734,2.689713,0.15381


===== n_totalcal=30, score=APS =====
Number of seeds found: 5


Unnamed: 0,method,class_cov_gap_mean,class_cov_gap_se,avg_set_size_mean,avg_set_size_se
0,standard,0.026115,0.000121,26.024229,0.332259
1,classwise,0.042572,0.000788,42.612352,1.374026
2,smart_cluster,0.026696,0.001301,27.348073,1.160882
3,always_cluster,0.026696,0.001301,27.348073,1.160882
