In [18]:
import glob # For getting file names
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
# import seaborn as sns
# import torch

from collections import Counter
# from gap_statistic import OptimalK
from scipy import stats, cluster
from sklearn.cluster import KMeans
# from yellowbrick.cluster import KElbowVisualizer

from utils.clustering_utils import *
from utils.conformal_utils import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
def run_experiment(softmax_scores, labels,
                  save_folder,
                  alpha=.1,
                  n_totalcal_list=[10, 30],
                  score_function_list = ['softmax', 'APS'],
                  seeds = [0,1,2,3,4]):
    
    num_classes = softmax_scores.shape[1]
    
    for n_totalcal in n_totalcal_list:
        for score_function in score_function_list:
            curr_folder = os.path.join(save_folder, f'n_totalcal={n_totalcal}/score={score_function}')
            os.makedirs(curr_folder, exist_ok=True)
            
            print(f'====== score_function={score_function} ======')
    
            print('Computing conformal score...')
            if score_function == 'softmax':
                scores_all = 1 - softmax_scores
            elif score_function == 'APS':
                scores_all = get_APS_scores_all(softmax_scores, randomize=True)
            elif score_function == 'RAPS': 

                # RAPS hyperparameters (currently using ImageNet defaults)
                lmbda = .01 
                kreg = 5

                scores_all = get_RAPS_scores_all(softmax_scores, lmbda, kreg, randomize=True)
            else:
                raise Exception('Undefined score function')

            for seed in seeds:
                print(f'\nseed={seed}')
                save_to = os.path.join(curr_folder, f'seed={seed}_allresults.pkl')
                
                # Split data
                totalcal_scores_all, totalcal_labels, val_scores_all, val_labels = split_X_and_y(scores_all, labels, n_totalcal, num_classes=num_classes, seed=seed)
    
                # 1) Compute baselines
                # Standard conformal
                standard_qhat = compute_qhat(totalcal_scores_all, totalcal_labels, alpha=alpha)
                standard_preds = create_prediction_sets(val_scores_all, standard_qhat)
                
                coverage_metrics, set_size_metrics = compute_all_metrics(val_labels, standard_preds, alpha)
                standard_results = (standard_qhat, standard_preds, coverage_metrics, set_size_metrics)
                
                # Class-wise conformal
                classwise_qhats = compute_class_specific_qhats(totalcal_scores_all, totalcal_labels, alpha=alpha, default_qhat=np.inf)
                classwise_preds = create_cb_prediction_sets(val_scores_all, classwise_qhats)
                
                coverage_metrics, set_size_metrics = compute_all_metrics(val_labels, classwise_preds, alpha)
                classwise_results = (classwise_qhats, classwise_preds, coverage_metrics, set_size_metrics)

                # 2) Always Cluster
                # results contain qhats, preds, coverage_metrics, set_size_metrics
                always_cluster_results = automatic_clustered_conformal(totalcal_scores_all, totalcal_labels,
                                                                                alpha,
                                                                                val_scores_all, val_labels, 
                                                                                cluster='smart')

                # 3) Smart Cluster
                smart_cluster_results = automatic_clustered_conformal(totalcal_scores_all, totalcal_labels,
                                                                                alpha,
                                                                                val_scores_all, val_labels, 
                                                                                cluster='smart')

                # Save results 
                all_results = {'standard': standard_results,
                               'classwise': classwise_results,
                               'always_cluster': always_cluster_results,
                               'smart_cluster': smart_cluster_results}
                with open(save_to,'wb') as f:
                    pickle.dump(all_results, f)
                    print(f'Saved results to {save_to}')

# Run

In [27]:
# Enron - BERT (n_train=500)
softmax_path = "../class-conditional-conformal-datasets/notebooks/.cache/email_softmax_bert_ntrain=500.npy"
labels_path = "../class-conditional-conformal-datasets/notebooks/.cache/email_labels_bert_ntrain=500.npy"

# # ImageNet
# softmax_path = '/home/tding/data/finetuned_imagenet/imagenet_train_subset_softmax.npy'
# labels_path = '/home/tding/data/finetuned_imagenet/imagenet_train_subset_labels.npy'

 
# SETTINGS
alpha = .1
n_totalcal_list = [10, 30]
score_function_list = ['softmax', 'APS']

save_folder = '.cache/paper/enron'
softmax_scores = np.load(softmax_path)
labels = np.load(labels_path)

run_experiment(softmax_scores, labels,
                  save_folder,
                  alpha=alpha,
                  n_totalcal_list=n_totalcal_list,
                  score_function_list = score_function_list,
                  seeds=[0,1,2,3,4])

In [29]:
# Produce table of results

def initialize_metrics_dict(methods):
    
    
    metrics = {}
    for method in methods:
        metrics[method] = {'class_cov_gap': [],
                           'avg_set_size': []} # Could also retrieve other metrics
        
    return metrics

def average_results_across_seeds(folder):
    
    methods = ['standard', 'classwise', 'smart_cluster', 'always_cluster']
    
    file_names = sorted(glob.glob(os.path.join(folder, '*.pkl')))
    num_seeds = len(file_names)
    print('Number of seeds found:', num_seeds)
    
    metrics = initialize_metrics_dict(methods)
    
    for pth in file_names:
        with open(pth, 'rb') as f:
            results = pickle.load(f)
            
        for method in methods:
            metrics[method]['class_cov_gap'].append(results[method][2]['mean_class_cov_gap'])
            metrics[method]['avg_set_size'].append(results[method][3]['mean'])
            
    cov_means = []
    cov_ses = []
    set_size_means = []
    set_size_ses = []
    
    
    for method in methods:
#         print(metrics[method]['class_cov_gap'])
        cov_means.append(np.mean(metrics[method]['class_cov_gap']))
        cov_ses.append(np.std(metrics[method]['class_cov_gap'], ddof=1))
        
        set_size_means.append(np.mean(metrics[method]['avg_set_size']))
        set_size_ses.append(np.std(metrics[method]['avg_set_size'], ddof=1))
        
    df = pd.DataFrame({'method': methods,
                      'class_cov_gap_mean': cov_means,
                      'class_cov_gap_se': cov_ses,
                      'avg_set_size_mean': set_size_means,
                      'avg_set_size_se': set_size_ses})
    
    display(df)
    return df
        

In [30]:
for n_totalcal in n_totalcal_list:
    for score in score_function_list:
        print(f'===== n_totalcal={n_totalcal}, score={score} =====')
        folder = f'.cache/paper/enron/n_totalcal={n_totalcal}/score={score}/'
        average_results_across_seeds(folder)

===== n_totalcal=10, score=softmax =====
Number of seeds found: 5


Unnamed: 0,method,class_cov_gap_mean,class_cov_gap_se,avg_set_size_mean,avg_set_size_se
0,standard,0.019061,0.001269,41.523726,1.287926
1,classwise,0.069988,0.004737,54.080526,3.038402
2,smart_cluster,0.019885,0.001865,42.360548,1.513678
3,always_cluster,0.019885,0.001865,42.360548,1.513678


===== n_totalcal=10, score=APS =====
Number of seeds found: 5


Unnamed: 0,method,class_cov_gap_mean,class_cov_gap_se,avg_set_size_mean,avg_set_size_se
0,standard,0.019441,0.001165,43.223508,1.152523
1,classwise,0.069227,0.005211,56.349869,2.892184
2,smart_cluster,0.019717,0.00082,43.216523,1.165387
3,always_cluster,0.019717,0.00082,43.216523,1.165387


===== n_totalcal=30, score=softmax =====
Number of seeds found: 5


Unnamed: 0,method,class_cov_gap_mean,class_cov_gap_se,avg_set_size_mean,avg_set_size_se
0,standard,0.018525,0.000277,40.526348,0.891728
1,classwise,0.039618,0.003893,45.106077,1.438274
2,smart_cluster,0.023192,0.001531,41.721836,2.080768
3,always_cluster,0.023192,0.001531,41.721836,2.080768


===== n_totalcal=30, score=APS =====
Number of seeds found: 5


Unnamed: 0,method,class_cov_gap_mean,class_cov_gap_se,avg_set_size_mean,avg_set_size_se
0,standard,0.018333,0.000275,41.554052,0.960803
1,classwise,0.039144,0.002666,47.019755,1.509452
2,smart_cluster,0.018333,0.000275,41.554052,0.960803
3,always_cluster,0.018333,0.000275,41.554052,0.960803
