In [1]:
from scipy.stats import entropy

from utils.experiment_utils import *

%load_ext autoreload
%autoreload 2

In [59]:
# Class-imbalance summary for each dataset: extreme class counts, the share of
# examples held by the rarest classes, normalized entropy, and count quantiles.
dataset_list = ['imagenet', 'cifar-100', 'places365', 'inaturalist']

# Fraction of classes, taken from the rare end, that the "rarest classes"
# statistics look at. Every normalization below reads this single value, so
# changing it here keeps the printed labels and the arithmetic in agreement.
frac = .05

for dataset in dataset_list:
    print(f'\n==== Dataset: {dataset} ====')
    softmax_scores, labels = load_dataset(dataset)

    # Per-class example counts in ascending order, so the rarest classes come first.
    cts = np.sort(np.array(list(Counter(labels).values())))
    num_classes = len(cts)

    # The array is sorted, so its two ends are the extremes.
    min_ct, max_ct = cts[0], cts[-1]
    print('Min count:', min_ct)
    print('Max count:', max_ct)
    print(f'Min/max ratio: {min_ct / max_ct:.3f}')

    # int() truncates, so slightly fewer than frac * num_classes classes are
    # summed whenever that product is not a whole number.
    rare_total = np.sum(cts[:int(frac * num_classes)])
    print(f'Normalized fraction of mass in rarest {frac} of classes: {(rare_total / len(labels)) / frac}')
    print(f'# of examples in rarest {frac} of classes divided by expected number if uniform: {rare_total / (len(labels) * frac)}') # Another view

    # Entropy of the class counts divided by log(number of classes), its value
    # for a uniform distribution.
    # See https://stats.stackexchange.com/questions/239973/a-general-measure-of-data-set-imbalance
    print('Normalized Shannon entropy:', entropy(cts) / np.log(len(cts)))
    print('[.25, .5, .75, .9] class count quantiles:', np.quantile(cts, [.25, .5, .75, .9]))


==== Dataset: imagenet ====
softmax_scores shape: (1153051, 1000)
Min count: 663
Max count: 1201
Min/max ratio: 0.552
Normalized fraction of mass in rarest 0.05 of classes: 0.7905461250196218
# of examples in rarest 0.05 of classes divided by expected number if uniform: 0.7905461250196218
Normalized Shannon entropy: 0.9997548276966274
[.25, .5, .75, .9] class count quantiles: [1159. 1168. 1176. 1184.]

==== Dataset: cifar-100 ====
softmax_scores shape: (30000, 100)
Min count: 257
Max count: 330
Min/max ratio: 0.779
Normalized fraction of mass in rarest 0.05 of classes: 0.9039999999999999
# of examples in rarest 0.05 of classes divided by expected number if uniform: 0.904
Normalized Shannon entropy: 0.9997848210317252
[.25, .5, .75, .9] class count quantiles: [290.   301.5  310.25 316.1 ]

==== Dataset: places365 ====
softmax_scores shape: (183996, 365)
Min count: 300
Max count: 576
Min/max ratio: 0.521
Normalized fraction of mass in rarest 0.05 of classes: 0.7687123633122458
# of exam

In [52]:
# NOTE(review): `n` is not assigned in any cell visible in this notebook, so
# this cell relies on leftover kernel state (or on a name exported by the star
# import from utils.experiment_utils — confirm). Define it explicitly or drop
# the cell so the notebook survives Restart & Run All.
n

1153051

In [6]:
# Imbalance summary restricted to the iNaturalist dataset.
dataset_list = ['inaturalist']

for dataset in dataset_list:
    print(f'\n==== Dataset: {dataset} ====')
    softmax_scores, labels = load_dataset(dataset)

    # One entry per class: the number of examples carrying that label.
    cts = np.array(list(Counter(labels).values()))

    print('Min count:', cts.min())
    print('Max count:', cts.max())
    print(f'Min/max ratio: {cts.min() / cts.max():.3f}')

    # Entropy of the class counts relative to log(number of classes).
    # See https://stats.stackexchange.com/questions/239973/a-general-measure-of-data-set-imbalance
    print('Normalized Shannon entropy:', entropy(cts) / np.log(cts.size))

    print(
        '[.25, .5, .75, .9] class count quantiles:',
        np.quantile(cts, [.25, .5, .75, .9]),
    )


==== Dataset: inaturalist ====
softmax_scores shape: (1393421, 1103)
> /home/tding/code/class-conditional-conformal/utils/experiment_utils.py(88)load_dataset()
     86 
     87     pdb.set_trace()
---> 88     if remove_rare_cls:
     89         softmax_scores, labels = remove_rare_classes(softmax_scores, labels, thresh=150)
     90 
ipdb> import Counter
*** ModuleNotFoundError: No module named 'Counter'
ipdb> Counter(labels)
Counter({907: 68838, 175: 32967, 969: 32890, 160: 24086, 163: 23479, 172: 23356, 1048: 21242, 216: 17942, 888: 17333, 872: 1508

ipdb> cts = Counter(labels).values()
ipdb> np.sum(cts > 250)
*** TypeError: '>' not supported between instances of 'dict_values' and 'int'
ipdb> cts = list(cts)
ipdb> np.sum(cts > 250)
*** TypeError: '>' not supported between instances of 'list' and 'int'
ipdb> cts = np.array(cts)
ipdb> np.sum(cts > 250)
630
ipdb> np.sum(cts > 300)
554
ipdb> np.sum(cts >= 250)
633
ipdb> q


In [10]:
# Load a label array directly from the archived cache file (iNaturalist
# validation labels, judging by the file name — confirm).
# NOTE(review): the path is relative to the notebook's working directory, and
# this rebinds the global `labels` used by the loops above.
labels_path = '../class-conditional-conformal-datasets/notebooks/.cache/archived/best-iNaturalist-model-vallabels_frac=0.5.npy'
labels = np.load(labels_path)

In [11]:
# Largest label value present in `labels`.
labels.max()

6413

In [36]:
# Per-class example counts for `labels`, one entry per distinct label.
cts = np.array([class_size for class_size in Counter(labels).values()])
# Number of classes that have at least 20 examples.
(cts >= 20).sum()

6414

In [33]:
# Display the per-class count array.
# NOTE(review): this cell's execution count (33) is lower than that of the
# cell above it (36), so the recorded output may predate that assignment.
cts

array([149, 166, 145, ..., 141, 143, 157])

In [39]:
# Load the label file of the same name from the cache directory itself (the
# earlier cell read the copy under `archived/`). The result goes into
# `labels2`, so `labels` is left untouched; `labels_path` is rebound.
labels_path = '../class-conditional-conformal-datasets/notebooks/.cache/best-iNaturalist-model-vallabels_frac=0.5.npy'
labels2 = np.load(labels_path)

In [40]:
# Largest label value present in `labels2`.
labels2.max()

1102

In [41]:
# Per-class example counts for `labels2`, one entry per distinct label.
cts2 = np.array([class_size for class_size in Counter(labels2).values()])

In [42]:
# Display the per-class count array computed from `labels2`.
cts2

array([ 4004, 14503, 68838, ...,    82,    84,   107])

In [44]:
# Number of classes in `cts2` that have at least 250 examples.
(cts2 >= 250).sum()

633