In [1]:
import sys; sys.path.append("../") # For relative imports

from scipy.stats import entropy

from utils.experiment_utils import *

%load_ext autoreload
%autoreload 2

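In this notebook, we compute various measures of class imbalance for each dataset. The metrics are computed on the data not used for model training. The metric we use in the paper is `Normalized fraction of mass in rarest 0.05 of classes`, since we find that this metric best captures the type of imbalance that is challenging for our problem setting. It is the fraction of examples that belong to the rarest 5% of classes, divided by 0.05, so a perfectly class-balanced dataset scores about 1 and smaller values indicate a more depleted rare tail.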

In [2]:
# Datasets whose class-imbalance statistics are reported below.
dataset_list = ['imagenet', 'cifar-100', 'places365', 'inaturalist']

# Fraction of classes (rarest first) that makes up the "rare tail".
# Defined once, outside the loop, and used for the slice, the printed label
# AND the normalization, so the three can never disagree if it is changed.
frac = .05

for dataset in dataset_list:
    print(f'\n==== Dataset: {dataset} ====')
    softmax_scores, labels = load_dataset(dataset)

    # Per-class example counts, sorted ascending (rarest class first).
    cts = np.sort(np.array(list(Counter(labels).values())))
    num_classes = len(cts)

    # cts is sorted, so the extremes sit at the two ends.
    min_count, max_count = cts[0], cts[-1]
    print('Min count:', min_count)
    print('Max count:', max_count)
    print(f'Min/max ratio: {min_count/max_count:.3f}')

    # Number of classes in the rare tail; int() truncates, so the tail can be
    # slightly smaller than frac * num_classes (e.g. 18 of 365 classes).
    num_rare_classes = int(frac * num_classes)
    # Total number of examples that belong to the rare-tail classes.
    rare_mass = np.sum(cts[:num_rare_classes])

    # Both lines report the same quantity (share of examples in the rare tail
    # relative to frac); they differ only in the order of the arithmetic.
    print(f'Normalized fraction of mass in rarest {frac} of classes: {(rare_mass/len(labels)) / frac}')
    print(f'# of examples in rarest {frac} of classes divided by expected number if uniform: {rare_mass/(len(labels) * frac)}') # Another view
    print('Normalized Shannon entropy:', entropy(cts) / np.log(len(cts))) # See https://stats.stackexchange.com/questions/239973/a-general-measure-of-data-set-imbalance
    print('[.25, .5, .75, .9] class count quantiles:', np.quantile(cts, [.25, .5, .75, .9]))


==== Dataset: imagenet ====
softmax_scores shape: (1153051, 1000)
Min count: 663
Max count: 1201
Min/max ratio: 0.552
Normalized fraction of mass in rarest 0.05 of classes: 0.7905461250196218
# of examples in rarest 0.05 of classes divided by expected number if uniform: 0.7905461250196218
Normalized Shannon entropy: 0.9997548276966274
[.25, .5, .75, .9] class count quantiles: [1159. 1168. 1176. 1184.]

==== Dataset: cifar-100 ====
softmax_scores shape: (30000, 100)
Min count: 257
Max count: 330
Min/max ratio: 0.779
Normalized fraction of mass in rarest 0.05 of classes: 0.9039999999999999
# of examples in rarest 0.05 of classes divided by expected number if uniform: 0.904
Normalized Shannon entropy: 0.9997848210317252
[.25, .5, .75, .9] class count quantiles: [290.   301.5  310.25 316.1 ]

==== Dataset: places365 ====
softmax_scores shape: (183996, 365)
Min count: 300
Max count: 576
Min/max ratio: 0.521
Normalized fraction of mass in rarest 0.05 of classes: 0.7687123633122458
# of exam