This notebook describes how the class-imbalanced digit datasets (MNIST and SVHN) are created.

In [1]:
import matplotlib
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import scipy.stats
import seaborn as sns
from torchvision import datasets

# MNIST

## Download training and testing dataset

In [2]:
# Load the MNIST training split, keeping the raw files under /tmp.
mnist_train = datasets.MNIST(
    root='/tmp',
    train=True,
    transform=None,
    target_transform=None,
    download=True,
)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to /tmp/MNIST/raw/train-images-idx3-ubyte.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting /tmp/MNIST/raw/train-images-idx3-ubyte.gz to /tmp/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to /tmp/MNIST/raw/train-labels-idx1-ubyte.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting /tmp/MNIST/raw/train-labels-idx1-ubyte.gz to /tmp/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to /tmp/MNIST/raw/t10k-images-idx3-ubyte.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting /tmp/MNIST/raw/t10k-images-idx3-ubyte.gz to /tmp/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to /tmp/MNIST/raw/t10k-labels-idx1-ubyte.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting /tmp/MNIST/raw/t10k-labels-idx1-ubyte.gz to /tmp/MNIST/raw
Processing...
Done!


In [3]:
# Load the MNIST test split (train=False), keeping the raw files under /tmp.
mnist_test = datasets.MNIST(
    root='/tmp',
    train=False,
    transform=None,
    target_transform=None,
    download=True,
)

## Count the number of examples for each class

In [4]:
# Show (distinct labels, examples per label) for the MNIST training split.
np.unique(
    mnist_train.targets,
    return_counts=True,
)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([5923, 6742, 5958, 6131, 5842, 5421, 5918, 6265, 5851, 5949]))

In [5]:
# Show (distinct labels, examples per label) for the MNIST test split.
np.unique(
    mnist_test.targets,
    return_counts=True,
)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([ 980, 1135, 1032, 1010,  982,  892,  958, 1028,  974, 1009]))

## Pick a subset following a linear distribution (mild imbalance)

In [8]:
# Linear imbalance profile: ranks 1..10 divided by the largest rank,
# giving the ratios 0.1, 0.2, ..., 1.0.
class_ranks = np.arange(1, 11)
linear_ratio = class_ranks / class_ranks.max()

### the test set

In [9]:
# Per-class counts of the MNIST test split (element [1] of np.unique's result).
stats = np.unique(mnist_test.targets, return_counts=True)[1]
# Scale the linear ratios by the rarest class's count and truncate to whole
# samples, so no class is asked for more examples than it has.
subset = (stats.min() * linear_ratio).astype(int)
subset

array([ 89, 178, 267, 356, 446, 535, 624, 713, 802, 892])

### the training set

In [10]:
# Per-class counts of the MNIST training split (element [1] of np.unique's result).
stats = np.unique(mnist_train.targets, return_counts=True)[1]
# Scale the linear ratios by the rarest class's count and truncate to whole
# samples, so no class is asked for more examples than it has.
subset = (stats.min() * linear_ratio).astype(int)
subset

array([ 542, 1084, 1626, 2168, 2710, 3252, 3794, 4336, 4878, 5421])

## Pick a subset following a Pareto distribution (extreme imbalance)

In [12]:
# Severe imbalance profile for MNIST, derived from a generalized Pareto pdf.
num_of_class = 10
alpha = 0.01  # shape argument passed to genpareto.pdf
# Evaluate the pdf at num_of_class evenly spaced points in [1, 5].
x = np.linspace(1, 5, num_of_class)
target = scipy.stats.genpareto.pdf(x, alpha).tolist()
# Normalise with one vectorised division: the total is summed once rather
# than once per element. Then rescale so the largest ratio is exactly 1.
severe_ratio = np.array(target) / sum(target)
severe_ratio = severe_ratio / severe_ratio.max()
# Scale by the rarest class's count (so no class is asked for more examples
# than it has), truncate to whole samples, and flip into ascending order.
stats = np.unique(mnist_train.targets, return_counts=True)[1]
subset = stats.min() * severe_ratio
subset = np.flip(subset.astype(int))

In [13]:
# Display the per-class sample sizes computed for the Pareto profile.
subset

array([ 107,  164,  253,  389,  601,  930, 1441, 2237, 3479, 5421])

# SVHN

SVHN uses the same approach to unbalance the dataset.

In [14]:
# Load the SVHN 'train' split, keeping the raw .mat file under /tmp.
svhn_train = datasets.SVHN(
    root='/tmp',
    split='train',
    transform=None,
    target_transform=None,
    download=True,
)

Downloading http://ufldl.stanford.edu/housenumbers/train_32x32.mat to /tmp/train_32x32.mat


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

In [15]:
# Load the SVHN 'test' split, keeping the raw .mat file under /tmp.
svhn_test = datasets.SVHN(
    root='/tmp',
    split='test',
    transform=None,
    target_transform=None,
    download=True,
)

Downloading http://ufldl.stanford.edu/housenumbers/test_32x32.mat to /tmp/test_32x32.mat


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

In [16]:
# Show (distinct labels, examples per label) for the SVHN training split.
np.unique(
    svhn_train.labels,
    return_counts=True,
)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([ 4948, 13861, 10585,  8497,  7458,  6882,  5727,  5595,  5045,
         4659]))

In [17]:
# Show (distinct labels, examples per label) for the SVHN test split.
np.unique(
    svhn_test.labels,
    return_counts=True,
)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([1744, 5099, 4149, 2882, 2523, 2384, 1977, 2019, 1660, 1595]))

In [18]:
# Fraction of the SVHN test split belonging to each class. The per-class
# counts are computed once and reused for both the numerator and the total.
svhn_test_counts = np.unique(svhn_test.labels, return_counts=True)[1]
svhn_test_counts / svhn_test_counts.sum()

array([0.06699447, 0.19587431, 0.15938076, 0.1107099 , 0.09691918,
       0.09157959, 0.07594499, 0.07755839, 0.06376767, 0.06127074])

In [19]:
# Mild imbalance for SVHN: the ratios 0.1 ... 1.0 scaled by the rarest
# training class's count and truncated to whole samples.
linear_ratio = np.arange(1, 11)
linear_ratio = linear_ratio / linear_ratio.max()
stats = np.unique(svhn_train.labels, return_counts=True)[1]
subset = (stats.min() * linear_ratio).astype(int)
subset

array([ 465,  931, 1397, 1863, 2329, 2795, 3261, 3727, 4193, 4659])

In [20]:
# Total number of samples kept across all classes.
subset.sum()

25620

### severe imbalance

In [21]:
# Severe imbalance profile for SVHN, derived from a generalized Pareto pdf.
num_of_class = 10
alpha = 0.01  # shape argument passed to genpareto.pdf
# Evaluate the pdf at num_of_class evenly spaced points in [1, 5].
x = np.linspace(1, 5, num_of_class)
target = scipy.stats.genpareto.pdf(x, alpha).tolist()
# Normalise with one vectorised division: the total is summed once rather
# than once per element. Then rescale so the largest ratio is exactly 1.
severe_ratio = np.array(target) / sum(target)
severe_ratio = severe_ratio / severe_ratio.max()
# Scale by the rarest class's count (so no class is asked for more examples
# than it has), truncate to whole samples, and flip into ascending order.
stats = np.unique(svhn_train.labels, return_counts=True)[1]
subset = stats.min() * severe_ratio
subset = np.flip(subset.astype(int))

In [22]:
# Display the per-class sample sizes computed for the SVHN Pareto profile.
subset

array([  92,  141,  217,  335,  517,  799, 1238, 1922, 2990, 4659])