In [1]:
import numpy as np
import scipy
from scipy.stats import chisquare
import os
import imageio

In [2]:
def get_frequency(x, size, n_splits):
    x = np.array(x)
    x = x * size / n_splits
    x = x.astype(int)
    matched = sum(x)
    return [matched, size - matched]

def get_frequencies(a, b, c, d, size, n_splits):
    af = get_frequency(a, size, n_splits)
    bf = get_frequency(b, size, n_splits)
    cf = get_frequency(c, size, n_splits)
    df = get_frequency(d, size, n_splits)
    return af, bf, cf, df

# CIFAR-100

In [40]:
a = [0.4428, 0.4512, 0.4484, 0.4485, 0.4577, 0.457]
b = [0.3183, 0.3265, 0.3134, 0.3345, 0.325, 0.3234]
c = [0.3412, 0.3624, 0.3531, 0.3609, 0.3543, 0.3513]
d = [0.3857, 0.3949, 0.3937, 0.3897, 0.4124, 0.3929]
af, bf, cf, df = get_frequencies(a, b, c, d, 60000, 6)
af, bf, cf, df

([27056, 32944], [19411, 40589], [21232, 38768], [23693, 36307])

In [47]:
chisquare(af, f_exp=bf), chisquare(af, f_exp=cf), chisquare(af, f_exp=df)

(Power_divergenceResult(statistic=4450.921847019982, pvalue=0.0),
 Power_divergenceResult(statistic=2472.462314027122, pvalue=0.0),
 Power_divergenceResult(statistic=788.850246126827, pvalue=1.4329779284388962e-173))

# SVHN

In [50]:
a = [0.9326431132417516, 0.9331238417532833, 0.9300217548948514, 0.929981468052534]
b = [0.9115739435201224, 0.9151559100797679, 0.9182177100958827, 0.9090725968898558]
c = [0.925069492003384, 0.9253484811860446, 0.9239384417049392, 0.9225284022238337]
d = [0.9249889215646779, 0.9247441785512851, 0.920554346950286, 0.9203126258963823]
af, bf, cf, df = get_frequencies(a, b, c, d, 73257 + 26032, 4)
af, bf, cf, df

([92481, 6808], [90700, 8589], [91764, 7525], [91608, 7681])

In [51]:
chisquare(af, f_exp=bf), chisquare(af, f_exp=cf), chisquare(af, f_exp=df)

(Power_divergenceResult(statistic=404.2770479471512, pvalue=6.454866885149661e-90),
 Power_divergenceResult(statistic=73.91977010071047, pvalue=8.135738846697126e-18),
 Power_divergenceResult(statistic=107.54208626128928, pvalue=3.386374489248262e-25))

# CIFAR-10

In [52]:
a = [0.7729, 0.7684, 0.773, 0.7674, 0.7749, 0.7662]
b = [0.7312, 0.7296, 0.7286, 0.7299, 0.7389, 0.7186]
c = [0.7625, 0.7541, 0.7469, 0.7564, 0.7607, 0.7567]
d = [0.7324, 0.7187, 0.7138, 0.7209, 0.7298, 0.7198]
af, bf, cf, df = get_frequencies(a, b, c, d, 60000, 6)
af, bf, cf, df

([46228, 13772], [43768, 16232], [45373, 14627], [43354, 16646])

In [53]:
chisquare(af, f_exp=bf), chisquare(af, f_exp=cf), chisquare(af, f_exp=df)

(Power_divergenceResult(statistic=511.0845220990934, pvalue=3.683445058756479e-113),
 Power_divergenceResult(statistic=66.08923476468203, pvalue=4.3096318278006714e-16),
 Power_divergenceResult(statistic=686.7295166447416, pvalue=2.298779210820388e-151))

# Fashion MNIST

In [54]:
a = [0.9333, 0.9314, 0.9313, 0.9325, 0.9384, 0.9298, 0.9321]
b = [0.923, 0.9246, 0.9233, 0.9269, 0.9218, 0.9249, 0.9231]
c = [0.9259, 0.9275, 0.9257, 0.9297, 0.9311, 0.926, 0.9271]
d = [0.9187, 0.9226, 0.9206, 0.9223, 0.9228, 0.9191, 0.9206]
af, bf, cf, df = get_frequencies(a, b, c, d, 70000, 7)
af, bf, cf, df

([65288, 4712], [64675, 5325], [64929, 5071], [64467, 5533])

In [55]:
chisquare(af, f_exp=bf), chisquare(af, f_exp=cf), chisquare(af, f_exp=df)

(Power_divergenceResult(statistic=76.37706045576384, pvalue=2.3435787913350287e-18),
 Power_divergenceResult(statistic=27.40025549622777, pvalue=1.6540938236656524e-07),
 Power_divergenceResult(statistic=132.27757466499244, pvalue=1.3009578325038647e-30))

# MNIST

In [56]:
a = [0.9926, 0.9936, 0.9928, 0.9938, 0.9946, 0.9933, 0.994]
b = [0.9923, 0.9935, 0.9937, 0.9932, 0.9941, 0.9924, 0.9943]
c = [0.9932, 0.9938, 0.9926, 0.9942, 0.9947, 0.9913, 0.9935]
d = [0.9914, 0.993, 0.9926, 0.9923, 0.9929, 0.9907, 0.9914]
af, bf, cf, df = get_frequencies(a, b, c, d, 70000, 7)
af, bf, cf, df

([69547, 453], [69535, 465], [69533, 467], [69443, 557])

In [57]:
chisquare(af, f_exp=bf), chisquare(af, f_exp=cf), chisquare(af, f_exp=df)

(Power_divergenceResult(statistic=0.31174831890182947, pvalue=0.576609323427017),
 Power_divergenceResult(statistic=0.42251901959204063, pvalue=0.5156827712976023),
 Power_divergenceResult(statistic=19.574066027467445, pvalue=9.677395049205175e-06))

# Tiny ImageNet

In [6]:
a = [0.1862, 0.1854, 0.1917, 0.1792, 0.2002, 0.1792, 0.19, 0.213, 0.1825, 0.1743, 0.1804]
b = [0.1087, 0.1094, 0.1176, 0.111, 0.1162, 0.1078, 0.1183, 0.1164, 0.1093, 0.1149, 0.1138]
c = [0.1139, 0.1136, 0.1129, 0.1177, 0.1136, 0.1063, 0.115, 0.1142, 0.1077, 0.1128, 0.1168]
d = [0.1786, 0.1581, 0.1702, 0.1729, 0.1791, 0.1723, 0.1706, 0.179, 0.1678, 0.1617, 0.168]
af, bf, cf, df = get_frequencies(a, b, c, d, 110000, 11)
af, bf, cf, df

([20621, 89379], [12434, 97566], [12445, 97555], [18783, 91217])

In [7]:
chisquare(af, f_exp=bf), chisquare(af, f_exp=cf), chisquare(af, f_exp=df)

(Power_divergenceResult(statistic=6077.61104577684, pvalue=0.0),
 Power_divergenceResult(statistic=6056.61567964288, pvalue=0.0),
 Power_divergenceResult(statistic=216.89171157633024, pvalue=4.309253518800172e-49))

In [46]:
chisquare(af, f_exp=df)

Power_divergenceResult(statistic=788.850246126827, pvalue=1.4329779284388962e-173)

In [25]:
das_wl1 = np.array([0.4428, 0.4512, 0.4484, 0.4485, 0.4577, 0.457])
size = 60000
das_wl1 = das_wl1 * size / 6
das_wl1

array([4428., 4512., 4484., 4485., 4577., 4570.])

In [26]:
das_wl1 = das_wl1.astype(int)
das_wl1

array([4428, 4512, 4484, 4485, 4577, 4570])

In [27]:
summed = sum(das_wl1)
das_wl1_f = [summed, size - summed]
das_wl1_f

[27056, 32944]

In [32]:
static_a = np.array([0.3183, 0.3265, 0.3134, 0.3345, 0.325, 0.3234])
size = 60000
static_a = static_a * size / 6
static_a

array([3183., 3265., 3134., 3345., 3250., 3234.])

In [33]:
static_a = static_a.astype(int)
static_a

array([3183, 3265, 3134, 3345, 3250, 3234])

In [34]:
summed = sum(static_a)
static_a_f = [summed, size - summed]
static_a_f

[19411, 40589]

In [35]:
chisquare(das_wl1_f, f_exp=static_a_f)

Power_divergenceResult(statistic=4450.921847019982, pvalue=0.0)

In [3]:
class Dataset:
    def __init__(self, X_train, y_train, X_test, y_test, shape, shape_flattened):
        X_train = X_train.astype(dtype) / 255.0
        y_train = y_train.astype(dtype)
        X_test = X_test.astype(dtype)  / 255.0
        y_test = y_test.astype(dtype)

        X_train = np.reshape(X_train, shape_flattened)
        X_test = np.reshape(X_test, shape_flattened)

        X = np.concatenate((X_train, X_test), axis=0)
        y = np.concatenate((y_train, y_test), axis=0)

        from sklearn.preprocessing import StandardScaler

        scaler = StandardScaler()
        scaler.fit(X_train)  # Scaling each feature independently

        X_norm = scaler.transform(X)
        X_train_norm = scaler.transform(X_train)
        X_test_norm = scaler.transform(X_test)

        X_norm = np.reshape(X_norm, shape)
        X_train_norm = np.reshape(X_train_norm, shape)
        X_test_norm = np.reshape(X_test_norm, shape)

        self.X_norm = X_norm
        self.y = y
        self.X_train_norm = X_train_norm
        self.y_train = y_train
        self.X_test_norm = X_test_norm
        self.y_test = y_test


def get_cifar_10_dataset():
    cifar10 = tf.keras.datasets.cifar10
    (X_train, y_train), (X_test, y_test) = cifar10.load_data()
    shape = (-1, 32, 32, 3)
    shape_flattened = (-1, 3072)  # Scaling each feature independently
    return Dataset(X_train, y_train, X_test, y_test, shape=shape, shape_flattened=shape_flattened)


def get_cifar_100_dataset():
    cifar100 = tf.keras.datasets.cifar100
    (X_train, y_train), (X_test, y_test) = cifar100.load_data()
    shape = (-1, 32, 32, 3)
    shape_flattened = (-1, 3072)  # Scaling each feature independently
    return Dataset(X_train, y_train, X_test, y_test, shape=shape, shape_flattened=shape_flattened)


def get_svhn_dataset():
    from urllib.request import urlretrieve
    from scipy import io

    train_filename, _ = urlretrieve('http://ufldl.stanford.edu/housenumbers/train_32x32.mat')
    test_filename, _ = urlretrieve('http://ufldl.stanford.edu/housenumbers/test_32x32.mat')

    X_train = io.loadmat(train_filename, variable_names='X').get('X')
    y_train = io.loadmat(train_filename, variable_names='y').get('y')
    X_test = io.loadmat(test_filename, variable_names='X').get('X')
    y_test = io.loadmat(test_filename, variable_names='y').get('y')

    X_train = np.moveaxis(X_train, -1, 0)
    y_train -= 1
    X_test = np.moveaxis(X_test, -1, 0)
    y_test -= 1

    shape = (-1, 32, 32, 3)
    shape_flattened = (-1, 3072)  # Scaling each feature independently
    return Dataset(X_train, y_train, X_test, y_test, shape=shape, shape_flattened=shape_flattened)


def get_tiny_imagenet_dataset():
    """
    Original source: https://github.com/sonugiri1043/Train_ResNet_On_Tiny_ImageNet/blob/master/Train_ResNet_On_Tiny_ImageNet.ipynb
    Original author: sonugiri1043@gmail.com
    """

    if not os.path.isdir('IMagenet'):
        ! git clone https://github.com/seshuad/IMagenet

    print("Processing the downloaded dataset...")

    path = 'IMagenet/tiny-imagenet-200/'

    id_dict = {}
    for i, line in enumerate(open(path + 'wnids.txt', 'r')):
        id_dict[line.replace('\n', '')] = i

    train_data = list()
    test_data = list()
    train_labels = list()
    test_labels = list()

    for key, value in id_dict.items():
        train_data += [imageio.imread(path + 'train/{}/images/{}_{}.JPEG'.format(key, key, str(i)), pilmode='RGB') for i in range(500)]
        train_labels_ = np.array([[0]*200]*500)
        train_labels_[:, value] = 1
        train_labels += train_labels_.tolist()

    for line in open(path + 'val/val_annotations.txt'):
        img_name, class_id = line.split('\t')[:2]
        test_data.append(imageio.imread(path + 'val/images/{}'.format(img_name), pilmode='RGB'))
        test_labels_ = np.array([[0]*200])
        test_labels_[0, id_dict[class_id]] = 1
        test_labels += test_labels_.tolist()

    X_train = np.array(train_data)
    y_train = np.argmax(np.array(train_labels), axis=1)
    X_test = np.array(test_data)
    y_test = np.argmax(np.array(test_labels), axis=1)

    shape = (-1, 64, 64, 3)
    shape_flattened = (-1, 12288)  # Scaling each feature independently
    return Dataset(X_train, y_train, X_test, y_test, shape=shape, shape_flattened=shape_flattened)


def get_mnist_dataset():
    mnist = tf.keras.datasets.mnist
    (X_train, y_train), (X_test, y_test) = mnist.load_data()
    shape = (-1, 28, 28, 1)
    shape_flattened = (-1, 1)  # Scaling all features together
    return Dataset(X_train, y_train, X_test, y_test, shape=shape, shape_flattened=shape_flattened)


def get_fashion_mnist_dataset():
    fashion_mnist = tf.keras.datasets.fashion_mnist
    (X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()
    shape = (-1, 28, 28, 1)
    shape_flattened = (-1, 1)  # Scaling all features together
    return Dataset(X_train, y_train, X_test, y_test, shape=shape, shape_flattened=shape_flattened)

In [49]:
svhn = get_svhn_dataset()

KeyboardInterrupt: 

In [4]:
dtype = 'float32'
tiny_imagenet = get_tiny_imagenet_dataset()

Processing the downloaded dataset...


  train_data += [imageio.imread(path + 'train/{}/images/{}_{}.JPEG'.format(key, key, str(i)), pilmode='RGB') for i in range(500)]
  test_data.append(imageio.imread(path + 'val/images/{}'.format(img_name), pilmode='RGB'))


In [5]:
tiny_imagenet.X_norm.shape

(110000, 64, 64, 3)