In [1]:
# Capture the output lines of the `nvidia-smi` shell command (IPython `!` syntax).
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
# The word 'failed' in the command output is treated as "no GPU is attached".
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Sat Jan 15 11:37:51 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.46       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P0    26W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import tensorflow as tf
import tensorflow.keras.backend as K
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
from enum import Enum
import imageio
import os
import hashlib

%matplotlib inline

# Floating-point type shared by the dataset arrays and the layer parameters below;
# it is also registered as the default float type of the Keras backend.
dtype = 'float32'
tf.keras.backend.set_floatx(dtype)

In [3]:
################################################################################
# DATASETS
################################################################################


class Dataset:
    """
    Container for a standardized image dataset.

    The inputs are cast to the module-level `dtype`, divided by 255, reshaped to
    `shape_flattened`, standardized with a `StandardScaler` fitted on the training
    inputs only, and finally reshaped to `shape`. The labels are only cast to `dtype`.

    Attributes set: `X_norm`/`y` (train followed by test), `X_train_norm`/`y_train`
    and `X_test_norm`/`y_test`.
    """

    def __init__(self, X_train, y_train, X_test, y_test, shape, shape_flattened):
        from sklearn.preprocessing import StandardScaler

        # Bring the raw inputs to `dtype`, divide by 255 and flatten them for the scaler
        train_flat = np.reshape(X_train.astype(dtype) / 255.0, shape_flattened)
        test_flat = np.reshape(X_test.astype(dtype) / 255.0, shape_flattened)
        train_targets = y_train.astype(dtype)
        test_targets = y_test.astype(dtype)

        # The scaling statistics come from the training inputs only;
        # `shape_flattened` decides which values share one column (= one feature)
        scaler = StandardScaler()
        scaler.fit(train_flat)

        all_flat = np.concatenate((train_flat, test_flat), axis=0)

        self.X_norm = np.reshape(scaler.transform(all_flat), shape)
        self.y = np.concatenate((train_targets, test_targets), axis=0)
        self.X_train_norm = np.reshape(scaler.transform(train_flat), shape)
        self.y_train = train_targets
        self.X_test_norm = np.reshape(scaler.transform(test_flat), shape)
        self.y_test = test_targets


def get_cifar_10_dataset():
    """Load CIFAR-10 through `tf.keras.datasets` and return it wrapped in a `Dataset`."""
    (train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.cifar10.load_data()
    return Dataset(
        train_images, train_labels, test_images, test_labels,
        shape=(-1, 32, 32, 3),
        shape_flattened=(-1, 3072),  # Scaling each feature independently
    )


def get_cifar_100_dataset():
    """Load CIFAR-100 through `tf.keras.datasets` and return it wrapped in a `Dataset`."""
    (train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.cifar100.load_data()
    return Dataset(
        train_images, train_labels, test_images, test_labels,
        shape=(-1, 32, 32, 3),
        shape_flattened=(-1, 3072),  # Scaling each feature independently
    )


def get_svhn_dataset():
    """
    Download the SVHN 32x32 train/test `.mat` files and return them wrapped in a `Dataset`.

    Each file is parsed once and both of its variables ('X' and 'y') are read from
    the same result. The last axis of 'X' is moved to the front and the labels are
    shifted down by one before being handed to `Dataset`.
    """
    from urllib.request import urlretrieve
    from scipy import io

    train_filename, _ = urlretrieve('http://ufldl.stanford.edu/housenumbers/train_32x32.mat')
    test_filename, _ = urlretrieve('http://ufldl.stanford.edu/housenumbers/test_32x32.mat')

    # Parse every file only once instead of once per variable
    train_mat = io.loadmat(train_filename, variable_names=('X', 'y'))
    test_mat = io.loadmat(test_filename, variable_names=('X', 'y'))

    X_train = train_mat.get('X')
    y_train = train_mat.get('y')
    X_test = test_mat.get('X')
    y_test = test_mat.get('y')

    # Move the last axis of X to the front
    # (assumes the files store the sample index last -- TODO confirm)
    X_train = np.moveaxis(X_train, -1, 0)
    X_test = np.moveaxis(X_test, -1, 0)
    # Shift the labels down by one (assumes the stored labels start at 1 -- TODO confirm)
    y_train -= 1
    y_test -= 1

    shape = (-1, 32, 32, 3)
    shape_flattened = (-1, 3072)  # Scaling each feature independently
    return Dataset(X_train, y_train, X_test, y_test, shape=shape, shape_flattened=shape_flattened)


def get_tiny_imagenet_dataset():
    """
    Original source: https://github.com/sonugiri1043/Train_ResNet_On_Tiny_ImageNet/blob/master/Train_ResNet_On_Tiny_ImageNet.ipynb
    Original author: sonugiri1043@gmail.com
    """

    if not os.path.isdir('IMagenet'):
        ! git clone https://github.com/seshuad/IMagenet

    print("Processing the downloaded dataset...")

    path = 'IMagenet/tiny-imagenet-200/'

    id_dict = {}
    for i, line in enumerate(open(path + 'wnids.txt', 'r')):
        id_dict[line.replace('\n', '')] = i

    train_data = list()
    test_data = list()
    train_labels = list()
    test_labels = list()

    for key, value in id_dict.items():
        train_data += [imageio.imread(path + 'train/{}/images/{}_{}.JPEG'.format(key, key, str(i)), pilmode='RGB') for i in range(500)]
        train_labels_ = np.array([[0]*200]*500)
        train_labels_[:, value] = 1
        train_labels += train_labels_.tolist()

    for line in open(path + 'val/val_annotations.txt'):
        img_name, class_id = line.split('\t')[:2]
        test_data.append(imageio.imread(path + 'val/images/{}'.format(img_name), pilmode='RGB'))
        test_labels_ = np.array([[0]*200])
        test_labels_[0, id_dict[class_id]] = 1
        test_labels += test_labels_.tolist()

    X_train = np.array(train_data)
    y_train = np.argmax(np.array(train_labels), axis=1)
    X_test = np.array(test_data)
    y_test = np.argmax(np.array(test_labels), axis=1)

    shape = (-1, 64, 64, 3)
    shape_flattened = (-1, 12288)  # Scaling each feature independently
    return Dataset(X_train, y_train, X_test, y_test, shape=shape, shape_flattened=shape_flattened)


def get_mnist_dataset():
    """Load MNIST through `tf.keras.datasets` and return it wrapped in a `Dataset`."""
    (train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data()
    return Dataset(
        train_images, train_labels, test_images, test_labels,
        shape=(-1, 28, 28, 1),
        shape_flattened=(-1, 1),  # Scaling all features together
    )


def get_fashion_mnist_dataset():
    """Load Fashion-MNIST through `tf.keras.datasets` and return it wrapped in a `Dataset`."""
    (train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.fashion_mnist.load_data()
    return Dataset(
        train_images, train_labels, test_images, test_labels,
        shape=(-1, 28, 28, 1),
        shape_flattened=(-1, 1),  # Scaling all features together
    )


################################################################################
# REGULARIZERS
################################################################################


class Regularizer(tf.keras.regularizers.Regularizer):
    """
    Regularizer whose method and penalty can be switched at runtime.

    Supported methods: None (disabled), 'l1', 'weighted_l1', 'weighted_l1_reordered'
    and 'group_sparsity'. The regularized tensor is treated as a 2-D matrix whose
    last axis indexes the output neurons.
    """

    def __init__(self):
        self.n_new_neurons = 0
        self.scaling_tensor = None
        self.set_regularization_penalty(0.)
        self.set_regularization_method(None)

    def __call__(self, x):
        """Return the penalty for `x`; 0 when no method is set or the penalty is 0."""
        if self.regularization_method is None or self.regularization_penalty == 0:
            return 0
        elif self.regularization_method == 'weighted_l1':
            return self.weighted_l1(x)
        elif self.regularization_method == 'weighted_l1_reordered':
            return self.weighted_l1_reordered(x)
        elif self.regularization_method == 'group_sparsity':
            return self.group_sparsity(x)
        elif self.regularization_method == 'l1':
            return self.l1(x)
        else:
            raise NotImplementedError(f"Unknown regularization method {self.regularization_method}")

    def weighted_l1(self, x):
        # I.e. for a parameter matrix of 4 input and 10 output neurons:
        #
        # [[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        #  [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        #  [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        #  [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]
        #
        # the scaling tensor, as well as the resulting weighted values, could be:
        #
        # [[ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.],
        #  [ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.],
        #  [ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.],
        #  [ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.]]
        #
        # Therefore every additional output neuron is regularized more.

        scaling_tensor = tf.cumsum(tf.constant(self.regularization_penalty, shape=x.shape, dtype=dtype), axis=-1)
        weighted_values = scaling_tensor * tf.abs(x)
        return tf.reduce_sum(weighted_values)

    def weighted_l1_reordered(self, x):
        # Same cumulative per-column penalties as in `weighted_l1`, but the columns
        # belonging to the already existing ("old") output neurons are randomly
        # permuted, while the `n_new_neurons` last columns keep the largest penalties
        # in increasing order. The scaling tensor is cached and rebuilt only after
        # `prune()`, `grow()` or `set_regularization_method()` requested an update.

        if self.update_scaling_tensor:
            scaling_tensor_raw = tf.cumsum(tf.constant(self.regularization_penalty, shape=x.shape, dtype=dtype), axis=-1)

            # Split at an explicit non-negative index. Slicing with `-self.n_new_neurons`
            # misbehaves when it is 0: `[:, :-0]` is empty and `[:, -0:]` is the whole
            # tensor, so no column would be shuffled at all.
            n_old_neurons = max(x.shape[-1] - self.n_new_neurons, 0)
            scaling_tensor_old_neurons = scaling_tensor_raw[:, :n_old_neurons]
            scaling_tensor_new_neurons = scaling_tensor_raw[:, n_old_neurons:]
            # tf.random.shuffle permutes along the first axis, hence the transposes
            scaling_tensor_old_neurons_shuffled = tf.transpose(tf.random.shuffle(tf.transpose(scaling_tensor_old_neurons)))
            self.scaling_tensor = tf.concat([scaling_tensor_old_neurons_shuffled, scaling_tensor_new_neurons], axis=-1)
            self.update_scaling_tensor = False

        weighted_values = self.scaling_tensor * tf.abs(x)
        return tf.reduce_sum(weighted_values)

    def group_sparsity(self, x):
        # I.e. for a parameter matrix of 3 input and 5 output neurons:
        #
        # [[1., 1., 1., 1., 1.],
        #  [1., 2., 2., 1., 2.],
        #  [2., 2., 3., 1., 3.]]
        #
        # The resulting vector of group norms is [2., 2., 3., 1., 3.], therefore for
        # every output neuron, its incoming connections form a group.

        group_norms = tf.norm(x, ord=2, axis=0)
        # assert group_norms.shape[0] == x.shape[1]
        return self.regularization_penalty * tf.reduce_sum(group_norms)

    def l1(self, x):
        """Plain L1 penalty: `regularization_penalty * sum(|x|)`."""
        weighted_values = self.regularization_penalty * tf.abs(x)
        return tf.reduce_sum(weighted_values)

    def prune(self):
        """Record that the owning layer was pruned: no neuron counts as new anymore."""
        self.n_new_neurons = 0
        if self.regularization_method == 'weighted_l1_reordered':
            self.update_scaling_tensor = True

    def grow(self, n_new_neurons):
        """Record that the owning layer received `n_new_neurons` new trailing output neurons."""
        self.n_new_neurons = n_new_neurons
        if self.regularization_method == 'weighted_l1_reordered':
            self.update_scaling_tensor = True

    def set_regularization_penalty(self, regularization_penalty):
        self.regularization_penalty = regularization_penalty

    def set_regularization_method(self, regularization_method):
        self.regularization_method = regularization_method
        # Only the reordered variant caches a scaling tensor that needs rebuilding
        if self.regularization_method == 'weighted_l1_reordered':
            self.update_scaling_tensor = True
        else:
            self.update_scaling_tensor = None

    def get_config(self):
        # NOTE(review): only the penalty is serialized and `__init__` takes no
        # arguments -- confirm whether restoring from this config is ever needed.
        return {'regularization_penalty': float(self.regularization_penalty)}


################################################################################
# LAYERS
################################################################################


class CustomLayer(tf.keras.layers.Layer):
    """Base class of the resizable layers; it only remembers the optional input shape."""

    def __init__(self, input_shape):
        super().__init__()
        # Kept under its own name and read back by `Sequential.get_layer_input_shape`
        self.inpt_shp = input_shape


class Dense(CustomLayer):
    """
    Fully connected layer whose number of input and output units can change after
    it has been built (see `prune` and `grow`).

    Args:
        units: initial number of output units.
        activation: activation identifier resolved by `tf.keras.activations.get`.
        kernel_initializer: initializer identifier for the weight matrix `W`.
        bias_initializer: initializer identifier for the bias vector `b`.
        input_shape: optional input shape, stored by `CustomLayer`.
        fixed_size: when True the output units are never pruned or grown and the
            regularization setters are ignored.
    """

    def __init__(self, units, activation, kernel_initializer='glorot_uniform', 
                 bias_initializer='zeros', input_shape=None, fixed_size=False):
        super().__init__(input_shape)

        self.units = units
        self.activation = activation
        self.kernel_initializer = kernel_initializer
        self.bias_initializer = bias_initializer
        self.fixed_size = fixed_size
        
        self.A = tf.keras.activations.get(activation)
        self.W_init = tf.keras.initializers.get(kernel_initializer)
        self.b_init = tf.keras.initializers.get(bias_initializer)
        self.regularizer = Regularizer()
    
    def build(self, input_shape):
        """Create `W` of shape (input units, units) and `b` of shape (units,) and register the regularization loss."""
        input_units = input_shape[-1]

        self.W = tf.Variable(
            name='W',
            initial_value=self.W_init(shape=(input_units, self.units), dtype=dtype),
            trainable=True)
        
        self.b = tf.Variable(
            name='b',
            initial_value=self.b_init(shape=(self.units,), dtype=dtype),
            trainable=True)
        
        # The bias is appended as an extra row so that it is regularized with the weights;
        # the lambda reads self.W / self.b at call time, so replaced variables are picked up
        self.add_loss(lambda: self.regularizer(tf.concat([self.W, tf.reshape(self.b, (1, -1))], axis=0)))
    
    def call(self, inputs, training=None):
        return self.A(tf.matmul(inputs, self.W) + self.b)

    def get_size(self):
        """Return (number of input units, number of output units)."""
        return self.W.shape[0], self.W.shape[1]
    
    def prune(self, threshold, active_input_units_indices):
        """
        Drop the rows of pruned input units and, unless the layer has a fixed size,
        the output units whose largest absolute weight/bias is below `threshold`.

        Returns the indices of the output units that were kept.
        """
        # Remove connections from pruned units in previous layer
        new_W = tf.gather(self.W.value(), active_input_units_indices, axis=0)

        if self.fixed_size:
            active_output_neurons_indices = list(range(new_W.shape[1]))
        else:
            # Prune units in this layer
            weights_with_biases = tf.concat([new_W, tf.reshape(self.b.value(), (1, -1))], axis=0)
            neurons_are_active = tf.math.reduce_max(tf.abs(weights_with_biases), axis=0) >= threshold
            active_output_neurons_indices = tf.reshape(tf.where(neurons_are_active), (-1,))
            
            new_W = tf.gather(new_W, active_output_neurons_indices, axis=1)
            new_b = tf.gather(self.b.value(), active_output_neurons_indices, axis=0)

            self.b = tf.Variable(name='b', initial_value=new_b, trainable=True)

        # The shape changed, therefore a new variable replaces the old one
        self.W = tf.Variable(name='W', initial_value=new_W, trainable=True)

        self.regularizer.prune()
        return active_output_neurons_indices
    
    def grow(self, n_new_input_units, percentage, min_new_units, scaling_factor):
        """
        Append rows for `n_new_input_units` new input units and, unless the layer has
        a fixed size, `max(min_new_units, int(units * percentage))` new output units.
        Newly initialized weights are multiplied by `scaling_factor`.

        Returns the number of output units that were added.
        """
        if n_new_input_units > 0:
            # Add connections to grown units in previous layer
            W_growth = self.W_init(shape=(self.W.shape[0] + n_new_input_units, self.W.shape[1]), dtype=dtype)[-n_new_input_units:, :] * scaling_factor  # TODO is it better to be multiplying here by scaling_factor? It does help with not increasing the max weights of existing neurons when new neurons are added.
            new_W = tf.concat([self.W.value(), W_growth], axis=0)
        else:
            new_W = self.W.value()

        if self.fixed_size:
            n_new_output_units = 0
        else:
            # Grow new units in this layer
            n_new_output_units = max(min_new_units, int(new_W.shape[1] * percentage))
            if n_new_output_units > 0:
                W_growth = self.W_init(shape=(new_W.shape[0], new_W.shape[1] + n_new_output_units), dtype=dtype)[:, -n_new_output_units:] * scaling_factor
                b_growth = self.b_init(shape=(n_new_output_units,), dtype=dtype)  # TODO for all possible bias initializers to work properly, the whole bias vector should be initialized at once
                new_W = tf.concat([new_W, W_growth], axis=1)
                new_b = tf.concat([self.b.value(), b_growth], axis=0)

                self.b = tf.Variable(name='b', initial_value=new_b, trainable=True)

        self.W = tf.Variable(name='W', initial_value=new_W, trainable=True)

        self.regularizer.grow(n_new_output_units)
        return n_new_output_units
    
    def set_regularization_penalty(self, regularization_penalty):
        if not self.fixed_size:
            self.regularizer.set_regularization_penalty(regularization_penalty)
    
    def set_regularization_method(self, regularization_method):
        if not self.fixed_size:
            self.regularizer.set_regularization_method(regularization_method)
    
    def get_param_string(self):
        """
        Return one number per output unit: the negated order of magnitude of its
        largest absolute weight/bias, with positive orders of magnitude shown as 0.
        """
        # `self` was missing from the signature, so calling the method on an instance failed
        param_string = ""
        weights_with_bias = tf.concat([self.W, tf.reshape(self.b, (1, -1))], axis=0)
        max_parameters = tf.math.reduce_max(tf.abs(weights_with_bias), axis=0).numpy()
        # NOTE(review): a unit whose parameters are all exactly 0 yields log10(0) = -inf,
        # which int() below cannot convert
        magnitudes = np.floor(np.log10(max_parameters))
        for m in magnitudes:
            if m > 0:
                m = 0
            param_string += str(int(-m))
        return param_string


class Conv2D(CustomLayer):
    """
    2-D convolutional layer whose number of input and output filters can change
    after it has been built (see `prune` and `grow`).

    Args:
        filters: initial number of output filters.
        filter_size: pair (height, width) of the convolution window.
        activation: activation identifier resolved by `tf.keras.activations.get`.
        strides: strides passed to `tf.nn.conv2d`.
        padding: padding passed to `tf.nn.conv2d`.
        kernel_initializer: initializer identifier for the filter tensor `F`.
        bias_initializer: initializer identifier for the bias vector `b`.
        input_shape: optional input shape, stored by `CustomLayer`.
        fixed_size: when True the output filters are never pruned or grown and the
            regularization setters are ignored.
    """

    def __init__(self, filters, filter_size, activation, strides=(1, 1), 
                 padding='SAME', kernel_initializer='glorot_uniform',
                 bias_initializer='zeros', input_shape=None, fixed_size=False):
        super().__init__(input_shape)
    
        self.filters = filters
        self.filter_size = filter_size
        self.activation = activation
        self.strides = strides
        self.padding = padding
        self.kernel_initializer = kernel_initializer
        self.bias_initializer = bias_initializer
        self.fixed_size = fixed_size
        
        self.A = tf.keras.activations.get(activation)
        self.F_init = tf.keras.initializers.get(kernel_initializer)
        self.b_init = tf.keras.initializers.get(bias_initializer)
        self.regularizer = Regularizer()
    
    def build(self, input_shape):
        """Create `F` of shape (height, width, input filters, filters) and `b` of shape (filters,) and register the regularization loss."""
        input_filters = input_shape[-1]

        self.F = tf.Variable(
            name='F',
            initial_value=self.F_init(
                shape=(self.filter_size[0], self.filter_size[1], input_filters, self.filters), dtype=dtype
            ),
            trainable=True)
        
        self.b = tf.Variable(
            name='b',
            initial_value=self.b_init(shape=(self.filters,), dtype=dtype),
            trainable=True)

        # F is flattened to (everything else, output filters) and the bias appended as an
        # extra row, so the regularizer sees one column per output filter
        self.add_loss(lambda: self.regularizer(tf.concat([tf.reshape(self.F, (-1, self.F.shape[-1])), tf.reshape(self.b, (1, -1))], axis=0)))
    
    def call(self, inputs, training=None):
        y = tf.nn.conv2d(inputs, self.F, strides=self.strides, padding=self.padding)
        y = tf.nn.bias_add(y, self.b)
        y = self.A(y)
        return y
    
    def get_size(self):
        """Return (number of input filters, number of output filters)."""
        return self.F.shape[-2], self.F.shape[-1]
    
    def prune(self, threshold, active_input_units_indices):
        """
        Drop the slices of pruned input filters and, unless the layer has a fixed
        size, the output filters whose largest absolute weight/bias is below `threshold`.

        Returns the indices of the output filters that were kept.
        """
        # Remove connections from pruned units in previous layer
        new_F = tf.gather(self.F.value(), active_input_units_indices, axis=-2)

        if self.fixed_size:
            active_output_filters_indices = list(range(new_F.shape[-1]))
        else:
            # Prune units in this layer
            F_reduced_max = tf.reshape(tf.math.reduce_max(tf.abs(new_F), axis=(0, 1, 2)), (1, -1))
            F_reduced_max_with_biases = tf.concat([F_reduced_max, tf.reshape(self.b.value(), (1, -1))], axis=0)
            filters_are_active = tf.math.reduce_max(tf.abs(F_reduced_max_with_biases), axis=0) >= threshold
            active_output_filters_indices = tf.reshape(tf.where(filters_are_active), (-1,))
            
            new_F = tf.gather(new_F, active_output_filters_indices, axis=-1)
            new_b = tf.gather(self.b.value(), active_output_filters_indices, axis=0)

            self.b = tf.Variable(name='b', initial_value=new_b, trainable=True)

        # The shape changed, therefore a new variable replaces the old one
        self.F = tf.Variable(name='F', initial_value=new_F, trainable=True)

        self.regularizer.prune()
        return active_output_filters_indices

    def grow(self, n_new_input_units, percentage, min_new_units, scaling_factor):
        """
        Append slices for `n_new_input_units` new input filters and, unless the layer
        has a fixed size, `max(min_new_units, int(filters * percentage))` new output
        filters. Newly initialized weights are multiplied by `scaling_factor`.

        Returns the number of output filters that were added.
        """
        if n_new_input_units > 0:
            # Add connections to grown units in previous layer
            F_growth = self.F_init(shape=(self.F.shape[0], self.F.shape[1], self.F.shape[2] + n_new_input_units, self.F.shape[3]), dtype=dtype)[:, :, -n_new_input_units:, :] * scaling_factor  # TODO is it better to be multiplying here by scaling_factor? It does help with not increasing the max weights of existing neurons when new neurons are added.
            new_F = tf.concat([self.F.value(), F_growth], axis=-2)
        else:
            new_F = self.F.value()

        if self.fixed_size:
            n_new_output_units = 0
        else:
            # Grow new units in this layer
            n_new_output_units = max(min_new_units, int(new_F.shape[-1] * percentage))
            if n_new_output_units > 0:
                F_growth = self.F_init(shape=(new_F.shape[0], new_F.shape[1], new_F.shape[2], new_F.shape[3] + n_new_output_units), dtype=dtype)[:, :, :, -n_new_output_units:] * scaling_factor
                b_growth = self.b_init(shape=(n_new_output_units,), dtype=dtype)  # TODO for all possible bias initializers to work properly, the whole bias vector should be initialized at once
                new_F = tf.concat([new_F, F_growth], axis=-1)
                new_b = tf.concat([self.b.value(), b_growth], axis=0)

                self.b = tf.Variable(name='b', initial_value=new_b, trainable=True)

        self.F = tf.Variable(name='F', initial_value=new_F, trainable=True)

        self.regularizer.grow(n_new_output_units)
        return n_new_output_units
    
    def set_regularization_penalty(self, regularization_penalty):
        if not self.fixed_size:
            self.regularizer.set_regularization_penalty(regularization_penalty)
    
    def set_regularization_method(self, regularization_method):
        if not self.fixed_size:
            self.regularizer.set_regularization_method(regularization_method)

    def get_param_string(self):
        """Return a textual summary of the filter magnitudes (not implemented yet: always empty)."""
        # `self` was missing from the signature, so calling the method on an instance failed
        param_string = ""
        # TODO
        return param_string


class Flatten(tf.keras.Model):
    """
    Flatten a 4-D input into 2-D, one row per sample.

    Axis 3 is moved in front of axes 1 and 2 before reshaping, so all values of one
    channel are contiguous in the output. This is the layout assumed by
    `Sequential.convert_channel_indices_to_flattened_indices`.
    """

    def call(self, inputs, training=None):
        # Prefer the static batch size; it is None when the batch dimension is not
        # known at trace time, in which case the dynamic size is used instead
        batch_size = inputs.shape[0]
        if batch_size is None:
            batch_size = tf.shape(inputs)[0]
        return tf.reshape(tf.transpose(inputs, perm=[0, 3, 1, 2]), (batch_size, -1))


################################################################################
# MODELS
################################################################################


class Epoch:
    """
    Settings of one training epoch: whether the network grows and/or is pruned,
    and which regularization penalty and method are used.
    """

    def __init__(self, grow, prune, regularization_penalty, regularization_method):
        self.grow, self.prune = grow, prune
        self.regularization_penalty = regularization_penalty
        self.regularization_method = regularization_method

    def __str__(self):
        # Both flags are rendered as 0/1, followed by the penalty and the method
        return '{}{}{}{}'.format(
            int(self.grow),
            int(self.prune),
            self.regularization_penalty,
            self.regularization_method,
        )

    def __repr__(self):
        return str(self)


class DynamicEpoch(Epoch):
    """Epoch in which the network is both grown and pruned."""

    def __init__(self, regularization_penalty, regularization_method):
        super().__init__(
            grow=True,
            prune=True,
            regularization_penalty=regularization_penalty,
            regularization_method=regularization_method,
        )


class StaticEpoch(Epoch):
    """Epoch in which the network is neither grown nor pruned."""

    def __init__(self, regularization_penalty, regularization_method):
        super().__init__(
            grow=False,
            prune=False,
            regularization_penalty=regularization_penalty,
            regularization_method=regularization_method,
        )


class StaticEpochNoRegularization(StaticEpoch):
    """Static epoch with a zero penalty and no regularization method."""

    def __init__(self):
        super().__init__(regularization_penalty=0., regularization_method=None)


class Schedule:
    """
    Ordered collection of epochs.

    Its string form is a short fingerprint: the first 10 hex characters of the
    SHA-1 digest of the concatenated string forms of all epochs.
    """

    def __init__(self, epochs):
        self.epochs = epochs

    def __iter__(self):
        return iter(self.epochs)

    def __len__(self):
        return len(self.epochs)

    def __str__(self):
        joined = ''.join(map(str, self.epochs))
        digest = hashlib.sha1(joined.encode('utf-8')).hexdigest()
        return digest[:10]

    def __repr__(self):
        return str(self)


class Sequential(tf.keras.Model):
    def __init__(self, layers, activation=None):
        """Store `layers` as the ordered layer stack; `activation` is accepted but not used here."""
        super().__init__()
        self.lrs = layers
        
    def call(self, inputs, training=None):
        """Feed `inputs` through every layer in order and return the last output."""
        outputs = inputs
        for current_layer in self.lrs:
            outputs = current_layer(outputs, training=training)
        return outputs
    
    def get_layer_input_shape(self, target_layer):
        """
        Return the input shape (without the batch axis) of `target_layer`.

        The shape stored on the layer is returned when there is one; otherwise a
        random single-sample batch is pushed through the model until the layer is reached.
        """
        declared_shape = target_layer.inpt_shp
        if declared_shape is not None:
            return declared_shape

        # Probe batch of one sample, shaped like the input of the first layer
        probe = np.random.normal(size=(1,) + self.lrs[0].inpt_shp)
        for candidate in self.lrs:
            if candidate is target_layer:
                return tuple(probe.shape[1:])
            probe = candidate(probe)
        raise Exception("Layer not found in the model.")

    def get_layer_output_shape(self, target_layer):
        """
        Return the output shape (without the batch axis) of `target_layer`, found by
        pushing a random single-sample batch through the model up to that layer.
        """
        activations = np.random.normal(size=(1,) + self.lrs[0].inpt_shp)
        for candidate in self.lrs:
            activations = candidate(activations)
            if candidate is target_layer:
                return tuple(activations.shape[1:])
        raise Exception("Layer not found in the model.")
    
    def get_layer_sizes(self):
        """
        Returns the sizes of all layers in the model, including the input and output layer.
        """
        sizes = []
        for layer in self.lrs:
            if not isinstance(layer, CustomLayer):
                continue
            size = layer.get_size()
            if not sizes:
                # The input size is contributed by the first custom layer only
                sizes.append(size[0])
            sizes.append(size[1])
        return sizes
    
    def get_hidden_layer_sizes(self):
        """Return the layer sizes without the first (input) and the last (output) entry."""
        all_sizes = self.get_layer_sizes()
        return all_sizes[1:-1]
    
    def get_regularization_penalty(self):
        """Return the penalty stored in the regularizer of the second-to-last layer."""
        #TODO improve
        second_to_last_layer = self.lrs[-2]
        return second_to_last_layer.regularizer.regularization_penalty
    
    def set_regularization_penalty(self, regularization_penalty):
        """Set the penalty on every custom layer that does not have a fixed size."""
        resizable_layers = (
            layer for layer in self.lrs
            if isinstance(layer, CustomLayer) and not layer.fixed_size
        )
        for layer in resizable_layers:
            layer.set_regularization_penalty(regularization_penalty)
    
    def set_regularization_method(self, regularization_method):
        """Set the regularization method on every custom layer that does not have a fixed size."""
        resizable_layers = (
            layer for layer in self.lrs
            if isinstance(layer, CustomLayer) and not layer.fixed_size
        )
        for layer in resizable_layers:
            layer.set_regularization_method(regularization_method)

    def prune(self, params):
        """
        Prune all custom layers front to back with `params.pruning_threshold`.

        The indices of the units kept by one layer are handed to the next one. At a
        Conv2D -> Dense transition the channel indices are first expanded to the
        indices of the flattened units; any other change of layer type raises.
        """
        n_input_units = self.get_layer_input_shape(self.lrs[0])[-1]
        active_units_indices = list(range(n_input_units))

        previous = None
        for layer in self.lrs:
            if not isinstance(layer, CustomLayer):
                continue
            if previous is not None and type(previous) != type(layer):
                if not (type(previous) == Conv2D and type(layer) == Dense):
                    raise Exception("Incorrect order of custom layer types.")
                convolutional_shape = self.get_layer_output_shape(previous)
                active_units_indices = self.convert_channel_indices_to_flattened_indices(active_units_indices, convolutional_shape)
            active_units_indices = layer.prune(params.pruning_threshold, active_units_indices)
            previous = layer
    
    def grow(self, params):
        """
        Grow all custom layers front to back.

        The number of units added to one layer is handed to the next one as its
        number of new inputs. At a Conv2D -> Dense transition it is multiplied by
        the first two dimensions of the convolutional output shape; any other
        change of layer type raises.
        """
        n_new_units = 0

        previous = None
        for layer in self.lrs:
            if not isinstance(layer, CustomLayer):
                continue
            if previous is not None and type(previous) != type(layer):
                if not (type(previous) == Conv2D and type(layer) == Dense):
                    raise Exception("Incorrect order of custom layer types.")
                convolutional_shape = self.get_layer_output_shape(previous)
                n_new_units = n_new_units * convolutional_shape[0] * convolutional_shape[1]
            n_new_units = layer.grow(
                n_new_units,
                params.growth_percentage,
                min_new_units=params.min_new_neurons,
                scaling_factor=params.pruning_threshold,
            )
            previous = layer
    
    @staticmethod
    def convert_channel_indices_to_flattened_indices(channel_indices, convolutional_shape):
        dense_indices = list()
        units_per_channel = convolutional_shape[0] * convolutional_shape[1]
        for channel_index in channel_indices:
            for iter in range(units_per_channel):
                dense_indices.append(channel_index * units_per_channel + iter)
        return dense_indices
    
    def print_neurons(self):
        """
        Print the parameter string of every layer except the last one.

        Layers that do not provide `get_param_string` are skipped instead of
        raising an AttributeError.
        """
        for layer in self.lrs[:-1]:
            get_param_string = getattr(layer, 'get_param_string', None)
            if get_param_string is not None:
                print(get_param_string())
    
    def evaluate(self, params, summed_training_loss, summed_training_accuracy):
        """
        Return `(loss, accuracy, val_loss, val_accuracy)`.

        The training values are the given sums divided by the number of training
        instances (`params.x.shape[0]`), or None when the corresponding sum is None.
        The validation values are computed over `params.val_dataset` with sparse
        categorical cross-entropy and accuracy.
        """
        # Calculate training loss and accuracy
        loss = None if summed_training_loss is None else summed_training_loss / params.x.shape[0]
        accuracy = None if summed_training_accuracy is None else summed_training_accuracy / params.x.shape[0]

        # Calculate val loss and accuracy
        val_loss_total = 0
        val_accuracy_total = 0
        val_instance_count = 0

        for x_batch, y_batch in params.val_dataset:
            y_pred = self(x_batch, training=False)
            val_loss_total += tf.reduce_sum(tf.keras.losses.sparse_categorical_crossentropy(y_batch, y_pred))
            val_accuracy_total += float(tf.reduce_sum(tf.keras.metrics.sparse_categorical_accuracy(y_batch, y_pred)))
            val_instance_count += x_batch.shape[0]

        return (
            loss,
            accuracy,
            val_loss_total / val_instance_count,
            val_accuracy_total / val_instance_count,
        )
    
    def count_params(self):
        """
        Print and return the parameter counts of this model.

        Returns:
            Tuple `(total_count, trainable_count, non_trainable_count)`.
        """
        # Count the weights of this instance, not of a global variable named `model`
        trainable_count = np.sum([K.count_params(w) for w in self.trainable_weights])
        non_trainable_count = np.sum([K.count_params(w) for w in self.non_trainable_weights])
        total_count = trainable_count + non_trainable_count

        print('Total params: {:,}'.format(total_count))
        print('Trainable params: {:,}'.format(trainable_count))
        print('Non-trainable params: {:,}'.format(non_trainable_count))

        return total_count, trainable_count, non_trainable_count
    
    def print_epoch_statistics(self, params, summed_training_loss, summed_training_accuracy, message=None, require_result=False):
        """
        Evaluate the model and, when `params.verbose` is set, print the statistics.

        Nothing is evaluated when the output is neither printed nor required.
        Returns `(loss, accuracy, val_loss, val_accuracy)` if `require_result` is
        True, otherwise None.
        """
        if not params.verbose:
            # Silent mode: evaluate only if the caller needs the numbers
            return self.evaluate(params, summed_training_loss, summed_training_accuracy) if require_result else None

        results = self.evaluate(params, summed_training_loss, summed_training_accuracy)
        loss, accuracy, val_loss, val_accuracy = results

        if message is not None:
            print(message)

        print(f"loss: {loss} - accuracy: {accuracy} - val_loss: {val_loss} - val_accuracy: {val_accuracy} - penalty: {self.get_regularization_penalty()}")
        hidden_layer_sizes = self.get_hidden_layer_sizes()
        print(f"hidden layer sizes: {hidden_layer_sizes}, total units: {sum(hidden_layer_sizes)}")
        if params.print_neurons:
            self.print_neurons()

        if require_result:
            return loss, accuracy, val_loss, val_accuracy
    
    def update_history(self, params, loss, accuracy, val_loss, val_accuracy):
        """Append the given metrics and the current hidden layer sizes to `params.history`."""
        metrics = (
            ('loss', loss),
            ('accuracy', accuracy),
            ('val_loss', val_loss),
            ('val_accuracy', val_accuracy),
        )
        for key, value in metrics:
            params.history[key].append(value)
        params.history['hidden_layer_sizes'].append(self.get_hidden_layer_sizes())
    
    @staticmethod
    def prepare_datasets(x, y, batch_size, validation_data, shuffle_buffer_size=20000):
        """
        Build the batched and prefetched training and validation `tf.data` pipelines.

        Args:
            x, y: training inputs and targets.
            batch_size: batch size of both pipelines.
            validation_data: value passed to `from_tensor_slices` for the validation set.
            shuffle_buffer_size: buffer size used to shuffle the training set. It
                defaults to the previously hard-coded 20000; pass at least the number
                of training instances to shuffle over the whole set.

        Returns:
            Tuple `(train_dataset, val_dataset)`; only the training set is shuffled.
        """
        train_dataset = tf.data.Dataset.from_tensor_slices((x, y))
        train_dataset = train_dataset.shuffle(buffer_size=shuffle_buffer_size).batch(batch_size)
        val_dataset = tf.data.Dataset.from_tensor_slices(validation_data).batch(batch_size)
        return train_dataset.prefetch(tf.data.AUTOTUNE), val_dataset.prefetch(tf.data.AUTOTUNE)
    
    def manage_dynamic_regularization(self, params, val_loss):
        """
        Multiply the penalty by `params.regularization_penalty_multiplier` when training stalls.

        Training counts as stalled when `val_loss` is at least
        `params.best_conditional_val_loss * params.stall_coefficient`. The penalty
        is changed only when a stall begins; otherwise `val_loss` becomes the new
        reference value.
        """
        in_stall = val_loss >= params.best_conditional_val_loss * params.stall_coefficient
        if not in_stall:
            params.best_conditional_val_loss = val_loss
            params.training_stalled = False
            return

        # Training is currently in stall
        if params.training_stalled:
            # The penalty was already changed when this stall began
            return

        penalty = self.get_regularization_penalty() * params.regularization_penalty_multiplier
        print("Changing penalty...")
        # TODO this must be modified, penalty can differ for each layer
        self.set_regularization_penalty(penalty)
        params.training_stalled = True
    
    def grow_wrapper(self, params):
        """Grow the network, reporting statistics before and after.

        When ``params.regularization_penalty_multiplier`` differs from 1, the
        validation loss measured before growing is also passed to
        ``manage_dynamic_regularization``.

        Args:
            params: Parameter container forwarded to every helper called here.
        """
        penalty_is_dynamic = params.regularization_penalty_multiplier != 1.

        if not penalty_is_dynamic:
            self.print_epoch_statistics(params, None, None, "Before growing:")
        else:
            # The validation loss is needed here, so request the metrics explicitly.
            pre_growth_stats = self.print_epoch_statistics(params, None, None, "Before growing:", require_result=True)
            _, _, val_loss, _ = pre_growth_stats
            self.manage_dynamic_regularization(params, val_loss)

        self.grow(params)
        self.print_epoch_statistics(params, None, None, "After growing:")
    
    def prune_wrapper(self, params, summed_loss, summed_accuracy):
        """Prune the network and record the epoch in the training history.

        The history entry combines the training loss/accuracy reported before
        pruning with the validation loss/accuracy reported after pruning.

        Args:
            params: Parameter container forwarded to every helper called here.
            summed_loss: Training loss accumulated over the epoch.
            summed_accuracy: Training accuracy accumulated over the epoch.
        """
        train_loss, train_accuracy, _, _ = self.print_epoch_statistics(
            params, summed_loss, summed_accuracy, "Before pruning:", require_result=True
        )

        self.prune(params)

        _, _, pruned_val_loss, pruned_val_accuracy = self.print_epoch_statistics(
            params, None, None, "After pruning:", require_result=True
        )

        self.update_history(params, train_loss, train_accuracy, pruned_val_loss, pruned_val_accuracy)
    
    class ParameterContainer:
        """Settings and mutable bookkeeping shared by the helpers of one ``fit`` run."""

        def __init__(self, x, y, optimizer, batch_size, min_new_neurons, validation_data, pruning_threshold,
                     regularization_penalty_multiplier, stall_coefficient, growth_percentage, mini_epochs_per_epoch, verbose, print_neurons, use_static_graph):
            """Store every argument as an attribute and prepare derived state.

            Besides the plain attributes, this builds ``train_dataset`` and
            ``val_dataset`` via ``Sequential.prepare_datasets``, creates an
            empty ``history`` and initialises the stall-tracking fields read
            by ``manage_dynamic_regularization``.
            """
            # Data and optimisation inputs.
            self.x, self.y = x, y
            self.validation_data = validation_data
            self.optimizer = optimizer
            self.batch_size = batch_size
            self.mini_epochs_per_epoch = mini_epochs_per_epoch
            self.use_static_graph = use_static_graph

            # Growing / pruning settings.
            self.min_new_neurons = min_new_neurons
            self.growth_percentage = growth_percentage
            self.pruning_threshold = pruning_threshold

            # Dynamic-regularization settings.
            self.regularization_penalty_multiplier = regularization_penalty_multiplier
            self.stall_coefficient = stall_coefficient

            # Reporting switches.
            self.verbose = verbose
            self.print_neurons = print_neurons

            # Stall tracking: no validation loss seen yet, so any value improves on it.
            self.best_conditional_val_loss = np.inf
            self.training_stalled = False

            self.history = self.prepare_history()
            self.train_dataset, self.val_dataset = Sequential.prepare_datasets(x, y, batch_size, validation_data)

        @staticmethod
        def prepare_history():
            """Return a fresh history dict with one empty list per tracked quantity."""
            tracked_keys = ('loss', 'accuracy', 'val_loss', 'val_accuracy', 'hidden_layer_sizes')
            return {key: [] for key in tracked_keys}
    
    def fit_single_step(self, params, x_batch, y_batch):
        """Run one gradient update on a single batch.

        The optimised objective is the mean sparse categorical cross-entropy
        plus every loss registered on the model (``self.losses``).

        Args:
            params: Parameter container; only ``params.optimizer`` is used.
            x_batch: Batch of inputs fed to the model.
            y_batch: Batch of integer class labels.

        Returns:
            ``(loss, accuracy)``: the per-sample cross-entropy summed over the
            batch (registered losses excluded) and the number of correct
            predictions in the batch. Both are sums, not means.
        """
        with tf.GradientTape() as tape:
            predictions = self(x_batch, training=True)
            per_sample_loss = tf.keras.losses.sparse_categorical_crossentropy(y_batch, predictions)
            # Add losses registered by model.add_loss
            objective = tf.reduce_mean(per_sample_loss) + sum(self.losses)

        # Reporting values are not differentiated, so they are computed outside the tape.
        batch_loss = tf.reduce_sum(per_sample_loss)
        per_sample_hits = tf.keras.metrics.sparse_categorical_accuracy(y_batch, predictions)
        batch_accuracy = float(tf.reduce_sum(per_sample_hits))

        gradients = tape.gradient(objective, self.trainable_variables)
        params.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

        return batch_loss, batch_accuracy
    
    def fit_single_epoch(self, params):
        """Train for one schedule epoch, i.e. ``params.mini_epochs_per_epoch`` passes over the data.

        Args:
            params: Parameter container; ``mini_epochs_per_epoch``,
                ``use_static_graph`` and ``train_dataset`` are read here and
                the object is forwarded to ``fit_single_step``.

        Returns:
            ``(summed_loss, summed_accuracy)`` accumulated over the batches of
            the LAST mini-epoch only (the sums are reset at the start of every
            pass). Both are ``0`` when no mini-epoch is run.
        """
        # Wrap the step once per call rather than once per mini-epoch: nothing
        # changes between mini-epochs, and every new tf.function wrapper has to
        # trace its graph again on first use. A fresh wrapper is still created
        # on every call to this method.
        if params.use_static_graph:
            step_function = tf.function(self.fit_single_step)
        else:
            step_function = self.fit_single_step

        summed_loss = 0
        summed_accuracy = 0

        for _ in range(params.mini_epochs_per_epoch):
            # Restart the accumulators so only the most recent pass is reported.
            summed_loss = 0
            summed_accuracy = 0

            for x_batch, y_batch in params.train_dataset:
                loss, accuracy = step_function(params, x_batch, y_batch)
                summed_loss += loss
                summed_accuracy += accuracy

        return summed_loss, summed_accuracy

    def fit(self, x, y, optimizer, schedule, batch_size, min_new_neurons, validation_data, pruning_threshold=0.001, regularization_penalty_multiplier=1.,
            stall_coefficient=1, growth_percentage=0.2, mini_epochs_per_epoch=1, verbose=True, print_neurons=False, use_static_graph=True):
        """Train the model by walking through ``schedule`` one entry per epoch.

        For every schedule entry the regularization penalty and method are set
        from the entry, the network is optionally grown, one epoch is trained,
        and the network is optionally pruned. One history record is appended
        per entry.

        Args:
            x, y: Training inputs and targets; ``x.shape`` is used to build the model.
            optimizer: Optimizer used by ``fit_single_step``.
            schedule: Iterable of epoch entries exposing ``grow``, ``prune``,
                ``regularization_penalty`` and ``regularization_method``. It
                must support ``len()`` when ``verbose`` is true.
            batch_size, min_new_neurons, validation_data, pruning_threshold,
            regularization_penalty_multiplier, stall_coefficient,
            growth_percentage, mini_epochs_per_epoch, print_neurons,
            use_static_graph: Stored on the ``ParameterContainer`` and read by
                the helper methods.
            verbose: Print an epoch banner and per-epoch statistics.

        Returns:
            The history dict held by the parameter container.
        """
        container_settings = dict(
            x=x, y=y, optimizer=optimizer, batch_size=batch_size,
            min_new_neurons=min_new_neurons, validation_data=validation_data,
            pruning_threshold=pruning_threshold,
            regularization_penalty_multiplier=regularization_penalty_multiplier,
            stall_coefficient=stall_coefficient, growth_percentage=growth_percentage,
            mini_epochs_per_epoch=mini_epochs_per_epoch, verbose=verbose,
            print_neurons=print_neurons, use_static_graph=use_static_graph,
        )
        params = self.ParameterContainer(**container_settings)
        self.build(x.shape)  # Necessary when verbose == False

        for epoch_number, epoch in enumerate(schedule, start=1):
            if verbose:
                print("##########################################################")
                print(f"Epoch {epoch_number}/{len(schedule)}")

            # Each schedule entry carries its own regularization settings.
            self.set_regularization_penalty(epoch.regularization_penalty)
            self.set_regularization_method(epoch.regularization_method)

            if epoch.grow:
                self.grow_wrapper(params)

            summed_loss, summed_accuracy = self.fit_single_epoch(params)

            if epoch.prune:
                # prune_wrapper records the history entry itself.
                self.prune_wrapper(params, summed_loss, summed_accuracy)
                continue

            loss, accuracy, val_loss, val_accuracy = self.print_epoch_statistics(
                params, summed_loss, summed_accuracy, require_result=True
            )
            self.update_history(params, loss, accuracy, val_loss, val_accuracy)

        return params.history


################################################################################
# HELPER FUNCTIONS
################################################################################


def get_statistics_from_history(history):
    """Return the best validation accuracy of a run and the layer sizes at that epoch.

    Args:
        history: Dict with parallel lists under ``'val_accuracy'`` and
            ``'hidden_layer_sizes'``.

    Returns:
        ``(best_val_accuracy, best_hidden_layer_sizes)`` taken from the epoch
        that ``np.argmax`` selects (the first one in case of ties).
    """
    val_accuracies = history['val_accuracy']
    best_index = np.argmax(val_accuracies)
    return val_accuracies[best_index], history['hidden_layer_sizes'][best_index]


def get_statistics_from_histories(histories):
    """Average the best-epoch statistics over several training histories.

    Args:
        histories: Iterable of history dicts accepted by
            ``get_statistics_from_history``.

    Returns:
        ``(mean_best_val_accuracy, mean_best_hidden_layer_sizes)``, where the
        second item holds one mean per layer position. Layers are aligned with
        ``zip``, so positions beyond the shortest size list are dropped.
    """
    per_run_best = [get_statistics_from_history(history) for history in histories]
    best_accuracies = [accuracy for accuracy, _ in per_run_best]
    best_sizes = [sizes for _, sizes in per_run_best]

    mean_accuracy = np.mean(best_accuracies)
    # Transpose run-major size lists into layer-major tuples before averaging.
    mean_sizes = [np.mean(layer_sizes) for layer_sizes in zip(*best_sizes)]

    return mean_accuracy, mean_sizes


def cross_validate(train_fn, x, y, n_splits, random_state=42, *args, **kwargs):
    """Run ``train_fn`` on every fold of a shuffled K-fold split and summarise the results.

    Args:
        train_fn: Callable invoked as
            ``train_fn(x_fold, y_fold, *args, validation_data=(x_val, y_val), **kwargs)``
            and expected to return a history dict.
        x, y: Arrays indexable with the integer index arrays produced by ``KFold``.
        n_splits: Number of folds.
        random_state: Seed for the fold shuffling.
        *args, **kwargs: Forwarded to ``train_fn``.
            NOTE(review): ``random_state`` precedes ``*args`` in the signature,
            so the first extra positional argument binds to ``random_state``.

    Returns:
        ``(histories, mean_best_hidden_layer_sizes)``.
    """
    from sklearn.model_selection import KFold

    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    histories = []
    for fold_no, (fit_index, val_index) in enumerate(splitter.split(x)):
        x_fit, x_val = x[fit_index], x[val_index]
        y_fit, y_val = y[fit_index], y[val_index]

        fold_history = train_fn(x_fit, y_fit, *args, validation_data=(x_val, y_val), **kwargs)
        histories.append(fold_history)

        fold_best_accuracy, fold_best_sizes = get_statistics_from_history(fold_history)
        print(f"Run {fold_no} completed, best_val_accuracy: {fold_best_accuracy}, best_hidden_layer_sizes: {fold_best_sizes}")

    overall_accuracy, overall_sizes = get_statistics_from_histories(histories)
    print(f'mean_best_val_accuracy: {overall_accuracy}')
    print(f'mean_best_hidden_layer_sizes: {overall_sizes}')

    return histories, overall_sizes


def hyperparameter_search(train_fn, x, y, validation_data, *args, **kwargs):
    """Grid-search ``train_fn`` over the Cartesian product of candidate values.

    Args:
        train_fn: Callable invoked as
            ``train_fn(x, y, validation_data, *positional, **keywords)`` and
            expected to return a history dict.
        x, y, validation_data: Passed to ``train_fn`` unchanged on every run.
        *args: One iterable of candidate values per positional parameter.
        **kwargs: One iterable of candidate values per keyword parameter.

    Returns:
        ``(histories, best_overall_combination)``. Every history gets an extra
        ``'parameters'`` entry holding the combination that produced it. The
        best combination is the first one reaching the highest best validation
        accuracy, or ``None`` when the grid is empty.
    """
    from itertools import product

    positional_count = len(args)
    keyword_names = list(kwargs.keys())
    candidate_grid = list(args) + list(kwargs.values())

    histories = []
    best_overall_val_accuracy = -np.inf
    best_overall_combination = None

    for combination in product(*candidate_grid):
        # Split the flat combination back into positional and keyword values.
        positional_values = combination[:positional_count]
        keyword_values = dict(zip(keyword_names, combination[positional_count:]))

        history = train_fn(x, y, validation_data, *positional_values, **keyword_values)
        history['parameters'] = combination
        histories.append(history)

        run_best_accuracy, run_best_sizes = get_statistics_from_history(history)
        print(f"Run with parameters {combination} completed, best_val_accuracy: {run_best_accuracy}, best_hidden_layer_sizes sizes: {run_best_sizes}")

        if run_best_accuracy > best_overall_val_accuracy:
            best_overall_val_accuracy, best_overall_combination = run_best_accuracy, combination

    print(f'Best overall combination: {best_overall_combination}, val_accuracy: {best_overall_val_accuracy}')

    return histories, best_overall_combination


def get_convolutional_model(x, layer_sizes, output_neurons=10):
    """Build the convolutional ``Sequential`` model used by the experiments.

    The stack is four 3x3 SELU convolutions (strides 1, 2, 1, 2) with dropout
    after the second (0.2) and fourth (0.5), then a flatten, one hidden dense
    layer and a fixed-size softmax output layer.

    Args:
        x: Input array; ``x[0, :, :, :].shape`` is used as the input shape.
        layer_sizes: Five sizes: four convolution widths followed by the
            hidden dense layer width.
        output_neurons: Number of units in the softmax output layer.

    Returns:
        The constructed (not yet built) ``Sequential`` model.
    """
    # Settings shared by every convolution; only width, stride and input shape vary.
    conv_settings = dict(filter_size=(3, 3), activation='selu', padding='SAME', kernel_initializer='lecun_normal')
    dense_settings = dict(activation='selu', kernel_initializer='lecun_normal')

    layers = [
        Conv2D(layer_sizes[0], strides=(1, 1), input_shape=x[0, :, :, :].shape, **conv_settings),
        Conv2D(layer_sizes[1], strides=(2, 2), **conv_settings),
        tf.keras.layers.Dropout(0.2),
        Conv2D(layer_sizes[2], strides=(1, 1), **conv_settings),
        Conv2D(layer_sizes[3], strides=(2, 2), **conv_settings),
        tf.keras.layers.Dropout(0.5),
        Flatten(),
        Dense(layer_sizes[4], **dense_settings),
        Dense(output_neurons, activation='softmax', fixed_size=True),
    ]
    return Sequential(layers)


def train_fn(x, y, validation_data, learning_rate, schedule, layer_sizes, output_neurons=10, min_new_neurons=20, 
             growth_percentage=0.2, verbose=False, use_static_graph=True, batch_size=128):
    """Build a fresh convolutional model, train it with Adam and return its history.

    Args:
        x, y: Training inputs and targets.
        validation_data: Validation data forwarded to ``model.fit``.
        learning_rate: Learning rate of the Adam optimizer.
        schedule: Epoch schedule forwarded to ``model.fit``.
        layer_sizes: Layer sizes forwarded to ``get_convolutional_model``.
        output_neurons: Number of output units of the model.
        min_new_neurons: Forwarded to ``model.fit``.
        growth_percentage: Forwarded to ``model.fit``.
        verbose: Forwarded to ``model.fit``.
        use_static_graph: Forwarded to ``model.fit``.
        batch_size: Mini-batch size forwarded to ``model.fit``. It defaults to
            128, the value that used to be hard-coded here, so existing
            callers are unaffected.

    Returns:
        The history dict returned by ``model.fit``.
    """
    model = get_convolutional_model(x, layer_sizes, output_neurons)

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    history = model.fit(x=x, y=y, optimizer=optimizer, schedule=schedule, batch_size=batch_size, min_new_neurons=min_new_neurons,
                        validation_data=validation_data, growth_percentage=growth_percentage, verbose=verbose, use_static_graph=use_static_graph)

    return history

# Final performance analysis

## Dynamic Auto-Sizing

In [4]:
# Load CIFAR-100 once; the experiment cells below read their arrays from `cifar100`.
cifar100 = get_cifar_100_dataset()

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz


In [None]:
%%time

# Learning-rate search for the dynamic auto-sizing schedule: 20 DynamicEpoch(0.00002, 'weighted_l1')
# entries followed by 20 StaticEpochNoRegularization() entries.
# Every keyword after validation_data is a LIST of candidates; hyperparameter_search tries their
# Cartesian product, so only learning_rate actually varies here.
schedule = Schedule([DynamicEpoch(0.00002, 'weighted_l1')] * 20 + [StaticEpochNoRegularization()] * 20)
histories, best_overall_combination = hyperparameter_search(train_fn, x=cifar100.X_train_norm, y=cifar100.y_train, validation_data=(cifar100.X_test_norm, cifar100.y_test), 
                                  learning_rate=[0.0001, 0.0002, 0.0004], schedule=[schedule], layer_sizes=[[100, 100, 100, 100, 100]], 
                                  output_neurons=[100], min_new_neurons=[20], growth_percentage=[0.2])

Run with parameters (0.0001, ec4ba8ef9e, [100, 100, 100, 100, 100], 100, 20, 0.2) completed, best_val_accuracy: 0.44, best_hidden_layer_sizes sizes: [90, 29, 38, 43, 100]
Run with parameters (0.0002, ec4ba8ef9e, [100, 100, 100, 100, 100], 100, 20, 0.2) completed, best_val_accuracy: 0.4448, best_hidden_layer_sizes sizes: [67, 23, 27, 73, 195]
Run with parameters (0.0004, ec4ba8ef9e, [100, 100, 100, 100, 100], 100, 20, 0.2) completed, best_val_accuracy: 0.4358, best_hidden_layer_sizes sizes: [60, 18, 55, 120, 452]
Best overall combination: (0.0002, ec4ba8ef9e, [100, 100, 100, 100, 100], 100, 20, 0.2), val_accuracy: 0.4448
CPU times: user 9min 57s, sys: 20.8 s, total: 10min 18s
Wall time: 9min 3s


In [None]:
# Combinations follow the keyword order of the search call, so index 0 is the learning rate.
best_learning_rate = best_overall_combination[0]
best_learning_rate

0.0002

In [None]:
%%time

# 6-fold cross-validation of the dynamic auto-sizing schedule on cifar100.X_norm / cifar100.y,
# using the learning rate selected by the search above. The returned mean layer sizes define the
# fixed architecture of the baselines that follow.
schedule = Schedule([DynamicEpoch(0.00002, 'weighted_l1')] * 20 + [StaticEpochNoRegularization()] * 20)
histories, mean_best_hidden_layer_sizes = cross_validate(
    train_fn, cifar100.X_norm, cifar100.y, n_splits=6, learning_rate=best_learning_rate,
    schedule=schedule, layer_sizes=[100, 100, 100, 100, 100], output_neurons=100, min_new_neurons=20, growth_percentage=0.2
)

Run 0 completed, best_val_accuracy: 0.4428, best_hidden_layer_sizes: [68, 19, 33, 74, 191]
Run 1 completed, best_val_accuracy: 0.4512, best_hidden_layer_sizes: [62, 22, 29, 61, 195]
Run 2 completed, best_val_accuracy: 0.4484, best_hidden_layer_sizes: [67, 19, 30, 61, 200]
Run 3 completed, best_val_accuracy: 0.4485, best_hidden_layer_sizes: [67, 18, 33, 63, 199]
Run 4 completed, best_val_accuracy: 0.4577, best_hidden_layer_sizes: [62, 20, 27, 62, 190]
Run 5 completed, best_val_accuracy: 0.457, best_hidden_layer_sizes: [69, 23, 28, 66, 192]
mean_best_val_accuracy: 0.45093333333333335
mean_best_hidden_layer_sizes: [65.83333333333333, 20.166666666666668, 30.0, 64.5, 194.5]
CPU times: user 19min 26s, sys: 41.6 s, total: 20min 8s
Wall time: 17min 17s


In [5]:
# Hard-coded copy of the mean_best_hidden_layer_sizes printed by the dynamic cross-validation run
# above; presumably pasted so the cells below can run without repeating that cross-validation.
mean_best_hidden_layer_sizes = [65.83333333333333, 20.166666666666668, 30.0, 64.5, 194.5]

In [6]:
# Turn the mean sizes into integer layer widths. Python's round() sends exact halves to the
# nearest even integer, which is why 64.5 becomes 64 and 194.5 becomes 194.
rounded_mean_best_hidden_layer_sizes = [round(x) for x in mean_best_hidden_layer_sizes]
rounded_mean_best_hidden_layer_sizes

[66, 20, 30, 64, 194]

## Static, no regularization

In [7]:
%%time

# Baseline: fixed architecture [66, 20, 30, 64, 194] (the rounded mean sizes) trained for
# 40 StaticEpochNoRegularization() entries; only the learning rate is searched.
schedule = Schedule([StaticEpochNoRegularization()] * 40)
histories, best_overall_combination = hyperparameter_search(train_fn, x=cifar100.X_train_norm, y=cifar100.y_train, validation_data=(cifar100.X_test_norm, cifar100.y_test), 
                                  learning_rate=[0.0001, 0.0002, 0.0004], schedule=[schedule], layer_sizes=[[66, 20, 30, 64, 194]], 
                                  output_neurons=[100], min_new_neurons=[20], growth_percentage=[0.2])

Run with parameters (0.0001, 4a0f172746, [66, 20, 30, 64, 194], 100, 20, 0.2) completed, best_val_accuracy: 0.3175, best_hidden_layer_sizes sizes: [66, 20, 30, 64, 194]
Run with parameters (0.0002, 4a0f172746, [66, 20, 30, 64, 194], 100, 20, 0.2) completed, best_val_accuracy: 0.3379, best_hidden_layer_sizes sizes: [66, 20, 30, 64, 194]
Run with parameters (0.0004, 4a0f172746, [66, 20, 30, 64, 194], 100, 20, 0.2) completed, best_val_accuracy: 0.3427, best_hidden_layer_sizes sizes: [66, 20, 30, 64, 194]
Best overall combination: (0.0004, 4a0f172746, [66, 20, 30, 64, 194], 100, 20, 0.2), val_accuracy: 0.3427
CPU times: user 7min 14s, sys: 30.5 s, total: 7min 45s
Wall time: 6min 3s


In [12]:
%%time

# Extra static run at learning rate 0.0008, one doubling above the grid searched in the previous cell.
# NOTE: this overwrites `histories` and `best_overall_combination` with the single-run result.
schedule = Schedule([StaticEpochNoRegularization()] * 40)
histories, best_overall_combination = hyperparameter_search(train_fn, x=cifar100.X_train_norm, y=cifar100.y_train, validation_data=(cifar100.X_test_norm, cifar100.y_test), 
                                  learning_rate=[0.0008], schedule=[schedule], layer_sizes=[[66, 20, 30, 64, 194]], 
                                  output_neurons=[100], min_new_neurons=[20], growth_percentage=[0.2])

Run with parameters (0.0008, 4a0f172746, [66, 20, 30, 64, 194], 100, 20, 0.2) completed, best_val_accuracy: 0.3319, best_hidden_layer_sizes sizes: [66, 20, 30, 64, 194]
Best overall combination: (0.0008, 4a0f172746, [66, 20, 30, 64, 194], 100, 20, 0.2), val_accuracy: 0.3319
CPU times: user 2min 21s, sys: 10.3 s, total: 2min 32s
Wall time: 1min 55s


In [13]:
%%time

# 6-fold cross-validation of the static, unregularized baseline. The learning rate is hard-coded to
# 0.0004, the best value reported by the three-value search above (0.0008 scored lower).
schedule = Schedule([StaticEpochNoRegularization()] * 40)
histories, mean_best_hidden_layer_sizes = cross_validate(
    train_fn, cifar100.X_norm, cifar100.y, n_splits=6, learning_rate=0.0004,
    schedule=schedule, layer_sizes=[66, 20, 30, 64, 194], output_neurons=100, min_new_neurons=20, growth_percentage=0.2
)

Run 0 completed, best_val_accuracy: 0.3312, best_hidden_layer_sizes: [66, 20, 30, 64, 194]
Run 1 completed, best_val_accuracy: 0.3364, best_hidden_layer_sizes: [66, 20, 30, 64, 194]
Run 2 completed, best_val_accuracy: 0.3348, best_hidden_layer_sizes: [66, 20, 30, 64, 194]
Run 3 completed, best_val_accuracy: 0.3389, best_hidden_layer_sizes: [66, 20, 30, 64, 194]
Run 4 completed, best_val_accuracy: 0.336, best_hidden_layer_sizes: [66, 20, 30, 64, 194]
Run 5 completed, best_val_accuracy: 0.3362, best_hidden_layer_sizes: [66, 20, 30, 64, 194]
mean_best_val_accuracy: 0.33558333333333334
mean_best_hidden_layer_sizes: [66.0, 20.0, 30.0, 64.0, 194.0]
CPU times: user 14min 2s, sys: 48.2 s, total: 14min 51s
Wall time: 11min 17s


In [None]:
%%time

# Static, unregularized baseline searched over five learning rates, taking the layer sizes from
# rounded_mean_best_hidden_layer_sizes instead of a literal list.
schedule = Schedule([StaticEpochNoRegularization()] * 40)
histories, best_overall_combination = hyperparameter_search(train_fn, x=cifar100.X_train_norm, y=cifar100.y_train, validation_data=(cifar100.X_test_norm, cifar100.y_test), 
                                  learning_rate=[0.0001, 0.0002, 0.0004, 0.0008, 0.0016], schedule=[schedule], layer_sizes=[rounded_mean_best_hidden_layer_sizes], 
                                  output_neurons=[100], min_new_neurons=[20], growth_percentage=[0.2])

Run with parameters (0.0001, 4a0f172746, [66, 20, 30, 64, 194], 100, 20, 0.2) completed, best_val_accuracy: 0.3397, best_hidden_layer_sizes sizes: [66, 20, 30, 64, 194]
Run with parameters (0.0002, 4a0f172746, [66, 20, 30, 64, 194], 100, 20, 0.2) completed, best_val_accuracy: 0.3392, best_hidden_layer_sizes sizes: [66, 20, 30, 64, 194]
Run with parameters (0.0004, 4a0f172746, [66, 20, 30, 64, 194], 100, 20, 0.2) completed, best_val_accuracy: 0.334, best_hidden_layer_sizes sizes: [66, 20, 30, 64, 194]
Run with parameters (0.0008, 4a0f172746, [66, 20, 30, 64, 194], 100, 20, 0.2) completed, best_val_accuracy: 0.3272, best_hidden_layer_sizes sizes: [66, 20, 30, 64, 194]
Run with parameters (0.0016, 4a0f172746, [66, 20, 30, 64, 194], 100, 20, 0.2) completed, best_val_accuracy: 0.3199, best_hidden_layer_sizes sizes: [66, 20, 30, 64, 194]
Best overall combination: (0.0001, 4a0f172746, [66, 20, 30, 64, 194], 100, 20, 0.2), val_accuracy: 0.3397
CPU times: user 11min 56s, sys: 29 s, total: 12min

In [None]:
# Index 0 of the winning combination is the learning rate (first keyword of the search call).
best_learning_rate = best_overall_combination[0]
best_learning_rate

0.0001

In [None]:
%%time

# 6-fold cross-validation of the static, unregularized baseline with the learning rate chosen by
# the five-value search above.
schedule = Schedule([StaticEpochNoRegularization()] * 40)
histories, mean_best_hidden_layer_sizes = cross_validate(
    train_fn, cifar100.X_norm, cifar100.y, n_splits=6, learning_rate=best_learning_rate,
    schedule=schedule, layer_sizes=rounded_mean_best_hidden_layer_sizes, output_neurons=100, min_new_neurons=20, growth_percentage=0.2
)

Run 0 completed, best_val_accuracy: 0.3183, best_hidden_layer_sizes: [66, 20, 30, 64, 194]
Run 1 completed, best_val_accuracy: 0.3265, best_hidden_layer_sizes: [66, 20, 30, 64, 194]
Run 2 completed, best_val_accuracy: 0.3134, best_hidden_layer_sizes: [66, 20, 30, 64, 194]
Run 3 completed, best_val_accuracy: 0.3345, best_hidden_layer_sizes: [66, 20, 30, 64, 194]
Run 4 completed, best_val_accuracy: 0.325, best_hidden_layer_sizes: [66, 20, 30, 64, 194]
Run 5 completed, best_val_accuracy: 0.3234, best_hidden_layer_sizes: [66, 20, 30, 64, 194]
mean_best_val_accuracy: 0.3235166666666667
mean_best_hidden_layer_sizes: [66.0, 20.0, 30.0, 64.0, 194.0]
CPU times: user 14min 32s, sys: 34.6 s, total: 15min 7s
Wall time: 11min 33s


## Pruning only, weighted L1 + fine-tuning

In [8]:
%%time

# Pruning-only variant with fine-tuning: 20 Epoch(False, True, 0.00002, 'weighted_l1') entries
# (presumably grow=False, prune=True; confirm against the Epoch constructor), followed by
# 20 StaticEpochNoRegularization() entries. Only the learning rate is searched.
schedule = Schedule([Epoch(False, True, 0.00002, 'weighted_l1')] * 20 + [StaticEpochNoRegularization()] * 20)
histories, best_overall_combination = hyperparameter_search(train_fn, x=cifar100.X_train_norm, y=cifar100.y_train, validation_data=(cifar100.X_test_norm, cifar100.y_test), 
                                  learning_rate=[0.0001, 0.0002, 0.0004], schedule=[schedule], layer_sizes=[[66, 20, 30, 64, 194]], 
                                  output_neurons=[100], min_new_neurons=[20], growth_percentage=[0.2])

Run with parameters (0.0001, 758db6f544, [66, 20, 30, 64, 194], 100, 20, 0.2) completed, best_val_accuracy: 0.4388, best_hidden_layer_sizes sizes: [66, 20, 30, 51, 194]
Run with parameters (0.0002, 758db6f544, [66, 20, 30, 64, 194], 100, 20, 0.2) completed, best_val_accuracy: 0.4495, best_hidden_layer_sizes sizes: [60, 20, 29, 60, 194]
Run with parameters (0.0004, 758db6f544, [66, 20, 30, 64, 194], 100, 20, 0.2) completed, best_val_accuracy: 0.4394, best_hidden_layer_sizes sizes: [46, 20, 30, 64, 194]
Best overall combination: (0.0002, 758db6f544, [66, 20, 30, 64, 194], 100, 20, 0.2), val_accuracy: 0.4495
CPU times: user 8min 55s, sys: 22.3 s, total: 9min 18s
Wall time: 7min 18s


In [14]:
%%time

# 6-fold cross-validation of the pruning-only + fine-tuning schedule. The learning rate is
# hard-coded to 0.0002, the best value reported by the search above.
schedule = Schedule([Epoch(False, True, 0.00002, 'weighted_l1')] * 20 + [StaticEpochNoRegularization()] * 20)
histories, mean_best_hidden_layer_sizes = cross_validate(
    train_fn, cifar100.X_norm, cifar100.y, n_splits=6, learning_rate=0.0002,
    schedule=schedule, layer_sizes=[66, 20, 30, 64, 194], output_neurons=100, min_new_neurons=20, growth_percentage=0.2
)

Run 0 completed, best_val_accuracy: 0.45, best_hidden_layer_sizes: [63, 20, 30, 58, 194]
Run 1 completed, best_val_accuracy: 0.4486, best_hidden_layer_sizes: [62, 20, 30, 64, 194]
Run 2 completed, best_val_accuracy: 0.4473, best_hidden_layer_sizes: [60, 20, 30, 62, 194]
Run 3 completed, best_val_accuracy: 0.4588, best_hidden_layer_sizes: [60, 20, 30, 60, 194]
Run 4 completed, best_val_accuracy: 0.4508, best_hidden_layer_sizes: [60, 20, 30, 63, 194]
Run 5 completed, best_val_accuracy: 0.4444, best_hidden_layer_sizes: [61, 20, 30, 63, 194]
mean_best_val_accuracy: 0.44998333333333335
mean_best_hidden_layer_sizes: [61.0, 20.0, 30.0, 61.666666666666664, 194.0]
CPU times: user 17min 32s, sys: 42.3 s, total: 18min 14s
Wall time: 14min 18s


## Pruning only, weighted L1

In [9]:
%%time

# Pruning-only variant without a fine-tuning phase: all 40 entries are
# Epoch(False, True, 0.00002, 'weighted_l1'). Only the learning rate is searched.
schedule = Schedule([Epoch(False, True, 0.00002, 'weighted_l1')] * 40)
histories, best_overall_combination = hyperparameter_search(train_fn, x=cifar100.X_train_norm, y=cifar100.y_train, validation_data=(cifar100.X_test_norm, cifar100.y_test), 
                                  learning_rate=[0.0001, 0.0002, 0.0004], schedule=[schedule], layer_sizes=[[66, 20, 30, 64, 194]], 
                                  output_neurons=[100], min_new_neurons=[20], growth_percentage=[0.2])

Run with parameters (0.0001, 865b7710ae, [66, 20, 30, 64, 194], 100, 20, 0.2) completed, best_val_accuracy: 0.4217, best_hidden_layer_sizes sizes: [63, 20, 30, 43, 142]
Run with parameters (0.0002, 865b7710ae, [66, 20, 30, 64, 194], 100, 20, 0.2) completed, best_val_accuracy: 0.437, best_hidden_layer_sizes sizes: [52, 20, 29, 63, 194]
Run with parameters (0.0004, 865b7710ae, [66, 20, 30, 64, 194], 100, 20, 0.2) completed, best_val_accuracy: 0.4319, best_hidden_layer_sizes sizes: [37, 20, 27, 64, 194]
Best overall combination: (0.0002, 865b7710ae, [66, 20, 30, 64, 194], 100, 20, 0.2), val_accuracy: 0.437
CPU times: user 10min 45s, sys: 26.9 s, total: 11min 12s
Wall time: 8min 58s


In [None]:
%%time

# 6-fold cross-validation of the pruning-only schedule. The learning rate is hard-coded to 0.0002,
# the best value reported by the search above.
schedule = Schedule([Epoch(False, True, 0.00002, 'weighted_l1')] * 40)
histories, mean_best_hidden_layer_sizes = cross_validate(
    train_fn, cifar100.X_norm, cifar100.y, n_splits=6, learning_rate=0.0002,
    schedule=schedule, layer_sizes=[66, 20, 30, 64, 194], output_neurons=100, min_new_neurons=20, growth_percentage=0.2
)

Run 0 completed, best_val_accuracy: 0.4338, best_hidden_layer_sizes: [46, 20, 28, 64, 194]
Run 1 completed, best_val_accuracy: 0.4405, best_hidden_layer_sizes: [48, 20, 29, 63, 194]
Run 2 completed, best_val_accuracy: 0.4322, best_hidden_layer_sizes: [44, 20, 27, 63, 194]
Run 3 completed, best_val_accuracy: 0.44, best_hidden_layer_sizes: [47, 20, 26, 62, 193]


## Static, weighted L1 + fine-tuning

In [10]:
%%time

# Fixed architecture with regularization and fine-tuning: 20 StaticEpoch(0.00002, 'weighted_l1')
# entries followed by 20 StaticEpochNoRegularization() entries. Only the learning rate is searched.
schedule = Schedule([StaticEpoch(0.00002, 'weighted_l1')] * 20 + [StaticEpochNoRegularization()] * 20)
histories, best_overall_combination = hyperparameter_search(train_fn, x=cifar100.X_train_norm, y=cifar100.y_train, validation_data=(cifar100.X_test_norm, cifar100.y_test), 
                                  learning_rate=[0.0001, 0.0002, 0.0004], schedule=[schedule], layer_sizes=[[66, 20, 30, 64, 194]], 
                                  output_neurons=[100], min_new_neurons=[20], growth_percentage=[0.2])

Run with parameters (0.0001, e7901b090a, [66, 20, 30, 64, 194], 100, 20, 0.2) completed, best_val_accuracy: 0.3984, best_hidden_layer_sizes sizes: [66, 20, 30, 64, 194]
Run with parameters (0.0002, e7901b090a, [66, 20, 30, 64, 194], 100, 20, 0.2) completed, best_val_accuracy: 0.4299, best_hidden_layer_sizes sizes: [66, 20, 30, 64, 194]
Run with parameters (0.0004, e7901b090a, [66, 20, 30, 64, 194], 100, 20, 0.2) completed, best_val_accuracy: 0.436, best_hidden_layer_sizes sizes: [66, 20, 30, 64, 194]
Best overall combination: (0.0004, e7901b090a, [66, 20, 30, 64, 194], 100, 20, 0.2), val_accuracy: 0.436
CPU times: user 7min 48s, sys: 27.9 s, total: 8min 16s
Wall time: 6min 18s


In [None]:
%%time

# 6-fold cross-validation of the static weighted-L1 + fine-tuning schedule. The learning rate is
# hard-coded to 0.0004, the best value reported by the search above.
schedule = Schedule([StaticEpoch(0.00002, 'weighted_l1')] * 20 + [StaticEpochNoRegularization()] * 20)
histories, mean_best_hidden_layer_sizes = cross_validate(
    train_fn, cifar100.X_norm, cifar100.y, n_splits=6, learning_rate=0.0004,
    schedule=schedule, layer_sizes=[66, 20, 30, 64, 194], output_neurons=100, min_new_neurons=20, growth_percentage=0.2
)

## Static, weighted L1

In [11]:
%%time

# Fixed architecture regularized throughout: all 40 entries are StaticEpoch(0.00002, 'weighted_l1').
# Only the learning rate is searched.
schedule = Schedule([StaticEpoch(0.00002, 'weighted_l1')] * 40)
histories, best_overall_combination = hyperparameter_search(train_fn, x=cifar100.X_train_norm, y=cifar100.y_train, validation_data=(cifar100.X_test_norm, cifar100.y_test), 
                                  learning_rate=[0.0001, 0.0002, 0.0004], schedule=[schedule], layer_sizes=[[66, 20, 30, 64, 194]], 
                                  output_neurons=[100], min_new_neurons=[20], growth_percentage=[0.2])

Run with parameters (0.0001, f4be7cd642, [66, 20, 30, 64, 194], 100, 20, 0.2) completed, best_val_accuracy: 0.3692, best_hidden_layer_sizes sizes: [66, 20, 30, 64, 194]
Run with parameters (0.0002, f4be7cd642, [66, 20, 30, 64, 194], 100, 20, 0.2) completed, best_val_accuracy: 0.406, best_hidden_layer_sizes sizes: [66, 20, 30, 64, 194]
Run with parameters (0.0004, f4be7cd642, [66, 20, 30, 64, 194], 100, 20, 0.2) completed, best_val_accuracy: 0.4305, best_hidden_layer_sizes sizes: [66, 20, 30, 64, 194]
Best overall combination: (0.0004, f4be7cd642, [66, 20, 30, 64, 194], 100, 20, 0.2), val_accuracy: 0.4305
CPU times: user 8min 16s, sys: 27.1 s, total: 8min 43s
Wall time: 6min 40s


In [None]:
%%time

# 6-fold cross-validation of the static weighted-L1 schedule. The learning rate is hard-coded to
# 0.0004, the best value reported by the search above.
schedule = Schedule([StaticEpoch(0.00002, 'weighted_l1')] * 40)
histories, mean_best_hidden_layer_sizes = cross_validate(
    train_fn, cifar100.X_norm, cifar100.y, n_splits=6, learning_rate=0.0004,
    schedule=schedule, layer_sizes=[66, 20, 30, 64, 194], output_neurons=100, min_new_neurons=20, growth_percentage=0.2
)

# Legacy code

In [None]:
# Geometric grid of seven learning rates: 0.0001 doubled six times.
learning_rates = [0.0001 * 2 ** i for i in range(7)]
learning_rates

[0.0001, 0.0002, 0.0004, 0.0008, 0.0016, 0.0032, 0.0064]

## CIFAR100

In [None]:
# Same rounding as in the analysis section above; depends on whatever value
# mean_best_hidden_layer_sizes holds when this cell is executed.
rounded_mean_best_hidden_layer_sizes = [round(x) for x in mean_best_hidden_layer_sizes]
rounded_mean_best_hidden_layer_sizes

[66, 20, 30, 64, 194]

In [None]:
# Parameter count of the architecture found by dynamic auto-sizing (rounded mean sizes).
# The model must be built before its parameters can be counted.
model = get_convolutional_model(cifar100.X_norm, layer_sizes=[66, 20, 30, 64, 194], output_neurons=100)
model.build(cifar100.X_norm.shape)
model.count_params()

Total params: 850,840.0
Trainable params: 850,840
Non-trainable params: 0.0


(850840.0, 850840, 0.0)

In [None]:
# Parameter count of a uniform-width alternative (60 filters in each convolution); the recorded
# totals are 863,714 here versus 850,840 for [66, 20, 30, 64, 194].
model = get_convolutional_model(cifar100.X_norm, layer_sizes=[60, 60, 60, 60, 194], output_neurons=100)
model.build(cifar100.X_norm.shape)
model.count_params()

Total params: 863,714.0
Trainable params: 863,714
Non-trainable params: 0.0


(863714.0, 863714, 0.0)

In [None]:
%%time

# Static, unregularized training of the uniform-width architecture [60, 60, 60, 60, 194],
# searched over five learning rates.
schedule = Schedule([StaticEpochNoRegularization()] * 40)
histories, best_overall_combination = hyperparameter_search(train_fn, x=cifar100.X_train_norm, y=cifar100.y_train, validation_data=(cifar100.X_test_norm, cifar100.y_test), 
                                  learning_rate=[0.0001, 0.0002, 0.0004, 0.0008, 0.0016], schedule=[schedule], layer_sizes=[[60, 60, 60, 60, 194]], 
                                  output_neurons=[100], min_new_neurons=[20], growth_percentage=[0.2])

Run with parameters (0.0001, 4a0f172746, [60, 60, 60, 60, 194], 100, 20, 0.2) completed, best_val_accuracy: 0.3466, best_hidden_layer_sizes sizes: [60, 60, 60, 60, 194]
Run with parameters (0.0002, 4a0f172746, [60, 60, 60, 60, 194], 100, 20, 0.2) completed, best_val_accuracy: 0.3612, best_hidden_layer_sizes sizes: [60, 60, 60, 60, 194]
Run with parameters (0.0004, 4a0f172746, [60, 60, 60, 60, 194], 100, 20, 0.2) completed, best_val_accuracy: 0.3489, best_hidden_layer_sizes sizes: [60, 60, 60, 60, 194]
Run with parameters (0.0008, 4a0f172746, [60, 60, 60, 60, 194], 100, 20, 0.2) completed, best_val_accuracy: 0.35, best_hidden_layer_sizes sizes: [60, 60, 60, 60, 194]
Run with parameters (0.0016, 4a0f172746, [60, 60, 60, 60, 194], 100, 20, 0.2) completed, best_val_accuracy: 0.3435, best_hidden_layer_sizes sizes: [60, 60, 60, 60, 194]
Best overall combination: (0.0002, 4a0f172746, [60, 60, 60, 60, 194], 100, 20, 0.2), val_accuracy: 0.3612
CPU times: user 12min 5s, sys: 27.7 s, total: 12min

In [None]:
# Index 0 of the winning combination is the learning rate (first keyword of the search call).
best_learning_rate = best_overall_combination[0]
best_learning_rate

0.0002

In [None]:
%%time

# 6-fold cross-validation of the uniform-width architecture with the learning rate selected above.
schedule = Schedule([StaticEpochNoRegularization()] * 40)
histories, mean_best_hidden_layer_sizes = cross_validate(
    train_fn, cifar100.X_norm, cifar100.y, n_splits=6, learning_rate=best_learning_rate,
    schedule=schedule, layer_sizes=[60, 60, 60, 60, 194], output_neurons=100, min_new_neurons=20, growth_percentage=0.2
)

Run 0 completed, best_val_accuracy: 0.3412, best_hidden_layer_sizes: [60, 60, 60, 60, 194]
Run 1 completed, best_val_accuracy: 0.3624, best_hidden_layer_sizes: [60, 60, 60, 60, 194]
Run 2 completed, best_val_accuracy: 0.3531, best_hidden_layer_sizes: [60, 60, 60, 60, 194]
Run 3 completed, best_val_accuracy: 0.3609, best_hidden_layer_sizes: [60, 60, 60, 60, 194]
Run 4 completed, best_val_accuracy: 0.3543, best_hidden_layer_sizes: [60, 60, 60, 60, 194]
Run 5 completed, best_val_accuracy: 0.3513, best_hidden_layer_sizes: [60, 60, 60, 60, 194]
mean_best_val_accuracy: 0.3538666666666667
mean_best_hidden_layer_sizes: [60.0, 60.0, 60.0, 60.0, 194.0]
CPU times: user 14min 30s, sys: 32.4 s, total: 15min 3s
Wall time: 12min 27s


### Group sparsity regularization

In [None]:
%%time

# Dynamic auto-sizing with the 'group_sparsity' method at penalty 0.01 (20 dynamic entries, then
# 20 unregularized static entries), searched over four learning rates. The recorded run was
# interrupted after the first learning rate.
schedule = Schedule([DynamicEpoch(0.01, 'group_sparsity')] * 20 + [StaticEpochNoRegularization()] * 20)
histories, best_overall_combination = hyperparameter_search(train_fn, x=cifar100.X_train_norm, y=cifar100.y_train, validation_data=(cifar100.X_test_norm, cifar100.y_test), 
                                  learning_rate=[0.0001, 0.0002, 0.0004, 0.0008], schedule=[schedule], layer_sizes=[[100, 100, 100, 100, 100]], 
                                  output_neurons=[100], min_new_neurons=[20], growth_percentage=[0.2])

Run with parameters (0.0001, ddb3901a26, [100, 100, 100, 100, 100], 100, 20, 0.2) completed, best_val_accuracy: 0.3943, best_hidden_layer_sizes sizes: [56, 34, 16, 301, 566]


KeyboardInterrupt: ignored

In [None]:
# Parameter count of a fixed network with the layer sizes reported by the
# DynamicEpoch(0.01) run above; count_params() prints and returns
# (total, trainable, non-trainable), as the recorded output shows.
model = get_convolutional_model(cifar100.X_norm, layer_sizes=[56, 34, 16, 301, 566], output_neurons=100)
model.build(cifar100.X_norm.shape)
model.count_params()

Total params: 11,027,985.0
Trainable params: 11,027,985
Non-trainable params: 0.0


(11027985.0, 11027985, 0.0)

In [None]:
%%time

# Single run at learning rate 0.0002 with DynamicEpoch(0.02, 'group_sparsity')
# for 20 epochs, then 20 StaticEpochNoRegularization epochs.
schedule = Schedule(
    [DynamicEpoch(0.02, 'group_sparsity')] * 20
    + [StaticEpochNoRegularization()] * 20
)
histories, best_overall_combination = hyperparameter_search(
    train_fn,
    x=cifar100.X_train_norm,
    y=cifar100.y_train,
    validation_data=(cifar100.X_test_norm, cifar100.y_test),
    learning_rate=[0.0002],
    schedule=[schedule],
    layer_sizes=[[100, 100, 100, 100, 100]],
    output_neurons=[100],
    min_new_neurons=[20],
    growth_percentage=[0.2],
)

Run with parameters (0.0002, db867b524a, [100, 100, 100, 100, 100], 100, 20, 0.2) completed, best_val_accuracy: 0.3215, best_hidden_layer_sizes sizes: [15, 8, 7, 16, 1290]
Best overall combination: (0.0002, db867b524a, [100, 100, 100, 100, 100], 100, 20, 0.2), val_accuracy: 0.3215
CPU times: user 3min 7s, sys: 11.9 s, total: 3min 19s
Wall time: 2min 42s


In [None]:
# Parameter count for the layer sizes reported by the DynamicEpoch(0.02) run above.
model = get_convolutional_model(cifar100.X_norm, layer_sizes=[15, 8, 7, 16, 1290], output_neurons=100)
model.build(cifar100.X_norm.shape)
model.count_params()

Total params: 1,454,393.0
Trainable params: 1,454,393
Non-trainable params: 0.0


(1454393.0, 1454393, 0.0)

In [None]:
%%time

# Single run at learning rate 0.0002 with DynamicEpoch(0.025, 'group_sparsity')
# for 20 epochs, then 20 StaticEpochNoRegularization epochs.
schedule = Schedule(
    [DynamicEpoch(0.025, 'group_sparsity')] * 20
    + [StaticEpochNoRegularization()] * 20
)
histories, best_overall_combination = hyperparameter_search(
    train_fn,
    x=cifar100.X_train_norm,
    y=cifar100.y_train,
    validation_data=(cifar100.X_test_norm, cifar100.y_test),
    learning_rate=[0.0002],
    schedule=[schedule],
    layer_sizes=[[100, 100, 100, 100, 100]],
    output_neurons=[100],
    min_new_neurons=[20],
    growth_percentage=[0.2],
)

Run with parameters (0.0002, fb58024a1e, [100, 100, 100, 100, 100], 100, 20, 0.2) completed, best_val_accuracy: 0.2699, best_hidden_layer_sizes sizes: [7, 6, 6, 11, 746]
Best overall combination: (0.0002, fb58024a1e, [100, 100, 100, 100, 100], 100, 20, 0.2), val_accuracy: 0.2699
CPU times: user 3min 1s, sys: 7.68 s, total: 3min 9s
Wall time: 2min 33s


In [None]:
# Parameter count for the layer sizes reported by the DynamicEpoch(0.025) run above.
model = get_convolutional_model(cifar100.X_norm, layer_sizes=[7, 6, 6, 11, 746], output_neurons=100)
model.build(cifar100.X_norm.shape)
model.count_params()

Total params: 602,145.0
Trainable params: 602,145
Non-trainable params: 0.0


(602145.0, 602145, 0.0)

In [None]:
%%time

# Single run at learning rate 0.0002 with DynamicEpoch(0.022, 'group_sparsity')
# for 20 epochs, then 20 StaticEpochNoRegularization epochs.
schedule = Schedule(
    [DynamicEpoch(0.022, 'group_sparsity')] * 20
    + [StaticEpochNoRegularization()] * 20
)
histories, best_overall_combination = hyperparameter_search(
    train_fn,
    x=cifar100.X_train_norm,
    y=cifar100.y_train,
    validation_data=(cifar100.X_test_norm, cifar100.y_test),
    learning_rate=[0.0002],
    schedule=[schedule],
    layer_sizes=[[100, 100, 100, 100, 100]],
    output_neurons=[100],
    min_new_neurons=[20],
    growth_percentage=[0.2],
)

Run with parameters (0.0002, 11714c0041, [100, 100, 100, 100, 100], 100, 20, 0.2) completed, best_val_accuracy: 0.3146, best_hidden_layer_sizes sizes: [10, 7, 8, 16, 870]
Best overall combination: (0.0002, 11714c0041, [100, 100, 100, 100, 100], 100, 20, 0.2), val_accuracy: 0.3146
CPU times: user 3min 3s, sys: 7.67 s, total: 3min 10s
Wall time: 2min 34s


In [None]:
# Parameter count for the layer sizes reported by the DynamicEpoch(0.022) run above.
model = get_convolutional_model(cifar100.X_norm, layer_sizes=[10, 7, 8, 16, 870], output_neurons=100)
model.build(cifar100.X_norm.shape)
model.count_params()

Total params: 981,447.0
Trainable params: 981,447
Non-trainable params: 0.0


(981447.0, 981447, 0.0)

In [None]:
%%time

# Learning-rate sweep for the DynamicEpoch(0.022, 'group_sparsity') schedule.
# NOTE(review): the recorded run was interrupted (KeyboardInterrupt) before the
# 0.0008 run finished, so this assignment never executed in that session.
schedule = Schedule(
    [DynamicEpoch(0.022, 'group_sparsity')] * 20
    + [StaticEpochNoRegularization()] * 20
)
histories, best_overall_combination = hyperparameter_search(
    train_fn,
    x=cifar100.X_train_norm,
    y=cifar100.y_train,
    validation_data=(cifar100.X_test_norm, cifar100.y_test),
    learning_rate=[0.0001, 0.0002, 0.0004, 0.0008],
    schedule=[schedule],
    layer_sizes=[[100, 100, 100, 100, 100]],
    output_neurons=[100],
    min_new_neurons=[20],
    growth_percentage=[0.2],
)

Run with parameters (0.0001, 11714c0041, [100, 100, 100, 100, 100], 100, 20, 0.2) completed, best_val_accuracy: 0.2733, best_hidden_layer_sizes sizes: [11, 8, 8, 18, 107]
Run with parameters (0.0002, 11714c0041, [100, 100, 100, 100, 100], 100, 20, 0.2) completed, best_val_accuracy: 0.2852, best_hidden_layer_sizes sizes: [6, 7, 6, 13, 892]
Run with parameters (0.0004, 11714c0041, [100, 100, 100, 100, 100], 100, 20, 0.2) completed, best_val_accuracy: 0.2611, best_hidden_layer_sizes sizes: [6, 6, 5, 14, 3537]


KeyboardInterrupt: ignored

In [None]:
%%time

# Two learning rates (0.0001 and 0.00005) with DynamicEpoch(0.015, 'group_sparsity')
# for 20 epochs, then 20 StaticEpochNoRegularization epochs.
schedule = Schedule(
    [DynamicEpoch(0.015, 'group_sparsity')] * 20
    + [StaticEpochNoRegularization()] * 20
)
histories, best_overall_combination = hyperparameter_search(
    train_fn,
    x=cifar100.X_train_norm,
    y=cifar100.y_train,
    validation_data=(cifar100.X_test_norm, cifar100.y_test),
    learning_rate=[0.0001, 0.00005],
    schedule=[schedule],
    layer_sizes=[[100, 100, 100, 100, 100]],
    output_neurons=[100],
    min_new_neurons=[20],
    growth_percentage=[0.2],
)

Run with parameters (0.0001, c20b664f72, [100, 100, 100, 100, 100], 100, 20, 0.2) completed, best_val_accuracy: 0.4011, best_hidden_layer_sizes sizes: [54, 18, 17, 39, 238]
Run with parameters (5e-05, c20b664f72, [100, 100, 100, 100, 100], 100, 20, 0.2) completed, best_val_accuracy: 0.3814, best_hidden_layer_sizes sizes: [73, 28, 23, 71, 104]
Best overall combination: (0.0001, c20b664f72, [100, 100, 100, 100, 100], 100, 20, 0.2), val_accuracy: 0.4011
CPU times: user 6min 40s, sys: 13.7 s, total: 6min 54s
Wall time: 6min 3s


In [None]:
# Parameter count for the layer sizes of the best run above
# (DynamicEpoch(0.015), learning rate 0.0001).
model = get_convolutional_model(cifar100.X_norm, layer_sizes=[54, 18, 17, 39, 238], output_neurons=100)
model.build(cifar100.X_norm.shape)
model.count_params()

Total params: 637,241.0
Trainable params: 637,241
Non-trainable params: 0.0


(637241.0, 637241, 0.0)

In [None]:
%%time

# Single run at learning rate 0.0001 with DynamicEpoch(0.013, 'group_sparsity')
# for 20 epochs, then 20 StaticEpochNoRegularization epochs.
schedule = Schedule(
    [DynamicEpoch(0.013, 'group_sparsity')] * 20
    + [StaticEpochNoRegularization()] * 20
)
histories, best_overall_combination = hyperparameter_search(
    train_fn,
    x=cifar100.X_train_norm,
    y=cifar100.y_train,
    validation_data=(cifar100.X_test_norm, cifar100.y_test),
    learning_rate=[0.0001],
    schedule=[schedule],
    layer_sizes=[[100, 100, 100, 100, 100]],
    output_neurons=[100],
    min_new_neurons=[20],
    growth_percentage=[0.2],
)

Run with parameters (0.0001, b952173216, [100, 100, 100, 100, 100], 100, 20, 0.2) completed, best_val_accuracy: 0.4081, best_hidden_layer_sizes sizes: [44, 17, 17, 47, 400]
Best overall combination: (0.0001, b952173216, [100, 100, 100, 100, 100], 100, 20, 0.2), val_accuracy: 0.4081
CPU times: user 3min 17s, sys: 8.96 s, total: 3min 26s
Wall time: 2min 56s


In [None]:
# Parameter count for the layer sizes reported by the DynamicEpoch(0.013) run above.
model = get_convolutional_model(cifar100.X_norm, layer_sizes=[44, 17, 17, 47, 400], output_neurons=100)
model.build(cifar100.X_norm.shape)
model.count_params()

Total params: 1,261,537.0
Trainable params: 1,261,537
Non-trainable params: 0.0


(1261537.0, 1261537, 0.0)

In [None]:
%%time

# Single run at learning rate 0.0001 with DynamicEpoch(0.014, 'group_sparsity')
# for 20 epochs, then 20 StaticEpochNoRegularization epochs.
schedule = Schedule(
    [DynamicEpoch(0.014, 'group_sparsity')] * 20
    + [StaticEpochNoRegularization()] * 20
)
histories, best_overall_combination = hyperparameter_search(
    train_fn,
    x=cifar100.X_train_norm,
    y=cifar100.y_train,
    validation_data=(cifar100.X_test_norm, cifar100.y_test),
    learning_rate=[0.0001],
    schedule=[schedule],
    layer_sizes=[[100, 100, 100, 100, 100]],
    output_neurons=[100],
    min_new_neurons=[20],
    growth_percentage=[0.2],
)

Run with parameters (0.0001, d60ab0bdd4, [100, 100, 100, 100, 100], 100, 20, 0.2) completed, best_val_accuracy: 0.4035, best_hidden_layer_sizes sizes: [45, 17, 15, 42, 304]
Best overall combination: (0.0001, d60ab0bdd4, [100, 100, 100, 100, 100], 100, 20, 0.2), val_accuracy: 0.4035
CPU times: user 3min 16s, sys: 7.74 s, total: 3min 23s
Wall time: 2min 54s


In [None]:
# Parameter count for the layer sizes reported by the DynamicEpoch(0.014) run above.
model = get_convolutional_model(cifar100.X_norm, layer_sizes=[45, 17, 15, 42, 304], output_neurons=100)
model.build(cifar100.X_norm.shape)
model.count_params()

Total params: 864,140.0
Trainable params: 864,140
Non-trainable params: 0.0


(864140.0, 864140, 0.0)

In [None]:
%%time

# Check the learning rates on either side of 0.0001 (0.00005 and 0.0002) for the
# DynamicEpoch(0.014, 'group_sparsity') schedule.
schedule = Schedule(
    [DynamicEpoch(0.014, 'group_sparsity')] * 20
    + [StaticEpochNoRegularization()] * 20
)
histories, best_overall_combination = hyperparameter_search(
    train_fn,
    x=cifar100.X_train_norm,
    y=cifar100.y_train,
    validation_data=(cifar100.X_test_norm, cifar100.y_test),
    learning_rate=[0.00005, 0.0002],
    schedule=[schedule],
    layer_sizes=[[100, 100, 100, 100, 100]],
    output_neurons=[100],
    min_new_neurons=[20],
    growth_percentage=[0.2],
)

Run with parameters (5e-05, d60ab0bdd4, [100, 100, 100, 100, 100], 100, 20, 0.2) completed, best_val_accuracy: 0.3878, best_hidden_layer_sizes sizes: [70, 28, 28, 79, 108]
Run with parameters (0.0002, d60ab0bdd4, [100, 100, 100, 100, 100], 100, 20, 0.2) completed, best_val_accuracy: 0.3903, best_hidden_layer_sizes sizes: [26, 13, 12, 28, 2396]
Best overall combination: (0.0002, d60ab0bdd4, [100, 100, 100, 100, 100], 100, 20, 0.2), val_accuracy: 0.3903
CPU times: user 6min 31s, sys: 14.7 s, total: 6min 46s
Wall time: 5min 53s


In [None]:
%%time

# n_splits=6 cross-validation of the DynamicEpoch(0.014, 'group_sparsity') schedule.
# learning_rate=0.0001 is hard-coded on purpose: the sweep in the previous cell
# rebound best_overall_combination to its 0.0002 run, which scored lower than
# the earlier 0.0001 run with this schedule.
schedule = Schedule(
    [DynamicEpoch(0.014, 'group_sparsity')] * 20
    + [StaticEpochNoRegularization()] * 20
)
histories, mean_best_hidden_layer_sizes = cross_validate(
    train_fn,
    cifar100.X_norm,
    cifar100.y,
    n_splits=6,
    learning_rate=0.0001,
    schedule=schedule,
    layer_sizes=[100, 100, 100, 100, 100],
    output_neurons=100,
    min_new_neurons=20,
    growth_percentage=0.2,
)

Run 0 completed, best_val_accuracy: 0.3857, best_hidden_layer_sizes: [38, 20, 16, 41, 307]
Run 1 completed, best_val_accuracy: 0.3949, best_hidden_layer_sizes: [40, 17, 16, 41, 265]
Run 2 completed, best_val_accuracy: 0.3937, best_hidden_layer_sizes: [43, 16, 15, 41, 273]
Run 3 completed, best_val_accuracy: 0.3897, best_hidden_layer_sizes: [38, 16, 14, 42, 286]
Run 4 completed, best_val_accuracy: 0.4124, best_hidden_layer_sizes: [57, 16, 18, 39, 288]
Run 5 completed, best_val_accuracy: 0.3929, best_hidden_layer_sizes: [42, 17, 16, 42, 275]
mean_best_val_accuracy: 0.3948833333333333
mean_best_hidden_layer_sizes: [43.0, 17.0, 15.833333333333334, 41.0, 282.3333333333333]
CPU times: user 19min 46s, sys: 44.5 s, total: 20min 30s
Wall time: 17min 30s


In [None]:
# Turn the per-layer mean widths into integer layer sizes. Built-in round()
# sends exact .5 ties to the even neighbour (e.g. 16.5 -> 16), not upwards.
rounded_mean_best_hidden_layer_sizes = list(map(round, mean_best_hidden_layer_sizes))
rounded_mean_best_hidden_layer_sizes

[43, 17, 16, 41, 282]

In [None]:
# Parameter count of a fixed network built from the rounded cross-validated
# mean layer sizes computed in the previous cell.
model = get_convolutional_model(cifar100.X_norm, layer_sizes=rounded_mean_best_hidden_layer_sizes, output_neurons=100)
model.build(cifar100.X_norm.shape)
model.count_params()

Total params: 784,759.0
Trainable params: 784,759
Non-trainable params: 0.0


(784759.0, 784759, 0.0)

## Street View House Numbers

In [None]:
# Load the SVHN dataset object; the cells below read its X_train_norm / y_train and
# X_test_norm / y_test for the searches and X_norm / y for cross-validation.
svhn = get_svhn_dataset()

In [None]:
%%time

# Single run at learning rate 0.0008 with DynamicEpoch(0.00002, 'weighted_l1')
# for 20 epochs, then 20 StaticEpochNoRegularization epochs, scored on the
# svhn test split.
schedule = Schedule(
    [DynamicEpoch(0.00002, 'weighted_l1')] * 20
    + [StaticEpochNoRegularization()] * 20
)
histories, best_overall_combination = hyperparameter_search(
    train_fn,
    x=svhn.X_train_norm,
    y=svhn.y_train,
    validation_data=(svhn.X_test_norm, svhn.y_test),
    learning_rate=[0.0008],
    schedule=[schedule],
    layer_sizes=[[100, 100, 100, 100, 100]],
    output_neurons=[10],
    min_new_neurons=[20],
    growth_percentage=[0.2],
)

Run with parameters (0.0008, ec4ba8ef9e, [100, 100, 100, 100, 100], 10, 20, 0.2) completed, best_val_accuracy: 0.9207897971727105, best_hidden_layer_sizes sizes: [12, 18, 43, 64, 329]
Best overall combination: (0.0008, ec4ba8ef9e, [100, 100, 100, 100, 100], 10, 20, 0.2), val_accuracy: 0.9207897971727105
CPU times: user 5min 3s, sys: 12.6 s, total: 5min 15s
Wall time: 4min 17s


In [None]:
%%time

# Same weighted_l1 schedule as the previous cell, now sweeping the lower
# learning rates 0.0001, 0.0002 and 0.0004.
schedule = Schedule(
    [DynamicEpoch(0.00002, 'weighted_l1')] * 20
    + [StaticEpochNoRegularization()] * 20
)
histories, best_overall_combination = hyperparameter_search(
    train_fn,
    x=svhn.X_train_norm,
    y=svhn.y_train,
    validation_data=(svhn.X_test_norm, svhn.y_test),
    learning_rate=[0.0001, 0.0002, 0.0004],
    schedule=[schedule],
    layer_sizes=[[100, 100, 100, 100, 100]],
    output_neurons=[10],
    min_new_neurons=[20],
    growth_percentage=[0.2],
)

Run with parameters (0.0001, ec4ba8ef9e, [100, 100, 100, 100, 100], 10, 20, 0.2) completed, best_val_accuracy: 0.9225568531038721, best_hidden_layer_sizes sizes: [35, 23, 18, 49, 56]
Run with parameters (0.0002, ec4ba8ef9e, [100, 100, 100, 100, 100], 10, 20, 0.2) completed, best_val_accuracy: 0.9231330669944684, best_hidden_layer_sizes sizes: [23, 16, 18, 43, 68]
Run with parameters (0.0004, ec4ba8ef9e, [100, 100, 100, 100, 100], 10, 20, 0.2) completed, best_val_accuracy: 0.9256684081130916, best_hidden_layer_sizes sizes: [17, 15, 27, 42, 172]
Best overall combination: (0.0004, ec4ba8ef9e, [100, 100, 100, 100, 100], 10, 20, 0.2), val_accuracy: 0.9256684081130916
CPU times: user 15min 7s, sys: 33.3 s, total: 15min 41s
Wall time: 12min 49s


In [None]:
# Element 0 of the winning combination is the learning rate (the first keyword
# swept by hyperparameter_search); keep it for the cross-validation run below.
best_learning_rate = best_overall_combination[0]
best_learning_rate

0.0004

In [None]:
%%time

# n_splits=4 cross-validation of the DynamicEpoch(0.00002, 'weighted_l1') schedule
# on the full svhn data at the learning rate selected above.
schedule = Schedule(
    [DynamicEpoch(0.00002, 'weighted_l1')] * 20
    + [StaticEpochNoRegularization()] * 20
)
histories, mean_best_hidden_layer_sizes = cross_validate(
    train_fn,
    svhn.X_norm,
    svhn.y,
    n_splits=4,
    learning_rate=best_learning_rate,
    schedule=schedule,
    layer_sizes=[100, 100, 100, 100, 100],
    output_neurons=10,
    min_new_neurons=20,
    growth_percentage=0.2,
)

Run 0 completed, best_val_accuracy: 0.9326431132417516, best_hidden_layer_sizes: [19, 16, 17, 41, 148]
Run 1 completed, best_val_accuracy: 0.9331238417532833, best_hidden_layer_sizes: [15, 17, 27, 48, 169]
Run 2 completed, best_val_accuracy: 0.9300217548948514, best_hidden_layer_sizes: [18, 16, 19, 51, 158]
Run 3 completed, best_val_accuracy: 0.929981468052534, best_hidden_layer_sizes: [16, 17, 21, 41, 159]
mean_best_val_accuracy: 0.9314425444856052
mean_best_hidden_layer_sizes: [17.0, 16.5, 21.0, 45.25, 158.5]
CPU times: user 19min 51s, sys: 46.1 s, total: 20min 37s
Wall time: 16min 40s


In [None]:
# Turn the per-layer mean widths into integer layer sizes. Built-in round()
# sends exact .5 ties to the even neighbour, so here 16.5 -> 16 and 158.5 -> 158.
rounded_mean_best_hidden_layer_sizes = list(map(round, mean_best_hidden_layer_sizes))
rounded_mean_best_hidden_layer_sizes

[17, 16, 21, 45, 158]

In [None]:
%%time

# Static baseline: train a fixed network with the rounded cross-validated layer
# sizes for 40 StaticEpochNoRegularization epochs, sweeping the learning rate.
schedule = Schedule(
    [StaticEpochNoRegularization()] * 40
)
histories, best_overall_combination = hyperparameter_search(
    train_fn,
    x=svhn.X_train_norm,
    y=svhn.y_train,
    validation_data=(svhn.X_test_norm, svhn.y_test),
    learning_rate=[0.0001, 0.0002, 0.0004, 0.0008, 0.0016],
    schedule=[schedule],
    layer_sizes=[rounded_mean_best_hidden_layer_sizes],
    output_neurons=[10],
    min_new_neurons=[20],
    growth_percentage=[0.2],
)

Run with parameters (0.0001, 4a0f172746, [17, 16, 21, 45, 158], 10, 20, 0.2) completed, best_val_accuracy: 0.8831438229870928, best_hidden_layer_sizes sizes: [17, 16, 21, 45, 158]
Run with parameters (0.0002, 4a0f172746, [17, 16, 21, 45, 158], 10, 20, 0.2) completed, best_val_accuracy: 0.8705055316533498, best_hidden_layer_sizes sizes: [17, 16, 21, 45, 158]
Run with parameters (0.0004, 4a0f172746, [17, 16, 21, 45, 158], 10, 20, 0.2) completed, best_val_accuracy: 0.8845651505838967, best_hidden_layer_sizes sizes: [17, 16, 21, 45, 158]
Run with parameters (0.0008, 4a0f172746, [17, 16, 21, 45, 158], 10, 20, 0.2) completed, best_val_accuracy: 0.9056929932390904, best_hidden_layer_sizes sizes: [17, 16, 21, 45, 158]
Run with parameters (0.0016, 4a0f172746, [17, 16, 21, 45, 158], 10, 20, 0.2) completed, best_val_accuracy: 0.8907882606023356, best_hidden_layer_sizes sizes: [17, 16, 21, 45, 158]
Best overall combination: (0.0008, 4a0f172746, [17, 16, 21, 45, 158], 10, 20, 0.2), val_accuracy: 0.

In [None]:
# Element 0 of the winning combination is the learning rate (the first keyword
# swept by hyperparameter_search); keep it for the cross-validation run below.
best_learning_rate = best_overall_combination[0]
best_learning_rate

0.0008

In [None]:
%%time

# n_splits=4 cross-validation of the static baseline with the rounded layer sizes.
# Rebinds mean_best_hidden_layer_sizes; rounded_mean_best_hidden_layer_sizes is
# left as computed earlier.
schedule = Schedule(
    [StaticEpochNoRegularization()] * 40
)
histories, mean_best_hidden_layer_sizes = cross_validate(
    train_fn,
    svhn.X_norm,
    svhn.y,
    n_splits=4,
    learning_rate=best_learning_rate,
    schedule=schedule,
    layer_sizes=rounded_mean_best_hidden_layer_sizes,
    output_neurons=10,
    min_new_neurons=20,
    growth_percentage=0.2,
)

Run 0 completed, best_val_accuracy: 0.9115739435201224, best_hidden_layer_sizes: [17, 16, 21, 45, 158]
Run 1 completed, best_val_accuracy: 0.9151559100797679, best_hidden_layer_sizes: [17, 16, 21, 45, 158]
Run 2 completed, best_val_accuracy: 0.9182177100958827, best_hidden_layer_sizes: [17, 16, 21, 45, 158]
Run 3 completed, best_val_accuracy: 0.9090725968898558, best_hidden_layer_sizes: [17, 16, 21, 45, 158]
mean_best_val_accuracy: 0.9135050401464072
mean_best_hidden_layer_sizes: [17.0, 16.0, 21.0, 45.0, 158.0]
CPU times: user 15min 31s, sys: 39.8 s, total: 16min 11s
Wall time: 12min 19s


In [None]:
# Parameter count of the static baseline; the hard-coded sizes equal the rounded
# cross-validated weighted_l1 layer sizes printed above.
model = get_convolutional_model(svhn.X_norm, layer_sizes=[17, 16, 21, 45, 158], output_neurons=10)
model.build(svhn.X_norm.shape)
model.count_params()

Total params: 471,323.0
Trainable params: 471,323
Non-trainable params: 0.0


(471323.0, 471323, 0.0)

In [None]:
# Parameter count of a uniform-width comparison network; per the recorded outputs
# it is close to the [17, 16, 21, 45, 158] network (475,382 vs 471,323 parameters).
model = get_convolutional_model(svhn.X_norm, layer_sizes=[42, 42, 42, 42, 158], output_neurons=10)
model.build(svhn.X_norm.shape)
model.count_params()

Total params: 475,382.0
Trainable params: 475,382
Non-trainable params: 0.0


(475382.0, 475382, 0.0)

In [None]:
%%time

# Static learning-rate sweep for the uniform-width [42, 42, 42, 42, 158] network:
# 40 StaticEpochNoRegularization epochs per run.
schedule = Schedule(
    [StaticEpochNoRegularization()] * 40
)
histories, best_overall_combination = hyperparameter_search(
    train_fn,
    x=svhn.X_train_norm,
    y=svhn.y_train,
    validation_data=(svhn.X_test_norm, svhn.y_test),
    learning_rate=[0.0001, 0.0002, 0.0004, 0.0008, 0.0016],
    schedule=[schedule],
    layer_sizes=[[42, 42, 42, 42, 158]],
    output_neurons=[10],
    min_new_neurons=[20],
    growth_percentage=[0.2],
)

Run with parameters (0.0001, 4a0f172746, [42, 42, 42, 42, 158], 10, 20, 0.2) completed, best_val_accuracy: 0.9119545175169023, best_hidden_layer_sizes sizes: [42, 42, 42, 42, 158]
Run with parameters (0.0002, 4a0f172746, [42, 42, 42, 42, 158], 10, 20, 0.2) completed, best_val_accuracy: 0.9114167178856791, best_hidden_layer_sizes sizes: [42, 42, 42, 42, 158]
Run with parameters (0.0004, 4a0f172746, [42, 42, 42, 42, 158], 10, 20, 0.2) completed, best_val_accuracy: 0.909380762138906, best_hidden_layer_sizes sizes: [42, 42, 42, 42, 158]
Run with parameters (0.0008, 4a0f172746, [42, 42, 42, 42, 158], 10, 20, 0.2) completed, best_val_accuracy: 0.9127612169637369, best_hidden_layer_sizes sizes: [42, 42, 42, 42, 158]
Run with parameters (0.0016, 4a0f172746, [42, 42, 42, 42, 158], 10, 20, 0.2) completed, best_val_accuracy: 0.9083051628764598, best_hidden_layer_sizes sizes: [42, 42, 42, 42, 158]
Best overall combination: (0.0008, 4a0f172746, [42, 42, 42, 42, 158], 10, 20, 0.2), val_accuracy: 0.9

In [None]:
# Element 0 of the winning combination is the learning rate (the first keyword
# swept by hyperparameter_search); keep it for the cross-validation run below.
best_learning_rate = best_overall_combination[0]
best_learning_rate

0.0008

In [None]:
%%time

# n_splits=4 cross-validation of the uniform-width [42, 42, 42, 42, 158] network
# with 40 StaticEpochNoRegularization epochs at the learning rate selected above.
schedule = Schedule(
    [StaticEpochNoRegularization()] * 40
)
histories, mean_best_hidden_layer_sizes = cross_validate(
    train_fn,
    svhn.X_norm,
    svhn.y,
    n_splits=4,
    learning_rate=best_learning_rate,
    schedule=schedule,
    layer_sizes=[42, 42, 42, 42, 158],
    output_neurons=10,
    min_new_neurons=20,
    growth_percentage=0.2,
)

Run 0 completed, best_val_accuracy: 0.925069492003384, best_hidden_layer_sizes: [42, 42, 42, 42, 158]
Run 1 completed, best_val_accuracy: 0.9253484811860446, best_hidden_layer_sizes: [42, 42, 42, 42, 158]
Run 2 completed, best_val_accuracy: 0.9239384417049392, best_hidden_layer_sizes: [42, 42, 42, 42, 158]
Run 3 completed, best_val_accuracy: 0.9225284022238337, best_hidden_layer_sizes: [42, 42, 42, 42, 158]
mean_best_val_accuracy: 0.9242212042795503
mean_best_hidden_layer_sizes: [42.0, 42.0, 42.0, 42.0, 158.0]
CPU times: user 15min 31s, sys: 38.8 s, total: 16min 10s
Wall time: 12min 22s


### Group sparsity regularization

In [None]:
%%time

# Single run at learning rate 0.0001 with DynamicEpoch(0.014, 'group_sparsity')
# for 20 epochs, then 20 StaticEpochNoRegularization epochs, on svhn.
schedule = Schedule(
    [DynamicEpoch(0.014, 'group_sparsity')] * 20
    + [StaticEpochNoRegularization()] * 20
)
histories, best_overall_combination = hyperparameter_search(
    train_fn,
    x=svhn.X_train_norm,
    y=svhn.y_train,
    validation_data=(svhn.X_test_norm, svhn.y_test),
    learning_rate=[0.0001],
    schedule=[schedule],
    layer_sizes=[[100, 100, 100, 100, 100]],
    output_neurons=[10],
    min_new_neurons=[20],
    growth_percentage=[0.2],
)

Run with parameters (0.0001, d60ab0bdd4, [100, 100, 100, 100, 100], 10, 20, 0.2) completed, best_val_accuracy: 0.9049247080516287, best_hidden_layer_sizes sizes: [13, 10, 9, 36, 72]
Best overall combination: (0.0001, d60ab0bdd4, [100, 100, 100, 100, 100], 10, 20, 0.2), val_accuracy: 0.9049247080516287
CPU times: user 4min 55s, sys: 12.9 s, total: 5min 8s
Wall time: 4min 10s


In [None]:
# Parameter count for the layer sizes reported by the DynamicEpoch(0.014) run above.
model = get_convolutional_model(svhn.X_norm, layer_sizes=[13, 10, 9, 36, 72], output_neurons=10)
model.build(svhn.X_norm.shape)
model.count_params()

Total params: 172,005.0
Trainable params: 172,005
Non-trainable params: 0.0


(172005.0, 172005, 0.0)

In [None]:
%%time

# Single run at learning rate 0.0001 with DynamicEpoch(0.01, 'group_sparsity')
# for 20 epochs, then 20 StaticEpochNoRegularization epochs, on svhn.
schedule = Schedule(
    [DynamicEpoch(0.01, 'group_sparsity')] * 20
    + [StaticEpochNoRegularization()] * 20
)
histories, best_overall_combination = hyperparameter_search(
    train_fn,
    x=svhn.X_train_norm,
    y=svhn.y_train,
    validation_data=(svhn.X_test_norm, svhn.y_test),
    learning_rate=[0.0001],
    schedule=[schedule],
    layer_sizes=[[100, 100, 100, 100, 100]],
    output_neurons=[10],
    min_new_neurons=[20],
    growth_percentage=[0.2],
)

Run with parameters (0.0001, ddb3901a26, [100, 100, 100, 100, 100], 10, 20, 0.2) completed, best_val_accuracy: 0.9086124769514444, best_hidden_layer_sizes sizes: [16, 12, 10, 41, 93]
Best overall combination: (0.0001, ddb3901a26, [100, 100, 100, 100, 100], 10, 20, 0.2), val_accuracy: 0.9086124769514444
CPU times: user 4min 57s, sys: 12.1 s, total: 5min 9s
Wall time: 4min 14s


In [None]:
# Parameter count for the layer sizes reported by the DynamicEpoch(0.01) run above.
model = get_convolutional_model(svhn.X_norm, layer_sizes=[16, 12, 10, 41, 93], output_neurons=10)
model.build(svhn.X_norm.shape)
model.count_params()

Total params: 252,074.0
Trainable params: 252,074
Non-trainable params: 0.0


(252074.0, 252074, 0.0)

In [None]:
%%time

# Single run at learning rate 0.0001 with DynamicEpoch(0.007, 'group_sparsity')
# for 20 epochs, then 20 StaticEpochNoRegularization epochs, on svhn.
schedule = Schedule(
    [DynamicEpoch(0.007, 'group_sparsity')] * 20
    + [StaticEpochNoRegularization()] * 20
)
histories, best_overall_combination = hyperparameter_search(
    train_fn,
    x=svhn.X_train_norm,
    y=svhn.y_train,
    validation_data=(svhn.X_test_norm, svhn.y_test),
    learning_rate=[0.0001],
    schedule=[schedule],
    layer_sizes=[[100, 100, 100, 100, 100]],
    output_neurons=[10],
    min_new_neurons=[20],
    growth_percentage=[0.2],
)

Run with parameters (0.0001, 6450454c27, [100, 100, 100, 100, 100], 10, 20, 0.2) completed, best_val_accuracy: 0.9135295021511985, best_hidden_layer_sizes sizes: [21, 17, 12, 53, 104]
Best overall combination: (0.0001, 6450454c27, [100, 100, 100, 100, 100], 10, 20, 0.2), val_accuracy: 0.9135295021511985
CPU times: user 5min 6s, sys: 11.8 s, total: 5min 18s
Wall time: 4min 25s


In [None]:
# Parameter count for the layer sizes reported by the DynamicEpoch(0.007) run above.
model = get_convolutional_model(svhn.X_norm, layer_sizes=[21, 17, 12, 53, 104], output_neurons=10)
model.build(svhn.X_norm.shape)
model.count_params()

Total params: 365,365.0
Trainable params: 365,365
Non-trainable params: 0.0


(365365.0, 365365, 0.0)

In [None]:
%%time

# Single run at learning rate 0.0001 with DynamicEpoch(0.005, 'group_sparsity')
# for 20 epochs, then 20 StaticEpochNoRegularization epochs, on svhn.
schedule = Schedule(
    [DynamicEpoch(0.005, 'group_sparsity')] * 20
    + [StaticEpochNoRegularization()] * 20
)
histories, best_overall_combination = hyperparameter_search(
    train_fn,
    x=svhn.X_train_norm,
    y=svhn.y_train,
    validation_data=(svhn.X_test_norm, svhn.y_test),
    learning_rate=[0.0001],
    schedule=[schedule],
    layer_sizes=[[100, 100, 100, 100, 100]],
    output_neurons=[10],
    min_new_neurons=[20],
    growth_percentage=[0.2],
)

Run with parameters (0.0001, b5c152c4f6, [100, 100, 100, 100, 100], 10, 20, 0.2) completed, best_val_accuracy: 0.9112246465888137, best_hidden_layer_sizes sizes: [27, 20, 14, 138, 100]
Best overall combination: (0.0001, b5c152c4f6, [100, 100, 100, 100, 100], 10, 20, 0.2), val_accuracy: 0.9112246465888137
CPU times: user 5min 13s, sys: 11.2 s, total: 5min 24s
Wall time: 4min 40s


In [None]:
# Parameter count for the layer sizes reported by the DynamicEpoch(0.005) run above.
model = get_convolutional_model(svhn.X_norm, layer_sizes=[27, 20, 14, 138, 100], output_neurons=10)
model.build(svhn.X_norm.shape)
model.count_params()

Total params: 910,006.0
Trainable params: 910,006
Non-trainable params: 0.0


(910006.0, 910006, 0.0)

In [None]:
%%time

# Learning-rate sweep (0.00005, 0.0001, 0.0002) with
# DynamicEpoch(0.006, 'group_sparsity') for 20 epochs, then 20
# StaticEpochNoRegularization epochs, on svhn.
schedule = Schedule(
    [DynamicEpoch(0.006, 'group_sparsity')] * 20
    + [StaticEpochNoRegularization()] * 20
)
histories, best_overall_combination = hyperparameter_search(
    train_fn,
    x=svhn.X_train_norm,
    y=svhn.y_train,
    validation_data=(svhn.X_test_norm, svhn.y_test),
    learning_rate=[0.00005, 0.0001, 0.0002],
    schedule=[schedule],
    layer_sizes=[[100, 100, 100, 100, 100]],
    output_neurons=[10],
    min_new_neurons=[20],
    growth_percentage=[0.2],
)

Run with parameters (5e-05, abda604a7f, [100, 100, 100, 100, 100], 10, 20, 0.2) completed, best_val_accuracy: 0.9076521204671174, best_hidden_layer_sizes sizes: [44, 29, 18, 98, 100]
Run with parameters (0.0001, abda604a7f, [100, 100, 100, 100, 100], 10, 20, 0.2) completed, best_val_accuracy: 0.9129917025199754, best_hidden_layer_sizes sizes: [21, 18, 12, 70, 99]
Run with parameters (0.0002, abda604a7f, [100, 100, 100, 100, 100], 10, 20, 0.2) completed, best_val_accuracy: 0.905001536570375, best_hidden_layer_sizes sizes: [11, 15, 9, 211, 103]
Best overall combination: (0.0001, abda604a7f, [100, 100, 100, 100, 100], 10, 20, 0.2), val_accuracy: 0.9129917025199754
CPU times: user 15min 19s, sys: 32.6 s, total: 15min 51s
Wall time: 13min 31s


In [None]:
# Parameter count for the layer sizes of the best run above
# (DynamicEpoch(0.006), learning rate 0.0001).
model = get_convolutional_model(svhn.X_norm, layer_sizes=[21, 18, 12, 70, 99], output_neurons=10)
model.build(svhn.X_norm.shape)
model.count_params()

Total params: 458,213.0
Trainable params: 458,213
Non-trainable params: 0.0


(458213.0, 458213, 0.0)

In [None]:
%%time

# n_splits=4 cross-validation of the DynamicEpoch(0.006, 'group_sparsity') schedule.
# learning_rate=0.0001 is hard-coded; it matches the best learning rate of the
# sweep above rather than being read from best_overall_combination.
schedule = Schedule(
    [DynamicEpoch(0.006, 'group_sparsity')] * 20
    + [StaticEpochNoRegularization()] * 20
)
histories, mean_best_hidden_layer_sizes = cross_validate(
    train_fn,
    svhn.X_norm,
    svhn.y,
    n_splits=4,
    learning_rate=0.0001,
    schedule=schedule,
    layer_sizes=[100, 100, 100, 100, 100],
    output_neurons=10,
    min_new_neurons=20,
    growth_percentage=0.2,
)

Run 0 completed, best_val_accuracy: 0.9249889215646779, best_hidden_layer_sizes: [24, 22, 12, 65, 100]
Run 1 completed, best_val_accuracy: 0.9247441785512851, best_hidden_layer_sizes: [28, 20, 12, 75, 99]
Run 2 completed, best_val_accuracy: 0.920554346950286, best_hidden_layer_sizes: [28, 17, 12, 69, 100]
Run 3 completed, best_val_accuracy: 0.9203126258963823, best_hidden_layer_sizes: [24, 18, 11, 68, 100]
mean_best_val_accuracy: 0.9226500182406578
mean_best_hidden_layer_sizes: [26.0, 19.25, 11.75, 69.25, 99.75]
CPU times: user 20min 10s, sys: 43 s, total: 20min 53s
Wall time: 17min 40s


In [None]:
# Turn the per-layer mean widths into integer layer sizes. Built-in round()
# sends exact .5 ties to the even neighbour (e.g. 16.5 -> 16), not upwards.
rounded_mean_best_hidden_layer_sizes = list(map(round, mean_best_hidden_layer_sizes))
rounded_mean_best_hidden_layer_sizes

[26, 19, 12, 69, 100]

In [None]:
# Parameter count of a fixed network built from the rounded cross-validated
# mean layer sizes computed in the previous cell.
model = get_convolutional_model(svhn.X_norm, layer_sizes=rounded_mean_best_hidden_layer_sizes, output_neurons=10)
model.build(svhn.X_norm.shape)
model.count_params()

Total params: 457,488.0
Trainable params: 457,488
Non-trainable params: 0.0


(457488.0, 457488, 0.0)

## CIFAR10

In [None]:
# Load the CIFAR-10 dataset object (the recorded run downloaded the archive here);
# the cells below read its X_train_norm / y_train and X_test_norm / y_test for the
# searches and X_norm / y for cross-validation.
cifar10 = get_cifar_10_dataset()

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz


In [None]:
%%time

# Single run at learning rate 0.0008 with DynamicEpoch(0.00002, 'weighted_l1')
# for 20 epochs, then 20 StaticEpochNoRegularization epochs, scored on the
# cifar10 test split.
schedule = Schedule(
    [DynamicEpoch(0.00002, 'weighted_l1')] * 20
    + [StaticEpochNoRegularization()] * 20
)
histories, best_overall_combination = hyperparameter_search(
    train_fn,
    x=cifar10.X_train_norm,
    y=cifar10.y_train,
    validation_data=(cifar10.X_test_norm, cifar10.y_test),
    learning_rate=[0.0008],
    schedule=[schedule],
    layer_sizes=[[100, 100, 100, 100, 100]],
    output_neurons=[10],
    min_new_neurons=[20],
    growth_percentage=[0.2],
)

Run with parameters (0.0008, ec4ba8ef9e, [100, 100, 100, 100, 100], 10, 20, 0.2) completed, best_val_accuracy: 0.7539, best_hidden_layer_sizes sizes: [42, 17, 62, 79, 520]
Best overall combination: (0.0008, ec4ba8ef9e, [100, 100, 100, 100, 100], 10, 20, 0.2), val_accuracy: 0.7539
CPU times: user 3min 13s, sys: 7.1 s, total: 3min 20s
Wall time: 2min 49s


In [None]:
%%time

# Same weighted_l1 schedule as the previous cell, now sweeping the lower
# learning rates 0.0001, 0.0002 and 0.0004.
schedule = Schedule(
    [DynamicEpoch(0.00002, 'weighted_l1')] * 20
    + [StaticEpochNoRegularization()] * 20
)
histories, best_overall_combination = hyperparameter_search(
    train_fn,
    x=cifar10.X_train_norm,
    y=cifar10.y_train,
    validation_data=(cifar10.X_test_norm, cifar10.y_test),
    learning_rate=[0.0001, 0.0002, 0.0004],
    schedule=[schedule],
    layer_sizes=[[100, 100, 100, 100, 100]],
    output_neurons=[10],
    min_new_neurons=[20],
    growth_percentage=[0.2],
)

Run with parameters (0.0001, ec4ba8ef9e, [100, 100, 100, 100, 100], 10, 20, 0.2) completed, best_val_accuracy: 0.7309, best_hidden_layer_sizes sizes: [86, 24, 32, 48, 51]
Run with parameters (0.0002, ec4ba8ef9e, [100, 100, 100, 100, 100], 10, 20, 0.2) completed, best_val_accuracy: 0.7502, best_hidden_layer_sizes sizes: [58, 18, 25, 48, 78]
Run with parameters (0.0004, ec4ba8ef9e, [100, 100, 100, 100, 100], 10, 20, 0.2) completed, best_val_accuracy: 0.7699, best_hidden_layer_sizes sizes: [38, 20, 21, 56, 161]
Best overall combination: (0.0004, ec4ba8ef9e, [100, 100, 100, 100, 100], 10, 20, 0.2), val_accuracy: 0.7699
CPU times: user 9min 51s, sys: 19.3 s, total: 10min 11s
Wall time: 8min 38s


In [None]:
# Element 0 of the winning combination is the learning rate (the first keyword
# swept by hyperparameter_search); keep it for the cross-validation run below.
best_learning_rate = best_overall_combination[0]
best_learning_rate

0.0004

In [None]:
%%time

# n_splits=6 cross-validation of the DynamicEpoch(0.00002, 'weighted_l1') schedule
# on the full cifar10 data at the learning rate selected above.
schedule = Schedule(
    [DynamicEpoch(0.00002, 'weighted_l1')] * 20
    + [StaticEpochNoRegularization()] * 20
)
histories, mean_best_hidden_layer_sizes = cross_validate(
    train_fn,
    cifar10.X_norm,
    cifar10.y,
    n_splits=6,
    learning_rate=best_learning_rate,
    schedule=schedule,
    layer_sizes=[100, 100, 100, 100, 100],
    output_neurons=10,
    min_new_neurons=20,
    growth_percentage=0.2,
)

Run 0 completed, best_val_accuracy: 0.7729, best_hidden_layer_sizes: [41, 16, 27, 59, 215]
Run 1 completed, best_val_accuracy: 0.7684, best_hidden_layer_sizes: [39, 17, 32, 55, 147]
Run 2 completed, best_val_accuracy: 0.773, best_hidden_layer_sizes: [38, 19, 24, 54, 177]
Run 3 completed, best_val_accuracy: 0.7674, best_hidden_layer_sizes: [37, 20, 22, 67, 174]
Run 4 completed, best_val_accuracy: 0.7749, best_hidden_layer_sizes: [38, 18, 26, 57, 145]
Run 5 completed, best_val_accuracy: 0.7662, best_hidden_layer_sizes: [42, 19, 25, 51, 192]
mean_best_val_accuracy: 0.7704666666666666
mean_best_hidden_layer_sizes: [39.166666666666664, 18.166666666666668, 26.0, 57.166666666666664, 175.0]
CPU times: user 19min 26s, sys: 39 s, total: 20min 5s
Wall time: 16min 45s


In [None]:
# Turn the per-layer mean widths into integer layer sizes. Built-in round()
# sends exact .5 ties to the even neighbour (e.g. 16.5 -> 16), not upwards.
rounded_mean_best_hidden_layer_sizes = list(map(round, mean_best_hidden_layer_sizes))
rounded_mean_best_hidden_layer_sizes

[39, 18, 26, 57, 175]

In [None]:
%%time

# Static baseline: train a fixed network with the rounded cross-validated layer
# sizes for 40 StaticEpochNoRegularization epochs, sweeping the learning rate.
schedule = Schedule(
    [StaticEpochNoRegularization()] * 40
)
histories, best_overall_combination = hyperparameter_search(
    train_fn,
    x=cifar10.X_train_norm,
    y=cifar10.y_train,
    validation_data=(cifar10.X_test_norm, cifar10.y_test),
    learning_rate=[0.0001, 0.0002, 0.0004, 0.0008, 0.0016, 0.0032],
    schedule=[schedule],
    layer_sizes=[rounded_mean_best_hidden_layer_sizes],
    output_neurons=[10],
    min_new_neurons=[20],
    growth_percentage=[0.2],
)

Run with parameters (0.0001, 4a0f172746, [39, 18, 26, 57, 175], 10, 20, 0.2) completed, best_val_accuracy: 0.6574, best_hidden_layer_sizes sizes: [39, 18, 26, 57, 175]
Run with parameters (0.0002, 4a0f172746, [39, 18, 26, 57, 175], 10, 20, 0.2) completed, best_val_accuracy: 0.6557, best_hidden_layer_sizes sizes: [39, 18, 26, 57, 175]
Run with parameters (0.0004, 4a0f172746, [39, 18, 26, 57, 175], 10, 20, 0.2) completed, best_val_accuracy: 0.6811, best_hidden_layer_sizes sizes: [39, 18, 26, 57, 175]
Run with parameters (0.0008, 4a0f172746, [39, 18, 26, 57, 175], 10, 20, 0.2) completed, best_val_accuracy: 0.7077, best_hidden_layer_sizes sizes: [39, 18, 26, 57, 175]
Run with parameters (0.0016, 4a0f172746, [39, 18, 26, 57, 175], 10, 20, 0.2) completed, best_val_accuracy: 0.7281, best_hidden_layer_sizes sizes: [39, 18, 26, 57, 175]
Run with parameters (0.0032, 4a0f172746, [39, 18, 26, 57, 175], 10, 20, 0.2) completed, best_val_accuracy: 0.7032, best_hidden_layer_sizes sizes: [39, 18, 26, 5

In [None]:
# The learning rate is the first entry of the winning parameter combination
# returned by the search above.
best_learning_rate = best_overall_combination[0]
best_learning_rate

0.0016

In [None]:
%%time

schedule = Schedule([StaticEpochNoRegularization()] * 40)
histories, mean_best_hidden_layer_sizes = cross_validate(
    train_fn, cifar10.X_norm, cifar10.y, n_splits=6, learning_rate=best_learning_rate,
    schedule=schedule, layer_sizes=rounded_mean_best_hidden_layer_sizes, output_neurons=10, min_new_neurons=20, growth_percentage=0.2
)

Run 0 completed, best_val_accuracy: 0.7312, best_hidden_layer_sizes: [39, 18, 26, 57, 175]
Run 1 completed, best_val_accuracy: 0.7296, best_hidden_layer_sizes: [39, 18, 26, 57, 175]
Run 2 completed, best_val_accuracy: 0.7286, best_hidden_layer_sizes: [39, 18, 26, 57, 175]
Run 3 completed, best_val_accuracy: 0.7299, best_hidden_layer_sizes: [39, 18, 26, 57, 175]
Run 4 completed, best_val_accuracy: 0.7389, best_hidden_layer_sizes: [39, 18, 26, 57, 175]
Run 5 completed, best_val_accuracy: 0.7186, best_hidden_layer_sizes: [39, 18, 26, 57, 175]
mean_best_val_accuracy: 0.7294666666666667
mean_best_hidden_layer_sizes: [39.0, 18.0, 26.0, 57.0, 175.0]
CPU times: user 14min 48s, sys: 38.2 s, total: 15min 26s
Wall time: 11min 49s


In [None]:
# Parameter count of the CIFAR-10 network built with the rounded mean layer
# sizes. The sizes are taken from rounded_mean_best_hidden_layer_sizes instead
# of being re-typed from printed output, so the count stays in sync if the
# cross-validation above is re-run and yields different sizes.
model = get_convolutional_model(
    cifar10.X_norm,
    layer_sizes=rounded_mean_best_hidden_layer_sizes,
    output_neurons=10,
)
model.build(cifar10.X_norm.shape)
model.count_params()

Total params: 665,396.0
Trainable params: 665,396
Non-trainable params: 0.0


(665396.0, 665396, 0.0)

In [None]:
# Parameter count of a comparison network whose first four layers all have
# width 53 (last layer 175).
# NOTE(review): the width 53 is presumably chosen so the parameter count is
# close to that of the [39, 18, 26, 57, 175] network -- confirm.
model = get_convolutional_model(
    cifar10.X_norm,
    layer_sizes=[53, 53, 53, 53, 175],
    output_neurons=10,
)
model.build(cifar10.X_norm.shape)
model.count_params()

Total params: 673,021.0
Trainable params: 673,021
Non-trainable params: 0.0


(673021.0, 673021, 0.0)

In [None]:
%%time

schedule = Schedule([StaticEpochNoRegularization()] * 40)
histories, best_overall_combination = hyperparameter_search(train_fn, x=cifar10.X_train_norm, y=cifar10.y_train, validation_data=(cifar10.X_test_norm, cifar10.y_test), 
                                  learning_rate=[0.0001, 0.0002, 0.0004, 0.0008, 0.0016, 0.0032], schedule=[schedule], layer_sizes=[[53, 53, 53, 53, 175]], 
                                  output_neurons=[10], min_new_neurons=[20], growth_percentage=[0.2])

Run with parameters (0.0001, 4a0f172746, [53, 53, 53, 53, 175], 10, 20, 0.2) completed, best_val_accuracy: 0.701, best_hidden_layer_sizes sizes: [53, 53, 53, 53, 175]
Run with parameters (0.0002, 4a0f172746, [53, 53, 53, 53, 175], 10, 20, 0.2) completed, best_val_accuracy: 0.7091, best_hidden_layer_sizes sizes: [53, 53, 53, 53, 175]
Run with parameters (0.0004, 4a0f172746, [53, 53, 53, 53, 175], 10, 20, 0.2) completed, best_val_accuracy: 0.7355, best_hidden_layer_sizes sizes: [53, 53, 53, 53, 175]
Run with parameters (0.0008, 4a0f172746, [53, 53, 53, 53, 175], 10, 20, 0.2) completed, best_val_accuracy: 0.7554, best_hidden_layer_sizes sizes: [53, 53, 53, 53, 175]
Run with parameters (0.0016, 4a0f172746, [53, 53, 53, 53, 175], 10, 20, 0.2) completed, best_val_accuracy: 0.743, best_hidden_layer_sizes sizes: [53, 53, 53, 53, 175]
Run with parameters (0.0032, 4a0f172746, [53, 53, 53, 53, 175], 10, 20, 0.2) completed, best_val_accuracy: 0.6991, best_hidden_layer_sizes sizes: [53, 53, 53, 53,

In [None]:
# The learning rate is the first entry of the winning parameter combination
# returned by the search above.
best_learning_rate = best_overall_combination[0]
best_learning_rate

0.0008

In [None]:
%%time

schedule = Schedule([StaticEpochNoRegularization()] * 40)
histories, mean_best_hidden_layer_sizes = cross_validate(
    train_fn, cifar10.X_norm, cifar10.y, n_splits=6, learning_rate=best_learning_rate,
    schedule=schedule, layer_sizes=[53, 53, 53, 53, 175], output_neurons=10, min_new_neurons=20, growth_percentage=0.2
)

Run 0 completed, best_val_accuracy: 0.7625, best_hidden_layer_sizes: [53, 53, 53, 53, 175]
Run 1 completed, best_val_accuracy: 0.7541, best_hidden_layer_sizes: [53, 53, 53, 53, 175]
Run 2 completed, best_val_accuracy: 0.7469, best_hidden_layer_sizes: [53, 53, 53, 53, 175]
Run 3 completed, best_val_accuracy: 0.7564, best_hidden_layer_sizes: [53, 53, 53, 53, 175]
Run 4 completed, best_val_accuracy: 0.7607, best_hidden_layer_sizes: [53, 53, 53, 53, 175]
Run 5 completed, best_val_accuracy: 0.7567, best_hidden_layer_sizes: [53, 53, 53, 53, 175]
mean_best_val_accuracy: 0.7562166666666666
mean_best_hidden_layer_sizes: [53.0, 53.0, 53.0, 53.0, 175.0]
CPU times: user 15min 27s, sys: 37.5 s, total: 16min 5s
Wall time: 12min 32s


### Group sparsity regularization

In [None]:
%%time

schedule = Schedule([DynamicEpoch(0.006, 'group_sparsity')] * 20 + [StaticEpochNoRegularization()] * 20)
histories, best_overall_combination = hyperparameter_search(train_fn, x=cifar10.X_train_norm, y=cifar10.y_train, validation_data=(cifar10.X_test_norm, cifar10.y_test), 
                                  learning_rate=[0.0001], schedule=[schedule], layer_sizes=[[100, 100, 100, 100, 100]], 
                                  output_neurons=[10], min_new_neurons=[20], growth_percentage=[0.2])

Run with parameters (0.0001, abda604a7f, [100, 100, 100, 100, 100], 10, 20, 0.2) completed, best_val_accuracy: 0.7121, best_hidden_layer_sizes sizes: [72, 41, 21, 1095, 100]
Best overall combination: (0.0001, abda604a7f, [100, 100, 100, 100, 100], 10, 20, 0.2), val_accuracy: 0.7121
CPU times: user 4min 55s, sys: 9.77 s, total: 5min 5s
Wall time: 5min 2s


In [None]:
# Parameter count for the layer sizes reported by the DynamicEpoch(0.006,
# 'group_sparsity') run. The sizes are typed in by hand from that run's printed
# best_hidden_layer_sizes and must be updated manually if it is re-run.
model = get_convolutional_model(
    cifar10.X_norm,
    layer_sizes=[72, 41, 21, 1095, 100],
    output_neurons=10,
)
model.build(cifar10.X_norm.shape)
model.count_params()

Total params: 7,253,555.0
Trainable params: 7,253,555
Non-trainable params: 0.0


(7253555.0, 7253555, 0.0)

In [None]:
%%time

schedule = Schedule([DynamicEpoch(0.009, 'group_sparsity')] * 20 + [StaticEpochNoRegularization()] * 20)
histories, best_overall_combination = hyperparameter_search(train_fn, x=cifar10.X_train_norm, y=cifar10.y_train, validation_data=(cifar10.X_test_norm, cifar10.y_test), 
                                  learning_rate=[0.0001], schedule=[schedule], layer_sizes=[[100, 100, 100, 100, 100]], 
                                  output_neurons=[10], min_new_neurons=[20], growth_percentage=[0.2])

Run with parameters (0.0001, abd10b3ecf, [100, 100, 100, 100, 100], 10, 20, 0.2) completed, best_val_accuracy: 0.7261, best_hidden_layer_sizes sizes: [42, 23, 17, 68, 103]
Best overall combination: (0.0001, abd10b3ecf, [100, 100, 100, 100, 100], 10, 20, 0.2), val_accuracy: 0.7261
CPU times: user 3min 30s, sys: 7.32 s, total: 3min 38s
Wall time: 3min 3s


In [None]:
# Parameter count for the layer sizes reported by the DynamicEpoch(0.009,
# 'group_sparsity') run (sizes typed in by hand from its printed output).
model = get_convolutional_model(
    cifar10.X_norm,
    layer_sizes=[42, 23, 17, 68, 103],
    output_neurons=10,
)
model.build(cifar10.X_norm.shape)
model.count_params()

Total params: 473,300.0
Trainable params: 473,300
Non-trainable params: 0.0


(473300.0, 473300, 0.0)

In [None]:
%%time

schedule = Schedule([DynamicEpoch(0.0085, 'group_sparsity')] * 20 + [StaticEpochNoRegularization()] * 20)
histories, best_overall_combination = hyperparameter_search(train_fn, x=cifar10.X_train_norm, y=cifar10.y_train, validation_data=(cifar10.X_test_norm, cifar10.y_test), 
                                  learning_rate=[0.0001], schedule=[schedule], layer_sizes=[[100, 100, 100, 100, 100]], 
                                  output_neurons=[10], min_new_neurons=[20], growth_percentage=[0.2])

Run with parameters (0.0001, 5b6fe21c0a, [100, 100, 100, 100, 100], 10, 20, 0.2) completed, best_val_accuracy: 0.7152, best_hidden_layer_sizes sizes: [46, 26, 16, 105, 100]
Best overall combination: (0.0001, 5b6fe21c0a, [100, 100, 100, 100, 100], 10, 20, 0.2), val_accuracy: 0.7152
CPU times: user 3min 31s, sys: 7.82 s, total: 3min 39s
Wall time: 3min 7s


In [None]:
# Parameter count for the layer sizes reported by the DynamicEpoch(0.0085,
# 'group_sparsity') run (sizes typed in by hand from its printed output).
model = get_convolutional_model(
    cifar10.X_norm,
    layer_sizes=[46, 26, 16, 105, 100],
    output_neurons=10,
)
model.build(cifar10.X_norm.shape)
model.count_params()

Total params: 704,173.0
Trainable params: 704,173
Non-trainable params: 0.0


(704173.0, 704173, 0.0)

In [None]:
%%time

schedule = Schedule([DynamicEpoch(0.0085, 'group_sparsity')] * 20 + [StaticEpochNoRegularization()] * 20)
histories, best_overall_combination = hyperparameter_search(train_fn, x=cifar10.X_train_norm, y=cifar10.y_train, validation_data=(cifar10.X_test_norm, cifar10.y_test), 
                                  learning_rate=[0.00005, 0.0002], schedule=[schedule], layer_sizes=[[100, 100, 100, 100, 100]], 
                                  output_neurons=[10], min_new_neurons=[20], growth_percentage=[0.2])

Run with parameters (5e-05, 5b6fe21c0a, [100, 100, 100, 100, 100], 10, 20, 0.2) completed, best_val_accuracy: 0.7144, best_hidden_layer_sizes sizes: [89, 55, 28, 105, 100]
Run with parameters (0.0002, 5b6fe21c0a, [100, 100, 100, 100, 100], 10, 20, 0.2) completed, best_val_accuracy: 0.6923, best_hidden_layer_sizes sizes: [25, 22, 11, 589, 105]
Best overall combination: (5e-05, 5b6fe21c0a, [100, 100, 100, 100, 100], 10, 20, 0.2), val_accuracy: 0.7144
CPU times: user 7min 18s, sys: 1min 22s, total: 8min 41s
Wall time: 7min 43s


In [None]:
%%time

schedule = Schedule([DynamicEpoch(0.0085, 'group_sparsity')] * 20 + [StaticEpochNoRegularization()] * 20)
histories, mean_best_hidden_layer_sizes = cross_validate(
    train_fn, cifar10.X_norm, cifar10.y, n_splits=6, learning_rate=0.0001,
    schedule=schedule, layer_sizes=[100, 100, 100, 100, 100], output_neurons=10, min_new_neurons=20, growth_percentage=0.2
)

Run 0 completed, best_val_accuracy: 0.7324, best_hidden_layer_sizes: [42, 30, 16, 87, 99]
Run 1 completed, best_val_accuracy: 0.7187, best_hidden_layer_sizes: [47, 24, 16, 59, 100]
Run 2 completed, best_val_accuracy: 0.7138, best_hidden_layer_sizes: [42, 22, 17, 79, 100]
Run 3 completed, best_val_accuracy: 0.7209, best_hidden_layer_sizes: [48, 24, 18, 99, 101]
Run 4 completed, best_val_accuracy: 0.7298, best_hidden_layer_sizes: [37, 33, 14, 85, 100]
Run 5 completed, best_val_accuracy: 0.7198, best_hidden_layer_sizes: [45, 23, 16, 72, 100]
mean_best_val_accuracy: 0.7225666666666667
mean_best_hidden_layer_sizes: [43.5, 26.0, 16.166666666666668, 80.16666666666667, 100.0]
CPU times: user 19min 52s, sys: 1min 21s, total: 21min 14s
Wall time: 18min 18s


In [None]:
# Round the per-layer mean sizes returned by cross_validate to integers.
# Note: Python's round() rounds exact halves to the nearest even integer
# (e.g. 43.5 -> 44, but 42.5 would give 42).
rounded_mean_best_hidden_layer_sizes = [round(x) for x in mean_best_hidden_layer_sizes]
rounded_mean_best_hidden_layer_sizes

[44, 26, 16, 80, 100]

In [None]:
# Parameter count of the CIFAR-10 network built with the rounded mean layer
# sizes from the group-sparsity cross-validation.
model = get_convolutional_model(
    cifar10.X_norm,
    layer_sizes=rounded_mean_best_hidden_layer_sizes,
    output_neurons=10,
)
model.build(cifar10.X_norm.shape)
model.count_params()

Total params: 540,024.0
Trainable params: 540,024
Non-trainable params: 0.0


(540024.0, 540024, 0.0)

## Fashion MNIST

In [None]:
# Load Fashion MNIST through the notebook's dataset helper (the recorded output
# shows the data files being downloaded).
fashion_mnist = get_fashion_mnist_dataset()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz


In [None]:
%%time

schedule = Schedule([DynamicEpoch(0.00002, 'weighted_l1')] * 20 + [StaticEpochNoRegularization()] * 20)
histories, best_overall_combination = hyperparameter_search(train_fn, x=fashion_mnist.X_train_norm, y=fashion_mnist.y_train, validation_data=(fashion_mnist.X_test_norm, fashion_mnist.y_test), 
                                  learning_rate=[0.0008], schedule=[schedule], layer_sizes=[[100, 100, 100, 100, 100]], 
                                  output_neurons=[10], min_new_neurons=[20], growth_percentage=[0.2])

Run with parameters (0.0008, ec4ba8ef9e, [100, 100, 100, 100, 100], 10, 20, 0.2) completed, best_val_accuracy: 0.9284, best_hidden_layer_sizes sizes: [16, 12, 36, 47, 236]
Best overall combination: (0.0008, ec4ba8ef9e, [100, 100, 100, 100, 100], 10, 20, 0.2), val_accuracy: 0.9284
CPU times: user 2min 44s, sys: 7.12 s, total: 2min 51s
Wall time: 2min 27s


In [None]:
%%time

schedule = Schedule([DynamicEpoch(0.00002, 'weighted_l1')] * 20 + [StaticEpochNoRegularization()] * 20)
histories, best_overall_combination = hyperparameter_search(train_fn, x=fashion_mnist.X_train_norm, y=fashion_mnist.y_train, validation_data=(fashion_mnist.X_test_norm, fashion_mnist.y_test), 
                                  learning_rate=[0.0001, 0.0002, 0.0004], schedule=[schedule], layer_sizes=[[100, 100, 100, 100, 100]], 
                                  output_neurons=[10], min_new_neurons=[20], growth_percentage=[0.2])

Run with parameters (0.0001, ec4ba8ef9e, [100, 100, 100, 100, 100], 10, 20, 0.2) completed, best_val_accuracy: 0.9215, best_hidden_layer_sizes sizes: [50, 20, 18, 42, 49]
Run with parameters (0.0002, ec4ba8ef9e, [100, 100, 100, 100, 100], 10, 20, 0.2) completed, best_val_accuracy: 0.9263, best_hidden_layer_sizes sizes: [35, 15, 20, 30, 77]
Run with parameters (0.0004, ec4ba8ef9e, [100, 100, 100, 100, 100], 10, 20, 0.2) completed, best_val_accuracy: 0.931, best_hidden_layer_sizes sizes: [28, 13, 28, 31, 119]
Best overall combination: (0.0004, ec4ba8ef9e, [100, 100, 100, 100, 100], 10, 20, 0.2), val_accuracy: 0.931
CPU times: user 8min 13s, sys: 20.3 s, total: 8min 33s
Wall time: 7min 29s


In [None]:
# The learning rate is the first entry of the winning parameter combination
# returned by the search above.
best_learning_rate = best_overall_combination[0]
best_learning_rate

0.0004

In [None]:
%%time

schedule = Schedule([DynamicEpoch(0.00002, 'weighted_l1')] * 20 + [StaticEpochNoRegularization()] * 20)
histories, mean_best_hidden_layer_sizes = cross_validate(
    train_fn, fashion_mnist.X_norm, fashion_mnist.y, n_splits=7, learning_rate=best_learning_rate,
    schedule=schedule, layer_sizes=[100, 100, 100, 100, 100], output_neurons=10, min_new_neurons=20, growth_percentage=0.2
)

Run 0 completed, best_val_accuracy: 0.9333, best_hidden_layer_sizes: [25, 15, 24, 35, 121]
Run 1 completed, best_val_accuracy: 0.9314, best_hidden_layer_sizes: [25, 13, 30, 39, 108]
Run 2 completed, best_val_accuracy: 0.9313, best_hidden_layer_sizes: [23, 14, 31, 35, 119]
Run 3 completed, best_val_accuracy: 0.9325, best_hidden_layer_sizes: [22, 14, 12, 40, 113]
Run 4 completed, best_val_accuracy: 0.9384, best_hidden_layer_sizes: [21, 15, 18, 32, 141]
Run 5 completed, best_val_accuracy: 0.9298, best_hidden_layer_sizes: [19, 17, 19, 38, 122]
Run 6 completed, best_val_accuracy: 0.9321, best_hidden_layer_sizes: [20, 15, 15, 33, 143]
mean_best_val_accuracy: 0.9326857142857143
mean_best_hidden_layer_sizes: [22.142857142857142, 14.714285714285714, 21.285714285714285, 36.0, 123.85714285714286]
CPU times: user 19min 2s, sys: 49.2 s, total: 19min 51s
Wall time: 16min 58s


In [None]:
# Round the per-layer mean sizes returned by cross_validate to integers; the
# result is reused below as a fixed layer_sizes specification.
# Note: Python's round() rounds exact halves to the nearest even integer.
rounded_mean_best_hidden_layer_sizes = [round(x) for x in mean_best_hidden_layer_sizes]
rounded_mean_best_hidden_layer_sizes

[22, 15, 21, 36, 124]

In [None]:
%%time

schedule = Schedule([StaticEpochNoRegularization()] * 40)
histories, best_overall_combination = hyperparameter_search(train_fn, x=fashion_mnist.X_train_norm, y=fashion_mnist.y_train, validation_data=(fashion_mnist.X_test_norm, fashion_mnist.y_test), 
                                  learning_rate=[0.0001, 0.0002, 0.0004, 0.0008, 0.0016, 0.0032], schedule=[schedule], layer_sizes=[rounded_mean_best_hidden_layer_sizes], 
                                  output_neurons=[10], min_new_neurons=[20], growth_percentage=[0.2])

Run with parameters (0.0001, 4a0f172746, [22, 15, 21, 36, 124], 10, 20, 0.2) completed, best_val_accuracy: 0.9066, best_hidden_layer_sizes sizes: [22, 15, 21, 36, 124]
Run with parameters (0.0002, 4a0f172746, [22, 15, 21, 36, 124], 10, 20, 0.2) completed, best_val_accuracy: 0.9126, best_hidden_layer_sizes sizes: [22, 15, 21, 36, 124]
Run with parameters (0.0004, 4a0f172746, [22, 15, 21, 36, 124], 10, 20, 0.2) completed, best_val_accuracy: 0.921, best_hidden_layer_sizes sizes: [22, 15, 21, 36, 124]
Run with parameters (0.0008, 4a0f172746, [22, 15, 21, 36, 124], 10, 20, 0.2) completed, best_val_accuracy: 0.9189, best_hidden_layer_sizes sizes: [22, 15, 21, 36, 124]
Run with parameters (0.0016, 4a0f172746, [22, 15, 21, 36, 124], 10, 20, 0.2) completed, best_val_accuracy: 0.9187, best_hidden_layer_sizes sizes: [22, 15, 21, 36, 124]
Run with parameters (0.0032, 4a0f172746, [22, 15, 21, 36, 124], 10, 20, 0.2) completed, best_val_accuracy: 0.9066, best_hidden_layer_sizes sizes: [22, 15, 21, 36

In [None]:
# The learning rate is the first entry of the winning parameter combination
# returned by the search above.
best_learning_rate = best_overall_combination[0]
best_learning_rate

0.0004

In [None]:
%%time

schedule = Schedule([StaticEpochNoRegularization()] * 40)
histories, mean_best_hidden_layer_sizes = cross_validate(
    train_fn, fashion_mnist.X_norm, fashion_mnist.y, n_splits=7, learning_rate=best_learning_rate,
    schedule=schedule, layer_sizes=rounded_mean_best_hidden_layer_sizes, output_neurons=10, min_new_neurons=20, growth_percentage=0.2
)

Run 0 completed, best_val_accuracy: 0.923, best_hidden_layer_sizes: [22, 15, 21, 36, 124]
Run 1 completed, best_val_accuracy: 0.9246, best_hidden_layer_sizes: [22, 15, 21, 36, 124]
Run 2 completed, best_val_accuracy: 0.9233, best_hidden_layer_sizes: [22, 15, 21, 36, 124]
Run 3 completed, best_val_accuracy: 0.9269, best_hidden_layer_sizes: [22, 15, 21, 36, 124]
Run 4 completed, best_val_accuracy: 0.9218, best_hidden_layer_sizes: [22, 15, 21, 36, 124]
Run 5 completed, best_val_accuracy: 0.9249, best_hidden_layer_sizes: [22, 15, 21, 36, 124]
Run 6 completed, best_val_accuracy: 0.9231, best_hidden_layer_sizes: [22, 15, 21, 36, 124]
mean_best_val_accuracy: 0.9239428571428572
mean_best_hidden_layer_sizes: [22.0, 15.0, 21.0, 36.0, 124.0]
CPU times: user 15min 17s, sys: 41 s, total: 15min 58s
Wall time: 12min 44s


In [None]:
# Parameter count of the Fashion MNIST network built with the rounded mean
# layer sizes. The sizes are taken from rounded_mean_best_hidden_layer_sizes
# instead of being re-typed from printed output, so the count stays in sync if
# the cross-validation above is re-run and yields different sizes.
model = get_convolutional_model(
    fashion_mnist.X_norm,
    layer_sizes=rounded_mean_best_hidden_layer_sizes,
    output_neurons=10,
)
model.build(fashion_mnist.X_norm.shape)
model.count_params()

Total params: 233,011.0
Trainable params: 233,011
Non-trainable params: 0.0


(233011.0, 233011, 0.0)

In [None]:
# Parameter count of a comparison network whose first four layers all have
# width 34 (last layer 124).
# NOTE(review): the width 34 is presumably chosen so the parameter count is
# close to that of the [22, 15, 21, 36, 124] network -- confirm.
model = get_convolutional_model(
    fashion_mnist.X_norm,
    layer_sizes=[34, 34, 34, 34, 124],
    output_neurons=10,
)
model.build(fashion_mnist.X_norm.shape)
model.count_params()

Total params: 239,612.0
Trainable params: 239,612
Non-trainable params: 0.0


(239612.0, 239612, 0.0)

In [None]:
%%time

schedule = Schedule([StaticEpochNoRegularization()] * 40)
histories, best_overall_combination = hyperparameter_search(train_fn, x=fashion_mnist.X_train_norm, y=fashion_mnist.y_train, validation_data=(fashion_mnist.X_test_norm, fashion_mnist.y_test), 
                                  learning_rate=[0.0001, 0.0002, 0.0004, 0.0008, 0.0016, 0.0032], schedule=[schedule], layer_sizes=[[34, 34, 34, 34, 124]], 
                                  output_neurons=[10], min_new_neurons=[20], growth_percentage=[0.2])

Run with parameters (0.0001, 4a0f172746, [34, 34, 34, 34, 124], 10, 20, 0.2) completed, best_val_accuracy: 0.9175, best_hidden_layer_sizes sizes: [34, 34, 34, 34, 124]
Run with parameters (0.0002, 4a0f172746, [34, 34, 34, 34, 124], 10, 20, 0.2) completed, best_val_accuracy: 0.9217, best_hidden_layer_sizes sizes: [34, 34, 34, 34, 124]
Run with parameters (0.0004, 4a0f172746, [34, 34, 34, 34, 124], 10, 20, 0.2) completed, best_val_accuracy: 0.9252, best_hidden_layer_sizes sizes: [34, 34, 34, 34, 124]
Run with parameters (0.0008, 4a0f172746, [34, 34, 34, 34, 124], 10, 20, 0.2) completed, best_val_accuracy: 0.9251, best_hidden_layer_sizes sizes: [34, 34, 34, 34, 124]
Run with parameters (0.0016, 4a0f172746, [34, 34, 34, 34, 124], 10, 20, 0.2) completed, best_val_accuracy: 0.9209, best_hidden_layer_sizes sizes: [34, 34, 34, 34, 124]
Run with parameters (0.0032, 4a0f172746, [34, 34, 34, 34, 124], 10, 20, 0.2) completed, best_val_accuracy: 0.9055, best_hidden_layer_sizes sizes: [34, 34, 34, 3

In [None]:
# The learning rate is the first entry of the winning parameter combination
# returned by the search above.
best_learning_rate = best_overall_combination[0]
best_learning_rate

0.0004

In [None]:
%%time

schedule = Schedule([StaticEpochNoRegularization()] * 40)
histories, mean_best_hidden_layer_sizes = cross_validate(
    train_fn, fashion_mnist.X_norm, fashion_mnist.y, n_splits=7, learning_rate=best_learning_rate,
    schedule=schedule, layer_sizes=[34, 34, 34, 34, 124], output_neurons=10, min_new_neurons=20, growth_percentage=0.2
)

Run 0 completed, best_val_accuracy: 0.9259, best_hidden_layer_sizes: [34, 34, 34, 34, 124]
Run 1 completed, best_val_accuracy: 0.9275, best_hidden_layer_sizes: [34, 34, 34, 34, 124]
Run 2 completed, best_val_accuracy: 0.9257, best_hidden_layer_sizes: [34, 34, 34, 34, 124]
Run 3 completed, best_val_accuracy: 0.9297, best_hidden_layer_sizes: [34, 34, 34, 34, 124]
Run 4 completed, best_val_accuracy: 0.9311, best_hidden_layer_sizes: [34, 34, 34, 34, 124]
Run 5 completed, best_val_accuracy: 0.926, best_hidden_layer_sizes: [34, 34, 34, 34, 124]
Run 6 completed, best_val_accuracy: 0.9271, best_hidden_layer_sizes: [34, 34, 34, 34, 124]
mean_best_val_accuracy: 0.9275714285714286
mean_best_hidden_layer_sizes: [34.0, 34.0, 34.0, 34.0, 124.0]
CPU times: user 14min 27s, sys: 42.5 s, total: 15min 10s
Wall time: 12min


### Group sparsity regularization

In [None]:
%%time

schedule = Schedule([DynamicEpoch(0.006, 'group_sparsity')] * 20 + [StaticEpochNoRegularization()] * 20)
histories, best_overall_combination = hyperparameter_search(train_fn, x=fashion_mnist.X_train_norm, y=fashion_mnist.y_train, validation_data=(fashion_mnist.X_test_norm, fashion_mnist.y_test), 
                                  learning_rate=[0.0001], schedule=[schedule], layer_sizes=[[100, 100, 100, 100, 100]], 
                                  output_neurons=[10], min_new_neurons=[20], growth_percentage=[0.2])

KeyboardInterrupt: ignored

## MNIST

In [None]:
# Load MNIST through the notebook's dataset helper (the recorded output shows
# mnist.npz being downloaded).
mnist = get_mnist_dataset()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [None]:
%%time

schedule = Schedule([DynamicEpoch(0.00002, 'weighted_l1')] * 20 + [StaticEpochNoRegularization()] * 20)
histories, best_overall_combination = hyperparameter_search(train_fn, x=mnist.X_train_norm, y=mnist.y_train, validation_data=(mnist.X_test_norm, mnist.y_test), 
                                  learning_rate=[0.0008], schedule=[schedule], layer_sizes=[[100, 100, 100, 100, 100]], 
                                  output_neurons=[10], min_new_neurons=[20], growth_percentage=[0.2])

Run with parameters (0.0008, ec4ba8ef9e, [100, 100, 100, 100, 100], 10, 20, 0.2) completed, best_val_accuracy: 0.9934, best_hidden_layer_sizes sizes: [18, 14, 40, 46, 220]
Best overall combination: (0.0008, ec4ba8ef9e, [100, 100, 100, 100, 100], 10, 20, 0.2), val_accuracy: 0.9934
CPU times: user 2min 44s, sys: 7.18 s, total: 2min 51s
Wall time: 2min 29s


In [None]:
%%time

schedule = Schedule([DynamicEpoch(0.00002, 'weighted_l1')] * 20 + [StaticEpochNoRegularization()] * 20)
histories, best_overall_combination = hyperparameter_search(train_fn, x=mnist.X_train_norm, y=mnist.y_train, validation_data=(mnist.X_test_norm, mnist.y_test), 
                                  learning_rate=[0.0001, 0.0002, 0.0004], schedule=[schedule], layer_sizes=[[100, 100, 100, 100, 100]], 
                                  output_neurons=[10], min_new_neurons=[20], growth_percentage=[0.2])

Run with parameters (0.0001, ec4ba8ef9e, [100, 100, 100, 100, 100], 10, 20, 0.2) completed, best_val_accuracy: 0.9924, best_hidden_layer_sizes sizes: [36, 20, 18, 35, 44]
Run with parameters (0.0002, ec4ba8ef9e, [100, 100, 100, 100, 100], 10, 20, 0.2) completed, best_val_accuracy: 0.9931, best_hidden_layer_sizes sizes: [19, 19, 14, 31, 39]
Run with parameters (0.0004, ec4ba8ef9e, [100, 100, 100, 100, 100], 10, 20, 0.2) completed, best_val_accuracy: 0.9933, best_hidden_layer_sizes sizes: [16, 14, 19, 45, 109]
Best overall combination: (0.0004, ec4ba8ef9e, [100, 100, 100, 100, 100], 10, 20, 0.2), val_accuracy: 0.9933
CPU times: user 8min 4s, sys: 20.5 s, total: 8min 25s
Wall time: 7min 18s


In [None]:
# The learning rate is the first entry of the winning parameter combination
# returned by the search above.
best_learning_rate = best_overall_combination[0]
best_learning_rate

0.0004

In [None]:
%%time

schedule = Schedule([DynamicEpoch(0.00002, 'weighted_l1')] * 20 + [StaticEpochNoRegularization()] * 20)
histories, mean_best_hidden_layer_sizes = cross_validate(
    train_fn, mnist.X_norm, mnist.y, n_splits=7, learning_rate=best_learning_rate,
    schedule=schedule, layer_sizes=[100, 100, 100, 100, 100], output_neurons=10, min_new_neurons=20, growth_percentage=0.2
)

Run 0 completed, best_val_accuracy: 0.9926, best_hidden_layer_sizes: [18, 15, 16, 34, 77]
Run 1 completed, best_val_accuracy: 0.9936, best_hidden_layer_sizes: [16, 13, 16, 38, 91]
Run 2 completed, best_val_accuracy: 0.9928, best_hidden_layer_sizes: [20, 13, 14, 32, 102]
Run 3 completed, best_val_accuracy: 0.9938, best_hidden_layer_sizes: [18, 14, 16, 35, 81]
Run 4 completed, best_val_accuracy: 0.9946, best_hidden_layer_sizes: [18, 14, 19, 31, 88]
Run 5 completed, best_val_accuracy: 0.9933, best_hidden_layer_sizes: [17, 16, 13, 34, 113]
Run 6 completed, best_val_accuracy: 0.994, best_hidden_layer_sizes: [23, 14, 29, 31, 100]
mean_best_val_accuracy: 0.9935285714285714
mean_best_hidden_layer_sizes: [18.571428571428573, 14.142857142857142, 17.571428571428573, 33.57142857142857, 93.14285714285714]
CPU times: user 18min 40s, sys: 50.5 s, total: 19min 31s
Wall time: 16min 42s


In [None]:
# Round the per-layer mean sizes returned by cross_validate to integers; the
# result is reused below as a fixed layer_sizes specification.
# Note: Python's round() rounds exact halves to the nearest even integer.
rounded_mean_best_hidden_layer_sizes = [round(x) for x in mean_best_hidden_layer_sizes]
rounded_mean_best_hidden_layer_sizes

[19, 14, 18, 34, 93]

In [None]:
%%time

schedule = Schedule([StaticEpochNoRegularization()] * 40)
histories, best_overall_combination = hyperparameter_search(train_fn, x=mnist.X_train_norm, y=mnist.y_train, validation_data=(mnist.X_test_norm, mnist.y_test), 
                                  learning_rate=[0.0001, 0.0002, 0.0004, 0.0008, 0.0016, 0.0032], schedule=[schedule], layer_sizes=[rounded_mean_best_hidden_layer_sizes], 
                                  output_neurons=[10], min_new_neurons=[20], growth_percentage=[0.2])

Run with parameters (0.0001, 4a0f172746, [19, 14, 18, 34, 93], 10, 20, 0.2) completed, best_val_accuracy: 0.9917, best_hidden_layer_sizes sizes: [19, 14, 18, 34, 93]
Run with parameters (0.0002, 4a0f172746, [19, 14, 18, 34, 93], 10, 20, 0.2) completed, best_val_accuracy: 0.9925, best_hidden_layer_sizes sizes: [19, 14, 18, 34, 93]
Run with parameters (0.0004, 4a0f172746, [19, 14, 18, 34, 93], 10, 20, 0.2) completed, best_val_accuracy: 0.9929, best_hidden_layer_sizes sizes: [19, 14, 18, 34, 93]
Run with parameters (0.0008, 4a0f172746, [19, 14, 18, 34, 93], 10, 20, 0.2) completed, best_val_accuracy: 0.9919, best_hidden_layer_sizes sizes: [19, 14, 18, 34, 93]
Run with parameters (0.0016, 4a0f172746, [19, 14, 18, 34, 93], 10, 20, 0.2) completed, best_val_accuracy: 0.9925, best_hidden_layer_sizes sizes: [19, 14, 18, 34, 93]
Run with parameters (0.0032, 4a0f172746, [19, 14, 18, 34, 93], 10, 20, 0.2) completed, best_val_accuracy: 0.9902, best_hidden_layer_sizes sizes: [19, 14, 18, 34, 93]
Best

In [None]:
# The learning rate is the first entry of the winning parameter combination
# returned by the search above.
best_learning_rate = best_overall_combination[0]
best_learning_rate

0.0004

In [None]:
%%time

schedule = Schedule([StaticEpochNoRegularization()] * 40)
histories, mean_best_hidden_layer_sizes = cross_validate(
    train_fn, mnist.X_norm, mnist.y, n_splits=7, learning_rate=best_learning_rate,
    schedule=schedule, layer_sizes=rounded_mean_best_hidden_layer_sizes, output_neurons=10, min_new_neurons=20, growth_percentage=0.2
)

Run 0 completed, best_val_accuracy: 0.9923, best_hidden_layer_sizes: [19, 14, 18, 34, 93]
Run 1 completed, best_val_accuracy: 0.9935, best_hidden_layer_sizes: [19, 14, 18, 34, 93]
Run 2 completed, best_val_accuracy: 0.9937, best_hidden_layer_sizes: [19, 14, 18, 34, 93]
Run 3 completed, best_val_accuracy: 0.9932, best_hidden_layer_sizes: [19, 14, 18, 34, 93]
Run 4 completed, best_val_accuracy: 0.9941, best_hidden_layer_sizes: [19, 14, 18, 34, 93]
Run 5 completed, best_val_accuracy: 0.9924, best_hidden_layer_sizes: [19, 14, 18, 34, 93]
Run 6 completed, best_val_accuracy: 0.9943, best_hidden_layer_sizes: [19, 14, 18, 34, 93]
mean_best_val_accuracy: 0.9933571428571427
mean_best_hidden_layer_sizes: [19.0, 14.0, 18.0, 34.0, 93.0]
CPU times: user 14min 52s, sys: 41.7 s, total: 15min 34s
Wall time: 12min 25s


In [None]:
# Parameter count of the MNIST network built with the rounded mean layer
# sizes. The sizes are taken from rounded_mean_best_hidden_layer_sizes instead
# of being re-typed from printed output, so the count stays in sync if the
# cross-validation above is re-run and yields different sizes.
model = get_convolutional_model(
    mnist.X_norm,
    layer_sizes=rounded_mean_best_hidden_layer_sizes,
    output_neurons=10,
)
model.build(mnist.X_norm.shape)
model.count_params()

Total params: 166,397.0
Trainable params: 166,397
Non-trainable params: 0.0


(166397.0, 166397, 0.0)

In [None]:
# Parameter count of a comparison network whose first four layers all have
# width 31 (last layer 93).
# NOTE(review): the width 31 is presumably chosen so the parameter count is
# close to that of the [19, 14, 18, 34, 93] network -- confirm.
model = get_convolutional_model(
    mnist.X_norm,
    layer_sizes=[31, 31, 31, 31, 93],
    output_neurons=10,
)
model.build(mnist.X_norm.shape)
model.count_params()

Total params: 168,650.0
Trainable params: 168,650
Non-trainable params: 0.0


(168650.0, 168650, 0.0)

In [None]:
%%time

schedule = Schedule([StaticEpochNoRegularization()] * 40)
histories, best_overall_combination = hyperparameter_search(train_fn, x=mnist.X_train_norm, y=mnist.y_train, validation_data=(mnist.X_test_norm, mnist.y_test), 
                                  learning_rate=[0.0001, 0.0002, 0.0004, 0.0008, 0.0016, 0.0032], schedule=[schedule], layer_sizes=[[31, 31, 31, 31, 93]], 
                                  output_neurons=[10], min_new_neurons=[20], growth_percentage=[0.2])

Run with parameters (0.0001, 4a0f172746, [31, 31, 31, 31, 93], 10, 20, 0.2) completed, best_val_accuracy: 0.9931, best_hidden_layer_sizes sizes: [31, 31, 31, 31, 93]
Run with parameters (0.0002, 4a0f172746, [31, 31, 31, 31, 93], 10, 20, 0.2) completed, best_val_accuracy: 0.994, best_hidden_layer_sizes sizes: [31, 31, 31, 31, 93]
Run with parameters (0.0004, 4a0f172746, [31, 31, 31, 31, 93], 10, 20, 0.2) completed, best_val_accuracy: 0.9941, best_hidden_layer_sizes sizes: [31, 31, 31, 31, 93]
Run with parameters (0.0008, 4a0f172746, [31, 31, 31, 31, 93], 10, 20, 0.2) completed, best_val_accuracy: 0.9941, best_hidden_layer_sizes sizes: [31, 31, 31, 31, 93]
Run with parameters (0.0016, 4a0f172746, [31, 31, 31, 31, 93], 10, 20, 0.2) completed, best_val_accuracy: 0.9916, best_hidden_layer_sizes sizes: [31, 31, 31, 31, 93]
Run with parameters (0.0032, 4a0f172746, [31, 31, 31, 31, 93], 10, 20, 0.2) completed, best_val_accuracy: 0.9902, best_hidden_layer_sizes sizes: [31, 31, 31, 31, 93]
Best 

In [None]:
# The learning rate is the first entry of the winning parameter combination
# returned by the search above.
best_learning_rate = best_overall_combination[0]
best_learning_rate

0.0004

In [None]:
%%time

schedule = Schedule([StaticEpochNoRegularization()] * 40)
histories, mean_best_hidden_layer_sizes = cross_validate(
    train_fn, mnist.X_norm, mnist.y, n_splits=7, learning_rate=best_learning_rate,
    schedule=schedule, layer_sizes=[31, 31, 31, 31, 93], output_neurons=10, min_new_neurons=20, growth_percentage=0.2
)

Run 0 completed, best_val_accuracy: 0.9932, best_hidden_layer_sizes: [31, 31, 31, 31, 93]
Run 1 completed, best_val_accuracy: 0.9938, best_hidden_layer_sizes: [31, 31, 31, 31, 93]
Run 2 completed, best_val_accuracy: 0.9926, best_hidden_layer_sizes: [31, 31, 31, 31, 93]
Run 3 completed, best_val_accuracy: 0.9942, best_hidden_layer_sizes: [31, 31, 31, 31, 93]
Run 4 completed, best_val_accuracy: 0.9947, best_hidden_layer_sizes: [31, 31, 31, 31, 93]
Run 5 completed, best_val_accuracy: 0.9913, best_hidden_layer_sizes: [31, 31, 31, 31, 93]
Run 6 completed, best_val_accuracy: 0.9935, best_hidden_layer_sizes: [31, 31, 31, 31, 93]
mean_best_val_accuracy: 0.9933285714285713
mean_best_hidden_layer_sizes: [31.0, 31.0, 31.0, 31.0, 93.0]
CPU times: user 14min 57s, sys: 41.7 s, total: 15min 39s
Wall time: 12min 29s


## Tiny ImageNet

In [None]:
# Load the Tiny ImageNet dataset object used by every cell in this section.
# Judging by the recorded output, this clones the dataset repository and then
# processes the downloaded files, so the cell is slow and needs network access.
tiny_imagenet = get_tiny_imagenet_dataset()

Cloning into 'IMagenet'...
remote: Enumerating objects: 120594, done.[K
remote: Total 120594 (delta 0), reused 0 (delta 0), pack-reused 120594[K
Receiving objects: 100% (120594/120594), 212.68 MiB | 28.49 MiB/s, done.
Resolving deltas: 100% (1115/1115), done.
Checking out files: 100% (120206/120206), done.
Processing the downloaded dataset...


In [None]:
%%time

# Learning-rate search on Tiny ImageNet with a two-phase schedule: 20
# DynamicEpoch epochs using 'weighted_l1', then 20 StaticEpochNoRegularization
# epochs, starting from layer_sizes of five 100s.
# NOTE(review): 0.00002 is presumably the regularization strength — confirm
# against DynamicEpoch.
# NOTE(review): candidates are scored on the X_test_norm / y_test split.
schedule = Schedule(
    [DynamicEpoch(0.00002, 'weighted_l1')] * 20
    + [StaticEpochNoRegularization()] * 20
)
histories, best_overall_combination = hyperparameter_search(
    train_fn,
    x=tiny_imagenet.X_train_norm,
    y=tiny_imagenet.y_train,
    validation_data=(tiny_imagenet.X_test_norm, tiny_imagenet.y_test),
    # Keyword order is left untouched: the printed combination tuple lists the
    # values in this order and a later cell reads index 0 as the learning rate.
    learning_rate=[0.0001, 0.0002, 0.0004],
    schedule=[schedule],
    layer_sizes=[[100, 100, 100, 100, 100]],
    output_neurons=[200],
    min_new_neurons=[20],
    growth_percentage=[0.2],
)

Run with parameters (0.0001, ec4ba8ef9e, [100, 100, 100, 100, 100], 200, 20, 0.2) completed, best_val_accuracy: 0.1674, best_hidden_layer_sizes sizes: [76, 22, 33, 36, 120]
Run with parameters (0.0002, ec4ba8ef9e, [100, 100, 100, 100, 100], 200, 20, 0.2) completed, best_val_accuracy: 0.1669, best_hidden_layer_sizes sizes: [66, 14, 29, 51, 190]
Run with parameters (0.0004, ec4ba8ef9e, [100, 100, 100, 100, 100], 200, 20, 0.2) completed, best_val_accuracy: 0.1766, best_hidden_layer_sizes sizes: [48, 15, 45, 38, 381]
Best overall combination: (0.0004, ec4ba8ef9e, [100, 100, 100, 100, 100], 200, 20, 0.2), val_accuracy: 0.1766
CPU times: user 46min 45s, sys: 54 s, total: 47min 39s
Wall time: 38min 22s


In [None]:
# Keep the learning rate of the winning combination. Index 0 is the learning
# rate: it is the first value of the "Best overall combination" tuple printed
# by the search above.
best_learning_rate = best_overall_combination[0]
# Bare expression so the notebook displays the selected value.
best_learning_rate

0.0004

In [None]:
%%time

# Cross-validate (n_splits=11) the two-phase 'weighted_l1' schedule on the full
# Tiny ImageNet arrays with the learning rate chosen by the search above.
# The second return value is the per-layer mean of the best hidden layer sizes
# over the runs; a later cell rounds it into a fixed architecture.
schedule = Schedule(
    [DynamicEpoch(0.00002, 'weighted_l1')] * 20
    + [StaticEpochNoRegularization()] * 20
)
histories, mean_best_hidden_layer_sizes = cross_validate(
    train_fn,
    tiny_imagenet.X_norm,
    tiny_imagenet.y,
    n_splits=11,
    learning_rate=best_learning_rate,
    schedule=schedule,
    layer_sizes=[100, 100, 100, 100, 100],
    output_neurons=200,
    min_new_neurons=20,
    growth_percentage=0.2,
)

Run 0 completed, best_val_accuracy: 0.1862, best_hidden_layer_sizes: [49, 12, 38, 46, 404]
Run 1 completed, best_val_accuracy: 0.1854, best_hidden_layer_sizes: [51, 15, 46, 46, 384]
Run 2 completed, best_val_accuracy: 0.1917, best_hidden_layer_sizes: [46, 14, 40, 47, 391]
Run 3 completed, best_val_accuracy: 0.1792, best_hidden_layer_sizes: [49, 12, 48, 62, 376]
Run 4 completed, best_val_accuracy: 0.2002, best_hidden_layer_sizes: [45, 13, 38, 41, 371]
Run 5 completed, best_val_accuracy: 0.1792, best_hidden_layer_sizes: [42, 12, 56, 49, 358]
Run 6 completed, best_val_accuracy: 0.19, best_hidden_layer_sizes: [52, 13, 66, 52, 348]
Run 7 completed, best_val_accuracy: 0.213, best_hidden_layer_sizes: [45, 13, 53, 31, 388]
Run 8 completed, best_val_accuracy: 0.1825, best_hidden_layer_sizes: [45, 13, 38, 80, 396]
Run 9 completed, best_val_accuracy: 0.1743, best_hidden_layer_sizes: [43, 15, 54, 79, 368]
Run 10 completed, best_val_accuracy: 0.1804, best_hidden_layer_sizes: [46, 13, 50, 66, 362]
m

In [None]:
# Turn the per-layer mean sizes from cross-validation into integer layer sizes
# that can be passed as `layer_sizes` to later runs.
rounded_mean_best_hidden_layer_sizes = list(map(round, mean_best_hidden_layer_sizes))
# Bare expression so the notebook displays the rounded sizes.
rounded_mean_best_hidden_layer_sizes

[47, 13, 48, 54, 377]

In [None]:
%%time

# Learning-rate search for a fixed-size comparison run: the rounded mean layer
# sizes are trained for 40 StaticEpochNoRegularization epochs.
schedule = Schedule(
    [StaticEpochNoRegularization()] * 40
)
histories, best_overall_combination = hyperparameter_search(
    train_fn,
    x=tiny_imagenet.X_train_norm,
    y=tiny_imagenet.y_train,
    validation_data=(tiny_imagenet.X_test_norm, tiny_imagenet.y_test),
    # Keyword order is left untouched: the printed combination tuple lists the
    # values in this order and a later cell reads index 0 as the learning rate.
    learning_rate=[0.0001, 0.0002, 0.0004, 0.0008, 0.0016, 0.0032],
    schedule=[schedule],
    layer_sizes=[rounded_mean_best_hidden_layer_sizes],
    output_neurons=[200],
    min_new_neurons=[20],
    growth_percentage=[0.2],
)

Run with parameters (0.0001, 4a0f172746, [47, 13, 48, 54, 377], 200, 20, 0.2) completed, best_val_accuracy: 0.088, best_hidden_layer_sizes sizes: [47, 13, 48, 54, 377]
Run with parameters (0.0002, 4a0f172746, [47, 13, 48, 54, 377], 200, 20, 0.2) completed, best_val_accuracy: 0.0932, best_hidden_layer_sizes sizes: [47, 13, 48, 54, 377]
Run with parameters (0.0004, 4a0f172746, [47, 13, 48, 54, 377], 200, 20, 0.2) completed, best_val_accuracy: 0.0995, best_hidden_layer_sizes sizes: [47, 13, 48, 54, 377]
Run with parameters (0.0008, 4a0f172746, [47, 13, 48, 54, 377], 200, 20, 0.2) completed, best_val_accuracy: 0.0923, best_hidden_layer_sizes sizes: [47, 13, 48, 54, 377]
Run with parameters (0.0016, 4a0f172746, [47, 13, 48, 54, 377], 200, 20, 0.2) completed, best_val_accuracy: 0.0852, best_hidden_layer_sizes sizes: [47, 13, 48, 54, 377]
Run with parameters (0.0032, 4a0f172746, [47, 13, 48, 54, 377], 200, 20, 0.2) completed, best_val_accuracy: 0.005, best_hidden_layer_sizes sizes: [47, 13, 4

In [None]:
# Keep the learning rate of the winning combination. Index 0 is the learning
# rate: it is the first value in the parameter tuples printed by the search.
best_learning_rate = best_overall_combination[0]
# Bare expression so the notebook displays the selected value.
best_learning_rate

0.0004

In [None]:
%%time

# Cross-validate (n_splits=11) the rounded mean architecture with 40
# StaticEpochNoRegularization epochs and the learning rate selected above.
# In the recorded run the reported best_hidden_layer_sizes equal the input
# sizes for every fold.
schedule = Schedule(
    [StaticEpochNoRegularization()] * 40
)
histories, mean_best_hidden_layer_sizes = cross_validate(
    train_fn,
    tiny_imagenet.X_norm,
    tiny_imagenet.y,
    n_splits=11,
    learning_rate=best_learning_rate,
    schedule=schedule,
    layer_sizes=rounded_mean_best_hidden_layer_sizes,
    output_neurons=200,
    min_new_neurons=20,
    growth_percentage=0.2,
)

Run 0 completed, best_val_accuracy: 0.1087, best_hidden_layer_sizes: [47, 13, 48, 54, 377]
Run 1 completed, best_val_accuracy: 0.1094, best_hidden_layer_sizes: [47, 13, 48, 54, 377]
Run 2 completed, best_val_accuracy: 0.1176, best_hidden_layer_sizes: [47, 13, 48, 54, 377]
Run 3 completed, best_val_accuracy: 0.111, best_hidden_layer_sizes: [47, 13, 48, 54, 377]
Run 4 completed, best_val_accuracy: 0.1162, best_hidden_layer_sizes: [47, 13, 48, 54, 377]
Run 5 completed, best_val_accuracy: 0.1078, best_hidden_layer_sizes: [47, 13, 48, 54, 377]
Run 6 completed, best_val_accuracy: 0.1183, best_hidden_layer_sizes: [47, 13, 48, 54, 377]
Run 7 completed, best_val_accuracy: 0.1164, best_hidden_layer_sizes: [47, 13, 48, 54, 377]
Run 8 completed, best_val_accuracy: 0.1093, best_hidden_layer_sizes: [47, 13, 48, 54, 377]
Run 9 completed, best_val_accuracy: 0.1149, best_hidden_layer_sizes: [47, 13, 48, 54, 377]
Run 10 completed, best_val_accuracy: 0.1138, best_hidden_layer_sizes: [47, 13, 48, 54, 377]

In [None]:
# Report the parameter count of the architecture that was cross-validated in
# the previous cell. The sizes are read from
# rounded_mean_best_hidden_layer_sizes (the value passed to that
# cross-validation) rather than re-typed as a literal, so the count cannot
# drift from the architecture that was actually trained.
# Recorded run: [47, 13, 48, 54, 377] -> 5,323,499 parameters.
model = get_convolutional_model(
    tiny_imagenet.X_norm,
    layer_sizes=rounded_mean_best_hidden_layer_sizes,
    output_neurons=200,
)
model.build(tiny_imagenet.X_norm.shape)
# Per the recorded output, count_params() prints the totals and its return
# value (a tuple) is displayed as the cell result.
model.count_params()

Total params: 5,323,499.0
Trainable params: 5,323,499
Non-trainable params: 0.0


(5323499.0, 5323499, 0.0)

In [None]:
# Parameter count for the uniform-width comparison architecture.
# NOTE(review): 54 looks chosen so that the total (5,368,031 in the recorded
# run) roughly matches the [47, 13, 48, 54, 377] model (5,323,499) — confirm.
model = get_convolutional_model(
    tiny_imagenet.X_norm,
    layer_sizes=[54, 54, 54, 54, 377],
    output_neurons=200,
)
model.build(tiny_imagenet.X_norm.shape)
# Per the recorded output, count_params() prints the totals and its return
# value (a tuple) is displayed as the cell result.
model.count_params()

Total params: 5,368,031.0
Trainable params: 5,368,031
Non-trainable params: 0.0


(5368031.0, 5368031, 0.0)

In [None]:
%%time

# Learning-rate search for the uniform-width [54, 54, 54, 54, 377] architecture
# trained for 40 StaticEpochNoRegularization epochs.
schedule = Schedule(
    [StaticEpochNoRegularization()] * 40
)
histories, best_overall_combination = hyperparameter_search(
    train_fn,
    x=tiny_imagenet.X_train_norm,
    y=tiny_imagenet.y_train,
    validation_data=(tiny_imagenet.X_test_norm, tiny_imagenet.y_test),
    # Keyword order is left untouched: the printed combination tuple lists the
    # values in this order and a later cell reads index 0 as the learning rate.
    learning_rate=[0.0001, 0.0002, 0.0004, 0.0008, 0.0016, 0.0032],
    schedule=[schedule],
    layer_sizes=[[54, 54, 54, 54, 377]],
    output_neurons=[200],
    min_new_neurons=[20],
    growth_percentage=[0.2],
)

Run with parameters (0.0001, 4a0f172746, [54, 54, 54, 54, 377], 200, 20, 0.2) completed, best_val_accuracy: 0.105, best_hidden_layer_sizes sizes: [54, 54, 54, 54, 377]
Run with parameters (0.0002, 4a0f172746, [54, 54, 54, 54, 377], 200, 20, 0.2) completed, best_val_accuracy: 0.0968, best_hidden_layer_sizes sizes: [54, 54, 54, 54, 377]
Run with parameters (0.0004, 4a0f172746, [54, 54, 54, 54, 377], 200, 20, 0.2) completed, best_val_accuracy: 0.0951, best_hidden_layer_sizes sizes: [54, 54, 54, 54, 377]
Run with parameters (0.0008, 4a0f172746, [54, 54, 54, 54, 377], 200, 20, 0.2) completed, best_val_accuracy: 0.0974, best_hidden_layer_sizes sizes: [54, 54, 54, 54, 377]
Run with parameters (0.0016, 4a0f172746, [54, 54, 54, 54, 377], 200, 20, 0.2) completed, best_val_accuracy: 0.005, best_hidden_layer_sizes sizes: [54, 54, 54, 54, 377]
Run with parameters (0.0032, 4a0f172746, [54, 54, 54, 54, 377], 200, 20, 0.2) completed, best_val_accuracy: 0.005, best_hidden_layer_sizes sizes: [54, 54, 54

In [None]:
# Keep the learning rate of the winning combination. Index 0 is the learning
# rate: it is the first value in the parameter tuples printed by the search.
best_learning_rate = best_overall_combination[0]
# Bare expression so the notebook displays the selected value.
best_learning_rate

0.0001

In [None]:
%%time

# Cross-validate (n_splits=11) the uniform-width [54, 54, 54, 54, 377]
# architecture with 40 StaticEpochNoRegularization epochs and the learning rate
# selected above.
schedule = Schedule(
    [StaticEpochNoRegularization()] * 40
)
histories, mean_best_hidden_layer_sizes = cross_validate(
    train_fn,
    tiny_imagenet.X_norm,
    tiny_imagenet.y,
    n_splits=11,
    learning_rate=best_learning_rate,
    schedule=schedule,
    layer_sizes=[54, 54, 54, 54, 377],
    output_neurons=200,
    min_new_neurons=20,
    growth_percentage=0.2,
)

Run 0 completed, best_val_accuracy: 0.1139, best_hidden_layer_sizes: [54, 54, 54, 54, 377]
Run 1 completed, best_val_accuracy: 0.1136, best_hidden_layer_sizes: [54, 54, 54, 54, 377]
Run 2 completed, best_val_accuracy: 0.1129, best_hidden_layer_sizes: [54, 54, 54, 54, 377]
Run 3 completed, best_val_accuracy: 0.1177, best_hidden_layer_sizes: [54, 54, 54, 54, 377]
Run 4 completed, best_val_accuracy: 0.1136, best_hidden_layer_sizes: [54, 54, 54, 54, 377]
Run 5 completed, best_val_accuracy: 0.1063, best_hidden_layer_sizes: [54, 54, 54, 54, 377]
Run 6 completed, best_val_accuracy: 0.115, best_hidden_layer_sizes: [54, 54, 54, 54, 377]
Run 7 completed, best_val_accuracy: 0.1142, best_hidden_layer_sizes: [54, 54, 54, 54, 377]
Run 8 completed, best_val_accuracy: 0.1077, best_hidden_layer_sizes: [54, 54, 54, 54, 377]
Run 9 completed, best_val_accuracy: 0.1128, best_hidden_layer_sizes: [54, 54, 54, 54, 377]
Run 10 completed, best_val_accuracy: 0.1168, best_hidden_layer_sizes: [54, 54, 54, 54, 377]

### Group sparsity regularization

In [None]:
%%time

# Single run (one learning rate) with the 'group_sparsity' option at 0.014:
# 20 DynamicEpoch epochs followed by 20 StaticEpochNoRegularization epochs.
# NOTE(review): 0.014 is presumably the regularization strength — confirm
# against DynamicEpoch.
schedule = Schedule(
    [DynamicEpoch(0.014, 'group_sparsity')] * 20
    + [StaticEpochNoRegularization()] * 20
)
histories, best_overall_combination = hyperparameter_search(
    train_fn,
    x=tiny_imagenet.X_train_norm,
    y=tiny_imagenet.y_train,
    validation_data=(tiny_imagenet.X_test_norm, tiny_imagenet.y_test),
    learning_rate=[0.0001],
    schedule=[schedule],
    layer_sizes=[[100, 100, 100, 100, 100]],
    output_neurons=[200],
    min_new_neurons=[20],
    growth_percentage=[0.2],
)

Run with parameters (0.0001, d60ab0bdd4, [100, 100, 100, 100, 100], 200, 20, 0.2) completed, best_val_accuracy: 0.1268, best_hidden_layer_sizes sizes: [55, 8, 12, 31, 781]
Best overall combination: (0.0001, d60ab0bdd4, [100, 100, 100, 100, 100], 200, 20, 0.2), val_accuracy: 0.1268
CPU times: user 14min 39s, sys: 15.7 s, total: 14min 55s
Wall time: 11min 59s


In [None]:
# Parameter count for the layer sizes reported as best_hidden_layer_sizes by
# the 0.014 'group_sparsity' run above (copied from its printed output).
model = get_convolutional_model(
    tiny_imagenet.X_norm,
    layer_sizes=[55, 8, 12, 31, 781],
    output_neurons=200,
)
model.build(tiny_imagenet.X_norm.shape)
# Per the recorded output, count_params() prints the totals and its return
# value (a tuple) is displayed as the cell result.
model.count_params()

Total params: 6,364,960.0
Trainable params: 6,364,960
Non-trainable params: 0.0


(6364960.0, 6364960, 0.0)

In [None]:
%%time

# Single run (one learning rate) with the 'group_sparsity' option at 0.016:
# 20 DynamicEpoch epochs followed by 20 StaticEpochNoRegularization epochs.
# NOTE(review): 0.016 is presumably the regularization strength — confirm
# against DynamicEpoch.
schedule = Schedule(
    [DynamicEpoch(0.016, 'group_sparsity')] * 20
    + [StaticEpochNoRegularization()] * 20
)
histories, best_overall_combination = hyperparameter_search(
    train_fn,
    x=tiny_imagenet.X_train_norm,
    y=tiny_imagenet.y_train,
    validation_data=(tiny_imagenet.X_test_norm, tiny_imagenet.y_test),
    learning_rate=[0.0001],
    schedule=[schedule],
    layer_sizes=[[100, 100, 100, 100, 100]],
    output_neurons=[200],
    min_new_neurons=[20],
    growth_percentage=[0.2],
)

Run with parameters (0.0001, 02547f1454, [100, 100, 100, 100, 100], 200, 20, 0.2) completed, best_val_accuracy: 0.1389, best_hidden_layer_sizes sizes: [33, 11, 13, 35, 741]
Best overall combination: (0.0001, 02547f1454, [100, 100, 100, 100, 100], 200, 20, 0.2), val_accuracy: 0.1389
CPU times: user 13min 33s, sys: 16.4 s, total: 13min 49s
Wall time: 10min 55s


In [None]:
# Parameter count for the layer sizes reported as best_hidden_layer_sizes by
# the 0.016 'group_sparsity' run above (copied from its printed output).
model = get_convolutional_model(
    tiny_imagenet.X_norm,
    layer_sizes=[33, 11, 13, 35, 741],
    output_neurons=200,
)
model.build(tiny_imagenet.X_norm.shape)
# Per the recorded output, count_params() prints the totals and its return
# value (a tuple) is displayed as the cell result.
model.count_params()

Total params: 6,798,133.0
Trainable params: 6,798,133
Non-trainable params: 0.0


(6798133.0, 6798133, 0.0)

In [None]:
%%time

# Single run (one learning rate) with the 'group_sparsity' option at 0.018:
# 20 DynamicEpoch epochs followed by 20 StaticEpochNoRegularization epochs.
# NOTE(review): 0.018 is presumably the regularization strength — confirm
# against DynamicEpoch.
schedule = Schedule(
    [DynamicEpoch(0.018, 'group_sparsity')] * 20
    + [StaticEpochNoRegularization()] * 20
)
histories, best_overall_combination = hyperparameter_search(
    train_fn,
    x=tiny_imagenet.X_train_norm,
    y=tiny_imagenet.y_train,
    validation_data=(tiny_imagenet.X_test_norm, tiny_imagenet.y_test),
    learning_rate=[0.0001],
    schedule=[schedule],
    layer_sizes=[[100, 100, 100, 100, 100]],
    output_neurons=[200],
    min_new_neurons=[20],
    growth_percentage=[0.2],
)

Run with parameters (0.0001, bc920eb6bb, [100, 100, 100, 100, 100], 200, 20, 0.2) completed, best_val_accuracy: 0.1234, best_hidden_layer_sizes sizes: [35, 8, 11, 31, 419]
Best overall combination: (0.0001, bc920eb6bb, [100, 100, 100, 100, 100], 200, 20, 0.2), val_accuracy: 0.1234
CPU times: user 12min 51s, sys: 15.4 s, total: 13min 7s
Wall time: 10min 19s


In [None]:
# Parameter count for the layer sizes reported as best_hidden_layer_sizes by
# the 0.018 'group_sparsity' run above (copied from its printed output).
model = get_convolutional_model(
    tiny_imagenet.X_norm,
    layer_sizes=[35, 8, 11, 31, 419],
    output_neurons=200,
)
model.build(tiny_imagenet.X_norm.shape)
# Per the recorded output, count_params() prints the totals and its return
# value (a tuple) is displayed as the cell result.
model.count_params()

Total params: 3,417,014.0
Trainable params: 3,417,014
Non-trainable params: 0.0


(3417014.0, 3417014, 0.0)

In [None]:
%%time

# Learning-rate search (three candidates) for the 0.016 'group_sparsity'
# schedule: 20 DynamicEpoch epochs followed by 20 StaticEpochNoRegularization
# epochs.
schedule = Schedule(
    [DynamicEpoch(0.016, 'group_sparsity')] * 20
    + [StaticEpochNoRegularization()] * 20
)
histories, best_overall_combination = hyperparameter_search(
    train_fn,
    x=tiny_imagenet.X_train_norm,
    y=tiny_imagenet.y_train,
    validation_data=(tiny_imagenet.X_test_norm, tiny_imagenet.y_test),
    learning_rate=[0.00005, 0.0001, 0.0002],
    schedule=[schedule],
    layer_sizes=[[100, 100, 100, 100, 100]],
    output_neurons=[200],
    min_new_neurons=[20],
    growth_percentage=[0.2],
)

Run with parameters (5e-05, 02547f1454, [100, 100, 100, 100, 100], 200, 20, 0.2) completed, best_val_accuracy: 0.1336, best_hidden_layer_sizes sizes: [71, 19, 20, 71, 160]
Run with parameters (0.0001, 02547f1454, [100, 100, 100, 100, 100], 200, 20, 0.2) completed, best_val_accuracy: 0.1387, best_hidden_layer_sizes sizes: [29, 10, 15, 33, 807]
Run with parameters (0.0002, 02547f1454, [100, 100, 100, 100, 100], 200, 20, 0.2) completed, best_val_accuracy: 0.1124, best_hidden_layer_sizes sizes: [23, 7, 6, 23, 1711]
Best overall combination: (0.0001, 02547f1454, [100, 100, 100, 100, 100], 200, 20, 0.2), val_accuracy: 0.1387
CPU times: user 42min 4s, sys: 50.1 s, total: 42min 54s
Wall time: 34min 8s


In [None]:
# Parameter count for the layer sizes reported by the best run (learning rate
# 0.0001) of the search above; the sizes are copied from its printed output.
model = get_convolutional_model(
    tiny_imagenet.X_norm,
    layer_sizes=[29, 10, 15, 33, 807],
    output_neurons=200,
)
model.build(tiny_imagenet.X_norm.shape)
# Per the recorded output, count_params() prints the totals and its return
# value (a tuple) is displayed as the cell result.
model.count_params()

Total params: 6,989,228.0
Trainable params: 6,989,228
Non-trainable params: 0.0


(6989228.0, 6989228, 0.0)

In [None]:
%%time

# Cross-validate (n_splits=11) the 0.016 'group_sparsity' schedule on the full
# Tiny ImageNet arrays. The learning rate is written out as 0.0001, which is
# the value of the best combination printed by the search above.
schedule = Schedule(
    [DynamicEpoch(0.016, 'group_sparsity')] * 20
    + [StaticEpochNoRegularization()] * 20
)
histories, mean_best_hidden_layer_sizes = cross_validate(
    train_fn,
    tiny_imagenet.X_norm,
    tiny_imagenet.y,
    n_splits=11,
    learning_rate=0.0001,
    schedule=schedule,
    layer_sizes=[100, 100, 100, 100, 100],
    output_neurons=200,
    min_new_neurons=20,
    growth_percentage=0.2,
)

Run 0 completed, best_val_accuracy: 0.1786, best_hidden_layer_sizes: [58, 12, 10, 27, 569]
Run 1 completed, best_val_accuracy: 0.1581, best_hidden_layer_sizes: [38, 9, 9, 26, 506]
Run 2 completed, best_val_accuracy: 0.1702, best_hidden_layer_sizes: [44, 8, 11, 31, 507]
Run 3 completed, best_val_accuracy: 0.1729, best_hidden_layer_sizes: [39, 9, 9, 28, 496]
Run 4 completed, best_val_accuracy: 0.1791, best_hidden_layer_sizes: [40, 9, 10, 35, 612]
Run 5 completed, best_val_accuracy: 0.1723, best_hidden_layer_sizes: [36, 9, 10, 31, 552]
Run 6 completed, best_val_accuracy: 0.1706, best_hidden_layer_sizes: [41, 9, 10, 25, 505]
Run 7 completed, best_val_accuracy: 0.179, best_hidden_layer_sizes: [38, 11, 9, 25, 484]
Run 8 completed, best_val_accuracy: 0.1678, best_hidden_layer_sizes: [41, 8, 10, 31, 484]
Run 9 completed, best_val_accuracy: 0.1617, best_hidden_layer_sizes: [37, 10, 9, 29, 513]
Run 10 completed, best_val_accuracy: 0.168, best_hidden_layer_sizes: [47, 11, 9, 24, 585]
mean_best_va

In [None]:
# Per-layer mean sizes pinned as a literal. The numbers equal the means of the
# best_hidden_layer_sizes printed for the 11 group-sparsity cross-validation
# runs above (e.g. 459 / 11 for the first layer).
# NOTE(review): this overwrites whatever cross_validate returned; presumably it
# exists so later cells can run without repeating the long cross-validation —
# confirm.
mean_best_hidden_layer_sizes = [
    41.72727272727273,
    9.545454545454545,
    9.636363636363637,
    28.363636363636363,
    528.4545454545455,
]

In [None]:
# Turn the per-layer mean sizes into integer layer sizes that can be passed as
# `layer_sizes` when building a model.
rounded_mean_best_hidden_layer_sizes = list(map(round, mean_best_hidden_layer_sizes))
# Bare expression so the notebook displays the rounded sizes.
rounded_mean_best_hidden_layer_sizes

[42, 10, 10, 28, 528]

In [None]:
# Parameter count for the rounded mean architecture from the group-sparsity
# cross-validation ([42, 10, 10, 28, 528] in the recorded run).
model = get_convolutional_model(
    tiny_imagenet.X_norm,
    layer_sizes=rounded_mean_best_hidden_layer_sizes,
    output_neurons=200,
)
model.build(tiny_imagenet.X_norm.shape)
# Per the recorded output, count_params() prints the totals and its return
# value (a tuple) is displayed as the cell result.
model.count_params()

Total params: 3,899,456.0
Trainable params: 3,899,456
Non-trainable params: 0.0


(3899456.0, 3899456, 0.0)