In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Sat Jan  8 17:10:50 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn

%matplotlib inline

In [3]:
dtype = 'float32'
tf.keras.backend.set_floatx(dtype)

In [4]:
cifar10 = tf.keras.datasets.cifar10
(X_train, y_train), (X_test, y_test) = cifar10.load_data()

X_train = X_train.astype(dtype) / 255.0
y_train = y_train.astype(dtype)
X_test = X_test.astype(dtype)  / 255.0
y_test = y_test.astype(dtype)

X_train = np.reshape(X_train, (-1, 3072))
X_test = np.reshape(X_test, (-1, 3072))

X = np.concatenate((X_train, X_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz


In [5]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)  # Scaling each feature independently

X_norm = scaler.transform(X)
X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)

X_norm = np.reshape(X_norm, (-1, 32, 32, 3))
X_train_norm = np.reshape(X_train_norm, (-1, 32, 32, 3))
X_test_norm = np.reshape(X_test_norm, (-1, 32, 32, 3))

In [24]:
################################################################################
# REGULARIZERS
################################################################################


class Regularizer(tf.keras.regularizers.Regularizer):
    def __init__(self, regularization_penalty, regularization_method):
        self.regularization_penalty = regularization_penalty
        self.regularization_method = regularization_method
        self.n_new_neurons = 0
        self.scaling_tensor = None
        if self.regularization_method == 'weighted_l1_reordered':
            self.update_scaling_tensor = True
        else:
            self.update_scaling_tensor = None

    def __call__(self, x):
        if self.regularization_method == 'weighted_l1':
            return self.weighted_l1(x)
        elif self.regularization_method == 'weighted_l1_reordered':
            return self.weighted_l1_reordered(x)
        elif self.regularization_method == 'group_sparsity':
            return self.group_sparsity(x)
        else:
            raise NotImplementedError(f"Unknown regularization method {self.regularization_method}")
    
    def weighted_l1(self, x):
        # I.e. for a parameter matrix of 4 input and 10 output neurons:
        #
        # [[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        #  [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        #  [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        #  [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]
        #
        # the scaling tensor, as well as the resulting weighted values, could be:
        #
        # [[ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.],
        #  [ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.],
        #  [ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.],
        #  [ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.]]
        #
        # Therefore every additional output neuron is regularized more.

        scaling_tensor = tf.cumsum(tf.constant(self.regularization_penalty, shape=x.shape, dtype=dtype), axis=-1)
        weighted_values = scaling_tensor * tf.abs(x)
        return tf.reduce_sum(weighted_values)
    
    def weighted_l1_reordered(self, x):
        # I.e. for a parameter matrix of 4 input and 10 output neurons:
        #
        # [[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        #  [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        #  [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        #  [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]
        #
        # the scaling tensor, as well as the resulting weighted values, could be:
        #
        # [[ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.],
        #  [ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.],
        #  [ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.],
        #  [ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.]]
        #
        # Therefore every additional output neuron is regularized more.

        if self.update_scaling_tensor:
            scaling_tensor_raw = tf.cumsum(tf.constant(self.regularization_penalty, shape=x.shape, dtype=dtype), axis=-1)

            scaling_tensor_old_neurons = scaling_tensor_raw[:, :-self.n_new_neurons]
            scaling_tensor_new_neurons = scaling_tensor_raw[:, -self.n_new_neurons:]
            scaling_tensor_old_neurons_shuffled = tf.transpose(tf.random.shuffle(tf.transpose(scaling_tensor_old_neurons)))
            self.scaling_tensor = tf.concat([scaling_tensor_old_neurons_shuffled, scaling_tensor_new_neurons], axis=-1)
            self.update_scaling_tensor = False

        weighted_values = self.scaling_tensor * tf.abs(x)
        return tf.reduce_sum(weighted_values)
    
    def group_sparsity(self, x):
        # I.e. for a parameter matrix of 3 input and 5 output neurons:
        #
        # [[1., 1., 1., 1., 1.],
        #  [1., 2., 2., 1., 2.],
        #  [2., 2., 3., 1., 3.]]
        #
        # The resulting vector of group norms is [2., 2., 3., 1., 3.], therefore for
        # every output neuron, its incoming connections form a group.

        group_norms = tf.norm(x, ord=2, axis=0)
        # assert group_norms.shape[0] == x.shape[1]
        return self.regularization_penalty * tf.reduce_sum(group_norms)
    
    def prune(self):
        pass
    
    def grow(self, n_new_neurons):
        self.n_new_neurons = n_new_neurons
        if self.regularization_method == 'weighted_l1_reordered':
            self.update_scaling_tensor = True

    def get_config(self):
        return {'regularization_penalty': float(self.regularization_penalty)}


################################################################################
# LAYERS
################################################################################


class CustomLayer(tf.keras.layers.Layer):
    def __init__(self, input_shape):
        super().__init__()

        self.inpt_shp = input_shape


class Dense(CustomLayer):
    def __init__(self, units, activation, regularization_penalty=0.01, 
                 regularization_method='weighted_l1', kernel_initializer='glorot_uniform', 
                 bias_initializer='zeros', input_shape=None, fixed_size=False):
        super().__init__(input_shape)

        self.units = units
        self.activation = activation
        self.regularization_penalty = regularization_penalty
        self.regularization_method = regularization_method
        self.kernel_initializer = kernel_initializer
        self.bias_initializer = bias_initializer
        self.fixed_size = fixed_size
        
        self.A = tf.keras.activations.get(activation)
        self.W_init = tf.keras.initializers.get(kernel_initializer)
        self.b_init = tf.keras.initializers.get(bias_initializer)
        self.regularizer = Regularizer(self.regularization_penalty, self.regularization_method)
    
    def build(self, input_shape):
        input_units = input_shape[-1]

        self.W = tf.Variable(
            name='W',
            initial_value=self.W_init(shape=(input_units, self.units), dtype=dtype),
            trainable=True)
        
        self.b = tf.Variable(
            name='b',
            initial_value=self.b_init(shape=(self.units,), dtype=dtype),
            trainable=True)
        
        if self.regularization_method is not None:
            self.add_loss(lambda: self.regularizer(tf.concat([self.W, tf.reshape(self.b, (1, -1))], axis=0)))
    
    def call(self, inputs, training=None):
        return self.A(tf.matmul(inputs, self.W) + self.b)

    def get_size(self):
        return self.W.shape[0], self.W.shape[1]
    
    def prune(self, threshold, active_input_units_indices):
        # Remove connections from pruned units in previous layer
        new_W = tf.gather(self.W.value(), active_input_units_indices, axis=0)

        if self.fixed_size:
            active_output_neurons_indices = list(range(new_W.shape[1]))
        else:
            # Prune units in this layer
            weights_with_biases = tf.concat([new_W, tf.reshape(self.b.value(), (1, -1))], axis=0)
            neurons_are_active = tf.math.reduce_max(tf.abs(weights_with_biases), axis=0) >= threshold
            active_output_neurons_indices = tf.reshape(tf.where(neurons_are_active), (-1,))
            
            new_W = tf.gather(new_W, active_output_neurons_indices, axis=1)
            new_b = tf.gather(self.b.value(), active_output_neurons_indices, axis=0)

            self.b = tf.Variable(name='b', initial_value=new_b, trainable=True)

        self.W = tf.Variable(name='W', initial_value=new_W, trainable=True)

        self.regularizer.prune()
        return active_output_neurons_indices
    
    def grow(self, n_new_input_units, percentage, min_new_units, scaling_factor):
        if n_new_input_units > 0:
            # Add connections to grown units in previous layer
            W_growth = self.W_init(shape=(self.W.shape[0] + n_new_input_units, self.W.shape[1]), dtype=dtype)[-n_new_input_units:, :] * scaling_factor  # TODO is it better to be multiplying here by scaling_factor? It does help with not increasing the max weights of existing neurons when new neurons are added.
            new_W = tf.concat([self.W.value(), W_growth], axis=0)
        else:
            new_W = self.W.value()

        if self.fixed_size:
            n_new_output_units = 0
        else:
            # Grow new units in this layer
            n_new_output_units = max(min_new_units, int(new_W.shape[1] * percentage))
            W_growth = self.W_init(shape=(new_W.shape[0], new_W.shape[1] + n_new_output_units), dtype=dtype)[:, -n_new_output_units:] * scaling_factor
            b_growth = self.b_init(shape=(n_new_output_units,), dtype=dtype)  # TODO for all possible bias initializers to work properly, the whole bias vector should be initialized at once
            new_W = tf.concat([new_W, W_growth], axis=1)
            new_b = tf.concat([self.b.value(), b_growth], axis=0)

            self.b = tf.Variable(name='b', initial_value=new_b, trainable=True)

        self.W = tf.Variable(name='W', initial_value=new_W, trainable=True)

        self.regularizer.grow(n_new_output_units)
        return n_new_output_units
    
    def get_param_string():
        param_string = ""
        weights_with_bias = tf.concat([self.W, tf.reshape(self.b, (1, -1))], axis=0)
        max_parameters = tf.math.reduce_max(tf.abs(weights_with_bias), axis=0).numpy()
        magnitudes = np.floor(np.log10(max_parameters))
        for m in magnitudes:
            if m > 0:
                m = 0
            param_string += str(int(-m))
        return param_string


class Conv2D(CustomLayer):
    def __init__(self, filters, filter_size, activation, strides=(1, 1), 
                 padding='SAME', regularization_penalty=0.01, 
                 regularization_method='weighted_l1', kernel_initializer='glorot_uniform',
                 bias_initializer='zeros', input_shape=None, fixed_size=False):
        super().__init__(input_shape)
    
        self.filters = filters
        self.filter_size = filter_size
        self.activation = activation
        self.strides = strides
        self.padding = padding
        self.regularization_penalty = regularization_penalty
        self.regularization_method = regularization_method
        self.kernel_initializer = kernel_initializer
        self.bias_initializer = bias_initializer
        self.fixed_size = fixed_size
        
        self.A = tf.keras.activations.get(activation)
        self.F_init = tf.keras.initializers.get(kernel_initializer)
        self.b_init = tf.keras.initializers.get(bias_initializer)
        self.regularizer = Regularizer(self.regularization_penalty, self.regularization_method)
    
    def build(self, input_shape):
        input_filters = input_shape[-1]

        self.F = tf.Variable(
            name='F',
            initial_value=self.F_init(
                shape=(self.filter_size[0], self.filter_size[1], input_filters, self.filters), dtype=dtype
            ),
            trainable=True)
        
        self.b = tf.Variable(
            name='b',
            initial_value=self.b_init(shape=(self.filters,), dtype=dtype),
            trainable=True)

        if self.regularization_method is not None:
            self.add_loss(lambda: self.regularizer(tf.concat([tf.reshape(self.F, (-1, self.F.shape[-1])), tf.reshape(self.b, (1, -1))], axis=0)))
    
    def call(self, inputs, training=None):
        y = tf.nn.conv2d(inputs, self.F, strides=self.strides, padding=self.padding)
        y = tf.nn.bias_add(y, self.b)
        y = self.A(y)
        return y
    
    def get_size(self):
        return self.F.shape[-2], self.F.shape[-1]
    
    def prune(self, threshold, active_input_units_indices):
        # Remove connections from pruned units in previous layer
        new_F = tf.gather(self.F.value(), active_input_units_indices, axis=-2)

        if self.fixed_size:
            active_output_filters_indices = list(range(new_F.shape[-1]))
        else:
            # Prune units in this layer
            F_reduced_max = tf.reshape(tf.math.reduce_max(tf.abs(new_F), axis=(0, 1, 2)), (1, -1))
            F_reduced_max_with_biases = tf.concat([F_reduced_max, tf.reshape(self.b.value(), (1, -1))], axis=0)
            filters_are_active = tf.math.reduce_max(tf.abs(F_reduced_max_with_biases), axis=0) >= threshold
            active_output_filters_indices = tf.reshape(tf.where(filters_are_active), (-1,))
            
            new_F = tf.gather(new_F, active_output_filters_indices, axis=-1)
            new_b = tf.gather(self.b.value(), active_output_filters_indices, axis=0)

            self.b = tf.Variable(name='b', initial_value=new_b, trainable=True)

        self.F = tf.Variable(name='F', initial_value=new_F, trainable=True)

        self.regularizer.prune()
        return active_output_filters_indices

    def grow(self, n_new_input_units, percentage, min_new_units, scaling_factor):
        if n_new_input_units > 0:
            # Add connections to grown units in previous layer
            F_growth = self.F_init(shape=(self.F.shape[0], self.F.shape[1], self.F.shape[2] + n_new_input_units, self.F.shape[3]), dtype=dtype)[:, :, -n_new_input_units:, :] * scaling_factor  # TODO is it better to be multiplying here by scaling_factor? It does help with not increasing the max weights of existing neurons when new neurons are added.
            new_F = tf.concat([self.F.value(), F_growth], axis=-2)
        else:
            new_F = self.F.value()

        if self.fixed_size:
            n_new_output_units = 0
        else:
            # Grow new units in this layer
            n_new_output_units = max(min_new_units, int(new_F.shape[-1] * percentage))
            F_growth = self.F_init(shape=(new_F.shape[0], new_F.shape[1], new_F.shape[2], new_F.shape[3] + n_new_output_units), dtype=dtype)[:, :, :, -n_new_output_units:] * scaling_factor
            b_growth = self.b_init(shape=(n_new_output_units,), dtype=dtype)  # TODO for all possible bias initializers to work properly, the whole bias vector should be initialized at once
            new_F = tf.concat([new_F, F_growth], axis=-1)
            new_b = tf.concat([self.b.value(), b_growth], axis=0)

            self.b = tf.Variable(name='b', initial_value=new_b, trainable=True)

        self.F = tf.Variable(name='F', initial_value=new_F, trainable=True)

        self.regularizer.grow(n_new_output_units)
        return n_new_output_units

    def get_param_string():
        param_string = ""
        # TODO
        return param_string


class Flatten(tf.keras.Model):
    def call(self, inputs, training=None):
        return tf.reshape(tf.transpose(inputs, perm=[0, 3, 1, 2]), (inputs.shape[0], -1))


################################################################################
# MODELS
################################################################################


class Sequential(tf.keras.Model):
    def __init__(self, layers, activation=None):
        super().__init__()
        
        self.lrs = layers
        
    def call(self, inputs, training=None):
        x = inputs
        for layer in self.lrs:
            x = layer(x, training=training)
        return x
    
    def get_layer_input_shape(self, target_layer):
        if target_layer.inpt_shp is not None:
            return target_layer.inpt_shp

        input = np.random.normal(size=(1,) + self.lrs[0].inpt_shp)
        for layer in self.lrs:
            if layer is target_layer:
                return tuple(input.shape[1:])
            input = layer(input)
        raise Exception("Layer not found in the model.")

    def get_layer_output_shape(self, target_layer):
        input = np.random.normal(size=(1,) + self.lrs[0].inpt_shp)
        for layer in self.lrs:
            output = layer(input)
            if layer is target_layer:
                return tuple(output.shape[1:])
            input = output
        raise Exception("Layer not found in the model.")
    
    def get_layer_sizes(self):
        """
        Returns the sizes of all layers in the model, including the input and output layer.
        """
        layer_sizes = list()
        first_layer = True
        for l in range(len(self.lrs)):
            layer = self.lrs[l]
            if isinstance(layer, CustomLayer):
                layer_size = layer.get_size()
                if first_layer:
                    layer_sizes.append(layer_size[0])
                    first_layer = False
                layer_sizes.append(layer_size[1])
        return layer_sizes
    
    def get_hidden_layer_sizes(self):
        return self.get_layer_sizes()[1:-1]
    
    def remove_regularization(self):
        self.set_regularization_penalty(0.)
    
    def get_regularization_penalty(self):
        #TODO improve
        return self.lrs[-2].regularizer.regularization_penalty
    
    def set_regularization_penalty(self, regularization_penalty):
        for layer in self.lrs:
            if isinstance(layer, CustomLayer) and not layer.fixed_size:
                layer.regularizer.regularization_penalty = regularization_penalty
    
    def prune(self, params):
        input_shape = self.get_layer_input_shape(self.lrs[0])
        n_input_units = input_shape[-1]
        active_units_indices = list(range(n_input_units))

        last_custom_layer = None
        for layer in self.lrs:
            if isinstance(layer, CustomLayer):
                if last_custom_layer is not None and type(last_custom_layer) != type(layer):
                    if type(last_custom_layer) == Conv2D and type(layer) == Dense:
                        convolutional_shape = self.get_layer_output_shape(last_custom_layer)
                        active_units_indices = self.convert_channel_indices_to_flattened_indices(active_units_indices, convolutional_shape)
                    else:
                        raise Exception("Incorrect order of custom layer types.")
                active_units_indices = layer.prune(params.pruning_threshold, active_units_indices)
                last_custom_layer = layer
    
    def grow(self, params):   
        n_new_units = 0

        last_custom_layer = None
        for layer in self.lrs:
            if isinstance(layer, CustomLayer):
                if last_custom_layer is not None and type(last_custom_layer) != type(layer):
                    if type(last_custom_layer) == Conv2D and type(layer) == Dense:
                        convolutional_shape = self.get_layer_output_shape(last_custom_layer)
                        n_new_units = n_new_units * convolutional_shape[0] * convolutional_shape[1]
                    else:
                        raise Exception("Incorrect order of custom layer types.")
                n_new_units = layer.grow(n_new_units, params.growth_percentage, min_new_units=params.min_new_neurons, scaling_factor=params.pruning_threshold)
                last_custom_layer = layer
    
    @staticmethod
    def convert_channel_indices_to_flattened_indices(channel_indices, convolutional_shape):
        dense_indices = list()
        units_per_channel = convolutional_shape[0] * convolutional_shape[1]
        for channel_index in channel_indices:
            for iter in range(units_per_channel):
                dense_indices.append(channel_index * units_per_channel + iter)
        return dense_indices
    
    def print_neurons(self):
        for layer in self.lrs[:-1]:
            print(layer.get_param_string())
    
    def evaluate(self, params, summed_training_loss, summed_training_accuracy):
        # Calculate training loss and accuracy
        if summed_training_loss is not None:
            loss = summed_training_loss / params.x.shape[0]
        else:
            loss = None
        
        if summed_training_accuracy is not None:
            accuracy = summed_training_accuracy / params.x.shape[0]
        else:
            accuracy = None
        
        # Calculate val loss and accuracy
        summed_val_loss = 0
        summed_val_accuracy = 0
        n_val_instances = 0
        
        for step, (x_batch, y_batch) in enumerate(params.val_dataset):
            y_pred = self(x_batch, training=False)
            summed_val_loss += tf.reduce_sum(tf.keras.losses.sparse_categorical_crossentropy(y_batch, y_pred))
            summed_val_accuracy += float(tf.reduce_sum(tf.keras.metrics.sparse_categorical_accuracy(y_batch, y_pred)))
            n_val_instances += x_batch.shape[0]
        
        val_loss = summed_val_loss / n_val_instances
        val_accuracy = summed_val_accuracy / n_val_instances

        return loss, accuracy, val_loss, val_accuracy
    
    def print_epoch_statistics(self, params, summed_training_loss, summed_training_accuracy, message=None, require_result=False):
        if not params.verbose:
            if require_result:
                return self.evaluate(params, summed_training_loss, summed_training_accuracy)
            else:
                return
        
        loss, accuracy, val_loss, val_accuracy = self.evaluate(params, summed_training_loss, summed_training_accuracy)  

        if message is not None:
            print(message)
        
        print(f"loss: {loss} - accuracy: {accuracy} - val_loss: {val_loss} - val_accuracy: {val_accuracy} - penalty: {self.get_regularization_penalty()}")
        hidden_layer_sizes = self.get_hidden_layer_sizes()
        print(f"hidden layer sizes: {hidden_layer_sizes}, total units: {sum(hidden_layer_sizes)}")
        if params.print_neurons:
            self.print_neurons()
        
        if require_result:
            return loss, accuracy, val_loss, val_accuracy
    
    def update_history(self, params, loss, accuracy, val_loss, val_accuracy):
        params.history['loss'].append(loss)
        params.history['accuracy'].append(accuracy)
        params.history['val_loss'].append(val_loss)
        params.history['val_accuracy'].append(val_accuracy)
        params.history['hidden_layer_sizes'].append(self.get_hidden_layer_sizes())
    
    @staticmethod
    def prepare_datasets(x, y, batch_size, validation_data):
        train_dataset = tf.data.Dataset.from_tensor_slices((x, y))
        train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size)
        val_dataset = tf.data.Dataset.from_tensor_slices(validation_data).batch(batch_size)
        return train_dataset, val_dataset
    
    def manage_dynamic_regularization(self, params, val_loss):
        if val_loss >= params.best_conditional_val_loss * params.stall_coefficient:
            # Training is currently in stall
            if not params.training_stalled:
                penalty = self.get_regularization_penalty() * params.regularization_penalty_multiplier
                print("Changing penalty...")
                # TODO this must be modified, penalty can differ for each layer
                self.set_regularization_penalty(penalty)
                params.training_stalled = True
        else:
            params.best_conditional_val_loss = val_loss
            params.training_stalled = False
    
    def grow_wrapper(self, params):
        dynamic_reqularization_active = params.regularization_penalty_multiplier != 1.
        if dynamic_reqularization_active:
            loss, accuracy, val_loss, val_accuracy = self.print_epoch_statistics(params, None, None, "Before growing:", require_result=True)
            self.manage_dynamic_regularization(params, val_loss)
        else:
            self.print_epoch_statistics(params, None, None, "Before growing:")

        self.grow(params)
        self.print_epoch_statistics(params, None, None, "After growing:")
    
    def prune_wrapper(self, params, summed_loss, summed_accuracy):
        loss, accuracy, _, _ = self.print_epoch_statistics(params, summed_loss, summed_accuracy, "Before pruning:", require_result=True)
        self.prune(params)
        _, _, val_loss, val_accuracy = self.print_epoch_statistics(params, None, None, "After pruning:", require_result=True)

        self.update_history(params, loss, accuracy, val_loss, val_accuracy)
    
    class ParameterContainer:
        def __init__(self, x, y, optimizer, epochs, self_scaling_epochs, batch_size, min_new_neurons, validation_data, pruning_threshold, 
                regularization_penalty_multiplier, stall_coefficient, growth_percentage, mini_epochs_per_epoch, verbose, print_neurons):
            self.x = x
            self.y = y
            self.optimizer = optimizer
            self.epochs = epochs
            self.self_scaling_epochs = self_scaling_epochs
            self.batch_size = batch_size
            self.min_new_neurons = min_new_neurons
            self.validation_data = validation_data
            self.pruning_threshold = pruning_threshold
            self.regularization_penalty_multiplier = regularization_penalty_multiplier
            self.stall_coefficient = stall_coefficient
            self.growth_percentage = growth_percentage
            self.mini_epochs_per_epoch = mini_epochs_per_epoch
            self.verbose = verbose
            self.print_neurons = print_neurons

            self.train_dataset, self.val_dataset = Sequential.prepare_datasets(x, y, batch_size, validation_data)
            self.history = self.prepare_history()

            self.best_conditional_val_loss = np.inf
            self.training_stalled = False
        
        @staticmethod
        def prepare_history():
            history = {
                'loss': list(),
                'accuracy': list(),
                'val_loss': list(),
                'val_accuracy': list(),
                'hidden_layer_sizes': list(),
            }
            return history
    
    def fit_single_epoch(self, params):       
        summed_loss = 0
        summed_accuracy = 0
        
        for mini_epoch in range(params.mini_epochs_per_epoch):
            summed_loss = 0
            summed_accuracy = 0

            for step, (x_batch, y_batch) in enumerate(params.train_dataset):
                with tf.GradientTape() as tape:
                    y_pred = self(x_batch, training=True)
                    raw_loss = tf.keras.losses.sparse_categorical_crossentropy(y_batch, y_pred)
                    loss_value = tf.reduce_mean(raw_loss)
                    loss_value += sum(self.losses)  # Add losses registered by model.add_loss

                    summed_loss += tf.reduce_sum(raw_loss)
                    summed_accuracy += float(tf.reduce_sum(tf.keras.metrics.sparse_categorical_accuracy(y_batch, y_pred)))

                grads = tape.gradient(loss_value, self.trainable_variables)
                params.optimizer.apply_gradients(zip(grads, self.trainable_variables))
        
        return summed_loss, summed_accuracy

    def fit(self, x, y, optimizer, epochs, self_scaling_epochs, batch_size, min_new_neurons, validation_data, pruning_threshold=0.001, 
            regularization_penalty_multiplier=1., stall_coefficient=1, growth_percentage=0.2, mini_epochs_per_epoch=1, pruning_only_epochs=0, verbose=True, print_neurons=False):
        params = self.ParameterContainer(x, y, optimizer, epochs, self_scaling_epochs, batch_size, min_new_neurons, 
                                         validation_data, pruning_threshold, regularization_penalty_multiplier, stall_coefficient, 
                                         growth_percentage, mini_epochs_per_epoch, verbose, print_neurons)
        self.build(x.shape)  # Necessary when verbose == False

        for epoch in range(epochs):
            if verbose:
                print("##########################################################")
                print(f"Epoch {epoch + 1}/{epochs}")

            if epoch < self_scaling_epochs - pruning_only_epochs:
                self.grow_wrapper(params)
            
            if epoch == self_scaling_epochs:
                self.remove_regularization()

            summed_loss, summed_accuracy = self.fit_single_epoch(params)
            
            if epoch < self_scaling_epochs:
                self.prune_wrapper(params, summed_loss, summed_accuracy)
            else:
                loss, accuracy, val_loss, val_accuracy = self.print_epoch_statistics(params, summed_loss, summed_accuracy, require_result=True)
                self.update_history(params, loss, accuracy, val_loss, val_accuracy)

        return params.history


################################################################################
# HELPER FUNCTIONS
################################################################################


def get_statistics_from_history(history):
    best_epoch_number = np.argmax(history['val_accuracy'])
    best_val_accuracy = history['val_accuracy'][best_epoch_number]
    best_hidden_layer_sizes = history['hidden_layer_sizes'][best_epoch_number]
    return best_val_accuracy, best_hidden_layer_sizes


def get_statistics_from_histories(histories):
    best_val_accuracies = list()
    all_best_hidden_layer_sizes = list()

    for history in histories:
        best_val_accuracy, best_hidden_layer_sizes = get_statistics_from_history(history)
        best_val_accuracies.append(best_val_accuracy)
        all_best_hidden_layer_sizes.append(best_hidden_layer_sizes)
    
    mean_best_val_accuracy = np.mean(best_val_accuracies)
    mean_best_hidden_layer_sizes = [np.mean(layer) for layer in list(zip(*all_best_hidden_layer_sizes))]
    
    return mean_best_val_accuracy, mean_best_hidden_layer_sizes


def cross_validate(train_fn, x, y, n_splits, random_state=42, *args, **kwargs):
    from sklearn.model_selection import KFold

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    histories = list()
    for i, (train_index, test_index) in enumerate(kf.split(x)):
        xtrain, xtest = x[train_index], x[test_index]
        ytrain, ytest = y[train_index], y[test_index]

        history = train_fn(*args, **kwargs)
        histories.append(history)

        best_val_accuracy, best_hidden_layer_sizes = get_statistics_from_history(history)
        print(f"Run {i} completed, best_val_accuracy: {best_val_accuracy}, best_hidden_layer_sizes: {best_hidden_layer_sizes}")

    mean_best_val_accuracy, mean_best_hidden_layer_sizes = get_statistics_from_histories(histories)
    print(f'mean_best_val_accuracy: {mean_best_val_accuracy}')
    print(f'mean_best_hidden_layer_sizes: {mean_best_hidden_layer_sizes}')

    return histories


def hyperparameter_search(train_fn, *args, **kwargs):
    from itertools import product

    all_params = [*args] + list(kwargs.values())
    histories = list()

    best_overall_val_accuracy = -np.inf
    best_overall_combination = None

    for combination in product(*all_params):
        combination_args = combination[:len(args)]

        combination_kwargs_values = combination[len(args):]
        combination_kwargs = dict(zip(kwargs.keys(), combination_kwargs_values))

        history = train_fn(*combination_args, **combination_kwargs)
        history['parameters'] = combination
        histories.append(history)

        best_val_accuracy, best_hidden_layer_sizes = get_statistics_from_history(history)
        print(f"Run with parameters {combination} completed, best_val_accuracy: {best_val_accuracy}, best_hidden_layer_sizes sizes: {best_hidden_layer_sizes}")

        if best_val_accuracy > best_overall_val_accuracy:
            best_overall_val_accuracy = best_val_accuracy
            best_overall_combination = combination
    
    print(f'Best overall combination: {best_overall_combination}, val_accuracy: {best_overall_val_accuracy}')

    return histories

# Accuracy benchmark - FF and convolutions

## CIFAR10

In [15]:
def get_convolutional_model(regularization_penalty, regularization_method, layer_sizes, output_neurons=10):
    model = Sequential([
        Conv2D(layer_sizes[0], filter_size=(3, 3), activation='selu', strides=(1, 1), padding='SAME', 
            regularization_penalty=regularization_penalty, regularization_method=regularization_method, 
            kernel_initializer='lecun_normal', input_shape=X_train_norm[0,:,:,:].shape),
        Conv2D(layer_sizes[1], filter_size=(3, 3), activation='selu', strides=(2, 2), padding='SAME', 
            regularization_penalty=regularization_penalty, regularization_method=regularization_method, 
            kernel_initializer='lecun_normal'),
        tf.keras.layers.Dropout(0.2),
        Conv2D(layer_sizes[2], filter_size=(3, 3), activation='selu', strides=(1, 1), padding='SAME', 
            regularization_penalty=regularization_penalty, regularization_method=regularization_method, 
            kernel_initializer='lecun_normal'),
        Conv2D(layer_sizes[3], filter_size=(3, 3), activation='selu', strides=(2, 2), padding='SAME', 
            regularization_penalty=regularization_penalty, regularization_method=regularization_method, 
            kernel_initializer='lecun_normal'),
        tf.keras.layers.Dropout(0.5),
        Flatten(),
        Dense(layer_sizes[4], activation='selu', regularization_penalty=regularization_penalty, 
            regularization_method=regularization_method, kernel_initializer='lecun_normal'),
        Dense(output_neurons, activation='softmax', regularization_penalty=0., 
            regularization_method=None, fixed_size=True),
    ])
    return model


def train_fn(learning_rate, regularization_penalty, regularization_method, self_scaling_epochs, layer_sizes, output_neurons=10, 
             epochs=40, pruning_only_epochs=0, min_new_neurons=20, growth_percentage=0.2, verbose=False):
    batch_size = 128

    model = get_convolutional_model(regularization_penalty, regularization_method, layer_sizes, output_neurons)
    
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    history = model.fit(X_train_norm, y_train, optimizer, epochs, self_scaling_epochs, batch_size, 
                        min_new_neurons, validation_data=(X_test_norm, y_test), pruning_only_epochs=pruning_only_epochs, 
                        growth_percentage=growth_percentage, verbose=verbose)
    
    return history

In [11]:
batch_size = 128

model = get_convolutional_model(regularization_penalty=0.00002, regularization_method='weighted_l1', layer_sizes=[100, 100, 100, 100, 100], output_neurons=10)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.0006)

history = model.fit(X_train_norm, y_train, optimizer, epochs=20, self_scaling_epochs=20, batch_size=batch_size, 
                    min_new_neurons=20, validation_data=(X_test_norm, y_test), pruning_only_epochs=0, 
                    growth_percentage=0.2, verbose=True)

##########################################################
Epoch 1/20
Before growing:
loss: None - accuracy: None - val_loss: 3.0063858032226562 - val_accuracy: 0.0844 - penalty: 2e-05
hidden layer sizes: [100, 100, 100, 100, 100], total units: 500
After growing:
loss: None - accuracy: None - val_loss: 3.006385326385498 - val_accuracy: 0.0844 - penalty: 2e-05
hidden layer sizes: [120, 120, 120, 120, 120], total units: 600
Before pruning:
loss: 1.762871265411377 - accuracy: 0.39802 - val_loss: 1.4641473293304443 - val_accuracy: 0.4651 - penalty: 2e-05
hidden layer sizes: [120, 120, 120, 120, 120], total units: 600
After pruning:
loss: None - accuracy: None - val_loss: 1.4639816284179688 - val_accuracy: 0.4651 - penalty: 2e-05
hidden layer sizes: [100, 100, 100, 100, 112], total units: 512
##########################################################
Epoch 2/20
Before growing:
loss: None - accuracy: None - val_loss: 1.4639816284179688 - val_accuracy: 0.4651 - penalty: 2e-05
hidden layer siz

In [26]:
batch_size = 128

model = get_convolutional_model(regularization_penalty=0.00002, regularization_method='weighted_l1_reordered', layer_sizes=[100, 100, 100, 100, 100], output_neurons=10)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.0006)

history = model.fit(X_train_norm, y_train, optimizer, epochs=20, self_scaling_epochs=20, batch_size=batch_size, 
                    min_new_neurons=20, validation_data=(X_test_norm, y_test), pruning_only_epochs=0, 
                    growth_percentage=0.2, verbose=True)

##########################################################
Epoch 1/20
Before growing:
loss: None - accuracy: None - val_loss: 2.8939507007598877 - val_accuracy: 0.1005 - penalty: 2e-05
hidden layer sizes: [100, 100, 100, 100, 100], total units: 500
After growing:
loss: None - accuracy: None - val_loss: 2.8939504623413086 - val_accuracy: 0.1005 - penalty: 2e-05
hidden layer sizes: [120, 120, 120, 120, 120], total units: 600
Before pruning:
loss: 1.7473422288894653 - accuracy: 0.39658 - val_loss: 1.4361164569854736 - val_accuracy: 0.4886 - penalty: 2e-05
hidden layer sizes: [120, 120, 120, 120, 120], total units: 600
After pruning:
loss: None - accuracy: None - val_loss: 1.436206579208374 - val_accuracy: 0.4881 - penalty: 2e-05
hidden layer sizes: [100, 100, 100, 100, 106], total units: 506
##########################################################
Epoch 2/20
Before growing:
loss: None - accuracy: None - val_loss: 1.436206579208374 - val_accuracy: 0.4881 - penalty: 2e-05
hidden layer siz

In [None]:
%%time

histories = cross_validate(train_fn, X_norm, y, n_splits=6, learning_rate=0.0006,
                           regularization_penalty=0.00002, regularization_method='weighted_l1_reordered',
                           self_scaling_epochs=20, layer_sizes=[100, 100, 100, 100, 100])

In [None]:
%%time

histories = hyperparameter_search(train_fn, learning_rate=[0.00005, 0.0002, 0.0006], 
                                  regularization_penalty=[0.00002], regularization_method=['weighted_l1'], 
                                  self_scaling_epochs=[20], layer_sizes=[[100, 100, 100, 100, 100]])

Run with parameters (5e-05, 2e-05, 'weighted_l1', 20, [100, 100, 100, 100, 100]) completed, best_val_accuracy: 0.7074, final_hidden_layer sizes: [100, 46, 47, 53, 71]
Run with parameters (0.0002, 2e-05, 'weighted_l1', 20, [100, 100, 100, 100, 100]) completed, best_val_accuracy: 0.7496, final_hidden_layer sizes: [50, 25, 23, 60, 86]
Run with parameters (0.0006, 2e-05, 'weighted_l1', 20, [100, 100, 100, 100, 100]) completed, best_val_accuracy: 0.7554, final_hidden_layer sizes: [36, 15, 30, 81, 266]
Best overall combination: (0.0006, 2e-05, 'weighted_l1', 20, [100, 100, 100, 100, 100]), val_accuracy: 0.7554
CPU times: user 27min 12s, sys: 31.1 s, total: 27min 43s
Wall time: 27min 44s


In [None]:
%%time

histories = cross_validate(train_fn, X_norm, y, n_splits=6, learning_rate=0.0006,
                           regularization_penalty=0.00002, regularization_method='weighted_l1',
                           self_scaling_epochs=20, layer_sizes=[100, 100, 100, 100, 100])

Run 0 completed, best_val_accuracy: 0.7661, final_hidden_layer sizes: [36, 16, 42, 81, 275]
Run 1 completed, best_val_accuracy: 0.7548, final_hidden_layer sizes: [32, 17, 25, 93, 244]
Run 2 completed, best_val_accuracy: 0.7611, final_hidden_layer sizes: [29, 17, 32, 80, 244]
Run 3 completed, best_val_accuracy: 0.7671, final_hidden_layer sizes: [32, 17, 28, 61, 332]
Run 4 completed, best_val_accuracy: 0.7603, final_hidden_layer sizes: [31, 18, 38, 72, 294]
Run 5 completed, best_val_accuracy: 0.7689, final_hidden_layer sizes: [34, 16, 37, 71, 316]
mean_best_val_accuracy: 0.7630500000000001
mean_final_hidden_layer_sizes: [32.333333333333336, 16.833333333333332, 33.666666666666664, 76.33333333333333, 284.1666666666667]
CPU times: user 54min 56s, sys: 1min 3s, total: 56min
Wall time: 55min 15s


In [None]:
histories = hyperparameter_search(train_fn, learning_rate=[0.0005, 0.002, 0.006], 
                                  regularization_penalty=[0.], regularization_method=[None], 
                                  self_scaling_epochs=[0], layer_sizes=[[32, 17, 34, 76, 284]])

Run with parameters (0.0005, 0.0, None, 0, [32, 17, 34, 76, 284]) completed, best_val_accuracy: 0.6696, final_hidden_layer sizes: [32, 17, 34, 76, 284]
Run with parameters (0.002, 0.0, None, 0, [32, 17, 34, 76, 284]) completed, best_val_accuracy: 0.7203, final_hidden_layer sizes: [32, 17, 34, 76, 284]
Run with parameters (0.006, 0.0, None, 0, [32, 17, 34, 76, 284]) completed, best_val_accuracy: 0.5657, final_hidden_layer sizes: [32, 17, 34, 76, 284]
Best overall combination: (0.002, 0.0, None, 0, [32, 17, 34, 76, 284]), val_accuracy: 0.7203


In [None]:
histories = cross_validate(train_fn, X_norm, y, n_splits=6, learning_rate=0.002,
                           regularization_penalty=0., regularization_method=None,
                           self_scaling_epochs=0, layer_sizes=[32, 17, 34, 76, 284])

Run 0 completed, best_val_accuracy: 0.7183, final_hidden_layer sizes: [32, 17, 34, 76, 284]
Run 1 completed, best_val_accuracy: 0.7177, final_hidden_layer sizes: [32, 17, 34, 76, 284]
Run 2 completed, best_val_accuracy: 0.7154, final_hidden_layer sizes: [32, 17, 34, 76, 284]
Run 3 completed, best_val_accuracy: 0.7285, final_hidden_layer sizes: [32, 17, 34, 76, 284]
Run 4 completed, best_val_accuracy: 0.7061, final_hidden_layer sizes: [32, 17, 34, 76, 284]
Run 5 completed, best_val_accuracy: 0.7166, final_hidden_layer sizes: [32, 17, 34, 76, 284]
mean_best_val_accuracy: 0.7170999999999998
mean_final_hidden_layer_sizes: [32.0, 17.0, 34.0, 76.0, 284.0]


In [None]:
model = get_convolutional_model(regularization_penalty=0., regularization_method=None, layer_sizes=[32, 17, 34, 76, 284])
model.build(X_norm.shape)
model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_36 (Conv2D)           multiple                  896       
_________________________________________________________________
conv2d_37 (Conv2D)           multiple                  4913      
_________________________________________________________________
dropout_18 (Dropout)         multiple                  0         
_________________________________________________________________
conv2d_38 (Conv2D)           multiple                  5236      
_________________________________________________________________
conv2d_39 (Conv2D)           multiple                  23332     
_________________________________________________________________
dropout_19 (Dropout)         multiple                  0         
_________________________________________________________________
flatten_9 (Flatten)          multiple                 

In [None]:
model = get_convolutional_model(regularization_penalty=0., regularization_method=None, layer_sizes=[71, 71, 71, 71, 284])
model.build(X_norm.shape)
model.summary()

Model: "sequential_17"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_68 (Conv2D)           multiple                  1988      
_________________________________________________________________
conv2d_69 (Conv2D)           multiple                  45440     
_________________________________________________________________
dropout_34 (Dropout)         multiple                  0         
_________________________________________________________________
conv2d_70 (Conv2D)           multiple                  45440     
_________________________________________________________________
conv2d_71 (Conv2D)           multiple                  45440     
_________________________________________________________________
dropout_35 (Dropout)         multiple                  0         
_________________________________________________________________
flatten_17 (Flatten)         multiple                

In [None]:
%%time

histories = hyperparameter_search(train_fn, learning_rate=[0.0005, 0.002, 0.006], 
                                  regularization_penalty=[0.], regularization_method=[None], 
                                  self_scaling_epochs=[0], layer_sizes=[[71, 71, 71, 71, 284]])

Run with parameters (0.0005, 0.0, None, 0, [71, 71, 71, 71, 284]) completed, best_val_accuracy: 0.7377, final_hidden_layer sizes: [71, 71, 71, 71, 284]
Run with parameters (0.002, 0.0, None, 0, [71, 71, 71, 71, 284]) completed, best_val_accuracy: 0.7363, final_hidden_layer sizes: [71, 71, 71, 71, 284]
Run with parameters (0.006, 0.0, None, 0, [71, 71, 71, 71, 284]) completed, best_val_accuracy: 0.1, final_hidden_layer sizes: [71, 71, 71, 71, 284]
Best overall combination: (0.0005, 0.0, None, 0, [71, 71, 71, 71, 284]), val_accuracy: 0.7377
CPU times: user 11min 50s, sys: 13 s, total: 12min 3s
Wall time: 13min 22s


In [None]:
%%time

histories = cross_validate(train_fn, X_norm, y, n_splits=6, learning_rate=0.0005,
                           regularization_penalty=0., regularization_method=None,
                           self_scaling_epochs=0, layer_sizes=[71, 71, 71, 71, 284])

Run 0 completed, best_val_accuracy: 0.7418, final_hidden_layer sizes: [71, 71, 71, 71, 284]
Run 1 completed, best_val_accuracy: 0.7336, final_hidden_layer sizes: [71, 71, 71, 71, 284]
Run 2 completed, best_val_accuracy: 0.7415, final_hidden_layer sizes: [71, 71, 71, 71, 284]
Run 3 completed, best_val_accuracy: 0.7389, final_hidden_layer sizes: [71, 71, 71, 71, 284]
Run 4 completed, best_val_accuracy: 0.7345, final_hidden_layer sizes: [71, 71, 71, 71, 284]
Run 5 completed, best_val_accuracy: 0.727, final_hidden_layer sizes: [71, 71, 71, 71, 284]
mean_best_val_accuracy: 0.7362166666666666
mean_final_hidden_layer_sizes: [71.0, 71.0, 71.0, 71.0, 284.0]
CPU times: user 23min 38s, sys: 30.1 s, total: 24min 8s
Wall time: 26min 44s


In [None]:
fashion_mnist = tf.keras.datasets.mnist
(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()

X_train = X_train.astype(dtype) / 255.0
y_train = y_train.astype(dtype)
X_test = X_test.astype(dtype)  / 255.0
y_test = y_test.astype(dtype)

X_train = np.reshape(X_train, (-1, 1))
X_test = np.reshape(X_test, (-1, 1))

X = np.concatenate((X_train, X_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)  # Scaling all features together

X_norm = scaler.transform(X)
X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)

X_norm = np.reshape(X_norm, (-1, 28, 28))
X_train_norm = np.reshape(X_train_norm, (-1, 28, 28))
X_test_norm = np.reshape(X_test_norm, (-1, 28, 28))

## Fashnion MNIST

In [None]:
fashion_mnist = tf.keras.datasets.fashion_mnist
(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()

X_train = X_train.astype(dtype) / 255.0
y_train = y_train.astype(dtype)
X_test = X_test.astype(dtype)  / 255.0
y_test = y_test.astype(dtype)

X_train = np.reshape(X_train, (-1, 1))
X_test = np.reshape(X_test, (-1, 1))

X = np.concatenate((X_train, X_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)  # Scaling all features together

X_norm = scaler.transform(X)
X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)

X_norm = np.reshape(X_norm, (-1, 28, 28, 1))
X_train_norm = np.reshape(X_train_norm, (-1, 28, 28, 1))
X_test_norm = np.reshape(X_test_norm, (-1, 28, 28, 1))

In [None]:
%%time

histories = hyperparameter_search(train_fn, learning_rate=[0.00005, 0.0002, 0.0006], 
                                  regularization_penalty=[0.00002], regularization_method=['weighted_l1'], 
                                  self_scaling_epochs=[20], layer_sizes=[[100, 100, 100, 100, 100]])

Run with parameters (5e-05, 2e-05, 'weighted_l1', 20, [100, 100, 100, 100, 100]) completed, best_val_accuracy: 0.9158, final_hidden_layer sizes: [78, 21, 26, 43, 76]
Run with parameters (0.0002, 2e-05, 'weighted_l1', 20, [100, 100, 100, 100, 100]) completed, best_val_accuracy: 0.9232, final_hidden_layer sizes: [42, 17, 16, 32, 46]
Run with parameters (0.0006, 2e-05, 'weighted_l1', 20, [100, 100, 100, 100, 100]) completed, best_val_accuracy: 0.9306, final_hidden_layer sizes: [20, 13, 26, 51, 197]
Best overall combination: (0.0006, 2e-05, 'weighted_l1', 20, [100, 100, 100, 100, 100]), val_accuracy: 0.9306
CPU times: user 18min 26s, sys: 16.4 s, total: 18min 42s
Wall time: 18min 57s


In [None]:
%%time

histories = cross_validate(train_fn, X_norm, y, n_splits=6, learning_rate=0.0006,
                           regularization_penalty=0.00002, regularization_method='weighted_l1',
                           self_scaling_epochs=20, layer_sizes=[100, 100, 100, 100, 100])

Run 0 completed, best_val_accuracy: 0.9288, final_hidden_layer sizes: [17, 15, 22, 43, 168]
Run 1 completed, best_val_accuracy: 0.9284, final_hidden_layer sizes: [19, 14, 20, 45, 196]
Run 2 completed, best_val_accuracy: 0.9313, final_hidden_layer sizes: [20, 14, 24, 64, 198]
Run 3 completed, best_val_accuracy: 0.9292, final_hidden_layer sizes: [28, 12, 25, 41, 181]
Run 4 completed, best_val_accuracy: 0.9313, final_hidden_layer sizes: [23, 14, 27, 42, 172]
Run 5 completed, best_val_accuracy: 0.9314, final_hidden_layer sizes: [23, 14, 23, 48, 186]
mean_best_val_accuracy: 0.9300666666666667
mean_final_hidden_layer_sizes: [21.666666666666668, 13.833333333333334, 23.5, 47.166666666666664, 183.5]
CPU times: user 36min 47s, sys: 31.1 s, total: 37min 18s
Wall time: 37min 18s


In [None]:
%%time

histories = hyperparameter_search(train_fn, learning_rate=[0.0005, 0.002, 0.006], 
                                  regularization_penalty=[0.], regularization_method=[None], 
                                  self_scaling_epochs=[0], layer_sizes=[[22, 14, 24, 47, 184]])

Run with parameters (0.0005, 0.0, None, 0, [22, 14, 24, 47, 184]) completed, best_val_accuracy: 0.9175, final_hidden_layer sizes: [22, 14, 24, 47, 184]
Run with parameters (0.002, 0.0, None, 0, [22, 14, 24, 47, 184]) completed, best_val_accuracy: 0.9154, final_hidden_layer sizes: [22, 14, 24, 47, 184]
Run with parameters (0.006, 0.0, None, 0, [22, 14, 24, 47, 184]) completed, best_val_accuracy: 0.8663, final_hidden_layer sizes: [22, 14, 24, 47, 184]
Best overall combination: (0.0005, 0.0, None, 0, [22, 14, 24, 47, 184]), val_accuracy: 0.9175
CPU times: user 11min 43s, sys: 9.79 s, total: 11min 53s
Wall time: 11min 33s


In [None]:
histories = cross_validate(train_fn, X_norm, y, n_splits=6, learning_rate=0.0005,
                           regularization_penalty=0., regularization_method=None,
                           self_scaling_epochs=0, layer_sizes=[22, 14, 24, 47, 184])

Run 0 completed, best_val_accuracy: 0.9207, final_hidden_layer sizes: [22, 14, 24, 47, 184]
Run 1 completed, best_val_accuracy: 0.9216, final_hidden_layer sizes: [22, 14, 24, 47, 184]
Run 2 completed, best_val_accuracy: 0.9195, final_hidden_layer sizes: [22, 14, 24, 47, 184]
Run 3 completed, best_val_accuracy: 0.9179, final_hidden_layer sizes: [22, 14, 24, 47, 184]
Run 4 completed, best_val_accuracy: 0.9158, final_hidden_layer sizes: [22, 14, 24, 47, 184]
Run 5 completed, best_val_accuracy: 0.9152, final_hidden_layer sizes: [22, 14, 24, 47, 184]
mean_best_val_accuracy: 0.91845
mean_final_hidden_layer_sizes: [22.0, 14.0, 24.0, 47.0, 184.0]


In [None]:
model = get_convolutional_model(regularization_penalty=0., regularization_method=None, layer_sizes=[22, 14, 24, 47, 184])
model.build(X_norm.shape)
model.summary()

Model: "sequential_51"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_204 (Conv2D)          multiple                  220       
_________________________________________________________________
conv2d_205 (Conv2D)          multiple                  2786      
_________________________________________________________________
dropout_102 (Dropout)        multiple                  0         
_________________________________________________________________
conv2d_206 (Conv2D)          multiple                  3048      
_________________________________________________________________
conv2d_207 (Conv2D)          multiple                  10199     
_________________________________________________________________
dropout_103 (Dropout)        multiple                  0         
_________________________________________________________________
flatten_51 (Flatten)         multiple                

In [None]:
model = get_convolutional_model(regularization_penalty=0., regularization_method=None, layer_sizes=[43, 43, 43, 43, 184])
model.build(X_norm.shape)
model.summary()

Model: "sequential_55"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_220 (Conv2D)          multiple                  430       
_________________________________________________________________
conv2d_221 (Conv2D)          multiple                  16684     
_________________________________________________________________
dropout_110 (Dropout)        multiple                  0         
_________________________________________________________________
conv2d_222 (Conv2D)          multiple                  16684     
_________________________________________________________________
conv2d_223 (Conv2D)          multiple                  16684     
_________________________________________________________________
dropout_111 (Dropout)        multiple                  0         
_________________________________________________________________
flatten_55 (Flatten)         multiple                

In [None]:
%%time

histories = hyperparameter_search(train_fn, learning_rate=[0.0002, 0.0005, 0.002, 0.006], 
                                  regularization_penalty=[0.], regularization_method=[None], 
                                  self_scaling_epochs=[0], layer_sizes=[[43, 43, 43, 43, 184]])

Run with parameters (0.0002, 0.0, None, 0, [43, 43, 43, 43, 184]) completed, best_val_accuracy: 0.9236, final_hidden_layer sizes: [43, 43, 43, 43, 184]
Run with parameters (0.0005, 0.0, None, 0, [43, 43, 43, 43, 184]) completed, best_val_accuracy: 0.9276, final_hidden_layer sizes: [43, 43, 43, 43, 184]
Run with parameters (0.002, 0.0, None, 0, [43, 43, 43, 43, 184]) completed, best_val_accuracy: 0.9145, final_hidden_layer sizes: [43, 43, 43, 43, 184]
Run with parameters (0.006, 0.0, None, 0, [43, 43, 43, 43, 184]) completed, best_val_accuracy: 0.8717, final_hidden_layer sizes: [43, 43, 43, 43, 184]
Best overall combination: (0.0005, 0.0, None, 0, [43, 43, 43, 43, 184]), val_accuracy: 0.9276
CPU times: user 15min 40s, sys: 13.4 s, total: 15min 53s
Wall time: 15min 32s


In [None]:
histories = cross_validate(train_fn, X_norm, y, n_splits=6, learning_rate=0.0005,
                           regularization_penalty=0., regularization_method=None,
                           self_scaling_epochs=0, layer_sizes=[43, 43, 43, 43, 184])

Run 0 completed, best_val_accuracy: 0.9243, final_hidden_layer sizes: [43, 43, 43, 43, 184]
Run 1 completed, best_val_accuracy: 0.9249, final_hidden_layer sizes: [43, 43, 43, 43, 184]
Run 2 completed, best_val_accuracy: 0.9244, final_hidden_layer sizes: [43, 43, 43, 43, 184]
Run 3 completed, best_val_accuracy: 0.9287, final_hidden_layer sizes: [43, 43, 43, 43, 184]
Run 4 completed, best_val_accuracy: 0.9232, final_hidden_layer sizes: [43, 43, 43, 43, 184]
Run 5 completed, best_val_accuracy: 0.9239, final_hidden_layer sizes: [43, 43, 43, 43, 184]
mean_best_val_accuracy: 0.9249
mean_final_hidden_layer_sizes: [43.0, 43.0, 43.0, 43.0, 184.0]


# MNIST

In [None]:
mnist = tf.keras.datasets.mnist
(X_train, y_train), (X_test, y_test) = mnist.load_data()

X_train = X_train.astype(dtype) / 255.0
y_train = y_train.astype(dtype)
X_test = X_test.astype(dtype)  / 255.0
y_test = y_test.astype(dtype)

X_train = np.reshape(X_train, (-1, 1))
X_test = np.reshape(X_test, (-1, 1))

X = np.concatenate((X_train, X_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)  # Scaling all features together

X_norm = scaler.transform(X)
X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)

X_norm = np.reshape(X_norm, (-1, 28, 28, 1))
X_train_norm = np.reshape(X_train_norm, (-1, 28, 28, 1))
X_test_norm = np.reshape(X_test_norm, (-1, 28, 28, 1))

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [None]:
%%time

histories = hyperparameter_search(train_fn, learning_rate=[0.00005, 0.0002, 0.0006], 
                                  regularization_penalty=[0.00002], regularization_method=['weighted_l1'], 
                                  self_scaling_epochs=[20], layer_sizes=[[100, 100, 100, 100, 100]])

Run with parameters (5e-05, 2e-05, 'weighted_l1', 20, [100, 100, 100, 100, 100]) completed, best_val_accuracy: 0.9918, final_hidden_layer sizes: [59, 22, 20, 42, 67]
Run with parameters (0.0002, 2e-05, 'weighted_l1', 20, [100, 100, 100, 100, 100]) completed, best_val_accuracy: 0.9935, final_hidden_layer sizes: [19, 17, 16, 35, 50]
Run with parameters (0.0006, 2e-05, 'weighted_l1', 20, [100, 100, 100, 100, 100]) completed, best_val_accuracy: 0.9933, final_hidden_layer sizes: [17, 14, 17, 41, 156]
Best overall combination: (0.0002, 2e-05, 'weighted_l1', 20, [100, 100, 100, 100, 100]), val_accuracy: 0.9935
CPU times: user 18min 24s, sys: 16.1 s, total: 18min 40s
Wall time: 18min 52s


In [None]:
%%time

histories = cross_validate(train_fn, X_norm, y, n_splits=6, learning_rate=0.0002,
                           regularization_penalty=0.00002, regularization_method='weighted_l1',
                           self_scaling_epochs=20, layer_sizes=[100, 100, 100, 100, 100])

Run 0 completed, best_val_accuracy: 0.9929, final_hidden_layer sizes: [19, 18, 12, 36, 54]
Run 1 completed, best_val_accuracy: 0.9931, final_hidden_layer sizes: [21, 20, 13, 32, 43]
Run 2 completed, best_val_accuracy: 0.9927, final_hidden_layer sizes: [23, 19, 14, 30, 47]
Run 3 completed, best_val_accuracy: 0.9938, final_hidden_layer sizes: [19, 20, 15, 31, 53]
Run 4 completed, best_val_accuracy: 0.9928, final_hidden_layer sizes: [24, 18, 10, 36, 46]
Run 5 completed, best_val_accuracy: 0.9936, final_hidden_layer sizes: [22, 16, 13, 33, 38]
mean_best_val_accuracy: 0.99315
mean_final_hidden_layer_sizes: [21.333333333333332, 18.5, 12.833333333333334, 33.0, 46.833333333333336]
CPU times: user 36min 33s, sys: 32.2 s, total: 37min 5s
Wall time: 37min 13s


In [None]:
%%time

histories = hyperparameter_search(train_fn, learning_rate=[0.0002, 0.0005, 0.002, 0.006], 
                                  regularization_penalty=[0.], regularization_method=[None], 
                                  self_scaling_epochs=[0], layer_sizes=[[21, 19, 13, 33, 47]])

Run with parameters (0.0002, 0.0, None, 0, [21, 19, 13, 33, 47]) completed, best_val_accuracy: 0.9919, final_hidden_layer sizes: [21, 19, 13, 33, 47]
Run with parameters (0.0005, 0.0, None, 0, [21, 19, 13, 33, 47]) completed, best_val_accuracy: 0.9937, final_hidden_layer sizes: [21, 19, 13, 33, 47]
Run with parameters (0.002, 0.0, None, 0, [21, 19, 13, 33, 47]) completed, best_val_accuracy: 0.9913, final_hidden_layer sizes: [21, 19, 13, 33, 47]
Run with parameters (0.006, 0.0, None, 0, [21, 19, 13, 33, 47]) completed, best_val_accuracy: 0.9818, final_hidden_layer sizes: [21, 19, 13, 33, 47]
Best overall combination: (0.0005, 0.0, None, 0, [21, 19, 13, 33, 47]), val_accuracy: 0.9937
CPU times: user 15min 31s, sys: 12.9 s, total: 15min 44s
Wall time: 15min 18s


In [None]:
%%time

histories = cross_validate(train_fn, X_norm, y, n_splits=6, learning_rate=0.0005,
                           regularization_penalty=0., regularization_method=None,
                           self_scaling_epochs=0, layer_sizes=[21, 19, 13, 33, 47])

Run 0 completed, best_val_accuracy: 0.9934, final_hidden_layer sizes: [21, 19, 13, 33, 47]
Run 1 completed, best_val_accuracy: 0.9913, final_hidden_layer sizes: [21, 19, 13, 33, 47]
Run 2 completed, best_val_accuracy: 0.9928, final_hidden_layer sizes: [21, 19, 13, 33, 47]
Run 3 completed, best_val_accuracy: 0.9927, final_hidden_layer sizes: [21, 19, 13, 33, 47]
Run 4 completed, best_val_accuracy: 0.993, final_hidden_layer sizes: [21, 19, 13, 33, 47]
Run 5 completed, best_val_accuracy: 0.993, final_hidden_layer sizes: [21, 19, 13, 33, 47]
mean_best_val_accuracy: 0.9927000000000001
mean_final_hidden_layer_sizes: [21.0, 19.0, 13.0, 33.0, 47.0]
CPU times: user 23min 16s, sys: 21.4 s, total: 23min 38s
Wall time: 23min 16s


In [None]:
model = get_convolutional_model(regularization_penalty=0., regularization_method=None, layer_sizes=[21, 19, 13, 33, 47])
model.build(X_norm.shape)
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_24 (Conv2D)           multiple                  210       
_________________________________________________________________
conv2d_25 (Conv2D)           multiple                  3610      
_________________________________________________________________
dropout_12 (Dropout)         multiple                  0         
_________________________________________________________________
conv2d_26 (Conv2D)           multiple                  2236      
_________________________________________________________________
conv2d_27 (Conv2D)           multiple                  3894      
_________________________________________________________________
dropout_13 (Dropout)         multiple                  0         
_________________________________________________________________
flatten_6 (Flatten)          multiple                 

In [None]:
model = get_convolutional_model(regularization_penalty=0., regularization_method=None, layer_sizes=[28, 28, 28, 28, 47])
model.build(X_norm.shape)
model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_32 (Conv2D)           multiple                  280       
_________________________________________________________________
conv2d_33 (Conv2D)           multiple                  7084      
_________________________________________________________________
dropout_16 (Dropout)         multiple                  0         
_________________________________________________________________
conv2d_34 (Conv2D)           multiple                  7084      
_________________________________________________________________
conv2d_35 (Conv2D)           multiple                  7084      
_________________________________________________________________
dropout_17 (Dropout)         multiple                  0         
_________________________________________________________________
flatten_8 (Flatten)          multiple                 

In [None]:
%%time

histories = hyperparameter_search(train_fn, learning_rate=[0.0002, 0.0005, 0.002, 0.006], 
                                  regularization_penalty=[0.], regularization_method=[None], 
                                  self_scaling_epochs=[0], layer_sizes=[[28, 28, 28, 28, 47]])

Run with parameters (0.0002, 0.0, None, 0, [28, 28, 28, 28, 47]) completed, best_val_accuracy: 0.9925, final_hidden_layer sizes: [28, 28, 28, 28, 47]
Run with parameters (0.0005, 0.0, None, 0, [28, 28, 28, 28, 47]) completed, best_val_accuracy: 0.992, final_hidden_layer sizes: [28, 28, 28, 28, 47]
Run with parameters (0.002, 0.0, None, 0, [28, 28, 28, 28, 47]) completed, best_val_accuracy: 0.9914, final_hidden_layer sizes: [28, 28, 28, 28, 47]
Run with parameters (0.006, 0.0, None, 0, [28, 28, 28, 28, 47]) completed, best_val_accuracy: 0.9812, final_hidden_layer sizes: [28, 28, 28, 28, 47]
Best overall combination: (0.0002, 0.0, None, 0, [28, 28, 28, 28, 47]), val_accuracy: 0.9925
CPU times: user 15min 18s, sys: 13.3 s, total: 15min 32s
Wall time: 15min 6s


In [None]:
%%time

histories = cross_validate(train_fn, X_norm, y, n_splits=6, learning_rate=0.0002,
                           regularization_penalty=0., regularization_method=None,
                           self_scaling_epochs=0, layer_sizes=[28, 28, 28, 28, 47])

Run 0 completed, best_val_accuracy: 0.9925, final_hidden_layer sizes: [28, 28, 28, 28, 47]
Run 1 completed, best_val_accuracy: 0.9929, final_hidden_layer sizes: [28, 28, 28, 28, 47]
Run 2 completed, best_val_accuracy: 0.9921, final_hidden_layer sizes: [28, 28, 28, 28, 47]
Run 3 completed, best_val_accuracy: 0.9928, final_hidden_layer sizes: [28, 28, 28, 28, 47]
Run 4 completed, best_val_accuracy: 0.9928, final_hidden_layer sizes: [28, 28, 28, 28, 47]
Run 5 completed, best_val_accuracy: 0.9925, final_hidden_layer sizes: [28, 28, 28, 28, 47]
mean_best_val_accuracy: 0.9925999999999999
mean_final_hidden_layer_sizes: [28.0, 28.0, 28.0, 28.0, 47.0]
CPU times: user 23min 4s, sys: 20.3 s, total: 23min 24s
Wall time: 22min 46s


## CIFAR100

In [None]:
cifar100 = tf.keras.datasets.cifar100
(X_train, y_train), (X_test, y_test) = cifar100.load_data()

X_train = X_train.astype(dtype) / 255.0
y_train = y_train.astype(dtype)
X_test = X_test.astype(dtype)  / 255.0
y_test = y_test.astype(dtype)

X_train = np.reshape(X_train, (-1, 3072))
X_test = np.reshape(X_test, (-1, 3072))

X = np.concatenate((X_train, X_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)  # Scaling each feature independently

X_norm = scaler.transform(X)
X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)

X_norm = np.reshape(X_norm, (-1, 32, 32, 3))
X_train_norm = np.reshape(X_train_norm, (-1, 32, 32, 3))
X_test_norm = np.reshape(X_test_norm, (-1, 32, 32, 3))

In [None]:
%%time

histories = hyperparameter_search(train_fn, learning_rate=[0.00005, 0.0002, 0.0006], 
                                  regularization_penalty=[0.00002], regularization_method=['weighted_l1'], 
                                  self_scaling_epochs=[20], layer_sizes=[[100, 100, 100, 100, 100]], output_neurons=[100])

Run with parameters (5e-05, 2e-05, 'weighted_l1', 20, [100, 100, 100, 100, 100], 100) completed, best_val_accuracy: 0.4097, final_hidden_layer sizes: [100, 40, 48, 48, 100]
Run with parameters (0.0002, 2e-05, 'weighted_l1', 20, [100, 100, 100, 100, 100], 100) completed, best_val_accuracy: 0.451, final_hidden_layer sizes: [69, 21, 33, 67, 193]
Run with parameters (0.0006, 2e-05, 'weighted_l1', 20, [100, 100, 100, 100, 100], 100) completed, best_val_accuracy: 0.4063, final_hidden_layer sizes: [52, 21, 100, 124, 808]
Best overall combination: (0.0002, 2e-05, 'weighted_l1', 20, [100, 100, 100, 100, 100], 100), val_accuracy: 0.451
CPU times: user 18min 25s, sys: 19.1 s, total: 18min 45s
Wall time: 20min 34s


In [None]:
%%time

histories = cross_validate(train_fn, X_norm, y, n_splits=6, learning_rate=0.0002,
                           regularization_penalty=0.00002, regularization_method='weighted_l1',
                           self_scaling_epochs=20, layer_sizes=[100, 100, 100, 100, 100], output_neurons=100)

Run 0 completed, best_val_accuracy: 0.4509, final_hidden_layer sizes: [65, 22, 30, 68, 200]
Run 1 completed, best_val_accuracy: 0.4497, final_hidden_layer sizes: [63, 24, 31, 69, 206]
Run 2 completed, best_val_accuracy: 0.4431, final_hidden_layer sizes: [63, 23, 28, 81, 189]
Run 3 completed, best_val_accuracy: 0.4468, final_hidden_layer sizes: [65, 27, 32, 70, 206]
Run 4 completed, best_val_accuracy: 0.459, final_hidden_layer sizes: [68, 19, 30, 69, 200]
Run 5 completed, best_val_accuracy: 0.4536, final_hidden_layer sizes: [65, 24, 25, 75, 204]
mean_best_val_accuracy: 0.4505166666666667
mean_final_hidden_layer_sizes: [64.83333333333333, 23.166666666666668, 29.333333333333332, 72.0, 200.83333333333334]
CPU times: user 36min 16s, sys: 35.5 s, total: 36min 52s
Wall time: 39min 4s


In [None]:
%%time

histories = hyperparameter_search(train_fn, learning_rate=[0.0002, 0.0005, 0.002, 0.006], 
                                  regularization_penalty=[0.], regularization_method=[None], 
                                  self_scaling_epochs=[0], layer_sizes=[[65, 23, 29, 72, 201]], output_neurons=[100])

Run with parameters (0.0002, 0.0, None, 0, [65, 23, 29, 72, 201], 100) completed, best_val_accuracy: 0.3203, final_hidden_layer sizes: [65, 23, 29, 72, 201]
Run with parameters (0.0005, 0.0, None, 0, [65, 23, 29, 72, 201], 100) completed, best_val_accuracy: 0.3173, final_hidden_layer sizes: [65, 23, 29, 72, 201]
Run with parameters (0.002, 0.0, None, 0, [65, 23, 29, 72, 201], 100) completed, best_val_accuracy: 0.2612, final_hidden_layer sizes: [65, 23, 29, 72, 201]
Run with parameters (0.006, 0.0, None, 0, [65, 23, 29, 72, 201], 100) completed, best_val_accuracy: 0.01, final_hidden_layer sizes: [65, 23, 29, 72, 201]
Best overall combination: (0.0002, 0.0, None, 0, [65, 23, 29, 72, 201], 100), val_accuracy: 0.3203
CPU times: user 15min 45s, sys: 28.4 s, total: 16min 13s
Wall time: 15min 36s


In [None]:
%%time

histories = hyperparameter_search(train_fn, learning_rate=[0.00005], 
                                  regularization_penalty=[0.], regularization_method=[None], 
                                  self_scaling_epochs=[0], layer_sizes=[[65, 23, 29, 72, 201]], output_neurons=[100])

Run with parameters (5e-05, 0.0, None, 0, [65, 23, 29, 72, 201], 100) completed, best_val_accuracy: 0.3112, final_hidden_layer sizes: [65, 23, 29, 72, 201]
Best overall combination: (5e-05, 0.0, None, 0, [65, 23, 29, 72, 201], 100), val_accuracy: 0.3112
CPU times: user 3min 54s, sys: 6.71 s, total: 4min 1s
Wall time: 3min 52s


In [None]:
%%time

histories = cross_validate(train_fn, X_norm, y, n_splits=6, learning_rate=0.0002,
                           regularization_penalty=0., regularization_method=None,
                           self_scaling_epochs=0, layer_sizes=[65, 23, 29, 72, 201], output_neurons=100)

Run 0 completed, best_val_accuracy: 0.3274, final_hidden_layer sizes: [65, 23, 29, 72, 201]
Run 1 completed, best_val_accuracy: 0.3229, final_hidden_layer sizes: [65, 23, 29, 72, 201]
Run 2 completed, best_val_accuracy: 0.3256, final_hidden_layer sizes: [65, 23, 29, 72, 201]
Run 3 completed, best_val_accuracy: 0.3191, final_hidden_layer sizes: [65, 23, 29, 72, 201]
Run 4 completed, best_val_accuracy: 0.3226, final_hidden_layer sizes: [65, 23, 29, 72, 201]
Run 5 completed, best_val_accuracy: 0.3247, final_hidden_layer sizes: [65, 23, 29, 72, 201]
mean_best_val_accuracy: 0.3237166666666667
mean_final_hidden_layer_sizes: [65.0, 23.0, 29.0, 72.0, 201.0]
CPU times: user 23min 32s, sys: 21.1 s, total: 23min 53s
Wall time: 22min 57s


In [None]:
model = get_convolutional_model(regularization_penalty=0., regularization_method=None, layer_sizes=[65, 23, 29, 72, 201], output_neurons=100)
model.build(X_norm.shape)
model.summary()

Model: "sequential_51"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_204 (Conv2D)          multiple                  1820      
_________________________________________________________________
conv2d_205 (Conv2D)          multiple                  13478     
_________________________________________________________________
dropout_102 (Dropout)        multiple                  0         
_________________________________________________________________
conv2d_206 (Conv2D)          multiple                  6032      
_________________________________________________________________
conv2d_207 (Conv2D)          multiple                  18864     
_________________________________________________________________
dropout_103 (Dropout)        multiple                  0         
_________________________________________________________________
flatten_51 (Flatten)         multiple                

In [None]:
model = get_convolutional_model(regularization_penalty=0., regularization_method=None, layer_sizes=[66, 66, 66, 66, 201], output_neurons=100)
model.build(X_norm.shape)
model.summary()

Model: "sequential_65"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_260 (Conv2D)          multiple                  1848      
_________________________________________________________________
conv2d_261 (Conv2D)          multiple                  39270     
_________________________________________________________________
dropout_130 (Dropout)        multiple                  0         
_________________________________________________________________
conv2d_262 (Conv2D)          multiple                  39270     
_________________________________________________________________
conv2d_263 (Conv2D)          multiple                  39270     
_________________________________________________________________
dropout_131 (Dropout)        multiple                  0         
_________________________________________________________________
flatten_65 (Flatten)         multiple                

In [None]:
%%time

histories = hyperparameter_search(train_fn, learning_rate=[0.00005, 0.0002, 0.0005, 0.002, 0.006], 
                                  regularization_penalty=[0.], regularization_method=[None], 
                                  self_scaling_epochs=[0], layer_sizes=[[66, 66, 66, 66, 201]], output_neurons=[100])

Run with parameters (5e-05, 0.0, None, 0, [66, 66, 66, 66, 201], 100) completed, best_val_accuracy: 0.3275, final_hidden_layer sizes: [66, 66, 66, 66, 201]
Run with parameters (0.0002, 0.0, None, 0, [66, 66, 66, 66, 201], 100) completed, best_val_accuracy: 0.3547, final_hidden_layer sizes: [66, 66, 66, 66, 201]
Run with parameters (0.0005, 0.0, None, 0, [66, 66, 66, 66, 201], 100) completed, best_val_accuracy: 0.3479, final_hidden_layer sizes: [66, 66, 66, 66, 201]
Run with parameters (0.002, 0.0, None, 0, [66, 66, 66, 66, 201], 100) completed, best_val_accuracy: 0.3233, final_hidden_layer sizes: [66, 66, 66, 66, 201]
Run with parameters (0.006, 0.0, None, 0, [66, 66, 66, 66, 201], 100) completed, best_val_accuracy: 0.01, final_hidden_layer sizes: [66, 66, 66, 66, 201]
Best overall combination: (0.0002, 0.0, None, 0, [66, 66, 66, 66, 201], 100), val_accuracy: 0.3547
CPU times: user 20min 10s, sys: 20.7 s, total: 20min 31s
Wall time: 21min 51s


In [None]:
%%time

histories = cross_validate(train_fn, X_norm, y, n_splits=6, learning_rate=0.0002,
                           regularization_penalty=0., regularization_method=None,
                           self_scaling_epochs=0, layer_sizes=[66, 66, 66, 66, 201], output_neurons=100)

Run 0 completed, best_val_accuracy: 0.349, final_hidden_layer sizes: [66, 66, 66, 66, 201]
Run 1 completed, best_val_accuracy: 0.372, final_hidden_layer sizes: [66, 66, 66, 66, 201]
Run 2 completed, best_val_accuracy: 0.3374, final_hidden_layer sizes: [66, 66, 66, 66, 201]
Run 3 completed, best_val_accuracy: 0.3614, final_hidden_layer sizes: [66, 66, 66, 66, 201]
Run 4 completed, best_val_accuracy: 0.3378, final_hidden_layer sizes: [66, 66, 66, 66, 201]
Run 5 completed, best_val_accuracy: 0.3616, final_hidden_layer sizes: [66, 66, 66, 66, 201]
mean_best_val_accuracy: 0.3532
mean_final_hidden_layer_sizes: [66.0, 66.0, 66.0, 66.0, 201.0]
CPU times: user 24min 12s, sys: 24.8 s, total: 24min 37s
Wall time: 26min 13s


## Street View House Numbers

In [None]:
from urllib.request import urlretrieve

train_filename, _ = urlretrieve('http://ufldl.stanford.edu/housenumbers/train_32x32.mat')
test_filename, _ = urlretrieve('http://ufldl.stanford.edu/housenumbers/test_32x32.mat')

In [None]:
from scipy import io

X_train = io.loadmat(train_filename, variable_names='X').get('X')
y_train = io.loadmat(train_filename, variable_names='y').get('y')
X_test = io.loadmat(test_filename, variable_names='X').get('X')
y_test = io.loadmat(test_filename, variable_names='y').get('y')

X_train = np.moveaxis(X_train, -1, 0)
y_train -= 1
X_test = np.moveaxis(X_test, -1, 0)
y_test -= 1

X_train = X_train.astype(dtype) / 255.0
y_train = y_train.astype(dtype)
X_test = X_test.astype(dtype)  / 255.0
y_test = y_test.astype(dtype)

X_train = np.reshape(X_train, (-1, 3072))
X_test = np.reshape(X_test, (-1, 3072))

X = np.concatenate((X_train, X_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)  # Scaling each feature independently

X_norm = scaler.transform(X)
X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)

X_norm = np.reshape(X_norm, (-1, 32, 32, 3))
X_train_norm = np.reshape(X_train_norm, (-1, 32, 32, 3))
X_test_norm = np.reshape(X_test_norm, (-1, 32, 32, 3))

In [None]:
%%time

histories = hyperparameter_search(train_fn, learning_rate=[0.00005, 0.0002, 0.0006], 
                                  regularization_penalty=[0.00002], regularization_method=['weighted_l1'], 
                                  self_scaling_epochs=[20], layer_sizes=[[100, 100, 100, 100, 100]])

Run with parameters (5e-05, 2e-05, 'weighted_l1', 20, [100, 100, 100, 100, 100]) completed, best_val_accuracy: 0.9157959434542102, best_hidden_layer_sizes sizes: [54, 28, 29, 52, 78]
Run with parameters (0.0002, 2e-05, 'weighted_l1', 20, [100, 100, 100, 100, 100]) completed, best_val_accuracy: 0.9251306084818685, best_hidden_layer_sizes sizes: [24, 19, 17, 43, 67]
Run with parameters (0.0006, 2e-05, 'weighted_l1', 20, [100, 100, 100, 100, 100]) completed, best_val_accuracy: 0.9240934234787953, best_hidden_layer_sizes sizes: [15, 17, 33, 55, 241]
Best overall combination: (0.0002, 2e-05, 'weighted_l1', 20, [100, 100, 100, 100, 100]), val_accuracy: 0.9251306084818685
CPU times: user 28min 8s, sys: 29.6 s, total: 28min 38s
Wall time: 29min 6s


In [None]:
%%time

histories = cross_validate(train_fn, X_norm, y, n_splits=6, learning_rate=0.0002,
                           regularization_penalty=0.00002, regularization_method='weighted_l1',
                           self_scaling_epochs=20, layer_sizes=[100, 100, 100, 100, 100])

Run 0 completed, best_val_accuracy: 0.9232867240319607, best_hidden_layer_sizes: [22, 21, 18, 46, 61]
Run 1 completed, best_val_accuracy: 0.9245928088506453, best_hidden_layer_sizes: [20, 18, 20, 39, 82]
Run 2 completed, best_val_accuracy: 0.9235556238475722, best_hidden_layer_sizes: [26, 17, 17, 43, 77]
Run 3 completed, best_val_accuracy: 0.926359864781807, best_hidden_layer_sizes: [25, 15, 17, 44, 79]
Run 4 completed, best_val_accuracy: 0.9247464658881377, best_hidden_layer_sizes: [26, 18, 21, 47, 73]
Run 5 completed, best_val_accuracy: 0.9237476951444377, best_hidden_layer_sizes: [25, 18, 21, 39, 74]
mean_best_val_accuracy: 0.9243815304240934
mean_best_hidden_layer_sizes: [24.0, 17.833333333333332, 19.0, 43.0, 74.33333333333333]
CPU times: user 56min 3s, sys: 59.3 s, total: 57min 2s
Wall time: 57min 16s


In [None]:
%%time

histories = hyperparameter_search(train_fn, learning_rate=[0.00005, 0.0002, 0.0005, 0.002], 
                                  regularization_penalty=[0.], regularization_method=[None], 
                                  self_scaling_epochs=[0], layer_sizes=[[24, 18, 19, 43, 74]])

Run with parameters (5e-05, 0.0, None, 0, [24, 18, 19, 43, 74]) completed, best_val_accuracy: 0.8676628764597418, best_hidden_layer_sizes sizes: [24, 18, 19, 43, 74]
Run with parameters (0.0002, 0.0, None, 0, [24, 18, 19, 43, 74]) completed, best_val_accuracy: 0.8984711124769514, best_hidden_layer_sizes sizes: [24, 18, 19, 43, 74]
Run with parameters (0.0005, 0.0, None, 0, [24, 18, 19, 43, 74]) completed, best_val_accuracy: 0.9091502765826674, best_hidden_layer_sizes sizes: [24, 18, 19, 43, 74]
Run with parameters (0.002, 0.0, None, 0, [24, 18, 19, 43, 74]) completed, best_val_accuracy: 0.9029655808236017, best_hidden_layer_sizes sizes: [24, 18, 19, 43, 74]
Best overall combination: (0.0005, 0.0, None, 0, [24, 18, 19, 43, 74]), val_accuracy: 0.9091502765826674
CPU times: user 25min, sys: 23.6 s, total: 25min 24s
Wall time: 24min 22s


In [None]:
model = get_convolutional_model(regularization_penalty=0., regularization_method=None, layer_sizes=[24, 18, 19, 43, 74])
model.build(X_norm.shape)
model.summary()

Model: "sequential_93"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_372 (Conv2D)          multiple                  672       
_________________________________________________________________
conv2d_373 (Conv2D)          multiple                  3906      
_________________________________________________________________
dropout_186 (Dropout)        multiple                  0         
_________________________________________________________________
conv2d_374 (Conv2D)          multiple                  3097      
_________________________________________________________________
conv2d_375 (Conv2D)          multiple                  7396      
_________________________________________________________________
dropout_187 (Dropout)        multiple                  0         
_________________________________________________________________
flatten_93 (Flatten)         multiple                

In [None]:
model = get_convolutional_model(regularization_penalty=0., regularization_method=None, layer_sizes=[38, 38, 38, 38, 74])
model.build(X_norm.shape)
model.summary()

Model: "sequential_94"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_376 (Conv2D)          multiple                  1064      
_________________________________________________________________
conv2d_377 (Conv2D)          multiple                  13034     
_________________________________________________________________
dropout_188 (Dropout)        multiple                  0         
_________________________________________________________________
conv2d_378 (Conv2D)          multiple                  13034     
_________________________________________________________________
conv2d_379 (Conv2D)          multiple                  13034     
_________________________________________________________________
dropout_189 (Dropout)        multiple                  0         
_________________________________________________________________
flatten_94 (Flatten)         multiple                

In [None]:
%%time

histories = hyperparameter_search(train_fn, learning_rate=[0.00005, 0.0002, 0.0005, 0.002], 
                                  regularization_penalty=[0.], regularization_method=[None], 
                                  self_scaling_epochs=[0], layer_sizes=[[38, 38, 38, 38, 74]])

Run with parameters (5e-05, 0.0, None, 0, [38, 38, 38, 38, 74]) completed, best_val_accuracy: 0.8982406269207129, best_hidden_layer_sizes sizes: [38, 38, 38, 38, 74]
Run with parameters (0.0002, 0.0, None, 0, [38, 38, 38, 38, 74]) completed, best_val_accuracy: 0.9143746158574063, best_hidden_layer_sizes sizes: [38, 38, 38, 38, 74]
Run with parameters (0.0005, 0.0, None, 0, [38, 38, 38, 38, 74]) completed, best_val_accuracy: 0.9153733866011063, best_hidden_layer_sizes sizes: [38, 38, 38, 38, 74]
Run with parameters (0.002, 0.0, None, 0, [38, 38, 38, 38, 74]) completed, best_val_accuracy: 0.9020436385986478, best_hidden_layer_sizes sizes: [38, 38, 38, 38, 74]
Best overall combination: (0.0005, 0.0, None, 0, [38, 38, 38, 38, 74]), val_accuracy: 0.9153733866011063
CPU times: user 24min 46s, sys: 23.3 s, total: 25min 9s
Wall time: 24min 7s


## Various numbers of added neurons

In [None]:
cifar100 = tf.keras.datasets.cifar100
(X_train, y_train), (X_test, y_test) = cifar100.load_data()

X_train = X_train.astype(dtype) / 255.0
y_train = y_train.astype(dtype)
X_test = X_test.astype(dtype)  / 255.0
y_test = y_test.astype(dtype)

X_train = np.reshape(X_train, (-1, 3072))
X_test = np.reshape(X_test, (-1, 3072))

X = np.concatenate((X_train, X_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)  # Scaling each feature independently

X_norm = scaler.transform(X)
X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)

X_norm = np.reshape(X_norm, (-1, 32, 32, 3))
X_train_norm = np.reshape(X_train_norm, (-1, 32, 32, 3))
X_test_norm = np.reshape(X_test_norm, (-1, 32, 32, 3))

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz


In [None]:
train_fn(learning_rate=0.0002, regularization_penalty=0.00002, regularization_method='weighted_l1', self_scaling_epochs=25, 
         layer_sizes=[65, 23, 29, 72, 201], epochs=45, pruning_only_epochs=5, min_new_neurons=20, growth_percentage=0.2, output_neurons=100, verbose=True)

##########################################################
Epoch 1/45
Before growing:
loss: None - accuracy: None - val_loss: 5.109777927398682 - val_accuracy: 0.0074 - penalty: 2e-05
hidden layer sizes: [65, 23, 29, 72, 201], total units: 390
After growing:
loss: None - accuracy: None - val_loss: 5.109777927398682 - val_accuracy: 0.0074 - penalty: 2e-05
hidden layer sizes: [85, 43, 49, 92, 241], total units: 510
Before pruning:
loss: 4.022263526916504 - accuracy: 0.10886 - val_loss: 3.635108709335327 - val_accuracy: 0.1627 - penalty: 2e-05
hidden layer sizes: [85, 43, 49, 92, 241], total units: 510
After pruning:
loss: None - accuracy: None - val_loss: 3.6351377964019775 - val_accuracy: 0.1627 - penalty: 2e-05
hidden layer sizes: [65, 28, 29, 72, 201], total units: 395
##########################################################
Epoch 2/45
Before growing:
loss: None - accuracy: None - val_loss: 3.6351377964019775 - val_accuracy: 0.1627 - penalty: 2e-05
hidden layer sizes: [65, 28, 29, 7

{'accuracy': [0.10886,
  0.16018,
  0.18474,
  0.21626,
  0.24354,
  0.26132,
  0.27734,
  0.29562,
  0.30502,
  0.31882,
  0.32742,
  0.33616,
  0.34476,
  0.35048,
  0.35752,
  0.36154,
  0.36686,
  0.37234,
  0.374,
  0.3806,
  0.3809,
  0.3828,
  0.38544,
  0.39094,
  0.39198,
  0.38374,
  0.45196,
  0.47614,
  0.503,
  0.52294,
  0.5414,
  0.56282,
  0.581,
  0.59262,
  0.61068,
  0.62522,
  0.63884,
  0.6493,
  0.66408,
  0.6706,
  0.6823,
  0.69084,
  0.69828,
  0.70974,
  0.71898],
 'hidden_layer_sizes': [[65, 28, 29, 72, 201],
  [65, 24, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 70, 204],
  [65, 23, 29, 68, 204],
  [65, 23, 29, 66, 205],
  [65, 23, 29, 62, 209],
  [65, 23, 29, 64, 207],
  [65, 23, 29, 64, 210],
  [65, 23, 29, 68, 210],
  [65, 23, 29, 70, 210],
  [65, 23, 29, 71, 210],
  [65, 23, 29, 70, 211],
  [63, 23, 29, 73, 211],
  [60, 23, 29, 70, 212],
  [60, 23, 29, 73, 213],
  [60, 23, 29, 75, 213],
  [59, 23, 29, 72, 214],
  [59, 2

In [None]:
history = train_fn(learning_rate=0.0002, regularization_penalty=0.00002, regularization_method='weighted_l1', self_scaling_epochs=25, 
         layer_sizes=[65, 23, 29, 72, 201], epochs=45, pruning_only_epochs=5, min_new_neurons=80, growth_percentage=0.8, output_neurons=100, verbose=True)

##########################################################
Epoch 1/45
Before growing:
loss: None - accuracy: None - val_loss: 5.190567970275879 - val_accuracy: 0.0091 - penalty: 2e-05
hidden layer sizes: [65, 23, 29, 72, 201], total units: 390
After growing:
loss: None - accuracy: None - val_loss: 5.190567970275879 - val_accuracy: 0.0091 - penalty: 2e-05
hidden layer sizes: [145, 103, 109, 152, 361], total units: 870
Before pruning:
loss: 4.023648262023926 - accuracy: 0.10894 - val_loss: 3.6310110092163086 - val_accuracy: 0.1578 - penalty: 2e-05
hidden layer sizes: [145, 103, 109, 152, 361], total units: 870
After pruning:
loss: None - accuracy: None - val_loss: 3.63093638420105 - val_accuracy: 0.1578 - penalty: 2e-05
hidden layer sizes: [65, 24, 29, 72, 201], total units: 391
##########################################################
Epoch 2/45
Before growing:
loss: None - accuracy: None - val_loss: 3.63093638420105 - val_accuracy: 0.1578 - penalty: 2e-05
hidden layer sizes: [65, 24, 

In [None]:
train_fn(learning_rate=0.0002, regularization_penalty=0.00002, regularization_method='weighted_l1', self_scaling_epochs=25, 
         layer_sizes=[65, 23, 29, 72, 201], epochs=45, pruning_only_epochs=5, min_new_neurons=3, growth_percentage=0.03, output_neurons=100, verbose=True)

##########################################################
Epoch 1/45
Before growing:
loss: None - accuracy: None - val_loss: 5.087422847747803 - val_accuracy: 0.0114 - penalty: 2e-05
hidden layer sizes: [65, 23, 29, 72, 201], total units: 390
After growing:
loss: None - accuracy: None - val_loss: 5.087422847747803 - val_accuracy: 0.0114 - penalty: 2e-05
hidden layer sizes: [68, 26, 32, 75, 207], total units: 408
Before pruning:
loss: 4.023555755615234 - accuracy: 0.1083 - val_loss: 3.6138393878936768 - val_accuracy: 0.1695 - penalty: 2e-05
hidden layer sizes: [68, 26, 32, 75, 207], total units: 408
After pruning:
loss: None - accuracy: None - val_loss: 3.61383056640625 - val_accuracy: 0.1695 - penalty: 2e-05
hidden layer sizes: [65, 25, 29, 72, 201], total units: 392
##########################################################
Epoch 2/45
Before growing:
loss: None - accuracy: None - val_loss: 3.61383056640625 - val_accuracy: 0.1695 - penalty: 2e-05
hidden layer sizes: [65, 25, 29, 72, 2

{'accuracy': [0.1083,
  0.159,
  0.18468,
  0.20968,
  0.2365,
  0.2597,
  0.27706,
  0.29124,
  0.30326,
  0.31482,
  0.32476,
  0.33048,
  0.33872,
  0.34262,
  0.35102,
  0.35528,
  0.36306,
  0.36488,
  0.36598,
  0.37126,
  0.37652,
  0.37676,
  0.38052,
  0.3851,
  0.38582,
  0.37816,
  0.44222,
  0.47036,
  0.49168,
  0.5112,
  0.53198,
  0.54832,
  0.56712,
  0.58414,
  0.59502,
  0.60912,
  0.62166,
  0.63244,
  0.64212,
  0.65272,
  0.6631,
  0.67496,
  0.6782,
  0.68672,
  0.69862],
 'hidden_layer_sizes': [[65, 25, 29, 72, 201],
  [65, 25, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 203],
  [65, 23, 29, 72, 205],
  [65, 23, 29, 69, 202],
  [65, 23, 29, 65, 202],
  [65, 23, 29, 65, 202],
  [65, 23, 29, 65, 202],
  [65, 23, 29, 66, 202],
  [65, 23, 29, 66, 202],
  [64, 23, 29, 65, 202],
  [64, 23, 29, 64, 202],
  [63, 23, 29, 64, 202],
  [63, 23, 29, 65, 203],
  [63, 23, 29, 65, 205],
  [63, 23, 29, 65, 203],
  [62, 23, 29, 66, 204],
  [6

In [None]:
train_fn(learning_rate=0.0002, regularization_penalty=0.00002, regularization_method='weighted_l1', self_scaling_epochs=25, 
         layer_sizes=[65, 23, 29, 72, 201], epochs=45, pruning_only_epochs=25, min_new_neurons=1, growth_percentage=0., output_neurons=100, verbose=True)

##########################################################
Epoch 1/45
Before pruning:
loss: 3.992975950241089 - accuracy: 0.11334 - val_loss: 3.6081390380859375 - val_accuracy: 0.1674 - penalty: 2e-05
hidden layer sizes: [65, 23, 29, 72, 201], total units: 390
After pruning:
loss: None - accuracy: None - val_loss: 3.6081390380859375 - val_accuracy: 0.1674 - penalty: 2e-05
hidden layer sizes: [65, 23, 29, 72, 201], total units: 390
##########################################################
Epoch 2/45
Before pruning:
loss: 3.6014726161956787 - accuracy: 0.16406 - val_loss: 3.4407148361206055 - val_accuracy: 0.1902 - penalty: 2e-05
hidden layer sizes: [65, 23, 29, 72, 201], total units: 390
After pruning:
loss: None - accuracy: None - val_loss: 3.4407148361206055 - val_accuracy: 0.1902 - penalty: 2e-05
hidden layer sizes: [65, 23, 29, 72, 201], total units: 390
##########################################################
Epoch 3/45
Before pruning:
loss: 3.4443089962005615 - accuracy: 0.1868

{'accuracy': [0.11334,
  0.16406,
  0.18686,
  0.21404,
  0.24042,
  0.26002,
  0.2753,
  0.28964,
  0.30378,
  0.3089,
  0.32214,
  0.33034,
  0.33752,
  0.3441,
  0.35336,
  0.35844,
  0.3621,
  0.36476,
  0.36848,
  0.37152,
  0.3756,
  0.37704,
  0.38282,
  0.38352,
  0.38778,
  0.38028,
  0.4454,
  0.47324,
  0.49116,
  0.51296,
  0.53066,
  0.54682,
  0.56188,
  0.57644,
  0.59178,
  0.6036,
  0.6134,
  0.62668,
  0.63468,
  0.6489,
  0.65284,
  0.66126,
  0.67116,
  0.67646,
  0.68608],
 'hidden_layer_sizes': [[65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 71, 201],
  [65, 23, 29, 71, 201],
  [65, 23, 29, 71, 201],
  [65, 23, 29, 69, 201],
  [65, 23, 29, 67, 201],
  [65, 23, 29, 66, 201],
  [65, 23, 29, 65, 201],
  [65, 23, 29, 64, 201],
  [64, 23, 29, 64, 201],
  [64, 23, 29, 64, 201],
  [64, 23, 29, 63, 201],
  [62, 23, 29, 63, 201],
  [61, 23, 29, 63, 201],
  [61, 23, 29, 63, 201],
  [5

In [None]:
train_fn(learning_rate=0.0002, regularization_penalty=0.00000002, regularization_method='weighted_l1', self_scaling_epochs=25, 
         layer_sizes=[65, 23, 29, 72, 201], epochs=45, pruning_only_epochs=25, min_new_neurons=1, growth_percentage=0., output_neurons=100, verbose=True)

##########################################################
Epoch 1/45
Before pruning:
loss: 4.042459487915039 - accuracy: 0.10936 - val_loss: 3.5285441875457764 - val_accuracy: 0.1856 - penalty: 2e-08
hidden layer sizes: [65, 23, 29, 72, 201], total units: 390
After pruning:
loss: None - accuracy: None - val_loss: 3.5285441875457764 - val_accuracy: 0.1856 - penalty: 2e-08
hidden layer sizes: [65, 23, 29, 72, 201], total units: 390
##########################################################
Epoch 2/45
Before pruning:
loss: 3.5535266399383545 - accuracy: 0.18092 - val_loss: 3.3058547973632812 - val_accuracy: 0.2268 - penalty: 2e-08
hidden layer sizes: [65, 23, 29, 72, 201], total units: 390
After pruning:
loss: None - accuracy: None - val_loss: 3.3058547973632812 - val_accuracy: 0.2268 - penalty: 2e-08
hidden layer sizes: [65, 23, 29, 72, 201], total units: 390
##########################################################
Epoch 3/45
Before pruning:
loss: 3.3462257385253906 - accuracy: 0.2131

{'accuracy': [0.10936,
  0.18092,
  0.21318,
  0.24234,
  0.26524,
  0.29294,
  0.31004,
  0.33394,
  0.35352,
  0.3693,
  0.38136,
  0.3951,
  0.40744,
  0.41642,
  0.42992,
  0.43602,
  0.44686,
  0.44884,
  0.46008,
  0.4672,
  0.47128,
  0.47426,
  0.48148,
  0.48888,
  0.48862,
  0.49608,
  0.53858,
  0.55262,
  0.56618,
  0.57426,
  0.5866,
  0.59066,
  0.59672,
  0.60234,
  0.60832,
  0.61264,
  0.61794,
  0.62048,
  0.6272,
  0.63162,
  0.63598,
  0.63818,
  0.64452,
  0.64776,
  0.6523],
 'hidden_layer_sizes': [[65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
 

In [None]:
train_fn(learning_rate=0.0002, regularization_penalty=0., regularization_method=None, self_scaling_epochs=25, 
         layer_sizes=[65, 23, 29, 72, 201], epochs=45, pruning_only_epochs=25, min_new_neurons=1, growth_percentage=0., output_neurons=100, verbose=True)

##########################################################
Epoch 1/45
Before pruning:
loss: 4.0540876388549805 - accuracy: 0.1073 - val_loss: 3.542466402053833 - val_accuracy: 0.1866 - penalty: 0.0
hidden layer sizes: [65, 23, 29, 72, 201], total units: 390
After pruning:
loss: None - accuracy: None - val_loss: 3.542466402053833 - val_accuracy: 0.1866 - penalty: 0.0
hidden layer sizes: [65, 23, 29, 72, 201], total units: 390
##########################################################
Epoch 2/45
Before pruning:
loss: 3.587552785873413 - accuracy: 0.17374 - val_loss: 3.353174924850464 - val_accuracy: 0.2133 - penalty: 0.0
hidden layer sizes: [65, 23, 29, 72, 201], total units: 390
After pruning:
loss: None - accuracy: None - val_loss: 3.353174924850464 - val_accuracy: 0.2133 - penalty: 0.0
hidden layer sizes: [65, 23, 29, 72, 201], total units: 390
##########################################################
Epoch 3/45
Before pruning:
loss: 3.377345085144043 - accuracy: 0.20884 - val_loss: 

{'accuracy': [0.1073,
  0.17374,
  0.20884,
  0.24028,
  0.26248,
  0.2852,
  0.30876,
  0.3307,
  0.34718,
  0.36394,
  0.38012,
  0.3876,
  0.4052,
  0.41532,
  0.42418,
  0.43424,
  0.44196,
  0.45062,
  0.45308,
  0.45912,
  0.46622,
  0.47206,
  0.47812,
  0.48374,
  0.48298,
  0.48802,
  0.53452,
  0.54826,
  0.55998,
  0.57382,
  0.57874,
  0.58616,
  0.59152,
  0.59558,
  0.6042,
  0.60732,
  0.61616,
  0.61578,
  0.62308,
  0.62656,
  0.63088,
  0.63506,
  0.63736,
  0.64502,
  0.64388],
 'hidden_layer_sizes': [[65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
  [65, 23, 29, 72, 201],
 