In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Wed Aug  4 12:43:39 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn

%matplotlib inline

In [None]:
# Floating-point type used throughout the notebook: data arrays are cast to it
# and the model variables are created with it.
dtype = 'float32'
# Set the Keras backend default float type to the same value.
tf.keras.backend.set_floatx(dtype)

In [None]:
# fashion_mnist = tf.keras.datasets.fashion_mnist
# (X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()

# X_train = X_train.astype(dtype) / 255.0
# y_train = y_train.astype(dtype)
# X_test = X_test.astype(dtype)  / 255.0
# y_test = y_test.astype(dtype)

# X_train = np.reshape(X_train, (-1, 784))
# X_test = np.reshape(X_test, (-1, 784))

In [None]:
# cifar10 = tf.keras.datasets.cifar10
# (X_train, y_train), (X_test, y_test) = cifar10.load_data()

# X_train = X_train.astype(dtype) / 255.0
# y_train = y_train.astype(dtype)
# X_test = X_test.astype(dtype)  / 255.0
# y_test = y_test.astype(dtype)

# X_train = np.reshape(X_train, (-1, 3072))
# X_test = np.reshape(X_test, (-1, 3072))

In [None]:
from sklearn.model_selection import train_test_split

# Glass dataset: a header-less CSV whose column 10 holds the class label.
glass = pd.read_csv('glass.data', header=None)

# Split the label column off as an (n_samples, 1) array; every remaining
# column becomes a feature.
# NOTE(review): column 0 is presumably the running sample Id of the UCI file and
# is kept as a feature here -- confirm this is intended, it may leak the label.
y = glass.pop(10).values.astype(dtype).reshape(-1, 1)
X = glass.values.astype(dtype)

# Fixed-seed 70/30 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3)

In [None]:
from sklearn.model_selection import train_test_split

# Vehicle dataset: header-less, whitespace-separated text file.
# `sep=r'\s+'` is the documented replacement for `delim_whitespace=True`,
# which is deprecated in recent pandas releases.
vehicle = pd.read_csv('vehicle.txt', header=None, sep=r'\s+')

# Column 18 holds the class name; replace it with integer category codes.
vehicle[18] = pd.Categorical(vehicle[18])
vehicle[18] = vehicle[18].cat.codes

# Labels as an (n_samples, 1) array, all remaining columns as features.
y = vehicle.pop(18).values.astype(dtype)
y = np.reshape(y, (-1, 1))
X = vehicle.values.astype(dtype)
# Fixed-seed 70/30 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
from sklearn.model_selection import train_test_split

# Ionosphere dataset: header-less CSV whose column 34 holds the class label.
ionosphere = pd.read_csv('ionosphere.data', header=None)

# Encode the label column as integer category codes.
ionosphere[34] = pd.Categorical(ionosphere[34])
ionosphere[34] = ionosphere[34].cat.codes

# Labels as an (n_samples, 1) array, all remaining columns as features.
y = ionosphere.pop(34).values.astype(dtype).reshape(-1, 1)
X = ionosphere.values.astype(dtype)

# Fixed-seed 70/30 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3)

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the standardisation on the training split only, then apply the same
# transformation to the full data set and to both splits.
scaler = StandardScaler().fit(X_train)

X_norm, X_train_norm, X_test_norm = (
    scaler.transform(features) for features in (X, X_train, X_test)
)

In [None]:
# Show the (n_samples, n_features) shape of the currently loaded feature matrix.
X.shape

(351, 34)

In [None]:
class SSRegularizer(tf.keras.regularizers.Regularizer):
    """Column-wise sparsity regularizer for a layer's parameter matrix.

    The regularized tensor is expected to have one column per output neuron
    (the last axis indexes output neurons).

    Args:
        regularization_penalty: Scalar strength of the penalty.
        regularization_method: 'weighted_l1' or 'group_sparsity'. Any other
            value makes `__call__` raise `NotImplementedError`.
    """

    def __init__(self, regularization_penalty, regularization_method):
        self.regularization_penalty = regularization_penalty
        self.regularization_method = regularization_method

    def __call__(self, x):
        """Return the scalar penalty for parameter tensor `x`.

        Raises:
            NotImplementedError: if `regularization_method` is not one of the
                supported method names.
        """
        if self.regularization_method == 'weighted_l1':
            return self.weighted_l1(x)
        elif self.regularization_method == 'group_sparsity':
            return self.group_sparsity(x)
        else:
            raise NotImplementedError(f"Unknown regularization method {self.regularization_method}")

    def weighted_l1(self, x):
        """L1 penalty whose weight grows linearly with the output-neuron index."""
        # I.e. for a parameter matrix of 4 input and 10 output neurons:
        #
        # [[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        #  [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        #  [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        #  [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]
        #
        # With regularization_penalty = 1 the scaling vector (a cumulative sum
        # of the penalty) is [1., 2., 3., 4., 5., 6., 7., 8., 9., 10.],
        # and the resulting weighted values are
        #
        # [[1., 2., 3., 4., 5., 6., 7., 8., 9., 10.],
        #  [1., 2., 3., 4., 5., 6., 7., 8., 9., 10.],
        #  [1., 2., 3., 4., 5., 6., 7., 8., 9., 10.],
        #  [1., 2., 3., 4., 5., 6., 7., 8., 9., 10.]]
        #
        # Therefore every additional output neuron is regularized more.

        scaling_vector = tf.cumsum(tf.constant(self.regularization_penalty, shape=(x.shape[-1],), dtype=dtype), axis=0)
        weighted_values = scaling_vector * tf.abs(x)
        return tf.reduce_sum(weighted_values)

    def group_sparsity(self, x):
        """Penalty times the sum of the per-column L2 norms of `x`."""
        # I.e. for a parameter matrix of 3 input and 5 output neurons:
        #
        # [[1., 1., 1., 1., 1.],
        #  [1., 2., 2., 1., 2.],
        #  [2., 2., 3., 1., 3.]]
        #
        # The vector of group norms (L2 norm of each column) is approximately
        # [2.45, 3., 3.74, 1.73, 3.74], i.e. for every output neuron its
        # incoming connections form a group.

        group_norms = tf.norm(x, ord=2, axis=0)
        # assert group_norms.shape[0] == x.shape[1]
        return self.regularization_penalty * tf.reduce_sum(group_norms)

    def get_config(self):
        """Return the constructor arguments as a dict.

        Both arguments are included so that the dict can be passed back to
        `__init__`, which requires `regularization_method` as well.
        """
        return {
            'regularization_penalty': float(self.regularization_penalty),
            'regularization_method': self.regularization_method,
        }


class SSLayer(tf.keras.Model):
    """Dense layer that owns its kernel `W` and bias `b` as plain variables.

    Args:
        input_units: Number of input features (rows of `W`).
        units: Number of output neurons (columns of `W`, length of `b`).
        activation: Activation identifier resolved via `tf.keras.activations.get`.
        regularization_penalty: Strength passed to `SSRegularizer`.
        regularization_method: Method name passed to `SSRegularizer`; `None`
            means no regularization loss is registered.
        kernel_initializer: Initializer identifier for `W`.
        bias_initializer: Initializer identifier for `b`.
        regularize: When False, no regularization loss is registered even if
            a method is given. Defaults to True.
    """

    def __init__(self, input_units, units, activation, regularization_penalty, regularization_method, kernel_initializer, bias_initializer, regularize=True):
        super().__init__()

        # Keep the raw constructor arguments so the layer can be re-created
        # by `copy_without_regularization`.
        self.input_units = input_units
        self.units = units
        self.activation = activation
        self.regularization_penalty = regularization_penalty
        self.regularization_method = regularization_method
        self.kernel_initializer = kernel_initializer
        self.bias_initializer = bias_initializer

        self.A = tf.keras.activations.get(activation)
        self.W_init = tf.keras.initializers.get(kernel_initializer)
        self.b_init = tf.keras.initializers.get(bias_initializer)
        self.regularizer = SSRegularizer(self.regularization_penalty, self.regularization_method)

        self.W = tf.Variable(
            name='W',
            initial_value=self.W_init(shape=(input_units, units), dtype=dtype),
            trainable=True)

        self.b = tf.Variable(
            name='b',
            initial_value=self.b_init(shape=(units,), dtype=dtype),
            trainable=True)

        if regularize and self.regularization_method is not None:
            # The loss is a callable so that it always reads the *current*
            # `self.W` / `self.b`, which may be replaced after construction.
            # The bias is appended as an extra row so that each column holds
            # all parameters of one output neuron.
            self.add_loss(lambda: self.regularizer(tf.concat([self.W, tf.reshape(self.b, (1, -1))], axis=0)))

    def call(self, inputs):
        """Apply the affine map followed by the activation."""
        return self.A(tf.matmul(inputs, self.W) + self.b)

    def copy_without_regularization(self):
        """Return a new layer sharing this layer's `W` and `b` but with no regularization loss."""
        copy = SSLayer(
            self.input_units,
            self.units,
            self.activation,
            regularization_penalty=self.regularization_penalty,
            regularization_method=None,
            kernel_initializer=self.kernel_initializer,
            bias_initializer=self.bias_initializer
        )
        # Replace the freshly initialised variables with the trained ones
        # (the very same variable objects, not copies).
        copy.W = self.W
        copy.b = self.b
        return copy


class SSModel(tf.keras.Model):
    """Fully connected softmax classifier whose hidden layers are resized during training.

    The network is a chain of `SSLayer`s. During the first
    `self_scaling_epochs` epochs of `fit`, every hidden layer is grown with
    new neurons before the optimisation pass and pruned of neurons with only
    small parameters after it.

    `fit` and `evaluate` shadow the Keras methods of the same name and have
    different signatures and return values.

    Args:
        layer_sizes: `[n_inputs, hidden_1, ..., hidden_k, n_outputs]`.
        activation: Activation of the hidden layers (the last layer always
            uses softmax).
        regularization_penalty: Penalty of the hidden layers (the last layer
            is created with penalty 0).
        regularization_method: Method name understood by `SSRegularizer`.
        kernel_initializer: Initializer identifier for the kernels.
        bias_initializer: Initializer identifier for the biases.
    """

    def __init__(self, layer_sizes, activation=None, regularization_penalty=0.01, regularization_method='weighted_l1', kernel_initializer='glorot_uniform', bias_initializer='zeros'):
        super().__init__()

        self.sslayers = list()
        for l in range(len(layer_sizes) - 1):
            input_units = layer_sizes[l]
            units = layer_sizes[l + 1]
            if l < len(layer_sizes) - 2:
                layer = SSLayer(input_units, units, activation, regularization_penalty, regularization_method, kernel_initializer, bias_initializer)
            else:  # Last layer
                layer = SSLayer(input_units, units, 'softmax', 0., regularization_method, kernel_initializer, bias_initializer)
            self.sslayers.append(layer)

    def call(self, inputs):
        """Run `inputs` through all layers in order."""
        x = inputs
        for layer in self.sslayers:
            x = layer(x)
        return x

    def get_layer_sizes(self):
        """Return the current `[n_inputs, hidden..., n_outputs]`, read from the kernel shapes."""
        layer_sizes = list()
        for l, layer in enumerate(self.sslayers):
            layer_sizes.append(layer.W.shape[0])
            if l == len(self.sslayers) - 1:  # Last layer
                layer_sizes.append(layer.W.shape[1])
        return layer_sizes

    def get_hidden_layer_sizes(self):
        """Return the current sizes of the hidden layers only."""
        return self.get_layer_sizes()[1:-1]

    def remove_regularization(self):
        """Replace every layer by a copy that shares its variables but registers no regularization loss."""
        for l in range(len(self.sslayers)):
            self.sslayers[l] = self.sslayers[l].copy_without_regularization()

    def get_regularization_penalty(self):
        """Return the penalty currently stored on the first layer's regularizer."""
        return self.sslayers[0].regularizer.regularization_penalty

    def set_regularization_penalty(self, regularization_penalty):
        """Set the penalty on the regularizer of every layer except the last."""
        for l in range(0, len(self.sslayers) - 1):  # Every layer except of the last is regularized
            self.sslayers[l].regularizer.regularization_penalty = regularization_penalty

    def prune(self, threshold=0.001):
        """Drop hidden neurons whose largest absolute incoming weight/bias is below `threshold`.

        For every pair of consecutive layers the matching columns of the first
        layer's kernel, entries of its bias, and rows of the second layer's
        kernel are removed; the variables are re-created with the new shapes.
        """
        for l in range(len(self.sslayers) - 1):
            layer1 = self.sslayers[l]
            layer2 = self.sslayers[l + 1]

            W1 = layer1.W.value()
            b1 = layer1.b.value()
            W2 = layer2.W.value()

            # One column per neuron: incoming weights plus the bias.
            weights_with_biases = tf.concat([W1, tf.reshape(b1, (1, -1))], axis=0)
            neurons_are_active = tf.math.reduce_max(tf.abs(weights_with_biases), axis=0) >= threshold
            active_neurons_indices = tf.reshape(tf.where(neurons_are_active), (-1,))

            new_W1 = tf.gather(W1, active_neurons_indices, axis=1)
            new_b1 = tf.gather(b1, active_neurons_indices, axis=0)
            new_W2 = tf.gather(W2, active_neurons_indices, axis=0)

            layer1.W = tf.Variable(name='W', initial_value=new_W1, trainable=True)
            layer1.b = tf.Variable(name='b', initial_value=new_b1, trainable=True)
            layer2.W = tf.Variable(name='W', initial_value=new_W2, trainable=True)

    def grow(self, percentage, min_new_neurons=5, scaling_factor=0.001):
        """Append new neurons to every hidden layer.

        Each hidden layer receives `max(min_new_neurons, int(size * percentage))`
        new neurons. Their incoming and outgoing weights are drawn from the
        layers' kernel initializers and multiplied by `scaling_factor`.
        """
        for l in range(len(self.sslayers) - 1):
            layer1 = self.sslayers[l]
            layer2 = self.sslayers[l + 1]

            W1 = layer1.W.value()
            b1 = layer1.b.value()
            W2 = layer2.W.value()

            n_new_neurons = max(min_new_neurons, int(W1.shape[1] * percentage))

            # The initializer is called with the shape of the *grown* matrix
            # and only the slice belonging to the new neurons is kept.
            W1_growth = layer1.W_init(shape=(W1.shape[0], W1.shape[1] + n_new_neurons), dtype=dtype)[:, -n_new_neurons:] * scaling_factor
            b1_growth = layer1.b_init(shape=(n_new_neurons,), dtype=dtype)
            W2_growth = layer2.W_init(shape=(W2.shape[0] + n_new_neurons, W2.shape[1]), dtype=dtype)[-n_new_neurons:, :] * scaling_factor  # TODO is it better to be multiplying here by scaling_factor? It does help with not increasing the max weights of existing neurons when new neurons are added.

            new_W1 = tf.concat([W1, W1_growth], axis=1)
            new_b1 = tf.concat([b1, b1_growth], axis=0)
            new_W2 = tf.concat([W2, W2_growth], axis=0)

            # Same variable names as used in SSLayer.__init__ and prune().
            layer1.W = tf.Variable(name='W', initial_value=new_W1, trainable=True)
            layer1.b = tf.Variable(name='b', initial_value=new_b1, trainable=True)
            layer2.W = tf.Variable(name='W', initial_value=new_W2, trainable=True)

    @staticmethod
    def get_param_string(weights, bias):
        """Encode each neuron's largest absolute parameter as one digit.

        The digit is the negated decimal order of magnitude of the neuron's
        largest absolute incoming weight/bias: 0 for values >= 1, 1 for
        [0.1, 1), 2 for [0.01, 0.1), and so on. The digit is capped at 9
        (also used for an all-zero neuron) so that every neuron takes exactly
        one character.
        """
        weights_with_bias = tf.concat([weights, tf.reshape(bias, (1, -1))], axis=0)
        max_parameters = tf.math.reduce_max(tf.abs(weights_with_bias), axis=0).numpy()
        # log10(0) is -inf; silence the warning, the clip below handles it.
        with np.errstate(divide='ignore'):
            magnitudes = np.floor(np.log10(max_parameters))
        magnitudes = np.clip(magnitudes, -9, 0)
        return "".join(str(int(-m)) for m in magnitudes)

    def print_neurons(self):
        """Print one `get_param_string` line per layer except the last."""
        for layer in self.sslayers[:-1]:
            print(self.get_param_string(layer.W, layer.b))

    @staticmethod
    def _mean_loss_and_accuracy(y_true, y_pred):
        """Return mean sparse categorical cross-entropy and accuracy as Python floats."""
        loss = float(tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred)))
        accuracy = float(tf.reduce_mean(tf.keras.metrics.sparse_categorical_accuracy(y_true, y_pred)))
        return loss, accuracy

    def evaluate(self, x, y, validation_data):
        """Return `(loss, accuracy, val_loss, val_accuracy)`.

        Args:
            x, y: Training inputs and integer labels.
            validation_data: `(x_val, y_val)` pair.
        """
        x_val = validation_data[0]
        y_val = validation_data[1]

        loss, accuracy = self._mean_loss_and_accuracy(y, self(x))
        val_loss, val_accuracy = self._mean_loss_and_accuracy(y_val, self(x_val))

        return loss, accuracy, val_loss, val_accuracy

    def print_epoch_statistics(self, x, y, validation_data, print_neurons):
        """Print metrics, current penalty and hidden layer sizes; optionally the neuron strings."""
        loss, accuracy, val_loss, val_accuracy = self.evaluate(x, y, validation_data)
        # Read the penalty from this instance rather than from a notebook-level variable.
        print(f"loss: {loss} - accuracy: {accuracy} - val_loss: {val_loss} - val_accuracy: {val_accuracy} - penalty: {self.get_regularization_penalty()}")
        hidden_layer_sizes = self.get_hidden_layer_sizes()
        print(f"hidden layer sizes: {hidden_layer_sizes}, total neurons: {sum(hidden_layer_sizes)}")
        if print_neurons:
            self.print_neurons()

    def update_history(self, x, y, validation_data, history):
        """Append the current metrics to the lists in `history`."""
        loss, accuracy, val_loss, val_accuracy = self.evaluate(x, y, validation_data)
        history['loss'].append(loss)
        history['accuracy'].append(accuracy)
        history['val_loss'].append(val_loss)
        history['val_accuracy'].append(val_accuracy)

    def fit(self, x, y, optimizer, epochs, self_scaling_epochs, batch_size, min_new_neurons, validation_data, pruning_threshold=0.001, 
            regularization_penalty_multiplier=1., stall_coefficient=1, growth_percentage=0.2, mini_epochs_per_epoch=1, verbose=True, print_neurons=False):
        """Train the model with a custom loop that grows and prunes the hidden layers.

        Args:
            x, y: Training inputs and integer class labels.
            optimizer: Optimizer whose `apply_gradients` is called per batch.
            epochs: Total number of epochs.
            self_scaling_epochs: Number of initial epochs in which the layers
                are grown before and pruned after the optimisation pass. At
                epoch index `self_scaling_epochs` regularization is removed.
            batch_size: Mini-batch size.
            min_new_neurons: Lower bound on neurons added per layer per epoch.
            validation_data: `(x_val, y_val)` pair.
            pruning_threshold: Threshold for `prune`; also used as the
                `scaling_factor` of `grow`.
            regularization_penalty_multiplier: Factor applied to the penalty
                the first time the validation loss stalls (until it improves again).
            stall_coefficient: Validation loss counts as stalled when it is
                >= `best_val_loss * stall_coefficient`.
            growth_percentage: Relative growth passed to `grow`.
            mini_epochs_per_epoch: Passes over the training data per epoch.
            verbose: Print per-epoch statistics.
            print_neurons: Additionally print the neuron magnitude strings.

        Returns:
            dict with lists 'loss', 'accuracy', 'val_loss', 'val_accuracy',
            one entry per epoch.
        """
        train_dataset = tf.data.Dataset.from_tensor_slices((x, y))
        train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size)

        history = {
            'loss': list(),
            'accuracy': list(),
            'val_loss': list(),
            'val_accuracy': list(),
        }

        best_val_loss = np.inf
        training_stalled = False
        for epoch in range(epochs):
            if verbose:
                print("##########################################################")
                print(f"Epoch {epoch + 1}/{epochs}")

            if epoch < self_scaling_epochs:
                if verbose:
                    print("Before growing:")
                    self.print_epoch_statistics(x, y, validation_data, print_neurons)

                _, _, val_loss, _ = self.evaluate(x, y, validation_data)
                if val_loss >= best_val_loss * stall_coefficient:
                    # Scale the penalty only once per stall, not on every stalled epoch.
                    if not training_stalled:
                        penalty = self.get_regularization_penalty() * regularization_penalty_multiplier
                        self.set_regularization_penalty(penalty)
                        training_stalled = True
                else:
                    best_val_loss = val_loss
                    training_stalled = False

                self.grow(percentage=growth_percentage, min_new_neurons=min_new_neurons, scaling_factor=pruning_threshold)
                if verbose:
                    print("After growing:")
                    self.print_epoch_statistics(x, y, validation_data, print_neurons)

            if epoch == self_scaling_epochs:
                self.remove_regularization()

            for _ in range(mini_epochs_per_epoch):
                for x_batch, y_batch in train_dataset:
                    with tf.GradientTape() as tape:
                        y_pred = self(x_batch, training=True)
                        loss_value = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(y_batch, y_pred))
                        # Add the regularization losses registered by the layers.
                        loss_value += sum(self.losses)

                    grads = tape.gradient(loss_value, self.trainable_variables)
                    optimizer.apply_gradients(zip(grads, self.trainable_variables))

            if epoch < self_scaling_epochs:
                if verbose:
                    print("Before pruning:")
                    self.print_epoch_statistics(x, y, validation_data, print_neurons)
                self.prune(threshold=pruning_threshold)
                if verbose:
                    print("After pruning:")
                    self.print_epoch_statistics(x, y, validation_data, print_neurons)
            else:
                if verbose:
                    self.print_epoch_statistics(x, y, validation_data, print_neurons)

            self.update_history(x, y, validation_data, history)

        return history

# Small datasets

## Glass dataset

**Here the results of an auto-sizing network are very good compared to a static network, with the accuracy typically improved by several percent:**

In [None]:
# Training schedule for the glass experiments.
batch_size = 32
epochs = 60                # total number of training epochs
self_scaling_epochs = 40   # initial epochs in which layers are grown and pruned
min_new_neurons = 50       # lower bound on neurons added per layer when growing

In [None]:
%%time

# The input width is read from the data so that the first kernel always matches
# the loaded feature matrix (the glass K-fold cell uses 10 inputs, not 18).
model = SSModel(layer_sizes=[X_train_norm.shape[1], 100, 100, 100, 100, 8], activation='selu', regularization_penalty=0.001,
                regularization_method='weighted_l1', kernel_initializer='lecun_normal')
optimizer = tf.keras.optimizers.Adam(learning_rate=0.002)

model.fit(X_train_norm, y_train, optimizer, epochs, self_scaling_epochs, batch_size,
            min_new_neurons, validation_data=(X_test_norm, y_test), pruning_threshold=0.01, print_neurons=True)

In [None]:
%%time

# Static baseline: SELU layers of fixed width followed by a softmax output.
# NOTE(review): the leading Dense(10) is a real hidden layer here, not an input
# specification -- confirm the extra layer is intended for the comparison.
model = tf.keras.models.Sequential(
    [
        tf.keras.layers.Dense(width, activation='selu', kernel_initializer='lecun_normal')
        for width in (10, 66, 13, 10, 27)
    ]
    + [tf.keras.layers.Dense(8, activation='softmax', kernel_initializer='lecun_normal')]
)

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.002),
    metrics=['accuracy'],
)

model.fit(X_train_norm, y_train, epochs=60, validation_data=(X_test_norm, y_test))

In [None]:
%%time

from sklearn.model_selection import KFold

# 10-fold cross-validation of the self-scaling model.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

best_val_accuracies = list()
layer_sizes = list()
for train_index, test_index in kf.split(X_norm):
    xtrain, xtest = X_norm[train_index], X_norm[test_index]
    ytrain, ytest = y[train_index], y[test_index]

    model = SSModel(layer_sizes=[10, 100, 100, 100, 100, 8], activation='selu', regularization_penalty=0.001,
                    regularization_method='weighted_l1', kernel_initializer='lecun_normal')
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.002)
    # Train and validate on the current fold (not on the fixed hold-out split),
    # and silence the per-epoch output with `verbose`, the keyword SSModel.fit accepts.
    history = model.fit(xtrain, ytrain, optimizer, epochs, self_scaling_epochs, batch_size,
              min_new_neurons, validation_data=(xtest, ytest), pruning_threshold=0.01, verbose=False)
    best_val_accuracies.append(max(history['val_accuracy']))
    layer_sizes.append(model.get_layer_sizes())

print(f'val accuracies: {best_val_accuracies}')
print(f'mean val accuracy: {np.mean(best_val_accuracies)}')

# Average each layer position over the folds.
print(f'mean layer sizes: {[np.mean(layer) for layer in list(zip(*layer_sizes))]}')

val accuracies: [0.9846153855323792, 0.9692307710647583, 0.9692307710647583, 0.9538461565971375, 0.9538461565971375, 0.9692307710647583, 0.9692307710647583, 0.9538461565971375, 0.9692307710647583, 0.9692307710647583]
mean val accuracy: 0.9661538481712342
mean layer sizes: [10.0, 61.8, 11.2, 13.4, 24.7, 8.0]
CPU times: user 56.7 s, sys: 996 ms, total: 57.7 s
Wall time: 56.6 s


In [None]:
%%time

from sklearn.model_selection import KFold

# 10-fold cross-validation of the static baseline network.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

best_val_accuracies = list()
for train_index, test_index in kf.split(X_norm):
    xtrain, xtest = X_norm[train_index], X_norm[test_index]
    ytrain, ytest = y[train_index], y[test_index]

    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(10, activation='selu', kernel_initializer='lecun_normal'),
        tf.keras.layers.Dense(62, activation='selu', kernel_initializer='lecun_normal'),
        tf.keras.layers.Dense(11, activation='selu', kernel_initializer='lecun_normal'),
        tf.keras.layers.Dense(13, activation='selu', kernel_initializer='lecun_normal'),
        tf.keras.layers.Dense(24, activation='selu', kernel_initializer='lecun_normal'),
        tf.keras.layers.Dense(8, activation='softmax', kernel_initializer='lecun_normal'),
    ])

    model.compile(loss='sparse_categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=0.002), metrics=['accuracy'])

    # Train and validate on the current fold rather than on the fixed hold-out split.
    history = model.fit(xtrain, ytrain, epochs=60, validation_data=(xtest, ytest), verbose=0)
    best_val_accuracies.append(max(history.history['val_accuracy']))

print(f'val accuracies: {best_val_accuracies}')
print(f'mean val accuracy: {np.mean(best_val_accuracies)}')

val accuracies: [0.9230769276618958, 0.9384615421295166, 0.9538461565971375, 0.9076923131942749, 0.9076923131942749, 0.892307698726654, 0.9230769276618958, 0.9538461565971375, 0.9384615421295166, 0.9230769276618958]
mean val accuracy: 0.9261538505554199
CPU times: user 21.5 s, sys: 1.36 s, total: 22.9 s
Wall time: 20.5 s


## Vehicle dataset

In [None]:
# Training schedule for the vehicle experiments.
batch_size = 32
epochs = 60                # total number of training epochs
self_scaling_epochs = 40   # initial epochs in which layers are grown and pruned
min_new_neurons = 50       # lower bound on neurons added per layer when growing

In [None]:
%%time

model = SSModel(layer_sizes=[18, 100, 100, 100, 100, 4], activation='selu', regularization_penalty=0.0001,
                regularization_method='weighted_l1', kernel_initializer='lecun_normal')
optimizer = tf.keras.optimizers.Adam(learning_rate=0.002)

# Training goes through SSModel.fit, with the same arguments as in the other
# dataset sections; no standalone `train_model` function is defined in this notebook.
model.fit(X_train_norm, y_train, optimizer, epochs, self_scaling_epochs, batch_size,
          min_new_neurons, validation_data=(X_test_norm, y_test), pruning_threshold=0.01, print_neurons=True)

##########################################################
Epoch 1/60
Before growing:
loss: 1.556831955909729 - accuracy: 0.19763512909412384 - val_loss: 1.6075917482376099 - val_accuracy: 0.13385826349258423 - penalty: 0.0001
hidden layer sizes: [100, 100, 100, 100], total neurons: 400
1111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111
1111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111
1111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111
1111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111
After growing:
loss: 1.5568643808364868 - accuracy: 0.19763512909412384 - val_loss: 1.6076170206069946 - val_accuracy: 0.13385826349258423 - penalty: 0.0001
hidden layer sizes: [150, 150, 150, 150], total neurons: 600
111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111

In [None]:
%%time

# Static baseline: SELU layers of fixed width followed by a softmax output.
# NOTE(review): the first width (10) and the 8 output units differ from the
# [18, ..., 4] sizes used for the vehicle SSModel -- confirm they are intended.
model = tf.keras.models.Sequential(
    [
        tf.keras.layers.Dense(width, activation='selu', kernel_initializer='lecun_normal')
        for width in (10, 114, 49, 72, 58)
    ]
    + [tf.keras.layers.Dense(8, activation='softmax', kernel_initializer='lecun_normal')]
)

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.002),
    metrics=['accuracy'],
)

model.fit(X_train_norm, y_train, epochs=60, validation_data=(X_test_norm, y_test))

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60
CPU times: user 4.89 s, sys: 385 ms, total: 5.28 s
Wall time: 5.55 s


In [None]:
%%time

from sklearn.model_selection import KFold

# 10-fold cross-validation of the self-scaling model.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

best_val_accuracies = list()
layer_sizes = list()
for train_index, test_index in kf.split(X_norm):
    xtrain, xtest = X_norm[train_index], X_norm[test_index]
    ytrain, ytest = y[train_index], y[test_index]

    model = SSModel(layer_sizes=[18, 100, 100, 100, 100, 4], activation='selu', regularization_penalty=0.0001,
                    regularization_method='weighted_l1', kernel_initializer='lecun_normal')
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.002)
    # Train and validate on the current fold (not on the fixed hold-out split),
    # and silence the per-epoch output with `verbose`, the keyword SSModel.fit accepts.
    history = model.fit(xtrain, ytrain, optimizer, epochs, self_scaling_epochs, batch_size,
              min_new_neurons, validation_data=(xtest, ytest), pruning_threshold=0.01, verbose=False)
    best_val_accuracies.append(max(history['val_accuracy']))
    layer_sizes.append(model.get_layer_sizes())

print(f'val accuracies: {best_val_accuracies}')
print(f'mean val accuracy: {np.mean(best_val_accuracies)}')

# Average each layer position over the folds.
print(f'mean layer sizes: {[np.mean(layer) for layer in list(zip(*layer_sizes))]}')

val accuracies: [0.8503937125205994, 0.8622047305107117, 0.8464567065238953, 0.8503937125205994, 0.8661417365074158, 0.8661417365074158, 0.8779527544975281, 0.8700787425041199, 0.8582677245140076, 0.8582677245140076]
mean val accuracy: 0.86062992811203
mean layer sizes: [18.0, 93.6, 62.9, 44.0, 64.9, 4.0]
CPU times: user 2min 34s, sys: 2.23 s, total: 2min 36s
Wall time: 2min 34s


In [None]:
%%time

from sklearn.model_selection import KFold

# 10-fold cross-validation of the static baseline network.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

best_val_accuracies = list()
for train_index, test_index in kf.split(X_norm):
    xtrain, xtest = X_norm[train_index], X_norm[test_index]
    ytrain, ytest = y[train_index], y[test_index]

    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(18, activation='selu', kernel_initializer='lecun_normal'),
        tf.keras.layers.Dense(94, activation='selu', kernel_initializer='lecun_normal'),
        tf.keras.layers.Dense(63, activation='selu', kernel_initializer='lecun_normal'),
        tf.keras.layers.Dense(44, activation='selu', kernel_initializer='lecun_normal'),
        tf.keras.layers.Dense(65, activation='selu', kernel_initializer='lecun_normal'),
        tf.keras.layers.Dense(4, activation='softmax', kernel_initializer='lecun_normal'),
    ])

    model.compile(loss='sparse_categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=0.002), metrics=['accuracy'])

    # Train and validate on the current fold rather than on the fixed hold-out split.
    history = model.fit(xtrain, ytrain, epochs=60, validation_data=(xtest, ytest), verbose=0)
    best_val_accuracies.append(max(history.history['val_accuracy']))

print(f'val accuracies: {best_val_accuracies}')
print(f'mean val accuracy: {np.mean(best_val_accuracies)}')

val accuracies: [0.8779527544975281, 0.8818897604942322, 0.9055117964744568, 0.874015748500824, 0.8858267664909363, 0.874015748500824, 0.8858267664909363, 0.8818897604942322, 0.8779527544975281, 0.8661417365074158]
mean val accuracy: 0.8811023592948913
CPU times: user 41.5 s, sys: 2.81 s, total: 44.3 s
Wall time: 51.2 s


## Ionosphere dataset

In [None]:
# Training schedule for the ionosphere experiments.
batch_size = 32
epochs = 60                # total number of training epochs
self_scaling_epochs = 40   # initial epochs in which layers are grown and pruned
min_new_neurons = 50       # lower bound on neurons added per layer when growing

In [None]:
%%time

# Self-scaling model for the ionosphere data: 34 inputs, four hidden layers
# starting at 100 neurons each, 2 output classes.
model = SSModel(
    layer_sizes=[34, 100, 100, 100, 100, 2],
    activation='selu',
    regularization_penalty=0.0001,
    regularization_method='weighted_l1',
    kernel_initializer='lecun_normal',
)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.002)

# Train on the hold-out split, printing the per-neuron magnitude strings.
model.fit(
    x=X_train_norm,
    y=y_train,
    optimizer=optimizer,
    epochs=epochs,
    self_scaling_epochs=self_scaling_epochs,
    batch_size=batch_size,
    min_new_neurons=min_new_neurons,
    validation_data=(X_test_norm, y_test),
    pruning_threshold=0.01,
    print_neurons=True,
)

##########################################################
Epoch 1/60
Before growing:
loss: 1.055200219154358 - accuracy: 0.5142857432365417 - val_loss: 0.9758434891700745 - val_accuracy: 0.5 - penalty: 0.0001
hidden layer sizes: [100, 100, 100, 100], total neurons: 400
1111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111
1111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111
1111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111
1111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111
After growing:
loss: 1.0551998615264893 - accuracy: 0.5142857432365417 - val_loss: 0.9758460521697998 - val_accuracy: 0.5 - penalty: 0.0001
hidden layer sizes: [150, 150, 150, 150], total neurons: 600
1111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111333333333333333333333333

In [None]:
%%time

from sklearn.model_selection import KFold

# 10-fold cross-validation of the self-scaling model.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

best_val_accuracies = list()
layer_sizes = list()
for train_index, test_index in kf.split(X_norm):
    xtrain, xtest = X_norm[train_index], X_norm[test_index]
    ytrain, ytest = y[train_index], y[test_index]

    model = SSModel(layer_sizes=[34, 100, 100, 100, 100, 2], activation='selu', regularization_penalty=0.0001,
                    regularization_method='weighted_l1', kernel_initializer='lecun_normal')
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.002)
    # Train and validate on the current fold rather than on the fixed hold-out split.
    history = model.fit(xtrain, ytrain, optimizer, epochs, self_scaling_epochs, batch_size,
              min_new_neurons, validation_data=(xtest, ytest), pruning_threshold=0.01, verbose=False)
    best_val_accuracies.append(max(history['val_accuracy']))
    layer_sizes.append(model.get_layer_sizes())

print(f'val accuracies: {best_val_accuracies}')
print(f'mean val accuracy: {np.mean(best_val_accuracies)}')

# Average each layer position over the folds.
print(f'mean layer sizes: {[np.mean(layer) for layer in list(zip(*layer_sizes))]}')

val accuracies: [0.9245283007621765, 0.9245283007621765, 0.9245283007621765, 0.9150943160057068, 0.9339622855186462, 0.9245283007621765, 0.9150943160057068, 0.9245283007621765, 0.9245283007621765, 0.8962264060974121]
mean val accuracy: 0.9207547128200531
mean layer sizes: [34.0, 38.2, 19.7, 14.9, 20.0, 2.0]
CPU times: user 1min 18s, sys: 1.3 s, total: 1min 19s
Wall time: 1min 18s


In [None]:
%%time

from sklearn.model_selection import KFold

# 10-fold cross-validation of the static baseline network.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

best_val_accuracies = list()
for train_index, test_index in kf.split(X_norm):
    xtrain, xtest = X_norm[train_index], X_norm[test_index]
    ytrain, ytest = y[train_index], y[test_index]

    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(34, activation='selu', kernel_initializer='lecun_normal'),
        tf.keras.layers.Dense(38, activation='selu', kernel_initializer='lecun_normal'),
        tf.keras.layers.Dense(20, activation='selu', kernel_initializer='lecun_normal'),
        tf.keras.layers.Dense(15, activation='selu', kernel_initializer='lecun_normal'),
        tf.keras.layers.Dense(20, activation='selu', kernel_initializer='lecun_normal'),
        tf.keras.layers.Dense(2, activation='softmax', kernel_initializer='lecun_normal'),
    ])

    model.compile(loss='sparse_categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=0.002), metrics=['accuracy'])

    # Train and validate on the current fold rather than on the fixed hold-out split.
    history = model.fit(xtrain, ytrain, epochs=60, validation_data=(xtest, ytest), verbose=0)
    best_val_accuracies.append(max(history.history['val_accuracy']))

print(f'val accuracies: {best_val_accuracies}')
print(f'mean val accuracy: {np.mean(best_val_accuracies)}')

val accuracies: [0.9150943160057068, 0.9150943160057068, 0.9245283007621765, 0.9150943160057068, 0.9339622855186462, 0.9433962106704712, 0.9245283007621765, 0.9339622855186462, 0.9150943160057068, 0.9150943160057068]
mean val accuracy: 0.9235848963260651
CPU times: user 25.6 s, sys: 1.69 s, total: 27.3 s
Wall time: 25.1 s
