In [1]:
import tensorflow as tf
import numpy as np

In [112]:
mnist = tf.keras.datasets.mnist
(X_train, y_train), (X_test, y_test) = mnist.load_data()

X_train = X_train.astype('float32') / 255.0
y_train = y_train.astype('float32')
X_test = X_test.astype('float32')  / 255.0
y_test = y_test.astype('float32')

X_train = np.reshape(X_train, (-1, 784))
X_test = np.reshape(X_test, (-1, 784))

In [312]:
class SSRegularizer(tf.keras.regularizers.Regularizer):
    def __init__(self, l1):
        self.l1 = l1

    def __call__(self, x):
        scaling_vector = tf.cumsum(tf.constant(self.l1, shape=x.shape[-1]), axis=0) - 0.1
        return tf.reduce_sum(scaling_vector * tf.abs(x))

    def get_config(self):
        return {'l1': float(self.l1)}


class SSModel(tf.keras.Model):
    def __init__(self, units, activation=None, kernel_initializer='glorot_uniform', bias_initializer='zeros'):
        super().__init__()
        self.units = units
        self.activation1 = tf.keras.activations.get(activation)
        self.activation2 = tf.keras.activations.get('softmax')
        self.kernel_initializer = tf.keras.initializers.get(kernel_initializer)
        self.bias_initializer = tf.keras.initializers.get(bias_initializer)
        self.regularizer = SSRegularizer(0.01)
        
        self.W1 = tf.Variable(
            name='W1',
            initial_value=self.kernel_initializer(shape=(784, self.units), dtype='float32'),
            trainable=True)
        
        self.b1 = tf.Variable(
            name='b1',
            initial_value=self.bias_initializer(shape=(self.units,), dtype='float32'),
            trainable=True)
        
        self.W2 = tf.Variable(
            name='W2',
            initial_value=self.kernel_initializer(shape=(self.units, 10), dtype='float32'),
            trainable=True)
        
        self.b2 = tf.Variable(
            name='b2',
            initial_value=self.bias_initializer(shape=(10,), dtype='float32'),
            trainable=True)
        
        self.add_loss(lambda: self.regularizer(self.W1))
        self.add_loss(lambda: self.regularizer(self.b1))
        self.add_loss(lambda: self.regularizer(self.W2))
        self.add_loss(lambda: self.regularizer(self.b2))

    def call(self, inputs):
        A1 = self.activation1(tf.matmul(inputs, self.W1) + self.b1)
        A2 = self.activation2(tf.matmul(A1, self.W2) + self.b2)

        return A2
    
    def prune(self, threshold=0.001):
        W1 = self.W1.value()
        b1 = self.b1.value()
        W2 = self.W2.value()
        
        weights_with_biases = tf.concat([W1, tf.reshape(b1, (1, -1))], axis=0)
        neurons_are_active = tf.math.reduce_max(weights_with_biases, axis=0) >= threshold
        active_neurons_indices = tf.reshape(tf.where(neurons_are_active), (-1,))
        
        new_W1 = tf.gather(W1, active_neurons_indices, axis=1)
        new_b1 = tf.gather(b1, active_neurons_indices, axis=0)
        new_W2 = tf.gather(W2, active_neurons_indices, axis=0)
        
        self.W1 = tf.Variable(name='W1', initial_value=new_W1, trainable=True)
        self.b1 = tf.Variable(name='b1', initial_value=new_b1, trainable=True)
        self.W2 = tf.Variable(name='W2', initial_value=new_W2, trainable=True)

In [313]:
def train_model(model, optimizer, epochs, batch_size, train_dataset):
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")

        for step, (x_batch, y_batch) in enumerate(train_dataset):
            with tf.GradientTape() as tape:
                y_pred = model(x_batch, training=True)
                loss_value = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(y_batch, y_pred))
                loss_value += sum(model.losses)

            grads = tape.gradient(loss_value, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))
        
        # Show epoch statistics
        y_pred = model(X_train)
        loss_value = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(y_train, y_pred))
        accuracy = tf.reduce_mean(tf.keras.metrics.sparse_categorical_accuracy(y_train, y_pred))
        print(f"loss: {loss_value} - accuracy: {accuracy} - units: {model.W1.shape[1]}")
        
        model.prune()
        
        # Show epoch statistics
        y_pred = model(X_train)
        loss_value = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(y_train, y_pred))
        accuracy = tf.reduce_mean(tf.keras.metrics.sparse_categorical_accuracy(y_train, y_pred))
        print(f"loss: {loss_value} - accuracy: {accuracy} - units: {model.W1.shape[1]}")

In [314]:
epochs = 5
batch_size = 32

train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size)

model = SSModel(units=100, activation='relu')
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)

train_model(model, optimizer, 10, batch_size, train_dataset)

Epoch 1/10
loss: 1.7096461057662964 - accuracy: 0.5578166842460632 - units: 100
loss: 1.7096459865570068 - accuracy: 0.5578166842460632 - units: 97
Epoch 2/10
loss: 1.900716781616211 - accuracy: 0.6493833065032959 - units: 97
loss: 1.90078604221344 - accuracy: 0.6493833065032959 - units: 93
Epoch 3/10
loss: 2.4009337425231934 - accuracy: 0.6944166421890259 - units: 93
loss: 2.4010181427001953 - accuracy: 0.6943833231925964 - units: 89
Epoch 4/10
loss: 3.115446090698242 - accuracy: 0.7389833331108093 - units: 89
loss: 3.115446090698242 - accuracy: 0.7389833331108093 - units: 87
Epoch 5/10
loss: 4.33226203918457 - accuracy: 0.6991166472434998 - units: 87
loss: 4.33226203918457 - accuracy: 0.6991166472434998 - units: 84
Epoch 6/10
loss: 5.3541178703308105 - accuracy: 0.6970666646957397 - units: 84
loss: 5.3541178703308105 - accuracy: 0.6970666646957397 - units: 83
Epoch 7/10
loss: 6.86423397064209 - accuracy: 0.7108333110809326 - units: 83
loss: 6.86423397064209 - accuracy: 0.710833311080

In [236]:
model.prune()
train_model(model, optimizer, 1, batch_size, train_dataset)

Epoch 1/1
loss: 2.3635358810424805 - accuracy: 0.7073333263397217


In [221]:
model.W1[:, :10]

<tf.Tensor: shape=(784, 10), dtype=float32, numpy=
array([[-8.45908   ,  7.518516  ,  6.5726666 , ...,  1.9297397 ,
        -0.9787072 , -0.01317163],
       [ 8.437461  , -7.5142074 ,  6.5818934 , ...,  1.8981346 ,
        -0.9950207 ,  0.0211658 ],
       [-8.453113  , -7.516163  ,  6.6172676 , ..., -1.9027253 ,
         0.9439355 ,  0.02779727],
       ...,
       [ 8.446647  , -7.5786667 , -6.5709434 , ..., -1.9094808 ,
        -0.9905917 ,  0.07739742],
       [ 8.504931  ,  7.5016556 , -6.6205573 , ...,  1.9115169 ,
        -0.9903703 , -0.05508175],
       [-8.475748  ,  7.5195327 , -6.6430144 , ...,  1.9527134 ,
        -0.9456445 , -0.01849682]], dtype=float32)>

In [183]:
tf.Variable(name='test',
            initial_value=model.b1[:10],
            trainable=True)

<tf.Variable 'test:0' shape=(10,) dtype=float32, numpy=
array([ 0.10325973,  0.28163418, -0.02973413, -0.09620668, -0.07770138,
        0.19860472,  0.07667638,  0.07115997,  0.28643882,  0.11287516],
      dtype=float32)>

In [223]:
model.b1

<tf.Variable 'b1:0' shape=(100,) dtype=float32, numpy=
array([ 7.7180185e+00, -9.5333204e+00, -8.4282074e+00,  3.6814294e+00,
        3.0408814e+00, -5.1143255e+00,  1.5648246e+00, -2.9887168e+00,
        3.1807947e-01,  1.5764303e+00,  6.2338686e-01,  1.9603917e-01,
        2.1873888e-02,  9.9807698e-03,  3.5673131e-03, -5.0638691e-03,
       -3.7083528e-03, -2.0554103e-03,  8.4837107e-04,  4.6422998e-03,
        1.2599836e-03,  4.3248530e-03, -1.4403258e-03, -5.2472483e-03,
       -2.3949654e-03, -1.5769964e-03, -3.8820663e-03, -6.2360056e-04,
        9.2419318e-04,  2.4113364e-03, -1.5476472e-03, -4.8201147e-04,
       -4.8491023e-03, -2.9747889e-03, -5.5699162e-03, -4.4958424e-03,
        2.9769551e-04, -3.5081524e-05, -2.5447488e-03, -7.2792927e-03,
       -5.0031999e-04,  3.0598373e-03, -5.1800897e-03, -1.1878333e-02,
       -4.1770102e-03, -3.9493092e-03,  4.6534883e-04, -3.4572212e-03,
        1.1882372e-03, -1.0890003e-02,  7.1908114e-03,  7.8994385e-04,
       -9.4388379e-04,

In [222]:
np.amax(model.W1, 0)

array([8.54801464e+00, 7.58154488e+00, 6.67377949e+00, 5.71237659e+00,
       4.76999712e+00, 3.84227133e+00, 3.40639615e+00, 2.08757424e+00,
       1.39305019e+00, 5.54792762e-01, 5.46755552e-01, 3.44100893e-01,
       4.29232270e-01, 3.53028566e-01, 6.19301246e-03, 1.32577680e-03,
       6.67357957e-03, 2.54884213e-01, 4.66621574e-03, 5.53519232e-03,
       2.84283306e-03, 8.22989084e-03, 5.04944241e-03, 1.39534415e-03,
       6.41245907e-03, 5.24821132e-03, 2.78953370e-03, 7.86001887e-03,
       2.67719897e-03, 6.63585402e-03, 2.09694263e-03, 2.71954667e-03,
       2.29622913e-03, 2.39135046e-03, 2.49643694e-03, 2.58640596e-03,
       7.78836431e-03, 2.79369904e-03, 7.04646949e-03, 2.99354363e-03,
       3.09780124e-03, 6.52314164e-03, 5.12211584e-03, 3.87223973e-03,
       3.49415187e-03, 3.58668645e-03, 3.69629939e-03, 4.31363657e-03,
       4.35943343e-03, 4.78814356e-03, 7.92876817e-03, 5.71044441e-03,
       4.28779563e-03, 4.39967215e-03, 4.49936511e-03, 4.59608994e-03,
      

In [119]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dense(10, activation='softmax')
])

In [120]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=tf.keras.optimizers.SGD(learning_rate=0.01), metrics=['accuracy'])

In [121]:
model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fa3c2899880>