# Introduction to Keras for Researchers
https://keras.io/getting_started/intro_to_keras_for_researchers/

In [12]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

In [2]:
# Differentiate y = x**2 at x = 3.0 by recording the computation on a tape.
x = tf.Variable(3.0)
with tf.GradientTape() as tape:
    y = x**2

# Derivative of y with respect to x, evaluated at the current value of x.
dy_dx = tape.gradient(y, x)

In [3]:
dy_dx

<tf.Tensor: shape=(), dtype=float32, numpy=6.0>

In [45]:
# Parameters of a tiny linear model: a (3, 2) kernel and a length-2 bias.
w = tf.Variable(tf.random.normal((3, 2)), name = 'w')
b = tf.Variable(tf.zeros(2, dtype=tf.float32), name = 'b')
# A single input row, given as a plain nested Python list.
x = [[1., 2., 3.]]
# Matrix product of the one-row input with the kernel; the result has shape (1, 2).
x @ w

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[-0.3812343,  1.9505316]], dtype=float32)>

In [6]:
x @ w + b

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[ 4.2237244, -0.8030729]], dtype=float32)>

In [7]:
b

<tf.Variable 'b:0' shape=(2,) dtype=float32, numpy=array([0., 0.], dtype=float32)>

https://alysivji.github.io/python-matrix-multiplication-operator.html
> PEP 465 introduced the @ infix operator that is designated to be used for matrix multiplication. The acceptance and implementation of this proposal in Python 3.5 was a signal to the scientific community that Python is taking its role as a numerical computation language very seriously.

In [46]:
# persistent=True keeps the tape usable for more than one gradient() call.
with tf.GradientTape(persistent=True) as tape:
    y = x @ w + b
    loss = tf.reduce_mean(y**2)

# Gradients of the scalar loss with respect to w and b, returned in that order.
[dl_dw, dl_db] = tape.gradient(loss, [w, b])

The `persistent` argument is described in the TensorFlow documentation as:
> Boolean controlling whether a persistent gradient tape is created. False by default, which means at most one call can be made to the gradient() method on this object.

https://www.tensorflow.org/api_docs/python/tf/GradientTape

In [13]:
?tf.reduce_mean

In [14]:
?tf.GradientTape

In [47]:
dl_dw

<tf.Tensor: shape=(3, 2), dtype=float32, numpy=
array([[-0.3812343,  1.9505316],
       [-0.7624686,  3.9010632],
       [-1.1437029,  5.851595 ]], dtype=float32)>

In [48]:
dl_db

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([-0.3812343,  1.9505316], dtype=float32)>

In [26]:
# Two freshly created variables. They reuse the names 'w' and 'b' but are
# different objects from the module-level `w` and `b`.
my_vars = {
    'w': tf.Variable(tf.random.normal((3, 2)), name='w'),
    'b': tf.Variable(tf.zeros(2, dtype=tf.float32), name='b')
}

In [21]:
# Ask the existing tape for gradients with respect to a dict of variables;
# the result is a dict with the same keys as `my_vars`.
grad = tape.gradient(loss, my_vars)
grad['b']

In [27]:
my_vars

{'w': <tf.Variable 'w:0' shape=(3, 2) dtype=float32, numpy=
 array([[-0.5689192 , -0.5657398 ],
        [-0.3709449 , -0.3143288 ],
        [-0.158459  ,  0.12605582]], dtype=float32)>,
 'b': <tf.Variable 'b:0' shape=(2,) dtype=float32, numpy=array([0., 0.], dtype=float32)>}

In [28]:
grad # both entries are None: my_vars holds new variables, not the w and b that the tape recorded

{'w': None, 'b': None}

In [32]:
# The loss is still computed from the module-level `w` and `b`, not from the
# variables stored in `my_vars`, so the tape has no path to the latter.
with tf.GradientTape(persistent=True) as tape:
    y = x @ w + b
    loss = tf.reduce_mean(y**2)
grad = tape.gradient(loss, my_vars)
grad

{'w': None, 'b': None}

In [33]:
# Here the forward pass reads the variables held in `my_vars`, so the tape
# can differentiate the loss with respect to them.
with tf.GradientTape(persistent=True) as tape:
    y = x @ my_vars['w'] + my_vars['b']
    loss = tf.reduce_mean(y**2)
grad = tape.gradient(loss, my_vars)
grad['b']

{'w': <tf.Tensor: shape=(3, 2), dtype=float32, numpy=
 array([[-1.7861859, -0.81623  ],
        [-3.5723717, -1.63246  ],
        [-5.3585577, -2.44869  ]], dtype=float32)>,
 'b': <tf.Tensor: shape=(2,), dtype=float32, numpy=array([-1.7861859, -0.81623  ], dtype=float32)>}

In [51]:
# Point the dict at the original `w` and `b` objects instead of new variables.
# NOTE(review): relies on `tape` and `loss` left behind by an earlier cell that
# computed the loss from `w` and `b` on a persistent tape.
my_vars = {
    'w': w,
    'b': b
}
grad = tape.gradient(loss, my_vars)
grad['b']

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([-0.3812343,  1.9505316], dtype=float32)>

In [50]:
# Counter-example: brand-new variables that the tape never saw, so every
# gradient returned for them is None.
my_vars = {
    'w': tf.Variable(tf.random.normal((3, 2)), name='w'),
    'b': tf.Variable(tf.zeros(2, dtype=tf.float32), name='b')
}
grad = tape.gradient(loss, my_vars)
grad

{'w': None, 'b': None}

In [36]:
?tf.keras.layers.Dense 
# or more details: https://www.tensorflow.org/api_docs/python/tf/keras/layers/Dense

In [38]:
# A Keras Dense layer with 2 output units followed by a ReLU activation.
layer = tf.keras.layers.Dense(2, activation = 'relu')
# Constant input holding one row of three values.
x = tf.constant([[1., 2., 3.]])

In [39]:
layer(x)

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0.7364416, 1.2341949]], dtype=float32)>

In [40]:
# Record the layer's forward pass and a scalar loss built from its output.
with tf.GradientTape() as tape:
    y = layer(x)
    loss = tf.reduce_mean(y**2)

# One gradient per trainable variable of the layer, in the same order.
grad = tape.gradient(loss, layer.trainable_variables)

In [41]:
grad

[<tf.Tensor: shape=(3, 2), dtype=float32, numpy=
 array([[0.7364416, 1.2341949],
        [1.4728832, 2.4683897],
        [2.2093248, 3.7025847]], dtype=float32)>,
 <tf.Tensor: shape=(2,), dtype=float32, numpy=array([0.7364416, 1.2341949], dtype=float32)>]

In [42]:
# Pair each trainable variable with its gradient and report the gradient's shape.
for var, g in zip(layer.trainable_variables, grad):
    print(f'{var.name}, shape: {g.shape}')

dense_1/kernel:0, shape: (3, 2)
dense_1/bias:0, shape: (2,)


In [1]:
?zip

In [3]:
# Gradients through Python control flow: only the branch that actually runs
# is recorded on the tape.
x = tf.constant(1.0)
v0 = tf.Variable(2.0)
v1 = tf.Variable(2.0)

with tf.GradientTape(persistent=True) as tape:
    # Explicitly ask the tape to track the constant x.
    tape.watch(x)
    # Ordinary Python branch, decided by the current value of x.
    if x > 0.0:
        result = v0
    else:
        result = v1**2

# v1 belongs to the branch that did not run, so its gradient is None.
tape.gradient(result, [v0, v1])

[<tf.Tensor: shape=(), dtype=float32, numpy=1.0>, None]

In [4]:
# The control statement itself is not differentiable: x only selects which
# Python branch runs, so no recorded operation connects `result` to x.
tape.gradient(result, x)

In [5]:
# Update a variable between tape recordings without losing the variable.
x = tf.Variable(2.0)
for epoch in range(2):
    with tf.GradientTape() as tape:
        y = x + 1

    # The printed type name shows that x is still a variable on every pass.
    print(type(x).__name__, ":", tape.gradient(y, x))
    # In-place update keeps `x` bound to the same variable object.
    x.assign_add(1) # NOT x = x + 1

ResourceVariable : tf.Tensor(1.0, shape=(), dtype=float32)
ResourceVariable : tf.Tensor(1.0, shape=(), dtype=float32)


In [12]:
# Demonstrates a broken gradient path: part of the computation leaves
# TensorFlow for NumPy, so the printed gradient is None.
x = tf.Variable([[1.0, 2.0],
                 [3.0, 4.0]], dtype=tf.float32)

with tf.GradientTape() as tape:
  x2 = x**2

  # This step is calculated with NumPy
  y = np.mean(x2, axis=0)

  # Like most ops, reduce_mean will cast the NumPy array to a constant tensor using `tf.convert_to_tensor` 
  y = tf.reduce_mean(y, axis=0)

print(tape.gradient(y, x))

None


In [9]:
?tf.convert_to_tensor

In [13]:
# The variable `x` has an integer dtype because it is built from Python ints.
x = tf.Variable([[2, 2],
                 [2, 2]])

with tf.GradientTape() as tape:
  # Casting to float does not rescue the gradient: the source `x` is still an
  # integer tensor, and gradients are only defined for floating-point sources.
  y = tf.cast(x, tf.float32)
  # Reduce the casted tensor so the cast above is actually part of the
  # computation being differentiated.
  y = tf.reduce_sum(y)

# Prints None: the path back to `x` is blocked by its integer dtype.
print(tape.gradient(y, x))

None


In [14]:
# Demonstrates that a stateful variable update is not a differentiable link.
x0 = tf.Variable(3.0)
x1 = tf.Variable(0.0)

with tf.GradientTape() as tape:
  # Update x1 = x1 + x0.
  x1.assign_add(x0)
  # The tape starts recording from x1.
  y = x1**2   # y = (x1 + x0)**2

# This doesn't work: the printed gradient is None.
print(tape.gradient(y, x0))   # dy/dx0 = 2*(x1 + x0)

None


In [17]:
# z is computed from y only, so x is not connected to the target at all.
x = tf.Variable([2., 2.])
y = tf.Variable(3.)

with tf.GradientTape() as tape:
  z = y**2
# Alternative kept for reference: pass unconnected_gradients to gradient().
# print(tape.gradient(z, x, unconnected_gradients=tf.UnconnectedGradients.ZERO))

In [18]:
print(tape.gradient(z, x))

None


https://keras.io/getting_started/intro_to_keras_for_researchers/

In [13]:
from tensorflow import keras

In [22]:
class Linear(keras.layers.Layer):
    """Affine layer ``inputs @ w + b`` whose weights are created in the constructor.

    Args:
        units: Number of output features.
        input_dim: Number of input features; needed up front because the
            kernel is allocated immediately.
    """

    def __init__(self, units=32, input_dim=32):
        super().__init__()
        # Kernel starts from a random normal draw, bias starts at zero.
        kernel_values = tf.random_normal_initializer()(
            shape=(input_dim, units), dtype="float32"
        )
        bias_values = tf.zeros_initializer()(shape=(units,), dtype="float32")
        self.w = tf.Variable(initial_value=kernel_values, trainable=True)
        self.b = tf.Variable(initial_value=bias_values, trainable=True)

    def call(self, inputs):
        """Return the affine transform of `inputs`."""
        projected = tf.matmul(inputs, self.w)
        return projected + self.b
    

In [23]:
# Map a (2, 2) batch of ones through a layer with 2 inputs and 4 outputs.
linear_layer = Linear(units = 4, input_dim=2)
y = linear_layer(tf.ones((2, 2)))

In [24]:
y

<tf.Tensor: shape=(2, 4), dtype=float32, numpy=
array([[ 0.00323859,  0.02699355, -0.07890009, -0.03770854],
       [ 0.00323859,  0.02699355, -0.07890009, -0.03770854]],
      dtype=float32)>

In [25]:
linear_layer.weights

[<tf.Variable 'Variable:0' shape=(2, 4) dtype=float32, numpy=
 array([[ 0.03596107, -0.01020826, -0.02216507, -0.04651199],
        [-0.03272248,  0.03720182, -0.05673502,  0.00880345]],
       dtype=float32)>,
 <tf.Variable 'Variable:0' shape=(4,) dtype=float32, numpy=array([0., 0., 0., 0.], dtype=float32)>]

In [26]:
a = [1, 2, 3]

In [27]:
a[-1]

3

In [28]:
class Linear(keras.layers.Layer):
    """Affine layer ``inputs @ w + b`` whose weights are created lazily.

    The input width is read from the shape handed to `build`, so only the
    number of output units has to be given to the constructor.

    Args:
        units: Number of output features.
    """

    def __init__(self, units = 32):
        super().__init__()
        self.units = units

    def build(self, input_shape):
        """Create kernel and bias; the last input axis fixes the kernel's row count."""
        in_features = input_shape[-1]
        self.w = self.add_weight(
            shape=(in_features, self.units),
            initializer="random_normal",
            trainable=True,
        )
        self.b = self.add_weight(
            shape=(self.units,),
            initializer="random_normal",
            trainable=True,
        )

    def call(self, inputs):
        """Return the affine transform of `inputs`."""
        projected = tf.matmul(inputs, self.w)
        return projected + self.b

In [31]:
# Only the output width is given; the input width comes from the first call.
linear_layer = Linear(4)
y = linear_layer(tf.ones((2,2)))

In [32]:
# Keep only the first (training) pair returned by load_data(); the second is discarded.
(x_train, y_train), _ = tf.keras.datasets.mnist.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [83]:
# Reshape the images to 60000 rows of 784 values, convert to float32 and
# divide by 255; each row is paired with its label.
dataset = tf.data.Dataset.from_tensor_slices(
    (x_train.reshape(60000, 784).astype("float32") / 255, y_train)
)
# Shuffle with a 1024-element buffer, then group into batches of 64.
dataset = dataset.shuffle(buffer_size=1024).batch(64)

In [37]:
dataset

<BatchDataset shapes: ((None, 784), (None,)), types: (tf.float32, tf.uint8)>

In [34]:
# A single affine layer with 10 outputs serves as the whole classifier.
linear_layer = Linear(10)
# from_logits=True: Linear returns raw scores with no softmax applied.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.SGD(learning_rate=1e-3)

In [38]:
# One pass over the batched dataset, updating the linear classifier by
# gradient descent on each batch.
for step, (x, y) in enumerate(dataset):
    # Forward pass and loss are recorded so they can be differentiated.
    with tf.GradientTape() as tape:
        logits = linear_layer(x)
        loss = loss_fn(y, logits)
    gradients = tape.gradient(loss, linear_layer.trainable_weights)
    optimizer.apply_gradients(zip(gradients, linear_layer.trainable_weights))
    # The loss is reported on every single step.
    print("step", step, ", loss = ", float(loss))

step 0 , loss =  1.6134514808654785
step 1 , loss =  1.674712896347046
step 2 , loss =  1.699915885925293
step 3 , loss =  1.7053558826446533
step 4 , loss =  1.6796443462371826
step 5 , loss =  1.609327793121338
step 6 , loss =  1.6452875137329102
step 7 , loss =  1.747910499572754
step 8 , loss =  1.713761329650879
step 9 , loss =  1.705289363861084
step 10 , loss =  1.6537827253341675
step 11 , loss =  1.6228864192962646
step 12 , loss =  1.7047626972198486
step 13 , loss =  1.6979721784591675
step 14 , loss =  1.7081371545791626
step 15 , loss =  1.676334261894226
step 16 , loss =  1.5624463558197021
step 17 , loss =  1.6063860654830933
step 18 , loss =  1.6761558055877686
step 19 , loss =  1.6598820686340332
step 20 , loss =  1.5930603742599487
step 21 , loss =  1.643523931503296
step 22 , loss =  1.6386659145355225
step 23 , loss =  1.6144518852233887
step 24 , loss =  1.5751562118530273
step 25 , loss =  1.6180683374404907
step 26 , loss =  1.5564169883728027
step 27 , loss =  1

step 224 , loss =  1.6357378959655762
step 225 , loss =  1.6076761484146118
step 226 , loss =  1.614443302154541
step 227 , loss =  1.4918248653411865
step 228 , loss =  1.635043740272522
step 229 , loss =  1.4426594972610474
step 230 , loss =  1.502551794052124
step 231 , loss =  1.5828381776809692
step 232 , loss =  1.462816834449768
step 233 , loss =  1.6099741458892822
step 234 , loss =  1.4631117582321167
step 235 , loss =  1.536323070526123
step 236 , loss =  1.4684323072433472
step 237 , loss =  1.5645387172698975
step 238 , loss =  1.50203275680542
step 239 , loss =  1.579676866531372
step 240 , loss =  1.5729241371154785
step 241 , loss =  1.6617279052734375
step 242 , loss =  1.5832483768463135
step 243 , loss =  1.5352771282196045
step 244 , loss =  1.4300994873046875
step 245 , loss =  1.6221139430999756
step 246 , loss =  1.4040015935897827
step 247 , loss =  1.6571340560913086
step 248 , loss =  1.6719486713409424
step 249 , loss =  1.643936038017273
step 250 , loss =  1.

step 473 , loss =  1.5096774101257324
step 474 , loss =  1.4917936325073242
step 475 , loss =  1.4625539779663086
step 476 , loss =  1.452277660369873
step 477 , loss =  1.5250884294509888
step 478 , loss =  1.422173261642456
step 479 , loss =  1.3751426935195923
step 480 , loss =  1.5598092079162598
step 481 , loss =  1.5745118856430054
step 482 , loss =  1.4099211692810059
step 483 , loss =  1.514366626739502
step 484 , loss =  1.6100928783416748
step 485 , loss =  1.4522786140441895
step 486 , loss =  1.4250667095184326
step 487 , loss =  1.5019526481628418
step 488 , loss =  1.520362377166748
step 489 , loss =  1.343189001083374
step 490 , loss =  1.5157513618469238
step 491 , loss =  1.4695336818695068
step 492 , loss =  1.4259767532348633
step 493 , loss =  1.3587489128112793
step 494 , loss =  1.4761300086975098
step 495 , loss =  1.5037891864776611
step 496 , loss =  1.4938222169876099
step 497 , loss =  1.497666597366333
step 498 , loss =  1.5001895427703857
step 499 , loss = 

step 704 , loss =  1.351839542388916
step 705 , loss =  1.357790470123291
step 706 , loss =  1.3266236782073975
step 707 , loss =  1.2991738319396973
step 708 , loss =  1.421311855316162
step 709 , loss =  1.3545928001403809
step 710 , loss =  1.413337230682373
step 711 , loss =  1.4180176258087158
step 712 , loss =  1.465873122215271
step 713 , loss =  1.3398754596710205
step 714 , loss =  1.31160306930542
step 715 , loss =  1.3461366891860962
step 716 , loss =  1.3446271419525146
step 717 , loss =  1.2679541110992432
step 718 , loss =  1.446315050125122
step 719 , loss =  1.3402596712112427
step 720 , loss =  1.2868695259094238
step 721 , loss =  1.3785521984100342
step 722 , loss =  1.3497563600540161
step 723 , loss =  1.2749032974243164
step 724 , loss =  1.3729417324066162
step 725 , loss =  1.3200230598449707
step 726 , loss =  1.449237585067749
step 727 , loss =  1.4013032913208008
step 728 , loss =  1.3648228645324707
step 729 , loss =  1.4189505577087402
step 730 , loss =  1.

In [40]:
938*64

60032

In [41]:
?dataset.shuffle

In [42]:
# Toy dataset yielding the integers 0, 1, 2, used to explore shuffle().
ds = tf.data.Dataset.range(3)

In [45]:
list(ds.as_numpy_iterator())

[0, 1, 2]

In [64]:
# Shuffle buffer as large as the dataset itself (3 elements).
# Note: this rebinds `ds`, so re-running the cell wraps the shuffled dataset again.
ds = ds.shuffle(3)

In [65]:
ds

<ShuffleDataset shapes: (), types: tf.int64>

In [69]:
list(ds.as_numpy_iterator())

[2, 1, 0]

In [73]:
?ds.batch

In [74]:
# Three stacked Dense layers: two 32-unit ReLU layers and a 10-unit output
# layer with no activation.
mlp = keras.Sequential(
    [
        keras.layers.Dense(32, activation = tf.nn.relu),
        keras.layers.Dense(32, activation = tf.nn.relu),
        keras.layers.Dense(10)
    ]
)

In [75]:
class ActivityRegularization(keras.layers.Layer):
    """Pass-through layer that registers a penalty proportional to the sum of its inputs.

    Args:
        rate: Multiplier applied to the summed inputs to form the penalty.
    """

    def __init__(self, rate=1e-2):
        super().__init__()
        self.rate = rate

    def call(self, inputs):
        """Record the penalty via `add_loss` and return `inputs` unchanged."""
        penalty = self.rate * tf.reduce_sum(inputs)
        self.add_loss(penalty)
        return inputs

In [79]:
class SparseMLP(keras.layers.Layer):
    """Two `Linear` layers with a ReLU and an activity penalty in between."""

    def __init__(self):
        super().__init__()
        self.linear_1 = Linear(32)
        self.regularization = ActivityRegularization(1e-2)
        self.linear_3 = Linear(10)

    def call(self, inputs1):
        """Return 10 scores per input row; the penalty is recorded as a side effect."""
        hidden = tf.nn.relu(self.linear_1(inputs1))
        # Identity on the values, but registers the activity penalty.
        hidden = self.regularization(hidden)
        return self.linear_3(hidden)

In [85]:
# Rebinds `mlp`, replacing the Sequential model defined earlier.
mlp = SparseMLP()

In [81]:
# One forward pass; it also populates `mlp.losses` with the activity penalty.
y = mlp(tf.ones((10, 10)))

In [82]:
mlp.losses

[<tf.Tensor: shape=(), dtype=float32, numpy=0.2529298>]

In [93]:
# One pass over the dataset, training `mlp` with the `loss_fn` and
# `optimizer` objects defined in earlier cells.
for step, (x, y) in enumerate(dataset):
    with tf.GradientTape() as tape:
        logits = mlp(x)
        loss = loss_fn(y, logits)
        # Add the penalties that layers registered during this forward pass.
        loss += sum(mlp.losses)
    gradients = tape.gradient(loss, mlp.trainable_weights)
    optimizer.apply_gradients(zip(gradients, mlp.trainable_weights))
    # Log every 100th step.
    if step % 100 == 0:
        print("step", step, ", loss = ", float(loss))

step 0 , loss =  2.3008975982666016
step 100 , loss =  2.3098599910736084
step 200 , loss =  2.298550844192505
step 300 , loss =  2.297795534133911
step 400 , loss =  2.304739236831665
step 500 , loss =  2.309346914291382
step 600 , loss =  2.3182613849639893
step 700 , loss =  2.3104445934295654
step 800 , loss =  2.3015530109405518
step 900 , loss =  2.304327964782715


with statement: https://www.geeksforgeeks.org/with-statement-in-python/

In [94]:
# Prepare our model, loss, and optimizer.
model = keras.Sequential(
    [
        keras.layers.Dense(32, activation="relu"),
        keras.layers.Dense(32, activation="relu"),
        keras.layers.Dense(10),
    ]
)
# The last Dense layer has no activation, so the loss is told to expect logits.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

@tf.function
def train_on_batch(x, y):
    """Run one optimisation step of `model` on a single batch.

    Args:
        x: Batch of input features.
        y: Labels for `x`, as expected by `loss_fn`.

    Returns:
        The loss computed on this batch before the weight update.
    """
    with tf.GradientTape() as tape:
        loss = loss_fn(y, model(x))
    # Read the weights only after the forward pass has run.
    weights = model.trainable_weights
    optimizer.apply_gradients(zip(tape.gradient(loss, weights), weights))
    return loss

# Load the training pair and rebuild the shuffled, batched dataset.
(x_train, y_train), _ = tf.keras.datasets.mnist.load_data()
dataset = tf.data.Dataset.from_tensor_slices(
    (x_train.reshape(60000, 784).astype("float32") / 255, y_train)
)
dataset = dataset.shuffle(buffer_size=1024).batch(64)

# One pass over the data; the loss is logged every 100th step.
for step, (x, y) in enumerate(dataset):
    loss = train_on_batch(x, y)
    if step % 100 == 0:
        print("Step:", step, "Loss:", float(loss))

Step: 0 Loss: 2.38885498046875
Step: 100 Loss: 0.6748959422111511
Step: 200 Loss: 0.63692307472229
Step: 300 Loss: 0.4861977994441986
Step: 400 Loss: 0.13841241598129272
Step: 500 Loss: 0.2590877413749695
Step: 600 Loss: 0.29595351219177246
Step: 700 Loss: 0.27203017473220825
Step: 800 Loss: 0.19011381268501282
Step: 900 Loss: 0.23012065887451172


In [95]:
class Dropout(keras.layers.Layer):
    """Dropout that is active only when `training` is truthy.

    Args:
        rate: Drop rate forwarded to `tf.nn.dropout`.
    """

    def __init__(self, rate):
        super().__init__()
        self.rate = rate

    def call(self, inputs, training=None):
        """Return `inputs` with dropout applied in training mode, unchanged otherwise."""
        return tf.nn.dropout(inputs, rate=self.rate) if training else inputs


class MLPWithDropout(keras.layers.Layer):
    """Two `Linear` layers with a ReLU and a 0.5-rate `Dropout` in between."""

    def __init__(self):
        super().__init__()
        self.linear_1 = Linear(32)
        self.dropout = Dropout(0.5)
        self.linear_3 = Linear(10)

    def call(self, inputs, training=None):
        """Return 10 scores per input row; `training` is forwarded to the dropout layer."""
        hidden = tf.nn.relu(self.linear_1(inputs))
        hidden = self.dropout(hidden, training=training)
        return self.linear_3(hidden)


mlp = MLPWithDropout()
# Run the same input in both modes. The results get their own names so that
# the MNIST label array `y_train`, which the dataset-building cells read, is
# not overwritten by a model output.
y_training_mode = mlp(tf.ones((2, 2)), training=True)
y_inference_mode = mlp(tf.ones((2, 2)), training=False)

In [96]:
# Symbolic input for batches of 16-feature float32 vectors; the batch size is left open.
inputs = tf.keras.Input(shape=(16,), dtype="float32")

In [97]:
inputs

<tf.Tensor 'input_1:0' shape=(None, 16) dtype=float32>

In [98]:
# Calling the custom layer on the symbolic input yields another symbolic tensor.
x = Linear(32)(inputs)

In [99]:
x

<tf.Tensor 'linear_14/add:0' shape=(None, 32) dtype=float32>

In [100]:
# No `training` argument is passed, so the layer's default `training=None` applies.
x = Dropout(0.5)(x)

In [101]:
x

<tf.Tensor 'dropout_1/dropout_1/Identity:0' shape=(None, 32) dtype=float32>

In [102]:
# Final affine layer producing 10 values per input row.
outputs = Linear(10)(x)

In [103]:
outputs

<tf.Tensor 'linear_15/add:0' shape=(None, 10) dtype=float32>

In [106]:
# Wrap the graph from `inputs` to `outputs` in a callable Keras model.
model = tf.keras.Model(inputs, outputs)

In [108]:
model(tf.ones((2, 16)))

<tf.Tensor: shape=(2, 10), dtype=float32, numpy=
array([[ 0.02364092,  0.04820701,  0.07519927, -0.04784147, -0.03294853,
        -0.00909306, -0.12849325,  0.01039225,  0.11472937, -0.02940241],
       [ 0.02364092,  0.04820701,  0.07519927, -0.04784147, -0.03294853,
        -0.00909306, -0.12849325,  0.01039225,  0.11472937, -0.02940241]],
      dtype=float32)>

In [14]:
from tensorflow.keras import layers

In [4]:
class Sampling(layers.Layer):
    """Draw a latent sample from `(z_mean, z_log_var)` using fresh Gaussian noise."""

    def call(self, inputs):
        """Return ``z_mean + exp(0.5 * z_log_var) * noise``.

        Args:
            inputs: Pair ``(z_mean, z_log_var)``. The noise is drawn with the
                first two dimensions of `z_mean`, so `z_log_var` has to
                combine element-wise with a tensor of that shape.
        """
        z_mean, z_log_var = inputs
        mean_shape = tf.shape(z_mean)
        noise = tf.keras.backend.random_normal(shape=(mean_shape[0], mean_shape[1]))
        std_dev = tf.exp(0.5 * z_log_var)
        return z_mean + std_dev * noise

In [5]:
class Encoder(layers.Layer):
    """Map inputs to the triplet ``(z_mean, z_log_var, z)``.

    Args:
        latent_dim: Width of the mean, log-variance and sample outputs.
        intermediate_dim: Width of the hidden ReLU projection.
        **kwargs: Forwarded to the base `Layer` constructor.
    """

    def __init__(self, latent_dim = 32, intermediate_dim = 64, **kwargs):
        super().__init__(**kwargs)
        self.dense_proj = layers.Dense(intermediate_dim, activation = tf.nn.relu)
        self.dense_mean = layers.Dense(latent_dim)
        self.dense_log_var = layers.Dense(latent_dim)
        self.sampling = Sampling()

    def call(self, inputs):
        """Return the latent mean, log-variance and a sample drawn from them."""
        hidden = self.dense_proj(inputs)
        # Both heads are Dense(latent_dim), so their outputs share one shape.
        z_mean, z_log_var = self.dense_mean(hidden), self.dense_log_var(hidden)
        z = self.sampling((z_mean, z_log_var))
        return z_mean, z_log_var, z

In [7]:
class Decoder(layers.Layer):
    """Map a latent vector back to `original_dim` values passed through a sigmoid.

    Args:
        original_dim: Width of the reconstructed output.
        intermediate_dim: Width of the hidden ReLU projection.
        **kwargs: Forwarded to the base `Layer` constructor.
    """

    def __init__(self, original_dim, intermediate_dim=64, **kwargs):
        super().__init__(**kwargs)
        self.dense_proj = layers.Dense(intermediate_dim, activation = tf.nn.relu)
        self.dense_output = layers.Dense(original_dim, activation = tf.nn.sigmoid)

    def call(self, inputs):
        """Return the reconstruction for the latent vectors in `inputs`."""
        hidden = self.dense_proj(inputs)
        return self.dense_output(hidden)

In [8]:
class VariationalAutoEncoder(layers.Layer):
    """Encoder and decoder chained together, with the KL term registered as a layer loss.

    Args:
        original_dim: Width of the data vectors; given explicitly rather than
            inferred from the inputs.
        intermediate_dim: Hidden width used by both encoder and decoder.
        latent_dim: Width of the latent code.
        **kwargs: Forwarded to the base `Layer` constructor.
    """

    def __init__(self, original_dim, intermediate_dim = 64, latent_dim = 32, **kwargs):
        super().__init__(**kwargs)
        self.original_dim = original_dim
        self.encoder = Encoder(latent_dim=latent_dim, intermediate_dim = intermediate_dim)
        self.decoder = Decoder(original_dim, intermediate_dim = intermediate_dim)

    def call(self, inputs):
        """Return the reconstruction of `inputs`; the KL penalty is added via `add_loss`."""
        z_mean, z_log_var, z = self.encoder(inputs)
        reconstructed = self.decoder(z)
        # Per-element KL terms, averaged over all elements and scaled by -0.5.
        kl_terms = z_log_var - tf.square(z_mean) - tf.exp(z_log_var) + 1
        self.add_loss(-0.5 * tf.reduce_mean(kl_terms))
        return reconstructed

In [11]:
# Model, reconstruction loss and optimizer for the subclassed VAE.
vae = VariationalAutoEncoder(original_dim=784, intermediate_dim=64, latent_dim=32)
loss_fn = tf.keras.losses.MeanSquaredError()
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
# Only the images are kept; labels and the second split are discarded.
(x_train, _), _ = tf.keras.datasets.mnist.load_data()
dataset = tf.data.Dataset.from_tensor_slices(
    x_train.reshape(60000, 784).astype("float32") / 255
)
# Shuffle with a 1024-element buffer, then group into batches of 32.
dataset = dataset.shuffle(buffer_size=1024).batch(32)

@tf.function
def training_step(x):
    """Run one optimisation step of `vae` on a batch.

    Args:
        x: Batch of inputs; it is also the reconstruction target.

    Returns:
        The reconstruction loss plus the losses registered by `vae`.
    """
    with tf.GradientTape() as tape:
        reconstructed = vae(x)
        # `vae.losses` is read after the forward pass that fills it.
        loss = loss_fn(x, reconstructed) + sum(vae.losses)
    weights = vae.trainable_weights
    optimizer.apply_gradients(zip(tape.gradient(loss, weights), weights))
    return loss

# Per-step losses are collected so a running average can be reported.
losses = []
for step, x in enumerate(dataset):
    loss = training_step(x)
    losses.append(float(loss))
    if step % 100 == 0:
        # Prints the mean of all losses so far, not the loss of the current batch.
        print("step:", step, "loss:", sum(losses) / len(losses))
        

step: 0 loss: 0.34750431776046753
step: 100 loss: 0.12663604739573922
step: 200 loss: 0.10030697182339815
step: 300 loss: 0.08997227075785101
step: 400 loss: 0.08488470015866204
step: 500 loss: 0.08169266278158405
step: 600 loss: 0.0792588794452081
step: 700 loss: 0.0778793450365223
step: 800 loss: 0.07667038871554399
step: 900 loss: 0.0757216972843656
step: 1000 loss: 0.0747777977427998
step: 1100 loss: 0.0740617620813695
step: 1200 loss: 0.07360787985909988
step: 1300 loss: 0.07316073791040172
step: 1400 loss: 0.07272018230308132
step: 1500 loss: 0.07238143778528316
step: 1600 loss: 0.07208472068452373
step: 1700 loss: 0.07175387290778895
step: 1800 loss: 0.07149329694863957


In [17]:
1800*32

57600

In [15]:
# The same VAE built with the functional API instead of subclassed layers.
original_dim = 784
intermediate_dim = 64
latent_dim = 32

# Encoder: input -> hidden ReLU layer -> (z_mean, z_log_var) -> sampled z.
original_inputs = tf.keras.Input(shape = (original_dim, ), name = "encoder_input")
x = layers.Dense(intermediate_dim, activation = "relu")(original_inputs)
z_mean = layers.Dense(latent_dim, name = "z_mean")(x)
z_log_var = layers.Dense(latent_dim, name = "z_log_var")(x)
z = Sampling()((z_mean, z_log_var))
encoder = tf.keras.Model(inputs=original_inputs, outputs = z, name = "encoder")

# Decoder: latent vector -> hidden ReLU layer -> sigmoid output of width original_dim.
latent_inputs = tf.keras.Input(shape = (latent_dim,), name = "z_sampling")
x = layers.Dense(intermediate_dim, activation = "relu")(latent_inputs)
outputs = layers.Dense(original_dim, activation="sigmoid")(x)
decoder = tf.keras.Model(inputs=latent_inputs, outputs = outputs, name = "decoder")

# End-to-end model: feed the encoder's sample through the decoder.
# Note that `outputs` is rebound here to the decoder applied to z.
outputs = decoder(z)
vae = tf.keras.Model(inputs=original_inputs, outputs = outputs, name = "vae")

# Register the KL term on the model, built from the symbolic mean and log-variance.
kl_loss = -0.5 * tf.reduce_mean(z_log_var - tf.square(z_mean) - tf.exp(z_log_var) + 1)
vae.add_loss(kl_loss)

In [16]:
# Train the functional VAE with the built-in compile()/fit() loop.
loss_fn = tf.keras.losses.MeanSquaredError()
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

(x_train, _), _ = tf.keras.datasets.mnist.load_data()
dataset = tf.data.Dataset.from_tensor_slices(
    x_train.reshape(60000, 784).astype("float32") / 255
)
# Each element becomes an (input, target) pair in which both are the same vector.
dataset = dataset.map(lambda x: (x, x))
dataset = dataset.shuffle(buffer_size=1024).batch(32)
vae.compile(optimizer, loss=loss_fn)
vae.fit(dataset, epochs=1)



<tensorflow.python.keras.callbacks.History at 0x7f81c3d93fd0>

In [18]:
import numpy as np

# Hypernetwork setup: `inner_model` will predict every weight of `outer_model`.
input_dim = 784
classes = 10

# The model whose weights are generated: a 64-unit ReLU layer and a `classes`-unit output layer.
outer_model = keras.Sequential(
    [keras.layers.Dense(64, activation=tf.nn.relu), keras.layers.Dense(classes), ]
)

# Flag each layer as built by hand.
# NOTE(review): presumably this stops Keras from creating its own weights on
# first call, since kernels and biases are assigned externally — confirm.
for layer in outer_model.layers:
    layer.built = True

# Total number of scalars to generate: kernel + bias of the output layer,
# plus kernel + bias of the first layer.
num_weights_to_generate = (classes * 64 + classes) + (64 * input_dim + 64)

# The hypernetwork itself: its final layer emits one sigmoid value per generated weight.
inner_model = keras.Sequential(
    [
        keras.layers.Dense(16, activation=tf.nn.relu),
        keras.layers.Dense(num_weights_to_generate, activation=tf.nn.sigmoid)
    ]
)

In [21]:
# Loss and optimizer used to train the hypernetwork.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)

# Prepare a dataset.
(x_train, y_train), _ = tf.keras.datasets.mnist.load_data()
dataset = tf.data.Dataset.from_tensor_slices(
    (x_train.reshape(60000, 784).astype("float32") / 255, y_train)
)

# We'll use a batch size of 1 for this experiment.
dataset = dataset.shuffle(buffer_size=1024).batch(1)

@tf.function
def train_step(x, y):
    """Run one optimisation step of the hypernetwork.

    `inner_model` predicts one flat vector holding every weight of
    `outer_model`. The vector is cut into kernel and bias pieces, the pieces
    are installed on the two Dense layers of `outer_model`, and the resulting
    classification loss is back-propagated into the weights of `inner_model`.

    Args:
        x: Batch of inputs. Each slice below keeps the batch axis, so the
            reshapes only succeed when the batch holds exactly one sample.
        y: Labels for `x`, as expected by `loss_fn`.

    Returns:
        The loss of `outer_model` on this batch.
    """
    # Shapes of the generated parameters, in the order in which they are
    # packed in the output of `inner_model`.
    param_shapes = (
        (input_dim, 64),  # Layer 0 kernel.
        (64,),            # Layer 0 bias.
        (64, classes),    # Layer 1 kernel.
        (classes,),       # Layer 1 bias.
    )

    with tf.GradientTape() as tape:
        weights_pred = inner_model(x)

        # Walk through the flat prediction, carving out one parameter at a time.
        generated = []
        start_index = 0
        for shape in param_shapes:
            size = int(np.prod(shape))
            coeffs = weights_pred[:, start_index : start_index + size]
            generated.append(tf.reshape(coeffs, shape))
            start_index += size
        w0, b0, w1, b1 = generated

        # Install the generated tensors as the weights of the outer model.
        outer_model.layers[0].kernel = w0
        outer_model.layers[0].bias = b0
        outer_model.layers[1].kernel = w1
        outer_model.layers[1].bias = b1

        preds = outer_model(x)
        loss = loss_fn(y, preds)

    # Only the hypernetwork's own weights are trained.
    grads = tape.gradient(loss, inner_model.trainable_weights)
    optimizer.apply_gradients(zip(grads, inner_model.trainable_weights))
    return loss

losses = []  # Keep track of the losses over time.
for step, (x, y) in enumerate(dataset):
    loss = train_step(x, y)

    # Logging: every 100th step, print the mean of all losses seen so far.
    losses.append(float(loss))
    if step % 100 == 0:
        print("Step:", step, "Loss:", sum(losses) / len(losses))

Step: 0 Loss: 3.106187582015991
Step: 100 Loss: 2.329586129451152
Step: 200 Loss: 2.018739344932102
Step: 300 Loss: 1.8607817600085554
Step: 400 Loss: 1.6927321594202092
Step: 500 Loss: 1.659945924356461
Step: 600 Loss: 1.6165531615731172
Step: 700 Loss: 1.5848073587594667
Step: 800 Loss: 1.5421692139055496
Step: 900 Loss: 1.5195796266366954
Step: 1000 Loss: 1.4668622419073138
Step: 1100 Loss: 1.4206054525291298
Step: 1200 Loss: 1.3794678447772861
Step: 1300 Loss: 1.3303812805341877
Step: 1400 Loss: 1.2965709597478559
Step: 1500 Loss: 1.2763124426571206
Step: 1600 Loss: 1.2495518829017889
Step: 1700 Loss: 1.2231836422578808
Step: 1800 Loss: 1.2097418638349406
Step: 1900 Loss: 1.195731880292863
Step: 2000 Loss: 1.1669522723871206
Step: 2100 Loss: 1.1616932695890003
Step: 2200 Loss: 1.1450523010648122
Step: 2300 Loss: 1.1312813921083238
Step: 2400 Loss: 1.1160248060201192
Step: 2500 Loss: 1.109039789929457
Step: 2600 Loss: 1.105918935104946
Step: 2700 Loss: 1.095133691379496
Step: 2800 L

Step: 22600 Loss: 0.6738090926891418
Step: 22700 Loss: 0.6731778062604573
Step: 22800 Loss: 0.6740356664172105
Step: 22900 Loss: 0.6724637007002229
Step: 23000 Loss: 0.6712288585444788
Step: 23100 Loss: 0.6700524873408495
Step: 23200 Loss: 0.6688628845500976
Step: 23300 Loss: 0.6683685356759446
Step: 23400 Loss: 0.667677262552663
Step: 23500 Loss: 0.6690476864466912
Step: 23600 Loss: 0.668645532277996
Step: 23700 Loss: 0.6687883072316537
Step: 23800 Loss: 0.6679152346669608
Step: 23900 Loss: 0.6675844639062294
Step: 24000 Loss: 0.6671006059190772
Step: 24100 Loss: 0.6662608155164917
Step: 24200 Loss: 0.6659203600411859
Step: 24300 Loss: 0.6655180255264141
Step: 24400 Loss: 0.665540608494538
Step: 24500 Loss: 0.6642075210007209
Step: 24600 Loss: 0.6636307217964408
Step: 24700 Loss: 0.6624666200016601
Step: 24800 Loss: 0.6611687664982572
Step: 24900 Loss: 0.6601427726244423
Step: 25000 Loss: 0.6591171202529635
Step: 25100 Loss: 0.6586777909497106
Step: 25200 Loss: 0.6576330484826408
Step

Step: 44800 Loss: 0.5702724872351568
Step: 44900 Loss: 0.5698007984211281
Step: 45000 Loss: 0.5696763269937449
Step: 45100 Loss: 0.5696589972727357
Step: 45200 Loss: 0.5695521961702927
Step: 45300 Loss: 0.5693974505078573
Step: 45400 Loss: 0.5691841045362773
Step: 45500 Loss: 0.5686395642170001
Step: 45600 Loss: 0.5681956086416007
Step: 45700 Loss: 0.5675832020871933
Step: 45800 Loss: 0.5679150579549092
Step: 45900 Loss: 0.5676504391347579
Step: 46000 Loss: 0.5671349417827459
Step: 46100 Loss: 0.5667084162152586
Step: 46200 Loss: 0.5670787228121708
Step: 46300 Loss: 0.5671918484216262
Step: 46400 Loss: 0.5667944552318642
Step: 46500 Loss: 0.5664136736830282
Step: 46600 Loss: 0.5659280183528874
Step: 46700 Loss: 0.5658875210141514
Step: 46800 Loss: 0.5653765335760951
Step: 46900 Loss: 0.5655537515132901
Step: 47000 Loss: 0.5650273041516409
Step: 47100 Loss: 0.5649597433358065
Step: 47200 Loss: 0.5647823413184038
Step: 47300 Loss: 0.5648534882403522
Step: 47400 Loss: 0.5644253013605042
S