<a href="https://colab.research.google.com/github/vinayak-pathak/pyprac_f/blob/master/tfgradientandgraidenttape.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
from sklearn.model_selection import train_test_split
from tensorflow import keras as tfk


class Test(tfk.Model):
    """Convolutional text classifier: Embedding -> Conv1D -> MaxPool1D -> Flatten -> Dense -> softmax.

    All layers, including the weight-less ``Flatten``, are created once in
    ``__init__`` so that Keras tracks them as part of the model instead of
    building a throw-away layer object on every forward pass.
    """

    def __init__(self):
        super().__init__()
        # Token ids in [0, 50000) are mapped to 300-dimensional vectors.
        self.embedding_layer = tfk.layers.Embedding(50000, 300)
        # 256 filters, kernel size 5.
        self.conv1d_layer = tfk.layers.Conv1D(256, 5)
        self.pool_layer = tfk.layers.MaxPool1D(pool_size=5, strides=2)
        self.flatten_layer = tfk.layers.Flatten()
        self.dense1_layer = tfk.layers.Dense(128, activation=tfk.activations.swish)
        # Final layer emits a probability distribution over 10 classes.
        self.dense2_layer = tfk.layers.Dense(10, activation=tfk.activations.softmax)

    def call(self, inputs, training=None, mask=None):
        """Run the forward pass and return the softmax output of the last layer.

        Args:
            inputs: Batch of integer token ids fed to the embedding layer.
            training: Accepted for Keras API compatibility; not used by this model.
            mask: Accepted for Keras API compatibility; not used by this model.

        Returns:
            The output of ``dense2_layer`` (softmax over 10 units).
        """
        hidden = self.embedding_layer(inputs)
        hidden = self.conv1d_layer(hidden)
        hidden = self.pool_layer(hidden)
        hidden = self.flatten_layer(hidden)
        hidden = self.dense1_layer(hidden)
        y_pred = self.dense2_layer(hidden)
        return y_pred


class Test2(tfk.Model):
    """Recurrent text classifier: Embedding -> LSTM -> Dense -> softmax."""

    def __init__(self):
        super().__init__()
        layers = tfk.layers
        # Layers are created in the same order as they are applied in call().
        self.embedding_layer = layers.Embedding(50000, 300)
        self.rnn_layer = layers.LSTM(200)
        self.dense1_layer = layers.Dense(128, activation=tfk.activations.swish)
        self.dense2_layer = layers.Dense(10, activation=tfk.activations.softmax)

    def call(self, inputs, training=None, mask=None):
        """Return the softmax output for a batch of integer token ids.

        ``training`` and ``mask`` are accepted for Keras API compatibility
        but are not used by this model.
        """
        embedded = self.embedding_layer(inputs)
        encoded = self.rnn_layer(embedded)
        features = self.dense1_layer(encoded)
        return self.dense2_layer(features)


# Enable memory growth on every visible GPU before any GPU work is done.
gpus = tf.config.experimental.list_physical_devices("GPU")
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
epochs = 5
# Synthetic data: 10000 sequences of 128 token ids in [0, 50000), with random
# labels in [0, 10). The labels carry no signal; the point of the script is to
# compare the two training paths, not to learn anything.
x = np.random.randint(low=0, high=50000, size=(10000, 128))
y = np.random.randint(low=0, high=10, size=(10000,))
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2)
# Batched (features, labels) datasets used by the custom training loop.
trainset = tf.data.Dataset.from_tensor_slices((x_train, y_train)).batch(300)
valset = tf.data.Dataset.from_tensor_slices((x_val, y_val)).batch(300)
# Swap the two lines below to try the convolutional model instead.
# model = Test()
model = Test2()
train_acc = tf.metrics.SparseCategoricalAccuracy()
val_acc = tf.metrics.SparseCategoricalAccuracy()
train_loss = tf.metrics.Mean()
val_loss = tf.metrics.Mean()
loss_object = tf.losses.SparseCategoricalCrossentropy()
optimizer = tf.optimizers.Adam()

# Path 1: train with the high-level compile/fit interface.
model.compile(optimizer=optimizer, loss=loss_object, metrics=[train_acc])
model.fit(x_train, y_train, validation_data=(x_val, y_val), batch_size=300, epochs=epochs)

# Path 2: a brand-new model trained with a hand-written GradientTape loop.
# model = Test()
model = Test2()
# Wrap a *fresh* Adam instance: reusing the optimizer that model.fit already
# stepped would carry its iteration counter (and any per-variable state) into
# this run and make the two training paths start from different conditions.
# NOTE(review): newer TensorFlow releases expose this wrapper as
# tfk.mixed_precision.LossScaleOptimizer(optimizer) -- confirm against the
# TensorFlow version this notebook targets before switching.
optimizer = tfk.mixed_precision.experimental.LossScaleOptimizer(tf.optimizers.Adam(), "dynamic")

@tf.function
def train_op(x, y):
    """Run one optimisation step on a batch and update the training metrics.

    Uses the module-level ``model``, ``loss_object``, ``optimizer``,
    ``train_loss`` and ``train_acc``.

    Args:
        x: Batch of model inputs.
        y: Batch of integer class labels matching ``x``.
    """
    with tf.GradientTape() as tape:
        # Run the model in training mode so that any layer whose behaviour
        # depends on the phase (dropout, batch norm, ...) acts as it does
        # under model.fit.
        y_pred = model(x, training=True)
        loss = loss_object(y, y_pred)
        # The loss is scaled inside the tape so the gradients are computed
        # from the scaled value, then unscaled before being applied.
        scaled_loss = optimizer.get_scaled_loss(loss)
    scaled_grads = tape.gradient(scaled_loss, model.trainable_weights)
    grads = optimizer.get_unscaled_gradients(scaled_grads)
    optimizer.apply_gradients(zip(grads, model.trainable_weights))
    # Metrics track the unscaled loss.
    train_loss.update_state(loss)
    train_acc.update_state(y, y_pred)


@tf.function
def val_op(x, y):
    """Evaluate one validation batch and update the validation metrics.

    Uses the module-level ``model``, ``loss_object``, ``val_loss`` and
    ``val_acc``. No gradients are computed and no weights are updated.

    Args:
        x: Batch of model inputs.
        y: Batch of integer class labels matching ``x``.
    """
    # Pin inference mode explicitly rather than relying on the default.
    y_pred = model(x, training=False)
    loss = loss_object(y, y_pred)
    val_loss.update_state(loss)
    val_acc.update_state(y, y_pred)


for epoch in range(epochs):
    print("Epoch {}/{}".format(epoch + 1, epochs))
    # Reset the metric states at the start of every epoch; with this, the
    # GradientTape loop reports loss and accuracy comparable to model.fit.
    for metric in (train_loss, train_acc, val_loss, val_acc):
        metric.reset_states()
    bar = tfk.utils.Progbar(len(y_train), unit_name="sample", stateful_metrics={"loss", "acc"})
    for batch_x, batch_y in trainset:
        train_op(batch_x, batch_y)
        # Build a fresh list for each batch: appending to one list for the
        # whole epoch would hand the progress bar an ever-growing pile of
        # stale (name, value) pairs on every update.
        log_values = [
            ("loss", train_loss.result().numpy()),
            ("acc", train_acc.result().numpy()),
        ]
        bar.add(len(batch_y), log_values)
    for batch_x, batch_y in valset:
        val_op(batch_x, batch_y)
    template = "val_loss: {:.4f} - val_acc: {:.4f}"
    print(template.format(val_loss.result().numpy(), val_acc.result().numpy()))



Epoch 1/5

## Alternative
model.fit vs GradientTape
This is really a case-specific thing and it's difficult to give a definite answer here (it might border on "too opinion-based"). But in general, I would say:

The "classic" Keras interface (using compile, fit, etc.) allows for quick and easy building, training & evaluation of standard models. However, it is very high-level/abstract and as such doesn't give you much low-level control. If you are implementing models with non-trivial control flow, this can be hard to accommodate.
GradientTape gives you full low-level control over all aspects of training/running your model, allowing easier debugging as well as more complex architectures etc., but you will need to write more boilerplate code for many things that a compiled model will hide from you (e.g. training loops). Still, if you do research in deep learning you will probably be working on this level most of the time.

In [None]:
@tf.function
def train_op(x, y):
    """Alternative training step that uses ``tf.gradients`` instead of a GradientTape.

    Uses the module-level ``model``, ``loss_object``, ``optimizer``,
    ``train_loss`` and ``train_acc``. ``tf.gradients`` is a graph-mode API,
    so this function relies on being wrapped in ``tf.function``.

    Args:
        x: Batch of model inputs.
        y: Batch of integer class labels matching ``x``.
    """
    y_pred = model(x, training=True)
    tf.print(y_pred) # In my case the shapes of the output required an extra tf.squeeze
    loss = loss_object(y, y_pred)
    train_loss.update_state(loss)
    train_acc.update_state(y, y_pred)
    grads = tf.gradients(loss, model.trainable_variables)
    # apply_gradients expects an iterable of (gradient, variable) pairs, not
    # the two lists as separate positional arguments.
    optimizer.apply_gradients(zip(grads, model.trainable_variables))