---

TensorFlow [introduction to graphs](https://www.tensorflow.org/guide/intro_to_graphs)

Graphs are data structures that contain a set of tf.Operation objects, which represent units of computation, and tf.Tensor objects, which represent the units of data that flow between operations. They are defined in a tf.Graph context. Since these graphs are data structures, they can be saved, run, and restored without the original Python code, for example on embedded devices or servers that do not have a built-in Python interpreter. Graphs can also be optimized to run faster.

<img src = 'https://drive.google.com/uc?id=1Q8-9CJbEZez8oOU0K-_sHQ-tDNgj8CAQ' alt = "graph components" width="400">
<img src = 'https://drive.google.com/uc?id=1Q7_c8CXV8_eEvPbNyAjdUN1tXCFKUS0e' alt = "graph" width="400">


---

In [None]:
import numpy as np
import timeit
import time
import matplotlib.pyplot as plt
plt.style.use('dark_background')
%matplotlib inline

import tensorflow as tf
tf.__version__

---

**TensorFlow eager execution**: tensor computations are executed by Python, operation by operation, and return results back to Python.

**TensorFlow graph execution**: tensor computations are executed as a TensorFlow graph, sometimes referred to as a tf.Graph or simply a "graph."

Graph execution enables portability outside Python and tends to offer better performance.  


---

In [None]:
@tf.function
def a_func_in_python(x, y, b):
  """Return the matrix product of `x` and `y` with `b` added to it.

  The `@tf.function` decorator wraps this function so that calls to it are
  executed as a TensorFlow graph.
  """
  return tf.matmul(x, y) + b

In [None]:
# Show the type of the object that the @tf.function decorator produced
type(a_func_in_python)

In [None]:
# Plain Python version of the computation (no tf.function wrapper)
def a_func_in_python(x, y, b):
  """Return `tf.matmul(x, y) + b`."""
  product = tf.matmul(x, y)
  return product + b

# Wrap the plain Python function with tf.function to obtain a version of the
# same computation that runs as a graph
a_func_that_uses_graph = tf.function(a_func_in_python)

# Make some tensors: a 1x3 row, a 3x1 column and a scalar offset
x1 = tf.constant([[1.0, 2.0, 3.0]])
y1 = tf.constant([[1.0], [2.0], [3.0]])
b1 = tf.constant(4.0)

# Call the two versions of the same function on the same inputs and convert
# the results to NumPy arrays
python_way = a_func_in_python(x1, y1, b1).numpy()
tf_function_way = a_func_that_uses_graph(x1, y1, b1).numpy()

# Print both results so they can be compared by eye
print(python_way)
print(tf_function_way)

---

tf.function applies to nested function calls

---

In [None]:
# Inner function: plain Python, carries no tf.function decorator of its own
def inner_function(x, y, b):
  """Return `tf.matmul(x, y) + b`."""
  return tf.matmul(x, y) + b

# Outer function; the decorator converts it into a graph
@tf.function
def outer_function(x):
  """Apply `inner_function` to `x` with a fixed 3x1 column and a bias of 1."""
  column = tf.constant([[1.0], [2.0], [3.0]])
  bias = tf.constant(1.0)
  return inner_function(x, column, bias)

# Calling the decorated outer function builds one graph, and the
# operations of the inner function it calls become part of
# that same graph
outer_function(tf.constant([[1.0, 2.0, 3.0]])).numpy()

---

Converting Python functions with branches into graphs.

---

In [None]:
def simple_relu(x):
  """Return `x` when it is greater than 0, otherwise the Python int 0.

  The `if` condition is a tensor (`tf.greater(x, 0)`), so the branch taken
  depends on the data passed in.
  """
  # NOTE(review): the two branches return different kinds of value (the input
  # tensor vs. a Python int). This is presumably fine for the int32 inputs
  # used below; confirm graph mode also accepts other input dtypes before
  # reusing this function elsewhere.
  if tf.greater(x, 0):
    return x
  else:
    return 0

# Wrap the Python function (including its branch) into a graph function
tf_simple_relu = tf.function(simple_relu)

# Exercise both branches: a positive input and a negative input
print("First branch, with graph:", tf_simple_relu(tf.constant(1)).numpy())
print("Second branch, with graph:", tf_simple_relu(tf.constant(-1)).numpy())

---

Polymorphism using graphs

<img src = 'https://drive.google.com/uc?id=1Q9rMsRHWtBVSaWMsmDX6UZS8l0Cfu4eh' alt = "graph components" width="400">

---

In [None]:
@tf.function
def my_relu(x):
  """Return the element-wise maximum of 0 and `x`."""
  rectified = tf.maximum(0., x)
  return rectified

# Each call below passes a new kind of input (scalar tensor, Python list,
# vector tensor), so the function creates a new graph for each one
print(my_relu(tf.constant(1.0)))
print(my_relu([1, -1]))
print(my_relu(tf.constant([3., -3.])))

# These inputs have the same data type and shape as inputs already
# seen above, so the matching existing graph is reused and no
# new graph is created
print(my_relu(tf.constant(2.0)))
print(my_relu(tf.constant([1.5, -1.5])))

---

**Graph execution vs. eager execution**: the code in a tf.function can be executed both eagerly and as a graph. By default, tf.function executes its code as a graph.

---

In [None]:
@tf.function
def get_MSE(y_true, y_pred):
  """Return the mean of the squared element-wise difference of the inputs."""
  return tf.reduce_mean((y_true - y_pred)**2)

# Two random int32 vectors of length 5 to compare
y_true = tf.random.uniform([5], maxval=10, dtype=tf.int32)
y_pred = tf.random.uniform([5], maxval=10, dtype=tf.int32)
print(y_true)
print(y_pred)

# Execution in graph mode (the tf.function default)
print(get_MSE(y_true, y_pred))

# Switch off graph mode and turn on eager execution mode
tf.config.run_functions_eagerly(True)
try:
  print(get_MSE(y_true, y_pred))
finally:
  # Always get back to graph mode, even if the eager call raises; the switch
  # is global, so leaving it on would make later tf.functions run eagerly too
  tf.config.run_functions_eagerly(False)

---

Graph tracing captures only TensorFlow operations into the graph. Python side effects such as print statements are not captured; they run only while the function is being traced.

---

In [None]:
@tf.function
def get_MSE(y_true, y_pred):
  """Return the mean squared difference, announcing each run of the Python body."""
  # Python-level print: it is not a TensorFlow operation
  print('Calculating MSE')
  return tf.reduce_mean((y_true - y_pred)**2)

y_true = tf.random.uniform([5], maxval=10, dtype=tf.int32)
y_pred = tf.random.uniform([5], maxval=10, dtype=tf.int32)

# In graph mode the print statement runs only once, while the function is
# traced; the later calls reuse the graph and print nothing extra
print(get_MSE(y_true, y_pred))
print(get_MSE(y_true, y_pred))
print(get_MSE(y_true, y_pred))

# In eager mode the Python body runs on every call, so the print statement
# fires all 3 times
tf.config.run_functions_eagerly(True)
try:
  print(get_MSE(y_true, y_pred))
  print(get_MSE(y_true, y_pred))
  print(get_MSE(y_true, y_pred))
finally:
  # Always switch back to graph mode, even if an eager call raises; the
  # switch is global and would otherwise affect every later tf.function
  tf.config.run_functions_eagerly(False)

---

Graph mode corresponds to non-strict execution which means only those operations necessary to produce the observable effects are executed.

In contrast, in eager execution mode, all of the program operations, needed or not, are stepped through and executed.

---

In [None]:
# Length-3 vector; its valid indices are 0, 1 and 2
x = tf.constant([1., 2., 3.])

def unused_func(x):
  """Return `x` unchanged after issuing a gather whose result is discarded."""
  # Index 3 is out of range for the length-3 tensor defined above
  # NOTE(review): per the tf.gather docs, whether an out-of-range index raises
  # can depend on the device (CPU vs. GPU) -- confirm on the target runtime.
  tf.gather(x, [3]) # unused operation not connected to what function is returning
  return x

# Graph version of the same function
tf_unused_func = tf.function(unused_func)

try:
  print(tf_unused_func(x))
except tf.errors.InvalidArgumentError as e:
  # Unused operations are not run during graph execution, so no error is
  # expected here and this handler should not be reached.
  print(f'{type(e).__name__}: {e}')

try:
  print(unused_func(x))
except tf.errors.InvalidArgumentError as e:
  # All operations are run during eager execution, so the out-of-range gather
  # raises and the error is reported here.
  print(f'{type(e).__name__}: {e}')


**tf.function best practices**:

- Toggle between eager and graph execution early and often with tf.config.run_functions_eagerly to pinpoint if and when the two modes diverge.
- Create tf.Variables outside the Python function that we want to convert into a graph and modify those variables on the inside. The same goes for objects that use tf.Variable, like tf.keras.layers, tf.keras.Models and tf.keras.optimizers.
- Avoid writing functions that depend on outer Python variables, excluding tf.Variables and Keras objects. Learn more in the "Depending on Python global and free variables" section of the tf.function guide.
- Prefer to write functions which take tensors and other TensorFlow types as input.
- Include as much computation as possible under a tf.function to maximize the performance gain. For example, decorate a whole training step or the entire training loop which runs faster under tf.function.

---

tf.function usually improves the performance of your code, but the amount of speed-up depends on the kind of computation you run. Small computations can be dominated by the overhead of calling a graph. You can measure the difference in performance like the example below where we multiply a matrix 100 times by itself, and repeat that operation 1000 times first using eager execution and then using graph execution:

---

In [None]:
# Random 10x10 int32 matrix used as the base of the power computation
x = tf.random.uniform(shape=[10, 10], minval=-1, maxval=2, dtype=tf.dtypes.int32)

def power(x, y):
  """Left-multiply a 10x10 int32 identity matrix by `x`, `y` times over."""
  acc = tf.eye(10, dtype=tf.dtypes.int32)
  for _ in range(y):
    acc = tf.matmul(x, acc)
  return acc

# Time 1000 calls of the plain Python function
print("Eager execution:", timeit.timeit(lambda: power(x, 100), number=1000), "seconds")

# Time 1000 calls of the tf.function-wrapped version of the same function
tf_power = tf.function(power)
print("Graph execution:", timeit.timeit(lambda: tf_power(x, 100), number=1000), "seconds")

---

When is a tf.function tracing?

---

In [None]:
x = tf.constant(2)

@tf.function
def trace_me(x):
  """Return `x * x + 2`, printing a message whenever the Python body runs."""
  print("Tracing!") # An eager-only side effect.
  squared = x * x
  return squared + tf.constant(2)

# First call: the function is traced here
print(trace_me(x))

# Same kind of tensor input as before: no tracing happens here
print(trace_me(x+1))

# A plain Python value as the argument (rather than a tensor) triggers the
# creation of a new graph, so the function is traced again here
print(trace_me(3))

---

**Speed-up using graph execution**: apply a 2-layer fully-connected model with 128 nodes in the hidden layer to the MNIST dataset

---

In [None]:
## Load MNIST data and flatten every image into one row of features
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()
X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)

# Dataset dimensions, read off the flattened training matrix
num_samples, num_features = X_train.shape
num_labels = len(np.unique(y_train))

# One-hot encode class labels
Y_train = tf.keras.utils.to_categorical(y_train)
Y_test = tf.keras.utils.to_categorical(y_test)

# Min-max scale the samples (images); the range comes from the training data
# only and is applied unchanged to the test data
xmin = np.amin(X_train)
xmax = np.amax(X_train)
X_train = (X_train - xmin) / (xmax - xmin) # train features now lie between 0 and 1
X_test = (X_test - xmin) / (xmax - xmin)

# Summary of the loaded data
print('MNIST set')
print('---------------------')
print('Number of training samples = %d'%(num_samples))
print('Number of features = %d'%(num_features))
print('Number of output labels = %d'%(num_labels))

In [None]:
## Build the training input pipeline from the in-memory arrays: slice them
## into (sample, label) pairs, shuffle, then group into batches
batch_size = 100 # batch size
train_dataset = (
    tf.data.Dataset.from_tensor_slices((X_train, Y_train))
    .shuffle(buffer_size=1024)
    .batch(batch_size)
)

In [None]:
# Define 2-layer neural network architecture with 128 nodes in the hidden layer
# that are leaky ReLU activated

# Define model
class Model128(tf.keras.models.Model):
    # Keras model that owns two Dense layers, `layer1` and `layer2`.
    # NOTE(review): the `?` tokens below are not valid Python; presumably they
    # are blanks to be filled in before this cell can run -- confirm.
    def __init__(self):
        super(Model128, self).__init__()
        # Both layers are created with dtype float64; their sizes and
        # activations are still placeholders
        self.layer1 = tf.keras.layers.Dense(?, dtype = 'float64',
                                 activation = ?
                                 )
        self.layer2 = tf.keras.layers.Dense(?, dtype = 'float64',
                                 activation = ?)

    # Forward pass for the model
    def call(self, inputs):
        # First step goes through `layer1`; the argument, the second step and
        # the returned value are still placeholders
        a = self.layer1(?)
        a = self.?(?)
        return ?

In [None]:
# Instantiate 2-layer fully-connected model with 128 nodes in the hidden layer
model = Model128()

# Define optimizer: Adam with a learning rate of 1e-3
opt = tf.keras.optimizers.Adam(learning_rate = 1e-03)

# Define loss function: categorical cross-entropy
loss_fn = tf.keras.losses.CategoricalCrossentropy()

# Mean metric that averages the training-loss values fed to it
loss_train_epoch = tf.keras.metrics.Mean()

# Number of training epochs
nepochs = 10

#@tf.function
# User-defined function for training model on one batch.
# The decorator above is commented out, so as written this runs without
# tf.function; presumably it is meant to be uncommented to compare timings.
# NOTE(review): the `?` tokens are not valid Python; presumably they are
# blanks to be filled in before this cell can run -- confirm.
def train_step(train_batch):
  # Record the forward pass on the tape so gradients can be taken afterwards
  with tf.GradientTape() as g:
    # Compute loss: model output first, then the loss function
    yhat = model(train_batch[?])
    loss = loss_fn(?, ?)

  # Calculate gradients from the recorded tape
  grad = g.gradient(?, ?)

  # Update model by handing zipped pairs to the optimizer
  opt.apply_gradients(zip(?, ?))

  # Return the loss computed for this batch
  return(loss)


for epoch in range(nepochs):
  start_time = time.time()
  # Iterate over the batches of the dataset
  for step, train_batch in enumerate(train_dataset):
    # Call the training function
    loss_train = ?(train_batch)
    # Append training loss
    loss_train_epoch(loss_train)
  print('Epoch %d: time taken = %.2fs, train loss = %f'%(epoch+1, time.time() - start_time, loss_train_epoch.result()))