# 开始动态图之旅

In [1]:
import sys

import tensorflow as tf
import tensorflow.contrib.eager as tfe
import numpy as np

sys.path.append('E:/xinlib')
import xin
import chaos

tf.enable_eager_execution()

Instructions for updating:
Use the retry module or similar alternatives.


In [11]:
# Sample a 2x3 tensor with tf.random_normal; the bare `a` on the last line
# makes the notebook display it.
a = tf.random_normal([2, 3])
a

<tf.Tensor: id=35, shape=(2, 3), dtype=float32, numpy=
array([[-1.20727885, -0.32219583,  0.50368798],
       [-0.97735554, -0.2825062 , -0.84458929]], dtype=float32)>

In [12]:
# Confirm that eager mode is active (tf.enable_eager_execution() is called in the first cell).
tf.executing_eagerly()        # => True

True

In [13]:
# tf.matmul is fed a plain nested Python list here; the result is formatted
# directly into the string (recorded output: "hello, [[ 4.]]").
x = [[2.]]
m = tf.matmul(x, x)
print("hello, {}".format(m))

hello, [[ 4.]]


In [14]:
# Rebind `a` to a 2x2 integer constant; the bare `a` displays its repr.
a = tf.constant([[1, 2], [3, 4]])
a

<tf.Tensor: id=45, shape=(2, 2), dtype=int32, numpy=
array([[1, 2],
       [3, 4]])>

In [15]:
# print() shows the compact `tf.Tensor(...)` form instead of the `<tf.Tensor: ...>` repr.
print(a)

tf.Tensor(
[[1 2]
 [3 4]], shape=(2, 2), dtype=int32)


## Broadcasting support 支持广播

In [16]:
# The scalar 1 is added to every element of `a`.
b = tf.add(a, 1)
print(b)

tf.Tensor(
[[2 3]
 [4 5]], shape=(2, 2), dtype=int32)


## Operator overloading is supported 支持运算符重载

In [18]:
# `*` on tensors multiplies element-wise (recorded output [[2 6] [12 20]]), not as a matrix product.
print(a * b)

tf.Tensor(
[[ 2  6]
 [12 20]], shape=(2, 2), dtype=int32)


## Use NumPy values

In [19]:
# NOTE(review): numpy is already imported in the notebook's first cell; this re-import is redundant.
import numpy as np

# Tensors are passed straight to a NumPy function; per the recorded output,
# the result `c` is a plain ndarray rather than a tf.Tensor.
c = np.multiply(a, b)
print(c)

[[ 2  6]
 [12 20]]


In [20]:
# Display `c` through its repr (an `array(...)` in the recorded output).
c

array([[ 2,  6],
       [12, 20]])

## Obtain numpy value from a tensor:

In [22]:
# .numpy() hands back the tensor's value as a NumPy array; print() shows its str form.
print(a.numpy())

[[1 2]
 [3 4]]


In [23]:
# Same conversion, displayed through the array's repr.
a.numpy()

array([[1, 2],
       [3, 4]])

# Eager training
## Automatic differentiation 自动求导

在`eager`模式下，使用`tfe.GradientTape`跟踪操作供以后计算梯度。(During eager execution, use tfe.GradientTape to trace operations for computing gradients later.)

`tfe.GradientTape` 是一项可选（opt-in）功能，这样在不需要跟踪时可以获得最佳性能。由于每次调用过程中可能执行不同的操作，所有前向传播操作都会被记录到一条“磁带”上。计算梯度时，将磁带倒放，然后丢弃。一个 `tfe.GradientTape` 只能计算一次梯度，后续调用会引发运行时错误。（类似于 MXNet 的 autograd）

In [25]:
# Record w * w on a tape, then differentiate it with respect to the variable w.
w = tfe.Variable([[1.0]])
with tfe.GradientTape() as tape:
    loss = w * w

# The sources are passed as a list, so the result is a one-element list
# (recorded value [[2.]], i.e. d(w*w)/dw at w = 1).
grad = tape.gradient(loss, [w])
print(grad)

[<tf.Tensor: id=109, shape=(1, 1), dtype=float32, numpy=array([[ 2.]], dtype=float32)>]


In [37]:
# Here x is a tf.constant (not a tfe.Variable) and the tape is never told to watch it.
x = tf.constant([[1.0]])
with tfe.GradientTape() as tape:
    y = x * x

# The recorded output of this cell is [None]: no gradient was produced for x.
# NOTE(review): presumably a tape.watch(x) call inside the `with` block is
# required for a non-variable tensor — confirm against the GradientTape docs.
grad = tape.gradient(y, x)
grad

[None]

### MXNet 的自动求导

In [28]:
import mxnet as mx
from mxnet import autograd, nd

In [34]:
# MXNet counterpart: d(x*x)/dx at x = 1 (recorded result [[2.]]).
x = nd.array([[1.0]])

x.attach_grad()  # prepare an empty tape for x

with autograd.record():  # record
    y = x * x
y.backward()    # rewind
y_x = x.grad
y_x


[[ 2.]]
<NDArray 1x1 @cpu(0)>

## 一个使用 `tfe.GradientTape` 例子

In [38]:
# A toy dataset of points around 3 * x + 2
# Inputs and noise are two separate tf.random_normal draws of length NUM_EXAMPLES;
# no seed is set, so the values differ from run to run.
NUM_EXAMPLES = 1000
training_inputs = tf.random_normal([NUM_EXAMPLES])
noise = tf.random_normal([NUM_EXAMPLES])
training_outputs = training_inputs * 3 + 2 + noise


def prediction(input, weight, bias):
    """Linear model: scale `input` by `weight`, then shift the result by `bias`."""
    scaled = input * weight
    return scaled + bias

# A loss function using mean-squared error


def loss(weights, biases):
    """Mean-squared error of the linear model over the toy dataset.

    Reads the module-level `training_inputs` and `training_outputs`.
    """
    predicted = prediction(training_inputs, weights, biases)
    squared_error = tf.square(predicted - training_outputs)
    return tf.reduce_mean(squared_error)

# Return the derivative of loss with respect to weight and bias


def grad(weights, biases):
    """Return the gradients of `loss(weights, biases)` as a list [d/d weights, d/d biases]."""
    with tfe.GradientTape() as recorder:
        current_loss = loss(weights, biases)
    sources = [weights, biases]
    return recorder.gradient(current_loss, sources)


train_steps = 200
learning_rate = 0.01
# Start with arbitrary values for W and B on the same batch of data
W = tfe.Variable(5.)
B = tfe.Variable(10.)

print("Initial loss: {:.3f}".format(loss(W, B)))

# Hand-written gradient descent: each step moves W and B in place against
# their gradients, scaled by learning_rate.
for i in range(train_steps):
    dW, dB = grad(W, B)
    W.assign_sub(dW * learning_rate)
    B.assign_sub(dB * learning_rate)
    if i % 20 == 0:  # report the loss every 20 steps
        print("Loss at step {:03d}: {:.3f}".format(i, loss(W, B)))

print("Final loss: {:.3f}".format(loss(W, B)))
print("W = {}, B = {}".format(W.numpy(), B.numpy()))

Initial loss: 69.025
Loss at step 000: 66.354
Loss at step 020: 30.335
Loss at step 040: 14.164
Loss at step 060: 6.904
Loss at step 080: 3.644
Loss at step 100: 2.180
Loss at step 120: 1.522
Loss at step 140: 1.227
Loss at step 160: 1.094
Loss at step 180: 1.035
Final loss: 1.009
W = 3.017094135284424, B = 2.11067795753479


下面是一个通用的训练模型：
```py
dataset = tf.data.Dataset.from_tensor_slices((data.train.images,
                                              data.train.labels))
...
for (batch, (images, labels)) in enumerate(tfe.Iterator(dataset)):
    ...
    with tfe.GradientTape() as tape:
        logits = model(images, training=True)
        loss_value = loss(logits, labels)
    ...
    grads = tape.gradient(loss_value, model.variables)
    optimizer.apply_gradients(zip(grads, model.variables),
                              global_step=tf.train.get_or_create_global_step())
```

动态模型
`tfe.GradientTape` 也可以用于动态模型。下面这个[回溯线搜索](https://en.wikipedia.org/wiki/Backtracking_line_search)算法的例子看起来就像普通的
NumPy 代码，但它带有梯度并且是可微的，尽管其中包含复杂的控制流：

In [39]:
def line_search_step(fn, init_x, rate=1.0):
    """Take one backtracking line-search step from `init_x` along the negative gradient of `fn`.

    Args:
        fn: callable applied to a tensor; it is expected to return a scalar so
            that the comparison in the loop condition is a single boolean.
        init_x: starting point; it is watched explicitly on the tape so that a
            plain tensor (not only a variable) can be differentiated.
        rate: initial step size; it is halved after each candidate is evaluated.

    Returns:
        A pair `(x, value)`: the accepted point and `fn(x)`. When the squared
        gradient norm is zero the loop never runs and `(init_x, fn(init_x))`
        is returned.
    """
    with tfe.GradientTape() as tape:
        # Variables are automatically recorded, but manually watch a tensor
        tape.watch(init_x)
        value = fn(init_x)
    grad, = tape.gradient(value, [init_x])
    grad_norm = tf.reduce_sum(grad * grad)
    init_value = value
    # Default to the starting point: with a zero gradient norm the condition
    # below is false on entry, and `x` would otherwise be unbound at `return`.
    x = init_x
    while value > init_value - rate * grad_norm:
        x = init_x - rate * grad
        value = fn(x)
        rate /= 2.0
    return x, value

# 用于计算梯度的附加函数(Additional functions to compute gradients)

`tfe.GradientTape` is a powerful interface for computing gradients, but there is another [Autograd](https://github.com/HIPS/autograd)-style API available for automatic differentiation. These functions are useful if writing math code with only tensors and gradient functions, and without `tfe.Variables`（下面的函数用于计算没有`tfe.Variables` 的张量或函数的梯度）:
- `tfe.gradients_function`：以一个函数作为参数，返回一个计算该函数对其各个自变量的导数的函数。调用返回的函数时，会得到一个由 `tf.Tensor` 对象组成的列表：输入函数的每个参数对应其中一个元素。
- `tfe.value_and_gradients_function`：与 `tfe.gradients_function` 类似，但调用返回的函数时，除了输入函数对其各个自变量的导数列表之外，还会返回输入函数本身的值。

In [40]:
def square(x):
    """Return `x` multiplied by itself, computed with tf.multiply."""
    squared = tf.multiply(x, x)
    return squared


# Build the derivative function of `square`; note that this rebinds the global name `grad`.
grad = tfe.gradients_function(square)

In [41]:
# 3.0 squared (recorded output: a float32 scalar tensor, 9.0).
square(3.)

<tf.Tensor: id=7610, shape=(), dtype=float32, numpy=9.0>

In [43]:
# Called with the Python int 3: per the recorded output, the gradient comes back as an int32 tensor (6).
grad(3)

[<tf.Tensor: id=7649, shape=(), dtype=int32, numpy=6>]

二阶导数：

In [44]:
# Second derivative: differentiate the first element of grad(x).
# The lambda looks `grad` up when it is called, so this depends on whatever
# the global name `grad` is bound to at that moment.
gradgrad = tfe.gradients_function(lambda x: grad(x)[0])
gradgrad(3.) 

[<tf.Tensor: id=7677, shape=(), dtype=float32, numpy=2.0>]

3 阶导数：

In [45]:
# Third derivative, built the same way on top of gradgrad.
# The recorded output is [None] rather than a zero tensor.
# NOTE(review): presumably None is how a missing dependency on x is reported — confirm.
gradgradgrad = tfe.gradients_function(lambda x: gradgrad(x)[0])
gradgradgrad(3.)

[None]

## With flow control(流程控制):

In [48]:
def abs(x):
    """Return `x` when it is greater than zero, otherwise its negation.

    NOTE: this definition shadows Python's built-in `abs` for the rest of the notebook.
    """
    if x > 0.:
        return x
    return -x


# Differentiate through the Python-level branch inside `abs`; each call follows
# whichever branch its input selects (recorded gradients: 1.0 and -1.0).
grad = tfe.gradients_function(abs)

print(grad(3.)[0])   # => [1.0]
print(grad(-3.)[0])  # => [-1.0]

tf.Tensor(1.0, shape=(), dtype=float32)
tf.Tensor(-1.0, shape=(), dtype=float32)


In [50]:
tfe.value_and_gradients_function?

## Custom gradients（自定义梯度）

In [51]:
@tf.custom_gradient
def clip_gradient_by_norm(x, norm):
    """Identity in the forward pass; the incoming gradient is clipped with tf.clip_by_norm.

    The gradient function returns the clipped gradient for `x` and None for `norm`.
    """
    def grad_fn(dresult):
        clipped = tf.clip_by_norm(dresult, norm)
        return [clipped, None]

    return tf.identity(x), grad_fn

In [53]:
def log1pexp(x):
    """Compute log(1 + exp(x)) directly from tf.exp and tf.log."""
    exp_x = tf.exp(x)
    return tf.log(1 + exp_x)


# Derivative function of the straightforward log1pexp defined in this cell.
grad_log1pexp = tfe.gradients_function(log1pexp)

# The gradient computation works fine at x = 0.
print(grad_log1pexp(0.)[0])  # => [0.5]

# However, x = 100 fails because of numerical instability.
print(grad_log1pexp(100.)[0]) # => [nan]

tf.Tensor(0.5, shape=(), dtype=float32)
tf.Tensor(nan, shape=(), dtype=float32)


In [54]:
@tf.custom_gradient
def log1pexp(x):
    """log(1 + exp(x)) with a hand-written gradient that reuses exp(x) from the forward pass."""
    e = tf.exp(x)
    forward_value = tf.log(1 + e)

    def grad_fn(dy):
        # Same expression as the analytic derivative, written as 1 - 1 / (1 + e).
        return dy * (1 - 1 / (1 + e))

    return forward_value, grad_fn


# Derivative function of the custom-gradient log1pexp; the whole one-element
# list is printed here, not just its first entry.
grad_log1pexp = tfe.gradients_function(log1pexp)

# As before, the gradient computation works fine at x = 0.
print(grad_log1pexp(0.))  # => [0.5]

# And the gradient computation also works at x = 100.
print(grad_log1pexp(100.))  # => [1.0]

[<tf.Tensor: id=7929, shape=(), dtype=float32, numpy=0.5>]
[<tf.Tensor: id=7939, shape=(), dtype=float32, numpy=1.0>]


# 创建和训练模型

In [57]:
# Two stacked Dense layers of 10 units each; only the first declares its input shape (784,).
model = tf.keras.Sequential([
    tf.keras.layers.Dense(10, input_shape=(784,)),  # must declare input shape
    tf.keras.layers.Dense(10)
])

In [58]:
class MNISTModel(tf.keras.Model):
    """Subclassed Keras model built from two 10-unit Dense layers.

    `dense2` is applied twice in `call`, so the last two steps share one set
    of variables.
    """

    def __init__(self):
        super().__init__()
        self.dense1 = tf.keras.layers.Dense(units=10)
        self.dense2 = tf.keras.layers.Dense(units=10)

    def call(self, input):
        """Run the model."""
        first = self.dense1(input)
        second = self.dense2(first)
        # reuse variables from dense2 layer
        return self.dense2(second)


# Rebinds the global `model` to an instance of the subclassed model.
model = MNISTModel()

In [59]:
# Create a tensor representing a blank image
batch = tf.zeros([1, 1, 784])
print(batch.shape)  # => (1, 1, 784)

result = model(batch)

(1, 1, 784)


In [62]:
# NOTE(review): `dataset` is a local helper file (dataset.py) that is not shown in this notebook.
import dataset  # download dataset.py file
# Pipeline: shuffle with a 60000-element buffer, repeat 4 times, batch by 32.
dataset_train = dataset.train('./datasets').shuffle(60000).repeat(4).batch(32)

Downloading https://storage.googleapis.com/cvdf-datasets/mnist/train-images-idx3-ubyte.gz to ./datasets\train-images-idx3-ubyte.gz
Downloading https://storage.googleapis.com/cvdf-datasets/mnist/train-labels-idx1-ubyte.gz to ./datasets\train-labels-idx1-ubyte.gz


In [63]:
def loss(model, x, y):
    """Sparse softmax cross-entropy between the logits `model(x)` and the labels `y`."""
    logits = model(x)
    return tf.losses.sparse_softmax_cross_entropy(labels=y, logits=logits)


def grad(model, inputs, targets):
    """Gradients of `loss(model, inputs, targets)` with respect to `model.variables`."""
    with tfe.GradientTape() as recorder:
        batch_loss = loss(model, inputs, targets)
    parameters = model.variables
    return recorder.gradient(batch_loss, parameters)


optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)

# Pull a single batch just to report the loss before any update is applied.
x, y = tfe.Iterator(dataset_train).next()
print("Initial loss: {:.3f}".format(loss(model, x, y)))

# Training loop
for (i, (x, y)) in enumerate(tfe.Iterator(dataset_train)):
    # Calculate derivatives of the input function with respect to its parameters.
    grads = grad(model, x, y)
    # Apply the gradient to the model
    optimizer.apply_gradients(zip(grads, model.variables),
                              global_step=tf.train.get_or_create_global_step())
    # Every 200 steps, report the loss on the batch that was just used for the update.
    if i % 200 == 0:
        print("Loss at step {:04d}: {:.3f}".format(i, loss(model, x, y)))

# x and y still hold the last batch of the loop at this point.
print("Final loss: {:.3f}".format(loss(model, x, y)))

Initial loss: 2.420
Loss at step 0000: 2.509
Loss at step 0200: 2.220
Loss at step 0400: 2.122
Loss at step 0600: 2.000
Loss at step 0800: 1.649
Loss at step 1000: 1.636
Loss at step 1200: 1.732
Loss at step 1400: 1.544
Loss at step 1600: 1.668
Loss at step 1800: 1.551
Loss at step 2000: 1.412
Loss at step 2200: 1.147
Loss at step 2400: 1.331
Loss at step 2600: 1.348
Loss at step 2800: 1.252
Loss at step 3000: 1.240
Loss at step 3200: 1.253
Loss at step 3400: 1.231
Loss at step 3600: 0.822
Loss at step 3800: 0.986
Loss at step 4000: 0.919
Loss at step 4200: 0.898
Loss at step 4400: 0.704
Loss at step 4600: 0.695
Loss at step 4800: 0.604
Loss at step 5000: 0.753
Loss at step 5200: 0.670
Loss at step 5400: 0.597
Loss at step 5600: 0.964
Loss at step 5800: 0.815
Loss at step 6000: 0.494
Loss at step 6200: 0.493
Loss at step 6400: 0.530
Loss at step 6600: 0.543
Loss at step 6800: 0.594
Loss at step 7000: 1.137
Loss at step 7200: 0.620
Loss at step 7400: 0.686
Final loss: 0.408


## 在 GPU 上运算

In [64]:
# NOTE(review): the recorded run of this cell ended in InvalidArgumentError
# ("Tensors on conflicting devices": a MatMul input was on CPU:0 while the op
# ran on GPU:0). Presumably `model`'s variables were created outside this
# device scope — confirm and copy or recreate them on the GPU.
with tf.device("/gpu:0"):
    for (i, (x, y)) in enumerate(tfe.Iterator(dataset_train)):
        # minimize() is equivalent to the grad() and apply_gradients() calls.
        optimizer.minimize(lambda: loss(model, x, y),
                           global_step=tf.train.get_or_create_global_step())

InvalidArgumentError: Tensors on conflicting devices: cannot compute MatMul as input #1 was expected to be on /job:localhost/replica:0/task:0/device:GPU:0 but is actually on /job:localhost/replica:0/task:0/device:CPU:0 (operation running on /job:localhost/replica:0/task:0/device:GPU:0) Tensors can be copied explicitly using .gpu() or .cpu(), or transparently copied by using tfe.enable_eager_execution(tfe.DEVICE_PLACEMENT_SILENT). Copying tensors between devices may slow down your model [Op:MatMul]

## 变量和优化

In [70]:
class MNISTModel(tf.keras.Model):
    """Linear model `inputs * W + B` holding one weight and one bias variable.

    W starts at 5.0 and B at 10.0.
    """

    def __init__(self):
        super().__init__()
        self.W = tfe.Variable(5., name='weight')
        self.B = tfe.Variable(10., name='bias')

    def predict(self, inputs):
        """Apply the linear map to `inputs`."""
        scaled = inputs * self.W
        return scaled + self.B


# A toy dataset of points around 3 * x + 2
# Inputs and noise are two separate tf.random_normal draws of length NUM_EXAMPLES;
# these assignments rebind the globals of the same names used earlier in the notebook.
NUM_EXAMPLES = 2000
training_inputs = tf.random_normal([NUM_EXAMPLES])
noise = tf.random_normal([NUM_EXAMPLES])
training_outputs = training_inputs * 3 + 2 + noise

# The loss function to be optimized


def loss(model, inputs, targets):
    """Mean-squared error between `model.predict(inputs)` and `targets`."""
    residual = model.predict(inputs) - targets
    squared = tf.square(residual)
    return tf.reduce_mean(squared)


def grad(model, inputs, targets):
    """Gradients of `loss(model, inputs, targets)` as a list [d/d model.W, d/d model.B]."""
    with tfe.GradientTape() as recorder:
        current_loss = loss(model, inputs, targets)
    parameters = [model.W, model.B]
    return recorder.gradient(current_loss, parameters)


# Define:
# 1. A model.
# 2. Derivatives of a loss function with respect to model parameters.
# 3. A strategy for updating the variables based on the derivatives.
# The linear model class in this cell is named MNISTModel, so that is the
# name to instantiate.
model = MNISTModel()
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)

print("Initial loss: {:.3f}".format(
    loss(model, training_inputs, training_outputs)))

# Training loop
for i in range(300):
    grads = grad(model, training_inputs, training_outputs)
    # Pair each gradient with its variable, in the same [W, B] order grad() uses.
    optimizer.apply_gradients(zip(grads, [model.W, model.B]),
                              global_step=tf.train.get_or_create_global_step())
    if i % 20 == 0:  # report the loss every 20 steps
        print("Loss at step {:03d}: {:.3f}".format(
            i, loss(model, training_inputs, training_outputs)))

print("Final loss: {:.3f}".format(
    loss(model, training_inputs, training_outputs)))
print("W = {}, B = {}".format(model.W.numpy(), model.B.numpy()))

Initial loss: 67.891
Loss at step 000: 65.279
Loss at step 020: 30.001
Loss at step 040: 14.104
Loss at step 060: 6.937
Loss at step 080: 3.704
Loss at step 100: 2.245
Loss at step 120: 1.586
Loss at step 140: 1.288
Loss at step 160: 1.153
Loss at step 180: 1.093
Loss at step 200: 1.065
Loss at step 220: 1.053
Loss at step 240: 1.047
Loss at step 260: 1.044
Loss at step 280: 1.043
Final loss: 1.043
W = 2.994924306869507, B = 2.0340096950531006


During eager execution, variables persist until the last reference to the object is removed, and is then deleted.

In [66]:
# Create a 1000x1000 variable under the GPU device scope, then drop the only
# reference to it by rebinding `v`.
with tf.device("gpu:0"):
    v = tfe.Variable(tf.random_normal([1000, 1000]))
    v = None  # v no longer takes up GPU memory

## Object-based saving
`tfe.Checkpoint` can save and restore tfe.Variables to and from `checkpoints`:

In [67]:
# Round-trip a variable through a checkpoint: the value present at save time
# (2.0) replaces the later assignment (11.0) after restore.
x = tfe.Variable(10.)

checkpoint = tfe.Checkpoint(x=x)  # save as "x"

x.assign(2.)   # Assign a new value to the variables and save.
save_path = checkpoint.save('./ckpt/')

x.assign(11.)  # Change the variable after saving.

# Restore values from the checkpoint
checkpoint.restore(save_path)

print(x)  # => 2.0

<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=2.0>


To save and load models, `tfe.Checkpoint` stores the internal state of objects, without requiring hidden variables. To record the state of a model, an optimizer, and a global step, pass them to a `tfe.Checkpoint`:

In [72]:
import os

# NOTE(review): `MyModel` is not defined anywhere in the visible notebook —
# confirm which model class is intended here.
model = MyModel()
optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
# NOTE(review): absolute, machine-specific path.
checkpoint_dir = 'D:/model_dir'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
# Bundle the optimizer, the model and the global step into one checkpoint object.
root = tfe.Checkpoint(optimizer=optimizer,
                      model=model,
                      optimizer_step=tf.train.get_or_create_global_step())

# NOTE(review): the recorded run of this cell ended in a TypeError; which of
# the two calls below raised it cannot be told from the output — confirm.
root.save(file_prefix=checkpoint_prefix)
# or
root.restore(tf.train.latest_checkpoint(checkpoint_dir))

TypeError: __init__() missing 2 required positional arguments: 'message' and 'code'

# Object-oriented metrics

`tfe.metrics` are stored as objects. Update a metric by passing the new data to the callable, and retrieve the result using the `tfe.metrics.result` method, for example:

In [73]:
# Running mean: every call feeds more values into the metric, and result()
# reports the mean of everything seen so far (0, 5 -> 2.5; then 8, 9 -> 5.5).
m = tfe.metrics.Mean("loss")
m(0)
m(5)
m.result()  # => 2.5
m([8, 9])
# Only this last expression is displayed as the cell's output.
m.result()  # => 5.5

<tf.Tensor: id=683437, shape=(), dtype=float64, numpy=5.5>

# Summaries and TensorBoard

`tf.contrib.summary` is compatible with both eager and graph execution environments.

```py
writer = tf.contrib.summary.create_file_writer(logdir)
global_step = tf.train.get_or_create_global_step()  # return global step var

writer.set_as_default()

for _ in range(iterations):
    global_step.assign_add(1)
    # Must include a record_summaries method
    with tf.contrib.summary.record_summaries_every_n_global_steps(100):
        # your model code goes here
        tf.contrib.summary.scalar('loss', loss)
            ...
```

# Performance

In [74]:
import time


def measure(x, steps):
    """Return the wall-clock seconds taken by `steps` chained products x <- matmul(x, x).

    One untimed matmul is issued first as a warm-up.
    """
    # TensorFlow initializes a GPU the first time it's used, exclude from timing.
    tf.matmul(x, x)
    started = time.time()
    for _step in range(steps):
        x = tf.matmul(x, x)
        x.numpy()  # Make sure to execute op and not just enqueue it
    finished = time.time()
    return finished - started


# Benchmark settings: matrix shape and number of chained multiplications.
shape = (1000, 1000)
steps = 200
print("Time to multiply a {} matrix by itself {} times:".format(shape, steps))

# Run on CPU:
with tf.device("/cpu:0"):
    print("CPU: {} secs".format(measure(tf.random_normal(shape), steps)))

# Run on GPU, if available:
if tfe.num_gpus() > 0:
    with tf.device("/gpu:0"):
        print("GPU: {} secs".format(measure(tf.random_normal(shape), steps)))
else:
    print("GPU: not found")

Time to multiply a (1000, 1000) matrix by itself 200 times:
CPU: 4.022002458572388 secs
GPU: 2.2669179439544678 secs


`tf.Tensor` 对象可以被复制到不同的设备来执行其操作：

```py
x = tf.random_normal([10, 10])

x_gpu0 = x.gpu()
x_cpu = x.cpu()

_ = tf.matmul(x_cpu, x_cpu)    # Runs on CPU
_ = tf.matmul(x_gpu0, x_gpu0)  # Runs on GPU:0

if tfe.num_gpus() > 1:
    x_gpu1 = x.gpu(1)
    _ = tf.matmul(x_gpu1, x_gpu1)  # Runs on GPU:1
```

# Use eager execution in a graph environment

In [13]:
def my_py_func(x):
    """Multiply `x` by itself with tf.matmul, print the product and return it."""
    product = tf.matmul(x, x)  # You can use tf ops
    print(product)  # but it's eager!
    return product


x = tf.constant([[2.0]])
# Call eager function in graph!
# tfe.py_func wraps my_py_func with inputs [x] and a declared output dtype of tf.float32.
pf = tfe.py_func(my_py_func, [x], tf.float32)

tf.Tensor([[ 4.]], shape=(1, 1), dtype=float32)
