<a href="https://colab.research.google.com/github/sourcecode369/100-days-of-ml-code/blob/master/tensorflow_2.0_docs/TensorFlow%20Core/Guide/Save%20a%20Model/TensorFlow_2_0_Save_a_Model_Checkpoints.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Install and importing dependencies

In [71]:
%%time
!pip install --upgrade tensorflow
import tensorflow as tf
print("TensorFlow version: ",tf.__version__)

Requirement already up-to-date: tensorflow in /usr/local/lib/python3.6/dist-packages (2.0.0)
TensorFlow version:  2.0.0
CPU times: user 168 ms, sys: 29.1 ms, total: 197 ms
Wall time: 8.2 s


### Overview

The phrase "Saving a TensorFlow model" typically means one of two things:

1. Checkpoints, OR
2. SavedModel.

Checkpoints capture the exact value of all parameters (tf.Variable objects) used by a model. Checkpoints do not contain any description of the computation defined by the model and thus are typically only useful when source code that will use the saved parameter values is available.

The SavedModel format on the other hand includes a serialized description of the computation defined by the model in addition to the parameter values (checkpoint). Models in this format are independent of the source code that created the model. They are thus suitable for deployment via TensorFlow Serving, TensorFlow Lite, TensorFlow.js, or programs in other programming languages (the C, C++, Java, Go, Rust, C# etc. TensorFlow APIs).

This guide covers APIs for writing and reading checkpoints.

In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf

In [0]:
class Net(tf.keras.Model):
    def __init__(self):
        super(Net, self).__init__()
        self.l1 = tf.keras.layers.Dense(5)
    def call(self, x):
        return self.l1(x)

In [0]:
net = Net()

### Saving from tf.keras training APIs

In [0]:
net.save_weights('easy_checkpoints')

### Writing Checkpoints

#### Manual checkpointing

In [0]:
def toy_dataset():
    inputs = tf.range(10.)[:, None]
    labels = inputs * 5 + tf.range(5.)[None, :]
    dataset = tf.data.Dataset.from_tensor_slices(dict(x=inputs, y=labels)).shuffle(buffer_size=2).repeat(100).batch(2)
    dataset = dataset.prefetch(buffer_size = tf.data.experimental.AUTOTUNE)
    return dataset

In [0]:
def train_step(net, example, optimizer):
    with tf.GradientTape() as tape:
        output = net(example['x'])
        loss = tf.reduce_mean(tf.abs(output - example['y']))
    variables = net.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return loss

#### Create the checkpoint objects

In [0]:
opt = tf.keras.optimizers.Adam(learning_rate=0.001)
ckpt = tf.train.Checkpoint(step=tf.Variable(1), optimizer=opt, net=net)
manager = tf.train.CheckpointManager(ckpt, './tf_ckpts', max_to_keep=3)

#### Train and checkpoint the model

In [0]:
def train_and_checkpoint(net, manager):
    ckpt.restore(manager.latest_checkpoint)
    if manager.latest_checkpoint:
        print("Restored from {}".format(manager.latest_checkpoint))
    else:
        print("Initializing from scratch.")
    
    for example in toy_dataset():
        loss = train_step(net, example, opt)
        ckpt.step.assign_add(1)
        if int(ckpt.step) % 10 == 0:
            save_path = manager.save()
            print("Saved checkpoint for step {}:{}".format(int(ckpt.step), save_path))
            print("Loss {:1.2f}".format(loss.numpy()))

In [80]:
train_and_checkpoint(net, manager)

Restored from ./tf_ckpts/ckpt-155
Saved checkpoint for step 1560:./tf_ckpts/ckpt-156
Loss 28.63
Saved checkpoint for step 1570:./tf_ckpts/ckpt-157
Loss 28.55
Saved checkpoint for step 1580:./tf_ckpts/ckpt-158
Loss 23.53
Saved checkpoint for step 1590:./tf_ckpts/ckpt-159
Loss 26.74
Saved checkpoint for step 1600:./tf_ckpts/ckpt-160
Loss 26.66
Saved checkpoint for step 1610:./tf_ckpts/ckpt-161
Loss 28.22
Saved checkpoint for step 1620:./tf_ckpts/ckpt-162
Loss 28.13
Saved checkpoint for step 1630:./tf_ckpts/ckpt-163
Loss 26.43
Saved checkpoint for step 1640:./tf_ckpts/ckpt-164
Loss 26.35
Saved checkpoint for step 1650:./tf_ckpts/ckpt-165
Loss 24.65
Saved checkpoint for step 1660:./tf_ckpts/ckpt-166
Loss 26.19
Saved checkpoint for step 1670:./tf_ckpts/ckpt-167
Loss 27.72
Saved checkpoint for step 1680:./tf_ckpts/ckpt-168
Loss 27.64
Saved checkpoint for step 1690:./tf_ckpts/ckpt-169
Loss 27.56
Saved checkpoint for step 1700:./tf_ckpts/ckpt-170
Loss 24.28
Saved checkpoint for step 1710:./tf_

#### Restore and continue training

In [81]:
opt = tf.keras.optimizers.Adam(0.1)
net = Net()
ckpt = tf.train.Checkpoint(step=tf.Variable(1), optimizer=opt, net=net)
manager = tf.train.CheckpointManager(ckpt, './tf_ckpts', max_to_keep=3)

train_and_checkpoint(net, manager)

Restored from ./tf_ckpts/ckpt-205
Saved checkpoint for step 2060:./tf_ckpts/ckpt-206
Loss 21.62
Saved checkpoint for step 2070:./tf_ckpts/ckpt-207
Loss 20.11
Saved checkpoint for step 2080:./tf_ckpts/ckpt-208
Loss 21.47
Saved checkpoint for step 2090:./tf_ckpts/ckpt-209
Loss 24.25
Saved checkpoint for step 2100:./tf_ckpts/ckpt-210
Loss 19.90
Saved checkpoint for step 2110:./tf_ckpts/ckpt-211
Loss 24.09
Saved checkpoint for step 2120:./tf_ckpts/ckpt-212
Loss 24.01
Saved checkpoint for step 2130:./tf_ckpts/ckpt-213
Loss 23.92
Saved checkpoint for step 2140:./tf_ckpts/ckpt-214
Loss 23.84
Saved checkpoint for step 2150:./tf_ckpts/ckpt-215
Loss 23.76
Saved checkpoint for step 2160:./tf_ckpts/ckpt-216
Loss 23.68
Saved checkpoint for step 2170:./tf_ckpts/ckpt-217
Loss 22.20
Saved checkpoint for step 2180:./tf_ckpts/ckpt-218
Loss 23.51
Saved checkpoint for step 2190:./tf_ckpts/ckpt-219
Loss 20.66
Saved checkpoint for step 2200:./tf_ckpts/ckpt-220
Loss 23.35
Saved checkpoint for step 2210:./tf_

In [82]:
manager.checkpoints

['./tf_ckpts/ckpt-253', './tf_ckpts/ckpt-254', './tf_ckpts/ckpt-255']

In [83]:
!ls ./tf_ckpts/

checkpoint		      ckpt-254.data-00000-of-00001  ckpt-255.index
ckpt-253.data-00000-of-00001  ckpt-254.index
ckpt-253.index		      ckpt-255.data-00000-of-00001


### Loading Mechanics

In [85]:
to_restore = tf.Variable(tf.zeros([5]))
print(to_restore.numpy())
fake_layer = tf.train.Checkpoint(bias=to_restore)
fake_net = tf.train.Checkpoint(l1=fake_layer)
new_root = tf.train.Checkpoint(net=fake_net)
status = new_root.restore(tf.train.latest_checkpoint('./tf_ckpts/'))
print(to_restore.numpy())

[0. 0. 0. 0. 0.]
[2.23092   2.321593  2.4531858 2.5489972 2.5489972]


In [86]:
status.assert_existing_objects_matched()

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f82096f6908>

#### Delayed restorations

In [87]:
delayed_restore = tf.Variable(tf.zeros([1,5]))
print(delayed_restore.numpy())
fake_layer.kernel = delayed_restore
print(delayed_restore.numpy())

[[0. 0. 0. 0. 0.]]
[[3.1211607 1.8763008 2.483833  2.207475  3.0152123]]


#### Manually inspecting checkpoints

In [88]:
tf.train.list_variables(tf.train.latest_checkpoint('./tf_ckpts/'))

[('_CHECKPOINTABLE_OBJECT_GRAPH', []),
 ('net/l1/bias/.ATTRIBUTES/VARIABLE_VALUE', [5]),
 ('net/l1/bias/.OPTIMIZER_SLOT/optimizer/m/.ATTRIBUTES/VARIABLE_VALUE', [5]),
 ('net/l1/bias/.OPTIMIZER_SLOT/optimizer/v/.ATTRIBUTES/VARIABLE_VALUE', [5]),
 ('net/l1/kernel/.ATTRIBUTES/VARIABLE_VALUE', [1, 5]),
 ('net/l1/kernel/.OPTIMIZER_SLOT/optimizer/m/.ATTRIBUTES/VARIABLE_VALUE',
  [1, 5]),
 ('net/l1/kernel/.OPTIMIZER_SLOT/optimizer/v/.ATTRIBUTES/VARIABLE_VALUE',
  [1, 5]),
 ('optimizer/beta_1/.ATTRIBUTES/VARIABLE_VALUE', []),
 ('optimizer/beta_2/.ATTRIBUTES/VARIABLE_VALUE', []),
 ('optimizer/decay/.ATTRIBUTES/VARIABLE_VALUE', []),
 ('optimizer/iter/.ATTRIBUTES/VARIABLE_VALUE', []),
 ('optimizer/learning_rate/.ATTRIBUTES/VARIABLE_VALUE', []),
 ('save_counter/.ATTRIBUTES/VARIABLE_VALUE', []),
 ('step/.ATTRIBUTES/VARIABLE_VALUE', [])]

#### List and disctionary tracking

In [0]:
save = tf.train.Checkpoint()

In [0]:
save.listed = [tf.Variable(1.)]
save.listed.append(tf.Variable(2.))
save.mapped = {'one':save.listed[0]}
save.mapped["two"] = save.listed[1]

In [0]:
save_path = save.save("./tf_list_example")

In [0]:
restore = tf.train.Checkpoint()
v2 = tf.Variable(0.)
assert 0. == v2.numpy()

In [94]:
restore.mapped = {'two':v2}
restore.restore(save_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f82096f1c88>

In [0]:
assert 2. == v2.numpy()

In [96]:
restore.listed = []
print(restore.listed)  
v1 = tf.Variable(0.)
restore.listed.append(v1)
assert 1. == v1.numpy()

ListWrapper([])


### Saving object-based checkpoints with Estimator

In [0]:
import tensorflow.compat.v1 as tf_compat

In [99]:
def model_fn(features, labels, mode):
    net = Net()
    opt = tf.keras.optimizers.Adam(0.1)
    ckpt = tf.train.Checkpoint(step=tf_compat.train.get_global_step(),
                               optimizer=opt, net=net)
    with tf.GradientTape() as tape:
        output = net(features['x'])
        loss = tf.reduce_mean(tf.abs(output - features['y']))
    variables = net.trainable_variables
    gradients = tape.gradient(loss, variables)
    return tf.estimator.EstimatorSpec(
        mode, 
        loss = loss,
        train_op = tf.group(opt.apply_gradients(zip(gradients, variables)),
                            ckpt.step.assign_add(1)),
                            scaffold = tf_compat.train.Scaffold(saver=ckpt)
    )

tf.keras.backend.clear_session()
est = tf.estimator.Estimator(model_fn, './tf_estimator_example/')
est.train(toy_dataset, steps=10)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './tf_estimator_example/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f820990b518>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow

<tensorflow_estimator.python.estimator.estimator.EstimatorV2 at 0x7f820990b0b8>

In [100]:
opt = tf.keras.optimizers.Adam(0.1)
net = Net()
ckpt = tf.train.Checkpoint(
    step = tf.Variable(1, dtype=tf.int64), optimizer=opt, net = net
)
ckpt.restore(tf.train.latest_checkpoint('./tf_estimtor_example/'))
ckpt.step.numpy()

1

### Summary

TensorFlow objects provide an easy automatic mechanism for saving and restoring the values of variables they use.

