From e9363dfd02d5d87d219649d9f34f0c6e89dd7fda Mon Sep 17 00:00:00 2001
From: Yuxin Wu
Date: Mon, 15 Oct 2018 22:56:12 -0700
Subject: [PATCH] update docs

---
 docs/tutorial/extend/callback.md         | 16 +++++++++++++---
 docs/tutorial/inference.md               |  6 +++++-
 tensorpack/callbacks/inference_runner.py |  4 +++-
 tensorpack/train/base.py                 | 24 +++++++++++++++++++++++-
 4 files changed, 44 insertions(+), 6 deletions(-)

diff --git a/docs/tutorial/extend/callback.md b/docs/tutorial/extend/callback.md
index dc3efab6a..d92632cd4 100644
--- a/docs/tutorial/extend/callback.md
+++ b/docs/tutorial/extend/callback.md
@@ -79,11 +79,15 @@ You can overwrite any of the following methods in the new callback:
     return tf.train.SessionRunArgs(fetches=my_op)
     ```
 
-  The training loops would become `sess.run([training_op, my_op])`.
+  The training loop would then become equivalent to `sess.run([training_op, my_op])`.
   However, if you write `my_op.run()` in `_trigger_step`, the training loop would become
   `sess.run(training_op); sess.run(my_op);`.
   Usually the difference matters, please choose carefully.
+
+  If you want to run ops that depend on your inputs, it's better to run them
+  __along with__ the training iteration, to avoid wasting a datapoint and
+  messing up the hooks of the `InputSource`.
 
 * `_trigger_step(self)`
@@ -107,7 +111,13 @@ You can overwrite any of the following methods in the new callback:
 * Access tensors / ops (details mentioned above):
   * For existing tensors/ops created in the tower, access them through
     [self.trainer.towers](../../modules/train.html#tensorpack.train.TowerTrainer.towers).
   * Extra tensors/ops have to be created in `_setup_graph` callback method.
-* Access the current graph and session by `self.trainer.graph` and `self.trainer.sess`.
+* Access the current graph and session by `self.trainer.graph`,
+  `self.trainer.sess`, or `self.trainer.hooked_sess`.
+  Note that calling `(hooked_)sess.run` to evaluate tensors may have unexpected
+  effects in certain scenarios.
+  In general, use `sess.run` to evaluate tensors that do not depend on the inputs,
+  and use `_{before,after}_run` to evaluate tensors that do depend on the inputs,
+  along with the training iteration.
 * Write stuff to the monitor backend, by `self.trainer.monitors.put_xxx`.
   The monitors might direct your events to TensorFlow events file, JSON file, stdout, etc.
   You can access history monitor data as well.
   See the docs for [Monitors](../../modules/callbacks.html#tensorpack.callbacks.Monitors)
@@ -118,7 +128,7 @@ You can overwrite any of the following methods in the new callback:
 ### Typical Steps about Writing/Using a Callback
 
 * Define the callback in `__init__`, prepare for it in `_setup_graph, _before_train`.
-* Know whether you want to do something __along with__ the session run or not.
+* Know whether you want to do something __along with__ the training iterations or not.
   If yes, implement the logic with `_{before,after}_run`.
   Otherwise, implement in `_trigger`, or `_trigger_step`.
 * You can choose to only implement "what to do", and leave "when to do" to
diff --git a/docs/tutorial/inference.md b/docs/tutorial/inference.md
index 258b5424c..11337d9c2 100644
--- a/docs/tutorial/inference.md
+++ b/docs/tutorial/inference.md
@@ -51,12 +51,16 @@ with TowerContext('', is_training=False):
    training settings (queues, iterators, summaries, evaluations, multi-gpu replications).
    Therefore it is usually wrong to import a training metagraph for inference.
+   It's especially error-prone to load a metagraph on top of a non-empty graph.
+   The potential name conflicts between the current graph and the nodes in the
+   metagraph can lead to esoteric bugs or sometimes completely ruin the model.
+
    It's also very common to change the graph for inference.
    For example, you may need a different data layout for CPU inference,
    or you may need placeholders in the inference graph (which may not even exist in
    the training graph). However metagraph is not designed to be easily modified at all.
-   To do inference, it's best to recreate a clean graph (and save it if needed) by yourself.
+   For the above reasons, it's best to do inference by recreating a clean graph (and saving it if needed) yourself.
   ```
 
 ### Step 2: load the checkpoint
diff --git a/tensorpack/callbacks/inference_runner.py b/tensorpack/callbacks/inference_runner.py
index 4d8be3fc0..73ae3fe16 100644
--- a/tensorpack/callbacks/inference_runner.py
+++ b/tensorpack/callbacks/inference_runner.py
@@ -126,7 +126,9 @@ def __init__(self, input, infs, tower_name='InferenceTower', tower_func=None, de
             device (int): the device to use
         """
         if isinstance(input, DataFlow):
-            input = FeedInput(input, infinite=True)    # TODO a better way to handle inference size
+            # use infinite=False so that a DataFlow without a known size can stop normally
+            # TODO a better way to handle inference size
+            input = FeedInput(input, infinite=False)
         assert isinstance(input, InputSource), input
         assert not isinstance(input, StagingInput), input
         self._tower_name = tower_name
diff --git a/tensorpack/train/base.py b/tensorpack/train/base.py
index f75f250a0..876343c1c 100644
--- a/tensorpack/train/base.py
+++ b/tensorpack/train/base.py
@@ -104,13 +104,35 @@ class Trainer(object):
     """
     The ``tf.Session`` object the trainer is using.
     Available after :meth:`initialize()`.
+
+    Using ``trainer.sess.run`` to evaluate tensors that depend on the inputs
+    can lead to unexpected effects:
+
+    For example, if you use ``trainer.sess.run`` to evaluate a tensor that depends on
+    inputs coming from a ``StagingArea``,
+    this will take a datapoint from the ``StagingArea``, making the ``StagingArea``
+    empty and, as a result, making the training hang.
     """
 
     hooked_sess = None
     """
     The ``tf.train.MonitoredSession`` object the trainer is using.
-    It contains all the hooks the callbacks have registered.
+    It contains all the ``before_run/after_run`` hooks the callbacks have registered.
+    It is used for running the training iterations.
     Available after :meth:`initialize()`.
+
+    Note that using ``hooked_sess.run`` will evaluate all the hooks,
+    just like running a training iteration. It may do the following:
+
+    1. Take a datapoint from the ``InputSource``
+    2. Increase the ``global_step``
+    3. Evaluate some summaries
+
+    Typically you do not want to use ``hooked_sess.run`` in callbacks,
+    because it is for the "training iteration". If you just want to evaluate
+    some tensors, use ``sess.run`` if the tensors do not depend on the inputs,
+    or more generally, use ``before_run/after_run`` to evaluate the tensors
+    **along with** the training iterations.
     """
 
     def __init__(self):
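
To illustrate the convention described by the callback.md changes above (evaluate input-dependent tensors __along with__ the training iteration rather than in a separate `sess.run`), here is a minimal sketch of such a callback. It relies only on the `Callback` interface and trainer attributes mentioned in the patch; the tensor name and the monitor key are hypothetical placeholders:

```python
import tensorflow as tf
from tensorpack import Callback


class TrackTensorAlongside(Callback):
    """Fetch an input-dependent tensor in the same sess.run as the
    training op, so no extra datapoint is consumed from the InputSource."""

    def __init__(self, tensor_name='tower0/total_loss:0'):
        # hypothetical name; use a tensor that actually exists in your graph
        self._tensor_name = tensor_name

    def _setup_graph(self):
        self._tensor = self.trainer.graph.get_tensor_by_name(self._tensor_name)

    def _before_run(self, ctx):
        # merged into the training iteration:
        # effectively sess.run([training_op, self._tensor])
        return tf.train.SessionRunArgs(fetches=self._tensor)

    def _after_run(self, run_context, run_values):
        # run_values.results holds the value fetched in _before_run
        self.trainer.monitors.put_scalar('tracked_tensor', run_values.results)
```

Because the fetch rides on the training iteration's own session run, the hooks of the `InputSource` see exactly one datapoint per step, which is the failure mode the base.py docstrings above caution about.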
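Likewise, to make the "recreate a clean graph" advice from the inference.md changes concrete, here is a minimal sketch using `OfflinePredictor`, which builds a fresh inference graph from a `ModelDesc` and then loads the checkpoint into it, instead of importing a training metagraph. The model definition, tensor names, and checkpoint path are hypothetical placeholders:

```python
import numpy as np
import tensorflow as tf
from tensorpack import ModelDesc
from tensorpack.predict import OfflinePredictor, PredictConfig
from tensorpack.tfutils.sessinit import SaverRestore


class InferenceModel(ModelDesc):
    """A hypothetical model; in practice, reuse the graph definition from
    training, possibly with a different data layout or extra placeholders."""

    def inputs(self):
        # a placeholder that may not even exist in the training graph
        return [tf.placeholder(tf.float32, (None, 28, 28), 'input')]

    def build_graph(self, image):
        logits = tf.layers.dense(tf.layers.flatten(image), 10)
        tf.nn.softmax(logits, name='prob')


# recreate a clean graph from scratch; nothing is imported from a metagraph
config = PredictConfig(
    model=InferenceModel(),
    session_init=SaverRestore('/path/to/checkpoint'),  # hypothetical path
    input_names=['input'],
    output_names=['prob'],
)
predictor = OfflinePredictor(config)
prob, = predictor(np.zeros((1, 28, 28), dtype=np.float32))
```

The graph is built from scratch in inference mode, corresponding to the `TowerContext('', is_training=False)` pattern quoted in the inference.md hunk, so it can freely use a different data layout or placeholders that never existed during training.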