update docs
ppwwyyxx committed Jun 29, 2018
1 parent 801e292 commit a3581e7
Showing 18 changed files with 60 additions and 54 deletions.
4 changes: 3 additions & 1 deletion docs/tutorial/trainer.md
@@ -39,8 +39,10 @@ The tower function needs to follow some conventions:
To respect variable reuse, use `tf.get_variable` instead of `tf.Variable` in the function.
On the other hand, for non-trainable variables, it's OK to use
`tf.Variable` to ensure creation of new variables in each tower even when `reuse=True`.
4. It will always be called under a `TowerContext`, which can be accessed by `get_current_tower_contxt()`.
4. It will always be called under a `TowerContext`, which can be accessed by `get_current_tower_context()`.
The context contains information about training/inference mode, reuse, etc.
5. It cannot create scopes or variables containing the name 'tower', as it is
reserved for special use.
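
For reference, a minimal sketch (not part of this commit) of a tower function that follows the conventions above; it assumes TF1-era `tf.layers` and the import path used by tensorpack around this time, and the names `tower_func`, `image`, `label`, and `fc` are illustrative:

```python
# Sketch only: a tower function respecting conventions 3-5 above (illustrative names).
import tensorflow as tf
from tensorpack.tfutils.tower import get_current_tower_context

def tower_func(image, label):
    ctx = get_current_tower_context()  # convention 4: always called under a TowerContext
    # Trainable variables are created through tf.get_variable (tf.layers does this
    # internally), so reuse across towers works (convention 3); no scope or variable
    # is named 'tower' (convention 5).
    logits = tf.layers.dense(tf.layers.flatten(image), 10, name='fc')
    cost = tf.losses.sparse_softmax_cross_entropy(labels=label, logits=logits)
    if ctx.is_training:
        return cost
    # In inference mode the logits tensor itself is the useful output.
```

Because `tf.layers` builds its variables with `tf.get_variable`, the same function can be re-entered under `reuse=True` for additional towers without creating duplicates.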

These conventions are easy to follow, and most layer wrappers (e.g.,
tf.layers/slim/tensorlayer) do follow them. Note that certain Keras layers do not
16 changes: 8 additions & 8 deletions examples/A3C-Gym/train-atari.py
@@ -19,7 +19,7 @@
from tensorpack.utils.concurrency import ensure_proc_terminate, start_proc_mask_signal
from tensorpack.utils.serialize import dumps
from tensorpack.tfutils.gradproc import MapGradient, SummaryGradient
from tensorpack.utils.gpu import get_nr_gpu
from tensorpack.utils.gpu import get_num_gpu


import gym
@@ -144,10 +144,10 @@ def __init__(self, pipe_c2s, pipe_s2c, gpus):

def _setup_graph(self):
# create predictors on the available predictor GPUs.
nr_gpu = len(self._gpus)
num_gpu = len(self._gpus)
predictors = [self.trainer.get_predictor(
['state'], ['policy', 'pred_value'],
self._gpus[k % nr_gpu])
self._gpus[k % num_gpu])
for k in range(PREDICTOR_THREAD)]
self.async_predictor = MultiThreadAsyncPredictor(
predictors, batch_size=PREDICT_BATCH_SIZE)
@@ -213,16 +213,16 @@ def train():
logger.set_logger_dir(dirname)

# assign GPUs for training & inference
nr_gpu = get_nr_gpu()
num_gpu = get_num_gpu()
global PREDICTOR_THREAD
if nr_gpu > 0:
if nr_gpu > 1:
if num_gpu > 0:
if num_gpu > 1:
# use half gpus for inference
predict_tower = list(range(nr_gpu))[-nr_gpu // 2:]
predict_tower = list(range(num_gpu))[-num_gpu // 2:]
else:
predict_tower = [0]
PREDICTOR_THREAD = len(predict_tower) * PREDICTOR_THREAD_PER_GPU
train_tower = list(range(nr_gpu))[:-nr_gpu // 2] or [0]
train_tower = list(range(num_gpu))[:-num_gpu // 2] or [0]
logger.info("[Batch-A3C] Train on gpu {} and infer on gpu {}".format(
','.join(map(str, train_tower)), ','.join(map(str, predict_tower))))
else:
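As an illustrative check of the train/inference GPU split in `train()` above (not part of the commit), the slicing works out as follows:

```python
# Reproduces the slicing logic above for a few GPU counts (illustrative only).
for num_gpu in (2, 3, 4):
    predict_tower = list(range(num_gpu))[-num_gpu // 2:]
    train_tower = list(range(num_gpu))[:-num_gpu // 2] or [0]
    print(num_gpu, train_tower, predict_tower)
# 2 [0] [1]
# 3 [0] [1, 2]
# 4 [0, 1] [2, 3]
```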
4 changes: 2 additions & 2 deletions examples/DoReFa-Net/alexnet-dorefa.py
@@ -15,7 +15,7 @@
from tensorpack.tfutils.summary import add_param_summary
from tensorpack.tfutils.varreplace import remap_variables
from tensorpack.dataflow import dataset
from tensorpack.utils.gpu import get_nr_gpu
from tensorpack.utils.gpu import get_num_gpu

from imagenet_utils import get_imagenet_dataflow, fbresnet_augmentor, ImageNetModel
from dorefa import get_dorefa, ternarize
@@ -215,7 +215,7 @@ def run_image(model, sess_init, inputs):
run_image(Model(), DictRestore(dict(np.load(args.load))), args.run)
sys.exit()

nr_tower = max(get_nr_gpu(), 1)
nr_tower = max(get_num_gpu(), 1)
BATCH_SIZE = TOTAL_BATCH_SIZE // nr_tower
logger.set_logger_dir(os.path.join(
'train_log', 'alexnet-dorefa-{}'.format(args.dorefa)))
4 changes: 2 additions & 2 deletions examples/DynamicFilterNetwork/steering-filter.py
@@ -257,8 +257,8 @@ def get_config():
args = parser.parse_args()

with change_gpu(args.gpu):
NR_GPU = len(args.gpu.split(','))
NGPU = len(args.gpu.split(','))
config = get_config()
if args.load:
config.session_init = SaverRestore(args.load)
launch_train_with_config(config, SyncMultiGPUTrainer(NR_GPU))
launch_train_with_config(config, SyncMultiGPUTrainer(NGPU))
4 changes: 2 additions & 2 deletions examples/GAN/BEGAN.py
@@ -5,7 +5,7 @@

from tensorpack import *
from tensorpack.tfutils.summary import add_moving_summary
from tensorpack.utils.gpu import get_nr_gpu
from tensorpack.utils.gpu import get_num_gpu
from tensorpack.tfutils.scope_utils import auto_reuse_variable_scope
import tensorflow as tf

@@ -137,7 +137,7 @@ def optimizer(self):

input = QueueInput(DCGAN.get_data())
model = Model()
nr_tower = max(get_nr_gpu(), 1)
nr_tower = max(get_num_gpu(), 1)
if nr_tower == 1:
trainer = GANTrainer(input, model)
else:
12 changes: 6 additions & 6 deletions examples/GAN/GAN.py
@@ -149,10 +149,10 @@ class MultiGPUGANTrainer(TowerTrainer):
"""
A replacement of GANTrainer (optimize d and g one by one) with multi-gpu support.
"""
def __init__(self, nr_gpu, input, model):
def __init__(self, num_gpu, input, model):
super(MultiGPUGANTrainer, self).__init__()
assert nr_gpu > 1
raw_devices = ['/gpu:{}'.format(k) for k in range(nr_gpu)]
assert num_gpu > 1
raw_devices = ['/gpu:{}'.format(k) for k in range(num_gpu)]

# Setup input
input = StagingInput(input)
@@ -167,13 +167,13 @@ def get_cost(*inputs):
self.tower_func = TowerFuncWrapper(get_cost, model.get_inputs_desc())
devices = [LeastLoadedDeviceSetter(d, raw_devices) for d in raw_devices]
cost_list = DataParallelBuilder.build_on_towers(
list(range(nr_gpu)),
list(range(num_gpu)),
lambda: self.tower_func(*input.get_input_tensors()),
devices)
# Simply average the cost here. It might be faster to average the gradients
with tf.name_scope('optimize'):
d_loss = tf.add_n([x[0] for x in cost_list]) * (1.0 / nr_gpu)
g_loss = tf.add_n([x[1] for x in cost_list]) * (1.0 / nr_gpu)
d_loss = tf.add_n([x[0] for x in cost_list]) * (1.0 / num_gpu)
g_loss = tf.add_n([x[1] for x in cost_list]) * (1.0 / num_gpu)

opt = model.get_optimizer()
# run one d_min after one g_min
4 changes: 2 additions & 2 deletions examples/HED/hed.py
@@ -12,7 +12,7 @@

from tensorpack import *
from tensorpack.dataflow import dataset
from tensorpack.utils.gpu import get_nr_gpu
from tensorpack.utils.gpu import get_num_gpu
from tensorpack.tfutils import optimizer, gradproc
from tensorpack.tfutils.summary import add_moving_summary, add_param_summary

@@ -256,4 +256,4 @@ def run(model_path, image_path, output):
config.session_init = get_model_loader(args.load)
launch_train_with_config(
config,
SyncMultiGPUTrainer(max(get_nr_gpu(), 1)))
SyncMultiGPUTrainer(max(get_num_gpu(), 1)))
12 changes: 6 additions & 6 deletions examples/ImageNetModels/inception-bn.py
@@ -11,14 +11,14 @@
from tensorpack import *
from tensorpack.tfutils.summary import add_moving_summary
from tensorpack.dataflow import dataset
from tensorpack.utils.gpu import get_nr_gpu
from tensorpack.utils.gpu import get_num_gpu

from imagenet_utils import fbresnet_augmentor, get_imagenet_dataflow

# Change them if using different number of GPUs.
TOTAL_BATCH_SIZE = 64 * 6
NR_GPU = 6
BATCH_SIZE = TOTAL_BATCH_SIZE // NR_GPU
NUM_GPU = 6
BATCH_SIZE = TOTAL_BATCH_SIZE // NUM_GPU
INPUT_SHAPE = 224


@@ -169,6 +169,6 @@ def get_config():
config = get_config()
if args.load:
config.session_init = SaverRestore(args.load)
nr_tower = get_nr_gpu()
assert nr_tower == NR_GPU
launch_train_with_config(config, SyncMultiGPUTrainer(NR_GPU))
nr_tower = get_num_gpu()
assert nr_tower == NUM_GPU
launch_train_with_config(config, SyncMultiGPUTrainer(NUM_GPU))
4 changes: 2 additions & 2 deletions examples/ImageNetModels/shufflenet.py
@@ -14,7 +14,7 @@
from tensorpack.dataflow import imgaug
from tensorpack.tfutils import argscope, get_model_loader, model_utils
from tensorpack.tfutils.scope_utils import under_name_scope
from tensorpack.utils.gpu import get_nr_gpu
from tensorpack.utils.gpu import get_num_gpu

from imagenet_utils import (
get_imagenet_dataflow,
@@ -212,7 +212,7 @@ def get_config(model, nr_tower):
else:
logger.set_logger_dir(os.path.join('train_log', 'shufflenet'))

nr_tower = max(get_nr_gpu(), 1)
nr_tower = max(get_num_gpu(), 1)
config = get_config(model, nr_tower)
if args.load:
config.session_init = get_model_loader(args.load)
6 changes: 3 additions & 3 deletions examples/ImageNetModels/vgg16.py
@@ -10,7 +10,7 @@
from tensorpack import *
from tensorpack.tfutils import argscope
from tensorpack.tfutils.summary import *
from tensorpack.utils.gpu import get_nr_gpu
from tensorpack.utils.gpu import get_num_gpu

from imagenet_utils import (
ImageNetModel, get_imagenet_dataflow, fbresnet_augmentor)
@@ -108,7 +108,7 @@ def get_data(name, batch):


def get_config():
nr_tower = max(get_nr_gpu(), 1)
nr_tower = max(get_num_gpu(), 1)
batch = args.batch
total_batch = batch * nr_tower
assert total_batch >= 256 # otherwise the learning rate warmup is wrong.
@@ -159,6 +159,6 @@ def get_config():
logger.set_logger_dir(os.path.join('train_log', 'vgg16-norm={}'.format(args.norm)))

config = get_config()
nr_tower = max(get_nr_gpu(), 1)
nr_tower = max(get_num_gpu(), 1)
trainer = SyncMultiGPUTrainerReplicated(nr_tower)
launch_train_with_config(config, trainer)
2 changes: 1 addition & 1 deletion examples/README.md
@@ -27,8 +27,8 @@ These are all the toy examples in tensorpack. They are supposed to be just demos
| --- | --- |
| Train [ResNet](ResNet), [ShuffleNet and other models](ImageNetModels) on ImageNet | reproduce paper |
| [Train Faster-RCNN / Mask-RCNN on COCO](FasterRCNN) | reproduce paper |
| [DoReFa-Net: training binary / low-bitwidth CNN on ImageNet](DoReFa-Net) | reproduce paper |
| [Generative Adversarial Network(GAN) variants](GAN), including DCGAN, InfoGAN, <br/> Conditional GAN, WGAN, BEGAN, DiscoGAN, Image to Image, CycleGAN | visually reproduce |
| [DoReFa-Net: training binary / low-bitwidth CNN on ImageNet](DoReFa-Net) | reproduce paper |
| [Fully-convolutional Network for Holistically-Nested Edge Detection(HED)](HED) | visually reproduce |
| [Spatial Transformer Networks on MNIST addition](SpatialTransformer) | reproduce paper |
| [Visualize CNN saliency maps](Saliency) | visually reproduce |
6 changes: 3 additions & 3 deletions examples/ResNet/cifar10-resnet.py
@@ -9,7 +9,7 @@

from tensorpack import *
from tensorpack.tfutils.summary import add_moving_summary, add_param_summary
from tensorpack.utils.gpu import get_nr_gpu
from tensorpack.utils.gpu import get_num_gpu
from tensorpack.dataflow import dataset

import tensorflow as tf
@@ -170,5 +170,5 @@ def get_data(train_or_test):
max_epoch=400,
session_init=SaverRestore(args.load) if args.load else None
)
nr_gpu = max(get_nr_gpu(), 1)
launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(nr_gpu))
num_gpu = max(get_num_gpu(), 1)
launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(num_gpu))
6 changes: 3 additions & 3 deletions examples/ResNet/imagenet-resnet.py
@@ -12,7 +12,7 @@
TrainConfig, SyncMultiGPUTrainerReplicated, launch_train_with_config)
from tensorpack.dataflow import FakeData
from tensorpack.tfutils import argscope, get_model_loader
from tensorpack.utils.gpu import get_nr_gpu
from tensorpack.utils.gpu import get_num_gpu

from imagenet_utils import (
fbresnet_augmentor, get_imagenet_dataflow, ImageNetModel,
@@ -57,7 +57,7 @@ def get_data(name, batch):


def get_config(model, fake=False):
nr_tower = max(get_nr_gpu(), 1)
nr_tower = max(get_num_gpu(), 1)
assert args.batch % nr_tower == 0
batch = args.batch // nr_tower

@@ -143,5 +143,5 @@ def get_config(model, fake=False):
config = get_config(model, fake=args.fake)
if args.load:
config.session_init = get_model_loader(args.load)
trainer = SyncMultiGPUTrainerReplicated(max(get_nr_gpu(), 1))
trainer = SyncMultiGPUTrainerReplicated(max(get_num_gpu(), 1))
launch_train_with_config(config, trainer)
8 changes: 4 additions & 4 deletions examples/Saliency/CAM-resnet.py
@@ -16,7 +16,7 @@
from tensorpack.tfutils import optimizer, gradproc
from tensorpack.tfutils.symbolic_functions import *
from tensorpack.tfutils.summary import *
from tensorpack.utils.gpu import get_nr_gpu
from tensorpack.utils.gpu import get_num_gpu
from tensorpack.utils import viz

from imagenet_utils import (
@@ -157,8 +157,8 @@ def viz_cam(model_file, data_dir):
if args.gpu:
os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

nr_gpu = get_nr_gpu()
BATCH_SIZE = TOTAL_BATCH_SIZE // nr_gpu
num_gpu = get_num_gpu()
BATCH_SIZE = TOTAL_BATCH_SIZE // num_gpu

if args.cam:
BATCH_SIZE = 128 # something that can run on one gpu
@@ -169,4 +169,4 @@ def viz_cam(model_file, data_dir):
config = get_config()
if args.load:
config.session_init = get_model_loader(args.load)
launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(nr_gpu))
launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(num_gpu))
3 changes: 2 additions & 1 deletion examples/SuperResolution/enet-pat.py
@@ -13,6 +13,7 @@
from tensorpack.tfutils.scope_utils import auto_reuse_variable_scope
from tensorpack.tfutils.summary import add_moving_summary
from tensorpack.utils import logger
from tensorpack.utils.gpu import get_num_gpu
from data_sampler import (
ImageDecode, ImageDataFromZIPFile,
RejectTooSmallImages, CenterSquareResize)
@@ -286,7 +287,7 @@ def get_data(file_name):
param_dict = {'VGG19/' + name: value for name, value in six.iteritems(param_dict)}
session_init = DictRestore(param_dict)

nr_tower = max(get_nr_gpu(), 1)
nr_tower = max(get_num_gpu(), 1)
data = QueueInput(get_data(args.data))
model = Model()

4 changes: 2 additions & 2 deletions examples/keras/README.md
@@ -1,10 +1,10 @@

## Keras + Tensorpack

Use Keras to define a model a train it with efficient tensorpack trainers.
Use Keras to define a model and train it with efficient tensorpack trainers.

### Why?
Keras alone has various overhead. In particular, it is not efficient when working on large models.
Keras alone has various overhead. In particular, it is not efficient with large models.
The article [Towards Efficient Multi-GPU Training in Keras with TensorFlow](https://medium.com/rossum/towards-efficient-multi-gpu-training-in-keras-with-tensorflow-8a0091074fb2)
has mentioned some of it.

10 changes: 5 additions & 5 deletions examples/keras/imagenet-resnet-keras.py
@@ -11,7 +11,7 @@
from tensorpack import InputDesc, SyncMultiGPUTrainerReplicated
from tensorpack.dataflow import FakeData, MapDataComponent
from tensorpack.utils import logger
from tensorpack.utils.gpu import get_nr_gpu
from tensorpack.utils.gpu import get_num_gpu
from tensorpack.contrib.keras import KerasModel
from tensorpack.callbacks import *
from tensorflow.python.keras.layers import *
@@ -141,12 +141,12 @@ def image_preprocess(image):

tf.keras.backend.set_image_data_format('channels_first')

nr_gpu = get_nr_gpu()
num_gpu = get_num_gpu()
if args.fake:
df_train = FakeData([[64, 224, 224, 3], [64, 1000]], 5000, random=False, dtype='uint8')
df_val = FakeData([[64, 224, 224, 3], [64, 1000]], 5000, random=False)
else:
batch_size = TOTAL_BATCH_SIZE // nr_gpu
batch_size = TOTAL_BATCH_SIZE // num_gpu
assert args.data is not None
df_train = get_imagenet_dataflow(
args.data, 'train', batch_size, fbresnet_augmentor(True))
@@ -164,7 +164,7 @@ def one_hot(label):
inputs_desc=[InputDesc(tf.uint8, [None, 224, 224, 3], 'images')],
targets_desc=[InputDesc(tf.float32, [None, 1000], 'labels')],
input=df_train,
trainer=SyncMultiGPUTrainerReplicated(nr_gpu))
trainer=SyncMultiGPUTrainerReplicated(num_gpu))

lr = tf.get_variable('learning_rate', initializer=0.1, trainable=False)
tf.summary.scalar('lr', lr)
@@ -188,7 +188,7 @@ def one_hot(label):
if not args.fake:
callbacks.append(
DataParallelInferenceRunner(
df_val, ScalarStats(['categorical_accuracy']), nr_gpu))
df_val, ScalarStats(['categorical_accuracy']), num_gpu))

M.fit(
steps_per_epoch=100 if args.fake else 1281167 // TOTAL_BATCH_SIZE,
5 changes: 4 additions & 1 deletion tensorpack/models/batch_norm.py
@@ -96,8 +96,11 @@ def BatchNorm(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5,
When set to "nccl", this layer must be used under tensorpack multi-gpu trainers,
and it then uses per-machine (multiple GPU) statistics to normalize.
Note that this implementation averages the per-tower E[x] and E[x^2] among towers to compute
global mean&variance. The result is the global mean&variance only if each tower has the same batch size.
This option has no effect when not training.
The option is also known as "Cross-GPU BatchNorm" as mentioned in https://arxiv.org/abs/1711.07240.
This option is also known as "Cross-GPU BatchNorm" as mentioned in https://arxiv.org/abs/1711.07240.
Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/18222
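
An illustrative numpy check of the caveat above (not from this diff): averaging per-tower E[x] and E[x^2] recovers the global mean and variance only when every tower sees the same batch size.

```python
# Numpy sketch of the moment averaging described above (assumes equal per-tower batches).
import numpy as np

towers = [np.random.randn(32), np.random.randn(32)]      # two towers, equal batch size
ex = np.mean([t.mean() for t in towers])                  # average of per-tower E[x]
ex2 = np.mean([(t ** 2).mean() for t in towers])          # average of per-tower E[x^2]
mean, var = ex, ex2 - ex ** 2

all_x = np.concatenate(towers)
assert np.allclose(mean, all_x.mean()) and np.allclose(var, all_x.var())
# With unequal per-tower batch sizes (e.g. 32 and 8), these equalities no longer hold.
```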
Variable Names:
