update docs
ppwwyyxx committed Jun 29, 2018
1 parent 801e292 commit a3581e7
Showing 18 changed files with 60 additions and 54 deletions.
4 changes: 3 additions & 1 deletion docs/tutorial/trainer.md
@@ -39,8 +39,10 @@ The tower function needs to follow some conventions:
To respect variable reuse, use `tf.get_variable` instead of `tf.Variable` in the function.
On the other hand, for non-trainable variables, it's OK to use
`tf.Variable` to ensure creation of new variables in each tower even when `reuse=True`.
4. It will always be called under a `TowerContext`, which can be accessed by `get_current_tower_contxt()`.
4. It will always be called under a `TowerContext`, which can be accessed by `get_current_tower_context()`.
The context contains information about training/inference mode, reuse, etc.
5. It cannot create scopes or variables containing the name 'tower', as it is
reserved for special use.
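
For reference, a minimal sketch (not part of this commit) of a tower function that follows the conventions above; it assumes TF1-era `tf.layers` and the import path used by tensorpack around this time, and the names `tower_func`, `image`, `label`, and `fc` are illustrative:

```python
# Sketch only: a tower function respecting conventions 3-5 above (illustrative names).
import tensorflow as tf
from tensorpack.tfutils.tower import get_current_tower_context

def tower_func(image, label):
    ctx = get_current_tower_context()  # convention 4: always called under a TowerContext
    # Trainable variables are created through tf.get_variable (tf.layers does this
    # internally), so reuse across towers works (convention 3); no scope or variable
    # is named 'tower' (convention 5).
    logits = tf.layers.dense(tf.layers.flatten(image), 10, name='fc')
    cost = tf.losses.sparse_softmax_cross_entropy(labels=label, logits=logits)
    if ctx.is_training:
        return cost
    # In inference mode the logits tensor itself is the useful output.
```

Because `tf.layers` builds its variables with `tf.get_variable`, the same function can be re-entered under `reuse=True` for additional towers without creating duplicates.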

These conventions are easy to follow, and most layer wrappers (e.g.,
tf.layers/slim/tensorlayer) do follow them. Note that certain Keras layers do not
16 changes: 8 additions & 8 deletions examples/A3C-Gym/train-atari.py
@@ -19,7 +19,7 @@
from tensorpack.utils.concurrency import ensure_proc_terminate, start_proc_mask_signal
from tensorpack.utils.serialize import dumps
from tensorpack.tfutils.gradproc import MapGradient, SummaryGradient
from tensorpack.utils.gpu import get_nr_gpu
from tensorpack.utils.gpu import get_num_gpu


import gym
@@ -144,10 +144,10 @@ def __init__(self, pipe_c2s, pipe_s2c, gpus):

def _setup_graph(self):
# create predictors on the available predictor GPUs.
nr_gpu = len(self._gpus)
num_gpu = len(self._gpus)
predictors = [self.trainer.get_predictor(
['state'], ['policy', 'pred_value'],
self._gpus[k % nr_gpu])
self._gpus[k % num_gpu])
for k in range(PREDICTOR_THREAD)]
self.async_predictor = MultiThreadAsyncPredictor(
predictors, batch_size=PREDICT_BATCH_SIZE)
@@ -213,16 +213,16 @@ def train():
logger.set_logger_dir(dirname)

# assign GPUs for training & inference
nr_gpu = get_nr_gpu()
num_gpu = get_num_gpu()
global PREDICTOR_THREAD
if nr_gpu > 0:
if nr_gpu > 1:
if num_gpu > 0:
if num_gpu > 1:
# use half gpus for inference
predict_tower = list(range(nr_gpu))[-nr_gpu // 2:]
predict_tower = list(range(num_gpu))[-num_gpu // 2:]
else:
predict_tower = [0]
PREDICTOR_THREAD = len(predict_tower) * PREDICTOR_THREAD_PER_GPU
train_tower = list(range(nr_gpu))[:-nr_gpu // 2] or [0]
train_tower = list(range(num_gpu))[:-num_gpu // 2] or [0]
logger.info("[Batch-A3C] Train on gpu {} and infer on gpu {}".format(
','.join(map(str, train_tower)), ','.join(map(str, predict_tower))))
else:
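As an illustrative check of the train/inference GPU split in `train()` above (not part of the commit), the slicing works out as follows:

```python
# Reproduces the slicing logic above for a few GPU counts (illustrative only).
for num_gpu in (2, 3, 4):
    predict_tower = list(range(num_gpu))[-num_gpu // 2:]
    train_tower = list(range(num_gpu))[:-num_gpu // 2] or [0]
    print(num_gpu, train_tower, predict_tower)
# 2 [0] [1]
# 3 [0] [1, 2]
# 4 [0, 1] [2, 3]
```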
4 changes: 2 additions & 2 deletions examples/DoReFa-Net/alexnet-dorefa.py
@@ -15,7 +15,7 @@
from tensorpack.tfutils.summary import add_param_summary
from tensorpack.tfutils.varreplace import remap_variables
from tensorpack.dataflow import dataset
from tensorpack.utils.gpu import get_nr_gpu
from tensorpack.utils.gpu import get_num_gpu

from imagenet_utils import get_imagenet_dataflow, fbresnet_augmentor, ImageNetModel
from dorefa import get_dorefa, ternarize
@@ -215,7 +215,7 @@ def run_image(model, sess_init, inputs):
run_image(Model(), DictRestore(dict(np.load(args.load))), args.run)
sys.exit()

nr_tower = max(get_nr_gpu(), 1)
nr_tower = max(get_num_gpu(), 1)
BATCH_SIZE = TOTAL_BATCH_SIZE // nr_tower
logger.set_logger_dir(os.path.join(
'train_log', 'alexnet-dorefa-{}'.format(args.dorefa)))
4 changes: 2 additions & 2 deletions examples/DynamicFilterNetwork/steering-filter.py
@@ -257,8 +257,8 @@ def get_config():
args = parser.parse_args()

with change_gpu(args.gpu):
NR_GPU = len(args.gpu.split(','))
NGPU = len(args.gpu.split(','))
config = get_config()
if args.load:
config.session_init = SaverRestore(args.load)
launch_train_with_config(config, SyncMultiGPUTrainer(NR_GPU))
launch_train_with_config(config, SyncMultiGPUTrainer(NGPU))
4 changes: 2 additions & 2 deletions examples/GAN/BEGAN.py
@@ -5,7 +5,7 @@

from tensorpack import *
from tensorpack.tfutils.summary import add_moving_summary
from tensorpack.utils.gpu import get_nr_gpu
from tensorpack.utils.gpu import get_num_gpu
from tensorpack.tfutils.scope_utils import auto_reuse_variable_scope
import tensorflow as tf

@@ -137,7 +137,7 @@ def optimizer(self):

input = QueueInput(DCGAN.get_data())
model = Model()
nr_tower = max(get_nr_gpu(), 1)
nr_tower = max(get_num_gpu(), 1)
if nr_tower == 1:
trainer = GANTrainer(input, model)
else:
12 changes: 6 additions & 6 deletions examples/GAN/GAN.py
@@ -149,10 +149,10 @@ class MultiGPUGANTrainer(TowerTrainer):
"""
A replacement of GANTrainer (optimize d and g one by one) with multi-gpu support.
"""
def __init__(self, nr_gpu, input, model):
def __init__(self, num_gpu, input, model):
super(MultiGPUGANTrainer, self).__init__()
assert nr_gpu > 1
raw_devices = ['/gpu:{}'.format(k) for k in range(nr_gpu)]
assert num_gpu > 1
raw_devices = ['/gpu:{}'.format(k) for k in range(num_gpu)]

# Setup input
input = StagingInput(input)
@@ -167,13 +167,13 @@ def get_cost(*inputs):
self.tower_func = TowerFuncWrapper(get_cost, model.get_inputs_desc())
devices = [LeastLoadedDeviceSetter(d, raw_devices) for d in raw_devices]
cost_list = DataParallelBuilder.build_on_towers(
list(range(nr_gpu)),
list(range(num_gpu)),
lambda: self.tower_func(*input.get_input_tensors()),
devices)
# Simply average the cost here. It might be faster to average the gradients
with tf.name_scope('optimize'):
d_loss = tf.add_n([x[0] for x in cost_list]) * (1.0 / nr_gpu)
g_loss = tf.add_n([x[1] for x in cost_list]) * (1.0 / nr_gpu)
d_loss = tf.add_n([x[0] for x in cost_list]) * (1.0 / num_gpu)
g_loss = tf.add_n([x[1] for x in cost_list]) * (1.0 / num_gpu)

opt = model.get_optimizer()
# run one d_min after one g_min
4 changes: 2 additions & 2 deletions examples/HED/hed.py
@@ -12,7 +12,7 @@

from tensorpack import *
from tensorpack.dataflow import dataset
from tensorpack.utils.gpu import get_nr_gpu
from tensorpack.utils.gpu import get_num_gpu
from tensorpack.tfutils import optimizer, gradproc
from tensorpack.tfutils.summary import add_moving_summary, add_param_summary

@@ -256,4 +256,4 @@ def run(model_path, image_path, output):
config.session_init = get_model_loader(args.load)
launch_train_with_config(
config,
SyncMultiGPUTrainer(max(get_nr_gpu(), 1)))
SyncMultiGPUTrainer(max(get_num_gpu(), 1)))
12 changes: 6 additions & 6 deletions examples/ImageNetModels/inception-bn.py
@@ -11,14 +11,14 @@
from tensorpack import *
from tensorpack.tfutils.summary import add_moving_summary
from tensorpack.dataflow import dataset
from tensorpack.utils.gpu import get_nr_gpu
from tensorpack.utils.gpu import get_num_gpu

from imagenet_utils import fbresnet_augmentor, get_imagenet_dataflow

# Change them if using different number of GPUs.
TOTAL_BATCH_SIZE = 64 * 6
NR_GPU = 6
BATCH_SIZE = TOTAL_BATCH_SIZE // NR_GPU
NUM_GPU = 6
BATCH_SIZE = TOTAL_BATCH_SIZE // NUM_GPU
INPUT_SHAPE = 224


@@ -169,6 +169,6 @@ def get_config():
config = get_config()
if args.load:
config.session_init = SaverRestore(args.load)
nr_tower = get_nr_gpu()
assert nr_tower == NR_GPU
launch_train_with_config(config, SyncMultiGPUTrainer(NR_GPU))
nr_tower = get_num_gpu()
assert nr_tower == NUM_GPU
launch_train_with_config(config, SyncMultiGPUTrainer(NUM_GPU))
4 changes: 2 additions & 2 deletions examples/ImageNetModels/shufflenet.py
@@ -14,7 +14,7 @@
from tensorpack.dataflow import imgaug
from tensorpack.tfutils import argscope, get_model_loader, model_utils
from tensorpack.tfutils.scope_utils import under_name_scope
from tensorpack.utils.gpu import get_nr_gpu
from tensorpack.utils.gpu import get_num_gpu

from imagenet_utils import (
get_imagenet_dataflow,
@@ -212,7 +212,7 @@ def get_config(model, nr_tower):
else:
logger.set_logger_dir(os.path.join('train_log', 'shufflenet'))

nr_tower = max(get_nr_gpu(), 1)
nr_tower = max(get_num_gpu(), 1)
config = get_config(model, nr_tower)
if args.load:
config.session_init = get_model_loader(args.load)
6 changes: 3 additions & 3 deletions examples/ImageNetModels/vgg16.py
@@ -10,7 +10,7 @@
from tensorpack import *
from tensorpack.tfutils import argscope
from tensorpack.tfutils.summary import *
from tensorpack.utils.gpu import get_nr_gpu
from tensorpack.utils.gpu import get_num_gpu

from imagenet_utils import (
ImageNetModel, get_imagenet_dataflow, fbresnet_augmentor)
@@ -108,7 +108,7 @@ def get_data(name, batch):


def get_config():
nr_tower = max(get_nr_gpu(), 1)
nr_tower = max(get_num_gpu(), 1)
batch = args.batch
total_batch = batch * nr_tower
assert total_batch >= 256 # otherwise the learning rate warmup is wrong.
@@ -159,6 +159,6 @@ def get_config():
logger.set_logger_dir(os.path.join('train_log', 'vgg16-norm={}'.format(args.norm)))

config = get_config()
nr_tower = max(get_nr_gpu(), 1)
nr_tower = max(get_num_gpu(), 1)
trainer = SyncMultiGPUTrainerReplicated(nr_tower)
launch_train_with_config(config, trainer)
2 changes: 1 addition & 1 deletion examples/README.md
@@ -27,8 +27,8 @@ These are all the toy examples in tensorpack. They are supposed to be just demos
| --- | --- |
| Train [ResNet](ResNet), [ShuffleNet and other models](ImageNetModels) on ImageNet | reproduce paper |
| [Train Faster-RCNN / Mask-RCNN on COCO](FasterRCNN) | reproduce paper |
| [DoReFa-Net: training binary / low-bitwidth CNN on ImageNet](DoReFa-Net) | reproduce paper |
| [Generative Adversarial Network(GAN) variants](GAN), including DCGAN, InfoGAN, <br/> Conditional GAN, WGAN, BEGAN, DiscoGAN, Image to Image, CycleGAN | visually reproduce |
| [DoReFa-Net: training binary / low-bitwidth CNN on ImageNet](DoReFa-Net) | reproduce paper |
| [Fully-convolutional Network for Holistically-Nested Edge Detection(HED)](HED) | visually reproduce |
| [Spatial Transformer Networks on MNIST addition](SpatialTransformer) | reproduce paper |
| [Visualize CNN saliency maps](Saliency) | visually reproduce |
6 changes: 3 additions & 3 deletions examples/ResNet/cifar10-resnet.py
@@ -9,7 +9,7 @@

from tensorpack import *
from tensorpack.tfutils.summary import add_moving_summary, add_param_summary
from tensorpack.utils.gpu import get_nr_gpu
from tensorpack.utils.gpu import get_num_gpu
from tensorpack.dataflow import dataset

import tensorflow as tf
@@ -170,5 +170,5 @@ def get_data(train_or_test):
max_epoch=400,
session_init=SaverRestore(args.load) if args.load else None
)
nr_gpu = max(get_nr_gpu(), 1)
launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(nr_gpu))
num_gpu = max(get_num_gpu(), 1)
launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(num_gpu))
6 changes: 3 additions & 3 deletions examples/ResNet/imagenet-resnet.py
@@ -12,7 +12,7 @@
TrainConfig, SyncMultiGPUTrainerReplicated, launch_train_with_config)
from tensorpack.dataflow import FakeData
from tensorpack.tfutils import argscope, get_model_loader
from tensorpack.utils.gpu import get_nr_gpu
from tensorpack.utils.gpu import get_num_gpu

from imagenet_utils import (
fbresnet_augmentor, get_imagenet_dataflow, ImageNetModel,
@@ -57,7 +57,7 @@ def get_data(name, batch):


def get_config(model, fake=False):
nr_tower = max(get_nr_gpu(), 1)
nr_tower = max(get_num_gpu(), 1)
assert args.batch % nr_tower == 0
batch = args.batch // nr_tower

@@ -143,5 +143,5 @@ def get_config(model, fake=False):
config = get_config(model, fake=args.fake)
if args.load:
config.session_init = get_model_loader(args.load)
trainer = SyncMultiGPUTrainerReplicated(max(get_nr_gpu(), 1))
trainer = SyncMultiGPUTrainerReplicated(max(get_num_gpu(), 1))
launch_train_with_config(config, trainer)
8 changes: 4 additions & 4 deletions examples/Saliency/CAM-resnet.py
@@ -16,7 +16,7 @@
from tensorpack.tfutils import optimizer, gradproc
from tensorpack.tfutils.symbolic_functions import *
from tensorpack.tfutils.summary import *
from tensorpack.utils.gpu import get_nr_gpu
from tensorpack.utils.gpu import get_num_gpu
from tensorpack.utils import viz

from imagenet_utils import (
@@ -157,8 +157,8 @@ def viz_cam(model_file, data_dir):
if args.gpu:
os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

nr_gpu = get_nr_gpu()
BATCH_SIZE = TOTAL_BATCH_SIZE // nr_gpu
num_gpu = get_num_gpu()
BATCH_SIZE = TOTAL_BATCH_SIZE // num_gpu

if args.cam:
BATCH_SIZE = 128 # something that can run on one gpu
@@ -169,4 +169,4 @@ def viz_cam(model_file, data_dir):
config = get_config()
if args.load:
config.session_init = get_model_loader(args.load)
launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(nr_gpu))
launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(num_gpu))
3 changes: 2 additions & 1 deletion examples/SuperResolution/enet-pat.py
@@ -13,6 +13,7 @@
from tensorpack.tfutils.scope_utils import auto_reuse_variable_scope
from tensorpack.tfutils.summary import add_moving_summary
from tensorpack.utils import logger
from tensorpack.utils.gpu import get_num_gpu
from data_sampler import (
ImageDecode, ImageDataFromZIPFile,
RejectTooSmallImages, CenterSquareResize)
@@ -286,7 +287,7 @@ def get_data(file_name):
param_dict = {'VGG19/' + name: value for name, value in six.iteritems(param_dict)}
session_init = DictRestore(param_dict)

nr_tower = max(get_nr_gpu(), 1)
nr_tower = max(get_num_gpu(), 1)
data = QueueInput(get_data(args.data))
model = Model()

4 changes: 2 additions & 2 deletions examples/keras/README.md
@@ -1,10 +1,10 @@

## Keras + Tensorpack

Use Keras to define a model a train it with efficient tensorpack trainers.
Use Keras to define a model and train it with efficient tensorpack trainers.

### Why?
Keras alone has various overhead. In particular, it is not efficient when working on large models.
Keras alone has various overhead. In particular, it is not efficient with large models.
The article [Towards Efficient Multi-GPU Training in Keras with TensorFlow](https://medium.com/rossum/towards-efficient-multi-gpu-training-in-keras-with-tensorflow-8a0091074fb2)
has mentioned some of it.

10 changes: 5 additions & 5 deletions examples/keras/imagenet-resnet-keras.py
@@ -11,7 +11,7 @@
from tensorpack import InputDesc, SyncMultiGPUTrainerReplicated
from tensorpack.dataflow import FakeData, MapDataComponent
from tensorpack.utils import logger
from tensorpack.utils.gpu import get_nr_gpu
from tensorpack.utils.gpu import get_num_gpu
from tensorpack.contrib.keras import KerasModel
from tensorpack.callbacks import *
from tensorflow.python.keras.layers import *
@@ -141,12 +141,12 @@ def image_preprocess(image):

tf.keras.backend.set_image_data_format('channels_first')

nr_gpu = get_nr_gpu()
num_gpu = get_num_gpu()
if args.fake:
df_train = FakeData([[64, 224, 224, 3], [64, 1000]], 5000, random=False, dtype='uint8')
df_val = FakeData([[64, 224, 224, 3], [64, 1000]], 5000, random=False)
else:
batch_size = TOTAL_BATCH_SIZE // nr_gpu
batch_size = TOTAL_BATCH_SIZE // num_gpu
assert args.data is not None
df_train = get_imagenet_dataflow(
args.data, 'train', batch_size, fbresnet_augmentor(True))
@@ -164,7 +164,7 @@ def one_hot(label):
inputs_desc=[InputDesc(tf.uint8, [None, 224, 224, 3], 'images')],
targets_desc=[InputDesc(tf.float32, [None, 1000], 'labels')],
input=df_train,
trainer=SyncMultiGPUTrainerReplicated(nr_gpu))
trainer=SyncMultiGPUTrainerReplicated(num_gpu))

lr = tf.get_variable('learning_rate', initializer=0.1, trainable=False)
tf.summary.scalar('lr', lr)
@@ -188,7 +188,7 @@ def one_hot(label):
if not args.fake:
callbacks.append(
DataParallelInferenceRunner(
df_val, ScalarStats(['categorical_accuracy']), nr_gpu))
df_val, ScalarStats(['categorical_accuracy']), num_gpu))

M.fit(
steps_per_epoch=100 if args.fake else 1281167 // TOTAL_BATCH_SIZE,
5 changes: 4 additions & 1 deletion tensorpack/models/batch_norm.py
@@ -96,8 +96,11 @@ def BatchNorm(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5,
When set to "nccl", this layer must be used under tensorpack multi-gpu trainers,
and it then uses per-machine (multiple GPU) statistics to normalize.
Note that this implementation averages the per-tower E[x] and E[x^2] among towers to compute
global mean&variance. The result is the global mean&variance only if each tower has the same batch size.
This option has no effect when not training.
The option is also known as "Cross-GPU BatchNorm" as mentioned in https://arxiv.org/abs/1711.07240.
This option is also known as "Cross-GPU BatchNorm" as mentioned in https://arxiv.org/abs/1711.07240.
Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/18222
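
An illustrative numpy check of the caveat above (not from this diff): averaging per-tower E[x] and E[x^2] recovers the global mean and variance only when every tower sees the same batch size.

```python
# Numpy sketch of the moment averaging described above (assumes equal per-tower batches).
import numpy as np

towers = [np.random.randn(32), np.random.randn(32)]      # two towers, equal batch size
ex = np.mean([t.mean() for t in towers])                  # average of per-tower E[x]
ex2 = np.mean([(t ** 2).mean() for t in towers])          # average of per-tower E[x^2]
mean, var = ex, ex2 - ex ** 2

all_x = np.concatenate(towers)
assert np.allclose(mean, all_x.mean()) and np.allclose(var, all_x.var())
# With unequal per-tower batch sizes (e.g. 32 and 8), these equalities no longer hold.
```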
Variable Names:
