49 changes: 13 additions & 36 deletions official/mnist/mnist.py
@@ -24,6 +24,7 @@
from official.mnist import dataset
from official.utils.flags import core as flags_core
from official.utils.logs import hooks_helper
from official.utils.misc import distribution_utils
from official.utils.misc import model_helpers


@@ -87,7 +88,7 @@ def create_model(data_format):


def define_mnist_flags():
flags_core.define_base(multi_gpu=True, num_gpu=False)
flags_core.define_base()
flags_core.define_image()
flags.adopt_module_key_flags(flags_core)
flags_core.set_defaults(data_dir='/tmp/mnist_data',
@@ -152,53 +153,29 @@ def model_fn(features, labels, mode, params):
})


def validate_batch_size_for_multi_gpu(batch_size):
"""For multi-gpu, batch-size must be a multiple of the number of GPUs.

Note that this should eventually be handled by replicate_model_fn
directly. Multi-GPU support is currently experimental, however,
so doing the work here until that feature is in place.

Args:
batch_size: the number of examples processed in each training batch.

Raises:
ValueError: if no GPUs are found, or selected batch_size is invalid.
"""
from tensorflow.python.client import device_lib # pylint: disable=g-import-not-at-top

local_device_protos = device_lib.list_local_devices()
num_gpus = sum([1 for d in local_device_protos if d.device_type == 'GPU'])
if not num_gpus:
raise ValueError('Multi-GPU mode was specified, but no GPUs '
'were found. To use CPU, run without --multi_gpu.')

remainder = batch_size % num_gpus
if remainder:
err = ('When running with multiple GPUs, batch size '
'must be a multiple of the number of available GPUs. '
'Found {} GPUs with a batch size of {}; try --batch_size={} instead.'
).format(num_gpus, batch_size, batch_size - remainder)
raise ValueError(err)


def run_mnist(flags_obj):
"""Run MNIST training and eval loop.

Args:
flags_obj: An object containing parsed flag values.
"""

model_function = model_fn

if flags_obj.multi_gpu:
validate_batch_size_for_multi_gpu(flags_obj.batch_size)
# Get number of GPUs as defined by the --num_gpus flags and the number of
# GPUs available on the machine.
num_gpus = flags_core.get_num_gpus(flags_obj)
multi_gpu = num_gpus > 1

if multi_gpu:
# Validate that the batch size can be split into devices.
distribution_utils.per_device_batch_size(flags_obj.batch_size, num_gpus)

# There are two steps required if using multi-GPU: (1) wrap the model_fn,
# and (2) wrap the optimizer. The first happens here, and (2) happens
# in the model_fn itself when the optimizer is defined.
model_function = tf.contrib.estimator.replicate_model_fn(
model_fn, loss_reduction=tf.losses.Reduction.MEAN)
model_fn, loss_reduction=tf.losses.Reduction.MEAN,
devices=["/device:GPU:%d" % d for d in range(num_gpus)])

data_format = flags_obj.data_format
if data_format is None:
@@ -209,7 +186,7 @@ def run_mnist(flags_obj):
model_dir=flags_obj.model_dir,
params={
'data_format': data_format,
'multi_gpu': flags_obj.multi_gpu
'multi_gpu': multi_gpu
})

# Set up training and evaluation input functions.
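Note for reviewers on the multi-GPU path: the comment in run_mnist refers to a two-step wrapping, but only step (1) appears in this hunk. A minimal sketch of the whole pattern, assuming the TF 1.x contrib APIs this script uses; the model_fn body and the build_model helper below are illustrative, not this file's actual code:

import tensorflow as tf

def model_fn(features, labels, mode, params):
  logits = build_model(features, params['data_format'])  # hypothetical helper
  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
  optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)
  if params.get('multi_gpu'):
    # Step (2): wrap the optimizer so per-tower gradients are aggregated.
    optimizer = tf.contrib.estimator.TowerOptimizer(optimizer)
  train_op = optimizer.minimize(loss, tf.train.get_or_create_global_step())
  return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

# Step (1): wrap the model_fn so each GPU runs its own tower, as in the hunk above.
model_function = tf.contrib.estimator.replicate_model_fn(
    model_fn, loss_reduction=tf.losses.Reduction.MEAN,
    devices=['/device:GPU:%d' % d for d in range(2)])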
41 changes: 7 additions & 34 deletions official/recommendation/ncf_main.py
@@ -38,6 +38,7 @@
from official.utils.flags import core as flags_core
from official.utils.logs import hooks_helper
from official.utils.logs import logger
from official.utils.misc import distribution_utils
from official.utils.misc import model_helpers

_TOP_K = 10 # Top-k list for evaluation
@@ -85,7 +86,8 @@ def evaluate_model(estimator, batch_size, num_gpus, ncf_dataset):
# Define prediction input function
def pred_input_fn():
return dataset.input_fn(
False, per_device_batch_size(batch_size, num_gpus), ncf_dataset)
False, distribution_utils.per_device_batch_size(batch_size, num_gpus),
ncf_dataset)

# Get predictions
predictions = estimator.predict(input_fn=pred_input_fn)
@@ -166,37 +168,6 @@ def convert_keras_to_estimator(keras_model, num_gpus, model_dir):
return estimator


def per_device_batch_size(batch_size, num_gpus):
"""For multi-gpu, batch-size must be a multiple of the number of GPUs.

Note that this should eventually be handled by DistributionStrategies
directly. Multi-GPU support is currently experimental, however,
so doing the work here until that feature is in place.

Args:
batch_size: Global batch size to be divided among devices. This should be
equal to num_gpus times the single-GPU batch_size for multi-gpu training.
num_gpus: How many GPUs are used with DistributionStrategies.

Returns:
Batch size per device.

Raises:
ValueError: if batch_size is not divisible by number of devices
"""
if num_gpus <= 1:
return batch_size

remainder = batch_size % num_gpus
if remainder:
err = ("When running with multiple GPUs, batch size "
"must be a multiple of the number of available GPUs. Found {} "
"GPUs with a batch size of {}; try --batch_size={} instead."
).format(num_gpus, batch_size, batch_size - remainder)
raise ValueError(err)
return int(batch_size / num_gpus)


def main(_):
with logger.benchmark_context(FLAGS):
run_ncf(FLAGS)
@@ -253,7 +224,8 @@ def run_ncf(_):
# Training and evaluation cycle
def train_input_fn():
return dataset.input_fn(
True, per_device_batch_size(FLAGS.batch_size, num_gpus),
True,
distribution_utils.per_device_batch_size(FLAGS.batch_size, num_gpus),
ncf_dataset, FLAGS.epochs_between_evals)

total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals
@@ -296,7 +268,8 @@ def define_ncf_flags():
intra_op=False,
synthetic_data=False,
max_train_steps=False,
dtype=False
dtype=False,
all_reduce_alg=False
)
flags_core.define_benchmark()

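The scripts in this PR now share distribution_utils.per_device_batch_size instead of carrying private copies. Judging from the deleted code above, the shared helper is presumably along these lines (a sketch, not the module's verbatim contents):

def per_device_batch_size(batch_size, num_gpus):
  # Divide the global batch size evenly across devices; reject sizes that
  # would leave a remainder, exactly as the removed per-file helpers did.
  if num_gpus <= 1:
    return batch_size
  remainder = batch_size % num_gpus
  if remainder:
    raise ValueError(
        'When running with multiple GPUs, batch size must be a multiple of '
        'the number of available GPUs. Found {} GPUs with a batch size of '
        '{}; try --batch_size={} instead.'.format(
            num_gpus, batch_size, batch_size - remainder))
  return int(batch_size / num_gpus)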
60 changes: 13 additions & 47 deletions official/resnet/resnet_run_loop.py
@@ -34,6 +34,7 @@
from official.utils.export import export
from official.utils.logs import hooks_helper
from official.utils.logs import logger
from official.utils.misc import distribution_utils
from official.utils.misc import model_helpers
# pylint: enable=g-bad-import-order

@@ -124,9 +125,11 @@ def get_synth_input_fn(height, width, num_channels, num_classes):
that can be used for iteration.
"""
def input_fn(is_training, data_dir, batch_size, *args, **kwargs): # pylint: disable=unused-argument
images = tf.zeros((batch_size, height, width, num_channels), tf.float32)
labels = tf.zeros((batch_size), tf.int32)
return tf.data.Dataset.from_tensors((images, labels)).repeat()
return model_helpers.generate_synthetic_data(
input_shape=tf.TensorShape([batch_size, height, width, num_channels]),
input_dtype=tf.float32,
label_shape=tf.TensorShape([batch_size]),
label_dtype=tf.int32)

return input_fn
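
model_helpers.generate_synthetic_data is not shown in this diff; judging from the inline code it replaces, it presumably builds a constant, endlessly repeating (input, label) dataset roughly like this sketch (the real helper's signature may accept more options):

import tensorflow as tf

def generate_synthetic_data(input_shape, input_dtype, label_shape, label_dtype):
  # One all-zeros example repeated forever -- the same tf.data pipeline the
  # old inline tf.zeros / from_tensors / repeat code constructed.
  inputs = tf.zeros(input_shape, dtype=input_dtype)
  labels = tf.zeros(label_shape, dtype=label_dtype)
  return tf.data.Dataset.from_tensors((inputs, labels)).repeat()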

@@ -316,37 +319,6 @@ def exclude_batch_norm(name):
eval_metric_ops=metrics)


def per_device_batch_size(batch_size, num_gpus):
"""For multi-gpu, batch-size must be a multiple of the number of GPUs.

Note that this should eventually be handled by DistributionStrategies
directly. Multi-GPU support is currently experimental, however,
so doing the work here until that feature is in place.

Args:
batch_size: Global batch size to be divided among devices. This should be
equal to num_gpus times the single-GPU batch_size for multi-gpu training.
num_gpus: How many GPUs are used with DistributionStrategies.

Returns:
Batch size per device.

Raises:
ValueError: if batch_size is not divisible by number of devices
"""
if num_gpus <= 1:
return batch_size

remainder = batch_size % num_gpus
if remainder:
err = ('When running with multiple GPUs, batch size '
'must be a multiple of the number of available GPUs. Found {} '
'GPUs with a batch size of {}; try --batch_size={} instead.'
).format(num_gpus, batch_size, batch_size - remainder)
raise ValueError(err)
return int(batch_size / num_gpus)


def resnet_main(
flags_obj, model_function, input_function, dataset_name, shape=None):
"""Shared main loop for ResNet Models.
@@ -377,17 +349,11 @@ def resnet_main(
intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads,
allow_soft_placement=True)

if flags_core.get_num_gpus(flags_obj) == 0:
distribution = tf.contrib.distribute.OneDeviceStrategy('device:CPU:0')
elif flags_core.get_num_gpus(flags_obj) == 1:
distribution = tf.contrib.distribute.OneDeviceStrategy('device:GPU:0')
else:
distribution = tf.contrib.distribute.MirroredStrategy(
num_gpus=flags_core.get_num_gpus(flags_obj)
)
distribution_strategy = distribution_utils.get_distribution_strategy(
flags_core.get_num_gpus(flags_obj), flags_obj.all_reduce_alg)

run_config = tf.estimator.RunConfig(train_distribute=distribution,
session_config=session_config)
run_config = tf.estimator.RunConfig(
train_distribute=distribution_strategy, session_config=session_config)

classifier = tf.estimator.Estimator(
model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config,
@@ -409,7 +375,7 @@ def resnet_main(
'train_epochs': flags_obj.train_epochs,
}
if flags_obj.use_synthetic_data:
dataset_name = dataset_name + "-synthetic"
dataset_name = dataset_name + '-synthetic'

benchmark_logger = logger.get_benchmark_logger()
benchmark_logger.log_run_info('resnet', dataset_name, run_params,
@@ -422,15 +388,15 @@ def resnet_main(
def input_fn_train():
return input_function(
is_training=True, data_dir=flags_obj.data_dir,
batch_size=per_device_batch_size(
batch_size=distribution_utils.per_device_batch_size(
flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
num_epochs=flags_obj.epochs_between_evals,
num_gpus=flags_core.get_num_gpus(flags_obj))

def input_fn_eval():
return input_function(
is_training=False, data_dir=flags_obj.data_dir,
batch_size=per_device_batch_size(
batch_size=distribution_utils.per_device_batch_size(
flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
num_epochs=1)

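Likewise, distribution_utils.get_distribution_strategy presumably centralizes the strategy selection deleted from resnet_main above. A sketch consistent with the removed branching; how the new all_reduce_alg flag is wired into MirroredStrategy is not shown in the diff and is only noted in a comment:

import tensorflow as tf

def get_distribution_strategy(num_gpus, all_reduce_alg=None):
  # CPU-only and single-GPU runs keep the cheap OneDeviceStrategy; multi-GPU
  # runs get MirroredStrategy, matching the if/elif/else this PR removed.
  # all_reduce_alg would choose the cross-device all-reduce implementation;
  # that wiring is an assumption and omitted from this sketch.
  if num_gpus == 0:
    return tf.contrib.distribute.OneDeviceStrategy('device:CPU:0')
  if num_gpus == 1:
    return tf.contrib.distribute.OneDeviceStrategy('device:GPU:0')
  return tf.contrib.distribute.MirroredStrategy(num_gpus=num_gpus)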
32 changes: 25 additions & 7 deletions official/transformer/model/model_params.py
@@ -14,8 +14,12 @@
# ==============================================================================
"""Defines Transformer model parameters."""

from collections import defaultdict


BASE_PARAMS = defaultdict(
lambda: None, # Set default value to None.

BASE_PARAMS = dict(
# Input params
default_batch_size=2048, # Maximum number of tokens per batch of examples.
default_batch_size_tpu=32768,
@@ -56,8 +60,8 @@
allow_ffn_pad=True,
)

BIG_PARAMS = dict(BASE_PARAMS)
BIG_PARAMS.update(dict(
BIG_PARAMS = BASE_PARAMS.copy()
BIG_PARAMS.update(
default_batch_size=4096,

# default batch size is smaller than for BASE_PARAMS due to memory limits.
@@ -66,13 +70,27 @@
hidden_size=1024,
filter_size=4096,
num_heads=16,
))
)

# Parameters for running the model in multi gpu. These should not change the
# params that modify the model shape (such as the hidden_size or num_heads).
BASE_MULTI_GPU_PARAMS = BASE_PARAMS.copy()
BASE_MULTI_GPU_PARAMS.update(
learning_rate_warmup_steps=8000
)

BIG_MULTI_GPU_PARAMS = BIG_PARAMS.copy()
BIG_MULTI_GPU_PARAMS.update(
layer_postprocess_dropout=0.3,
learning_rate_warmup_steps=8000
)

TINY_PARAMS = dict(BASE_PARAMS)
TINY_PARAMS.update(dict(
# Parameters for testing the model
TINY_PARAMS = BASE_PARAMS.copy()
TINY_PARAMS.update(
default_batch_size=1024,
default_batch_size_tpu=1024,
hidden_size=32,
num_heads=4,
filter_size=256,
))
)
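
The switch from dict to defaultdict(lambda: None, ...) means look-ups of hyperparameters that a given configuration never sets return None instead of raising KeyError, and copy() on a defaultdict keeps the default factory, so the BIG_, TINY_, and MULTI_GPU variants inherit the same behavior. A small illustration of the semantics the model code can now rely on (values taken from the params above; the unset key is made up):

from collections import defaultdict

BASE_PARAMS = defaultdict(
    lambda: None,      # Missing keys resolve to None instead of KeyError.
    hidden_size=512,
    num_heads=8,
)

BIG_PARAMS = BASE_PARAMS.copy()   # copy() preserves the default factory.
BIG_PARAMS.update(hidden_size=1024, num_heads=16)

print(BASE_PARAMS['hidden_size'])   # 512
print(BIG_PARAMS['num_heads'])      # 16
print(BIG_PARAMS['use_tpu'])        # None -- unset params no longer raise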