49 changes: 13 additions & 36 deletions official/mnist/mnist.py
@@ -24,6 +24,7 @@
from official.mnist import dataset
from official.utils.flags import core as flags_core
from official.utils.logs import hooks_helper
from official.utils.misc import distribution_utils
from official.utils.misc import model_helpers


@@ -87,7 +88,7 @@ def create_model(data_format):


def define_mnist_flags():
flags_core.define_base(multi_gpu=True, num_gpu=False)
flags_core.define_base()
flags_core.define_image()
flags.adopt_module_key_flags(flags_core)
flags_core.set_defaults(data_dir='/tmp/mnist_data',
@@ -152,53 +153,29 @@ def model_fn(features, labels, mode, params):
})


def validate_batch_size_for_multi_gpu(batch_size):
"""For multi-gpu, batch-size must be a multiple of the number of GPUs.

Note that this should eventually be handled by replicate_model_fn
directly. Multi-GPU support is currently experimental, however,
so doing the work here until that feature is in place.

Args:
batch_size: the number of examples processed in each training batch.

Raises:
ValueError: if no GPUs are found, or selected batch_size is invalid.
"""
from tensorflow.python.client import device_lib # pylint: disable=g-import-not-at-top

local_device_protos = device_lib.list_local_devices()
num_gpus = sum([1 for d in local_device_protos if d.device_type == 'GPU'])
if not num_gpus:
raise ValueError('Multi-GPU mode was specified, but no GPUs '
'were found. To use CPU, run without --multi_gpu.')

remainder = batch_size % num_gpus
if remainder:
err = ('When running with multiple GPUs, batch size '
'must be a multiple of the number of available GPUs. '
'Found {} GPUs with a batch size of {}; try --batch_size={} instead.'
).format(num_gpus, batch_size, batch_size - remainder)
raise ValueError(err)


def run_mnist(flags_obj):
"""Run MNIST training and eval loop.

Args:
flags_obj: An object containing parsed flag values.
"""

model_function = model_fn

if flags_obj.multi_gpu:
validate_batch_size_for_multi_gpu(flags_obj.batch_size)
# Get number of GPUs as defined by the --num_gpus flags and the number of
# GPUs available on the machine.
num_gpus = flags_core.get_num_gpus(flags_obj)
multi_gpu = num_gpus > 1

if multi_gpu:
# Validate that the batch size can be split into devices.
distribution_utils.per_device_batch_size(flags_obj.batch_size, num_gpus)

# There are two steps required if using multi-GPU: (1) wrap the model_fn,
# and (2) wrap the optimizer. The first happens here, and (2) happens
# in the model_fn itself when the optimizer is defined.
model_function = tf.contrib.estimator.replicate_model_fn(
model_fn, loss_reduction=tf.losses.Reduction.MEAN)
model_fn, loss_reduction=tf.losses.Reduction.MEAN,
devices=["/device:GPU:%d" % d for d in range(num_gpus)])

data_format = flags_obj.data_format
if data_format is None:
@@ -209,7 +186,7 @@ def run_mnist(flags_obj):
model_dir=flags_obj.model_dir,
params={
'data_format': data_format,
'multi_gpu': flags_obj.multi_gpu
'multi_gpu': multi_gpu
})

# Set up training and evaluation input functions.
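Note for reviewers on the multi-GPU path: the comment in run_mnist refers to a two-step wrapping, but only step (1) appears in this hunk. A minimal sketch of the whole pattern, assuming the TF 1.x contrib APIs this script uses; the model_fn body and the build_model helper below are illustrative, not this file's actual code:

import tensorflow as tf

def model_fn(features, labels, mode, params):
  logits = build_model(features, params['data_format'])  # hypothetical helper
  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
  optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)
  if params.get('multi_gpu'):
    # Step (2): wrap the optimizer so per-tower gradients are aggregated.
    optimizer = tf.contrib.estimator.TowerOptimizer(optimizer)
  train_op = optimizer.minimize(loss, tf.train.get_or_create_global_step())
  return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

# Step (1): wrap the model_fn so each GPU runs its own tower, as in the hunk above.
model_function = tf.contrib.estimator.replicate_model_fn(
    model_fn, loss_reduction=tf.losses.Reduction.MEAN,
    devices=['/device:GPU:%d' % d for d in range(2)])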
41 changes: 7 additions & 34 deletions official/recommendation/ncf_main.py
@@ -38,6 +38,7 @@
from official.utils.flags import core as flags_core
from official.utils.logs import hooks_helper
from official.utils.logs import logger
from official.utils.misc import distribution_utils
from official.utils.misc import model_helpers

_TOP_K = 10 # Top-k list for evaluation
@@ -85,7 +86,8 @@ def evaluate_model(estimator, batch_size, num_gpus, ncf_dataset):
# Define prediction input function
def pred_input_fn():
return dataset.input_fn(
False, per_device_batch_size(batch_size, num_gpus), ncf_dataset)
False, distribution_utils.per_device_batch_size(batch_size, num_gpus),
ncf_dataset)

# Get predictions
predictions = estimator.predict(input_fn=pred_input_fn)
@@ -166,37 +168,6 @@ def convert_keras_to_estimator(keras_model, num_gpus, model_dir):
return estimator


def per_device_batch_size(batch_size, num_gpus):
"""For multi-gpu, batch-size must be a multiple of the number of GPUs.

Note that this should eventually be handled by DistributionStrategies
directly. Multi-GPU support is currently experimental, however,
so doing the work here until that feature is in place.

Args:
batch_size: Global batch size to be divided among devices. This should be
equal to num_gpus times the single-GPU batch_size for multi-gpu training.
num_gpus: How many GPUs are used with DistributionStrategies.

Returns:
Batch size per device.

Raises:
ValueError: if batch_size is not divisible by number of devices
"""
if num_gpus <= 1:
return batch_size

remainder = batch_size % num_gpus
if remainder:
err = ("When running with multiple GPUs, batch size "
"must be a multiple of the number of available GPUs. Found {} "
"GPUs with a batch size of {}; try --batch_size={} instead."
).format(num_gpus, batch_size, batch_size - remainder)
raise ValueError(err)
return int(batch_size / num_gpus)


def main(_):
with logger.benchmark_context(FLAGS):
run_ncf(FLAGS)
@@ -253,7 +224,8 @@ def run_ncf(_):
# Training and evaluation cycle
def train_input_fn():
return dataset.input_fn(
True, per_device_batch_size(FLAGS.batch_size, num_gpus),
True,
distribution_utils.per_device_batch_size(FLAGS.batch_size, num_gpus),
ncf_dataset, FLAGS.epochs_between_evals)

total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals
@@ -296,7 +268,8 @@ def define_ncf_flags():
intra_op=False,
synthetic_data=False,
max_train_steps=False,
dtype=False
dtype=False,
all_reduce_alg=False
)
flags_core.define_benchmark()

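The scripts in this PR now share distribution_utils.per_device_batch_size instead of carrying private copies. Judging from the deleted code above, the shared helper is presumably along these lines (a sketch, not the module's verbatim contents):

def per_device_batch_size(batch_size, num_gpus):
  # Divide the global batch size evenly across devices; reject sizes that
  # would leave a remainder, exactly as the removed per-file helpers did.
  if num_gpus <= 1:
    return batch_size
  remainder = batch_size % num_gpus
  if remainder:
    raise ValueError(
        'When running with multiple GPUs, batch size must be a multiple of '
        'the number of available GPUs. Found {} GPUs with a batch size of '
        '{}; try --batch_size={} instead.'.format(
            num_gpus, batch_size, batch_size - remainder))
  return int(batch_size / num_gpus)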
60 changes: 13 additions & 47 deletions official/resnet/resnet_run_loop.py
@@ -34,6 +34,7 @@
from official.utils.export import export
from official.utils.logs import hooks_helper
from official.utils.logs import logger
from official.utils.misc import distribution_utils
from official.utils.misc import model_helpers
# pylint: enable=g-bad-import-order

@@ -124,9 +125,11 @@ def get_synth_input_fn(height, width, num_channels, num_classes):
that can be used for iteration.
"""
def input_fn(is_training, data_dir, batch_size, *args, **kwargs): # pylint: disable=unused-argument
images = tf.zeros((batch_size, height, width, num_channels), tf.float32)
labels = tf.zeros((batch_size), tf.int32)
return tf.data.Dataset.from_tensors((images, labels)).repeat()
return model_helpers.generate_synthetic_data(
input_shape=tf.TensorShape([batch_size, height, width, num_channels]),
input_dtype=tf.float32,
label_shape=tf.TensorShape([batch_size]),
label_dtype=tf.int32)

return input_fn
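
model_helpers.generate_synthetic_data is not shown in this diff; judging from the inline code it replaces, it presumably builds a constant, endlessly repeating (input, label) dataset roughly like this sketch (the real helper's signature may accept more options):

import tensorflow as tf

def generate_synthetic_data(input_shape, input_dtype, label_shape, label_dtype):
  # One all-zeros example repeated forever -- the same tf.data pipeline the
  # old inline tf.zeros / from_tensors / repeat code constructed.
  inputs = tf.zeros(input_shape, dtype=input_dtype)
  labels = tf.zeros(label_shape, dtype=label_dtype)
  return tf.data.Dataset.from_tensors((inputs, labels)).repeat()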

@@ -316,37 +319,6 @@ def exclude_batch_norm(name):
eval_metric_ops=metrics)


def per_device_batch_size(batch_size, num_gpus):
"""For multi-gpu, batch-size must be a multiple of the number of GPUs.

Note that this should eventually be handled by DistributionStrategies
directly. Multi-GPU support is currently experimental, however,
so doing the work here until that feature is in place.

Args:
batch_size: Global batch size to be divided among devices. This should be
equal to num_gpus times the single-GPU batch_size for multi-gpu training.
num_gpus: How many GPUs are used with DistributionStrategies.

Returns:
Batch size per device.

Raises:
ValueError: if batch_size is not divisible by number of devices
"""
if num_gpus <= 1:
return batch_size

remainder = batch_size % num_gpus
if remainder:
err = ('When running with multiple GPUs, batch size '
'must be a multiple of the number of available GPUs. Found {} '
'GPUs with a batch size of {}; try --batch_size={} instead.'
).format(num_gpus, batch_size, batch_size - remainder)
raise ValueError(err)
return int(batch_size / num_gpus)


def resnet_main(
flags_obj, model_function, input_function, dataset_name, shape=None):
"""Shared main loop for ResNet Models.
@@ -377,17 +349,11 @@ def resnet_main(
intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads,
allow_soft_placement=True)

if flags_core.get_num_gpus(flags_obj) == 0:
distribution = tf.contrib.distribute.OneDeviceStrategy('device:CPU:0')
elif flags_core.get_num_gpus(flags_obj) == 1:
distribution = tf.contrib.distribute.OneDeviceStrategy('device:GPU:0')
else:
distribution = tf.contrib.distribute.MirroredStrategy(
num_gpus=flags_core.get_num_gpus(flags_obj)
)
distribution_strategy = distribution_utils.get_distribution_strategy(
flags_core.get_num_gpus(flags_obj), flags_obj.all_reduce_alg)

run_config = tf.estimator.RunConfig(train_distribute=distribution,
session_config=session_config)
run_config = tf.estimator.RunConfig(
train_distribute=distribution_strategy, session_config=session_config)

classifier = tf.estimator.Estimator(
model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config,
@@ -409,7 +375,7 @@ def resnet_main(
'train_epochs': flags_obj.train_epochs,
}
if flags_obj.use_synthetic_data:
dataset_name = dataset_name + "-synthetic"
dataset_name = dataset_name + '-synthetic'

benchmark_logger = logger.get_benchmark_logger()
benchmark_logger.log_run_info('resnet', dataset_name, run_params,
@@ -422,15 +388,15 @@ def resnet_main(
def input_fn_train():
return input_function(
is_training=True, data_dir=flags_obj.data_dir,
batch_size=per_device_batch_size(
batch_size=distribution_utils.per_device_batch_size(
flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
num_epochs=flags_obj.epochs_between_evals,
num_gpus=flags_core.get_num_gpus(flags_obj))

def input_fn_eval():
return input_function(
is_training=False, data_dir=flags_obj.data_dir,
batch_size=per_device_batch_size(
batch_size=distribution_utils.per_device_batch_size(
flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
num_epochs=1)

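Likewise, distribution_utils.get_distribution_strategy presumably centralizes the strategy selection deleted from resnet_main above. A sketch consistent with the removed branching; how the new all_reduce_alg flag is wired into MirroredStrategy is not shown in the diff and is only noted in a comment:

import tensorflow as tf

def get_distribution_strategy(num_gpus, all_reduce_alg=None):
  # CPU-only and single-GPU runs keep the cheap OneDeviceStrategy; multi-GPU
  # runs get MirroredStrategy, matching the if/elif/else this PR removed.
  # all_reduce_alg would choose the cross-device all-reduce implementation;
  # that wiring is an assumption and omitted from this sketch.
  if num_gpus == 0:
    return tf.contrib.distribute.OneDeviceStrategy('device:CPU:0')
  if num_gpus == 1:
    return tf.contrib.distribute.OneDeviceStrategy('device:GPU:0')
  return tf.contrib.distribute.MirroredStrategy(num_gpus=num_gpus)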
32 changes: 25 additions & 7 deletions official/transformer/model/model_params.py
@@ -14,8 +14,12 @@
# ==============================================================================
"""Defines Transformer model parameters."""

from collections import defaultdict


BASE_PARAMS = defaultdict(
lambda: None, # Set default value to None.

BASE_PARAMS = dict(
# Input params
default_batch_size=2048, # Maximum number of tokens per batch of examples.
default_batch_size_tpu=32768,
@@ -56,8 +60,8 @@
allow_ffn_pad=True,
)

BIG_PARAMS = dict(BASE_PARAMS)
BIG_PARAMS.update(dict(
BIG_PARAMS = BASE_PARAMS.copy()
BIG_PARAMS.update(
default_batch_size=4096,

# default batch size is smaller than for BASE_PARAMS due to memory limits.
@@ -66,13 +70,27 @@
hidden_size=1024,
filter_size=4096,
num_heads=16,
))
)

# Parameters for running the model in multi gpu. These should not change the
# params that modify the model shape (such as the hidden_size or num_heads).
BASE_MULTI_GPU_PARAMS = BASE_PARAMS.copy()
BASE_MULTI_GPU_PARAMS.update(
learning_rate_warmup_steps=8000
)

BIG_MULTI_GPU_PARAMS = BIG_PARAMS.copy()
BIG_MULTI_GPU_PARAMS.update(
layer_postprocess_dropout=0.3,
learning_rate_warmup_steps=8000
)

TINY_PARAMS = dict(BASE_PARAMS)
TINY_PARAMS.update(dict(
# Parameters for testing the model
TINY_PARAMS = BASE_PARAMS.copy()
TINY_PARAMS.update(
default_batch_size=1024,
default_batch_size_tpu=1024,
hidden_size=32,
num_heads=4,
filter_size=256,
))
)
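
The switch from dict to defaultdict(lambda: None, ...) means look-ups of hyperparameters that a given configuration never sets return None instead of raising KeyError, and copy() on a defaultdict keeps the default factory, so the BIG_, TINY_, and MULTI_GPU variants inherit the same behavior. A small illustration of the semantics the model code can now rely on (values taken from the params above; the unset key is made up):

from collections import defaultdict

BASE_PARAMS = defaultdict(
    lambda: None,      # Missing keys resolve to None instead of KeyError.
    hidden_size=512,
    num_heads=8,
)

BIG_PARAMS = BASE_PARAMS.copy()   # copy() preserves the default factory.
BIG_PARAMS.update(hidden_size=1024, num_heads=16)

print(BASE_PARAMS['hidden_size'])   # 512
print(BIG_PARAMS['num_heads'])      # 16
print(BIG_PARAMS['use_tpu'])        # None -- unset params no longer raise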