In [1]:
from azureml.core import Workspace, Experiment, Run

# Resolve the workspace first, then bind the named experiment to it.
# `ws` and `exp` are reused by the cells that follow.
experiment_name = 'cifar10_cnn_horovod'
ws = Workspace.from_config()
exp = Experiment(
    workspace=ws,
    name=experiment_name,
)

In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Cluster configuration
cluster_name = "aml-gpu"
min_nodes = 0
max_nodes = 2
vm_size = "Standard_NC6"

# Reuse the cluster if one with this name is already attached to the
# workspace; otherwise provision a new one with the settings above.
try:
    aml_cluster = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    # The lookup raised, so no compute target with this name was found.
    print('Cluster `%s` not found, creating one now.' % cluster_name)
    config = AmlCompute.provisioning_configuration(vm_size=vm_size, min_nodes=min_nodes, max_nodes=max_nodes)
    aml_cluster = ComputeTarget.create(workspace=ws, name=cluster_name, provisioning_configuration=config)

# Wait until the cluster is ready (runs for both the reused and the
# newly created cluster).
aml_cluster.wait_for_completion(show_output=True)

Cluster not `aml-gpu` not found, creating one now.
Creating
Succeeded
AmlCompute wait for completion finished
Minimum number of nodes requested have been provisioned


In [3]:
import os

from azureml.widgets import RunDetails
from azureml.train.dnn import TensorFlow, Mpi

# Entry script and the folder that contains it. The folder is resolved
# relative to the notebook's current working directory.
script = 'cifar10_cnn_horovod.py'
script_folder = os.path.join(os.getcwd(), 'code')

# TensorFlow estimator for a distributed MPI run on the cluster:
# two nodes with one process per node, GPU enabled, and `keras`
# requested as an extra pip package.
estimator = TensorFlow(source_directory=script_folder,
                       compute_target=aml_cluster,
                       entry_script=script,
                       script_params={'--epochs': 30},
                       node_count=2,
                       distributed_training=Mpi(process_count_per_node=1),
                       pip_packages=['keras'],
                       framework_version='1.13',
                       use_gpu=True)

# Submit the run to the experiment and show the run-details widget.
run = exp.submit(estimator)
RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…