# Distributed Keras training with Horovod
In this tutorial, you will train a CNN model in Keras using distributed training via [Horovod](https://github.com/uber/horovod).

In [1]:
# Report the installed Azure ML core SDK version so it can be compared with
# the version shown in this cell's saved output.
import azureml.core

print("SDK version: {}".format(azureml.core.VERSION))

SDK version: 1.0.2


## Initialize workspace

In [3]:
import os
import socket

import azureml.core
from azureml.core import Experiment
from azureml.core import Workspace
from azureml.core.authentication import ServicePrincipalAuthentication
from azureml.core.runconfig import RunConfiguration
from azureml.core import ScriptRunConfig


# Authenticate as a service principal. The credentials are read from the
# environment (AZURE_TENANT_ID / AZURE_CLIENT_ID / AZURE_CLIENT_SECRET) so the
# secret never has to be saved in the notebook. The upper-case placeholders
# are only the fallback used when a variable is not set.
spAuth = ServicePrincipalAuthentication(
    tenant_id=os.environ.get("AZURE_TENANT_ID", "TENANT_ID"),
    username=os.environ.get("AZURE_CLIENT_ID", "SERVICE_PRINCIPAL_ID"),
    password=os.environ.get("AZURE_CLIENT_SECRET", "SERVICE_PRINCIPAL_PASSWORD"))

# Coordinates of the Azure ML workspace to attach to (replace the placeholders).
subscription_id = "SUBSCRIPTION_ID"
resource_group = "RESOURCE_GROUP_NAME"
workspace_name = "WORKSPACE_NAME"

ws = Workspace(auth=spAuth,
               subscription_id=subscription_id,
               resource_group=resource_group,
               workspace_name=workspace_name)

# Echo the resolved workspace, one field per line, as a quick sanity check.
print('Workspace name: ' + ws.name,
      'Azure region: ' + ws.location,
      'Subscription id: ' + ws.subscription_id,
      'Resource group: ' + ws.resource_group, sep = '\n')

Workspace name: WORKSPACE_NAME
Azure region: eastus2
Subscription id: SUBSCRIPTION_ID
Resource group: RESOURCE_GROUP_NAME


## Create a remote compute target

In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name under which the cluster is looked up (and, if missing, created).
cluster_name = "gpucluster"

try:
    # Reuse the cluster when the workspace already has one with this name.
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    print('Creating a new compute target...')
    # NOTE(review): min_nodes == max_nodes == 2 fixes the cluster size at two
    # nodes, presumably to match the two-node training job. Confirm that
    # keeping both nodes allocated is intended.
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_NC6',
        min_nodes=2,
        max_nodes=2,
    )

    # Create the cluster, then block until provisioning finishes.
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
    compute_target.wait_for_completion(show_output=True)
else:
    print('Found existing compute target')

# Use the 'status' property to get a detailed status for the current cluster.
print(compute_target.status.serialize())

Creating a new compute target...
Creating
Succeeded..

## Train model on the remote compute

In [5]:
import os

# Local directory that is later handed to the estimator as its source directory.
project_folder = './keras-distr-hvd'
# exist_ok=True keeps this cell re-runnable once the directory exists.
os.makedirs(name=project_folder, exist_ok=True)

Copy the training script `keras-horovod.py` into this project directory.

In [6]:
import shutil

# Stage the entry script in the project folder; the cell output is the
# destination path returned by shutil.copy.
shutil.copy(src='keras-horovod.py', dst=project_folder)

'./keras-distr-hvd\\keras-horovod.py'

### Create an experiment
Create an [Experiment](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#experiment) to track all the runs in your workspace for this distributed Keras tutorial.

In [7]:
from azureml.core import Experiment

# NOTE(review): 'mnis' looks like a typo for 'mnist'. It is kept as-is because
# the run recorded in this notebook is tracked under this exact name.
experiment_name = 'keras-mnis-distr-hvd'
experiment = Experiment(workspace=ws, name=experiment_name)

### Create an estimator
The AML SDK's estimator enables you to easily submit training jobs for both single-node and distributed runs.

In [8]:
from azureml.train.estimator import Estimator

# No command-line arguments are passed to the entry script. Add
# '--flag': value pairs here if keras-horovod.py needs any.
script_params = {}

# Two nodes with one process each, using the MPI distributed backend on GPU.
estimator = Estimator(
    source_directory=project_folder,
    compute_target=compute_target,
    script_params=script_params,
    entry_script='keras-horovod.py',
    node_count=2,
    process_count_per_node=1,
    distributed_backend='mpi',
    use_gpu=True,
)

# Pinned pip packages, added to the estimator's conda dependencies in order.
for pip_requirement in ('tensorflow-gpu==1.10.0', 'keras==2.2.4', 'horovod==0.13.11'):
    estimator.conda_dependencies.add_pip_package(pip_requirement)

### Submit job
Run your experiment by submitting your estimator object. Note that this call is asynchronous.

In [9]:
# Submit the estimator to the experiment. The returned handle is kept in `run`
# because the monitoring cell passes it to the RunDetails widget.
run = experiment.submit(estimator)
# Printing the handle shows the run's experiment, id, type and current status.
print(run)

Run(Experiment: keras-mnis-distr-hvd,
Id: keras-mnis-distr-hvd_1544701322306,
Type: azureml.scriptrun,
Status: Queued)


### Monitor your run
You can monitor the progress of the run with a Jupyter widget. Like the run submission, the widget is asynchronous and provides live updates every 10-15 seconds until the job completes.

In [10]:
from azureml.widgets import RunDetails
# Build the monitoring widget for the submitted run, then render it.
run_details_widget = RunDetails(run)
run_details_widget.show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': True, 'log_level': 'INFO', 's…