In [1]:
import azureml.core
from azureml.core import Workspace, Experiment
from azureml.core.datastore import Datastore
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.environment import CondaDependencies
from azureml.core.runconfig import RunConfiguration
from azureml.exceptions import ComputeTargetException
from azureml.data.data_reference import DataReference
from azureml.pipeline.steps import HyperDriveStep, HyperDriveStepRun, PythonScriptStep
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.train.dnn import TensorFlow
# from azureml.train.hyperdrive import *
from azureml.train.hyperdrive import RandomParameterSampling, BanditPolicy, HyperDriveConfig, PrimaryMetricGoal
from azureml.train.hyperdrive import choice, loguniform

import os
import shutil
import urllib
import numpy as np
# import matplotlib.pyplot as plt

# Show which azureml-core SDK version this kernel has loaded.
sdk_version = azureml.core.VERSION
print("SDK version:", sdk_version)

SDK version: 1.1.5


## Initialize workspace

Initialize a workspace object from persisted configuration. If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, make sure the config file is present at .\config.json

In [2]:
# Build the workspace handle from the persisted configuration file.
ws = Workspace.from_config()

# Echo the identifying fields of the workspace, one per line.
for workspace_field in (ws.name, ws.resource_group, ws.location, ws.subscription_id):
    print(workspace_field)

avadevitsmlsvc
RG-ITSMLTeam-Dev
westus2
ff2e23ae-7d7c-4cbd-99b8-116bb94dca6e


In [3]:
# Experiment under which the pipeline runs in this notebook are submitted.
experiment_name = 'Hyperdrive_test'
exp = Experiment(workspace=ws, name=experiment_name)

In [4]:
# Use the workspace's default datastore as the home of the training data.
ds = ws.get_default_datastore()
# The data is assumed to be uploaded to this datastore already.
# If it is not, refer to the original version of this notebook for how to
# download the files from the web and upload them, e.g.:
# ds.upload(src_dir='./data/mnist', target_path='mnist', overwrite=True, show_progress=True)

## Retrieve or create an Azure Machine Learning compute
Azure Machine Learning Compute is a service for provisioning and managing clusters of Azure virtual machines for running machine learning workloads. Let's create a new Azure Machine Learning Compute in the current workspace, if it doesn't already exist. We will then run the training script on this compute target.

The next cell first looks for a compute target with the given name. If none is found, it creates a new one. Creation is broken down into the following steps:

1. Create the configuration
2. Create the Azure Machine Learning compute

**This process takes a few minutes and produces only sparse output while it runs. Please wait until the call returns before moving on to the next cell.**


In [5]:
# Name of the compute cluster to reuse, or to create when it does not exist yet.
cluster_name = "gpu-cluster"

try:
    # Look the cluster up by name in the workspace.
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    # Lookup failed, so provision a fresh cluster.
    print('Creating a new compute target...')

    # 1. Create the configuration.
    compute_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_NC6",
        max_nodes=4,
    )

    # 2. Create the compute and wait (with a 20-minute timeout) for it to come up.
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
    compute_target.wait_for_completion(show_output=True, timeout_in_minutes=20)
else:
    print('Found existing compute target {}.'.format(cluster_name))

print("Azure Machine Learning Compute attached")

Found existing compute target gpu-cluster.
Azure Machine Learning Compute attached


In [6]:
# TensorFlow estimator that runs tf_mnist.py from ./tf-mnist on the compute
# target selected above, with GPU support enabled.
est = TensorFlow(
    source_directory='./tf-mnist',
    entry_script='tf_mnist.py',
    compute_target=compute_target,
    framework_version='1.13',
    use_gpu=True,
)

In [7]:
# Search space for the sweep. Each key is a command-line flag name and each
# value is the expression it is sampled from.
# NOTE(review): the flag names presumably match the arguments parsed by the
# training script (tf_mnist.py) — confirm against that script.
hyperparameter_space = {
    '--batch-size': choice(25, 50, 100),
    '--first-layer-neurons': choice(10, 50, 200, 300, 500),
    '--second-layer-neurons': choice(10, 50, 200, 500),
    # NOTE(review): per the HyperDrive docs, loguniform(a, b) draws
    # exp(uniform(a, b)) — verify before changing the bounds.
    '--learning-rate': loguniform(-6, -1),
}

# Random sampling over the space defined above.
ps = RandomParameterSampling(hyperparameter_space)

In [8]:
# Bandit early-termination policy handed to the HyperDrive configuration:
# evaluation interval of 2 and a slack factor of 0.1.
early_termination_policy = BanditPolicy(
    slack_factor=0.1,
    evaluation_interval=2,
)

In [9]:
# HyperDrive sweep definition: the estimator is run with hyperparameters drawn
# from `ps`, the goal is to maximize the 'validation_acc' metric, and at most
# 4 runs are launched in total (all 4 may run concurrently).
hd_config = HyperDriveConfig(
    estimator=est,
    hyperparameter_sampling=ps,
    policy=early_termination_policy,
    primary_metric_name='validation_acc',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_concurrent_runs=4,
    max_total_runs=4,
)

In [10]:
# Reference to the datastore, exposed to pipeline steps as "mnist_data".
# NOTE(review): no path_on_datastore is given, so this presumably points at the
# datastore root — confirm the training script resolves its subfolder itself.
data_folder = DataReference(data_reference_name="mnist_data", datastore=ds)

In [11]:
# Name under which the HyperDrive metrics are published as a pipeline-level
# output. It was previously assigned but never used; it is now passed to
# PipelineData so the metrics are published under this name.
metrics_output_name = 'metrics_output'

# Intermediate data that carries the HyperDrive step's metrics to any
# downstream step that lists it as an input.
metrics_data = PipelineData(name='metrics_data',
                            datastore=ds,
                            pipeline_output_name=metrics_output_name)

# Pipeline step that runs the HyperDrive sweep configured in `hd_config`.
hd_step_name = 'hd_step01'
hd_step = HyperDriveStep(
    name=hd_step_name,
    hyperdrive_config=hd_config,
    # The data reference is passed to the entry script as its --data-folder
    # argument and must also be declared as a step input.
    estimator_entry_script_arguments=['--data-folder', data_folder],
    inputs=[data_folder],
    metrics_output=metrics_data,
    allow_reuse=True)

In [12]:
# Directory-typed output that the "get best run" step writes into.
best_run_data = PipelineData('best_run_data', is_directory=True, datastore=ds)

# Python run configuration for the step: it targets the same compute as the
# sweep and installs pandas and azureml-defaults through pip.
cd = CondaDependencies.create(
    pip_packages=['pandas', 'azureml-defaults'],
    pin_sdk_version=True,
)
conda_run_config = RunConfiguration(framework="python")
conda_run_config.target = compute_target
conda_run_config.environment.python.conda_dependencies = cd

# The step's script is taken from ./metrics under the current working directory.
metrics_source_dir = os.path.join(os.getcwd(), 'metrics')

# Step that consumes the HyperDrive metrics and writes to best_run_data.
best_run_step = PythonScriptStep(
    name='get best run',
    source_directory=metrics_source_dir,
    script_name='metrics.py',
    arguments=[
        '--input_file', metrics_data,
        '--output_dir', best_run_data,
    ],
    inputs=[metrics_data],
    outputs=[best_run_data],
    compute_target=compute_target,
    runconfig=conda_run_config,
    allow_reuse=True,
)

In [13]:
# Only the final step is listed here. hd_step is not named explicitly, but
# best_run_step takes metrics_data (hd_step's metrics output) as an input, and
# the submission log shows hd_step01 being created as part of this pipeline.
pipeline = Pipeline(workspace=ws, steps=[best_run_step])

### Run the pipeline

In [14]:
# Submit the pipeline as a run of the experiment and keep the returned run
# object so its completion can be awaited later.
pipeline_run = exp.submit(pipeline)

Created step get best run [d7beb79a][a9842d81-7995-43c1-bb55-df20aa92e1b0], (This step is eligible to reuse a previous run's output)Created step hd_step01 [008ab04a][5129465c-2f07-4257-833f-7578c0855442], (This step will run and generate new outputs)

Using data reference mnist_data for StepId [e0e113af][c34f1c52-35c4-4553-a4fd-0ffa20b750c6], (Consumers of this data are eligible to reuse prior runs.)
Submitted PipelineRun ffdbcfe9-cda2-4bcd-840c-1b55a5ab22f1
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/Hyperdrive_test/runs/ffdbcfe9-cda2-4bcd-840c-1b55a5ab22f1?wsid=/subscriptions/ff2e23ae-7d7c-4cbd-99b8-116bb94dca6e/resourcegroups/RG-ITSMLTeam-Dev/workspaces/avadevitsmlsvc


### Wait for the completion of this Pipeline run

In [None]:
# Wait for the pipeline run to finish; show_output=True echoes the run and
# step status into the cell output while waiting.
pipeline_run.wait_for_completion(show_output=True)

PipelineRunId: ffdbcfe9-cda2-4bcd-840c-1b55a5ab22f1
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/Hyperdrive_test/runs/ffdbcfe9-cda2-4bcd-840c-1b55a5ab22f1?wsid=/subscriptions/ff2e23ae-7d7c-4cbd-99b8-116bb94dca6e/resourcegroups/RG-ITSMLTeam-Dev/workspaces/avadevitsmlsvc
PipelineRun Status: NotStarted
PipelineRun Status: Running


StepRunId: 3c0f6c46-2445-4caf-940b-23ffad9f03d6
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/Hyperdrive_test/runs/3c0f6c46-2445-4caf-940b-23ffad9f03d6?wsid=/subscriptions/ff2e23ae-7d7c-4cbd-99b8-116bb94dca6e/resourcegroups/RG-ITSMLTeam-Dev/workspaces/avadevitsmlsvc
StepRun( hd_step01 ) Status: NotStarted
StepRun( hd_step01 ) Status: Running


In [None]:
# Submit the same pipeline object a second time.
# NOTE(review): presumably a check that steps created with allow_reuse=True
# reuse the outputs of the first run — confirm intent.
pipeline_run2 = exp.submit(pipeline)

In [None]:
# Wait for the second submission to finish, echoing status into the cell output.
pipeline_run2.wait_for_completion(show_output=True)

In [None]:
# Build a separate Pipeline object from the same final step as `pipeline`.
# NOTE(review): the definition is identical to `pipeline`; presumably this
# checks step reuse across independently constructed pipelines — confirm intent.
pipeline3 = Pipeline(workspace=ws, steps=[best_run_step])

In [None]:
# Submit the rebuilt pipeline as another run of the same experiment.
pipeline_run3 = exp.submit(pipeline3)

In [None]:
# Wait for the third submission to finish, echoing status into the cell output.
pipeline_run3.wait_for_completion(show_output=True)