# HyperDriveStep() Hang

When this notebook is run in our environment, the pipeline hangs indefinitely in HyperDriveStep(). The hang occurs for any use of HyperDriveStep(), so this example uses only a dummy script; the code being run is unimportant.

See also https://learn.microsoft.com/en-us/answers/questions/889157/why-is-my-hyperdrive-step-not-completing-even-when?comment=question#newest-question-comment

## Setup -- Modify these to run

In [None]:
# Workspace identifiers passed to Workspace.get():
# WS_NAME is the workspace name, WS_ID is supplied as the subscription_id
# argument, and WS_RESOURCE_GRP as the resource_group argument.
WS_NAME = "XX"
WS_ID = "XX"
WS_RESOURCE_GRP = "XX"

In [None]:
# Name of the environment to fetch from the workspace with Environment.get()
ENV_NAME = "XX"

In [None]:
# Name of the compute target (cluster); passed to ScriptRunConfig as compute_target
COMP_TARGET = "XX"

In [None]:
# Name of the datastore that holds the metrics output; looked up with Datastore.get()
DATASTORE_NAME = "XX"

In [None]:
# Number of concurrent parallel runs (used as max_concurrent_runs in the
# HyperDriveConfig). Aim to use about 80% of the cluster capacity, or just set
# it to something small like 2-3.
NUM_PROC = 3

## Dummy Python Script
Write the script below to a file in the current directory. It will be used by ScriptRunConfig().

In [None]:
%%writefile empty_step.py

import argparse
import os
from azureml.core import Run
from azureml.core import Datastore, Dataset
from azureml.core.workspace import Workspace

# Handle to the current run; main() logs its metrics through it.
run = Run.get_context()
# Workspace that owns the run's experiment (not used further in this script).
ws = run.experiment.workspace

# Command-line interface: one option per hyperparameter in the search space
# that the notebook hands to HyperDrive.
parser = argparse.ArgumentParser()

parser.add_argument(
    '--learning_rate',
    type=float,
    default=0.1,
    help='Learning Rate',
)
# argparse applies `type` only to string values, so the default is written as
# a float to keep args.min_child_weight a float even when the flag is omitted.
parser.add_argument(
    '--min_child_weight',
    type=float,
    default=1.0,
    help='Controls Overfitting',
)

# The parsed values are not consumed anywhere in this script; it is a dummy step.
args = parser.parse_args()

def main():
    """Log fixed dummy metrics to the current run.

    'AUC' is the primary metric name configured for the HyperDrive sweep,
    which is set up to maximise it. It is therefore logged as a number
    rather than as the string '0.7'.
    """
    run.log('Runtype', 'TEST')
    run.log('AUC', 0.7)


if __name__ == '__main__':
    main()


## Packages

In [None]:
from azureml.core import Dataset, ScriptRunConfig, Experiment, Datastore, Environment
from azureml.core.runconfig import RunConfiguration
from azureml.core.workspace import Workspace
from azureml.core.compute import ComputeTarget
from azureml.pipeline.core import PipelineData, Pipeline, PipelineRun, StepSequence, TrainingOutput
from azureml.pipeline.steps import HyperDriveStep
from azureml.train.hyperdrive import HyperDriveConfig, PrimaryMetricGoal, BayesianParameterSampling, uniform, HyperDriveRun
import math

## Workspace and compute

In [None]:
# Connect to the existing workspace described by the setup constants
ws = Workspace.get(
    name=WS_NAME,
    subscription_id=WS_ID,
    resource_group=WS_RESOURCE_GRP,
)

In [None]:
# Fetch the environment named ENV_NAME from the workspace; it is used as the
# environment of the ScriptRunConfig.
std_env = Environment.get(workspace=ws, name=ENV_NAME)

In [None]:
# Datastore that backs the PipelineData holding the step's metrics output
train_datastore = Datastore.get(ws, DATASTORE_NAME)

## Create the Step

In [None]:
# Pipeline output that receives the metrics of the parameter-tuning step
step_metrics_data = PipelineData(
    name='hanging_step',
    datastore=train_datastore,
    pipeline_output_name='step_metrics',
    training_output=TrainingOutput(type='Metrics'),
)

In [None]:
# Base run configuration for the dummy script (no tuned parameters are set here)
step_run_config = ScriptRunConfig(
    source_directory='.',
    script='./empty_step.py',
    environment=std_env,
    compute_target=COMP_TARGET,
)

In [None]:
# Search space: each key is a command-line flag of empty_step.py,
# sampled uniformly from the given range
step_tuning = BayesianParameterSampling(
    {
        '--learning_rate': uniform(0.005, 0.2),
        '--min_child_weight': uniform(50, 500),
    }
)

In [None]:
# max_total_runs is kept deliberately small so that the hang shows up quickly.
# The resulting warning can be ignored.
step_hyperdrive_config = HyperDriveConfig(
    run_config=step_run_config,
    hyperparameter_sampling=step_tuning,
    primary_metric_name="AUC",
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=3,  # small on purpose: makes the hang appear fast
    max_concurrent_runs=NUM_PROC,
)

In [None]:
# Pipeline step built from the HyperDrive configuration, with
# step_metrics_data as its output
step = HyperDriveStep(
    name='model-tune',
    hyperdrive_config=step_hyperdrive_config,
    outputs=[step_metrics_data],
)

## Pipeline

In [None]:
# The pipeline consists of the single HyperDrive step
step_sequence = StepSequence(steps=[step])

In [None]:
# Assemble the pipeline and the experiment it will be submitted to
pipeline = Pipeline(
    workspace=ws,
    steps=step_sequence,
    default_source_directory='.',
    description="HyperDriveStep hang",
)
pipeline_exp = Experiment(workspace=ws, name='eechxgb-hang')

In [None]:
# Submit the pipeline to the experiment and keep the returned run object
pipeline_run = pipeline_exp.submit(pipeline)