# Hyperparameter Tuning using HyperDrive

Import Dependencies. In the cell below, import all the dependencies that you will need to complete the project.

In [1]:
from azureml.core import Workspace, Experiment, Environment, ScriptRunConfig
from azureml.data.dataset_factory import TabularDatasetFactory as tdf
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy, MedianStoppingPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice
import os
import joblib

## Set Up

### Load Workspace Elements, Create an Experiment

In [2]:
# Connect to the workspace described by the local Azure ML config file.
ws = Workspace.from_config()

# Experiment name, plus the folder that is later used as the run's source directory.
experiment_name = 'MS-Malware-Hyper'
project_folder = '.'

experiment = Experiment(workspace=ws, name=experiment_name)

# Summarise the workspace, one field per line.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print('\n'.join(workspace_summary))

Workspace name: quick-starts-ws-133708
Azure region: southcentralus
Subscription id: 5a4ab2ba-6c51-4805-8155-58759ad589d8
Resource group: aml-quickstarts-133708


### Create/Get Compute Cluster

In [3]:
cpu_cluster_name = "malware-compute"

# Reuse the cluster if the workspace already has one with this name;
# otherwise provision a new one and wait until provisioning finishes.
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        max_nodes=4,
    )
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
    cpu_cluster.wait_for_completion(show_output=True)
else:
    print('Cluster {} exists. Will use this cluster.'.format(cpu_cluster_name))

Cluster malware-compute exists. Will use this cluster.


### Create an Environment

In [4]:
%%writefile hyperdrive_dependencies.yml

# Conda specification for the HyperDrive training environment.
# This file is loaded by Environment.from_conda_specification in the next cell.
dependencies:
- python=3.6.2
- scikit-learn
- pandas 
- numpy
# Packages below are installed with pip rather than conda.
- pip:
    - azureml-defaults
    - xgboost

Overwriting hyperdrive_dependencies.yml


In [5]:
# Build the run environment from the conda specification file written above.
hyperdrive_env = Environment.from_conda_specification(
    name='hyperdrive-env',
    file_path='./hyperdrive_dependencies.yml',
)

## Data Set

See the `automl.ipynb` notebook for an explanation of the data set.

In [6]:
# Register the remote CSV as a tabular dataset.
data_path = 'https://raw.githubusercontent.com/tybyers/AZMLND_projects/capstone/capstone/data/train_1_10k.csv'
dataset = tdf.from_delimited_files(path=data_path)

# Materialise the dataset as a pandas frame and preview its first rows.
dataset_frame = dataset.to_pandas_dataframe()
dataset_frame.head()

Unnamed: 0,ProductName,EngineVersion,AppVersion,AvSigVersion,IsBeta,RtpStateBitfield,IsSxsPassiveMode,DefaultBrowsersIdentifier,AVProductStatesIdentifier,AVProductsInstalled,...,Census_FirmwareVersionIdentifier,Census_IsSecureBootEnabled,Census_IsWIMBootEnabled,Census_IsVirtualDevice,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier,HasDetections
0,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1735.0,0,7,0,,53447.0,1.0,...,36144,0,,0,0,0,0.0,0,10,0
1,win8defender,1.1.14600.4,4.13.17134.1,1.263.48.0,0,7,0,,53447.0,1.0,...,57858,0,,0,0,0,0.0,0,8,0
2,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1341.0,0,7,0,,53447.0,1.0,...,52682,0,,0,0,0,0.0,0,3,0
3,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1527.0,0,7,0,,53447.0,1.0,...,20050,0,,0,0,0,0.0,0,3,1
4,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1379.0,0,7,0,,53447.0,1.0,...,19844,0,0.0,0,0,0,0.0,0,1,1


## Hyperdrive Configuration

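TODO: Explain the model you are using and the reason for choosing the different hyperparameters, termination policy and config settings.

Summary of the configuration below: the training script `xgbtrain.py` is tuned with random parameter sampling over `--max_depth` (integers 2–10), `--n_estimators` (25, 50, 100, 250, 500, 750 or 1000) and `--learning_rate` (uniform between 0 and 1). A Bandit early-termination policy (`slack_factor=0.1`, `evaluation_interval=2`) is applied, the primary metric `Accuracy` is maximized, and the sweep is limited to 100 total runs with at most 4 running concurrently.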

In [12]:
# Early termination policy: Bandit with a 10% slack factor and an evaluation interval of 2.
early_termination_policy = BanditPolicy(slack_factor=0.1, evaluation_interval=2)

# Search space for random sampling. Each key is the command-line flag
# under which the sampled value is handed to the training script.
search_space = {
    '--max_depth': choice(range(2, 11)),
    '--n_estimators': choice(25, 50, 100, 250, 500, 750, 1000),
    '--learning_rate': uniform(0, 1.0),
}
param_sampling = RandomParameterSampling(search_space)

# Run configuration shared by every child run: script, compute target and environment.
src = ScriptRunConfig(
    source_directory=project_folder,
    script='xgbtrain.py',
    compute_target=cpu_cluster,
    environment=hyperdrive_env,
)

# HyperDrive sweep: maximise the logged 'Accuracy' metric over at most
# 100 runs, with up to 4 running at the same time.
hyperdrive_config = HyperDriveConfig(
    run_config=src,
    hyperparameter_sampling=param_sampling,
    policy=early_termination_policy,
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=100,
    max_concurrent_runs=4,
)

In [13]:
# Submit the HyperDrive configuration to the experiment and keep the run handle.
hyperdrive_run = experiment.submit(config=hyperdrive_config)

## Run Details

OPTIONAL: Write about the different models trained and their performance. Why do you think some models did better than others?

TODO: In the cell below, use the `RunDetails` widget to show the different experiments.

In [14]:
# Show the monitoring widget for the HyperDrive run.
run_details_widget = RunDetails(hyperdrive_run)
run_details_widget.show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [15]:
# Wait for the HyperDrive run to complete, printing its output along the way.
# The call's return value is the last expression, so it is displayed as the cell output.
hyperdrive_run.wait_for_completion(show_output=True)

RunId: HD_9af624e2-cb55-42f4-a5a1-9ba437174b5a
Web View: https://ml.azure.com/experiments/MS-Malware-Hyper/runs/HD_9af624e2-cb55-42f4-a5a1-9ba437174b5a?wsid=/subscriptions/5a4ab2ba-6c51-4805-8155-58759ad589d8/resourcegroups/aml-quickstarts-133708/workspaces/quick-starts-ws-133708

Execution Summary
RunId: HD_9af624e2-cb55-42f4-a5a1-9ba437174b5a
Web View: https://ml.azure.com/experiments/MS-Malware-Hyper/runs/HD_9af624e2-cb55-42f4-a5a1-9ba437174b5a?wsid=/subscriptions/5a4ab2ba-6c51-4805-8155-58759ad589d8/resourcegroups/aml-quickstarts-133708/workspaces/quick-starts-ws-133708



{'runId': 'HD_9af624e2-cb55-42f4-a5a1-9ba437174b5a',
 'target': 'malware-compute',
 'status': 'Completed',
 'startTimeUtc': '2021-01-06T23:21:54.859815Z',
 'endTimeUtc': '2021-01-07T00:22:16.460142Z',
 'properties': {'primary_metric_config': '{"name": "Accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '8fa5a128-ff38-41cd-b64c-cd8ec4e5bba8',
  'score': '0.6283',
  'best_child_run_id': 'HD_9af624e2-cb55-42f4-a5a1-9ba437174b5a_34',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://mlstrg133708.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_9af624e2-cb55-42f4-a5a1-9ba437174b5a/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=qXwBc6K7mWFOViI1ZZ6EcTkLP2L5uoxzH%2F1BX6m5wck%3D&st=2021-01-07T00%3A12%3A44Z&se=2021-01-07T08%3A22%3A44Z&sp=r'}}

In [16]:
# Stop here unless the sweep finished successfully; the cells below
# select and register a best run and are meaningless otherwise.
# `assert` is a statement, so it is written without call-style parentheses,
# and the message reports the actual status to make a failure diagnosable.
final_status = hyperdrive_run.get_status()
assert final_status == "Completed", \
    "HyperDrive run ended with status {!r}, expected 'Completed'".format(final_status)

## Best Model

TODO: In the cell below, get the best model from the hyperdrive experiments and display all the properties of the model.

In [17]:
# Pick the child run with the best value of the primary metric.
best_run = hyperdrive_run.get_best_run_by_primary_metric()

# The SDK returns None when no child run logged the primary metric; fail with a
# clear message instead of an AttributeError on the next line.
if best_run is None:
    raise RuntimeError(
        "No child run reported the primary metric; cannot select a best run."
    )

# Display the metrics logged by the best run.
best_run.get_metrics()

{'max_depth:': 2,
 'n_estimators:': 50,
 'learning_rate:': 0.36632226178129756,
 'Accuracy': 0.6283}

In [18]:
# Display the properties recorded on the best child run.
best_run.properties

{'_azureml.ComputeTargetType': 'amlcompute',
 'ContentSnapshotId': '8fa5a128-ff38-41cd-b64c-cd8ec4e5bba8',
 'ProcessInfoFile': 'azureml-logs/process_info.json',
 'ProcessStatusFile': 'azureml-logs/process_status.json'}

Save the best model

In [19]:
# Print the run definition (script, arguments, environment, ...) of the best run.
best_run_details = best_run.get_details()
print(best_run_details['runDefinition'])

{'script': 'xgbtrain.py', 'command': '', 'useAbsolutePath': False, 'arguments': ['--learning_rate', '0.36632226178129756', '--max_depth', '2', '--n_estimators', '50'], 'sourceDirectoryDataStore': None, 'framework': 'Python', 'communicator': 'None', 'target': 'malware-compute', 'dataReferences': {}, 'data': {}, 'outputData': {}, 'jobName': None, 'maxRunDurationSeconds': 2592000, 'nodeCount': 1, 'priority': None, 'credentialPassthrough': False, 'environment': {'name': 'hyperdrive-env', 'version': 'Autosave_2021-01-06T21:31:18Z_0dc07082', 'python': {'interpreterPath': 'python', 'userManagedDependencies': False, 'condaDependencies': {'dependencies': ['python=3.6.2', 'scikit-learn', 'pandas', 'numpy', {'pip': ['azureml-defaults', 'xgboost']}], 'name': 'azureml_523fb8d0d2520274d0060eaa29065899'}, 'baseCondaEnvironment': None}, 'environmentVariables': {'EXAMPLE_ENV_VAR': 'EXAMPLE_VALUE'}, 'docker': {'baseImage': 'mcr.microsoft.com/azureml/intelmpi2018.3-ubuntu16.04:20200821.v1', 'platform': {

In [20]:
# List the files stored with the best run, to find the saved model's path.
best_run_files = best_run.get_file_names()
print(best_run_files)

['azureml-logs/55_azureml-execution-tvmps_46091c72c6c4772d6e52e10e4fe9b5fb5769ffdacf8f5cb81d9f041fb13069b0_d.txt', 'azureml-logs/65_job_prep-tvmps_46091c72c6c4772d6e52e10e4fe9b5fb5769ffdacf8f5cb81d9f041fb13069b0_d.txt', 'azureml-logs/70_driver_log.txt', 'azureml-logs/75_job_post-tvmps_46091c72c6c4772d6e52e10e4fe9b5fb5769ffdacf8f5cb81d9f041fb13069b0_d.txt', 'azureml-logs/process_info.json', 'azureml-logs/process_status.json', 'logs/azureml/99_azureml.log', 'logs/azureml/dataprep/backgroundProcess.log', 'logs/azureml/dataprep/backgroundProcess_Telemetry.log', 'logs/azureml/dataprep/engine_spans_l_c31a3733-c7f8-4912-9370-770c833ede0b.jsonl', 'logs/azureml/dataprep/python_span_l_c31a3733-c7f8-4912-9370-770c833ede0b.jsonl', 'logs/azureml/job_prep_azureml.log', 'logs/azureml/job_release_azureml.log', 'outputs/model.joblib']


In [21]:
# Register the model file saved by the best run under the name 'best_hyperdrive'.
model = best_run.register_model(
    model_name='best_hyperdrive',
    model_path='outputs/model.joblib',
)

# Print name, id and version on one tab-separated line.
print('\t'.join(str(field) for field in (model.name, model.id, model.version)))

best_hyperdrive	best_hyperdrive:1	1


## Model Deployment

We opted to deploy the AutoML model because it had a higher accuracy -- 0.638 compared to the best accuracy here of 0.628. Please see the automl.ipynb notebook for details of that deployment.