In [1]:
import os

import joblib
import pandas as pd
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core import Workspace, Experiment
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice
from azureml.train.automl import AutoMLConfig
from azureml.data.dataset_factory import TabularDatasetFactory

from src.train import clean_data

In [2]:
# Connect to the workspace described by ./config.json and open the experiment
# that the runs in this notebook are submitted to.
ws = Workspace.from_config(path='./config.json')
exp = Experiment(workspace=ws, name="hyperdrive-automl")

# Echo the workspace coordinates, one per line, so the reader can confirm
# which workspace is in use.
print('\n'.join([
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]))

Workspace name: quick-starts-ws-133560
Azure region: southcentralus
Subscription id: f9d5a085-54dc-4215-9ba6-dad5d86e60a0
Resource group: aml-quickstarts-133560


In [3]:
# Create compute cluster
# Reuse the cluster named `cluster_name` if the workspace already has one;
# otherwise provision a new AmlCompute cluster under that name.
cluster_name = "hyperdrive-aml"

try:
    target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    # Only a failed lookup should trigger provisioning. Any other failure
    # (authentication, networking, a keyboard interrupt) must surface instead
    # of being mistaken for "cluster does not exist".
    config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', max_nodes=4)
    target = ComputeTarget.create(ws, cluster_name, config)

# Wait for the cluster operation to finish, streaming its status as it goes.
target.wait_for_completion(show_output=True)
print('done')

Creating
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
done


In [4]:
# Estimator that runs src/train.py on the cluster named by `cluster_name`.
# NOTE(review): constructing `SKLearn` emits a deprecation warning that
# recommends `ScriptRunConfig`; migrating needs an environment definition.
est = SKLearn(source_directory='./src/', entry_script='train.py', compute_target=cluster_name)

# Search space: random sampling over discrete choices for `--C` and `--max_iter`.
ps = RandomParameterSampling(
    {
        "--C": choice(0.01, 0.1, 10),
        "--max_iter": choice(20, 50, 100, 200),
    }
)

# Bandit early-termination policy.
policy = BanditPolicy(slack_factor=0.1, evaluation_interval=1, delay_evaluation=5)

# Tie estimator, sampler and policy together: maximise the 'Accuracy' metric
# over at most 50 runs, 4 at a time.
hyperdrive_config = HyperDriveConfig(
    estimator=est,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=50,
    max_concurrent_runs=4,
)


'SKLearn' estimator is deprecated. Please use 'ScriptRunConfig' from 'azureml.core.script_run_config' with your own defined environment or the AzureML-Tutorial curated environment.


In [5]:
# Launch the HyperDrive sweep in the experiment.
hyperdrive_run = exp.submit(config=hyperdrive_config)

# Show the run-details widget, then wait for the sweep to finish while
# streaming its log output.
run_widget = RunDetails(hyperdrive_run)
run_widget.show()
hyperdrive_run.wait_for_completion(show_output=True)



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_973673aa-4fd2-4fc9-8483-46867874010f
Web View: https://ml.azure.com/experiments/hyperdrive-automl/runs/HD_973673aa-4fd2-4fc9-8483-46867874010f?wsid=/subscriptions/a24a24d5-8d87-4c8a-99b6-91ed2d2df51f/resourcegroups/aml-quickstarts-133542/workspaces/quick-starts-ws-133542

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-01-05T19:36:57.695904][API][INFO]Experiment created<END>\n"<START>[2021-01-05T19:36:58.4995457Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>"<START>[2021-01-05T19:36:58.507520][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n""<START>[2021-01-05T19:36:58.832640][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"

Execution Summary
RunId: HD_973673aa-4fd2-4fc9-8483-46867874010f
Web View: https://ml.azure.com/experiments/hyperdrive-automl/runs/HD_973673aa-4fd2-4fc9-8483-46867874010f?wsid=/subscriptions/a

{'runId': 'HD_973673aa-4fd2-4fc9-8483-46867874010f',
 'target': 'hyperdrive-aml',
 'status': 'Completed',
 'startTimeUtc': '2021-01-05T19:36:57.388843Z',
 'endTimeUtc': '2021-01-05T19:52:02.332809Z',
 'properties': {'primary_metric_config': '{"name": "Accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '39a0d2f1-2f32-4909-8cee-b70f789389cd',
  'score': '0.9125948406676783',
  'best_child_run_id': 'HD_973673aa-4fd2-4fc9-8483-46867874010f_2',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://mlstrg133542.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_973673aa-4fd2-4fc9-8483-46867874010f/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=AfVAlnoW2ELx%2B11FkD4%2BsayfXzm0VuW3zWyh3RsSUvA%3D&st=2021-01-05T19%3A42%3A10Z&se=2021-01-06T03%3A52%3A10Z&sp=r'}}

In [6]:
# Select the child run with the best primary metric and print the metrics it logged.
hyperdrive_best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_metrics = hyperdrive_best_run.get_metrics()
print("Best params: ", best_metrics)

# Register that run's model file in the workspace under the name 'best_hyperdrive'.
model = hyperdrive_best_run.register_model(
    model_path='./outputs/hyperdrive_model.joblib',
    model_name='best_hyperdrive',
)

# Download the registered model into ./outputs; the call's return value is the
# cell's displayed result.
model.download(target_dir="outputs", exist_ok=True)

Best params:  {'Regularization Strength:': 10.0, 'Max iterations:': 50, 'Accuracy': 0.9125948406676783}


'outputs\\hyperdrive_model.joblib'

### AutoML runs

In [4]:
# Build a tabular dataset from the bankmarketing_train.csv sample file.
ds = TabularDatasetFactory.from_delimited_files(
    path=[
        'https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv'
    ]
)

In [5]:
# Configure AutoML: a classification task on column 'y' of `ds`, scored by
# accuracy with 4-fold cross-validation, running on the `target` cluster.
automl_config = AutoMLConfig(
    task='classification',
    training_data=ds,
    label_column_name='y',
    primary_metric='accuracy',
    n_cross_validations=4,
    compute_target=target,
    max_concurrent_iterations=4,
    iterations=15,
    experiment_timeout_minutes=30,
    enable_early_stopping=True,
    blocked_models=['SVM'],  # excluded because it is slow
)

# Submit the AutoML run to the experiment, printing progress to the cell output.
remote_run = exp.submit(config=automl_config, show_output=True)

# Show the run-details widget for the AutoML run.
RunDetails(remote_run).show()

Running on remote.
No run_configuration provided, running on hyperdrive-aml with default configuration
Running on remote compute: hyperdrive-aml
Parent Run ID: AutoML_f43d53d8-afe6-4714-a59f-642f50fd3d7d

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [6]:
# Fetch the best AutoML run together with its model object.
aml_best_run, aml_model = remote_run.get_output()

# Copy that run's 'outputs/model.pkl' artifact to a local file.
aml_best_run.download_file(
    name='outputs/model.pkl',
    output_file_path='outputs/aml_model.pkl',
)

In [11]:
# Display every metric logged by the best AutoML run.
aml_metrics = aml_best_run.get_metrics()
aml_metrics

{'balanced_accuracy': 0.7476864226183557,
 'f1_score_micro': 0.9172077578055048,
 'f1_score_weighted': 0.9130559987680588,
 'recall_score_micro': 0.9172077578055047,
 'accuracy': 0.9172077578055047,
 'precision_score_macro': 0.8033915801582914,
 'norm_macro_recall': 0.49537284523671143,
 'precision_score_micro': 0.9172077578055047,
 'precision_score_weighted': 0.9110023211259382,
 'matthews_correlation': 0.5480898742964511,
 'weighted_accuracy': 0.9593150030292856,
 'average_precision_score_weighted': 0.956467575128199,
 'AUC_macro': 0.9493470542384046,
 'recall_score_weighted': 0.9172077578055047,
 'AUC_micro': 0.981257128994184,
 'average_precision_score_micro': 0.982022872936836,
 'recall_score_macro': 0.7476864226183557,
 'AUC_weighted': 0.9493470542384047,
 'average_precision_score_macro': 0.8284899862716537,
 'f1_score_macro': 0.7714018382589988,
 'log_loss': 0.17702531146752656,
 'accuracy_table': 'aml://artifactId/ExperimentRun/dcid.AutoML_f43d53d8-afe6-4714-a59f-642f50fd3d7d_1

In [13]:
# Report only the accuracy metric of the best AutoML run.
aml_accuracy = aml_best_run.get_metrics()['accuracy']
print('AutoML Accuracy: ', aml_accuracy)

AutoML Accuracy:  0.9172077578055047


In [14]:
# Delete compute target
# Best-effort cleanup: a failed deletion must not abort the notebook, but it is
# reported so the cluster is not silently left behind. Catching `Exception`
# rather than using a bare `except` lets KeyboardInterrupt and SystemExit
# propagate.
try:
    target.delete()
except Exception as err:
    print('Could not delete compute target:', err)