In [1]:
import os

import joblib
import pandas as pd
from azureml.core import Workspace, Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.data.dataset_factory import TabularDatasetFactory
from azureml.train.automl import AutoMLConfig
from azureml.train.hyperdrive.parameter_expressions import choice
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.sklearn import SKLearn
from azureml.widgets import RunDetails

from src.train import clean_data

In [2]:
# Connect to the workspace described by the local config file and open the
# experiment that both the HyperDrive and the AutoML runs are submitted to.
ws = Workspace.from_config(path='./config.json')
exp = Experiment(workspace=ws, name="hyperdrive-automl")

# One line per workspace attribute, printed newline-separated.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print(*workspace_summary, sep='\n')

Note, we have launched a browser for you to login. For old experience with device code, use "az login --use-device-code"
Performing interactive authentication. Please follow the instructions on the terminal.
You have logged in. Now let us find all the subscriptions to which you have access...
Interactive authentication successfully completed.
Workspace name: quick-starts-ws-134201
Azure region: southcentralus
Subscription id: a0a76bad-11a1-4a2d-9887-97a29122c8ed
Resource group: aml-quickstarts-134201


In [3]:
# Create compute cluster (or attach to it when it already exists)
cluster_name = "hyperdrive-aml"

try:
    # Reuse the cluster if one with this name is already in the workspace.
    target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    # Only a failed lookup should trigger provisioning. Anything else
    # (authentication problems, a keyboard interrupt, ...) must surface
    # instead of silently creating a new cluster.
    config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', max_nodes=4)
    target = ComputeTarget.create(ws, cluster_name, config)

# Block until the cluster reaches a terminal provisioning state.
target.wait_for_completion(show_output=True)
print('done')

Creating
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
done


In [4]:
# Search space handed to train.py as command-line arguments; every trial
# draws one value per argument at random.
search_space = {
    "--C": choice(0.01, 0.1, 10),
    "--max_iter": choice(20, 50, 100, 200),
}
ps = RandomParameterSampling(search_space)

# Early-termination policy for the child runs.
policy = BanditPolicy(evaluation_interval=1, slack_factor=0.1, delay_evaluation=5)

# Estimator that runs ./src/train.py on the compute cluster.
est = SKLearn(source_directory='./src/', entry_script='train.py', compute_target=cluster_name)

# Tie estimator, sampler and policy together; the sweep maximises the
# 'Accuracy' metric.
hyperdrive_settings = dict(
    estimator=est,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=50,
    max_concurrent_runs=4,
)
hyperdrive_config = HyperDriveConfig(**hyperdrive_settings)


'SKLearn' estimator is deprecated. Please use 'ScriptRunConfig' from 'azureml.core.script_run_config' with your own defined environment or the AzureML-Tutorial curated environment.


In [5]:
# Launch the hyperparameter sweep in the experiment.
hyperdrive_run = exp.submit(config=hyperdrive_config)

# Show the run-details widget for the sweep.
run_widget = RunDetails(hyperdrive_run)
run_widget.show()

# Stream the logs and block until the sweep finishes; the returned run
# details become this cell's output.
hyperdrive_run.wait_for_completion(show_output=True)



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_7e4e0334-0bf8-41bb-a15b-615190ea5f36
Web View: https://ml.azure.com/experiments/hyperdrive-automl/runs/HD_7e4e0334-0bf8-41bb-a15b-615190ea5f36?wsid=/subscriptions/a0a76bad-11a1-4a2d-9887-97a29122c8ed/resourcegroups/aml-quickstarts-134201/workspaces/quick-starts-ws-134201

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-01-10T10:32:18.716604][API][INFO]Experiment created<END>\n""<START>[2021-01-10T10:32:19.738966][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"<START>[2021-01-10T10:32:20.5111435Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>"<START>[2021-01-10T10:32:19.429219][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n"

Execution Summary
RunId: HD_7e4e0334-0bf8-41bb-a15b-615190ea5f36
Web View: https://ml.azure.com/experiments/hyperdrive-automl/runs/HD_7e4e0334-0bf8-41bb-a15b-615190ea5f36?wsid=/subscriptions/a

{'runId': 'HD_7e4e0334-0bf8-41bb-a15b-615190ea5f36',
 'target': 'hyperdrive-aml',
 'status': 'Completed',
 'startTimeUtc': '2021-01-10T10:32:18.472224Z',
 'endTimeUtc': '2021-01-10T10:46:51.342531Z',
 'properties': {'primary_metric_config': '{"name": "Accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '1006e28f-e10d-472b-ab66-56654b87c03b',
  'score': '0.9125948406676783',
  'best_child_run_id': 'HD_7e4e0334-0bf8-41bb-a15b-615190ea5f36_2',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://mlstrg134201.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_7e4e0334-0bf8-41bb-a15b-615190ea5f36/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=GFI2UMbgrySQ8FiWk9oT32sBRObE5v5DMbKvGwUPwjw%3D&st=2021-01-10T10%3A37%3A23Z&se=2021-01-10T18%3A47%3A23Z&sp=r'}}

In [6]:
# Pick the child run with the best primary metric and report what it logged.
hyperdrive_best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = hyperdrive_best_run.get_metrics()
print("Best params: ", best_run_metrics)

# Register the model file written by that run under a fixed name.
model = hyperdrive_best_run.register_model(
    model_path='./outputs/hyperdrive_model.joblib',
    model_name='best_hyperdrive',
)

# Download a local copy into ./outputs (overwriting an earlier download);
# the call is the cell's last expression, so its return value is displayed.
model.download(exist_ok=True, target_dir="outputs")

Best params:  {'Regularization Strength:': 10.0, 'Max iterations:': 100, 'Accuracy': 0.9125948406676783}


'outputs\\hyperdrive_model.joblib'

### AutoML runs

In [7]:
# Build a tabular dataset from the delimited file at this URL.
bankmarketing_train_url = 'https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv'
ds = TabularDatasetFactory.from_delimited_files(path=[bankmarketing_train_url])

In [8]:
# Configure the AutoML classification experiment on the shared cluster.
automl_config = AutoMLConfig(
    # what to learn
    task='classification',
    training_data=ds,
    label_column_name='y',
    primary_metric='accuracy',
    n_cross_validations=4,
    # where and how long to run
    compute_target=target,
    max_concurrent_iterations=4,
    iterations=15,
    experiment_timeout_minutes=30,
    enable_early_stopping=True,
    # SVM is excluded because it is slow
    blocked_models=['SVM'],
)

# Submit the AutoML run and stream its progress into the cell output.
remote_run = exp.submit(config=automl_config, show_output=True)

# Show the run-details widget for the AutoML parent run.
automl_widget = RunDetails(remote_run)
automl_widget.show()

Running on remote.
No run_configuration provided, running on hyperdrive-aml with default configuration
Running on remote compute: hyperdrive-aml
Parent Run ID: AutoML_74cbfef5-3b64-4356-a753-e2893a7c15f6

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [9]:
# get_output() yields a (run, model) pair for the AutoML run; keep both.
aml_best_run, aml_model = remote_run.get_output()

# Save that run's model file locally under a distinct name.
aml_best_run.download_file(
    name='outputs/model.pkl',
    output_file_path='outputs/aml_model.pkl',
)

In [10]:
# Show performance: display every metric logged by the run that
# remote_run.get_output() returned (bare last expression, so the notebook
# renders the returned dict).
aml_best_run.get_metrics()

{'precision_score_weighted': 0.9113192470725119,
 'f1_score_weighted': 0.9133558168646536,
 'AUC_weighted': 0.9494888390476206,
 'precision_score_macro': 0.8029522193823603,
 'accuracy': 0.9172988214259388,
 'average_precision_score_macro': 0.8285663189420476,
 'log_loss': 0.17760197321556268,
 'balanced_accuracy': 0.7498659900340786,
 'norm_macro_recall': 0.4997319800681572,
 'weighted_accuracy': 0.9588867163697785,
 'average_precision_score_micro': 0.9820662882292477,
 'recall_score_micro': 0.9172988214259388,
 'matthews_correlation': 0.5501100592156605,
 'AUC_micro': 0.9812978430679353,
 'f1_score_micro': 0.9172988214259388,
 'recall_score_macro': 0.7498659900340786,
 'AUC_macro': 0.9494888390476207,
 'average_precision_score_weighted': 0.9565051474161843,
 'precision_score_micro': 0.9172988214259388,
 'f1_score_macro': 0.7726526676202616,
 'recall_score_weighted': 0.9172988214259388,
 'accuracy_table': 'aml://artifactId/ExperimentRun/dcid.AutoML_74cbfef5-3b64-4356-a753-e2893a7c15f6

In [11]:
# Print the model object returned by remote_run.get_output().
print(aml_model)

Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('prefittedsoftvotingclassifier',...
                                                                                                dual=False,
                                                                                                fit_intercept=True,
                                                                                                intercept_scaling=1,
                                                   

In [27]:
# The second pipeline step is a (name, estimator) pair; display the member
# estimators held by that step's estimator.
ensemble_step_name, ensemble_estimator = aml_model.steps[1]
ensemble_estimator.estimators

[('0',
  Pipeline(memory=None,
           steps=[('maxabsscaler', MaxAbsScaler(copy=True)),
                  ('lightgbmclassifier',
                   LightGBMClassifier(boosting_type='gbdt', class_weight=None,
                                      colsample_bytree=1.0,
                                      importance_type='split', learning_rate=0.1,
                                      max_depth=-1, min_child_samples=20,
                                      min_child_weight=0.001, min_split_gain=0.0,
                                      n_estimators=100, n_jobs=1, num_leaves=31,
                                      objective=None, random_state=None,
                                      reg_alpha=0.0, reg_lambda=0.0, silent=True,
                                      subsample=1.0, subsample_for_bin=200000,
                                      subsample_freq=0, verbose=-10))],
           verbose=False)),
 ('1',
  Pipeline(memory=None,
           steps=[('maxabsscaler', MaxAbsSca

In [15]:
# Display the tags attached to the best AutoML run. In the recorded output
# these include the ensembled algorithms and their weights.
aml_best_run.get_tags()

{'_aml_system_azureml.automlComponent': 'AutoML',
 '_aml_system_ComputeTargetStatus': '{"AllocationState":"steady","PreparingNodeCount":0,"RunningNodeCount":1,"CurrentNodeCount":4}',
 'ensembled_iterations': '[0, 1, 6, 7, 8]',
 'ensembled_algorithms': "['LightGBM', 'XGBoostClassifier', 'XGBoostClassifier', 'XGBoostClassifier', 'LogisticRegression']",
 'ensemble_weights': '[0.46153846153846156, 0.07692307692307693, 0.15384615384615385, 0.23076923076923078, 0.07692307692307693]',
 'best_individual_pipeline_score': '0.9154171641510163',
 'best_individual_iteration': '0',
 '_aml_system_automl_is_child_run_end_telemetry_event_logged': 'True'}

In [13]:
# Report the best AutoML run's 'accuracy' metric.
automl_accuracy = aml_best_run.get_metrics()['accuracy']
print('AutoML Accuracy: ', automl_accuracy)

AutoML Accuracy:  0.9172988214259388


In [14]:
# Delete compute target
try:
    target.delete()
except Exception as err:
    # Cleanup stays best-effort (the notebook must not fail here), but a
    # failed deletion is reported: a cluster left running keeps costing money.
    # Catching Exception rather than using a bare except lets
    # KeyboardInterrupt/SystemExit propagate.
    print('Could not delete compute target:', err)