In [2]:
from azureml.core import Workspace, Experiment

# Connect to the Azure ML workspace via Workspace.from_config() and bind the
# experiment under which every run in this notebook is submitted.
ws = Workspace.from_config()
exp = Experiment(name="udacity-project", workspace=ws)

# Echo the workspace identity, one field per line, as a quick sanity check
# that we are pointed at the intended subscription / resource group.
print('\n'.join((
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
)))

# Start an interactive logging run on the experiment.
# NOTE(review): `run` is not referenced by any later visible cell — confirm it
# is still needed, otherwise it stays open for the lifetime of the session.
run = exp.start_logging()

Workspace name: quick-starts-ws-134374
Azure region: southcentralus
Subscription id: 61c5c3f0-6dc7-4ed9-a7f3-c704b20e3b30
Resource group: aml-quickstarts-134374


In [3]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

cluster_name = "cpu-cluster"

try:
    # Reuse the cluster if one with this name already exists in the workspace.
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # Only a "compute target not found" style failure should trigger provisioning.
    # A bare `except:` would also swallow authentication errors, typos and
    # KeyboardInterrupt, and then try to create a cluster anyway.
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', max_nodes=4)
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)

# Block until the cluster reaches a terminal provisioning state (runs for both
# the reuse path and the create path, as before).
compute_target.wait_for_completion(show_output=True)

# Dump the current cluster status for the record.
print(compute_target.get_status().serialize())

Creating
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2021-01-11T19:34:22.896000+00:00', 'errors': None, 'creationTime': '2021-01-11T19:34:13.948600+00:00', 'modifiedTime': '2021-01-11T19:34:30.647063+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_D2_V2'}


In [5]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice
import os

# Search space for the sweep: every child run draws one value per argument at
# random from the discrete candidate lists below and passes them to train.py.
search_space = {
    '--C': choice(0.001, 0.01, 0.1, 1, 10, 20, 50, 100, 200, 500, 1000),
    '--max_iter': choice(50, 100, 300),
}
ps = RandomParameterSampling(search_space)

# Early-termination policy: Bandit with a slack factor of 0.1, evaluated at
# every second interval.
policy = BanditPolicy(slack_factor=0.1, evaluation_interval=2)

# Make sure a local "training" folder exists in the current working directory.
# NOTE(review): the estimator below uses "./" as its source directory, so this
# folder is not referenced anywhere in the visible cells — confirm it is needed.
if not any(entry == "training" for entry in os.listdir()):
    os.mkdir("./training")

# SKLearn estimator that runs train.py from the notebook directory on the cluster.
# NOTE(review): vm_size is passed alongside an existing compute_target; it looks
# redundant — confirm against the estimator documentation before removing.
est = SKLearn(
    source_directory="./",
    entry_script="train.py",
    compute_target=compute_target,
    vm_size='STANDARD_D2_V2',
)

# Tie estimator, sampler and policy together; the sweep maximizes the
# 'Accuracy' metric over at most 16 child runs.
hyperdrive_config = HyperDriveConfig(
    estimator=est,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=16,
)


In [6]:
# Kick off the HyperDrive sweep under the experiment.
hyperdrive_run = exp.submit(config=hyperdrive_config)

# Render the run-details widget for this sweep inside the notebook.
details_widget = RunDetails(hyperdrive_run)
details_widget.show()

# Block until the sweep finishes, streaming its logs; the returned value is
# left as the cell's last expression so it is displayed as the cell output.
hyperdrive_run.wait_for_completion(show_output=True)



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_ef7e1cff-49dc-46a3-9675-670293f9ac96
Web View: https://ml.azure.com/experiments/udacity-project/runs/HD_ef7e1cff-49dc-46a3-9675-670293f9ac96?wsid=/subscriptions/61c5c3f0-6dc7-4ed9-a7f3-c704b20e3b30/resourcegroups/aml-quickstarts-134374/workspaces/quick-starts-ws-134374

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-01-11T19:37:29.235508][API][INFO]Experiment created<END>\n""<START>[2021-01-11T19:37:29.782450][GENERATOR][INFO]Trying to sample '16' jobs from the hyperparameter space<END>\n"<START>[2021-01-11T19:37:30.4442887Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>"<START>[2021-01-11T19:37:30.425239][GENERATOR][INFO]Successfully sampled '16' jobs, they will soon be submitted to the execution target.<END>\n"

Execution Summary
RunId: HD_ef7e1cff-49dc-46a3-9675-670293f9ac96
Web View: https://ml.azure.com/experiments/udacity-project/runs/HD_ef7e1cff-49dc-46a3-9675-670293f9ac96?wsid=/subscriptions/61c

{'runId': 'HD_ef7e1cff-49dc-46a3-9675-670293f9ac96',
 'target': 'cpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2021-01-11T19:37:29.001403Z',
 'endTimeUtc': '2021-01-11T19:46:33.044314Z',
 'properties': {'primary_metric_config': '{"name": "Accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': 'bfaed2f8-2054-4706-b282-d277c61642a9',
  'score': '0.91442097596504',
  'best_child_run_id': 'HD_ef7e1cff-49dc-46a3-9675-670293f9ac96_6',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://mlstrg134374.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_ef7e1cff-49dc-46a3-9675-670293f9ac96/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=6dkgVty6wEwZ8Djk6LpJ4sQz8xxCmX2kxzjr6scXBlY%3D&st=2021-01-11T19%3A36%3A39Z&se=2021-01-12T03%3A46%3A39Z&sp=r'}}

In [7]:
import joblib
# Pick the child run that scored best on the sweep's primary metric and show
# the metrics it logged.
# NOTE(review): the original task text also asks to save the model from that
# run; this cell only prints metrics and does not persist any model.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
print(best_run_metrics)

{'Regularization Strength:': 10.0, 'Max iterations:': 50, 'Accuracy': 0.91442097596504}


In [8]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Build a TabularDataset from the bank-marketing training CSV hosted at the
# URL below (passed to the factory as a single-element list).
bankmarketing_train_url = 'https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv'

ds = TabularDatasetFactory.from_delimited_files([bankmarketing_train_url])

In [9]:
from train import clean_data

# Run the project's clean_data helper (imported from train.py) on the dataset
# and unpack its two results into x and y.
# NOTE(review): x and y are not used by any later visible cell — the AutoML
# configuration below is given `ds` directly. Confirm whether that is intended.
x, y = clean_data(ds)

In [10]:
from azureml.train.automl import AutoMLConfig

# AutoML settings, collected in one place so they are easy to review and tweak.
# NOTE: DO NOT CHANGE experiment_timeout_minutes OR YOUR INSTANCE WILL TIME OUT.
# To run the experiment for longer, run this notebook in your own Azure tenant,
# which will incur personal costs.
automl_settings = {
    'experiment_timeout_minutes': 30,
    'primary_metric': 'accuracy',
    'n_cross_validations': 5,
    'training_data': ds,
    'label_column_name': 'y',
    # The compute target is referenced by name here, not by object.
    'compute_target': cluster_name,
}

# Classification task over the dataset, with column 'y' as the label.
automl_config = AutoMLConfig(task='classification', **automl_settings)

In [11]:
# Launch the AutoML experiment with the configuration defined above;
# show_output=True is passed through to the submission unchanged.
automl_run = exp.submit(
    config=automl_config,
    show_output=True,
)

Running on remote.
No run_configuration provided, running on cpu-cluster with default configuration
Running on remote compute: cpu-cluster
Parent Run ID: AutoML_5204ca1f-2525-41b8-95d8-d653080c9220

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input

In [12]:
# Print everything returned by automl_run.get_metrics(), one "name value"
# pair per line.
best_automlrun_metrics = automl_run.get_metrics()
for metric_name, metric_value in best_automlrun_metrics.items():
    print(metric_name, metric_value)


# get_output() yields a pair, unpacked here as the best run and its model;
# the model is then written to a local joblib file.
# (joblib is imported in an earlier cell of this notebook.)
best_automl_run, best_automl_model = automl_run.get_output()
joblib.dump(best_automl_model, 'automl_model.joblib')

experiment_status ['DatasetEvaluation', 'FeaturesGeneration', 'DatasetFeaturization', 'DatasetFeaturizationCompleted', 'DatasetBalancing', 'DatasetCrossValidationSplit', 'ModelSelection']
experiment_status_description ['Gathering dataset statistics.', 'Generating features for the dataset.', 'Beginning to fit featurizers and featurize the dataset.', 'Completed fit featurizers and featurizing the dataset.', 'Performing class balancing sweeping', 'Generating individually featurized CV splits.', 'Beginning model selection.']
f1_score_weighted 0.9143276692139521
recall_score_micro 0.9169650986342944
AUC_weighted 0.9470025744744618
balanced_accuracy 0.7620163831683666
average_precision_score_weighted 0.9555856232753932
norm_macro_recall 0.5240327663367331
precision_score_macro 0.7979526065015026
log_loss 0.219350259774646
matthews_correlation 0.5587087189124692
average_precision_score_micro 0.9812851285652568
weighted_accuracy 0.9554573100624861
average_precision_score_macro 0.82625044590131

['automl_model.joblib']

In [13]:
# Clean-up: delete the compute target used by this notebook once all runs are done.
compute_target.delete()