In [None]:
from azureml.core import Workspace, Experiment

#ws = Workspace.get(name="udacity-project")
ws = Workspace.get(name="quick-starts-ws-126831")
exp = Experiment(workspace=ws, name="udacity-project")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = exp.start_logging()

In [3]:
from azureml.core.compute import ComputeTarget, AmlCompute

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.
from azureml.core.compute_target import ComputeTargetException

compute_name = "mycompute"
try:
    compute = ComputeTarget(workspace=ws, name=compute_name)
    print('Compute cluster {} already exists!'.format(compute_name))
except ComputeTargetException:
    config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', max_nodes=4)
    compute = ComputeTarget.create(ws, compute_name, config)
    
compute.wait_for_completion()

Compute cluster mycompute already exists!


In [4]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform
import os

# Specify parameter sampler
from azureml.train.hyperdrive import loguniform, choice
ps = RandomParameterSampling(
    {
        '--C': loguniform(-5, 3), 
        '--max_iter': choice(50, 100, 500, 1000, 5000, 10000)
    }
)

# Specify a Policy
policy = BanditPolicy(evaluation_interval=100, slack_factor=0.1)

if "training" not in os.listdir():
    os.mkdir("./training")
import shutil
training_folder = "./training"
shutil.copyfile('train.py', os.path.join(training_folder, 'train.py'))

# Create a SKLearn estimator for use with train.py
from azureml.train.estimator import Estimator
est = Estimator(source_directory=training_folder,
                entry_script="train.py",
                compute_target=compute,
                conda_packages=['scikit-learn==0.21.3', 'pandas==0.23.4']
               )


# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(
    estimator=est,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
#    max_total_runs=1000)
    max_total_runs=15)

In [6]:
# Submit your hyperdrive run to the experiment and show run details with the widget.
run = exp.submit(hyperdrive_config)
RunDetails(run).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [10]:
import joblib
# Get your best run and save the model from that run.
best_run = run.get_best_run_by_primary_metric()
print(best_run.get_metrics())
print(best_run.get_file_names())
os.makedirs('./model', exist_ok=True)
best_run.download_file(name='outputs/model.lib', output_file_path='./model/lr_bankmarketing.lib')
best_run.register_model(model_name='lr_bankmarketing', model_path='outputs/model.lib')
#model = best_run.register_model(model_name='lr_bankmarketing', model_path='outputs/lr_bankmarketing.joblib')


{'Regularization Strength:': 0.3580435578530266, 'Max iterations:': 1000, 'Accuracy': 0.9148710166919575}
['azureml-logs/55_azureml-execution-tvmps_aa88043d91350a1626647832327ba137859136e698338d35df748190eaa8c1e4_d.txt', 'azureml-logs/65_job_prep-tvmps_aa88043d91350a1626647832327ba137859136e698338d35df748190eaa8c1e4_d.txt', 'azureml-logs/70_driver_log.txt', 'azureml-logs/75_job_post-tvmps_aa88043d91350a1626647832327ba137859136e698338d35df748190eaa8c1e4_d.txt', 'azureml-logs/process_info.json', 'azureml-logs/process_status.json', 'logs/azureml/101_azureml.log', 'logs/azureml/dataprep/backgroundProcess.log', 'logs/azureml/dataprep/backgroundProcess_Telemetry.log', 'logs/azureml/dataprep/engine_spans_l_9ce68d32-0739-4235-a2ae-151d97626120.jsonl', 'logs/azureml/dataprep/python_span_l_9ce68d32-0739-4235-a2ae-151d97626120.jsonl', 'logs/azureml/job_prep_azureml.log', 'logs/azureml/job_release_azureml.log', 'outputs/model.lib']


Model(workspace=Workspace.create(name='quick-starts-ws-126843', subscription_id='374bdf1a-c648-4244-a317-f0d1ef4b85c7', resource_group='aml-quickstarts-126843'), name=lr_bankmarketing, id=lr_bankmarketing:2, version=2, tags={}, properties={})

In [11]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
url = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
dataset = TabularDatasetFactory.from_delimited_files(url)

In [23]:
from train import clean_data

# Use the clean_data function to clean your data.
from train import clean_data
x, y = clean_data(dataset)
td = x.join(y)
os.makedirs('./data', exist_ok=True)
td.to_csv('data/bankmarketing_train_cleaned.csv')
ds = ws.get_default_datastore()
ds.upload(src_dir='./data', target_path='data', overwrite=True, show_progress=True)

dataset = TabularDatasetFactory.from_delimited_files(path=ds.path('data/bankmarketing_train_cleaned.csv'))
print(dataset.to_pandas_dataframe().head)

Uploading an estimated of 1 files
Uploading ./data/bankmarketing_train_cleaned.csv
Uploaded ./data/bankmarketing_train_cleaned.csv, 1 files out of an estimated total of 1
Uploaded 1 files
<bound method NDFrame.head of        Column1  age  marital  default  housing  loan  month  day_of_week  \
0            0   57        1        0        0     1      5            1   
1            1   55        1        0        1     0      5            4   
2            2   33        1        0        0     0      5            5   
3            3   36        1        0        0     0      6            5   
4            4   27        1        0        1     0      7            5   
...        ...  ...      ...      ...      ...   ...    ...          ...   
32945    32945   56        1        0        0     1      7            1   
32946    32946   37        1        0        0     1      7            5   
32947    32947   26        0        0        0     0      5            2   
32948    32948   31   

In [24]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task="classification",
    compute_target=compute,
    primary_metric='accuracy',
    training_data=dataset,
    label_column_name="y",
    n_cross_validations=5)

print(automl_config)

<azureml.train.automl.automlconfig.AutoMLConfig object at 0x7f7a89f4a940>


In [26]:
# Submit your automl run
automl_run = exp.submit(automl_config, show_output=True)

Running on remote.
Running on remote compute: mycompute
Parent Run ID: AutoML_d1a8b765-bcdd-4354-ab0e-0901f8cff071

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------------------------+----

In [32]:
# Retrieve and save your best automl model.
best_automl_run, best_automl_model = automl_run.get_output()
#best_automl_run.register_model(model_name='automl_bankmarketing', model_path='outputs/automl_bankmarketing.joblib')

In [34]:
print(best_automl_run.get_metrics())
print(best_automl_run.get_file_names())
best_run.download_file(name='outputs/model.pkl', output_file_path='./model/auto_ml_bankmarketing.lib')
best_run.register_model(model_name='auto_ml_bankmarketing', model_path='outputs/model.pkl')

{'weighted_accuracy': 0.9602884294672795, 'log_loss': 0.1765879562427169, 'f1_score_macro': 0.7606163176581189, 'f1_score_weighted': 0.909852201164554, 'f1_score_micro': 0.915113808801214, 'matthews_correlation': 0.5289140231173124, 'precision_score_micro': 0.915113808801214, 'average_precision_score_macro': 0.8265639266968844, 'recall_score_micro': 0.915113808801214, 'average_precision_score_weighted': 0.9558618474823046, 'AUC_micro': 0.9806877897029803, 'AUC_weighted': 0.9473292621855611, 'recall_score_weighted': 0.915113808801214, 'balanced_accuracy': 0.7331907225203123, 'norm_macro_recall': 0.4663814450406246, 'precision_score_weighted': 0.9076461382703298, 'recall_score_macro': 0.7331907225203123, 'AUC_macro': 0.9473292621855611, 'precision_score_macro': 0.8000982855397496, 'accuracy': 0.915113808801214, 'average_precision_score_micro': 0.9815172112148698, 'accuracy_table': 'aml://artifactId/ExperimentRun/dcid.AutoML_d1a8b765-bcdd-4354-ab0e-0901f8cff071_1/accuracy_table', 'confusi

Model(workspace=Workspace.create(name='quick-starts-ws-126843', subscription_id='374bdf1a-c648-4244-a317-f0d1ef4b85c7', resource_group='aml-quickstarts-126843'), name=auto_ml_bankmarketing, id=auto_ml_bankmarketing:1, version=1, tags={}, properties={})

In [35]:
# Clean up compute cluster
try:
    compute.delete()
except ComputeTargetException as e:
    print(e.exception_message)
    print("Failed to clean up compute cluster!")
          
try:
    ComputeTarget(workspace=ws, name=compute_name)
except ComputeTargetException:
    print('Compute cluster {} no longer exists.'.format(compute_name))