In [1]:
from azureml.core import Workspace, Experiment

ws=Workspace.from_config()
ws.write_config(path='.azureml')

exp = Experiment(workspace=ws, name="udacity-project")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = exp.start_logging()

Workspace name: quick-starts-ws-133356
Azure region: southcentralus
Subscription id: 2c48c51c-bd47-40d4-abbe-fb8eabd19c8c
Resource group: aml-quickstarts-133356


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

### YOUR CODE HERE ###
cluster_name = "aml-compute"

try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', max_nodes=4)
    cpu_cluster = ComputeTarget.create(ws, cluster_name, compute_config)

cpu_cluster.wait_for_completion(show_output=True)

Creating
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [18]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive import choice
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform
import os

# Specify parameter sampler

ps = RandomParameterSampling( {
        "C": uniform(0, 100),
        "max_iter": choice(50, 75, 100, 150)
    }
)


# Specify a Policy
policy = BanditPolicy(slack_factor = 0.1, evaluation_interval=1)

if "training" not in os.listdir():
    os.mkdir("./training")
    
script_folder = './training'
os.makedirs(script_folder, exist_ok=True)
import shutil
shutil.copy('./train.py', script_folder)

# Create a SKLearn estimator for use with train.py
est = SKLearn(source_directory=script_folder, compute_target=cpu_cluster, entry_script="train.py", vm_size="Standard_D2_V2")

# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(estimator=est,
                             hyperparameter_sampling=ps,
                             policy=policy,
                             primary_metric_name="Accuracy",
                             primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                             max_total_runs=25,
                             max_concurrent_runs=3)



In [19]:
# Submit your hyperdrive run to the experiment and show run details with the widget.

hyperdrive_run = exp.submit(hyperdrive_config)

RunDetails(hyperdrive_run).show()



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [20]:
import joblib
# Get your best run and save the model from that run.

### YOUR CODE HERE ###
best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run.get_details()
hyperdrive_run.get_children_sorted_by_primary_metric(top=1)

[{'run_id': 'HD_3138a734-c8b8-4ba3-b579-0c96091d0942_2',
  'hyperparameters': '{"C": 53.95724941830583, "max_iter": 50}',
  'best_primary_metric': 0.917298937784522,
  'status': 'Completed'}]

In [2]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

### YOUR CODE HERE ###

ds = TabularDatasetFactory.from_delimited_files(path='https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv')


In [3]:

def clean_data(data):
    # Dict for cleaning data
    months = {"jan":1, "feb":2, "mar":3, "apr":4, "may":5, "jun":6, "jul":7, "aug":8, "sep":9, "oct":10, "nov":11, "dec":12}
    weekdays = {"mon":1, "tue":2, "wed":3, "thu":4, "fri":5, "sat":6, "sun":7}

    # Clean and one hot encode data
    x_df = data.to_pandas_dataframe().dropna()
    jobs = pd.get_dummies(x_df.job, prefix="job")
    x_df.drop("job", inplace=True, axis=1)
    x_df = x_df.join(jobs)
    x_df["marital"] = x_df.marital.apply(lambda s: 1 if s == "married" else 0)
    x_df["default"] = x_df.default.apply(lambda s: 1 if s == "yes" else 0)
    x_df["housing"] = x_df.housing.apply(lambda s: 1 if s == "yes" else 0)
    x_df["loan"] = x_df.loan.apply(lambda s: 1 if s == "yes" else 0)
    contact = pd.get_dummies(x_df.contact, prefix="contact")
    x_df.drop("contact", inplace=True, axis=1)
    x_df = x_df.join(contact)
    education = pd.get_dummies(x_df.education, prefix="education")
    x_df.drop("education", inplace=True, axis=1)
    x_df = x_df.join(education)
    x_df["month"] = x_df.month.map(months)
    x_df["day_of_week"] = x_df.day_of_week.map(weekdays)
    x_df["poutcome"] = x_df.poutcome.apply(lambda s: 1 if s == "success" else 0)

    y_df = x_df.pop("y").apply(lambda s: 1 if s == "yes" else 0)
    return x_df, y_df


In [4]:


# Use the clean_data function to clean your data.
import pandas as pd

x, y = clean_data(ds)

data = pd.concat([x,y], axis=1)

In [5]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task="classification",
    primary_metric="accuracy",
    training_data=data,
    label_column_name='y',
    n_cross_validations=5)

In [9]:
pip install -r /anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/automl/core/validated_linux_requirements.txt

Collecting certifi<=2020.11.8
  Downloading certifi-2020.11.8-py2.py3-none-any.whl (155 kB)
[K     |████████████████████████████████| 155 kB 5.9 MB/s eta 0:00:01
Collecting pyopenssl<=19.1.0
  Downloading pyOpenSSL-19.1.0-py2.py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 2.0 MB/s  eta 0:00:01
[31mERROR: azureml-contrib-notebook 1.19.0 has requirement nbconvert<6, but you'll have nbconvert 6.0.7 which is incompatible.[0m
[31mERROR: azure-cli 2.16.0 has requirement azure-graphrbac~=0.60.0, but you'll have azure-graphrbac 0.61.1 which is incompatible.[0m
[31mERROR: azure-cli 2.16.0 has requirement azure-mgmt-containerregistry==3.0.0rc15, but you'll have azure-mgmt-containerregistry 2.8.0 which is incompatible.[0m
[31mERROR: azure-cli 2.16.0 has requirement azure-mgmt-keyvault==8.0.0, but you'll have azure-mgmt-keyvault 2.2.0 which is incompatible.[0m
[31mERROR: azure-cli 2.16.0 has requirement azure-mgmt-storage~=16.0.0, but you'll have azure-mgmt-sto

In [6]:
# Submit your automl run
experiment_auto = Experiment(ws, 'automl_remote')
remote_run = experiment_auto.submit(automl_config, show_output=True)
RunDetails(remote_run).show()


No run_configuration provided, running on local with default configuration
Running on local machine
Parent Run ID: AutoML_bf4e367a-3697-4352-bc08-81087803cab9

Current status: DatasetEvaluation. Gathering dataset statistics.
Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetFeaturizationCompleted. Completed fit featurizers and featurizing the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/Auto

NameError: name 'RunDetails' is not defined

In [8]:
# Retrieve and save your best automl model.

best_run = remote_run.get_best_child()

best_run.get_details()

{'runId': 'AutoML_bf4e367a-3697-4352-bc08-81087803cab9_40',
 'status': 'Completed',
 'startTimeUtc': '2021-01-03T20:44:26.827455Z',
 'endTimeUtc': '2021-01-03T20:45:52.618643Z',
 'properties': {'runTemplate': 'automl_child',
  'pipeline_id': '__AutoML_Ensemble__',
  'pipeline_spec': '{"pipeline_id":"__AutoML_Ensemble__","objects":[{"module":"azureml.train.automl.ensemble","class_name":"Ensemble","spec_class":"sklearn","param_args":[],"param_kwargs":{"automl_settings":"{\'task_type\':\'classification\',\'primary_metric\':\'accuracy\',\'verbosity\':20,\'ensemble_iterations\':15,\'is_timeseries\':False,\'name\':\'automl_remote\',\'compute_target\':\'local\',\'subscription_id\':\'510b94ba-e453-4417-988b-fbdc37b55ca7\',\'region\':\'southcentralus\',\'spark_service\':None}","ensemble_run_id":"AutoML_bf4e367a-3697-4352-bc08-81087803cab9_40","experiment_name":null,"workspace_name":"quick-starts-ws-133206","subscription_id":"510b94ba-e453-4417-988b-fbdc37b55ca7","resource_group_name":"aml-quick

In [10]:
best_run, best_model=remote_run.get_output()

In [11]:
best_run.get_metrics()

{'recall_score_weighted': 0.9169650986342944,
 'precision_score_weighted': 0.9116702071770761,
 'log_loss': 0.19463934016680914,
 'AUC_macro': 0.9481940490323199,
 'f1_score_weighted': 0.9136422697181559,
 'balanced_accuracy': 0.7551036793335658,
 'recall_score_micro': 0.9169650986342944,
 'norm_macro_recall': 0.5102073586671317,
 'AUC_weighted': 0.9481940490323199,
 'f1_score_macro': 0.7748062966591373,
 'accuracy': 0.9169650986342944,
 'average_precision_score_macro': 0.8286422054211595,
 'average_precision_score_weighted': 0.9563382782161319,
 'average_precision_score_micro': 0.9817490068972891,
 'recall_score_macro': 0.7551036793335658,
 'precision_score_macro': 0.7997978611799305,
 'f1_score_micro': 0.9169650986342944,
 'matthews_correlation': 0.5530123283647104,
 'AUC_micro': 0.9810075227790301,
 'precision_score_micro': 0.9169650986342944,
 'weighted_accuracy': 0.9571637331557966,
 'confusion_matrix': 'aml://artifactId/ExperimentRun/dcid.AutoML_bf4e367a-3697-4352-bc08-81087803ca

In [12]:
best_run.get_details()

{'runId': 'AutoML_bf4e367a-3697-4352-bc08-81087803cab9_40',
 'status': 'Completed',
 'startTimeUtc': '2021-01-03T20:44:26.827455Z',
 'endTimeUtc': '2021-01-03T20:45:52.618643Z',
 'properties': {'runTemplate': 'automl_child',
  'pipeline_id': '__AutoML_Ensemble__',
  'pipeline_spec': '{"pipeline_id":"__AutoML_Ensemble__","objects":[{"module":"azureml.train.automl.ensemble","class_name":"Ensemble","spec_class":"sklearn","param_args":[],"param_kwargs":{"automl_settings":"{\'task_type\':\'classification\',\'primary_metric\':\'accuracy\',\'verbosity\':20,\'ensemble_iterations\':15,\'is_timeseries\':False,\'name\':\'automl_remote\',\'compute_target\':\'local\',\'subscription_id\':\'510b94ba-e453-4417-988b-fbdc37b55ca7\',\'region\':\'southcentralus\',\'spark_service\':None}","ensemble_run_id":"AutoML_bf4e367a-3697-4352-bc08-81087803cab9_40","experiment_name":null,"workspace_name":"quick-starts-ws-133206","subscription_id":"510b94ba-e453-4417-988b-fbdc37b55ca7","resource_group_name":"aml-quick

In [13]:
#save best automl model
best_run.register_model(model_name='automl_best_model.pkl',model_path='outputs/')

Model(workspace=Workspace.create(name='quick-starts-ws-133206', subscription_id='510b94ba-e453-4417-988b-fbdc37b55ca7', resource_group='aml-quickstarts-133206'), name=automl_best_model.pkl, id=automl_best_model.pkl:1, version=1, tags={}, properties={})

In [14]:
best_run

Experiment,Id,Type,Status,Details Page,Docs Page
automl_remote,AutoML_bf4e367a-3697-4352-bc08-81087803cab9_40,,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [3]:
#Delete the cpu cluster after the training and with that free up the resources:
cpu_cluster.delete()