In [2]:
from azureml.core import Workspace, Experiment

# Look up the workspace by name and get a handle to the "automl" experiment in it.
ws = Workspace.get(name="quick-starts-ws-127521")
exp2 = Experiment(workspace=ws, name="automl")

# Echo the workspace coordinates, one per line, as a quick sanity check.
print('\n'.join([
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]))

# Start an interactive run on the experiment.
# NOTE(review): no later cell in this notebook completes this run, and the
# name `run` is rebound when the best AutoML run is fetched -- confirm that
# this interactive run is still wanted.
run = exp2.start_logging()

Workspace name: quick-starts-ws-127521
Azure region: southcentralus
Subscription id: 8bb47da5-84b5-43cf-bd4a-97928e5c9b08
Resource group: aml-quickstarts-127521


In [3]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException


# Compute cluster for the AutoML run: reuse the cluster named "automl" when the
# lookup succeeds, otherwise provision one with vm_size "Standard_D2_V2" and a
# maximum of 4 nodes (the limit stated by the exercise).
cluster2 = "automl"
try:
    compute_target = ComputeTarget(workspace=ws, name=cluster2)
except ComputeTargetException:
    # The lookup failed, so request a new cluster.
    print('Create a new compute target')
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='Standard_D2_V2',
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(
        workspace=ws,
        name=cluster2,
        provisioning_configuration=compute_config,
    )
else:
    print("Compute target exists")

Create a new compute target


In [6]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Build a TabularDataset straight from the hosted bank-marketing training CSV;
# the tabular form is what the cleaning step below consumes.
tabular = TabularDatasetFactory.from_delimited_files(
    path="https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
)

In [7]:
# Show the dataset's repr (its source URL and the processing steps in its definition).
tabular

{
  "source": [
    "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
  ],
  "definition": [
    "GetFiles",
    "ParseDelimited",
    "DropColumns",
    "SetColumnTypes"
  ]
}

In [8]:
from train import clean_data
import pandas as pd
# Use the clean_data function (imported from the `train` module) to clean your data.
# It returns two objects: the feature table `x` and the label column `y`.
x, y = clean_data(tabular)


In [9]:
# Preview the first three rows of the cleaned feature table.
x.head(3)

Unnamed: 0,age,marital,default,housing,loan,month,day_of_week,duration,campaign,pdays,...,contact_cellular,contact_telephone,education_basic.4y,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown
0,57,1,0,0,1,5,1,371,1,999,...,1,0,0,0,0,1,0,0,0,0
1,55,1,0,1,0,5,4,285,2,999,...,0,1,0,0,0,0,0,0,0,1
2,33,1,0,0,0,5,5,52,1,999,...,1,0,0,0,1,0,0,0,0,0


In [10]:
# Preview the first three label values.
y.head(3)

0    0
1    0
2    0
Name: y, dtype: int64

In [11]:
# Put the features and the label side by side in one table (column-wise
# concatenation) so the cleaned data can be written out as a single file.
df = pd.concat([x, y], axis="columns")
df.head()

Unnamed: 0,age,marital,default,housing,loan,month,day_of_week,duration,campaign,pdays,...,contact_telephone,education_basic.4y,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown,y
0,57,1,0,0,1,5,1,371,1,999,...,0,0,0,0,1,0,0,0,0,0
1,55,1,0,1,0,5,4,285,2,999,...,1,0,0,0,0,0,0,0,1,0
2,33,1,0,0,0,5,5,52,1,999,...,0,0,0,1,0,0,0,0,0,0
3,36,1,0,0,0,6,5,355,4,999,...,1,0,0,0,1,0,0,0,0,0
4,27,1,0,1,0,7,5,189,2,999,...,0,0,0,0,1,0,0,0,0,0


In [15]:
import os
# Make sure the local staging directory exists. exist_ok=True makes this a
# no-op when the directory is already there, so the cell can be re-run safely
# without a separate existence check.
os.makedirs('datadir', exist_ok=True)

# Write the cleaned data (features + label) to a CSV file so it can be
# uploaded to the datastore; the DataFrame index is not written.
df.to_csv("datadir/data.csv", index=False)

# Fetch the workspace's default datastore and display it as the cell result.
dstore = ws.get_default_datastore()
dstore

{
  "name": "workspaceblobstore",
  "container_name": "azureml-blobstore-88640893-e9cb-4796-8bf2-0bd03118d588",
  "account_name": "mlstrg127521",
  "protocol": "https",
  "endpoint": "core.windows.net"
}

In [16]:
# Upload the contents of ./datadir to the 'automl-dataset' folder of the
# datastore, replacing blobs that already exist. The whole directory is sent,
# so any other file sitting in ./datadir is uploaded alongside data.csv.
dstore.upload(
    src_dir='./datadir',
    target_path='automl-dataset',
    overwrite=True,
    show_progress=True,
)
 


Uploading an estimated of 2 files
Uploading ./datadir/training_data.csv
Uploaded ./datadir/training_data.csv, 1 files out of an estimated total of 2
Uploading ./datadir/data.csv
Uploaded ./datadir/data.csv, 2 files out of an estimated total of 2
Uploaded 2 files


$AZUREML_DATAREFERENCE_77387f9c08094fc0801a13c1ee8101cd

In [18]:
from azureml.core import Dataset, Datastore
from azureml.data.datapath import DataPath


# Name of the label column in the uploaded data.
label = "y"

# Expose the uploaded CSV as a tabular dataset so it can be used by the AutoML
# compute.
data = Dataset.Tabular.from_delimited_files(
    path=dstore.path('automl-dataset/data.csv'),
)

Since 5-fold cross-validation is performed, each training pass uses 4/5 of the data for training and
the remaining 1/5 for validation, with a different holdout fold each time.

In [19]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
#
# Settings:
#   - classification on the label column 'y', ranked by accuracy
#   - 5-fold cross-validation on the uploaded training dataset
#   - at most 100 iterations within the 30-minute experiment timeout
#   - max_concurrent_iterations is capped at 4: the cluster is provisioned with
#     max_nodes=4 and an AmlCompute cluster runs one iteration per node, so a
#     higher value cannot be honoured.
automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task='classification',
    primary_metric='accuracy',
    training_data=data,
    label_column_name='y',
    n_cross_validations=5,
    compute_target=compute_target,
    iterations=100,
    max_concurrent_iterations=4)

In [20]:
# Submit the AutoML configuration to the experiment and stream its progress
# into the cell output.
aml_run = exp2.submit(config=automl_config, show_output=True)


Running on remote.
Running on remote compute: automl
Parent Run ID: AutoML_87c610c3-3498-4e8b-abb4-0d38c5f6dc85

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------------------------+--------------------------------------+
|Size of the smallest class       |Name/Label of the small

In [21]:
# Wait for the AutoML run to finish while showing its output; the value
# returned by the call is displayed as the cell result.
aml_run.wait_for_completion(show_output=True)



****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------------------------+--------------------------------------+
|Size of the smallest class       |Name/Label of the smallest class |Number of samples in the training data|
|3692                             |1                                |32950                                 |
+---------------------------------+---------------------------------+--------------------------------------+

********************************************

{'runId': 'AutoML_87c610c3-3498-4e8b-abb4-0d38c5f6dc85',
 'target': 'automl',
 'status': 'Completed',
 'startTimeUtc': '2020-11-23T11:02:23.471099Z',
 'endTimeUtc': '2020-11-23T11:43:52.073638Z',
 'properties': {'num_iterations': '100',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'automl',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"97e51620-63a0-4119-ae28-abe2df1ba8b6\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"automl-dataset/data.csv\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"aml-quickstarts-127521\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"8bb47da5-84b5-43cf-bd4a-97928e5c9b08\\\\\\", \\\\\\"workspaceNa

In [22]:
from azureml.widgets import RunDetails
# Render the run-details widget for the AutoML run.
RunDetails(aml_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [25]:
# Fetch the best run of the AutoML experiment together with its fitted model.
# NOTE(review): this rebinds `run`, which the first cell assigned from
# exp2.start_logging(); from here on `run` refers to the best AutoML run.
run,model=aml_run.get_output() #get best run and model
print(run)

Run(Experiment: automl,
Id: AutoML_87c610c3-3498-4e8b-abb4-0d38c5f6dc85_82,
Type: azureml.scriptrun,
Status: Completed)


In [41]:
# Print the fitted pipeline returned by get_output() to inspect its steps.
print(model)

Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('prefittedsoftvotingclassifier',...
                                                                                               max_leaves=0,
                                                                                               min_child_weight=1,
                                                                                               missing=nan,
                                                            

In [29]:
# Collect the metrics recorded on the AutoML parent run (a dict keyed by
# metric name) and dump them as-is.
metrics=aml_run.get_metrics()
print(metrics)

{'experiment_status': ['DatasetEvaluation', 'FeaturesGeneration', 'DatasetFeaturization', 'DatasetFeaturizationCompleted', 'DatasetBalancing', 'DatasetCrossValidationSplit', 'ModelSelection', 'BestRunExplainModel', 'ModelExplanationDataSetSetup', 'PickSurrogateModel', 'EngineeredFeatureExplanations', 'EngineeredFeatureExplanations', 'RawFeaturesExplanations', 'RawFeaturesExplanations', 'BestRunExplainModel'], 'experiment_status_description': ['Gathering dataset statistics.', 'Generating features for the dataset.', 'Beginning to fit featurizers and featurize the dataset.', 'Completed fit featurizers and featurizing the dataset.', 'Performing class balancing sweeping', 'Generating individually featurized CV splits.', 'Beginning model selection.', 'Best run model explanations started', 'Model explanations data setup completed', 'Choosing LightGBM as the surrogate model for explanations', 'Computation of engineered features started', 'Computation of engineered features completed', 'Computa

In [38]:
# Display the metrics in a more understandable manner: one metric per
# paragraph, name first and value after it.
for metric_name, metric_value in metrics.items():
    print(metric_name, metric_value, '\n')


experiment_status ['DatasetEvaluation', 'FeaturesGeneration', 'DatasetFeaturization', 'DatasetFeaturizationCompleted', 'DatasetBalancing', 'DatasetCrossValidationSplit', 'ModelSelection', 'BestRunExplainModel', 'ModelExplanationDataSetSetup', 'PickSurrogateModel', 'EngineeredFeatureExplanations', 'EngineeredFeatureExplanations', 'RawFeaturesExplanations', 'RawFeaturesExplanations', 'BestRunExplainModel'] 

experiment_status_description ['Gathering dataset statistics.', 'Generating features for the dataset.', 'Beginning to fit featurizers and featurize the dataset.', 'Completed fit featurizers and featurizing the dataset.', 'Performing class balancing sweeping', 'Generating individually featurized CV splits.', 'Beginning model selection.', 'Best run model explanations started', 'Model explanations data setup completed', 'Choosing LightGBM as the surrogate model for explanations', 'Computation of engineered features started', 'Computation of engineered features completed', 'Computation o

In [42]:
# Register the best run's output as a workspace model named
# 'automl_best_model.pkl'; the resulting Model object is shown as the cell result.
# NOTE(review): model_path points at the whole './outputs/' folder rather than
# at 'outputs/model.pkl' (the file downloaded in the next cell) -- confirm that
# registering the entire folder is intended.
run.register_model(model_name='automl_best_model.pkl', model_path='./outputs/') #register it

Model(workspace=Workspace.create(name='quick-starts-ws-127521', subscription_id='8bb47da5-84b5-43cf-bd4a-97928e5c9b08', resource_group='aml-quickstarts-127521'), name=automl_best_model.pkl, id=automl_best_model.pkl:1, version=1, tags={}, properties={})

In [43]:
# Download the best run's 'outputs/model.pkl' file to a local file called
# 'best_model.pkl'.
run.download_file(
    name='outputs/model.pkl',
    output_file_path='best_model.pkl',
)

In [36]:
# Delete the compute cluster now that the experiment is done.
# NOTE(review): this cell's execution count (In [36]) is lower than that of the
# cells above it, so it was last executed before them; on a top-to-bottom
# re-run it executes last, as positioned here.
compute_target.delete()