# Automated ML

In [1]:
from azureml.core import Workspace, Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.widgets import RunDetails
from azureml.data.dataset_factory import TabularDatasetFactory
from sklearn.model_selection import train_test_split
import pandas as pd
from azureml.core.dataset import Dataset
from azureml.train.automl.utilities import get_primary_metrics
from azureml.train.automl import AutoMLConfig
from azureml.core.model import InferenceConfig
from azureml.core.webservice import AciWebservice
from azureml.core.webservice import Webservice
from azureml.core.model import Model
from azureml.core.environment import Environment
from azureml.automl.core.shared import constants
import joblib

## Dataset

### Overview

The dataset used for this project is the [Heart Failure Prediction](https://www.kaggle.com/andrewmvd/heart-failure-clinical-data) dataset taken from Kaggle.

This dataset contains 12 features that can be used to predict mortality by heart failure:

- age: Age of the patient
- amaemia: Decrease of red blood cells or hemoglobin
- creatinine_phosphokinase: Level of the CPK enzyme in the blood (mcg/L)
- diabetes: If the patient has diabetes
- ejection_fraction: Percentage of blood leaving the heart at each contraction
- high_blood_pressure: If the patient has hypertension
- platelets: Platelets in the blood (kiloplatelets/mL)
- serum_creatinine: Level of serum creatinine in the blood (mg/dL)
- serum_sodium: Level of serum sodium in the blood (mEq/L)
- sex: Woman or man
- smoking: If the patient smokes or not
- time: Follow-up period (days)

The target column is DEATH_EVENT which tells if the patient deceased during the follow-up period. The task performed in this project is to predict whether or not a death event occurs.

In [2]:
## Creating a new Experiment
ws = Workspace.from_config()
experiment_name = 'heart-failure-automl'

experiment=Experiment(ws, experiment_name)

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = experiment.start_logging()

Workspace name: quick-starts-ws-177051
Azure region: southcentralus
Subscription id: 81cefad3-d2c9-4f77-a466-99a7f541c7bb
Resource group: aml-quickstarts-177051


In [3]:
#Checking and printing existing compute targets
compute_targets= ws.compute_targets
for name, ct in compute_targets.items():
    print(name, ct.type, ct.provisioning_state)

notebook177051 ComputeInstance Succeeded
heart-compute AmlCompute Succeeded
cpu-cluster AmlCompute Succeeded
my-compute AmlCompute Succeeded
cpu-compute AmlCompute Succeeded


In [4]:
#Create compute cluster
compute_cluster_name= "cpu-compute"

#Check if compute cluster already exists
try:
    compute_cluster=ComputeTarget(workspace=ws, name=compute_cluster_name)
    print("Found existing cluster, use it...")
except ComputeTargetException:
    print("Creating new cluster...")
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',max_nodes=5)
    compute_cluster = ComputeTarget.create(ws, compute_cluster_name, compute_config)
    
compute_cluster.wait_for_completion(show_output=True)

Found existing cluster, use it...
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [5]:
#Get data
from azureml.data.dataset_factory import TabularDatasetFactory



path_to_data= "https://raw.githubusercontent.com/neha7598/azure-ml-capstone/main/data/heart_failure_clinical_records_dataset.csv"
data=TabularDatasetFactory.from_delimited_files(path=path_to_data)

In [6]:
data = data.to_pandas_dataframe()
x=data.drop('DEATH_EVENT',axis=1)
y=data['DEATH_EVENT']

##split into train and test datasets
x_train, x_test, y_train, y_test= train_test_split(x, y, test_size=0.20)

#concatenate to form train and test datasets 
train_df=pd.concat([x_train, y_train], axis=1)
test_df=pd.concat([x_test, y_test], axis=1)

train_df

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
89,57.0,1,115,0,25,1,181000.00,1.1,144,1,0,79,0
60,45.0,0,7702,1,25,1,390000.00,1.0,139,1,0,60,1
52,60.0,0,3964,1,62,0,263358.03,6.8,146,0,0,43,1
5,90.0,1,47,0,40,1,204000.00,2.1,132,1,1,8,1
75,60.0,1,47,0,20,0,204000.00,0.7,139,1,1,73,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
189,40.0,0,244,0,45,1,275000.00,0.9,140,0,0,174,0
152,50.0,0,115,0,45,1,184000.00,0.9,134,1,1,118,0
146,52.0,0,132,0,30,0,218000.00,0.7,136,1,1,112,0
270,44.0,0,582,1,30,1,263358.03,1.6,130,1,1,244,0


In [7]:
#save pandas dataframe as .csv and upload to datastore
if not os.path.isdir('data'):
    os.mkdir('data')
pd.DataFrame(train_df).to_csv("data/train_data.csv", index=False)
pd.DataFrame(test_df).to_csv("data/test_data.csv", index=False)

ds = ws.get_default_datastore()
ds.upload(src_dir='./data', target_path='heart-failure', overwrite=True, show_progress=True)

"Datastore.upload" is deprecated after version 1.0.69. Please use "Dataset.File.upload_directory" to upload your files             from a local directory and create FileDataset in single method call. See Dataset API change notice at https://aka.ms/dataset-deprecation.


Uploading an estimated of 2 files
Uploading ./data/test_data.csv
Uploaded ./data/test_data.csv, 1 files out of an estimated total of 2
Uploading ./data/train_data.csv
Uploaded ./data/train_data.csv, 2 files out of an estimated total of 2
Uploaded 2 files


$AZUREML_DATAREFERENCE_f78d3b1998744cd3a3d208718acf191a

In [8]:
#Load dataset as TabularDataset
train_data = Dataset.Tabular.from_delimited_files(path=ds.path('heart-failure/train_data.csv'))

In [9]:
get_primary_metrics("classification")

['average_precision_score_weighted',
 'AUC_weighted',
 'precision_score_weighted',
 'accuracy',
 'norm_macro_recall']

## AutoML Configuration
The AutoML Config class is a way of leveraging the AutoML SDK to automate machine learning. 

The AutoML settings file is a dictionary that specifies all the parameters controlling the experiment like experiment_timeout_minutes, whether or not to enable early stoppong, number of cross validations, the primary metric, etc.

In the AutoML Config, we specify the task, the training data to be used, the target column name, the compute target and we pass the automl settings dictionary.

In [10]:
# automl settings 
automl_settings = {
    "enable_early_stopping" : True,
    "experiment_timeout_minutes": 30,
    "n_cross_validations": 4,
    "featurization": 'auto',
    "primary_metric": 'accuracy',
    "verbosity": logging.INFO
}

# automl config 
automl_config = AutoMLConfig(
    task='classification',
    debug_log = 'automl_errors.log',
    training_data=train_data,
    label_column_name='DEATH_EVENT',
    compute_target=compute_cluster,
    **automl_settings
)

In [11]:
# Submit your experiment
remote_run = experiment.submit(automl_config)

Submitting remote run.


Experiment,Id,Type,Status,Details Page,Docs Page
heart-failure-automl,AutoML_7e0b1c81-b6af-42a9-ad51-0b9083296104,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation


## Run Details

In [12]:
from azureml.widgets import RunDetails
RunDetails(remote_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [13]:
remote_run.wait_for_completion()

{'runId': 'AutoML_7e0b1c81-b6af-42a9-ad51-0b9083296104',
 'target': 'cpu-compute',
 'status': 'Completed',
 'startTimeUtc': '2022-01-20T15:50:30.798559Z',
 'endTimeUtc': '2022-01-20T16:17:13.680801Z',
 'services': {},
   'message': 'No scores improved over last 20 iterations, so experiment stopped early. This early stopping behavior can be disabled by setting enable_early_stopping = False in AutoMLConfig for notebook/python SDK runs.'}],
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '4',
  'target': 'cpu-compute',
  'DataPrepJsonString': '{\\"training_data\\": {\\"datasetId\\": \\"d6aba19a-0103-4669-980e-c31d9f8df225\\"}, \\"datasets\\": 0}',
  'EnableSubsampling': None,
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'classification',
  'dependencies_versions': '{"azureml-widgets":

## Best Model

In [14]:
#get best model and print all the metrics
best_run, fitted_model = remote_run.get_output()

best_run_metrics = best_run.get_metrics() 
for metric_name in best_run_metrics:
    metric = best_run_metrics[metric_name]
    print(metric_name, metric)

Package:azureml-automl-runtime, training version:1.37.0, current version:1.36.0
Package:azureml-core, training version:1.37.0, current version:1.36.0.post2
Package:azureml-dataprep, training version:2.25.0, current version:2.24.4
Package:azureml-dataprep-rslex, training version:2.1.0, current version:2.0.3
Package:azureml-dataset-runtime, training version:1.37.0, current version:1.36.0
Package:azureml-defaults, training version:1.37.0, current version:1.36.0
Package:azureml-interpret, training version:1.37.0, current version:1.36.0
Package:azureml-mlflow, training version:1.37.0, current version:1.36.0
Package:azureml-pipeline-core, training version:1.37.0, current version:1.36.0
Package:azureml-responsibleai, training version:1.37.0, current version:1.36.0
Package:azureml-telemetry, training version:1.37.0, current version:1.36.0
Package:azureml-train-automl-client, training version:1.37.0, current version:1.36.0
Package:azureml-train-automl-runtime, training version:1.37.0, current v

f1_score_macro 0.8549193614298632
average_precision_score_micro 0.9158021966305572
recall_score_weighted 0.8745056497175141
AUC_weighted 0.9190572927529449
recall_score_macro 0.8570747554443208
balanced_accuracy 0.8570747554443208
AUC_macro 0.9190572927529449
AUC_micro 0.9153775455648121
f1_score_weighted 0.8736038862148576
precision_score_weighted 0.88281444928292
norm_macro_recall 0.7141495108886413
average_precision_score_macro 0.9046786856327598
weighted_accuracy 0.88646223100367
precision_score_micro 0.8745056497175141
recall_score_micro 0.8745056497175141
precision_score_macro 0.8633922911986992
average_precision_score_weighted 0.9241216781821676
accuracy 0.8745056497175141
matthews_correlation 0.7200773823638164
f1_score_micro 0.8745056497175141
log_loss 0.39710782591714905
accuracy_table aml://artifactId/ExperimentRun/dcid.AutoML_7e0b1c81-b6af-42a9-ad51-0b9083296104_36/accuracy_table
confusion_matrix aml://artifactId/ExperimentRun/dcid.AutoML_7e0b1c81-b6af-42a9-ad51-0b908329610

In [15]:
# Details of best model as well as the parameters of the best run
fitted_model

Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=False, enable_feature_sweeping=True, feature_sweeping_config={}, feature_sweeping_timeout=86400, featurization_config=None, force_text_dnn=False, is_cross_validation=True, is_onnx_compatible=False, observer=None, task='classification', working_dir='/mnt/batch/tasks/shared/LS_root/mount...
), random_state=0, reg_alpha=0.7291666666666667, reg_lambda=2.3958333333333335, subsample=0.8, tree_method='auto'))], verbose=False)), ('28', Pipeline(memory=None, steps=[('sparsenormalizer', Normalizer(copy=True, norm='l1')), ('xgboostclassifier', XGBoostClassifier(booster='gbtree', colsample_bytree=1, eta=0.4, gamma=0.1, max_depth=10, max_leaves=127, n_estimators=50, n_jobs=1, objective='reg:logistic', problem_info=ProblemInfo(
    gpu_training_param_dict={'processing_unit_type': 'cpu'}
), random_state=0, reg_alpha=0.8333333333333334, reg_lambda=1.1458333333333335, subsample=0.8, tree_method='auto')

In [16]:
best_run

Experiment,Id,Type,Status,Details Page,Docs Page
heart-failure-automl,AutoML_7e0b1c81-b6af-42a9-ad51-0b9083296104_36,azureml.scriptrun,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [17]:
#Save the best model
best_run.register_model(model_name = 'automl_best_model.pkl', model_path = './outputs/')
joblib.dump(fitted_model, filename= "outputs/automl_model.pkl")

['outputs/automl_model.pkl']

## Model Deployment

In [18]:
# Download score.py and env file
best_run.download_file('outputs/scoring_file_v_1_0_0.py', 'inference/score.py')
best_run.download_file(constants.CONDA_ENV_FILE_PATH,'automl_env.yml')

In [19]:
# Register the model
model_name = best_run.properties['model_name']
description = 'AutoML Model trained on heart failure data to predict if death event occurs or not'
tags = None
model = remote_run.register_model(model_name = model_name, description = description, tags = tags)

print(remote_run.model_id) # This will be written to the script file later in the notebook.

AutoML7e0b1c81b36


In [20]:
# Create inference config
script_file_name= 'inference/score.py'
inference_config = InferenceConfig(entry_script=script_file_name)

aciconfig = AciWebservice.deploy_configuration(cpu_cores = 1, 
                                               memory_gb = 1, 
                                               tags = {'area': "hfData", 'type': "automl_classification"}, 
                                               description = 'Heart Failure Prediction')

aci_service_name = 'automl-heart-failure'
print(aci_service_name)
aci_service = Model.deploy(ws, aci_service_name, [model], inference_config, aciconfig)
aci_service.wait_for_deployment(True)
print(aci_service.state)

automl-heart-failure
Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2022-01-20 16:21:38+00:00 Creating Container Registry if not exists.
2022-01-20 16:21:39+00:00 Use the existing image.
2022-01-20 16:21:39+00:00 Generating deployment configuration.
2022-01-20 16:21:40+00:00 Submitting deployment to compute.
2022-01-20 16:21:43+00:00 Checking the status of deployment automl-heart-failure..
2022-01-20 16:24:13+00:00 Checking the status of inference endpoint automl-heart-failure.
Succeeded
ACI service creation operation finished, operation "Succeeded"
Healthy


In [21]:
# Enable Application Insights
aci_service.update(enable_app_insights=True)

In [22]:
print("State "+ aci_service.state)
print("Swagger URI " + aci_service.swagger_uri)
print("Scoring URI " + aci_service.scoring_uri)

State Healthy
Swagger URI http://ce2d5345-0dac-48dc-adb5-ac39fd5a20b8.southcentralus.azurecontainer.io/swagger.json
Scoring URI http://ce2d5345-0dac-48dc-adb5-ac39fd5a20b8.southcentralus.azurecontainer.io/score


In the cell below, a request is sent to the web service deployed to test it.

In [23]:
import requests
import json

# URL for the web service, should be similar to:
# 'http://8530a665-66f3-49c8-a953-b82a2d312917.eastus.azurecontainer.io/score'
scoring_uri = aci_service.scoring_uri
# If the service is authenticated, set the key or token

# Two sets of data to score, so we get two results back
data = {"data":
        [
          {
            "age": 70.0,
            "anaemia": 1,
            "creatinine_phosphokinase": 4020,
            "diabetes": 1,
            "ejection_fraction": 32,
            "high_blood_pressure": 1,
            "platelets": 234558.23,
            "serum_creatinine": 1.4,
            "serum_sodium": 125,
            "sex": 0,
            "smoking": 1,
            "time": 12
          },
          {
            "age": 75.0,
            "anaemia": 0,
            "creatinine_phosphokinase": 4221,
            "diabetes": 0,
            "ejection_fraction": 22,
            "high_blood_pressure": 0,
            "platelets": 404567.23,
            "serum_creatinine": 1.1,
            "serum_sodium": 115,
            "sex": 1,
            "smoking": 0,
            "time": 7
          },
      ]
    }
# Convert to JSON string
input_data = json.dumps(data)
with open("data.json", "w") as _f:
    _f.write(input_data)

# Set the content type
headers = {'Content-Type': 'application/json'}
# If authentication is enabled, set the authorization header

# Make the request and display the response
resp = requests.post(scoring_uri, input_data, headers=headers)
print(resp.json())

{"result": [1, 1]}


In [24]:
#Print the logs of the deployed service
aci_service.get_logs()

'2022-01-20T16:24:05,852488200+00:00 - gunicorn/run \nDynamic Python package installation is disabled.\nStarting HTTP server\n2022-01-20T16:24:05,863362400+00:00 - iot-server/run \n2022-01-20T16:24:05,883075300+00:00 - rsyslog/run \n2022-01-20T16:24:05,908334300+00:00 - nginx/run \nrsyslogd: /azureml-envs/azureml_248925b58f9a398a4b0c980c8749f7a2/lib/libuuid.so.1: no version information available (required by rsyslogd)\nEdgeHubConnectionString and IOTEDGE_IOTHUBHOSTNAME are not set. Exiting...\n2022-01-20T16:24:06,561624600+00:00 - iot-server/finish 1 0\n2022-01-20T16:24:06,566801100+00:00 - Exit code 1 is normal. Not restarting iot-server.\nStarting gunicorn 20.1.0\nListening at: http://127.0.0.1:31311 (71)\nUsing worker: sync\nworker timeout is set to 300\nBooting worker with pid: 99\nSPARK_HOME not set. Skipping PySpark Initialization.\nGenerating new fontManager, this may take some time...\nInitializing logger\n2022-01-20 16:24:09,323 | root | INFO | Starting up app insights client\

In [30]:
#Delete the deployed service
aci_service.delete()