# Automated ML

## Azure ML and pipeline SDK-specific imports

In [1]:
import logging
import os
import csv
import pkg_resources
import json
import requests

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt

import sklearn
from sklearn import datasets
from sklearn.metrics import confusion_matrix

import azureml.core
from azureml.core import Model
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException
from azureml.core.dataset import Dataset
from azureml.core.environment import Environment
from azureml.core.experiment import Experiment
from azureml.core.model import InferenceConfig
from azureml.core.resource_configuration import ResourceConfiguration
from azureml.core.webservice import AciWebservice
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.widgets import RunDetails

# Print the installed Azure ML core SDK version so the environment used for this run is visible in the output
print("SDK version:", azureml.core.VERSION)

SDK version: 1.27.0


## Initialize workspace

Initialize a workspace object from persisted configuration.

In [2]:
# Load the workspace from the persisted configuration
ws = Workspace.from_config()

# Summarize the workspace, one property per line
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print('\n'.join(workspace_summary))

Workspace name: udacity-ml-capstone-ws
Azure region: eastus
Subscription id: b329467a-d1f8-4c9b-b3dc-95cdc7bff7fa
Resource group: udacity-ml-capstone-rg


## Create an Azure ML experiment

Let's create an experiment named `heart-failure-aml-exp` and define the path of the project folder that is meant to hold the training scripts. The script runs will be recorded under the experiment in Azure.

The best practice is to use a separate folder for each step's scripts and their dependent files, and to specify that folder as the `source_directory` for the step. This helps reduce the size of the snapshot created for the step (only the specific folder is snapshotted). Since changes to any file in the `source_directory` trigger a re-upload of the snapshot, this also allows the step to be reused when nothing in its `source_directory` has changed.

In [3]:
# Name of the run history container in the workspace
experiment_name = 'heart-failure-aml-exp'
# Path of the local project folder
project_folder = './heart-failure-aml-proj'

# Create the experiment handle and display it as the cell output
experiment = Experiment(workspace=ws, name=experiment_name)
experiment

Name,Workspace,Report Page,Docs Page
heart-failure-aml-exp,udacity-ml-capstone-ws,Link to Azure Machine Learning studio,Link to Documentation


### Create or attach an AmlCompute cluster

You will need to create a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) for your AutoML run.

In [4]:
# Name of the CPU cluster
compute_cluster_name = "compute-cluster"

# Reuse the cluster when it already exists in the workspace; otherwise provision a new one
try:
    compute_target = ComputeTarget(workspace=ws, name=compute_cluster_name)
except ComputeTargetException:
    # Optional: vm_priority='lowpriority' can be added to the configuration
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',  # for GPU, use "STANDARD_NC6"
        min_nodes=0,
        max_nodes=5,
    )
    compute_target = ComputeTarget.create(ws, compute_cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

compute_target.wait_for_completion(show_output=True)

# For a more detailed view of the current AmlCompute status, use get_status()
cluster_status = compute_target.get_status()
print(cluster_status.serialize())

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
{'currentNodeCount': 1, 'targetNodeCount': 1, 'nodeStateCounts': {'preparingNodeCount': 1, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2021-05-11T19:45:30.629000+00:00', 'errors': None, 'creationTime': '2021-05-11T12:45:11.596800+00:00', 'modifiedTime': '2021-05-11T12:45:57.018265+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 5, 'nodeIdleTimeBeforeScaleDown': 'PT1800S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_DS2_V2'}


## Dataset

### Overview

The project uses the [heart failure clinical records dataset](https://archive.ics.uci.edu/ml/machine-learning-databases/00519/heart_failure_clinical_records_dataset.csv). This dataset contains the medical records of 299 patients who had heart failure, collected during their follow-up period, where each patient profile has 13 clinical features.

|                           |                                                                        |
| ------------------------- | ---------------------------------------------------------------------- |
| Dataset characteristics   | Multivariate                                                           |
| Number of instances       | 299                                                                    |
| Area                      | Life                                                                   |
| Attribute characteristics | Integer, Real                                                          |
| Number of attributes      | 13                                                                     |
| Associated tasks          | Classification, Regression, Clustering                                 |
| Missing values?           | N/A                                                                    |
| Source                    | https://archive.ics.uci.edu/ml/datasets/Heart+failure+clinical+records |

A detailed description of the dataset can be found in the dataset section of the ["Machine learning can predict survival of patients with heart failure from serum creatinine and ejection fraction alone"](https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-020-1023-5) paper.

### Attribute Information

Thirteen clinical features:

| Name                           | Description                                               | Unit             |
| ------------------------------ | --------------------------------------------------------- | ---------------- |
| Age                            | Age of the patient                                        | Years            |
| Anaemia                        | Decrease of red blood cells or hemoglobin                 | Boolean          |
| High blood pressure            | If the patient has hypertension                           | Boolean          |
| Creatinine phosphokinase (CPK) | Level of the CPK enzyme in the blood                      | mcg/L            |
| Diabetes                       | If the patient has diabetes                               | Boolean          |
| Ejection fraction              | Percentage of blood leaving the heart at each contraction | Percentage       |
| Platelets                      | Platelets in the blood                                    | kiloplatelets/mL |
| Sex                            | Woman or man                                              | Binary           |
| Serum creatinine               | Level of serum creatinine in the blood                    | mg/dL            |
| Serum sodium                   | Level of serum sodium in the blood                        | mEq/L            |
| Smoking                        | If the patient smokes or not                              | Boolean          |
| Time                           | Follow-up period                                          | Days             |
| Death event (target)           | If the patient deceased during the follow-up period       | Boolean          |

### Task

This project tries to predict the binary class for the target/label `DEATH_EVENT` using supervised machine learning. `DEATH_EVENT` is the mortality caused by heart failure. All the features in the dataset are used.

In [5]:
# Reuse the dataset registered in the workspace when it exists;
# otherwise create it from the CSV file and register it
key = "heart-failure-ds"
description_text = "Heart failure clinical records dataset from the UCI repository (https://archive.ics.uci.edu/ml/datasets/Heart+failure+clinical+records)"
data_path = 'https://raw.githubusercontent.com/thom/azure-ml-engineer-capstone/main/data/heart_failure_clinical_records_dataset.csv'

found = key in ws.datasets
if found:
    dataset = ws.datasets[key]
else:
    # Create the tabular dataset and register it in the workspace in one step
    dataset = Dataset.Tabular.from_delimited_files(data_path).register(
        workspace=ws,
        name=key,
        description=description_text,
    )

# Load the data into pandas, print the column overview and display summary statistics
df = dataset.to_pandas_dataframe()
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 13 columns):
age                         299 non-null float64
anaemia                     299 non-null int64
creatinine_phosphokinase    299 non-null int64
diabetes                    299 non-null int64
ejection_fraction           299 non-null int64
high_blood_pressure         299 non-null int64
platelets                   299 non-null float64
serum_creatinine            299 non-null float64
serum_sodium                299 non-null int64
sex                         299 non-null int64
smoking                     299 non-null int64
time                        299 non-null int64
DEATH_EVENT                 299 non-null int64
dtypes: float64(3), int64(10)
memory usage: 30.5 KB


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
count,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0
mean,60.833893,0.431438,581.839465,0.41806,38.083612,0.351171,263358.029264,1.39388,136.625418,0.648829,0.32107,130.26087,0.32107
std,11.894809,0.496107,970.287881,0.494067,11.834841,0.478136,97804.236869,1.03451,4.412477,0.478136,0.46767,77.614208,0.46767
min,40.0,0.0,23.0,0.0,14.0,0.0,25100.0,0.5,113.0,0.0,0.0,4.0,0.0
25%,51.0,0.0,116.5,0.0,30.0,0.0,212500.0,0.9,134.0,0.0,0.0,73.0,0.0
50%,60.0,0.0,250.0,0.0,38.0,0.0,262000.0,1.1,137.0,1.0,0.0,115.0,0.0
75%,70.0,1.0,582.0,1.0,45.0,1.0,303500.0,1.4,140.0,1.0,1.0,203.0,1.0
max,95.0,1.0,7861.0,1.0,80.0,1.0,850000.0,9.4,148.0,1.0,1.0,285.0,1.0


## AutoML Configuration

Overview of the AutoML settings and configuration used for this experiment:

| Property                     | Value               | Description                                                                                                                      |
| ---------------------------- | ------------------- | -------------------------------------------------------------------------------------------------------------------------------- |
| `experiment_timeout_minutes` | `60`                | Maximum amount of time in minutes that all iterations combined can take before the experiment terminates                         |
| `max_concurrent_iterations`  | `5`                 | Maximum number of iterations executed in parallel; should be less than or equal to the maximum number of nodes in the cluster    |
| `n_cross_validations`        | `5`                 | How many cross validations to perform when user validation data is not specified                                                 |
| `primary_metric`             | `accuracy`          | The metric that Automated Machine Learning will optimize for model selection                                                     |
| `compute_target`             | `compute_cluster`   | The Azure Machine Learning compute target to run the Automated Machine Learning experiment on                                    |
| `task`                       | `classification`    | The type of task to run                                                                                                          |
| `training_data`              | `dataset`           | The training data to be used within the experiment, it contains both training features and a label column                        |
| `label_column_name`          | `DEATH_EVENT`       | The name of the label column                                                                                                     |
| `path`                       | `project_folder`    | The full path to the Azure Machine Learning project folder                                                                       |
| `enable_early_stopping`      | `True`              | Whether to enable early termination if the score is not improving in the short term                                              |
| `featurization`              | `auto`              | Indicator for whether featurization step should be done automatically or not, or whether customized featurization should be used |
| `debug_log`                  | `automl_errors.log` | The log file to write debug information to                                                                                       |

In [6]:
# AutoML settings: timeout, concurrency, cross-validation folds and the metric to optimize
automl_settings = dict(
    experiment_timeout_minutes=60,
    max_concurrent_iterations=5,
    n_cross_validations=5,
    primary_metric='accuracy',
)

# AutoML configuration: classification of the DEATH_EVENT label on the dataset
automl_config = AutoMLConfig(
    task="classification",
    training_data=dataset,
    label_column_name="DEATH_EVENT",
    compute_target=compute_cluster_name,
    path=project_folder,
    featurization='auto',
    enable_early_stopping=True,
    debug_log='automl_errors.log',
    **automl_settings
)

In [7]:
# Submit the AutoML configuration to the experiment and keep the returned run handle
remote_run = experiment.submit(automl_config)

Submitting remote run.


Experiment,Id,Type,Status,Details Page,Docs Page
heart-failure-aml-exp,AutoML_aee3095f-aebf-486a-863a-869674d7d68d,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation


## Run Details

The `VotingEnsemble` model provides the best performance.

A voting ensemble (or a “majority voting ensemble”) is an ensemble machine learning model that combines the predictions from multiple other models. The predictions for each label are summed and the label with the majority vote is predicted, i.e. predictions are the majority vote of contributing models. See [How to Develop Voting Ensembles With Python](https://machinelearningmastery.com/voting-ensembles-with-python/) for more information.

Use the `RunDetails` widget to show the different experiments.

In [8]:
# Show the run in the RunDetails widget
run_widget = RunDetails(remote_run)
run_widget.show()

# Wait for the run to complete while printing its output, then display the run
remote_run.wait_for_completion(show_output=True)
remote_run

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

Experiment,Id,Type,Status,Details Page,Docs Page
heart-failure-aml-exp,AutoML_aee3095f-aebf-486a-863a-869674d7d68d,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: FeaturesGeneration. Generating features for the dataset.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       PASSED
DESCRIPTION:  No feature missing values were detected in the training data.
              Learn more about missing value imputation: https://aka.ms/AutomatedMLFeaturization

****************************************************************************************************

TYPE:         High cardinality feature detection
STATUS

Experiment,Id,Type,Status,Details Page,Docs Page
heart-failure-aml-exp,AutoML_aee3095f-aebf-486a-863a-869674d7d68d,automl,Completed,Link to Azure Machine Learning studio,Link to Documentation


## Best Model

The cell below retrieves the best run and its fitted model from the AutoML experiment and displays the metrics of the run.

In [9]:
# Retrieve the best run and its fitted model, then display the metrics of the AutoML run
best_run, fitted_model = remote_run.get_output()
remote_run.get_metrics()

{'experiment_status': ['DatasetEvaluation',
  'FeaturesGeneration',
  'DatasetFeaturization',
  'DatasetFeaturizationCompleted',
  'DatasetCrossValidationSplit',
  'ModelSelection'],
 'experiment_status_description': ['Gathering dataset statistics.',
  'Generating features for the dataset.',
  'Beginning to fit featurizers and featurize the dataset.',
  'Completed fit featurizers and featurizing the dataset.',
  'Generating individually featurized CV splits.',
  'Beginning model selection.'],
 'average_precision_score_macro': 0.8965977031743249,
 'precision_score_micro': 0.8763276836158193,
 'balanced_accuracy': 0.8416666666666668,
 'accuracy': 0.8763276836158193,
 'average_precision_score_micro': 0.916008586872039,
 'average_precision_score_weighted': 0.9207681681542855,
 'weighted_accuracy': 0.8997121064304476,
 'recall_score_weighted': 0.8763276836158193,
 'AUC_micro': 0.9142065977209614,
 'matthews_correlation': 0.7269575497896184,
 'recall_score_macro': 0.8416666666666668,
 'norm_

In [10]:
# Print a summary of the best run
print(best_run)

Run(Experiment: heart-failure-aml-exp,
Id: AutoML_aee3095f-aebf-486a-863a-869674d7d68d_50,
Type: azureml.scriptrun,
Status: Completed)


In [11]:
# Print the fitted model returned for the best run
print(fitted_model)

Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('prefittedsoftvotingclassifier',...
                                                                                               reg_lambda=1.7708333333333335,
                                                                                               scale_pos_weight=1,
                                                                                               seed=None,
                                             

## Test the model

### Load test data

The test data must go through the same preparation steps as the training data; otherwise, prediction may fail at the preprocessing step.

In [12]:
# NOTE(review): `data_path` is the same file the training dataset is created from, so this
# looks like an evaluation on the training data rather than on a held-out set - confirm.
dataset_test = Dataset.Tabular.from_delimited_files(path=data_path)
df_test = dataset_test.to_pandas_dataframe()

# Keep only the rows that have a label
df_test = df_test[df_test['DEATH_EVENT'].notnull()]

# Separate the label column from the feature columns
y_test = df_test['DEATH_EVENT']
X_test = df_test.drop(columns=['DEATH_EVENT'])

### Testing our best fitted model

We will use a confusion matrix to see how well our model performs.

In [13]:
# Predict the labels for the test features and compare them with the true labels
ypred = fitted_model.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=ypred)

In [14]:
# Visualize the confusion matrix as a table shaded with the 'Blues' colormap
# (per scikit-learn's `confusion_matrix`: rows are the true classes, columns the predicted classes)
pd.DataFrame(cm).style.background_gradient(cmap='Blues', low=0, high=0.9)

Unnamed: 0,0,1
0,197,6
1,20,76


## Model Deployment

As the AutoML model performed better than the HyperDrive model, we will register this model, create an inference config and deploy it as a web service.

### Save and register model

In [15]:
# Local paths for the model, scoring script and conda environment of the best run
inference_folder_name = 'inference'
automl_model = os.path.join(inference_folder_name, 'model.pkl')
score_script = os.path.join(inference_folder_name, 'score.py')
conda_env = os.path.join(inference_folder_name, 'conda_env.yml')

# Download each output file of the best run to its local path
for run_file, local_path in (
    ('outputs/model.pkl', automl_model),
    ('outputs/scoring_file_v_1_0_0.py', score_script),
    ('outputs/conda_env_v_1_0_0.yml', conda_env),
):
    best_run.download_file(run_file, local_path)

In [16]:
# Register the downloaded model file in the workspace, using the model name stored
# in the properties of the best run
model = Model.register(
    workspace=ws,
    model_path=automl_model,
    model_name=best_run.properties['model_name'],
    description='Auto ML model predicting deaths caused by heart failure',
    model_framework=Model.Framework.SCIKITLEARN,
    model_framework_version=sklearn.__version__,
)

# Report the name and version of the registered model
print('Name:', model.name)
print('Version:', model.version)

Registering model AutoMLaee3095fa50
Name: AutoMLaee3095fa50
Version: 1


In [17]:
# Build the inference configuration from the downloaded conda specification and scoring script
env = Environment.from_conda_specification(name="env", file_path=conda_env)
inference_conf = InferenceConfig(entry_script=score_script, environment=env)

# Read the environment file, then display its contents
with open(conda_env, 'r') as conda_file:
    env_file = conda_file.read()
print(env_file)

# Conda environment specification. The dependencies defined in this file will
# be automatically provisioned for runs with userManagedDependencies=False.

# Details about the Conda environment file format:
# https://conda.io/docs/user-guide/tasks/manage-environments.html#create-env-file-manually

name: project_environment
dependencies:
  # The python interpreter version.
  # Currently Azure ML only supports 3.5.2 and later.
- python=3.6.2

- pip:
  - azureml-train-automl-runtime==1.27.0.post1
  - inference-schema
  - azureml-interpret==1.27.0
  - azureml-defaults==1.27.0
- numpy>=1.16.0,<1.19.0
- pandas==0.25.1
- scikit-learn==0.22.1
- py-xgboost<=0.90
- fbprophet==0.5
- holidays==0.9.11
- psutil>=5.2.2,<6.0.0
channels:
- anaconda
- conda-forge



### Deploy webservice

In [25]:
# Name of the web service
service_name = "aci-heart-failure-web"

# Deployment configuration for Azure Container Instance (ACI)
deployment_conf = AciWebservice.deploy_configuration(
    cpu_cores=1,
    memory_gb=1,
    enable_app_insights=True,
    description='Predicting deaths caused by heart failure',
)

# Deploy the registered model as a web service; overwrite=True is passed for the service name
service = Model.deploy(
    workspace=ws,
    name=service_name,
    models=[model],
    inference_config=inference_conf,
    deployment_config=deployment_conf,
    overwrite=True,
)

# Wait for the deployment to finish while printing its progress, then report the service state
service.wait_for_deployment(show_output=True)

print(service.state)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2021-05-11 20:40:56+00:00 Creating Container Registry if not exists.
2021-05-11 20:40:57+00:00 Registering the environment.
2021-05-11 20:40:57+00:00 Use the existing image.
2021-05-11 20:40:57+00:00 Generating deployment configuration.
2021-05-11 20:40:58+00:00 Submitting deployment to compute..
2021-05-11 20:41:31+00:00 Checking the status of deployment aci-heart-failure-web..
2021-05-11 20:44:57+00:00 Checking the status of inference endpoint aci-heart-failure-web.
Succeeded
ACI service creation operation finished, operation "Succeeded"
Healthy


### Consume webservice

In [29]:

# `json` and `requests` are imported in the first cell of the notebook,
# so they are not imported again here.

# Sample request payload: four records with 12 feature values each
# (presumably in the column order of the dataset, without the DEATH_EVENT label - confirm)
test_data = json.dumps({
    "data": [
        [55, 0, 1820, 0, 38, 0, 270000, 1.2, 139, 0, 0, 271],
        [55, 0, 1199, 0, 20, 0, 263358.03, 1.83,134, 1, 1, 241],
        [65, 1, 258, 1, 25, 0, 198000, 1.4, 129, 1, 0, 235],
        [50, 0, 196, 0, 45, 0, 395000, 1.6, 136, 1, 1, 285]
    ]})

# Send the records to the scoring endpoint; the timeout keeps the cell from
# hanging forever when the endpoint does not answer
response = requests.post(
    service.scoring_uri,
    data=test_data,
    headers={'Content-Type':'application/json'},
    timeout=60)

# Raise the HTTP error (4xx/5xx) instead of trying to parse an error body as JSON
response.raise_for_status()

print("Results:", response.json())

Results: {"result": [0, 1, 0, 0]}


### Print the logs of the webservice

In [28]:
# Print the logs of the deployed web service
print(service.get_logs())

2021-05-11T20:44:48,759135100+00:00 - iot-server/run 
2021-05-11T20:44:48,758708900+00:00 - gunicorn/run 
2021-05-11T20:44:48,771593200+00:00 - rsyslog/run 
rsyslogd: /azureml-envs/azureml_f8f5ff2f983718fa04a09abf22f98303/lib/libuuid.so.1: no version information available (required by rsyslogd)
2021-05-11T20:44:48,807947600+00:00 - nginx/run 
/usr/sbin/nginx: /azureml-envs/azureml_f8f5ff2f983718fa04a09abf22f98303/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_f8f5ff2f983718fa04a09abf22f98303/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_f8f5ff2f983718fa04a09abf22f98303/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_f8f5ff2f983718fa04a09abf22f98303/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml

### Clean up

In [None]:
# Delete the web service and the model
# NOTE(review): the service is deleted before the model - presumably required because the
# service was deployed from this model; confirm before reordering.
service.delete()
model.delete()