## Automated ML

Import dependencies. 

In [1]:
import os
import pandas as pd
import numpy as np
import json
import requests
import joblib
from sklearn.metrics import confusion_matrix
import itertools

from azureml.core import Dataset, Workspace, Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.widgets import RunDetails
from azureml.train.automl import AutoMLConfig

from azureml.core.model import InferenceConfig
from azureml.core.webservice import AciWebservice, Webservice
from azureml.core.model import Model
from azureml.core.environment import Environment

## Dataset

### Overview

**Dataset:** Davide Chicco, Giuseppe Jurman: "Machine learning can predict survival of patients with heart failure from serum creatinine and ejection fraction alone". BMC Medical Informatics and Decision Making 20, 16 (2020)

Heart failure is a common event caused by CVDs and this dataset contains 12 features that can be used to predict mortality by heart failure.

**12 clinical features:**

- age: age of the patient (years)
- anaemia: decrease of red blood cells or hemoglobin (boolean)
- high blood pressure: if the patient has hypertension (boolean)
- creatinine phosphokinase (CPK): level of the CPK enzyme in the blood (mcg/L)
- diabetes: if the patient has diabetes (boolean)
- ejection fraction: percentage of blood leaving the heart at each contraction (percentage)
- platelets: platelets in the blood (kiloplatelets/mL)
- sex: woman or man (binary)
- serum creatinine: level of serum creatinine in the blood (mg/dL)
- serum sodium: level of serum sodium in the blood (mEq/L)
- smoking: if the patient smokes or not (boolean)
- time: follow-up period (days)

In this project, Azure AutoML will be used to make prediction on the death event based on patient's 12 clinical features. 

## Workspace setup

In [2]:
ws = Workspace.from_config()

experiment_name = 'automl-experiment'

experiment = Experiment(ws, experiment_name)

In [3]:
dataset = Dataset.get_by_name(ws, 'heart-disease-kaggle.csv')

In [4]:
dataset.take(5).to_pandas_dataframe()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [5]:
data_train, data_test = dataset.random_split(0.9)

## Config Compute Cluster

In [6]:
cpu_cluster_name = "cpu-cluster"

# Verify that cluster does not exist already
try:
    compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D12_V2',
                                                           max_nodes=5)
    compute_target = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

compute_target.wait_for_completion(show_output=True)

Creating
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## AutoML Configuration

In [7]:
automl_settings = {
    "experiment_timeout_minutes": 30,
    "max_concurrent_iterations": 4,
    "primary_metric" : 'accuracy',
    "n_cross_validations": 5
}
automl_config = AutoMLConfig(compute_target=compute_target,
                             task = "classification",
                             training_data=data_train,
                             label_column_name="DEATH_EVENT", 
                             enable_early_stopping= True,
                             featurization= 'auto',
                             **automl_settings
                            )

AutoML typically performs cross validation, data balance check, cardinality check in prior to machine learning process with a variety of algorithms. 

In [8]:
remote_run = experiment.submit(automl_config, show_output = True)

Running on remote.
No run_configuration provided, running on cpu-cluster with default configuration
Running on remote compute: cpu-cluster
Parent Run ID: AutoML_b13ca0b7-b493-43eb-8386-a42e61c7da10

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       PASSED
DESCRIPTION:  No feature missing values were detected in the training data.
              Learn more about missing value imputation: 

## Run Details

In [9]:
remote_run

Experiment,Id,Type,Status,Details Page,Docs Page
automl-experiment,AutoML_b13ca0b7-b493-43eb-8386-a42e61c7da10,automl,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [10]:
RunDetails(remote_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [11]:
remote_run.wait_for_completion()

{'runId': 'AutoML_b13ca0b7-b493-43eb-8386-a42e61c7da10',
 'target': 'cpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2021-03-12T14:04:58.440767Z',
 'endTimeUtc': '2021-03-12T14:29:19.64477Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'cpu-cluster',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"5f88bddf-bd8c-43e0-8521-41936ae1af23\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"UI/03-12-2021_014653_UTC/heart-disease-kaggle.csv\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"aml-quickstarts-140282\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"9b72f9e6-56c5-4c16-991b-19c

## Best Model 

In [15]:
best_run, fitted_model = remote_run.get_output()
best_run_metrics = best_run.get_metrics()

Package:azureml-automl-runtime, training version:1.23.0, current version:1.22.0
Package:azureml-core, training version:1.23.0, current version:1.22.0
Package:azureml-dataprep, training version:2.10.1, current version:2.9.1
Package:azureml-dataprep-native, training version:30.0.0, current version:29.0.0
Package:azureml-dataprep-rslex, training version:1.8.0, current version:1.7.0
Package:azureml-dataset-runtime, training version:1.23.0, current version:1.22.0
Package:azureml-defaults, training version:1.23.0, current version:1.22.0
Package:azureml-interpret, training version:1.23.0, current version:1.22.0
Package:azureml-mlflow, training version:1.23.0, current version:1.22.0
Package:azureml-pipeline-core, training version:1.23.0, current version:1.22.0
Package:azureml-telemetry, training version:1.23.0, current version:1.22.0
Package:azureml-train-automl-client, training version:1.23.0, current version:1.22.0
Package:azureml-train-automl-runtime, training version:1.23.0, current versio

AttributeError: /anaconda/envs/azureml_py36/lib/libxgboost.so: undefined symbol: XGBoosterUnserializeFromBuffer

In [14]:
best_run

NameError: name 'best_run' is not defined

In [20]:
fitted_model

Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('prefittedsoftvotingclassifier',...
                                                                                                objective=None,
                                                                                                random_state=None,
                                                                                                reg_alpha=0.42105263157894735,
                                      

In [21]:
print('Best Run Id: ', best_run.id)
print('\n Accuracy:', best_run_metrics['accuracy'])
print(fitted_model._final_estimator)
print(best_run.get_tags())

Best Run Id:  AutoML_0df6a668-9229-4f79-a88d-ebd9d5ce001d_36

 Accuracy: 0.8617749825296995
PreFittedSoftVotingClassifier(classification_labels=None,
                              estimators=[('16',
                                           Pipeline(memory=None,
                                                    steps=[('robustscaler',
                                                            RobustScaler(copy=True,
                                                                         quantile_range=[25,
                                                                                         75],
                                                                         with_centering=True,
                                                                         with_scaling=False)),
                                                           ('extratreesclassifier',
                                                            ExtraTreesClassifier(bootstrap=False,
                  

In [22]:
os.makedirs('./outputs', exist_ok=True)

joblib.dump(fitted_model, filename='outputs/automl.joblib')

model_name = best_run.properties['model_name']
model_name

'AutoML0df6a668936'

In [23]:
env = best_run.get_environment()

script_file = 'score.py'

best_run.download_file('outputs/scoring_file_v_1_0_0.py', script_file)

## Model Deployment

In [20]:
#Register the fitted model
model = remote_run.register_model(model_name = model_name,
                                  description = 'AutoML model')

In [21]:
inference_config = InferenceConfig(entry_script = script_file, environment = env)

aci_config = AciWebservice.deploy_configuration(cpu_cores = 1, memory_gb = 1)

aci_service_name = 'automl-heart-disease'
print(aci_service_name)

automl-heart-disease


In [22]:
service = Model.deploy(ws, aci_service_name, [model], inference_config, aci_config)
service.wait_for_deployment(True)
print("State: " + service.state)
print("Scoring URI: " + service.scoring_uri)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running.............
Succeeded
ACI service creation operation finished, operation "Succeeded"
State: Healthy
Scoring URI: http://af889be7-14d8-48e8-9205-0338fe6afdcb.southcentralus.azurecontainer.io/score


## Testing using 2 different methods

In [23]:
%run endpoint.py

{"result": [1, 1]}


In [35]:
data_test = data_test.to_pandas_dataframe().dropna()
data_sample = data_test.sample(3)
y_true = data_sample.pop('DEATH_EVENT')
sample_json = json.dumps({'data':data_sample.to_dict(orient='records')})
print(sample_json)

{"data": [{"age": 90, "anaemia": 1, "creatinine_phosphokinase": 60, "diabetes": 1, "ejection_fraction": 50, "high_blood_pressure": 0, "platelets": 226000.0, "serum_creatinine": 1.0, "serum_sodium": 134, "sex": 1, "smoking": 0, "time": 30}, {"age": 70, "anaemia": 0, "creatinine_phosphokinase": 618, "diabetes": 0, "ejection_fraction": 35, "high_blood_pressure": 0, "platelets": 327000.0, "serum_creatinine": 1.1, "serum_sodium": 142, "sex": 0, "smoking": 0, "time": 245}, {"age": 60, "anaemia": 1, "creatinine_phosphokinase": 260, "diabetes": 1, "ejection_fraction": 38, "high_blood_pressure": 0, "platelets": 255000.0, "serum_creatinine": 2.2, "serum_sodium": 132, "sex": 0, "smoking": 1, "time": 45}]}


In [36]:
output = service.run(sample_json)
print('Prediction: ', output)
print('True Values: ', y_true.values)

Prediction:  {"result": [1, 0, 1]}
True Values:  [1 0 1]


In [37]:
service.get_logs()

'2020-12-28T08:02:58,638664100+00:00 - nginx/run \n/usr/sbin/nginx: /azureml-envs/azureml_8eff28b157f42edcd2424a5aae6c8074/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)\n/usr/sbin/nginx: /azureml-envs/azureml_8eff28b157f42edcd2424a5aae6c8074/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)\n/usr/sbin/nginx: /azureml-envs/azureml_8eff28b157f42edcd2424a5aae6c8074/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)\n/usr/sbin/nginx: /azureml-envs/azureml_8eff28b157f42edcd2424a5aae6c8074/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)\n/usr/sbin/nginx: /azureml-envs/azureml_8eff28b157f42edcd2424a5aae6c8074/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)\n2020-12-28T08:02:58,647916700+00:00 - rsyslog/run \n2020-12-28T08:02:58,650225800+00:00 - iot-server/run \n2020-12-28T08:02:58,659505900+00:00 - gunicorn/run 

In [38]:
service.delete()