## Model Training with Automated ML
### Create a high quality classification model with Azure ML service through Automated ML

#### <font color='red'> Before you begin: please download the dataset from Kaggle and save it in the "data" folder as "heart.csv". You will need to log in to Kaggle to download the dataset. </font>

#### Setup diagnostics collection

In [1]:
from azureml.telemetry import set_diagnostics_collection

# Opt in to sending SDK diagnostics; the call reports that collection is turned on.
set_diagnostics_collection(send_diagnostics=True)

Turning diagnostics collection on. 


#### Initialize the Azure ML Workspace

In [2]:
from azureml.core.workspace import Workspace

# Load the workspace described by the local config file.
ws = Workspace.from_config()

# Report the workspace identity, one field per line.
workspace_summary = [
    "Workspace name: " + ws.name,
    "Azure region: " + ws.location,
    "Resource group: " + ws.resource_group,
]
print("\n".join(workspace_summary))

Found the config file in: C:\AI+ Tour Tutorials\Azure ML service\heart\Azure ML\aml_config\config.json
Workspace name: ML-Service-Workspace
Azure region: eastus
Resource group: ML-Service-RG


#### Attach your compute target

In [3]:
from azureml.core.compute import ComputeTarget

# Name of the existing compute cluster to look up in the workspace.
cluster_name = "cpunode"
compute_target = ComputeTarget(name=cluster_name, workspace=ws)

# Print the serialized cluster status (allocation state, node counts, VM size, ...).
cluster_status = compute_target.status.serialize()
print(cluster_status)

{'allocationState': 'Steady', 'allocationStateTransitionTime': '2019-02-14T18:43:52.143000+00:00', 'creationTime': '2019-02-14T18:43:13.923060+00:00', 'currentNodeCount': 0, 'errors': None, 'modifiedTime': '2019-02-14T18:43:59.767876+00:00', 'nodeStateCounts': {'idleNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0, 'preparingNodeCount': 0, 'runningNodeCount': 0, 'unusableNodeCount': 0}, 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 1, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'targetNodeCount': 0, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_DS3_V2'}


#### Create an Experiment in your Workspace to track the training runs

In [4]:
from azureml.core import Experiment

# Name of the experiment that groups the training runs submitted from this notebook.
experiment_name = "classification-automl"
# Experiment object bound to the workspace; used later to submit the AutoML job.
experiment = Experiment(ws, name=experiment_name)

#### Upload data to the cloud

In [5]:
# Use the workspace's default datastore and show where it lives.
ds = ws.get_default_datastore()
print(ds.datastore_type, ds.account_name, ds.container_name)

# Upload everything under ../data to the "classification-automl" folder of the
# datastore, replacing existing files. Kept as the cell's last expression so the
# notebook displays the value returned by the upload.
ds.upload(
    src_dir="../data",
    target_path="classification-automl",
    overwrite=True,
    show_progress=True,
)

AzureBlob mlservicstoragevqkhmalr azureml-blobstore-03a77933-b9d0-4918-bd23-4f23d00afafb
Uploading ../data\heart.csv
Uploaded ../data\heart.csv, 1 files out of an estimated total of 1


$AZUREML_DATAREFERENCE_dd8b846a15d84f39a3b1f8c5f4db669b

#### Create a Data Reference Configuration, which allows the system to download data to the compute target

In [6]:
from azureml.core.runconfig import DataReferenceConfiguration

# Folder inside the datastore that holds the uploaded data, and the folder on the
# compute target it is downloaded into. get_data.py hard-codes the same two values.
path_on_datastore = "classification-automl/"
path_on_compute = "/tmp/classification-automl/"

data_reference_settings = {
    "datastore_name": ds.name,
    "path_on_datastore": path_on_datastore,
    "path_on_compute": path_on_compute,
    "mode": "download",  # download files from datastore to compute target
    "overwrite": True,
}
dr = DataReferenceConfiguration(**data_reference_settings)

#### Create a Run Configuration with the Automated ML dependencies, which allows you to submit training jobs to your target compute environment to run Automated ML tasks

In [7]:
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies

# Conda environment for the remote run: the AutoML flavour of the SDK via pip,
# numpy via conda.
cd = CondaDependencies.create(conda_packages=['numpy'], pip_packages=['azureml-sdk[automl]'])

# Python run configuration that executes on the CPU cluster.
conda_run_config = RunConfiguration(framework="python")
conda_run_config.target = compute_target
# Set the data reference of the run configuration, keyed by datastore name.
conda_run_config.data_references = {ds.name: dr}
conda_run_config.environment.python.conda_dependencies = cd

#### Create the get_data.py script, which allows the remote execution job to read the dataset. Note that this file must be in the root directory of the project. The actual data path is the path_on_compute value passed to the DataReferenceConfiguration call, followed by the path_on_datastore folder and then the file name

In [8]:
%%writefile get_data.py

import pandas as pd
from sklearn.model_selection import train_test_split

# Same folder values as the notebook's DataReferenceConfiguration cell; the CSV is
# read from path_on_compute + path_on_datastore + file name.
path_on_compute = "/tmp/classification-automl/"
path_on_datastore = "classification-automl/"


def get_data():
    """Read heart.csv on the compute target and split it into training and validation data.

    Returns:
        dict: "X" and "y" hold the training features and labels, "X_valid" and
        "y_valid" the held-out 20% split. All four are the `.values` arrays of
        the loaded frame; the label is the "target" column.
    """
    csv_path = path_on_compute + path_on_datastore + "heart.csv"
    heart = pd.read_csv(csv_path)

    labels = heart["target"].values
    features = heart.drop(["target"], axis=1).values

    # Fixed random_state keeps the split reproducible across runs.
    X_train, X_valid, y_train, y_valid = train_test_split(
        features, labels, test_size=0.2, random_state=123
    )

    return {"X": X_train, "y": y_train, "X_valid": X_valid, "y_valid": y_valid}

Writing get_data.py


#### Define the AutoML settings and the job to be submitted

In [9]:
import logging
from azureml.train.automl import AutoMLConfig

# Search settings, forwarded to AutoMLConfig as keyword arguments.
automl_settings = dict(
    iteration_timeout_minutes=60,
    iterations=50,
    primary_metric='accuracy',
    preprocess=True,
    max_cores_per_iteration=4,
    verbosity=logging.INFO,
    model_explainability=True,
)

# Classification job that runs with the remote run configuration and loads its
# data through get_data.py from the current directory.
automl_config = AutoMLConfig(
    task='classification',
    debug_log='automl_errors.log',
    path=".",
    run_configuration=conda_run_config,
    data_script="get_data.py",
    **automl_settings
)

#### Submit your training job

In [10]:
# Submit the AutoML configuration to the experiment; the returned run handle is
# what the later cells monitor, wait on and query for results.
run = experiment.submit(automl_config)
# Print the run summary (experiment, id, type, status).
print(run)

Run(Experiment: classification-automl,
Id: AutoML_2a412fe9-ed1e-462c-9d13-36146c4b2e4d,
Type: automl,
Status: Preparing)


#### Get more details of your run

In [11]:
# print(run.get_details())

#### Monitor your job

In [12]:
from azureml.widgets import RunDetails

# Build the monitoring widget for the submitted run, then render it in the notebook.
run_details = RunDetails(run)
run_details.show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': True, 'log_level': 'INFO', 'sd…

#### Wait for the job to complete and print a summary of the execution

In [13]:
# Wait for the job to finish while streaming the per-iteration table. Left as the
# cell's last expression so the notebook also displays the returned run details.
run.wait_for_completion(show_output = True)


*******************************************************************************************************************
ITERATION: The iteration being evaluated.
PIPELINE: A summary description of the pipeline being evaluated.
TRAINFRAC: Fraction of the training data to train on.
DURATION: Time taken for the current iteration.
METRIC: The result of computing score on the fitted pipeline.
BEST: The best observed score thus far.
*******************************************************************************************************************

 ITERATION   PIPELINE                                       TRAINFRAC  DURATION      METRIC      BEST
         0   RobustScaler LogisticRegression                1          0:00:43       0.7541    0.7541
         1   StandardScalerWrapper LightGBM                 1          0:00:38       0.7213    0.7541
         2   StandardScalerWrapper DecisionTree             1          0:00:42       0.7869    0.7869
         3   RobustScaler LightGBM             



         9   StandardScalerWrapper BernoulliNaiveBayes      1          0:00:39       0.7541    0.7869
        10   MaxAbsScaler LightGBM                          1          0:00:37       0.7541    0.7869
        11   MinMaxScaler LightGBM                          1          0:00:40       0.7213    0.7869
        12   StandardScalerWrapper LightGBM                 1          0:00:46       0.6885    0.7869
        13   MinMaxScaler SGD                               1          0:00:31       0.5082    0.7869
        14   SparseNormalizer LightGBM                      1          0:00:29       0.7869    0.7869
        15   SparseNormalizer LightGBM                      1          0:00:33       0.7213    0.7869
        16   SparseNormalizer LightGBM                      1          0:00:32       0.7377    0.7869
        17   SparseNormalizer LightGBM                      1          0:00:42       0.7377    0.7869
        18   RobustScaler LightGBM                          1          0:00:42    

{'runId': 'AutoML_2a412fe9-ed1e-462c-9d13-36146c4b2e4d',
 'target': 'cpunode',
 'status': 'Completed',
 'startTimeUtc': '2019-02-14T19:33:50.624188Z',
 'endTimeUtc': '2019-02-14T20:15:14.070763Z',
 'properties': {'num_iterations': '50',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'MaxTimeSeconds': '3600',
  'acquisition_parameter': '0',
  'num_cross_validation': None,
  'target': 'cpunode',
  'DataPrepJsonString': None,
  'EnableSubsampling': 'False',
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'dependencies_versions': '{"azuremlftk": "0.1.18323.5a1", "azureml-widgets": "1.0.10", "azureml-train": "1.0.10", "azureml-train-restclients-hyperdrive": "1.0.10", "azureml-train-core": "1.0.10", "azureml-train-automl": "1.0.10", "azureml-telemetry": "1.0.10", "azureml-sdk": "1.0.10", "azureml-pipeline": "1.0.10", "azureml-pipeline-steps": "1.0.10.1", "azureml-pipeline-core": "1.0.10", "azureml-explain-

#### In case you need to cancel your job while still running

In [14]:
# run.cancel()

#### You can also use sdk methods to fetch all the child runs and see individual metrics

In [15]:
import pandas as pd

# Fetch every child run (one per AutoML iteration) of the parent run.
children = list(run.get_children())

# Maps iteration number -> {metric name: value} for that iteration.
metricslist = {}

for single_run in children:
    properties = single_run.get_properties()
    # Keep only float-valued metrics so every cell of the table is a plain number.
    metrics = {k: v for k, v in single_run.get_metrics().items() if isinstance(v, float)}
    metricslist[int(properties['iteration'])] = metrics

# Rows are metric names, columns are iteration numbers. Sort the columns so the
# iterations appear in order. `axis` is passed by keyword because newer pandas
# versions no longer accept it positionally (`sort_index(1)` raises TypeError).
rundata = pd.DataFrame(metricslist).sort_index(axis=1)
display(rundata)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
AUC_macro,0.870968,0.856989,0.846774,0.864516,0.85914,0.836559,0.895699,0.673118,0.846237,0.872043,...,0.869892,0.841935,0.851613,0.868817,0.863441,0.869892,0.867742,0.866667,0.869892,0.877419
AUC_micro,0.870968,0.856989,0.846774,0.864516,0.85914,0.836559,0.895699,0.673118,0.846237,0.872043,...,0.869892,0.841935,0.851613,0.868817,0.863441,0.869892,0.867742,0.866667,0.869892,0.877419
AUC_weighted,0.870968,0.856989,0.846774,0.864516,0.85914,0.836559,0.895699,0.673118,0.846237,0.872043,...,0.869892,0.841935,0.851613,0.868817,0.863441,0.869892,0.867742,0.866667,0.869892,0.877419
accuracy,0.754098,0.721311,0.786885,0.786885,0.737705,0.721311,0.770492,0.508197,0.737705,0.754098,...,0.754098,0.754098,0.721311,0.754098,0.770492,0.737705,0.754098,0.754098,0.770492,0.770492
average_precision_score_macro,0.883593,0.879039,0.817197,0.88316,0.881275,0.848592,0.915778,0.628024,0.873079,0.875602,...,0.882858,0.861258,0.867306,0.880976,0.878317,0.899235,0.880374,0.879596,0.882801,0.89766
average_precision_score_micro,0.883593,0.879039,0.817197,0.88316,0.881275,0.848592,0.915778,0.628024,0.873079,0.875602,...,0.882858,0.861258,0.867306,0.880976,0.878317,0.899235,0.880374,0.879596,0.882801,0.89766
average_precision_score_weighted,0.883593,0.879039,0.817197,0.88316,0.881275,0.848592,0.915778,0.628024,0.873079,0.875602,...,0.882858,0.861258,0.867306,0.880976,0.878317,0.899235,0.880374,0.879596,0.882801,0.89766
balanced_accuracy,0.753763,0.72043,0.787097,0.786559,0.736559,0.719892,0.768817,0.5,0.735484,0.754839,...,0.753763,0.753226,0.722581,0.753763,0.77043,0.737634,0.753763,0.753763,0.77043,0.77043
f1_score_macro,0.753834,0.720108,0.786885,0.786656,0.735931,0.718589,0.767429,0.336957,0.731868,0.753834,...,0.753834,0.753036,0.720108,0.753834,0.77043,0.737634,0.753834,0.753834,0.77043,0.77043
f1_score_micro,0.754098,0.721311,0.786885,0.786885,0.737705,0.721311,0.770492,0.508197,0.737705,0.754098,...,0.754098,0.754098,0.721311,0.754098,0.770492,0.737705,0.754098,0.754098,0.770492,0.770492


#### Retrieve the best model

In [16]:
# Retrieve the best child run together with its fitted pipeline.
best_run, fitted_pipeline = run.get_output()

# Print the run, the pipeline and the pipeline's steps, separated by a ruler line.
separator = "####################"
print(best_run, separator, fitted_pipeline, separator, fitted_pipeline.steps, sep="\n")

Run(Experiment: classification-automl,
Id: AutoML_2a412fe9-ed1e-462c-9d13-36146c4b2e4d_38,
Type: azureml.scriptrun,
Status: Completed)
####################
Pipeline(memory=None,
     steps=[('datatransformer', DataTransformer(logger=None, task=None)), ('standardscalerwrapper', <automl.client.core.common.model_wrappers.StandardScalerWrapper object at 0x0000020C2CA90A90>), ('sgdclassifierwrapper', SGDClassifierWrapper(alpha=6.53064693877551, class_weight=None, eta0=0.001,
        ...bs=4,
           penalty='l2', power_t=0.3333333333333333, random_state=None,
           tol=0.01))])
####################
[('datatransformer', DataTransformer(logger=None, task=None)), ('standardscalerwrapper', <automl.client.core.common.model_wrappers.StandardScalerWrapper object at 0x0000020C2CA90A90>), ('sgdclassifierwrapper', SGDClassifierWrapper(alpha=6.53064693877551, class_weight=None, eta0=0.001,
           fit_intercept=False, l1_ratio=0.8979591836734693,
           learning_rate='optimal', loss='lo

#### Score the best model on the test set

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Rebuild the split locally with the same test_size and random_state as
# get_data.py. NOTE(review): that makes X_test/y_test the same rows AutoML received
# as X_valid/y_valid, so the "test" score is measured on the data used for model
# selection - confirm this is intended.
df_heart = pd.read_csv("../data/heart.csv")
df_heart_y = df_heart["target"].values
df_heart_X = df_heart.drop(["target"], axis=1).values
X_train, X_test, y_train, y_test = train_test_split(
    df_heart_X, df_heart_y, test_size=0.2, random_state=123
)

# Score of the best pipeline on each split, rounded to three decimals.
train_accuracy = fitted_pipeline.score(X_train, y_train)
score_train = round(train_accuracy, 3)
print("Best Accuracy train: ", score_train)

test_accuracy = fitted_pipeline.score(X_test, y_test)
score_test = round(test_accuracy, 3)
print("Best Accuracy test: ", score_test)

Best Accuracy train:  0.86
Best Accuracy test:  0.82


#### View model explainability

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split

from azureml.train.automl.automlexplainer import explain_model

# Reload the data and rebuild the same split as the scoring cell so this cell can
# run on its own.
df_heart = pd.read_csv("../data/heart.csv")
df_heart_y = df_heart["target"].values
df_heart_X = df_heart.drop(["target"], axis=1).values
X_train, X_test, y_train, y_test = train_test_split(
    df_heart_X, df_heart_y, test_size=0.2, random_state=123
)

# Explain the best pipeline using the training split and the held-out split.
explanation = explain_model(fitted_pipeline, X_train, X_test)
shap_values, expected_values, overall_summary, overall_imp, per_class_summary, per_class_imp = explanation

# Overall feature importance: feature names, then their importance values.
print(overall_imp, overall_summary, sep="\n")

# Class-level feature importance, in the same names-then-values order.
print(per_class_imp, per_class_summary, sep="\n")

100%|██████████████████████████████████████████████████████████████████████████████████| 61/61 [00:36<00:00,  2.57it/s]


['C12_MeanImputer', 'C10_MeanImputer', 'C3_MeanImputer', 'C9_MeanImputer', 'C11_MeanImputer', 'C8_MeanImputer', 'C13_MeanImputer', 'C2_MeanImputer', 'C1_MeanImputer', 'C7_MeanImputer', 'C4_MeanImputer', 'C5_MeanImputer', 'C6_MeanImputer']
[0.007709839770767843, 0.007191645513937159, 0.007109638948903136, 0.006533919339842723, 0.005820693723920989, 0.005661370492156981, 0.005615518197133055, 0.004377947602452456, 0.0032724209164514545, 0.0023804991366112235, 0.0012496833597931468, 0.0003991945689791898, 3.0002391221483813e-05]
[['C12_MeanImputer', 'C10_MeanImputer', 'C3_MeanImputer', 'C9_MeanImputer', 'C11_MeanImputer', 'C8_MeanImputer', 'C13_MeanImputer', 'C2_MeanImputer', 'C1_MeanImputer', 'C7_MeanImputer', 'C4_MeanImputer', 'C5_MeanImputer', 'C6_MeanImputer'], ['C12_MeanImputer', 'C10_MeanImputer', 'C3_MeanImputer', 'C9_MeanImputer', 'C11_MeanImputer', 'C8_MeanImputer', 'C13_MeanImputer', 'C2_MeanImputer', 'C1_MeanImputer', 'C7_MeanImputer', 'C4_MeanImputer', 'C5_MeanImputer', 'C6_Me

#### Save the best model

In [19]:
from joblib import dump

# Persist the best fitted pipeline to best_model.joblib in the current directory.
dump(fitted_pipeline, "best_model.joblib") 

#### The preprocessed data is cached in the user's default file store. When the run is completed, the cache can be cleaned

In [21]:
# run.clean_preprocessor_cache()