# Automated ML

TODO: Import Dependencies. In the cell below, import all the dependencies that you will need to complete the project.

In [1]:
import logging
import os
import csv
import json

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import pkg_resources

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset

from azureml.pipeline.steps import AutoMLStep
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException
from azureml.widgets import RunDetails

from azureml.core.model import InferenceConfig
from azureml.core.webservice import AciWebservice
from azureml.core.webservice import Webservice
from azureml.core.model import Model
from azureml.core.environment import Environment

from azureml.automl.runtime.onnx_convert import OnnxConverter


# Check core SDK version number
print("SDK version:", azureml.core.VERSION)

SDK version: 1.49.0


## Dataset

### Overview

The dataset contains information about visual characteristics of cancerous cells in the breast and ground truth about the diagnosis made based on the cell data. The visuals were measured based on X-rays.

The task is to predict based on the 30 different visual parameters if the cell is benign (B) or malignant (M)

In [2]:
ws = Workspace.from_config()
experiment_name = 'cancer-auto-ml'
project_folder = './capstone-project'

experiment=Experiment(ws, experiment_name)
experiment

2023-05-09:12:26:54,889 INFO     [workspace.py:291] Found the config file in: /config.json
2023-05-09:12:26:56,887 INFO     [clientbase.py:192] Created a worker pool for first use


Name,Workspace,Report Page,Docs Page
cancer-auto-ml,quick-starts-ws-233260,Link to Azure Machine Learning studio,Link to Documentation


In [3]:


amlcompute_cluster_name = "cluster-project"


try:
    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',# for GPU, use "STANDARD_NC6"
                                                           #vm_priority = 'lowpriority', # optional
                                                           max_nodes=4)
    compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)

compute_target.wait_for_completion(show_output=True, min_node_count = 1, timeout_in_minutes = 10)


Found existing cluster, use it.
Succeeded....................................................................................................................
AmlCompute wait for completion finished

Wait timeout has been reached
Current provisioning state of AmlCompute is "Succeeded" and current node count is "0"


In [4]:
dataset_name = 'cancer-data'
try: 
    ds = ws.datasets[dataset_name]
except KeyError:
    print("Dataset not found, create and rerun this cell!")
    raise

In [5]:
df = ds.to_pandas_dataframe()
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.12,0.28,0.3,0.15,...,25.38,17.33,184.6,2019.0,0.16,0.67,0.71,0.27,0.46,0.12
1,842517,M,20.57,17.77,132.9,1326.0,0.08,0.08,0.09,0.07,...,24.99,23.41,158.8,1956.0,0.12,0.19,0.24,0.19,0.28,0.09
2,84300903,M,19.69,21.25,130.0,1203.0,0.11,0.16,0.2,0.13,...,23.57,25.53,152.5,1709.0,0.14,0.42,0.45,0.24,0.36,0.09
3,84348301,M,11.42,20.38,77.58,386.1,0.14,0.28,0.24,0.11,...,14.91,26.5,98.87,567.7,0.21,0.87,0.69,0.26,0.66,0.17
4,84358402,M,20.29,14.34,135.1,1297.0,0.1,0.13,0.2,0.1,...,22.54,16.67,152.2,1575.0,0.14,0.2,0.4,0.16,0.24,0.08


## AutoML Configuration

I have chosen the timeout and concurrent iterations so the training would not be too time-consuming. For the same reason I have also enabled early stopping. 

In [6]:
automl_settings = {
    "experiment_timeout_minutes": 20,
    "max_concurrent_iterations": 5,
    "primary_metric" : 'AUC_weighted'
}
automl_config = AutoMLConfig(compute_target=compute_target,
                             task = "classification",
                             training_data=ds,
                             label_column_name="diagnosis",   
                             path = project_folder,
                             enable_early_stopping=True,
                             featurization= 'auto',
                             debug_log = "automl_errors.log",
                             **automl_settings
                            )

In [7]:

remote_run = experiment.submit(automl_config)

Submitting remote run.


Experiment,Id,Type,Status,Details Page,Docs Page
cancer-auto-ml,AutoML_04db1331-2219-4eef-93bc-3b73ce29a4ec,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation


## Run Details

In [8]:
RunDetails(remote_run).show()
remote_run.wait_for_completion(show_output=True)

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

Experiment,Id,Type,Status,Details Page,Docs Page
cancer-auto-ml,AutoML_04db1331-2219-4eef-93bc-3b73ce29a4ec,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

********************************************************************************************
DATA GUARDRAILS: 

TYPE:         Cross validation
STATUS:       DONE
DESCRIPTION:  In order to accurately evaluate the model(s) trained by AutoML, we leverage a dataset that the model is not trained on. Hence, if the user doesn't provide an explicit validation dataset, a part of the training dataset is used to achieve this. For smaller datasets (fewer than 20,000 samples), cross-validation is leveraged, else a single hold-out set is split from the training data to serve as the validation dataset. Hence, for your input data we leverage cross-validation with 10 folds, if the number of training samples are fewer than 1000, and 3 folds in all other cases.
              Learn mo

{'runId': 'AutoML_04db1331-2219-4eef-93bc-3b73ce29a4ec',
 'target': 'cluster-project',
 'status': 'Completed',
 'startTimeUtc': '2023-05-09T12:37:22.568395Z',
 'endTimeUtc': '2023-05-09T13:02:21.878424Z',
 'services': {},
   'message': 'No scores improved over last 10 iterations, so experiment stopped early. This early stopping behavior can be disabled by setting enable_early_stopping = False in AutoMLConfig for notebook/python SDK runs.'}],
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'AUC_weighted',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': None,
  'target': 'cluster-project',
  'DataPrepJsonString': '{\\"training_data\\": {\\"datasetId\\": \\"d5a5fa2f-0ead-4d2c-888b-37b80ea0733f\\"}, \\"datasets\\": 0}',
  'EnableSubsampling': None,
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'classification',
  'dependencies_versions': '{"azur

## Best Model



In [9]:

best_run = remote_run.get_best_child()
print(best_run)

brm = best_run.get_metrics()

for metric_name, metric in zip(brm.keys(), brm.values()):
    print(f"{metric_name}: {metric}")

Run(Experiment: cancer-auto-ml,
Id: AutoML_04db1331-2219-4eef-93bc-3b73ce29a4ec_61,
Type: azureml.scriptrun,
Status: Completed)
average_precision_score_weighted: 0.9973250077446478
balanced_accuracy: 0.9781373310193644
f1_score_macro: 0.9808651753980211
accuracy: 0.982393483709273
norm_macro_recall: 0.9562746620387286
recall_score_micro: 0.982393483709273
precision_score_micro: 0.982393483709273
average_precision_score_micro: 0.9969485499951599
precision_score_macro: 0.9847982110482111
log_loss: 0.10125785324674945
matthews_correlation: 0.9628508423823933
AUC_weighted: 0.9970297023947919
average_precision_score_macro: 0.9972113334687732
precision_score_weighted: 0.983250633845872
f1_score_micro: 0.982393483709273
AUC_micro: 0.996816210325312
AUC_macro: 0.9970297023947919
recall_score_macro: 0.9781373310193644
recall_score_weighted: 0.982393483709273
f1_score_weighted: 0.9822991559480798
weighted_accuracy: 0.9855823082884321
accuracy_table: aml://artifactId/ExperimentRun/dcid.AutoML_04d

In [10]:
print(best_run.properties)

{'runTemplate': 'automl_child', 'pipeline_id': '__AutoML_Ensemble__', 'pipeline_spec': '{"pipeline_id":"__AutoML_Ensemble__","objects":[{"module":"azureml.train.automl.ensemble","class_name":"Ensemble","spec_class":"sklearn","param_args":[],"param_kwargs":{"automl_settings":"{\'task_type\':\'classification\',\'primary_metric\':\'AUC_weighted\',\'verbosity\':20,\'ensemble_iterations\':15,\'is_timeseries\':False,\'name\':\'cancer-auto-ml\',\'compute_target\':\'cluster-project\',\'subscription_id\':\'3d1a56d2-7c81-4118-9790-f85d1acf0c77\',\'region\':\'westeurope\',\'spark_service\':None}","ensemble_run_id":"AutoML_04db1331-2219-4eef-93bc-3b73ce29a4ec_61","experiment_name":"cancer-auto-ml","workspace_name":"quick-starts-ws-233260","subscription_id":"3d1a56d2-7c81-4118-9790-f85d1acf0c77","resource_group_name":"aml-quickstarts-233260"}}]}', 'training_percent': '100', 'predicted_cost': None, 'iteration': '61', '_aml_system_scenario_identification': 'Remote.Child', '_azureml.ComputeTargetType'

In [11]:
model_name = best_run.properties["model_name"]

script_file_name = "inference/score.py"

best_run.download_file("outputs/scoring_file_v_1_0_0.py", "inference/score.py")

## Model Deployment

Remember you have to deploy only one of the two models you trained but you still need to register both the models. Perform the steps in the rest of this notebook only if you wish to deploy this model.


In [12]:
description = "AutoML Model trained on cancer data to predict if a patient has a malignant or benign tumour"
tags = None
model = remote_run.register_model(
    model_name=model_name, description=description, tags=tags
)

In [13]:

inference_config = InferenceConfig(entry_script=script_file_name)

aciconfig = AciWebservice.deploy_configuration(
    cpu_cores=2,
    memory_gb=2,
    tags={"area": "CancerData", "type": "automl_cd_classification"},
    description="sample service for CancerData Automl Classification",
)

aci_service_name = model_name.lower()
print(aci_service_name)
aci_service = Model.deploy(ws, aci_service_name, [model], inference_config, aciconfig)
aci_service.wait_for_deployment(True)
print(aci_service.state)

automl04db1331261
Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2023-05-09 13:02:49+00:00 Creating Container Registry if not exists.
2023-05-09 13:02:50+00:00 Use the existing image.
2023-05-09 13:02:51+00:00 Submitting deployment to compute.
2023-05-09 13:02:59+00:00 Checking the status of deployment automl04db1331261..
2023-05-09 13:05:16+00:00 Checking the status of inference endpoint automl04db1331261.
Succeeded
ACI service creation operation finished, operation "Succeeded"
Healthy


In [14]:
aci_service.get_logs()



In [23]:
from sklearn.model_selection import train_test_split

def clean_data(data):
    x_df = pd.read_csv(data).dropna()
    y_df = x_df.pop("diagnosis")
    return x_df, y_df

x, y = clean_data('breast-cancer.csv')

# TODO: Split data into train and test sets.

_, X_test, _, y_test = train_test_split(x, y, test_size=1/len(y), random_state=42)

In [24]:
import requests

X_test_json = X_test.to_json(orient="records")
data = '{"data": ' + X_test_json + "}"
headers = {"Content-Type": "application/json"}

resp = requests.post(aci_service.scoring_uri, data, headers=headers)

y_pred = json.loads(json.loads(resp.text))["result"]

In [26]:
print(len(y_pred), " ", len(y_test))
print(y_pred, y_test)

1   1
['B'] 204    B
Name: diagnosis, dtype: object


**Submission Checklist**
- I have registered the model.
- I have deployed the model with the best accuracy as a webservice.
- I have tested the webservice by sending a request to the model endpoint.
- I have deleted the webservice and shutdown all the computes that I have used.
- I have taken a screenshot showing the model endpoint as active.
- The project includes a file containing the environment details.
