In [1]:
### How to log 
### How to register a dataset
### How to retrieve data from a datastore / workspace (registered)
### Check all experiments in a workspace
### Check all runs in an experiment
### Get a specific run by its tag/property
### Tag runs, add properties

### How to register a model
### Review all registered models
### Retrieve a model by its tag


### How to create a new environment
### How to register an environment


### How to create a new compute cluster
### How to create a run config for pipelines


In [2]:
from azureml.core import Workspace, Experiment, Run, ScriptRunConfig
from azureml.core.runconfig import DockerConfiguration
from azureml.widgets import RunDetails
import os
import sys

In [3]:
# Local folder that receives the scripts and the environment file written by the
# %%writefile cells below; it is also used as the steps' source_directory.
script_dir = 'playground'
os.makedirs(script_dir, exist_ok=True)


#### A couple of things need attention:
- arguments: the `--input-data` and `--preped-data` argument names must be the same in the script and in the `arguments` list of its PythonScriptStep
- the file name saved into the OutputFileDatasetConfig folder by the first step (`preped_data.csv`) must be the same file name the second step reads
- the input for the initial step: diabetes_ds.as_named_input('raw_data')
- the input for the second step: preped_data.as_input()

In [4]:
%%writefile $script_dir/data_prep.py
"""Pipeline step: min-max scale the numeric diabetes features and save the result as CSV."""
import os
import argparse
from azureml.core import Run
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# One list drives both the scaling and the logged column names, so the two
# can never get out of sync.
num_features = ['Pregnancies','PlasmaGlucose','DiastolicBloodPressure',
                'TricepsThickness','SerumInsulin','BMI','DiabetesPedigree']

parser = argparse.ArgumentParser()
parser.add_argument('--input-data', type=str, dest='raw_data', help='Raw dataset')
parser.add_argument('--preped-data', type=str, dest='preped_data', help='Prepared dataset')
args = parser.parse_args()

# Output folder handed to this step through --preped-data.
saved_dir = args.preped_data
run = Run.get_context()

# The dataset is looked up by the name it was given with as_named_input('raw_data').
df = run.input_datasets['raw_data'].to_pandas_dataframe()
# Logs the number of rows (as a string) under the metric name 'data shape'.
run.log(name='data shape', value=str(df.shape[0]))

run.log_list("numerical cols", num_features)

# Rescale the numeric columns in place; all other columns are written unchanged.
scaler = MinMaxScaler()
df[num_features] = scaler.fit_transform(df[num_features])

print('saving data to preped_data')
os.makedirs(saved_dir, exist_ok=True)
# The training step reads this exact file name from the same folder.
df.to_csv(os.path.join(saved_dir, 'preped_data.csv'), index=False)
run.complete()

Overwriting playground/data_prep.py


In [5]:
%%writefile $script_dir/train.py

"""Pipeline step: train a decision tree on the prepared diabetes data and register the model."""
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import argparse
import numpy as np
from azureml.core import Run, Model
import joblib

parser = argparse.ArgumentParser()
parser.add_argument('--training-data', type=str, dest='training_data', help='Input dataset for model')
args = parser.parse_args()

training_data_dir = args.training_data

run = Run.get_context()

# The prepared data arrives as a folder path argument (not as a named input
# dataset), so read the CSV file that the data prep step wrote into it.
df = pd.read_csv(os.path.join(training_data_dir, 'preped_data.csv'))

features = ['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness',
            'SerumInsulin','BMI', 'DiabetesPedigree', 'Age']

# Separate features and labels
X, y = df[features].values, df['Diabetic'].values

# 70/30 split with a fixed seed so the same rows are held out on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=0)

# NOTE(review): the classifier has no random_state, so metrics can differ slightly between runs.
model = DecisionTreeClassifier().fit(X_train, y_train)

y_pred = model.predict(X_test)

# ACC: fraction of test rows predicted correctly
acc = np.average(y_pred==y_test)
run.log(name='acc', value=acc)

# AUC: computed from the predicted probability of the positive class (column 1)
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test, y_scores[:, 1])
run.log(name='auc', value=auc)

# ROC
fpr, tpr, threshold = roc_curve(y_test, y_scores[:,1])
fig = plt.figure(figsize=(6, 4))
# Plot the diagonal 50% line
plt.plot([0, 1], [0, 1], 'k--')
# Plot the FPR and TPR achieved by our model
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
run.log_image(name = "ROC", plot = fig)
plt.show()

print('Saving model')
os.makedirs('models', exist_ok=True)

model_file_path = os.path.join('models', 'diabetes_model.pkl')
joblib.dump(value=model, filename=model_file_path)

# Use the built-in float: the `np.float` alias was deprecated in NumPy 1.20 and
# removed in 1.24, where it raises AttributeError.
Model.register(workspace=run.experiment.workspace,
               model_path=model_file_path,
               model_name='diabetes_model',
               tags={"training context": "pipeline_2"},
               properties={'AUC': float(auc), "ACC": float(acc)})
run.complete()


Overwriting playground/train.py


In [6]:
# Connect to the workspace using the local configuration and grab its default
# datastore; `ws` and `datastore` are used by most of the cells below.
ws = Workspace.from_config()
datastore = ws.get_default_datastore()


If you run your code in unattended mode, i.e., where you can't give a user input, then we recommend to use ServicePrincipalAuthentication or MsiAuthentication.
Please refer to aka.ms/aml-notebook-auth for different authentication mechanisms in azureml-sdk.


In [7]:
# Iterating ws.datasets yields the names of the registered datasets
for data in ws.datasets:
    print(data)


diabetes_dataset2
batch-data
diabetes file dataset
diabetes dataset
loan_data


In [8]:
'diabetes dataset' in ws.datasets

True

In [9]:
from glob import glob
# List the CSV files in the local 'data' folder (the folder uploaded in the next cell)
glob('data/*.csv')

['data\\diabetes.csv', 'data\\diabetes2.csv']

In [10]:
# Upload the local CSV files and register them as a tabular dataset (first time only)
from azureml.core import Dataset
from azureml.data.datapath import DataPath

dataset_name = 'diabetes_dataset2'
# If this dataset is already registered we do nothing; otherwise we upload the files and register it.
# (Files that were already uploaded are expected to be reported rather than uploaded again.)
if dataset_name not in ws.datasets:
    print("We will upload the files to cloud datastore, retrive it and register it")
    Dataset.File.upload_directory(src_dir='data', target=DataPath(datastore, dataset_name))
    # Datastore paths use '/' regardless of the local OS. os.path.join would
    # produce 'diabetes_dataset2\*.csv' when this notebook runs on Windows.
    tab_data = Dataset.Tabular.from_delimited_files(path=(datastore, f"{dataset_name}/*.csv"))
    try:
        tab_data = tab_data.register(workspace=ws,
                                    name=dataset_name,
                                    tags={"format": "csv"},
                                    description="diabetes dataset",
                                    create_new_version=True)
    except Exception as ex:
        # Best effort: a failed registration is reported but does not stop the notebook.
        print(ex)
else:
    print("dataset already exist")
    


dataset already exist


In [11]:
# Fetch the registered dataset from the workspace by name and preview its first 10 rows
ws.datasets.get(dataset_name).to_pandas_dataframe().head(10)

Unnamed: 0,PatientID,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
0,1354778,0,171,80,34,23,43.509726,1.213191,21,0
1,1147438,8,92,93,47,36,21.240576,0.158365,23,0
2,1640031,7,115,47,52,35,41.511523,0.079019,23,0
3,1883350,9,103,78,25,304,29.582192,1.28287,43,1
4,1424119,1,85,59,27,35,42.604536,0.549542,22,0
5,1619297,0,82,92,9,253,19.72416,0.103424,26,0
6,1660149,0,133,47,19,227,21.941357,0.17416,21,0
7,1458769,0,67,87,43,36,18.277723,0.236165,26,0
8,1201647,8,80,95,33,24,26.624929,0.443947,53,1
9,1403912,1,72,31,40,42,36.889576,0.103944,26,0


### Prepare the environment

In [12]:
%%writefile $script_dir/experiment_env.yml
# Conda specification for the environment assigned to the pipeline run configuration.
name: experiment_env_2
dependencies:
- python=3.6.2
- scikit-learn
- ipykernel
- matplotlib
- pandas
- pip
- pip:
  - azureml-defaults
  - pyarrow

Overwriting playground/experiment_env.yml


In [13]:
from azureml.core import Environment
# Create a Python environment for the experiment from the .yml file written in the previous cell
experiment_env = Environment.from_conda_specification("experiment_env_2", script_dir + "/experiment_env.yml")

# Register the environment in the workspace, then fetch it back by name;
# `registered_env` is the object assigned to the pipeline run configuration later.
experiment_env.register(workspace=ws)
registered_env = Environment.get(ws, 'experiment_env_2')

### Prepare a compute cluster

In [14]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name of the compute cluster that runs the pipeline steps
cluster_name = "ComputerCluster"

try:
    # Check for existing compute target (a missing target is signalled by ComputeTargetException)
    pipeline_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # If it doesn't already exist, create it (at most 2 nodes) and wait until provisioning ends
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)
        pipeline_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        pipeline_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        # NOTE(review): a failure is only printed; if ComputeTarget.create itself raised,
        # `pipeline_cluster` stays undefined and later cells fail with NameError.
        print(ex)
    

Found existing cluster, use it.


### Setup the run config for the pipeline

https://learn.microsoft.com/en-us/python/api/azureml-core/azureml.core.runconfig.runconfiguration?view=azure-ml-py

In [15]:
from azureml.core.runconfig import RunConfiguration
# Create a new runconfig object for the pipeline; it is shared by both pipeline steps
pipeline_run_config = RunConfiguration()

# Use the compute cluster found or created above.
pipeline_run_config.target = pipeline_cluster

# Assign the registered environment to the run configuration
pipeline_run_config.environment = registered_env

print ("Run configuration created.")

Run configuration created.


#### Define all the PythonScriptStep and intermediate data reference

In [16]:
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep

# Registered dataset that feeds the first step
diabetes_ds = ws.datasets.get("diabetes dataset")

# Intermediate output location: written by step 1, consumed by step 2
preped_data = OutputFileDatasetConfig("preped_data")

# Keyword settings that are identical for every step of this pipeline
shared_step_settings = dict(
    source_directory=script_dir,
    compute_target=pipeline_cluster,
    runconfig=pipeline_run_config,
    allow_reuse=True,
)

# Step 1: the data prep script reads the named input and writes into `preped_data`
prep_step = PythonScriptStep(
    name="Prepare Data",
    script_name="data_prep.py",
    arguments=['--input-data', diabetes_ds.as_named_input('raw_data'),
               '--preped-data', preped_data],
    **shared_step_settings,
)

# Step 2: the training script receives the output of step 1 as its input folder
train_step = PythonScriptStep(
    name="Train and Register Model",
    script_name="train.py",
    arguments=['--training-data', preped_data.as_input()],
    **shared_step_settings,
)

print("Pipeline steps defined")

Pipeline steps defined


In [17]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

# Construct the pipeline from the two steps (data prep, then training)
pipeline_steps = [prep_step, train_step]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

Pipeline is built.


In [18]:
# Create an experiment and run the pipeline.
# regenerate_outputs=True makes every step run again even though the steps allow reuse.
experiment = Experiment(workspace=ws, name = 'mslearn-diabetes-pipeline-2')
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")
# Show the run widget, then block until the whole pipeline has finished
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=True)

Created step Prepare Data [830f93a4][1e9d667b-4866-4d90-8474-29b91b3aa5d8], (This step will run and generate new outputs)
Created step Train and Register Model [88404573][bf3ec571-dd7e-431c-a1f5-545e5620f870], (This step will run and generate new outputs)
Submitted PipelineRun 03bee7ad-a0b8-40d5-b132-f3edf3a3056c
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/03bee7ad-a0b8-40d5-b132-f3edf3a3056c?wsid=/subscriptions/efaef50b-3a01-4bf1-ad06-b63c101ab300/resourcegroups/resource-group-1/workspaces/myworkspace&tid=cb956b3e-0e1a-485c-a395-a000041d2695
Pipeline submitted for execution.


_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

PipelineRunId: 03bee7ad-a0b8-40d5-b132-f3edf3a3056c
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/03bee7ad-a0b8-40d5-b132-f3edf3a3056c?wsid=/subscriptions/efaef50b-3a01-4bf1-ad06-b63c101ab300/resourcegroups/resource-group-1/workspaces/myworkspace&tid=cb956b3e-0e1a-485c-a395-a000041d2695
PipelineRun Status: Running


StepRunId: 3d0985dd-1002-4fbf-a7c8-40a17b8dbccb
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/3d0985dd-1002-4fbf-a7c8-40a17b8dbccb?wsid=/subscriptions/efaef50b-3a01-4bf1-ad06-b63c101ab300/resourcegroups/resource-group-1/workspaces/myworkspace&tid=cb956b3e-0e1a-485c-a395-a000041d2695
StepRun( Prepare Data ) Status: NotStarted
StepRun( Prepare Data ) Status: Running

StepRun(Prepare Data) Execution Summary
StepRun( Prepare Data ) Status: Finished
{'runId': '3d0985dd-1002-4fbf-a7c8-40a17b8dbccb', 'target': 'ComputerCluster', 'status': 'Completed', 'startTimeUtc': '2023-01-16T12:36:37.74835Z', 'endTimeUtc': '2023-01-16T12:37:01.843611Z', 's




StepRunId: 025ea179-0e8e-4871-928b-ca349b912d18
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/025ea179-0e8e-4871-928b-ca349b912d18?wsid=/subscriptions/efaef50b-3a01-4bf1-ad06-b63c101ab300/resourcegroups/resource-group-1/workspaces/myworkspace&tid=cb956b3e-0e1a-485c-a395-a000041d2695
StepRun( Train and Register Model ) Status: NotStarted

StepRun(Train and Register Model) Execution Summary
StepRun( Train and Register Model ) Status: Finished
{'runId': '025ea179-0e8e-4871-928b-ca349b912d18', 'target': 'ComputerCluster', 'status': 'Completed', 'startTimeUtc': '2023-01-16T12:37:10.884406Z', 'endTimeUtc': '2023-01-16T12:37:26.269079Z', 'services': {}, 'properties': {'ContentSnapshotId': '329170f4-0f15-454c-bfc1-4a02dc486f18', 'StepType': 'PythonScriptStep', 'ComputeTargetType': 'AmlCompute', 'azureml.moduleid': 'bf3ec571-dd7e-431c-a1f5-545e5620f870', 'azureml.moduleName': 'Train and Register Model', 'azureml.runsource': 'azureml.StepRun', 'azureml.nodeid': '88404573', 



PipelineRun Execution Summary
PipelineRun Status: Finished
{'runId': '03bee7ad-a0b8-40d5-b132-f3edf3a3056c', 'status': 'Completed', 'startTimeUtc': '2023-01-16T12:36:26.096379Z', 'endTimeUtc': '2023-01-16T12:37:27.326387Z', 'services': {}, 'properties': {'azureml.runsource': 'azureml.PipelineRun', 'runSource': 'SDK', 'runType': 'SDK', 'azureml.parameters': '{}', 'azureml.continue_on_step_failure': 'False', 'azureml.continue_on_failed_optional_input': 'True', 'azureml.pipelineComponent': 'pipelinerun', 'azureml.pipelines.stages': '{"Initialization":null,"Execution":{"StartTime":"2023-01-16T12:36:26.3751879+00:00","EndTime":"2023-01-16T12:37:27.2665943+00:00","Status":"Finished"}}'}, 'inputDatasets': [], 'outputDatasets': [], 'logFiles': {'logs/azureml/executionlogs.txt': 'https://myworkspace4931631789.blob.core.windows.net/azureml/ExperimentRun/dcid.03bee7ad-a0b8-40d5-b132-f3edf3a3056c/logs/azureml/executionlogs.txt?sv=2019-07-07&sr=b&sig=9YxxOdy19TwLWNKLx%2FA35iYpfqkk2934DtL6lpgmzXg%

'Finished'

In [21]:
from azureml.core import Model

# Print name, version, tags and properties of every model registered in the workspace
for model in Model.list(ws):
    print('*' * 20)
    print(model.name, 'version:', model.version)
    # Tags first, then properties, each as "key: value"
    for tag, tag_value in model.tags.items():
        print(f"{tag}: {tag_value}")
    for prop, prop_value in model.properties.items():
        print(f"{prop}: {prop_value}")

********************
diabetes_model version: 15
training context: pipeline_2
AUC: 0.8887200152893158
ACC: 0.9024444444444445
********************
diabetes_model version: 14
Training context: Pipeline
AUC: 0.8825848234075178
Accuracy: 0.8971111111111111
********************
diabetes_model version: 13
Training context: Pipeline
AUC: 0.8855756630900167
Accuracy: 0.9
********************
diabetes_model version: 12
Training context: Pipeline
AUC: 0.8872129824694899
Accuracy: 0.9004444444444445
********************
diabetes_model version: 11
Training context: Inline Training
AUC: 0.8803323548435243
Accuracy: 0.8926666666666667
********************
diabetes_model version: 10
Training context: Inline Training
AUC: 0.8753520075625654
Accuracy: 0.888
********************
diabetes_model version: 9
Training context: Pipeline
AUC: 0.8852547024821248
Accuracy: 0.9002222222222223
********************
diabetes_model version: 8
Training context: Pipeline
AUC: 0.884408171643805
Accuracy: 0.8986666666666

### Publish the pipeline

In [24]:
# Publish the pipeline from the completed run so it can be triggered through its endpoint
published_pipeline = pipeline_run.publish_pipeline(
    name="diabetes-training-pipeline", description="Trains diabetes model", version="1.1")

# Display the published pipeline (name, id, status, endpoint)
published_pipeline

Name,Id,Status,Endpoint
diabetes-training-pipeline,b89309ef-15d6-4c40-a3bd-72468312e4d0,Active,REST Endpoint


### Call the pipeline endpoint

To use the endpoint, client applications need to make a REST call over HTTP. This request must be authenticated, so an authorization header is required. A real application would require a service principal with which to be authenticated, but to test this out, we'll use the authorization header from your current connection to your Azure workspace, which you can get using the following code:

In [25]:
from azureml.core.authentication import InteractiveLoginAuthentication

# Reuse the interactive login to build the authorization header for the REST call below
interactive_auth = InteractiveLoginAuthentication()
auth_header = interactive_auth.get_authentication_header()
print("Authentication header ready.")

Authentication header ready.


In [27]:
import requests

experiment_name = 'mslearn-diabetes-pipeline-2'

# Trigger the published pipeline through its REST endpoint; the run is filed
# under the experiment named in the request body.
rest_endpoint = published_pipeline.endpoint
response = requests.post(rest_endpoint,
                         headers=auth_header,
                         json={"ExperimentName": experiment_name})
# Fail with the HTTP error itself (e.g. an expired token) instead of a
# confusing KeyError when "Id" is missing from an error response.
response.raise_for_status()
run_id = response.json()["Id"]
run_id

'4557242f-6379-4461-83b9-d55e61d7638e'

Since you have the run ID, you can use it to wait for the run to complete.

Note: The pipeline should complete quickly, because each step was configured to allow output reuse. This was done primarily for convenience and to save time in this course. In reality, you'd likely want the first step to run every time in case the data has changed, and trigger the subsequent steps only if the output from step one changes.

#### Re-trigger the pipeline and run it. In practice, you may want to re-trigger the pipeline whenever the upstream data changes

In [28]:
from azureml.pipeline.core.run import PipelineRun

# Attach to the run that was started through the REST endpoint (by its run ID)
# and block until it has finished.
published_pipeline_run = PipelineRun(ws.experiments[experiment_name], run_id)
published_pipeline_run.wait_for_completion(show_output=True)

PipelineRunId: 4557242f-6379-4461-83b9-d55e61d7638e
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/4557242f-6379-4461-83b9-d55e61d7638e?wsid=/subscriptions/efaef50b-3a01-4bf1-ad06-b63c101ab300/resourcegroups/resource-group-1/workspaces/myworkspace&tid=cb956b3e-0e1a-485c-a395-a000041d2695
PipelineRun Status: Running


StepRunId: b5c9ef9d-69db-477c-bbf2-4f81191d1477
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/b5c9ef9d-69db-477c-bbf2-4f81191d1477?wsid=/subscriptions/efaef50b-3a01-4bf1-ad06-b63c101ab300/resourcegroups/resource-group-1/workspaces/myworkspace&tid=cb956b3e-0e1a-485c-a395-a000041d2695
StepRun( Prepare Data ) Status: NotStarted
StepRun( Prepare Data ) Status: Running

StepRun(Prepare Data) Execution Summary
StepRun( Prepare Data ) Status: Finished
{'runId': 'b5c9ef9d-69db-477c-bbf2-4f81191d1477', 'target': 'ComputerCluster', 'status': 'Completed', 'startTimeUtc': '2023-01-16T12:54:20.824285Z', 'endTimeUtc': '2023-01-16T12:54:45.376452Z', '




StepRunId: 8df3a0ba-7a40-4c2d-9b84-a1f67602570a
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/8df3a0ba-7a40-4c2d-9b84-a1f67602570a?wsid=/subscriptions/efaef50b-3a01-4bf1-ad06-b63c101ab300/resourcegroups/resource-group-1/workspaces/myworkspace&tid=cb956b3e-0e1a-485c-a395-a000041d2695
StepRun( Train and Register Model ) Status: Running

StepRun(Train and Register Model) Execution Summary
StepRun( Train and Register Model ) Status: Finished
{'runId': '8df3a0ba-7a40-4c2d-9b84-a1f67602570a', 'target': 'ComputerCluster', 'status': 'Completed', 'startTimeUtc': '2023-01-16T12:55:12.490638Z', 'endTimeUtc': '2023-01-16T12:55:29.377472Z', 'services': {}, 'properties': {'ContentSnapshotId': '329170f4-0f15-454c-bfc1-4a02dc486f18', 'StepType': 'PythonScriptStep', 'ComputeTargetType': 'AmlCompute', 'azureml.moduleid': 'bf3ec571-dd7e-431c-a1f5-545e5620f870', 'azureml.moduleName': 'Train and Register Model', 'azureml.runsource': 'azureml.StepRun', 'azureml.nodeid': '88404573', 'az



PipelineRun Execution Summary
PipelineRun Status: Finished
{'runId': '4557242f-6379-4461-83b9-d55e61d7638e', 'status': 'Completed', 'startTimeUtc': '2023-01-16T12:51:39.060067Z', 'endTimeUtc': '2023-01-16T12:55:30.724886Z', 'services': {}, 'properties': {'azureml.runsource': 'azureml.PipelineRun', 'runSource': 'Unavailable', 'runType': 'HTTP', 'azureml.parameters': '{}', 'azureml.continue_on_step_failure': 'False', 'azureml.continue_on_failed_optional_input': 'True', 'azureml.pipelineid': 'b89309ef-15d6-4c40-a3bd-72468312e4d0', 'azureml.pipelineComponent': 'pipelinerun', 'azureml.pipelines.stages': '{"Initialization":null,"Execution":{"StartTime":"2023-01-16T12:51:39.2855349+00:00","EndTime":"2023-01-16T12:55:30.66766+00:00","Status":"Finished"}}'}, 'inputDatasets': [], 'outputDatasets': [], 'logFiles': {'logs/azureml/executionlogs.txt': 'https://myworkspace4931631789.blob.core.windows.net/azureml/ExperimentRun/dcid.4557242f-6379-4461-83b9-d55e61d7638e/logs/azureml/executionlogs.txt?

'Finished'

In [32]:
from azureml.pipeline.core import ScheduleRecurrence, Schedule
# Submit the pipeline every Monday at 09:00 (see time_of_day; presumably UTC — TODO confirm the time zone)
recurrence = ScheduleRecurrence(frequency="Week", interval=1, week_days=["Monday"], time_of_day="09:00")

# Attach the recurrence to the published pipeline; scheduled runs go to the same experiment
weekly_schedule = Schedule.create(ws, name="weekly-diabetes-training-1", 
                                  description="Based on time",
                                  pipeline_id=published_pipeline.id, 
                                  experiment_name='mslearn-diabetes-pipeline-2', 
                                  recurrence=recurrence)
print('Pipeline scheduled.')


Pipeline scheduled.


In [33]:
schedules = Schedule.list(ws)
schedules

[Pipeline(Name: weekly-diabetes-training-1,
 Id: 0fba7c2f-b323-4ace-8caf-5b2b03cf9781,
 Status: Active,
 Pipeline Id: b89309ef-15d6-4c40-a3bd-72468312e4d0,
 Pipeline Endpoint Id: None,
 Recurrence Details: Runs at 9:00 on Monday every Week),
 Pipeline(Name: weekly-diabetes-training,
 Id: 54c322c6-838b-4ad7-baa3-4021199567c2,
 Status: Active,
 Pipeline Id: 84cd4e98-8fae-4159-9d8b-29ef8445e527,
 Pipeline Endpoint Id: None,
 Recurrence Details: Runs at 0:00 on Monday every Week)]

In [34]:
pipeline_experiment = ws.experiments.get('mslearn-diabetes-pipeline-2')
# Only the first run is needed, so take one item from get_runs() instead of
# loading the experiment's entire run history into a list.
latest_run = next(iter(pipeline_experiment.get_runs()), None)
if latest_run is None:
    # Same exception type that indexing an empty list would have raised
    raise IndexError("experiment 'mslearn-diabetes-pipeline-2' has no runs yet")

latest_run.get_details()

{'runId': '7d88b786-394f-43ad-b21c-2767e04da040',
 'status': 'Running',
 'startTimeUtc': '2023-01-16T13:03:11.15337Z',
 'services': {},
 'properties': {'azureml.git.repository_uri': 'https://github.com/wliu40/mslearn-dp100.git',
  'mlflow.source.git.repoURL': 'https://github.com/wliu40/mslearn-dp100.git',
  'azureml.git.branch': 'main',
  'mlflow.source.git.branch': 'main',
  'azureml.git.commit': 'cc1f6002f88edd30301472f299aa6cc8886b2a8d',
  'mlflow.source.git.commit': 'cc1f6002f88edd30301472f299aa6cc8886b2a8d',
  'azureml.git.dirty': 'True',
  'azureml.runsource': 'azureml.PipelineRun',
  'runSource': 'Unavailable',
  'runType': 'Schedule',
  'azureml.parameters': '{}',
  'azureml.continue_on_step_failure': 'False',
  'azureml.continue_on_failed_optional_input': 'True',
  'azureml.pipelineid': 'b89309ef-15d6-4c40-a3bd-72468312e4d0',
  'azureml.pipelineComponent': 'pipelinerun'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'logs/azureml/executionlogs.txt': 'https://mywo

#### Create real-time inference service

In [37]:
# Look up the registered model by name (the printed version shows which one was resolved)
model = ws.models['diabetes_model']
print(model.name, 'version', model.version)


diabetes_model version 17


In [38]:
model

Model(workspace=Workspace.create(name='myworkspace', subscription_id='efaef50b-3a01-4bf1-ad06-b63c101ab300', resource_group='resource-group-1'), name=diabetes_model, id=diabetes_model:17, version=17, tags={'training context': 'pipeline_2'}, properties={'AUC': '0.8832546157718848', 'ACC': '0.898'})


The web service where we deploy the model will need some Python code to load the input data, get the model from the workspace, and generate and return predictions. We'll save this code in an entry script (often called a scoring script) that will be deployed to the web service.

The script consists of two functions:

- `init()`: This function is called when the service is initialized, and is generally used to load the model. Note that the scoring script uses the `AZUREML_MODEL_DIR` environment variable to determine the folder where the model is stored.

- `run(raw_data)`: This function is called each time a client application submits new data, and is generally used to generate predictions from the model.

In [39]:
import os

# Create a folder for the deployment files (used as the inference source directory)
deployment_folder = './diabetes_service_2'
os.makedirs(deployment_folder, exist_ok=True)
print(deployment_folder, 'folder created.')

# Set path for scoring script; the next cell writes the script to this path
script_file = 'score_diabetes.py'
script_path = os.path.join(deployment_folder,script_file)


./diabetes_service_2 folder created.


In [40]:
%%writefile $script_path
import json
import joblib
import numpy as np
import os

# Called when the service is loaded
def init():
    """Load the deployed model into the module-level ``model`` variable.

    The folder containing the model file is taken from the
    ``AZUREML_MODEL_DIR`` environment variable.
    """
    global model
    model_dir = os.getenv('AZUREML_MODEL_DIR')
    model_path = os.path.join(model_dir, 'diabetes_model.pkl')
    print(f"model path is {model_path}")
    model = joblib.load(model_path)

# Called when a request is received
def run(raw_data):
    """Score one request and return the predicted class names as a JSON string.

    ``raw_data`` is a JSON document whose ``data`` entry holds the feature rows.
    Each prediction (0 or 1) of the module-level ``model`` is mapped to its
    class name before the list is serialized.
    """
    classnames = ['not-diabetic', 'diabetic']
    payload = json.loads(raw_data)
    feature_rows = np.array(payload['data'])
    labels = model.predict(feature_rows)
    return json.dumps([classnames[label] for label in labels])

Writing ./diabetes_service_2\score_diabetes.py


In [41]:
# Debug check: AZUREML_MODEL_DIR is not set in this notebook session, so this prints None;
# the scoring script reads the same variable inside the deployed service.
path = os.getenv('AZUREML_MODEL_DIR')
print(path)

None


In [45]:
from azureml.core.model import InferenceConfig
from azureml.core import Environment
from azureml.core.webservice import AciWebservice

In [49]:
# Configure the scoring environment (a curated scikit-learn inference environment)
# NOTE(review): training used an unpinned scikit-learn on Python 3.6 while this environment
# is scikit-learn 0.24.1 on Python 3.7 — confirm the pickled model loads across versions.
service_env = Environment.get(workspace=ws, name="AzureML-sklearn-0.24.1-ubuntu18.04-py37-cpu-inference")
service_env.inferencing_stack_version="latest"

# Entry script and environment that the web service runs
inference_config = InferenceConfig(source_directory=deployment_folder,
                    entry_script=script_file,
                    environment=service_env)

# Container size for the Azure Container Instances deployment
deployment_config = AciWebservice.deploy_configuration(cpu_cores=1, memory_gb=1)

# Deploy the model as a service (overwrite=True replaces an existing service with the same name)
print('Deploying model...')
service_name = "diabetes-service-2"
service = Model.deploy(ws, service_name, [model], inference_config, deployment_config, overwrite=True)
service.wait_for_deployment(True)
print(service.state)



Deploying model...


azureml.core.model:
To leverage new model deployment capabilities, AzureML recommends using CLI/SDK v2 to deploy models as online endpoint, 
please refer to respective documentations 
https://docs.microsoft.com/azure/machine-learning/how-to-deploy-managed-online-endpoints /
https://docs.microsoft.com/azure/machine-learning/how-to-attach-kubernetes-anywhere 
For more information on migration, see https://aka.ms/acimoemigration. 


Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2023-01-16 08:42:42-05:00 Creating Container Registry if not exists.
2023-01-16 08:42:42-05:00 Registering the environment.
2023-01-16 08:42:42-05:00 Use the existing image.
2023-01-16 08:42:43-05:00 Submitting deployment to compute.
2023-01-16 08:42:48-05:00 Checking the status of deployment diabetes-service-2..
2023-01-16 08:44:53-05:00 Checking the status of inference endpoint diabetes-service-2.
Succeeded
ACI service creation operation finished, operation "Succeeded"
Healthy


In [50]:
for webservice_name in ws.webservices:
    print(webservice_name)

diabetes-service-2


In [51]:
import json

# One patient, with the 8 feature values in the order used for training
x_new = [[2,180,74,24,21,23.9091702,1.488172308,22]]
print ('Patient: {}'.format(x_new[0]))

# Convert the array to a serializable list in a JSON document
input_json = json.dumps({"data": x_new})

# Call the web service through the SDK, passing the input data (the web service will also accept the data in binary format)
predictions = service.run(input_data = input_json)

# The service returns a JSON string; the predicted class is the first (and only) item.
predicted_classes = json.loads(predictions)
print(predicted_classes[0])

Patient: [2, 180, 74, 24, 21, 23.9091702, 1.488172308, 22]
diabetic


In [52]:
endpoint = service.scoring_uri
print(endpoint)

http://91afb9e1-a82b-4a66-b159-6ed8e14249aa.eastus2.azurecontainer.io/score


In [53]:
import requests
import json

# Two patients, each with the 8 feature values in the order used for training
x_new = [[2,180,74,24,21,23.9091702,1.488172308,22],
         [0,148,58,11,179,39.19207553,0.160829008,45]]

# Convert the array to a serializable list in a JSON document
input_json = json.dumps({"data": x_new})

# Set the content type
headers = { 'Content-Type':'application/json' }

predictions = requests.post(endpoint, input_json, headers = headers)
# Surface HTTP failures here instead of a harder-to-read JSON decoding error below
predictions.raise_for_status()
# The scoring script returns a JSON string, so the response body has to be decoded twice
predicted_classes = json.loads(predictions.json())

# Pair every patient with its prediction
for patient, predicted_class in zip(x_new, predicted_classes):
    print ("Patient {}".format(patient), predicted_class )

Patient [2, 180, 74, 24, 21, 23.9091702, 1.488172308, 22] diabetic
Patient [0, 148, 58, 11, 179, 39.19207553, 0.160829008, 45] diabetic


#### Batch Inference

In [55]:
import os
# Create a folder for the batch scoring script and its environment file
experiment_folder = 'batch_pipeline_2'
os.makedirs(experiment_folder, exist_ok=True)

print(experiment_folder)

batch_pipeline_2


In [56]:
%%writefile $experiment_folder/batch_diabetes.py
import os
import numpy as np
from azureml.core import Model
import joblib


def init():
    """Load the registered 'diabetes_model' into the module-level ``model``.

    Runs when the pipeline step is initialized.
    """
    global model
    model = joblib.load(Model.get_model_path('diabetes_model'))


def run(mini_batch):
    """Score every file of a mini batch and return one "<file name>: <prediction>" string per file."""

    def score_file(file_path):
        # Read the comma-delimited values of the file into an array
        values = np.genfromtxt(file_path, delimiter=',')
        # The model expects a 2-dimensional array, so present the values as a single row
        predicted = model.predict(values.reshape(1, -1))
        return "{}: {}".format(os.path.basename(file_path), predicted[0])

    return [score_file(file_path) for file_path in mini_batch]

Writing batch_pipeline_2/batch_diabetes.py


In [57]:
%%writefile $experiment_folder/batch_environment.yml
# Conda specification for the environment that runs the batch scoring script.
name: batch_environment
dependencies:
- python=3.6.2
- scikit-learn
- pip
- pip:
  - azureml-defaults

Writing batch_pipeline_2/batch_environment.yml


In [58]:
from azureml.core import Environment
from azureml.core.runconfig import DEFAULT_CPU_IMAGE

# Create an Environment for the batch scoring step from the conda file written above.
# NOTE(review): the environment is named "experiment_env" here while the YAML file
# declares "batch_environment" — confirm which name is intended.
batch_env = Environment.from_conda_specification("experiment_env", experiment_folder + "/batch_environment.yml")
batch_env.docker.base_image = DEFAULT_CPU_IMAGE
print('Configuration ready.')

Configuration ready.


In [74]:
from azureml.pipeline.steps import ParallelRunConfig, ParallelRunStep
from azureml.data import OutputFileDatasetConfig

# Build the input dataset inside this cell so that the step definition does not
# depend on a cell further down the notebook having been executed first
# (on a fresh kernel, Restart & Run All would otherwise hit a NameError here).
from azureml.core import Dataset

default_ds = ws.get_default_datastore()
batch_data_set = Dataset.File.from_files(path=(default_ds, 'batch-data/'), validate=False)

# Named output of the step; the results are later fetched under this name
output_dir = OutputFileDatasetConfig(name='inferences')

# How the scoring script is executed: which script, in which environment,
# on which compute target, and how results are combined
parallel_run_config = ParallelRunConfig(
    source_directory=experiment_folder,
    entry_script="batch_diabetes.py",
    mini_batch_size="5",
    error_threshold=10,
    output_action="append_row",
    environment=batch_env,
    compute_target='ComputerCluster',
    node_count=1)

# The single pipeline step: feeds the file dataset to the scoring script
parallelrun_step = ParallelRunStep(
    name='batch-score-diabetes',
    parallel_run_config=parallel_run_config,
    inputs=[batch_data_set.as_named_input('diabetes_batch')],
    output=output_dir,
    arguments=[],
    allow_reuse=True
)

print('Steps defined')

Steps defined


In [65]:

# Use the workspace's default datastore as the source of the input files
default_ds = ws.get_default_datastore()

# Build a file dataset over the 'batch-data/' folder of that datastore
# (this cell only creates the dataset object; it does not register it)
batch_data_set = Dataset.File.from_files(
    path=(default_ds, 'batch-data/'),
    validate=False,
)


Uploading files to datastore...


In [66]:
# Display the dataset object to inspect its definition
batch_data_set

{
  "source": [
    "('workspaceblobstore', 'batch-data/')"
  ],
  "definition": [
    "GetDatastoreFiles"
  ]
}

In [70]:
# List the dataset's files once and reuse the result for both the count and
# the preview, instead of calling to_path() twice
batch_file_paths = batch_data_set.to_path()

print(f"How many files in batch data: {len(batch_file_paths)}")
print("Show 10 files in this list")
for file_path in batch_file_paths[:10]:
    print(file_path)

How many files in batch data: 100
Show 10 files in this list
/1.csv
/10.csv
/100.csv
/11.csv
/12.csv
/13.csv
/14.csv
/15.csv
/16.csv
/17.csv


In [75]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

# Wrap the single parallel-run step in a pipeline
pipeline = Pipeline(workspace=ws, steps=[parallelrun_step])

# Submit it under the 'mslearn-diabetes-batch' experiment
pipeline_run = Experiment(workspace=ws, name='mslearn-diabetes-batch').submit(pipeline)

# Block until the run ends; the returned status is the cell's output
pipeline_run.wait_for_completion(show_output=True)

Created step batch-score-diabetes [0dc54f4c][6ff81a5d-40e4-415c-a575-d32fcf47f389], (This step is eligible to reuse a previous run's output)
Submitted PipelineRun 02f13632-5a60-4dcb-9e78-86ffb84d5a6c
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/02f13632-5a60-4dcb-9e78-86ffb84d5a6c?wsid=/subscriptions/efaef50b-3a01-4bf1-ad06-b63c101ab300/resourcegroups/resource-group-1/workspaces/myworkspace&tid=cb956b3e-0e1a-485c-a395-a000041d2695
PipelineRunId: 02f13632-5a60-4dcb-9e78-86ffb84d5a6c
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/02f13632-5a60-4dcb-9e78-86ffb84d5a6c?wsid=/subscriptions/efaef50b-3a01-4bf1-ad06-b63c101ab300/resourcegroups/resource-group-1/workspaces/myworkspace&tid=cb956b3e-0e1a-485c-a395-a000041d2695
PipelineRun Status: Running


StepRunId: c1cab539-3458-41d6-a1ef-288ea7f0d9cd
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/c1cab539-3458-41d6-a1ef-288ea7f0d9cd?wsid=/subscriptions/efaef50b-3a01-4bf1-ad06-b63c101ab300/r



PipelineRun Execution Summary
PipelineRun Status: Finished
{'runId': '02f13632-5a60-4dcb-9e78-86ffb84d5a6c', 'status': 'Completed', 'startTimeUtc': '2023-01-16T14:02:46.121071Z', 'endTimeUtc': '2023-01-16T14:07:41.891484Z', 'services': {}, 'properties': {'azureml.runsource': 'azureml.PipelineRun', 'runSource': 'SDK', 'runType': 'SDK', 'azureml.parameters': '{}', 'azureml.continue_on_step_failure': 'False', 'azureml.continue_on_failed_optional_input': 'True', 'azureml.pipelineComponent': 'pipelinerun', 'azureml.pipelines.stages': '{"Initialization":null,"Execution":{"StartTime":"2023-01-16T14:02:46.4100944+00:00","EndTime":"2023-01-16T14:07:41.8288136+00:00","Status":"Finished"}}'}, 'inputDatasets': [], 'outputDatasets': [], 'logFiles': {'logs/azureml/executionlogs.txt': 'https://myworkspace4931631789.blob.core.windows.net/azureml/ExperimentRun/dcid.02f13632-5a60-4dcb-9e78-86ffb84d5a6c/logs/azureml/executionlogs.txt?sv=2019-07-07&sr=b&sig=hxQu0W2DHFjczARhKHokkIQvwdB4LG839FmqhoBtdU0%3D

'Finished'

When the pipeline has finished running, the resulting predictions are saved in the output of the first (and only) step of the pipeline run. You can retrieve them as follows:

In [76]:
import pandas as pd
import shutil

# Remove the local results folder if left over from a previous run
shutil.rmtree('diabetes-results', ignore_errors=True)

# Get the run for the first step and download its output
prediction_run = next(pipeline_run.get_children())
prediction_output = prediction_run.get_output_data('inferences')
prediction_output.download(local_path='diabetes-results')

# Traverse the folder hierarchy and find the results file.
# Start from None so a `result_file` left in the kernel by an earlier
# execution can never be reused silently when nothing is found this time.
result_file = None
for root, dirs, files in os.walk('diabetes-results'):
    for file in files:
        if file.endswith('parallel_run_step.txt'):
            result_file = os.path.join(root, file)

if result_file is None:
    raise FileNotFoundError(
        "No file ending in 'parallel_run_step.txt' was found under 'diabetes-results'")

# The scoring script emits "<file name>: <prediction>" entries, so split on ':'
df = pd.read_csv(result_file, delimiter=":", header=None)
df.columns = ["File", "Prediction"]

# Display the first 20 results
df.head(20)

Unnamed: 0,File,Prediction
0,1.csv,1
1,10.csv,1
2,100.csv,1
3,11.csv,1
4,12.csv,1
5,13.csv,1
6,14.csv,1
7,15.csv,1
8,16.csv,1
9,17.csv,1


#### Publish the pipeline

In [77]:
# Publish the pipeline that produced this run under a name and version
published_pipeline = pipeline_run.publish_pipeline(
    name='diabetes-batch-pipeline',
    description='Batch scoring of diabetes data',
    version='1.0',
)

# Display the published pipeline
published_pipeline


Name,Id,Status,Endpoint
diabetes-batch-pipeline,8c31892f-dd77-40bd-a83f-6cecfa61bc60,Active,REST Endpoint


In [79]:
# Show the URL of the published pipeline's REST endpoint
published_pipeline.endpoint

'https://eastus2.api.azureml.ms/pipelines/v1.0/subscriptions/efaef50b-3a01-4bf1-ad06-b63c101ab300/resourceGroups/resource-group-1/providers/Microsoft.MachineLearningServices/workspaces/myworkspace/PipelineRuns/PipelineSubmit/8c31892f-dd77-40bd-a83f-6cecfa61bc60'

#### Call the published pipeline through its REST endpoint

In [80]:
from azureml.core.authentication import InteractiveLoginAuthentication

# Imported here so the cell does not rely on an import made elsewhere
import requests

# Obtain an authorization header through an interactive login
interactive_auth = InteractiveLoginAuthentication()
auth_header = interactive_auth.get_authentication_header()
print('Authentication header ready.')


rest_endpoint = published_pipeline.endpoint

# Trigger the published pipeline; the JSON body names the experiment to run under
response = requests.post(rest_endpoint,
                         headers=auth_header,
                         json={"ExperimentName": "mslearn-diabetes-batch"})

# Surface an HTTP failure as such, rather than as a KeyError/JSON error on the
# "Id" lookup below
response.raise_for_status()

# The response carries the id of the run that was started
run_id = response.json()["Id"]
run_id


Authentication header ready.


'd9171399-8b0b-40d1-81dc-b1fa4860e7d1'

In [81]:
from azureml.pipeline.core.run import PipelineRun
from azureml.widgets import RunDetails

# Attach to the run started through the REST call, using the returned run id
published_pipeline_run = PipelineRun(
    experiment=ws.experiments['mslearn-diabetes-batch'],
    run_id=run_id,
)

# Block until the run completes; the returned status is the cell's output
published_pipeline_run.wait_for_completion(show_output=True)

PipelineRunId: d9171399-8b0b-40d1-81dc-b1fa4860e7d1
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/d9171399-8b0b-40d1-81dc-b1fa4860e7d1?wsid=/subscriptions/efaef50b-3a01-4bf1-ad06-b63c101ab300/resourcegroups/resource-group-1/workspaces/myworkspace&tid=cb956b3e-0e1a-485c-a395-a000041d2695

PipelineRun Execution Summary
PipelineRun Status: Finished
{'runId': 'd9171399-8b0b-40d1-81dc-b1fa4860e7d1', 'status': 'Completed', 'startTimeUtc': '2023-01-16T14:34:54.713814Z', 'endTimeUtc': '2023-01-16T14:34:55.584544Z', 'services': {}, 'properties': {'azureml.runsource': 'azureml.PipelineRun', 'runSource': 'Unavailable', 'runType': 'HTTP', 'azureml.parameters': '{}', 'azureml.continue_on_step_failure': 'False', 'azureml.continue_on_failed_optional_input': 'True', 'azureml.pipelineid': '8c31892f-dd77-40bd-a83f-6cecfa61bc60', 'azureml.pipelineComponent': 'pipelinerun', 'azureml.pipelines.stages': '{"Initialization":null,"Execution":{"StartTime":"2023-01-16T14:34:54.9831238+00:00","

'Finished'

In [82]:
import pandas as pd
import shutil

# Remove the local results folder if left over from a previous run
shutil.rmtree('diabetes-results', ignore_errors=True)

# Get the first step of the run that was triggered through the REST endpoint
# (published_pipeline_run, not the earlier SDK-submitted pipeline_run) and
# download its output
prediction_run = next(published_pipeline_run.get_children())
prediction_output = prediction_run.get_output_data('inferences')
prediction_output.download(local_path='diabetes-results')

# Traverse the folder hierarchy and find the results file.
# Start from None so a `result_file` left in the kernel by an earlier cell
# can never be reused silently when nothing is found this time.
result_file = None
for root, dirs, files in os.walk('diabetes-results'):
    for file in files:
        if file.endswith('parallel_run_step.txt'):
            result_file = os.path.join(root, file)

if result_file is None:
    raise FileNotFoundError(
        "No file ending in 'parallel_run_step.txt' was found under 'diabetes-results'")

# The scoring script emits "<file name>: <prediction>" entries, so split on ':'
df = pd.read_csv(result_file, delimiter=":", header=None)
df.columns = ["File", "Prediction"]

# Display the first 20 results
df.head(20)

Unnamed: 0,File,Prediction
0,1.csv,1
1,10.csv,1
2,100.csv,1
3,11.csv,1
4,12.csv,1
5,13.csv,1
6,14.csv,1
7,15.csv,1
8,16.csv,1
9,17.csv,1
