In [1]:
### How to log metrics
### How to register a dataset
### How to retrieve data from a datastore / workspace (registered)
### Check all experiments in a workspace
### Check all runs in an experiment
### Get a specific run by its tag/property
### Tag runs, add properties

### How to register a model
### Review all registered models
### Retrieve a model by its tag


### How to create a new environment
### How to register an environment


### How to create a new compute cluster
### How to create a run config for pipelines


In [2]:
from azureml.core import Workspace, Experiment, Run, ScriptRunConfig
from azureml.core.runconfig import DockerConfiguration
from azureml.widgets import RunDetails
import os
import sys

In [3]:
# Local folder that receives the scripts and environment file written by the
# %%writefile cells; it is later used as the pipeline steps' source directory.
script_dir = "playground"
os.makedirs(name=script_dir, exist_ok=True)


#### A couple of things need attention:
- the argument names `--input-data` and `--preped-data` must be identical in the script's `argparse` definitions and in the `arguments` list of the `PythonScriptStep`
- the file name must be consistent: the name used with `OutputFileDatasetConfig(...)` / the file written by the first step must match the file the second step reads
- the input for the initial step: `diabetes_ds.as_named_input('raw_data')`
- the input for the second step: `preped_data.as_input()`

In [4]:
%%writefile $script_dir/data_prep.py
"""Pipeline step 1: min-max scale the numeric diabetes features and save them as CSV.

Reads the dataset attached to the run as the named input ``raw_data``, scales the
numeric feature columns to [0, 1], and writes ``preped_data.csv`` into the folder
given by ``--preped-data`` so the next step can pick it up.
"""
import os
import argparse
from azureml.core import Run
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

parser = argparse.ArgumentParser()
parser.add_argument('--input-data', type=str, dest='raw_data', help='Raw dataset')
parser.add_argument('--preped-data', type=str, dest='preped_data', help='Prepared dataset')
args = parser.parse_args()

# Output folder handed over by the pipeline for the prepared data.
saved_dir = args.preped_data
run = Run.get_context()

# The dataset is looked up by the name it was attached under, not via args.raw_data.
df = run.input_datasets['raw_data'].to_pandas_dataframe()
run.log(name='data shape', value=str(df.shape[0]))

# Single source of truth for the columns that are both logged and scaled
# (previously two identical lists that could drift apart).
num_features = ['Pregnancies', 'PlasmaGlucose', 'DiastolicBloodPressure',
                'TricepsThickness', 'SerumInsulin', 'BMI', 'DiabetesPedigree']

run.log_list("numerical cols", num_features)

scaler = MinMaxScaler()
df[num_features] = scaler.fit_transform(df[num_features])

print('saving data to preped_data')
os.makedirs(saved_dir, exist_ok=True)
# The file name must match the one train.py reads.
df.to_csv(os.path.join(saved_dir, 'preped_data.csv'), index=False)
run.complete()

Overwriting playground/data_prep.py


In [5]:
%%writefile $script_dir/train.py

"""Pipeline step 2: train a decision tree on the prepared data and register the model.

Reads ``preped_data.csv`` from the folder passed as ``--training-data``, logs
accuracy, AUC and a ROC plot to the run, then saves the model with joblib and
registers it in the run's workspace as ``diabetes_model``.
"""
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import argparse
import numpy as np
from azureml.core import Run, Model
import joblib

parser = argparse.ArgumentParser()
parser.add_argument('--training-data', type=str, dest='training_data', help='Input dataset for model')
args = parser.parse_args()

training_data_dir = args.training_data

run = Run.get_context()

# The previous step hands over a folder (not a named dataset), so read the CSV it wrote.
df = pd.read_csv(os.path.join(training_data_dir, 'preped_data.csv'))

features = ['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness',
            'SerumInsulin','BMI', 'DiabetesPedigree', 'Age']

# Separate features and labels
X, y = df[features].values, df['Diabetic'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=0)

# Seed the tree as well as the split so repeated runs give comparable metrics.
model = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)

y_pred = model.predict(X_test)

# ACC
acc = np.average(y_pred == y_test)
run.log(name='acc', value=acc)

# AUC (uses the predicted probability of the positive class)
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test, y_scores[:, 1])
run.log(name='auc', value=auc)

# ROC
fpr, tpr, threshold = roc_curve(y_test, y_scores[:, 1])
fig = plt.figure(figsize=(6, 4))
# Plot the diagonal 50% line
plt.plot([0, 1], [0, 1], 'k--')
# Plot the FPR and TPR achieved by our model
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
run.log_image(name = "ROC", plot = fig)
plt.show()

print('Saving model')
os.makedirs('models', exist_ok=True)

model_file_path = os.path.join('models', 'diabetes_model.pkl')
joblib.dump(value=model, filename=model_file_path)

# Use the builtin float: the np.float alias is deprecated and removed in NumPy >= 1.24.
Model.register(workspace=run.experiment.workspace,
               model_path=model_file_path,
               model_name='diabetes_model',
               tags={"training context": "pipeline_2"},
               properties={'AUC': float(auc), "ACC": float(acc)})
run.complete()


Overwriting playground/train.py


In [6]:
# Connect to the Azure ML workspace described by the saved workspace config.
ws = Workspace.from_config()
# Default datastore of the workspace; used later as the upload target for the CSV files.
datastore = ws.get_default_datastore()


If you run your code in unattended mode, i.e., where you can't give a user input, then we recommend to use ServicePrincipalAuthentication or MsiAuthentication.
Please refer to aka.ms/aml-notebook-auth for different authentication mechanisms in azureml-sdk.


In [7]:
# Print the name of every dataset registered in the workspace, one per line.
for registered_name in ws.datasets:
    print(registered_name)


diabetes_dataset2
batch-data
diabetes file dataset
diabetes dataset
loan_data


In [8]:
# Check whether a dataset with this exact name is registered in the workspace.
'diabetes dataset' in ws.datasets

True

In [9]:
from glob import glob
# List the CSV files directly inside the local 'data' folder (no recursion into sub-folders).
glob('data/*.csv')

['data\\diabetes.csv', 'data\\diabetes2.csv']

In [10]:
# Register the local CSV files as a tabular dataset.
from azureml.core import Dataset
from azureml.data.datapath import DataPath

dataset_name = 'diabetes_dataset2'
# If this dataset is already registered we do nothing; otherwise we upload the
# local 'data' folder to the datastore and register the CSV files found there.
if dataset_name not in ws.datasets:
    print("We will upload the files to cloud datastore, retrive it and register it")
    Dataset.File.upload_directory(src_dir='data', target=DataPath(datastore, dataset_name))
    # Datastore paths use '/' regardless of the local OS. os.path.join would yield
    # 'diabetes_dataset2\*.csv' on Windows, which is not a valid datastore glob.
    tab_data = Dataset.Tabular.from_delimited_files(path=(datastore, f"{dataset_name}/*.csv"))
    try:
        tab_data = tab_data.register(workspace=ws,
                                     name=dataset_name,
                                     tags={"format": "csv"},
                                     description="diabetes dataset",
                                     create_new_version=True)
    except Exception as ex:
        # Best effort: report the failure and keep the unregistered dataset object.
        print(ex)
else:
    print("dataset already exist")
    


dataset already exist


In [11]:
# Preview a registered dataset. take(10) limits the dataset to its first ten
# records before conversion, so only those rows are loaded into pandas instead
# of materialising the whole dataset just to display its head.
ws.datasets.get(dataset_name).take(10).to_pandas_dataframe()

Unnamed: 0,PatientID,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
0,1354778,0,171,80,34,23,43.509726,1.213191,21,0
1,1147438,8,92,93,47,36,21.240576,0.158365,23,0
2,1640031,7,115,47,52,35,41.511523,0.079019,23,0
3,1883350,9,103,78,25,304,29.582192,1.28287,43,1
4,1424119,1,85,59,27,35,42.604536,0.549542,22,0
5,1619297,0,82,92,9,253,19.72416,0.103424,26,0
6,1660149,0,133,47,19,227,21.941357,0.17416,21,0
7,1458769,0,67,87,43,36,18.277723,0.236165,26,0
8,1201647,8,80,95,33,24,26.624929,0.443947,53,1
9,1403912,1,72,31,40,42,36.889576,0.103944,26,0


### Prepare the environment

In [12]:
%%writefile $script_dir/experiment_env.yml
# Conda specification for the environment the pipeline steps run in.
# The name matches the one used when the environment is registered and retrieved.
name: experiment_env_2
dependencies:
# NOTE(review): Python 3.6 is end-of-life and the packages below are unpinned;
# confirm the resolved versions are still what the scripts expect.
- python=3.6.2
- scikit-learn
- ipykernel
- matplotlib
- pandas
- pip
# Packages installed through pip rather than conda.
- pip:
  - azureml-defaults
  - pyarrow
Overwriting playground/experiment_env.yml


In [13]:
from azureml.core import Environment

# Build the environment definition from the conda .yml file in script_dir.
experiment_env = Environment.from_conda_specification(
    name="experiment_env_2",
    file_path=script_dir + "/experiment_env.yml",
)

# Register it in the workspace, then fetch the registered copy by name.
experiment_env.register(workspace=ws)
registered_env = Environment.get(workspace=ws, name='experiment_env_2')

### Prepare a compute cluster

In [14]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "ComputerCluster"

try:
    # Check for existing compute target
    pipeline_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # If it doesn't already exist, create it
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)
        pipeline_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        pipeline_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        # Report the provisioning error, then re-raise: swallowing it would leave
        # `pipeline_cluster` undefined and fail later with an unrelated NameError.
        print(ex)
        raise
    

Found existing cluster, use it.


### Setup the run config for the pipeline

https://learn.microsoft.com/en-us/python/api/azureml-core/azureml.core.runconfig.runconfiguration?view=azure-ml-py

In [15]:
from azureml.core.runconfig import RunConfiguration

# One run configuration shared by both pipeline steps: it carries the compute
# cluster to run on and the registered environment to run in.
pipeline_run_config = RunConfiguration()
pipeline_run_config.target = pipeline_cluster
pipeline_run_config.environment = registered_env

print("Run configuration created.")

Run configuration created.


#### Define the PythonScriptSteps and the intermediate data reference

In [16]:
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep

# Registered dataset that feeds the first step.
diabetes_ds = ws.datasets.get("diabetes dataset")

# Intermediate output: written by step 1, consumed by step 2.
preped_data = OutputFileDatasetConfig("preped_data")

# Step 1: data preparation. The dataset is attached under the name 'raw_data',
# which is the key data_prep.py uses to look it up.
prep_step = PythonScriptStep(
    name="Prepare Data",
    source_directory=script_dir,
    script_name="data_prep.py",
    arguments=[
        '--input-data', diabetes_ds.as_named_input('raw_data'),
        '--preped-data', preped_data,
    ],
    compute_target=pipeline_cluster,
    runconfig=pipeline_run_config,
    allow_reuse=True,
)

# Step 2: training and model registration, fed by the output of step 1.
train_step = PythonScriptStep(
    name="Train and Register Model",
    source_directory=script_dir,
    script_name="train.py",
    arguments=['--training-data', preped_data.as_input()],
    compute_target=pipeline_cluster,
    runconfig=pipeline_run_config,
    allow_reuse=True,
)

print("Pipeline steps defined")

Pipeline steps defined


In [17]:
# NOTE(review): Experiment and RunDetails are already imported in the first import
# cell; only Pipeline is new here.
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

# Construct the pipeline from the two steps (data prep, then training).
pipeline_steps = [prep_step, train_step]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

Pipeline is built.


In [18]:
# Create an experiment and run the pipeline.
experiment = Experiment(workspace=ws, name = 'mslearn-diabetes-pipeline-2')
# regenerate_outputs=True makes every step run again instead of reusing earlier
# outputs, even though the steps are declared with allow_reuse=True.
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")
# Show the run widget, then block until the whole pipeline has finished.
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=True)

Created step Prepare Data [830f93a4][1e9d667b-4866-4d90-8474-29b91b3aa5d8], (This step will run and generate new outputs)
Created step Train and Register Model [88404573][bf3ec571-dd7e-431c-a1f5-545e5620f870], (This step will run and generate new outputs)
Submitted PipelineRun 03bee7ad-a0b8-40d5-b132-f3edf3a3056c
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/03bee7ad-a0b8-40d5-b132-f3edf3a3056c?wsid=/subscriptions/efaef50b-3a01-4bf1-ad06-b63c101ab300/resourcegroups/resource-group-1/workspaces/myworkspace&tid=cb956b3e-0e1a-485c-a395-a000041d2695
Pipeline submitted for execution.


_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

PipelineRunId: 03bee7ad-a0b8-40d5-b132-f3edf3a3056c
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/03bee7ad-a0b8-40d5-b132-f3edf3a3056c?wsid=/subscriptions/efaef50b-3a01-4bf1-ad06-b63c101ab300/resourcegroups/resource-group-1/workspaces/myworkspace&tid=cb956b3e-0e1a-485c-a395-a000041d2695
PipelineRun Status: Running


StepRunId: 3d0985dd-1002-4fbf-a7c8-40a17b8dbccb
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/3d0985dd-1002-4fbf-a7c8-40a17b8dbccb?wsid=/subscriptions/efaef50b-3a01-4bf1-ad06-b63c101ab300/resourcegroups/resource-group-1/workspaces/myworkspace&tid=cb956b3e-0e1a-485c-a395-a000041d2695
StepRun( Prepare Data ) Status: NotStarted
StepRun( Prepare Data ) Status: Running

StepRun(Prepare Data) Execution Summary
StepRun( Prepare Data ) Status: Finished
{'runId': '3d0985dd-1002-4fbf-a7c8-40a17b8dbccb', 'target': 'ComputerCluster', 'status': 'Completed', 'startTimeUtc': '2023-01-16T12:36:37.74835Z', 'endTimeUtc': '2023-01-16T12:37:01.843611Z', 's




StepRunId: 025ea179-0e8e-4871-928b-ca349b912d18
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/025ea179-0e8e-4871-928b-ca349b912d18?wsid=/subscriptions/efaef50b-3a01-4bf1-ad06-b63c101ab300/resourcegroups/resource-group-1/workspaces/myworkspace&tid=cb956b3e-0e1a-485c-a395-a000041d2695
StepRun( Train and Register Model ) Status: NotStarted

StepRun(Train and Register Model) Execution Summary
StepRun( Train and Register Model ) Status: Finished
{'runId': '025ea179-0e8e-4871-928b-ca349b912d18', 'target': 'ComputerCluster', 'status': 'Completed', 'startTimeUtc': '2023-01-16T12:37:10.884406Z', 'endTimeUtc': '2023-01-16T12:37:26.269079Z', 'services': {}, 'properties': {'ContentSnapshotId': '329170f4-0f15-454c-bfc1-4a02dc486f18', 'StepType': 'PythonScriptStep', 'ComputeTargetType': 'AmlCompute', 'azureml.moduleid': 'bf3ec571-dd7e-431c-a1f5-545e5620f870', 'azureml.moduleName': 'Train and Register Model', 'azureml.runsource': 'azureml.StepRun', 'azureml.nodeid': '88404573', 



PipelineRun Execution Summary
PipelineRun Status: Finished
{'runId': '03bee7ad-a0b8-40d5-b132-f3edf3a3056c', 'status': 'Completed', 'startTimeUtc': '2023-01-16T12:36:26.096379Z', 'endTimeUtc': '2023-01-16T12:37:27.326387Z', 'services': {}, 'properties': {'azureml.runsource': 'azureml.PipelineRun', 'runSource': 'SDK', 'runType': 'SDK', 'azureml.parameters': '{}', 'azureml.continue_on_step_failure': 'False', 'azureml.continue_on_failed_optional_input': 'True', 'azureml.pipelineComponent': 'pipelinerun', 'azureml.pipelines.stages': '{"Initialization":null,"Execution":{"StartTime":"2023-01-16T12:36:26.3751879+00:00","EndTime":"2023-01-16T12:37:27.2665943+00:00","Status":"Finished"}}'}, 'inputDatasets': [], 'outputDatasets': [], 'logFiles': {'logs/azureml/executionlogs.txt': 'https://myworkspace4931631789.blob.core.windows.net/azureml/ExperimentRun/dcid.03bee7ad-a0b8-40d5-b132-f3edf3a3056c/logs/azureml/executionlogs.txt?sv=2019-07-07&sr=b&sig=9YxxOdy19TwLWNKLx%2FA35iYpfqkk2934DtL6lpgmzXg%

'Finished'

In [21]:
from azureml.core import Model

# Print each registered model's name and version, followed by its tags and
# then its properties as "key: value" lines.
for model in Model.list(ws):
    print('*' * 20)
    print(model.name, 'version:', model.version)
    for metadata in (model.tags, model.properties):
        for key in metadata:
            print(f"{key}: {metadata[key]}")

********************
diabetes_model version: 15
training context: pipeline_2
AUC: 0.8887200152893158
ACC: 0.9024444444444445
********************
diabetes_model version: 14
Training context: Pipeline
AUC: 0.8825848234075178
Accuracy: 0.8971111111111111
********************
diabetes_model version: 13
Training context: Pipeline
AUC: 0.8855756630900167
Accuracy: 0.9
********************
diabetes_model version: 12
Training context: Pipeline
AUC: 0.8872129824694899
Accuracy: 0.9004444444444445
********************
diabetes_model version: 11
Training context: Inline Training
AUC: 0.8803323548435243
Accuracy: 0.8926666666666667
********************
diabetes_model version: 10
Training context: Inline Training
AUC: 0.8753520075625654
Accuracy: 0.888
********************
diabetes_model version: 9
Training context: Pipeline
AUC: 0.8852547024821248
Accuracy: 0.9002222222222223
********************
diabetes_model version: 8
Training context: Pipeline
AUC: 0.884408171643805
Accuracy: 0.8986666666666