In [22]:
### How to log 
### How to register a dataset
### How to retrieve data from datastore / workspace (registered)
### Check all experiments in a workspace
### Check all runs in an experiment
### Get a specific run by its tag/property
### Tag runs, add properties

### How to register a model
### Review all registered models
### Retrieve a model by its tag


### How to create a new environment
### How to register an environment


### How to create a new compute cluster
### How to create a run config for pipelines


In [1]:
from azureml.core import Workspace, Experiment, Run, ScriptRunConfig
from azureml.core.runconfig import DockerConfiguration
from azureml.widgets import RunDetails
import os
import sys

In [2]:
# Folder that receives the files written by the %%writefile cells of this notebook.
script_dir = 'playground'
# exist_ok=True makes the cell safe to re-run when the folder is already there.
os.makedirs(script_dir, exist_ok=True)


In [3]:
%%writefile $script_dir/data_prep.py
import os
import argparse
from azureml.core import Run
import pandas as pd
from sklearn.processing import MinMaxScaler

parser = argparse.ArgumentParser()
parser.add_argument('--input_data', type=str, dest='raw_dataset', help='Raw dataset')
parser.add_argument('--preped_data', type=str, dest='preped_dataset', help='Prepared dataset')
args = parser.parse_args()

saved_dir = args.preped_data
run = Run.get_context()

df = run.input_datasets['raw_dataset'].to_pandas_dataframe()

num_features = ['Pregnancies','PlasmaGlucose','DiastolicBloodPressure',
                'TricepsThickness','SerumInsulin','BMI','DiabetesPedigree']
scaler = MinMaxScaler()
df[num_features] = scaler.fit(df[num_features])

print('saving data to preped_data')
os.makedirs(saved_dir, exist_ok=True)
df.to_csv(os.path.join(saved_dir, 'preped_data.csv'), index=False)
run.complete()

Overwriting playground/data_prep.py


In [4]:
%%writefile $script_dir/train.py

import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import argparse
import numpy as np
from azureml.core import Run, Model
import joblib

parser = argparse.ArgumentParser()
parser.add_argument('--input_data', type=str, dest='input_data', help='Input dataset for model')
args = parser.parse_args()

run = Run.get_context()
df = run.input_datsets['input_data'].to_pandas_dataframe()

X_train, y_train, X_test, y_test = train_test_split(df, test_size=.3, random_state=0)

model = DecisionTreeClassifier().fit(X_train, y_train)

y_pred = model.predict(X_test)

# ACC
acc = np.average(y_pred==y_test)
run.log(name='acc', value=acc)

# AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test, y_scores[:, 1])
run.log(name='auc', value=auc)

# ROC
fpr, tpr, threshold = roc_curve(y_test, y_score[:,1])
fig, ax = plt.subplots(figsize=(6, 6))
ax.plot([0,1], [0,1], 'k--')
ax.plot(fpr, tpr)
ax.title('ROC curve')
ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
run.log_image(name='roc_curve', plot=fig)

print('Saving model')
os.makedirs('models', exist_ok=True)

model_file_path = os.path.join('models', 'diabetes_model.pkl')
joblib.dump(model, model_file_path)

Model.register(workspace=ws,
              model_path=model_file,
              tags={"training context": "pipeline"},
              properties={'AUC': auc, "ACC": acc},
              model_Path=model_file_path)
run.complete()


Overwriting playground/train.py


In [5]:
# Connect to the workspace via Workspace.from_config() and fetch its default datastore.
ws = Workspace.from_config()
datastore = ws.get_default_datastore()


If you run your code in unattended mode, i.e., where you can't give a user input, then we recommend to use ServicePrincipalAuthentication or MsiAuthentication.
Please refer to aka.ms/aml-notebook-auth for different authentication mechanisms in azureml-sdk.


In [6]:
# Print one line per entry of ws.datasets (the registered dataset names).
for data in ws.datasets:
    print(data)


diabetes_dataset2
batch-data
diabetes file dataset
diabetes dataset
loan_data


In [7]:
# Membership test: is a dataset with this name present in ws.datasets?
'diabetes dataset' in ws.datasets

True

In [8]:
from glob import glob
# List the CSV files directly inside the local `data` folder.
glob('data/*.csv')

['data\\diabetes.csv', 'data\\diabetes2.csv']

In [9]:
# register dataset
from azureml.core import Dataset
from azureml.data.datapath import DataPath

dataset_name = 'diabetes_dataset2'
# If this dataset is already registered we only fetch it; otherwise we upload the files and register it.
# If the files were already uploaded, we will be notified, without re-uploading them.
if dataset_name not in ws.datasets:
    print("We will upload the files to cloud datastore, retrive it and register it")
    Dataset.File.upload_directory(src_dir='data', target=DataPath(datastore, dataset_name))
    # Datastore paths use forward slashes; os.path.join would insert a backslash
    # on Windows, so the glob pattern is built explicitly.
    tab_data = Dataset.Tabular.from_delimited_files(path=(datastore, dataset_name + '/*.csv'))
    try:
        tab_data = tab_data.register(workspace=ws,
                                    name=dataset_name,
                                    tags={"format": "csv"},
                                    description="diabetes dataset",
                                    create_new_version=True)
    except Exception as ex:
        print(ex)
else:
    print("dataset already exist")
    # Bind `tab_data` on this branch too, so cells that use it do not depend on
    # the upload branch having run in this kernel session.
    tab_data = ws.datasets.get(dataset_name)
    


dataset already exist


In [13]:
# Get a registered dataset from the workspace by name and preview its first 10 rows
ws.datasets.get(dataset_name).to_pandas_dataframe().head(10)

Unnamed: 0,PatientID,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
0,1354778,0,171,80,34,23,43.509726,1.213191,21,0
1,1147438,8,92,93,47,36,21.240576,0.158365,23,0
2,1640031,7,115,47,52,35,41.511523,0.079019,23,0
3,1883350,9,103,78,25,304,29.582192,1.28287,43,1
4,1424119,1,85,59,27,35,42.604536,0.549542,22,0
5,1619297,0,82,92,9,253,19.72416,0.103424,26,0
6,1660149,0,133,47,19,227,21.941357,0.17416,21,0
7,1458769,0,67,87,43,36,18.277723,0.236165,26,0
8,1201647,8,80,95,33,24,26.624929,0.443947,53,1
9,1403912,1,72,31,40,42,36.889576,0.103944,26,0


### Prepare the environment

In [16]:
%%writefile $script_dir/experiment_env.yml
# Conda specification written to experiment_env.yml
name: experiment_env_2
dependencies:
# Conda packages
- python=3.6.2
- scikit-learn
- ipykernel
- matplotlib
- pandas
- pip
# Packages listed under the pip section
- pip:
  - azureml-defaults
  - pyarrow

Writing playground/experiment_env.yml


In [18]:
from azureml.core import Environment

# Build the Python environment for the experiment from the conda .yml file
experiment_env = Environment.from_conda_specification(
    name="experiment_env_2",
    file_path=script_dir + "/experiment_env.yml",
)

# Register it in the workspace, then read the registered environment back by name
experiment_env.register(workspace=ws)
registered_env = Environment.get(workspace=ws, name='experiment_env_2')

### Prepare a compute cluster

In [20]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "ComputerCluster"

try:
    # Check for existing compute target
    pipeline_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # If it doesn't already exist, create it
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)
        pipeline_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        pipeline_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        print(ex)
        # Re-raise: without a cluster `pipeline_cluster` may be unbound or unusable,
        # and later cells would fail with a less helpful error.
        raise
    

Found existing cluster, use it.


### Set up the run configuration for the pipeline

In [21]:
from azureml.core.runconfig import RunConfiguration
# Create a new runconfig object for the pipeline
pipeline_run_config = RunConfiguration()

# Run on the compute target held in `pipeline_cluster`.
pipeline_run_config.target = pipeline_cluster

# Use the registered environment held in `registered_env` for the run.
pipeline_run_config.environment = registered_env

print ("Run configuration created.")

Run configuration created.


#### Define all the PythonScriptStep and intermediate data reference

In [24]:
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep

# Get the training dataset
diabetes_ds = ws.datasets.get("diabetes dataset")

# Create an OutputFileDatasetConfig (temporary Data Reference) for data passed from step 1 to step 2
prepped_data = OutputFileDatasetConfig("prepped_data")

# Step 1, Run the data prep script.
# script_name must name a file inside source_directory; this notebook writes
# data_prep.py and train.py into script_dir.
prep_step = PythonScriptStep(name = "Prepare Data",
                                source_directory = script_dir,
                                script_name = "data_prep.py",
                                arguments = ['--input-data', diabetes_ds.as_named_input('raw_data'),
                                             '--prepped-data', prepped_data],
                                compute_target = pipeline_cluster,
                                runconfig = pipeline_run_config,
                                allow_reuse = True)

# Step 2, run the training script.
# train.py declares its input flag as --input_data.
train_step = PythonScriptStep(name = "Train and Register Model",
                                source_directory = script_dir,
                                script_name = "train.py",
                                arguments = ['--input_data', prepped_data.as_input()],
                                compute_target = pipeline_cluster,
                                runconfig = pipeline_run_config,
                                allow_reuse = True)

print("Pipeline steps defined")

Pipeline steps defined


In [None]:
# Preview the first 10 rows; requires `tab_data` from the dataset registration cell.
tab_data.to_pandas_dataframe().head(10)

In [None]:
%%writefile $script_dir/data_prep.py
# Import libraries
import os
import argparse
import pandas as pd
from azureml.core import Run
from sklearn.preprocessing import MinMaxScaler

# Get parameters
parser = argparse.ArgumentParser()
parser.add_argument("--input-data", type=str, dest='raw_dataset_id', help='raw dataset')
parser.add_argument('--prepped-data', type=str, dest='prepped_data', default='prepped_data', help='Folder for results')
args = parser.parse_args()
save_folder = args.prepped_data

# Get the experiment run context
run = Run.get_context()

# load the data (passed as an input dataset)
print("Loading Data...")
diabetes = run.input_datasets['raw_data'].to_pandas_dataframe()

# Log raw row count
row_count = (len(diabetes))
run.log('raw_rows', row_count)

# remove nulls
diabetes = diabetes.dropna()

# Normalize the numeric columns
scaler = MinMaxScaler()
num_cols = ['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree']
diabetes[num_cols] = scaler.fit_transform(diabetes[num_cols])

# Log processed rows
row_count = len(diabetes)
run.log('processed_rows', row_count)

# Save the prepped data
print("Saving Data...")
os.makedirs(save_folder, exist_ok=True)
save_path = os.path.join(save_folder,'data.csv')
diabetes.to_csv(save_path, index=False, header=True)

# End the run
run.complete()