# 1.Install the Azure Machine Learning SDK

In [1]:
!pip install --upgrade azureml-sdk azureml-widgets

Requirement already up-to-date: azureml-sdk in /anaconda/envs/azureml_py36/lib/python3.6/site-packages (1.38.0)
Requirement already up-to-date: azureml-widgets in /anaconda/envs/azureml_py36/lib/python3.6/site-packages (1.38.0)
Collecting jedi<=0.17.2,>=0.10
  Downloading jedi-0.17.2-py2.py3-none-any.whl (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 21.0 MB/s eta 0:00:01
Installing collected packages: jedi
  Attempting uninstall: jedi
    Found existing installation: jedi 0.18.0
    Uninstalling jedi-0.18.0:
      Successfully uninstalled jedi-0.18.0
Successfully installed jedi-0.17.2


# 2. Connect to your workspace

In [13]:
import azureml.core
from azureml.core import Workspace
from azureml.core import Workspace, Datastore, Dataset
from azureml.data.datapath import DataPath

# Load the workspace from the saved config file
ws = Workspace.from_config()
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))




Ready to use Azure ML 1.38.0 to work with session17workspace


# 3. Prepare data
### 3.1. Upload data onto the DataStore

In [4]:
import pandas as pd
import os

In [5]:
# Get the default datastore
default_ds = ws.get_default_datastore()

# Enumerate all datastores, indicating which is the default
for ds_name in ws.datastores:
    print(ds_name, "- Default =", ds_name == default_ds.name)

azureml_globaldatasets - Default = False
workspaceartifactstore - Default = False
workspacefilestore - Default = False
workspaceworkingdirectory - Default = False
workspaceblobstore - Default = True


In [14]:
## we create a dir
os.mkdir('./diabetes')


FileExistsError: [Errno 17] File exists: './diabetes'

In [15]:
## we dowload some files from the internet
diabetes1 = pd.read_csv('https://raw.githubusercontent.com/MicrosoftLearning/mslearn-dp100/main/data/diabetes.csv')
diabetes2 = pd.read_csv('https://raw.githubusercontent.com/MicrosoftLearning/mslearn-dp100/main/data/diabetes2.csv')

In [16]:
## we store the files locally


diabetes1.to_csv('./diabetes/diabetes1.csv')
diabetes2.to_csv('./diabetes/diabetes2.csv')

In [17]:
datastore = Datastore.get(ws, 'workspaceblobstore')

In [18]:
ds = Dataset.File.upload_directory(src_dir='./diabetes/',
           target=DataPath(datastore,  '/diabetes'),
           show_progress=True)

Validating arguments.
Arguments validated.
Uploading file to /diabetes
Uploading an estimated of 2 files
Uploading ./diabetes/diabetes1.csv
Uploaded ./diabetes/diabetes1.csv, 1 files out of an estimated total of 2
Uploading ./diabetes/diabetes2.csv
Uploaded ./diabetes/diabetes2.csv, 2 files out of an estimated total of 2
Uploaded 2 files
Creating new dataset


### 3.2. Create a tabular dataset

In [19]:
# Get the default datastore
default_ds = ws.get_default_datastore()

#Create a tabular dataset from the path on the datastore (this may take a short while)
tab_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds, 'diabetes/*.csv'))

# Display the dataset as a Pandas dataframe
tab_data_set.to_pandas_dataframe()

Unnamed: 0,Column1,PatientID,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
0,0,1354778,0,171,80,34,23,43.509726,1.213191,21,0
1,1,1147438,8,92,93,47,36,21.240576,0.158365,23,0
2,2,1640031,7,115,47,52,35,41.511523,0.079019,23,0
3,3,1883350,9,103,78,25,304,29.582192,1.282870,43,1
4,4,1424119,1,85,59,27,35,42.604536,0.549542,22,0
...,...,...,...,...,...,...,...,...,...,...,...
14995,4995,1490300,10,65,60,46,177,33.512468,0.148327,41,1
14996,4996,1744410,2,73,66,27,168,30.132636,0.862252,38,1
14997,4997,1742742,0,93,89,43,57,18.690683,0.427049,24,0
14998,4998,1099353,0,132,98,18,161,19.791645,0.302257,23,0


### 3.3. Register datasets

In [20]:
# Register the tabular dataset
try:
    tab_data_set = tab_data_set.register(workspace=ws, 
                                        name='diabetes tabular dataset',
                                        description='diabetes tabular data',
                                        tags = {'format':'CSV'},
                                        create_new_version=True)
except Exception as ex:
    print(ex)

# 4. Create scripts for pipeline steps
#### Pipelines consist of one or more steps, which can be Python scripts, or specialized steps like a data transfer step that copies data from one location to another. Each step can run in its own compute context. In this exercise, you'll build a simple pipeline that contains two Python script steps: one to pre-process some training data, and another to use the pre-processed data to train and register a model.

#### First, let's create a folder for the script files we'll use in the pipeline steps.

In [21]:
import os
# Create a folder for the pipeline step files
experiment_folder = 'diabetes_pipeline'
os.makedirs(experiment_folder, exist_ok=True)

print(experiment_folder)

diabetes_pipeline


### 4.1 Let's create the first script
#### Now let's create the first script, which will read data from the diabetes dataset and apply some simple pre-processing to remove any rows with missing data and normalize the numeric features so they're on a similar scale.

#### The script includes a argument named --prepped-data, which references the folder where the resulting data should be saved.

In [22]:
## %%writefile creates python files

%%writefile $experiment_folder/prep_diabetes.py
# Import libraries
import os
import argparse
import pandas as pd
from azureml.core import Run
from sklearn.preprocessing import MinMaxScaler

# Get parameters
parser = argparse.ArgumentParser()
parser.add_argument("--input-data", type=str, dest='raw_dataset_id', help='raw dataset')
parser.add_argument('--prepped-data', type=str, dest='prepped_data', default='prepped_data', help='Folder for results')
args = parser.parse_args()
save_folder = args.prepped_data

# Get the experiment run context
run = Run.get_context()

# load the data (passed as an input dataset)
print("Loading Data...")
diabetes = run.input_datasets['raw_data'].to_pandas_dataframe()

# Log raw row count
row_count = (len(diabetes))
run.log('raw_rows', row_count)

# remove nulls
diabetes = diabetes.dropna()

# Normalize the numeric columns
scaler = MinMaxScaler()
num_cols = ['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree']
diabetes[num_cols] = scaler.fit_transform(diabetes[num_cols])

# Log processed rows
row_count = (len(diabetes))
run.log('processed_rows', row_count)

# Save the prepped data
print("Saving Data...")
os.makedirs(save_folder, exist_ok=True)
save_path = os.path.join(save_folder,'data.csv')
diabetes.to_csv(save_path, index=False, header=True)

# End the run
run.complete()

Writing diabetes_pipeline/prep_diabetes.py


### 4.2 Let's create the second script
#### Now you can create the script for the second step, which will train a model. The script includes a argument named --training-folder, which references the folder where the prepared data was saved by the previous step.

In [23]:
%%writefile $experiment_folder/train_diabetes.py
# Import libraries
from azureml.core import Run, Model
import argparse
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

# Get parameters
parser = argparse.ArgumentParser()
parser.add_argument("--training-folder", type=str, dest='training_folder', help='training data folder')
args = parser.parse_args()
training_folder = args.training_folder

# Get the experiment run context
run = Run.get_context()

# load the prepared data file in the training folder
print("Loading Data...")
file_path = os.path.join(training_folder,'data.csv')
diabetes = pd.read_csv(file_path)

# Separate features and labels
X, y = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].values, diabetes['Diabetic'].values

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train adecision tree model
print('Training a decision tree model...')
model = DecisionTreeClassifier().fit(X_train, y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', np.float(acc))

# calculate AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
run.log('AUC', np.float(auc))

# plot ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_scores[:,1])
fig = plt.figure(figsize=(6, 4))
# Plot the diagonal 50% line
plt.plot([0, 1], [0, 1], 'k--')
# Plot the FPR and TPR achieved by our model
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
run.log_image(name = "ROC", plot = fig)
plt.show()

# Save the trained model in the outputs folder
print("Saving model...")
os.makedirs('outputs', exist_ok=True)
model_file = os.path.join('outputs', 'diabetes_model.pkl')
joblib.dump(value=model, filename=model_file)

# Register the model
print('Registering model...')
Model.register(workspace=run.experiment.workspace,
               model_path = model_file,
               model_name = 'diabetes_model',
               tags={'Training context':'Pipeline'},
               properties={'AUC': np.float(auc), 'Accuracy': np.float(acc)})


run.complete()

Writing diabetes_pipeline/train_diabetes.py


# 5. Prepare a compute environment for the pipeline steps

In [25]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "s17-cluster"

try:
    # Check for existing compute target
    pipeline_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # If it doesn't already exist, create it
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)
        pipeline_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        pipeline_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        print(ex)

InProgress....
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


### The compute will require a Python environment with the necessary package dependencies installed, so you'll need to create a run configuration.

In [26]:
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import RunConfiguration

# Create a Python environment for the experiment
diabetes_env = Environment("diabetes-pipeline-env")
diabetes_env.python.user_managed_dependencies = False # Let Azure ML manage dependencies
diabetes_env.docker.enabled = True # Use a docker container

# Create a set of package dependencies
diabetes_packages = CondaDependencies.create(conda_packages=['scikit-learn','ipykernel','matplotlib','pandas','pip'],
                                             pip_packages=['azureml-defaults','azureml-dataprep[pandas]','pyarrow'])

# Add the dependencies to the environment
diabetes_env.python.conda_dependencies = diabetes_packages

# Register the environment 
diabetes_env.register(workspace=ws)
registered_env = Environment.get(ws, 'diabetes-pipeline-env')

# Create a new runconfig object for the pipeline
pipeline_run_config = RunConfiguration()

# Use the compute you created above. 
pipeline_run_config.target = pipeline_cluster

# Assign the environment to the run configuration
pipeline_run_config.environment = registered_env

print ("Run configuration created.")


'enabled' is deprecated. Please use the azureml.core.runconfig.DockerConfiguration object with the 'use_docker' param instead.


Run configuration created.


# 6. Create and run a pipeline

In [34]:
from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import PythonScriptStep

# Get the training dataset
diabetes_ds = ws.datasets.get("diabetes tabular dataset")

# Create a PipelineData (temporary Data Reference) for the model folder
prepped_data_folder = PipelineData("prepped_data_folder", datastore=ws.get_default_datastore())


In [35]:
type(diabetes_ds)

azureml.data.tabular_dataset.TabularDataset

In [37]:
# Step 1, Run the data prep script
train_step = PythonScriptStep(name = "Prepare Data",
                                source_directory = experiment_folder,
                                script_name = "prep_diabetes.py",
                                arguments = ['--input-data', diabetes_ds.as_named_input('raw_data'),
                                             '--prepped-data', prepped_data_folder],
                                outputs=[prepped_data_folder],
                                compute_target = pipeline_cluster,
                                runconfig = pipeline_run_config,
                                allow_reuse = True)

In [38]:
# Step 2, run the training script
register_step = PythonScriptStep(name = "Train and Register Model",
                                source_directory = experiment_folder,
                                script_name = "train_diabetes.py",
                                arguments = ['--training-folder', prepped_data_folder],
                                inputs=[prepped_data_folder],
                                compute_target = pipeline_cluster,
                                runconfig = pipeline_run_config,
                                allow_reuse = True)

### OK, you're ready build the pipeline from the steps you've defined and run it as an experiment.


#### A graphical representation of the pipeline experiment will be displayed in the widget as it runs. keep an eye on the kernel indicator at the top right of the page, when it turns from ⚫ to ◯, the code has finished running. You can also monitor pipeline runs in the Experiments page in Azure Machine Learning studio.

In [39]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

# Construct the pipeline
pipeline_steps = [train_step, register_step]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

# Create an experiment and run the pipeline
experiment = Experiment(workspace=ws, name = 'mslearn-diabetes-pipeline')
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=True)

Pipeline is built.
Created step Prepare Data [32d27d0f][6d93bb07-6da3-4f52-b5a7-350e483b9f40], (This step will run and generate new outputs)Created step Train and Register Model [baf009de][3c8bc9e3-c295-4d00-9a66-a2988c35fff8], (This step will run and generate new outputs)

Submitted PipelineRun 5565a5f8-b037-487c-a2d2-df4963e2f579
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/5565a5f8-b037-487c-a2d2-df4963e2f579?wsid=/subscriptions/17ed2092-9d59-4bde-be0e-18e04a8ef316/resourcegroups/session17resourcegroup/workspaces/session17workspace&tid=7b345f98-7531-4f8b-b8ec-18e626c7d12e
Pipeline submitted for execution.


_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

PipelineRunId: 5565a5f8-b037-487c-a2d2-df4963e2f579
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/5565a5f8-b037-487c-a2d2-df4963e2f579?wsid=/subscriptions/17ed2092-9d59-4bde-be0e-18e04a8ef316/resourcegroups/session17resourcegroup/workspaces/session17workspace&tid=7b345f98-7531-4f8b-b8ec-18e626c7d12e
PipelineRun Status: Running


StepRunId: 63153821-3a05-4554-b60a-eaea27c14544
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/63153821-3a05-4554-b60a-eaea27c14544?wsid=/subscriptions/17ed2092-9d59-4bde-be0e-18e04a8ef316/resourcegroups/session17resourcegroup/workspaces/session17workspace&tid=7b345f98-7531-4f8b-b8ec-18e626c7d12e
StepRun( Prepare Data ) Status: NotStarted
StepRun( Prepare Data ) Status: Running

Streaming azureml-logs/20_image_build_log.txt
2022/02/22 12:03:04 Downloading source code...
2022/02/22 12:03:05 Finished downloading source code
2022/02/22 12:03:05 Creating Docker network: acb_default_network, driver: 'bridge'
2022/02/22 12:03:06 S

'Finished'

### When the pipeline has finished, you can examine the metrics recorded by it's child runs.

In [42]:
for run in pipeline_run.get_children():
    print(run.name, ':')
    metrics = run.get_metrics()
    for metric_name in metrics:
        print('\t',metric_name, ":", metrics[metric_name])

Train and Register Model :
	 Accuracy : 0.898
	 AUC : 0.883417418671546
	 ROC : aml://artifactId/ExperimentRun/dcid.f95560d9-b990-4f35-a91e-12c80480289d/ROC_1645532251.png
Prepare Data :
	 raw_rows : 15000
	 processed_rows : 15000


### Assuming the pipeline was successful, a new model should be registered with a Training context tag indicating it was trained in a pipeline. Run the following code to verify this.

In [43]:
from azureml.core import Model

for model in Model.list(ws):
    print(model.name, 'version:', model.version)
    for tag_name in model.tags:
        tag = model.tags[tag_name]
        print ('\t',tag_name, ':', tag)
    for prop_name in model.properties:
        prop = model.properties[prop_name]
        print ('\t',prop_name, ':', prop)
    print('\n')

diabetes_model version: 1
	 Training context : Pipeline
	 AUC : 0.883417418671546
	 Accuracy : 0.898


