# Train Local Project using Azure Machine Learning Compute

In [1]:
%load_ext watermark
%watermark -v -d -p sklearn,azureml.core

Python implementation: CPython
Python version       : 3.9.7
IPython version      : 7.28.0

sklearn     : 1.0.1
azureml.core: 1.35.0



In [2]:
import azureml.core

## Initialize Workspace

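Create an ML workspace in **Azure ML Studio** and download its `config.json` into this notebook's directory (or a parent directory) so that `Workspace.from_config()` can find it.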

In [4]:
from azureml.core import Workspace

# Load the workspace described by the exported config.json.
ws = Workspace.from_config()

# Show the identifying workspace fields, one per line.
for workspace_field in (ws.name, ws.resource_group, ws.location, ws.subscription_id):
    print(workspace_field)

thomdml
thomd
eastus
97b5c2bf-582f-4bb7-92e2-e69b517e9eee


## Create Experiment

In [29]:
from azureml.core import Experiment

# Runs submitted from this notebook are grouped under this experiment name.
experiment = Experiment(name='train-on-aml-compute', workspace=ws)

Write the training script `train.py` to disk; it is submitted to the compute target later in this notebook.

In [30]:
%%writefile train.py

from sklearn.datasets import load_diabetes
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from azureml.core.run import Run
import os
import numpy as np
import joblib

os.makedirs('./outputs', exist_ok=True)

X, y = load_diabetes(return_X_y=True)

run = Run.get_context()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

for alpha in np.arange(0.0, 1.0, 0.05):
    reg = Ridge(alpha=alpha)  # use Ridge algorithm to create a regression model
    reg.fit(X_train, y_train)

    preds = reg.predict(X_test)
    mse = mean_squared_error(preds, y_test)
    run.log('alpha', alpha)
    run.log('mse', mse)

    model_file_name = 'ridge_{0:.2f}.pkl'.format(alpha)
    # save model in the outputs folder so it automatically get uploaded
    with open(model_file_name, "wb") as file:
        joblib.dump(value=reg, filename=os.path.join('./outputs/', model_file_name))

    print('alpha is {0:.2f}, and mse is {1:0.2f}'.format(alpha, mse))

Overwriting train.py


## Create Azure ML Compute

### Provision as a Persistent Compute Target

In [31]:
from azureml.core.compute import ComputeTarget, AmlCompute
import pandas as pd

# Query the VM sizes that AmlCompute supports in the workspace's own region.
vms = AmlCompute.supported_vmsizes(location=ws.location, workspace=ws)

In [32]:
# Tabulate the supported VM sizes, smallest memory first.
pd.DataFrame(vms).sort_values('memoryGB')

Unnamed: 0,name,vCPUs,gpus,memoryGB,maxResourceVolumeMB
0,Standard_D1,1,0,3.5,51200
13,Standard_D1_v2,1,0,3.5,51200
41,Standard_DS1_v2,1,0,3.5,7168
66,Standard_F2s_v2,2,0,4.0,16384
17,Standard_D2_v2,2,0,7.0,102400
...,...,...,...,...,...
104,Standard_M64m,64,0,1750.0,8192000
94,Standard_M128,128,0,2000.0,16384000
97,Standard_M128s,128,0,2000.0,4096000
95,Standard_M128m,128,0,3800.0,16384000


In [33]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name of the CPU cluster to reuse or create
cpu_cluster_name = 'cpu-cluster'

# Reuse the cluster when it already exists in the workspace; otherwise provision it.
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        max_nodes=4,
    )
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# Block until provisioning has finished, echoing progress to the cell output.
cpu_cluster.wait_for_completion(show_output=True)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


Get the latest **status** of the AML Compute target:

In [41]:
# Fetch the cluster's current status, then render it for display.
cluster_status = cpu_cluster.get_status()
cluster_status.serialize()

{'currentNodeCount': 1,
 'targetNodeCount': 1,
 'nodeStateCounts': {'preparingNodeCount': 0,
  'runningNodeCount': 0,
  'idleNodeCount': 1,
  'unusableNodeCount': 0,
  'leavingNodeCount': 0,
  'preemptedNodeCount': 0},
 'allocationState': 'Steady',
 'allocationStateTransitionTime': '2021-11-07T18:46:33.161000+00:00',
 'errors': None,
 'creationTime': '2021-11-07T16:04:51.049048+00:00',
 'modifiedTime': '2021-11-07T16:05:07.062488+00:00',
 'provisioningState': 'Succeeded',
 'provisioningStateTransitionTime': None,
 'scaleSettings': {'minNodeCount': 0,
  'maxNodeCount': 4,
  'nodeIdleTimeBeforeScaleDown': 'PT1800S'},
 'vmPriority': 'Dedicated',
 'vmSize': 'STANDARD_D2_V2'}

Get the list of **nodes** on the cluster with status, **IP** and associated run:

In [42]:
# Show one entry per node currently allocated to the cluster.
cpu_cluster.list_nodes()

[{'nodeId': 'tvmps_0a5f5ff945d5447a1f9bd069e3b31b1ea3d5c2124068bc0b4172aa97cbbb2485_d',
  'port': 50001,
  'publicIpAddress': '20.81.48.123',
  'privateIpAddress': '10.0.0.5',
  'nodeState': 'idle'}]

In [43]:
# Uncomment to delete the compute cluster once it is no longer needed.
# cpu_cluster.delete()

## Create Environment and Submit Run

In [34]:
from azureml.core import Environment
from azureml.core.runconfig import DockerConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core import ScriptRunConfig

# Run the script inside a Docker container on the compute target.
docker_config = DockerConfiguration(use_docker=True)

# Conda environment carrying the packages requested for the training script.
myenv = Environment('myenv')
myenv.python.conda_dependencies = CondaDependencies.create(
    conda_packages=['scikit-learn', 'packaging'],
)

# Bundle script, compute target and environment into one run configuration.
src = ScriptRunConfig(
    source_directory='./',
    script='train.py',
    compute_target=cpu_cluster,
    environment=myenv,
    docker_runtime_config=docker_config,
)

# Submit the configuration to the experiment; `run` is the handle for the submitted run.
run = experiment.submit(config=src)

In [35]:
# Display the submitted run's summary (experiment, id, type, status, links).
run

Experiment,Id,Type,Status,Details Page,Docs Page
train-on-aml-compute,train-on-aml-compute_1636312326_45565c02,azureml.scriptrun,Preparing,Link to Azure Machine Learning studio,Link to Documentation


In [36]:
# Query the run's current status (e.g. 'Queued').
run.get_status()

'Queued'