# Azure ML

[Azure ML cheat sheet: distributed training with TensorFlow (SDK v1)](https://azure.github.io/azureml-cheatsheets/docs/cheatsheets/python/v1/distributed-training#tensorflow)

In [1]:
from IPython.display import display
import os
import mlflow

## Connect to a Workspace

In [2]:
from azureml.core import Workspace

# Build the workspace handle with Workspace.from_config() (no arguments, so
# the SDK's default configuration lookup applies), then point MLflow at the
# workspace's tracking URI and echo that URI for reference.
ws = Workspace.from_config()
tracking_uri = ws.get_mlflow_tracking_uri()
mlflow.set_tracking_uri(tracking_uri)
print(tracking_uri)

azureml://eastus.api.azureml.ms/mlflow/v1.0/subscriptions/3a9fdb76-32c0-4279-978b-e502191d6e11/resourceGroups/aidea2019/providers/Microsoft.MachineLearningServices/workspaces/aidea2019?


## Create an experiment

In [3]:
from azureml.core import Experiment

# Experiment handle in workspace `ws`; later cells submit their runs through it.
experiment_name = 'aidea2019'
experiment = Experiment(ws, experiment_name)

## Create and Get an Environment

### Local

In [4]:
from azureml.core.environment import Environment

# Environment used for runs on this machine, described by environment.yml.
localenvname = 'localnlp'
mylocalenv = Environment.from_conda_specification(
    name=localenvname,
    file_path='environment.yml'
)

# The dependencies are managed by the user: the run uses the interpreter
# configured below as-is.
mylocalenv.python.user_managed_dependencies = True

# The interpreter location is machine-specific, so it can be overridden with
# the AZUREML_LOCAL_PYTHON environment variable instead of editing this cell.
# The original Windows path remains the default; its Git-Bash style spelling
# is "/d/ProgramData/Miniconda3/envs/nlp/python".
mylocalenv.python.interpreter_path = os.environ.get(
    "AZUREML_LOCAL_PYTHON",
    "D:\\ProgramData\\Miniconda3\\envs\\nlp\\python",
)
display(mylocalenv)

No Python version provided, defaulting to "3.6.2"


{
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20220113.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "enabled": false,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": "2g"
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "localnlp",
    "python": {
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "channels": [
                "pytorch",
                "conda-forge",
     

### Cloud

In [5]:
from azureml.core.environment import Environment

envname = 'runpyrcuda'
try:
    # Reuse version 1 of the environment if it is already registered.
    myenv = Environment.get(workspace=ws, name=envname, version="1")
except Exception as lookup_error:
    # `Exception` instead of a bare `except:` so that KeyboardInterrupt and
    # SystemExit are not swallowed and silently turned into a registration.
    print(
        f'Could not fetch environment {envname!r} version 1 '
        f'({type(lookup_error).__name__}); registering it from the Dockerfile.'
    )
    myenv = Environment.from_dockerfile(
        name=envname,
        dockerfile='jupyterhub-cuda-run_pyr.Dockerfile',
        #container_registry=None,
        conda_specification=None,  # alternative: "environment.yml"
        pip_requirements=None
    )
    # The image ships its own Python, so point at that interpreter and mark
    # the dependencies as user-managed.
    myenv.python.interpreter_path = "/opt/conda/envs/python/bin/python"
    myenv.python.user_managed_dependencies = True
    myenv.register(workspace=ws)
    # Re-read the registered definition so `myenv` is the stored version 1.
    myenv = Environment.get(workspace=ws, name=envname, version="1")

display(myenv)
# Reference only: CLI commands for deleting a registered environment. The
# string is never executed; as the last expression of the cell it is also
# echoed as the cell result.
"""
az ml environment delete -n <environment_name> -g <Resource-group name> -w <Workspace name> -v <version of the environment>
az ml environment delete -n tjrunpyrcuda -g aidea2019 -w aidea2019 -v 1
az ml environment delete -n aidea2019CondaEnv -g aidea2019 -w aidea2019 -v 1
"""


{
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": "FROM tingjhenjiang/jupyterhub-run_pyr:ubuntu20.04-cuda\n\n# -- Layer: cluster-base\n\nARG shared_workspace=/opt/workspace\n\nRUN mkdir -p ${shared_workspace}\n\nENV SHARED_WORKSPACE=${shared_workspace}\nENV NVIDIA_DISABLE_REQUIRE=\"1\"\n\n# -- Layer: JupyterHub-base\n\nARG NB_USERs=\"user1\"\nARG NB_UID=\"1001\"\nARG NB_GID=\"100\"\nARG PYTHON_VERSION=\"3.9\"\n\n# Ref: https://github.com/jupyterhub/jupyterhub-the-hard-way/blob/HEAD/docs/installation-guide-hard.md\n# https://hub.docker.com/r/jupyter/base-notebook/dockerfile\n# https://hub.docker.com/r/rocker/rstudio/Dockerfile\n# https://github.com/grst/rstudio-server-conda/blob/master/docker/init2.sh\n\n\nRUN . /envvarset.sh && \\\n    sed -i 's|http://free.nchc.org.tw|http://archive.ubuntu.com|g' /

'\naz ml environment delete -n <environment_name> -g <Resource-group name> -w <Workspace name> -v <version of the environment>\naz ml environment delete -n tjrunpyrcuda -g aidea2019 -w aidea2019 -v 1\naz ml environment delete -n aidea2019CondaEnv -g aidea2019 -w aidea2019 -v 1\n'

## Debug the image build

In [7]:
from azureml.core import Image

# Build the image for `myenv`: try a local Docker build that pushes to the
# workspace ACR first, and fall back to myenv.build() if the local path fails.
try:
    build = myenv.build_local(workspace=ws, useDocker=True, pushImageToWorkspaceAcr=True)
    # NOTE(review): this assumes build_local() returns an object exposing
    # wait_for_completion(). If it does not, the AttributeError raised here
    # also triggers the fallback below -- confirm against the SDK reference.
    build.wait_for_completion(show_output=False)
except Exception as local_build_error:
    # `Exception` instead of a bare `except:` so that interrupting a local
    # build (KeyboardInterrupt) does not start a second build.
    print(
        f'Local build path failed ({type(local_build_error).__name__}: '
        f'{local_build_error}); falling back to myenv.build().'
    )
    build = myenv.build(workspace=ws)
    build.wait_for_completion(show_output=False)

Saving setup content into  R:\Temp\tmpbi730qzv
.............................................................................................................................................................................................................................................................................................................................

## Create/Find Compute Clusters

In [6]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Resource ID of the user-assigned managed identity given to the clusters.
# Set the AZUREML_MANAGED_IDENTITY_ID environment variable to use a different
# identity without editing the notebook; the original value is the default.
managedIdentityID = os.environ.get(
    'AZUREML_MANAGED_IDENTITY_ID',
    '/subscriptions/3a9fdb76-32c0-4279-978b-e502191d6e11/resourceGroups/aidea2019/providers/Microsoft.ManagedIdentity/userAssignedIdentities/aidea2019ManagedIdentity',
)
# SSH public key passed as the cluster admin user's key.
# Set the AZUREML_SSH_PUBLIC_KEY environment variable to use a different key;
# the original key is the default.
sshPubKey = os.environ.get(
    'AZUREML_SSH_PUBLIC_KEY',
    'ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQDlAIjNnmUDzFovsbPbl20Sno+GpKsVJNg/6DQNusgyyLP75308dYpQuRK/ykHba+cRdHe9sjE/99LWyWZnkeEGjLXEVv2RItpbyeg9+gUlyZH0bSw1ipLUzJZ62zpDJpvpYX+wRUvL69y/ikiC88IdOq5CimdrMKEysEo/4yX8rm9htnpLyNJa+VUQtJEBNbzSargD7SPwEoNO7B33BWJQ/YJHqpzf58x8DCzh8ft4kc36k/kiK2WS/Fk+lckigjZZy4RkmzSTaGD5T6+JqhOLoGpqsPIY87ru3t7HPe2m8UqtcyeC0W+BYOEliJMh8RUtSSYaXntCN62LU/S+x1+Svom2lOFNAmoLNZZTKsSJqPgqIaYDLKgc5JqWIOvC7jDC55rPGzJlg2uSYxZ/C2dwnV+Sv30P34VyYhef4MZnC0nd5jz92a3clsZxwLi6BvwgA9NZxUcuRlZ+xdQcYoiG0O14vuG94noJHXwZKum5XJpq083cTBNemq95hap0Rmj34xA+pRZl0M86+GAyCGIEDcfoI/oTW72kKr9qcIMkKETSa+5YU7M7Ni5WMZ3zttFQWZRb/B3dbVvRGwjgr5ope1z0AMe4FNt58EYk6di71Cb3KrADGg6imnWgr4EwflV7f+GAgnHtKfHN/pgQL6jDmzXOonTtsyLmBl8u1YcQWQ== tingjhenjiang@gmail.com',
)
# Workspace names of the compute clusters, keyed by a short alias.
cluster_names = {
    'low1': "aidea2019clLowP1",
    'low2': "aidea2019clLowP2",
}

# Provisioning settings shared by every cluster.
general_cluster_configs = dict(
    max_nodes=6,
    vm_priority='lowpriority',
    identity_type="UserAssigned",
    identity_id=[managedIdentityID],
    idle_seconds_before_scaledown=180,
    admin_username='azureuser',
    admin_user_ssh_key=sshPubKey,
    remote_login_port_public_access='Enabled',
)

# Per-cluster settings layered on top of the shared ones (later keys win).
cluster_overrides = {
    'low1': dict(vm_size='Standard_NC4as_T4_v3', max_nodes=6),
    'low2': dict(vm_size='Standard_NC6s_v3', max_nodes=4),
}
compute_config = {
    alias: AmlCompute.provisioning_configuration(
        **{**general_cluster_configs, **overrides}
    )
    for alias, overrides in cluster_overrides.items()
}

# Alias -> compute target. Attach to a cluster when it already exists,
# otherwise create it and wait for provisioning to finish.
myvm = {}
for key, cluster_name in cluster_names.items():
    try:
        myvm[key] = ComputeTarget(workspace=ws, name=cluster_name)
        print('Found existing cluster, use it.')
    except ComputeTargetException:
        # To use a different region for the compute, add a location='<region>' parameter
        myvm[key] = ComputeTarget.create(ws, cluster_name, compute_config[key])
        print(f'creating {key}')
        myvm[key].wait_for_completion(show_output=True)
        # Assign the user-assigned identity on the freshly created cluster.
        myvm[key].add_identity(
            identity_type="UserAssigned",
            identity_id=[managedIdentityID],
        )


creating low1
InProgress.....
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
creating low2
InProgress..
FailedProvisioning operation finished, operation "Failed"


ComputeTargetException: ComputeTargetException:
	Message: Compute object provisioning polling reached non-successful terminal state, current provisioning state: Failed
Provisioning operation error:
{'code': 'BadRequest', 'message': '{"id":"https://eastus.api.azureml.ms/batchai/subscriptions/3a9fdb76-32c0-4279-978b-e502191d6e11/providers/Microsoft.BatchAI/locations/eastus/operationresults/b6521d93-17cd-43d2-abbf-6e6c732baeb4","name":"b6521d93-17cd-43d2-abbf-6e6c732baeb4","status":"Failed","startTime":"2022-01-29T23:34:04.912Z","endTime":"2022-01-29T23:34:08.666Z","error":{"code":"ClusterMinNodesExceedCoreQuota","message":"The specified subscription has a total vCPU quota of 5 and cannot accomodate for at least 1 requested managed compute node which maps to 6 vCPUs. Talk to your Subscription Admin or refer to https://docs.microsoft.com/azure/machine-learning/how-to-manage-quotas#request-quota-increases to increase the total quota"}}'}
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Compute object provisioning polling reached non-successful terminal state, current provisioning state: Failed\nProvisioning operation error:\n{'code': 'BadRequest', 'message': '{\"id\":\"https://eastus.api.azureml.ms/batchai/subscriptions/3a9fdb76-32c0-4279-978b-e502191d6e11/providers/Microsoft.BatchAI/locations/eastus/operationresults/b6521d93-17cd-43d2-abbf-6e6c732baeb4\",\"name\":\"b6521d93-17cd-43d2-abbf-6e6c732baeb4\",\"status\":\"Failed\",\"startTime\":\"2022-01-29T23:34:04.912Z\",\"endTime\":\"2022-01-29T23:34:08.666Z\",\"error\":{\"code\":\"ClusterMinNodesExceedCoreQuota\",\"message\":\"The specified subscription has a total vCPU quota of 5 and cannot accomodate for at least 1 requested managed compute node which maps to 6 vCPUs. Talk to your Subscription Admin or refer to https://docs.microsoft.com/azure/machine-learning/how-to-manage-quotas#request-quota-increases to increase the total quota\"}}'}"
    }
}

## Run Configuration

In [None]:
from azureml.core.compute import ComputeTarget

# `myvm` is the alias -> compute target mapping built in the
# "Create/Find Compute Clusters" cell; this cell only displays it.
#
# To re-attach to clusters that already exist without re-running the
# provisioning cell, rebuild the mapping with:
#
#     myvm = {key: ComputeTarget(workspace=ws, name=value)
#             for key, value in cluster_names.items()}
myvm

## Use environments for training

In [8]:
from azureml.core import ScriptRunConfig, Experiment
from azureml.core.environment import Environment

# Shorter alias for the experiment handle created earlier.
exp = experiment

# Bare reference to the local environment: it has no effect other than
# raising NameError right here if `mylocalenv` is not defined in the kernel.
mylocalenv

# Settings shared by every run, whatever the compute target.
# Alternative source directory: "./cloud/tensorflow/mnist-distributed/src/"
# Optional script arguments:    'arguments': ['--lr', 0.005, '--epochs', 1]
scriptrunconfigs = dict(
    source_directory="./cloud/myown/",
    script="train.py",
)

# Target-specific settings, meant to be merged with the shared ones
# (the submission cells do this with the cloud variant).
scriptrunconfigs_local = dict(
    compute_target="local",
    environment=mylocalenv,
)
scriptrunconfigs_cloud = dict(
    compute_target=myvm['low1'],
    environment=myenv,
)

## Single Node Training

In [10]:
# Merge the shared settings with the cloud-specific ones (cloud keys win on
# conflict) and build the run configuration from the result.
single_node_settings = {**scriptrunconfigs, **scriptrunconfigs_cloud}
src = ScriptRunConfig(**single_node_settings)

# Submit the run to the experiment.
run = exp.submit(src)

## Distributed Training

In [11]:
from azureml.core.runconfig import TensorflowConfiguration

# TensorFlow distribution settings: one worker and no parameter server.
distr_config = TensorflowConfiguration(worker_count=1, parameter_server_count=0)

# Same shared + cloud settings as the single-node run, with the distributed
# job configuration added on top.
distributed_settings = {
    **scriptrunconfigs,
    **scriptrunconfigs_cloud,
    'distributed_job_config': distr_config,
}
src = ScriptRunConfig(**distributed_settings)

# Submit the run to the experiment.
run = exp.submit(src)