In [1]:
import os
#Azure
from azureml.core.authentication import InteractiveLoginAuthentication
from azureml.core import Workspace, Datastore, Experiment, Dataset
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.train.dnn import TensorFlow
from azureml.train.estimator import Estimator
from azureml.widgets import RunDetails

## 1- Access the workspace in AzureML

In [2]:
# Workspace coordinates. Each value can be overridden through an environment
# variable so the notebook can target another tenant / subscription / workspace
# without editing the code; the fallbacks are the values originally hard-coded here.
tenant_id = os.getenv("AML_TENANT_ID") or "d7b671d8-de19-4f7f-956a-6d47e45b42ac"
subscription_id = os.getenv("AML_SUBSCRIPTION_ID") or '1ccbbadb-1165-4ee3-bbef-245d6ccf661b'
resource_group = os.getenv("AML_RESOURCE_GROUP") or 'Avanade-ia-Hermes'
workspace_name = os.getenv("AML_WORKSPACE_NAME") or "HermesML"

# force=True is kept from the original call.
# NOTE(review): it appears to request a fresh interactive login on every run — confirm this is wanted.
interactive_auth = InteractiveLoginAuthentication(tenant_id=tenant_id, force=True)

# Fetch the existing workspace using the interactive credentials obtained above.
ws = Workspace.get(name=workspace_name,
                   subscription_id=subscription_id,
                   resource_group=resource_group,
                   auth=interactive_auth)
print(ws.name, ws.location)

Performing interactive authentication. Please follow the instructions on the terminal.




Interactive authentication successfully completed.
HermesML westeurope


In [3]:
# Experiment used to submit the training runs in the cells below.
experiment_name = 'Material'
exp = Experiment(workspace=ws, name=experiment_name)

## 2- Create and/or attach a training cluster with Azure ML Compute

In [4]:
# Name of the AmlCompute cluster to attach to (or to create when it does not exist yet).
cluster_name = "GPU4"

# NOTE: the variable is called `cpu_cluster` because later cells reference it under
# that name; the target it points to is the cluster named by `cluster_name`.
try:
    # Reuse the cluster when a compute target with this name already exists in the workspace.
    cpu_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing compute target')
except ComputeTargetException:
    print('Creating a new compute target...')
    # The VM size must be a valid Azure size name: 'Standard_NC12'
    # (the previous value 'Standard_NC_12' is not a VM size and would make creation fail).
    compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_NC12',
                                                           vm_priority='lowpriority',
                                                           min_nodes=1,
                                                           max_nodes=4)

    # create the cluster
    cpu_cluster = ComputeTarget.create(ws, cluster_name, compute_config)

    # can poll for a minimum number of nodes and for a specific timeout.
    # if no min node count is provided it uses the scale settings for the cluster
    cpu_cluster.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

# use get_status() to get a detailed status for the current cluster.
print(cpu_cluster.get_status().serialize())

Found existing compute target
{'currentNodeCount': 2, 'targetNodeCount': 2, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 2, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2020-03-27T15:41:06.308000+00:00', 'errors': None, 'creationTime': '2020-03-20T11:42:15.326930+00:00', 'modifiedTime': '2020-03-23T14:12:57.603080+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 2, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'LowPriority', 'vmSize': 'STANDARD_NC12'}


## 3- Training the deep learning model

In [5]:
#Get dataset
dataset = Dataset.get_by_name(ws, name='material')
dataset

{
  "source": [
    "('hermesdatastore', 'Material/')"
  ],
  "definition": [
    "GetDatastoreFiles"
  ],
  "registration": {
    "id": "1f87f8e7-da66-40cf-bbd5-782c9e44bc11",
    "name": "material",
    "version": 1,
    "description": "training and test dataset",
    "workspace": "Workspace.create(name='HermesML', subscription_id='1ccbbadb-1165-4ee3-bbef-245d6ccf661b', resource_group='Avanade-ia-Hermes')"
  }
}

#### Victor

In [6]:
# Command-line arguments forwarded to the entry script (train_model.py).
# The 'material' named input is passed in mount mode as --data-folder.
mounted_material = dataset.as_named_input('material').as_mount()
script_params = {
    '--data-folder': mounted_material,
    '--epochs': 50,
    '--base-model': 'Inception_V3',  # alternative noted by the author: VGG16
    '--learnable-layers': 20,        # alternative noted by the author: 4
    '--input-shape': 512,
    '--batch-size': 64,
}

# Extra pip packages requested for the run
# ('azureml-dataprep[fuse]' was left disabled by the author).
training_packages = [
    'matplotlib',
    'tensorflow==1.13.1',
    'keras==2.2.4',
    'keras-radam',
    'keras-lookahead',
    'Pillow==7.0.0',
    'scikit-learn==0.22.2.post1',
    'seaborn',
]

# TensorFlow estimator: 2 nodes on the cluster attached above, with the
# current directory uploaded as the source directory.
est = TensorFlow(
    source_directory='.',
    entry_script='train_model.py',
    script_params=script_params,
    compute_target=cpu_cluster,
    node_count=2,
    distributed_training='mpi',
    framework_version='1.13',
    use_gpu=True,
    pip_packages=training_packages,
)

# Submit the estimator to the experiment and show the run-monitoring widget.
run = exp.submit(est)
RunDetails(run).show()

Submitting C:\Users\victor.vincent\OneDrive - Avanade\Avanade_Projet\Hermes\Script2\AzureML directory for run. The size of the directory >= 25 MB, so it can take a few minutes.


_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [7]:
# Cancel the run held in `run` (the one submitted by the preceding training cell).
# NOTE(review): under "Run All" this executes as soon as the submitting cell
# returns — confirm that cancelling the training run immediately is intended.
run.cancel()

#### Hamza

In [None]:
# TensorFlow is already imported in the first cell of the notebook, so it is
# not re-imported here.

# Command-line arguments forwarded to the entry script (train_model_hamza.py).
# The dataset is passed in mount mode under the input name 'type'.
script_params = {
    '--data-folder': dataset.as_named_input('type').as_mount(),
    '--batch-size': 64,
    '--learnable-layers': 10,
    '--epochs': 20,
    '--learning-rate': 0.001,
    '--input-shape': 512,
    '--base-model': 'vgg16',
}

# TensorFlow estimator: 2 nodes on the cluster attached above, with the
# current directory uploaded as the source directory.
est = TensorFlow(source_directory='.',
                 script_params=script_params,
                 node_count=2,
                 pip_packages=['matplotlib',
                               'tensorflow==1.13.1',
                               'keras==2.2.4',
                               'keras-radam',
                               'keras-lookahead',
                               'Pillow==7.0.0',
                               'scikit-learn==0.22.2.post1',
                               'seaborn',
#                               'azureml-dataprep[fuse]'
                              ],
                 distributed_training='mpi',
                 compute_target=cpu_cluster,
                 framework_version='1.13',
                 entry_script='train_model_hamza.py',
                 use_gpu=True)

# Submit the estimator to the experiment and show the run-monitoring widget.
run = exp.submit(est)
RunDetails(run).show()

In [None]:
# Cancel the run held in `run` (the one submitted by the preceding training cell).
# NOTE(review): under "Run All" this executes as soon as the submitting cell
# returns — confirm that cancelling the training run immediately is intended.
run.cancel()