In [1]:
%matplotlib inline
import numpy as np
import os

In [2]:
import azureml
from azureml.core import Workspace

# check core SDK version number
print("Azure ML SDK Version: ", azureml.core.VERSION)

Azure ML SDK Version:  1.34.0


# Connect Workspace

In [3]:
ws = Workspace.from_config()
print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep='\n')

Workspace name: mlrgeastus
Azure region: eastus
Subscription id: 932c3e14-d0cf-4e41-9998-bdebb9bfa1cf
Resource group: mlrg


# Create experiment

In [4]:
from azureml.core import Experiment
experiment_name = 'myExp'

exp = Experiment(workspace=ws, name=experiment_name)

In [5]:
script_folder = './'
os.makedirs(script_folder, exist_ok=True)

with open(os.path.join(script_folder, 'myModel.py'), 'r') as f:
    print(f.read())

import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense



DATA_PATH="https://raw.githubusercontent.com/urtbest86/MLOps/master/result_train_dataset2.csv"
df = pd.read_csv(DATA_PATH)

DATA_PATH="https://raw.githubusercontent.com/urtbest86/MLOps/master/result_test_dataset2.csv"
test = pd.read_csv(DATA_PATH)

train=df.sample(frac=0.8)
val=df.sample(frac=0.2)

mean = train.mean(axis=0)
train -= mean
std = train.std(axis=0)
train /= std

test -= mean
test /= std

val-=mean
val/=std

train_data_set = train.values
x_train = train_data_set[:, 2:-1].astype(float)
y_train = train_data_set[:, -1].astype(float)

test_data_set = test.values
x_test = test_data_set[:, 2:-1].astype(float)
y_test = test_data_set[:, -1].astype(float)

val_data_set = val.values
x_val = val_data_set[:, 2:-1].astype(float)
y_val = val_data_set[:, -1].astype(float)


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dens

# Attach existing compute resource

In [6]:
'''from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
import os

compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", "test-cluster")

if compute_name in ws.compute_targets:
    compute_target = ws.compute_targets[compute_name]
    if compute_target and type(compute_target) is AmlCompute:
        print("found compute target: " + compute_name)
else:
    print("creating new compute target...")'''

'from azureml.core.compute import AmlCompute\nfrom azureml.core.compute import ComputeTarget\nimport os\n\ncompute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", "test-cluster")\n\nif compute_name in ws.compute_targets:\n    compute_target = ws.compute_targets[compute_name]\n    if compute_target and type(compute_target) is AmlCompute:\n        print("found compute target: " + compute_name)\nelse:\n    print("creating new compute target...")'

In [7]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
import os

# choose a name for your cluster
compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", "cpu-cluster")
compute_min_nodes = os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES", 0)
compute_max_nodes = os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES", 4)

# This example uses CPU VM. For using GPU VM, set SKU to STANDARD_NC6
vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU", "STANDARD_D11_V2")


if compute_name in ws.compute_targets:
    compute_target = ws.compute_targets[compute_name]
    if compute_target and type(compute_target) is AmlCompute:
        print("found compute target: " + compute_name)
else:
    print("creating new compute target...")
    provisioning_config = AmlCompute.provisioning_configuration(vm_size = vm_size,
                                                                min_nodes = compute_min_nodes, 
                                                                max_nodes = compute_max_nodes)

    # create the cluster
    compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)
    
    # can poll for a minimum number of nodes and for a specific timeout. 
    # if no min node count is provided it will use the scale settings for the cluster
    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)
    
     # For a more detailed view of current AmlCompute status, use get_status()
    print(compute_target.get_status().serialize())

creating new compute target...
InProgress....
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Resizing', 'allocationStateTransitionTime': '2021-10-31T14:50:57.221000+00:00', 'errors': None, 'creationTime': '2021-10-31T14:50:56.834548+00:00', 'modifiedTime': '2021-10-31T14:51:12.407329+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT1800S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_D11_V2'}


In [8]:
%%writefile conda_dependencies.yml

channels:
- conda-forge
dependencies:
- python=3.6.2

- pip:
  - h5py<=2.10.0
  - azureml-defaults
  - tensorflow-gpu==2.0.0
  - keras<=2.3.1
  - matplotlib
  - pandas
  - scikit-learn

Overwriting conda_dependencies.yml


In [9]:
from azureml.core import Environment

env = Environment.from_conda_specification(name = 'keras-2.3.1', file_path = './conda_dependencies.yml')

'''# Specify a GPU base image
env.docker.enabled = True
env.docker.base_image = 'mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.0-cudnn7-ubuntu18.04'''

env.register(workspace = ws)

{
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20210806.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "enabled": false,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "keras-2.3.1",
    "python": {
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "channels": [
                "conda-forge"
            ],
            "de

# 학습 실행 구성 및 제출
## ScriptRunConfig 만들기

In [10]:
from azureml.core import ScriptRunConfig

args = ['--data-folder', 
        '--batch-size', 4,
        '--first-layer-neurons', 1024,
        '--second-layer-neurons', 500,
        '--third-layer-neurons', 300,
        '--forth-layer-neurons', 200,
        '--learning-rate', 0.001]

src = ScriptRunConfig(source_directory=script_folder,
                      script='myModel.py',
                      arguments=args,
                      compute_target=compute_name,
                      environment=env)

In [11]:
run = exp.submit(src)

In [12]:
from azureml.widgets import RunDetails
RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [13]:
run

Experiment,Id,Type,Status,Details Page,Docs Page
myExp,myExp_1635691878_4428a645,azureml.scriptrun,Starting,Link to Azure Machine Learning studio,Link to Documentation


In [14]:
run.wait_for_completion(show_output=True)

RunId: myExp_1635691878_4428a645
Web View: https://ml.azure.com/runs/myExp_1635691878_4428a645?wsid=/subscriptions/932c3e14-d0cf-4e41-9998-bdebb9bfa1cf/resourcegroups/mlrg/workspaces/mlrgeastus&tid=19411b3a-b7ca-48d9-a1e2-31694d0ccb65

Streaming azureml-logs/20_image_build_log.txt

2021/10/31 14:51:33 Downloading source code...
2021/10/31 14:51:34 Finished downloading source code
2021/10/31 14:51:34 Creating Docker network: acb_default_network, driver: 'bridge'
2021/10/31 14:51:34 Successfully set up Docker network: acb_default_network
2021/10/31 14:51:34 Setting up Docker configuration...
2021/10/31 14:51:35 Successfully set up Docker configuration
2021/10/31 14:51:35 Logging in to registry: 0bc3a186164e4d5b93a43d2334d6fb27.azurecr.io
2021/10/31 14:51:36 Successfully logged into 0bc3a186164e4d5b93a43d2334d6fb27.azurecr.io
2021/10/31 14:51:36 Executing step ID: acb_step_0. Timeout(sec): 5400, Working directory: '', Network: 'acb_default_network'
2021/10/31 14:51:36 Scanning for depende

{'runId': 'myExp_1635691878_4428a645',
 'target': 'cpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2021-10-31T15:01:46.500343Z',
 'endTimeUtc': '2021-10-31T15:05:59.384305Z',
 'services': {},
 'properties': {'_azureml.ComputeTargetType': 'amlcompute',
  'ContentSnapshotId': '07d5ffc9-aeda-4775-8a30-601049b46c73',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json'},
 'inputDatasets': [],
 'outputDatasets': [],
 'runDefinition': {'script': 'myModel.py',
  'command': '',
  'useAbsolutePath': False,
  'arguments': ['--data-folder',
   '--batch-size',
   '4',
   '--first-layer-neurons',
   '1024',
   '--second-layer-neurons',
   '500',
   '--third-layer-neurons',
   '300',
   '--forth-layer-neurons',
   '200',
   '--learning-rate',
   '0.001'],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'cpu-cluster',
  'dataReferences': {},
  'data': {},
  'outputData': {},
  'datac

In [15]:
print(run.get_metrics())

{'Final test loss(mse)': 0.3835103484717282, 'Final test loss(mae)': 0.5201957821846008, 'loss VS val_loss': 'aml://artifactId/ExperimentRun/dcid.myExp_1635691878_4428a645/loss VS val_loss_1635692738.png'}


In [16]:
print(run.get_file_names())

['azureml-logs/20_image_build_log.txt', 'azureml-logs/55_azureml-execution-tvmps_dd69fe3e4bc7905cd1242b8a0a4d0d11aed400dbf469bd081edc9fe3d1c72231_d.txt', 'azureml-logs/65_job_prep-tvmps_dd69fe3e4bc7905cd1242b8a0a4d0d11aed400dbf469bd081edc9fe3d1c72231_d.txt', 'azureml-logs/70_driver_log.txt', 'azureml-logs/75_job_post-tvmps_dd69fe3e4bc7905cd1242b8a0a4d0d11aed400dbf469bd081edc9fe3d1c72231_d.txt', 'azureml-logs/process_info.json', 'azureml-logs/process_status.json', 'logs/azureml/97_azureml.log', 'logs/azureml/job_prep_azureml.log', 'logs/azureml/job_release_azureml.log', 'loss VS val_loss_1635692738.png', 'outputs/Youjin_test_model.pkl']


In [17]:
'''model = run.register_model(model_name='myModelYoujin2', model_path='.')'''

"model = run.register_model(model_name='myModelYoujin2', model_path='.')"

In [18]:
from azureml.core import Model
from azureml.core.resource_configuration import ResourceConfiguration

model = run.register_model(model_name='Youjin-test', 
                           model_path='outputs/Youjin_test_model.pkl',
                           model_framework=Model.Framework.TENSORFLOW,
                           model_framework_version='2.0',
                           )

print(model.name, model.id, model.version, sep='\t')

Youjin-test	Youjin-test:1	1


In [19]:
'''os.makedirs('./model', exist_ok=True)

for f in run.get_file_names():
    if f.startswith('outputs/model'):
        output_file_path = os.path.join('./model', f.split('/')[-1])
        print('Downloading from {} to {} ...'.format(f, output_file_path))
        run.download_file(name=f, output_file_path=output_file_path)'''

"os.makedirs('./model', exist_ok=True)\n\nfor f in run.get_file_names():\n    if f.startswith('outputs/model'):\n        output_file_path = os.path.join('./model', f.split('/')[-1])\n        print('Downloading from {} to {} ...'.format(f, output_file_path))\n        run.download_file(name=f, output_file_path=output_file_path)"