In [1]:
import boto3
import json
import logging

In [2]:
import numpy as np
import pandas as pd

In [3]:
import sagemaker
from sagemaker.pytorch import PyTorch
from sagemaker.pytorch import PyTorchModel

In [4]:
# Resolve the SageMaker context once; later cells read `role` and `bucket`.
role = sagemaker.get_execution_role()        # passed to the estimator as `role`
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()  # used to build the training-data S3 URI

In [12]:
# Overwrites the `sagemaker_job_name` entry by writing directly into the
# estimator's private `_hyperparameters` dict.
# NOTE(review): in the visible part of this notebook `pytorch_estimator` is only
# constructed in a later cell, so this line relies on leftover kernel state and
# would raise NameError on a fresh top-to-bottom run — confirm whether it is still needed.
pytorch_estimator._hyperparameters['sagemaker_job_name'] = 'kanto-job-1'

In [19]:
# Display the hyperparameters currently stored on the estimator (private attribute).
pytorch_estimator._hyperparameters

{'sagemaker_job_name': 'kanto-base-job-2020-08-22-02-01-56-986',
 'sagemaker_submit_directory': 's3://sagemaker-ap-northeast-2-029498593638/kanto-base-job-2020-08-22-02-01-56-986/source/sourcedir.tar.gz',
 'sagemaker_program': 'HRC_0818_final.py',
 'sagemaker_enable_cloudwatch_metrics': False,
 'sagemaker_container_log_level': 20,
 'sagemaker_region': 'ap-northeast-2'}

In [56]:
# Configure the PyTorch training job: the entry-point script runs on a single
# ml.m4.xlarge instance with framework version 1.4.0.
pytorch_estimator = PyTorch(
    entry_point='HRC_0818_final.py',
    role=role,
    framework_version='1.4.0',
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    base_job_name='step-test',
    # output_path='s3://{}/{}/model-output',  # disabled: no explicit output location is passed
)

In [57]:
# Train on the JSON Lines file stored in the session's default bucket,
# under an explicitly chosen training-job name.
pytorch_estimator.fit(
    inputs='s3://{}/hrms/train/train.jsonl'.format(bucket),
    job_name='step-fit',
)

2020-08-22 04:14:35 Starting - Starting the training job...
2020-08-22 04:14:39 Starting - Launching requested ML instances......
2020-08-22 04:15:47 Starting - Preparing the instances for training......
2020-08-22 04:16:45 Downloading - Downloading input data...
2020-08-22 04:17:30 Training - Downloading the training image...
2020-08-22 04:18:00 Uploading - Uploading generated training model[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2020-08-22 04:17:51,501 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2020-08-22 04:17:51,505 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-08-22 04:17:51,517 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2020-08-22 04:17:51,731 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2020-08-22


2020-08-22 04:18:08 Completed - Training job completed
Training seconds: 83
Billable seconds: 83


In [36]:
# Deploy the trained estimator to one endpoint instance with
# instance_type='local'; the returned predictor is kept for later use.
predictor = pytorch_estimator.deploy(
    initial_instance_count=1,
    instance_type='local',
)

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.
'create_image_uri' will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


Attaching to tmpi7w4a7iy_algo-1-ii0wl_1
[36malgo-1-ii0wl_1  |[0m 2020-08-22 02:42:48,782 [INFO ] main com.amazonaws.ml.mms.ModelServer - 
[36malgo-1-ii0wl_1  |[0m MMS Home: /opt/conda/lib/python3.6/site-packages
[36malgo-1-ii0wl_1  |[0m Current directory: /
[36malgo-1-ii0wl_1  |[0m Temp directory: /home/model-server/tmp
[36malgo-1-ii0wl_1  |[0m Number of GPUs: 0
[36malgo-1-ii0wl_1  |[0m Number of CPUs: 4
[36malgo-1-ii0wl_1  |[0m Max heap size: 3566 M
[36malgo-1-ii0wl_1  |[0m Python executable: /opt/conda/bin/python
[36malgo-1-ii0wl_1  |[0m Config file: /etc/sagemaker-mms.properties
[36malgo-1-ii0wl_1  |[0m Inference address: http://0.0.0.0:8080
[36malgo-1-ii0wl_1  |[0m Management address: http://0.0.0.0:8080
[36malgo-1-ii0wl_1  |[0m Model Store: /.sagemaker/mms/models
[36malgo-1-ii0wl_1  |[0m Initial Models: ALL
[36malgo-1-ii0wl_1  |[0m Log dir: /logs
[36malgo-1-ii0wl_1  |[0m Metrics dir: /logs
[36malgo-1-ii0wl_1  |[0m Netty threads: 0
[36malgo-1-ii0wl_

## Step Functions workflow

import sys
# Upgrade the `stepfunctions` package using the pip of the interpreter given by sys.executable.
# NOTE(review): the version is not pinned, so a re-run may install a different release.
!{sys.executable} -m pip install --upgrade stepfunctions

In [44]:
import uuid

In [45]:
import stepfunctions

from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker import s3_input
from sagemaker.s3 import S3Uploader
from stepfunctions import steps
from stepfunctions.steps import TrainingStep, ModelStep
from stepfunctions.inputs import ExecutionInput
from stepfunctions.workflow import Workflow

# Configure the stepfunctions SDK's stream logger at INFO level.
stepfunctions.set_stream_logger(level=logging.INFO)
# Hex string form of a random UUID.
# NOTE(review): this rebinds the builtin name `id`, and no visible cell reads it —
# confirm whether later cells use it before renaming or removing.
id = uuid.uuid4().hex

In [58]:
# Placeholders that the workflow states below read by key;
# every field is declared with type `str`.
execution_input = ExecutionInput(
    schema=dict.fromkeys(
        ('ModelName', 'TrainTargetLocation', 'TrainingJobName'),
        str,
    )
)

In [59]:
# Display the job name currently recorded on the estimator (private attribute);
# the recorded output matches the job_name passed to fit().
pytorch_estimator._current_job_name

'step-fit'

In [60]:
# Workflow state that runs the estimator's training job. The job name and the
# S3 location of the 'training' channel are read from the execution input.
training_step = steps.TrainingStep(
    'Model Training',
    estimator=pytorch_estimator,
    job_name=execution_input['TrainingJobName'],
    data={'training': s3_input(s3_data=execution_input['TrainTargetLocation'])},
    wait_for_completion=True,
)


# Workflow state built from the model the training step is expected to
# produce; the model name is read from the execution input.
model_step = steps.ModelStep(
    'Create model',
    instance_type='ml.m4.xlarge',
    model_name=execution_input['ModelName'],
    model=training_step.get_expected_model(),
)

In [61]:
# Terminal state reached when the training task fails, so the execution is
# marked as failed instead of continuing to model creation for a job that
# did not complete.
training_failed_state = steps.states.Fail(
    'Training failed',
    cause='SageMakerTrainingJobFailed'
)

# Route task failures of the training step to the Fail state above.
# (Previously the catch pointed at `model_step`, so a failed training job
# still proceeded to the "Create model" state.)
catch_state_training = steps.states.Catch(
    error_equals=["States.TaskFailed"],
    next_step=training_failed_state
)

training_step.add_catch(catch_state_training)


In [62]:
# Main path of the workflow: run the training step, then the model step.
workflow_definition = steps.Chain([training_step, model_step])

In [63]:
# IAM role ARN handed to the Workflow below as its `role`.
# NOTE(review): the account id and role name are hard-coded — confirm whether
# this should come from configuration instead.
workflow_execution_role = 'arn:aws:iam::029498593638:role/StepFunctionsWorkflowExecutionRole'

In [64]:
# Bundle the chained definition, the execution role and the execution-input
# schema into a single named workflow object.
workflow = Workflow(
    name='training_pipeline_kanto_another',
    role=workflow_execution_role,
    definition=workflow_definition,
    execution_input=execution_input,
)

In [54]:
# Render the workflow's graph as the cell output.
workflow.render_graph()

In [65]:
# Create the workflow on AWS Step Functions; the recorded cell output is the
# state machine ARN.
workflow.create()

[32m[INFO] Workflow created successfully on AWS Step Functions.[0m


'arn:aws:states:ap-northeast-2:029498593638:stateMachine:training_pipeline_kanto_another'