In [None]:
## 準備

In [None]:
import sys
# Optional one-time setup: uncomment the next line to install or upgrade the
# `stepfunctions` package using the same interpreter this kernel runs on.
#!{sys.executable} -m pip install --upgrade stepfunctions

## データの準備、S3 へアップロード

In [None]:
import boto3
import sagemaker
import time
import random
import uuid
import logging
import stepfunctions
import io
import random

from sagemaker.amazon.amazon_estimator import get_image_uri
from stepfunctions import steps
from stepfunctions.steps import TrainingStep, ModelStep, TransformStep
from stepfunctions.inputs import ExecutionInput
from stepfunctions.workflow import Workflow
from stepfunctions.template import TrainingPipeline
from stepfunctions.template.utils import replace_parameters_with_jsonpath


import sagemaker
from sagemaker import get_execution_role
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput

# IAM role attached to this notebook instance; it is passed as `role` to the
# SageMaker processor and estimator defined further down.
role = get_execution_role()

# IAM role handed to the Step Functions Workflow as its execution role.
# NOTE(review): this ARN is hard-coded to a single AWS account; replace it
# with your own role before running the notebook elsewhere.
workflow_execution_role = "arn:aws:iam::815969174475:role/StepFunctionsWorkflowExecutionRole-bp"

sagemaker_session = sagemaker.Session()

# Upload the raw train/test CSV files under one shared key prefix and keep
# the returned locations for the processing steps.
raw_data_key_prefix = 'kaggle-ml-pipeline/data'
input_train = sagemaker_session.upload_data(
    path='./data/train.csv',
    key_prefix=raw_data_key_prefix,
)
input_test = sagemaker_session.upload_data(
    path='./data/test.csv',
    key_prefix=raw_data_key_prefix,
)

In [None]:
# Emit Step Functions SDK log messages at INFO level in the notebook output.
stepfunctions.set_stream_logger(level=logging.INFO)

region = boto3.Session().region_name
# Fix: the SageMaker session object is named `sagemaker_session`; the bare
# name `session` was never defined and raised NameError on a fresh kernel.
bucket = sagemaker_session.default_bucket()
# NOTE(review): `prefix` and `bucket_path` are not referenced by any later
# cell visible in this notebook; confirm whether they are still needed.
prefix = 'sagemaker/DEMO-xgboost-regression'
bucket_path = 'https://s3-{}.amazonaws.com/{}'.format(region, bucket)

## データ前処理用のコンテナの準備

In [None]:
!docker build -t sagemaker-kaggle-titanic-preprocess ./scripts/preprocess

import boto3

# boto3の機能を使ってリポジトリ名に必要な情報を取得する
account_id = boto3.client('sts').get_caller_identity().get('Account')
region = boto3.session.Session().region_name
tag = ':latest'

# SageMakerFullAccess を使っているから repository 名の中に sagemaker が含まれている必要がある
ecr_repository = f'sagemaker-kaggle-titanic-preprocess'
image_uri = f'{account_id}.dkr.ecr.{region}.amazonaws.com/{ecr_repository+tag}'

!$(aws ecr get-login --region $region --registry-ids $account_id --no-include-email)
 
# リポジトリの作成
# すでにある場合はこのコマンドは必要ない
!aws ecr create-repository --repository-name $ecr_repository
 
!docker build -t {ecr_repository} .
!docker tag {ecr_repository + tag} $image_uri
!docker push $image_uri

print(f'コンテナは {image_uri} へ登録されています。')

## ワークフロー定義

In [None]:
# SageMaker needs a unique name for each training job, model and endpoint, so
# these names are supplied as placeholders filled in at execution time.
workflow_input_schema = {
    'JobName': str,
    'ModelName': str,
    'EndpointName': str,
    'PreprocessingJobName': str,
}
execution_input = ExecutionInput(schema=workflow_input_schema)

### 学習

### 学習用前処理

In [None]:
job_name = 'sagemaker-kaggle-preprocessing-train'
output_s3_path = 's3://' + sagemaker_session.default_bucket() + '/kaggle-ml-pipeline'

# Directories inside the processing container that inputs/outputs are mapped to.
processing_input_dir = '/opt/ml/processing/input'
processing_code_dir = '/opt/ml/processing/input/code'
processing_output_dir = '/opt/ml/processing/output'


PREPROCESSING_SCRIPT_LOCATION = './scripts/preprocess/preprocess_script/preprocess.py'

# Upload the preprocessing script so the processing job can mount it as an input.
input_code = sagemaker_session.upload_data(
    PREPROCESSING_SCRIPT_LOCATION,
    bucket=sagemaker_session.default_bucket(),
    key_prefix='kaggle-ml-pipeline/preprocess/code',
)

output_s3_path_preprocess = output_s3_path + '/preprocessed'

processor = ScriptProcessor(base_job_name=job_name,
                            image_uri=image_uri,
                            command=['python3'],
                            role=role,
                            instance_count=1,
                            instance_type='ml.c5.xlarge')

# Fix: derive the in-container script path from the file that was actually
# uploaded. The entrypoint used to hard-code "preprocessing.py" while the
# uploaded file is "preprocess.py", so the container could not find the script.
train_script_in_container = processing_code_dir + '/' + PREPROCESSING_SCRIPT_LOCATION.split('/')[-1]

train_preprocess_step = steps.ProcessingStep(
    'Preprocess for Training Step',
    processor=processor,
    job_name=execution_input["PreprocessingJobName"],
    inputs=[
        ProcessingInput(source=input_code, destination=processing_code_dir),
        ProcessingInput(source=input_train, destination=processing_input_dir),
    ],
    outputs=[ProcessingOutput(source=processing_output_dir, destination=output_s3_path_preprocess)],
    container_arguments=[
        '--data_type', 'train',
        '--input_dir', processing_input_dir,
        '--output_dir', processing_output_dir,
    ],
    container_entrypoint=["python3", train_script_in_container],
)

In [None]:
from sagemaker.sklearn.estimator import SKLearn

# S3 prefix that receives the trained model artifacts.
output_s3_path_train = output_s3_path + '/train'

# Training data location.
# NOTE(review): assumes the preprocessing script writes `train.csv` into its
# output directory; confirm against the script.
train_input = output_s3_path_preprocess + '/train.csv'

# Scikit-learn estimator that runs scripts/train/train.py.
# NOTE(review): `train_instance_type` is the SageMaker SDK v1 spelling (v2 names
# it `instance_type`); confirm which SDK version this notebook targets.
sklearn = SKLearn(
    role=role,
    entry_point='scripts/train/train.py',
    framework_version="0.23-1",
    train_instance_type="ml.m5.xlarge",
    output_path=output_s3_path_train,
)

# Workflow step that launches the training job under the name supplied at
# execution time.
training_step = steps.TrainingStep(
    'Train Step',
    estimator=sklearn,
    job_name=execution_input['JobName'],
    data={'train': train_input},
)

## 推論

### モデル作成ステップ

In [None]:
# Model description the training step is expected to produce.
expected_model = training_step.get_expected_model()

# Workflow step that registers that model under the name supplied at execution time.
model_step = steps.ModelStep(
    'Save model',
    model=expected_model,
    model_name=execution_input['ModelName'],
)

In [None]:
job_name = 'sagemaker-kaggle-preprocessing-test'

processor = ScriptProcessor(base_job_name=job_name,
                            image_uri=image_uri,
                            command=['python3'],
                            role=role,
                            instance_count=1,
                            instance_type='ml.c5.xlarge')

# Fix: derive the in-container script path from the file that was actually
# uploaded. The entrypoint used to hard-code "preprocessing.py" while the
# uploaded file is "preprocess.py", so the container could not find the script.
test_script_in_container = processing_code_dir + '/' + PREPROCESSING_SCRIPT_LOCATION.split('/')[-1]

# NOTE(review): this step uses the same "PreprocessingJobName" placeholder as
# the training preprocessing step. Processing job names must be unique, so two
# jobs in one execution will likely collide; a dedicated placeholder (added to
# the ExecutionInput schema) is probably required. Confirm and adjust.
test_preprocess_step = steps.ProcessingStep(
    'Preprocess for Test Step',
    processor=processor,
    job_name=execution_input["PreprocessingJobName"],
    inputs=[
        ProcessingInput(source=input_code, destination=processing_code_dir),
        ProcessingInput(source=input_test, destination=processing_input_dir),
    ],
    outputs=[ProcessingOutput(source=processing_output_dir, destination=output_s3_path_preprocess)],
    container_arguments=[
        '--data_type', 'test',
        '--input_dir', processing_input_dir,
        '--output_dir', processing_output_dir,
    ],
    container_entrypoint=["python3", test_script_in_container],
)

In [None]:
# S3 prefix that receives the batch-transform predictions.
output_s3_path_inference = output_s3_path + '/batch_inference'
transformer = sklearn.transformer(instance_count=1,
                                  instance_type='ml.m5.xlarge',
                                  output_path=output_s3_path_inference)

# Input for batch inference: the preprocessed test data.
# NOTE(review): assumes the preprocessing script writes `test.csv` for
# `--data_type test`, mirroring `train.csv` used for training; confirm.
test_input = output_s3_path_preprocess + '/test.csv'

# Fixes relative to the original cell:
#  * use the `transformer` configured above (the step previously built a second
#    transformer without `output_path`, leaving the configured one unused);
#  * read from the preprocessed test data instead of the inference *output* prefix;
#  * send the data as CSV rather than `text/libsvm`.
transform_step = steps.TransformStep(
    'Transform Input Dataset',
    transformer=transformer,
    job_name=execution_input['JobName'],
    model_name=execution_input['ModelName'],
    data=test_input,
    content_type='text/csv'
)

In [None]:
# Terminal Fail state the workflow transitions to when a guarded step errors out.
failed_state_sagemaker_processing_failure = steps.states.Fail(
    "ML Workflow failed",
    cause="SageMakerProcessingJobFailed",
)

In [None]:
# Catch rule: on States.TaskFailed, route the execution to the Fail state.
catch_state_processing = steps.states.Catch(
    error_equals=["States.TaskFailed"],
    next_step=failed_state_sagemaker_processing_failure,
)

# Attach the same catch rule to each task step of the pipeline.
for guarded_step in (
    train_preprocess_step,
    training_step,
    test_preprocess_step,
    transform_step,
):
    guarded_step.add_catch(catch_state_processing)

In [None]:
# Fixes relative to the original cell:
#  * `Chain` was never imported (NameError); reference it through the imported
#    `steps` module instead.
#  * `model_step` was missing from the chain, yet the transform step refers to
#    the model by name, so the model has to be created before the transform runs.
workflow_graph = steps.Chain([
    train_preprocess_step,
    training_step,
    model_step,
    test_preprocess_step,
    transform_step,
])

workflow = Workflow(
    name="titanic-ml-pipeline",
    definition=workflow_graph,
    role=workflow_execution_role,
)


# Display the state machine graph as the cell output.
workflow.render_graph()

In [None]:
# Keep the workflow's CloudFormation template in `template` for later use.
template = workflow.get_cloudformation_template()

### Workflow の実行

In [None]:
# Register the state machine before running it: `Workflow.execute()` requires
# the workflow to exist on AWS Step Functions. If a workflow with this name
# already exists, `create()` attaches to it without updating its definition
# (call `workflow.update(...)` to push changes).
workflow.create()

# Short random suffix so every run gets fresh, unique SageMaker resource names.
run_suffix = uuid.uuid4().hex[:8]

# Fix: the input keys now match the ExecutionInput schema referenced by the
# steps (JobName / ModelName / EndpointName / PreprocessingJobName), and the
# values are generated here instead of relying on undefined variables.
execution = workflow.execute(
    inputs={
        "JobName": f"kaggle-titanic-job-{run_suffix}",
        "ModelName": f"kaggle-titanic-model-{run_suffix}",
        "EndpointName": f"kaggle-titanic-endpoint-{run_suffix}",
        "PreprocessingJobName": f"kaggle-titanic-preprocess-{run_suffix}",
    }
)