In [None]:
import pandas as pd

import sagemaker
from sagemaker import get_execution_role
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput

# IAM execution role for this notebook, resolved by the SageMaker SDK.
# It is passed as `role=` to the ScriptProcessor and SKLearn estimator below.
role = get_execution_role()

In [None]:
# Session object used for the S3 uploads here and the default-bucket lookup later.
sagemaker_session = sagemaker.Session()

# Upload the raw Kaggle CSVs to S3 under the shared "kaggle-ml-pipeline/data" prefix.
# The returned values are later used as the `source` of the processing inputs.
input_train = sagemaker_session.upload_data(
    key_prefix="kaggle-ml-pipeline/data",
    path="./data/train.csv",
)
input_test = sagemaker_session.upload_data(
    key_prefix="kaggle-ml-pipeline/data",
    path="./data/test.csv",
)

In [None]:
# Print the Dockerfile used for the preprocessing container so the reader can inspect it.
!cat ./scripts/preprocess/Dockerfile

In [None]:
# Build the preprocessing image locally, using ./scripts/preprocess as the build context,
# and tag it `sagemaker-kaggle-titanic-preprocess`.
!docker build -t sagemaker-kaggle-titanic-preprocess ./scripts/preprocess

In [None]:
import boto3

# boto3の機能を使ってリポジトリ名に必要な情報を取得する
account_id = boto3.client('sts').get_caller_identity().get('Account')
region = boto3.session.Session().region_name
tag = ':latest'

# SageMakerFullAccess を使っているから repository 名の中に sagemaker が含まれている必要がある
ecr_repository = f'sagemaker-kaggle-titanic-preprocess'
image_uri = f'{account_id}.dkr.ecr.{region}.amazonaws.com/{ecr_repository+tag}'

!$(aws ecr get-login --region $region --registry-ids $account_id --no-include-email)
 
# リポジトリの作成
# すでにある場合はこのコマンドは必要ない
!aws ecr create-repository --repository-name $ecr_repository
 
!docker build -t {ecr_repository} .
!docker tag {ecr_repository + tag} $image_uri
!docker push $image_uri

print(f'コンテナは {image_uri} へ登録されています。')

In [None]:
# Base name given to the processing job that preprocesses the training data.
job_name = 'sagemaker-kaggle-preprocessing-train'

# Directories handed to the preprocessing script and used as the
# ProcessingInput destination / ProcessingOutput source.
processing_input_dir = '/opt/ml/processing/input'
processing_output_dir = '/opt/ml/processing/output'

# S3 root for this pipeline inside the session's default bucket, and the
# sub-prefix that receives the preprocessed files.
output_s3_path = f's3://{sagemaker_session.default_bucket()}/kaggle-ml-pipeline'
output_s3_path_preprocess = f'{output_s3_path}/preprocessed'

In [None]:
# Processor that runs a script with `python3` inside the custom image pushed to ECR,
# on a single ml.c5.xlarge instance.
processor = ScriptProcessor(
    role=role,
    image_uri=image_uri,
    command=["python3"],
    instance_type="ml.c5.xlarge",
    instance_count=1,
    base_job_name=job_name,
)

In [None]:
# Run the preprocessing script on the uploaded training CSV.
# The input is mapped to `processing_input_dir`, and whatever the script leaves in
# `processing_output_dir` is sent to `output_s3_path_preprocess`.
processor.run(
    code="./scripts/preprocess/preprocess_script/preprocess.py",  # an S3 URI is also accepted
    arguments=[
        "--data_type", "train",
        "--input_dir", processing_input_dir,
        "--output_dir", processing_output_dir,
    ],
    inputs=[
        ProcessingInput(source=input_train, destination=processing_input_dir),
    ],
    outputs=[
        ProcessingOutput(source=processing_output_dir, destination=output_s3_path_preprocess),
    ],
)

## 学習

In [None]:
from sagemaker.sklearn.estimator import SKLearn

# S3 prefix passed to the estimator as its output path.
output_s3_path_train = f'{output_s3_path}/train'

# scikit-learn estimator whose entry point is scripts/train/train.py.
# To try local mode, pass train_instance_type="local" instead.
# NOTE(review): `train_instance_type` is the SageMaker Python SDK v1 spelling
# (v2 calls it `instance_type`) — confirm against the installed SDK version.
sklearn = SKLearn(
    role=role,
    entry_point="scripts/train/train.py",
    framework_version="0.23-1",
    train_instance_type="ml.m5.xlarge",
    output_path=output_s3_path_train,
)

In [None]:
# Fit the estimator on the preprocessed training data, supplied as the "train" channel.
# Assumes the preprocessing job wrote `train.csv` to the preprocessed prefix —
# TODO confirm against preprocess.py.
train_input = f'{output_s3_path_preprocess}/train.csv'
sklearn.fit(inputs={'train': train_input})

## 推論

In [None]:
# Base name for the processing job that preprocesses the test data.
# Note that this rebinds `job_name` and `processor` from the training-data run.
job_name = 'sagemaker-kaggle-preprocessing-test'

# Same container, command and instance settings as the training-data processor;
# only the base job name differs.
processor = ScriptProcessor(
    role=role,
    image_uri=image_uri,
    command=["python3"],
    instance_type="ml.c5.xlarge",
    instance_count=1,
    base_job_name=job_name,
)

# Run the same preprocessing script on the uploaded test CSV with --data_type test.
# Output goes to the same S3 prefix as the training-data run.
processor.run(
    code="./scripts/preprocess/preprocess_script/preprocess.py",  # an S3 URI is also accepted
    arguments=[
        "--data_type", "test",
        "--input_dir", processing_input_dir,
        "--output_dir", processing_output_dir,
    ],
    inputs=[
        ProcessingInput(source=input_test, destination=processing_input_dir),
    ],
    outputs=[
        ProcessingOutput(source=processing_output_dir, destination=output_s3_path_preprocess),
    ],
)

In [None]:
# S3 prefix passed to the transformer as its output path.
output_s3_path_inference = f'{output_s3_path}/batch_inference'

# Batch transformer derived from the fitted estimator, on a single ml.m5.xlarge instance.
transformer = sklearn.transformer(
    output_path=output_s3_path_inference,
    instance_type="ml.m5.xlarge",
    instance_count=1,
)

In [None]:
# Run batch inference over the preprocessed test data, sent as CSV.
# Assumes the preprocessing job wrote `test.csv` to the preprocessed prefix —
# TODO confirm against preprocess.py.
test_input = f'{output_s3_path_preprocess}/test.csv'

transformer.transform(content_type='text/csv', data=test_input)

# Report which job is being waited on, then block until it finishes.
print(f'Waiting for transform job: {transformer.latest_transform_job.job_name}')
transformer.wait()