# データの準備

In [2]:
import os
import keras
import numpy as np
from keras.datasets import mnist
# Fetch the MNIST splits through the Keras dataset helper.
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Create the local output directory; an already existing directory is fine.
os.makedirs("./data", exist_ok=True)

# Write each split to its own .npz archive, storing the arrays under the
# keys `image` and `label`.
for split_path, split_images, split_labels in (
    ('./data/train', x_train, y_train),
    ('./data/test', x_test, y_test),
):
    np.savez(split_path, image=split_images, label=split_labels)

Using TensorFlow backend.


Downloading data from https://s3.amazonaws.com/img-datasets/mnist.npz


# ローカルでの動作確認
## ローカル用のDockerコンテナのビルド
TensorFlow 2.0 のイメージをベースにした、SageMaker 互換の Dockerfile です。

In [7]:
# Print the Dockerfile used for the container image so the reader can inspect it.
!cat container/Dockerfile

FROM tensorflow/tensorflow:2.0.0a0

RUN pip install sagemaker-containers

# 必要なパッケージをインストールします
# インストールによって生じる不要なファイルは削除します
RUN apt -y update && apt install -y --no-install-recommends \
    wget \
    nginx \
    ca-certificates \
    && apt clean
 
# pythonパッケージをインストールします
# キャッシュファイルは重たいので、削除しておきます
RUN pip install wheel numpy pandas flask gevent gunicorn && \
    rm -rf /root/.cache

# Copies the training code inside the container
COPY tf_codes /opt/ml/code

# Defines train.py as script entry point
ENV SAGEMAKER_PROGRAM train.py
ENV SAGEMAKER_SERVING_MODULE serve.py

ローカルモードで検証するため、ノートブックインスタンス上でこのイメージをビルドします。

In [8]:
!docker build -t tf-2.0 container/.

Sending build context to Docker daemon   34.3kB
Step 1/7 : FROM tensorflow/tensorflow:2.0.0a0
 ---> 2ebc856b5e27
Step 2/7 : RUN pip install sagemaker-containers
 ---> Using cache
 ---> ef98812ac7e3
Step 3/7 : RUN apt -y update && apt install -y --no-install-recommends     wget     nginx     ca-certificates     && apt clean
 ---> Running in 4a550b4c0366
[91m

[0mHit:1 http://archive.ubuntu.com/ubuntu xenial InRelease
Get:2 http://security.ubuntu.com/ubuntu xenial-security InRelease [109 kB]
Get:3 http://archive.ubuntu.com/ubuntu xenial-updates InRelease [109 kB]
Get:4 http://security.ubuntu.com/ubuntu xenial-security/main amd64 Packages [970 kB]
Get:5 http://archive.ubuntu.com/ubuntu xenial-backports InRelease [107 kB]
Get:6 http://archive.ubuntu.com/ubuntu xenial-updates/main amd64 Packages [1347 kB]
Get:7 http://security.ubuntu.com/ubuntu xenial-security/universe amd64 Packages [587 kB]
Get:8 http://archive.ubuntu.com/ubuntu xenial-updates/restricted amd64 Packages [13.1 kB]
Get:9 h

In [9]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.estimator import Estimator

# IAM role the notebook runs under; handed to the estimator below.
role = get_execution_role()

# SageMaker session object, reused later in the notebook (e.g. for S3 upload).
sagemaker_session = sagemaker.Session()

# Estimator that runs the locally built `tf-2.0` image with
# train_instance_type='local', i.e. on this machine rather than on a hosted
# training instance.
estimator = Estimator(
    image_name='tf-2.0',
    role=role,
    hyperparameters={'batch_size': 64, 'epochs': 1},
    train_instance_count=1,
    train_instance_type='local',
)

In [10]:
# Train with the local `data` directory bound to the `train` channel.
local_channels = {'train': 'file://data'}
estimator.fit(local_channels)

Creating tmpqmpd6rzv_algo-1-m7hu3_1 ... 
[1BAttaching to tmpqmpd6rzv_algo-1-m7hu3_12mdone[0m
[36malgo-1-m7hu3_1  |[0m 2019-10-03 16:45:29,505 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)
[36malgo-1-m7hu3_1  |[0m 2019-10-03 16:45:29,522 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)
[36malgo-1-m7hu3_1  |[0m 2019-10-03 16:45:29,539 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)
[36malgo-1-m7hu3_1  |[0m 2019-10-03 16:45:29,552 sagemaker-containers INFO     Invoking user script
[36malgo-1-m7hu3_1  |[0m 
[36malgo-1-m7hu3_1  |[0m Training Env:
[36malgo-1-m7hu3_1  |[0m 
[36malgo-1-m7hu3_1  |[0m {
[36malgo-1-m7hu3_1  |[0m     "module_dir": "/opt/ml/code", 
[36malgo-1-m7hu3_1  |[0m     "channel_input_dirs": {
[36malgo-1-m7hu3_1  |[0m         "train": "/opt/ml/input/data/train"
[36malgo-1-m7hu3_1  |[0m     }, 
[36malgo-1-m7hu3_1  |[0m     "resource_config": {
[36malgo-1-m7hu3_

In [None]:
# Serve the locally trained model from a container on this machine.
# The estimator above was created with train_instance_type='local', so the
# endpoint is requested with instance_type='local' as well, instead of naming
# a hosted instance type in the local-verification section.
# NOTE(review): `predictor` is reassigned by the hosted deployment later in the
# notebook; stop this local endpoint first if it should not keep running.
predictor = estimator.deploy(instance_type='local', initial_instance_count=1)

W1003 16:46:14.698861 140552688699200 connectionpool.py:662] Retrying (Retry(total=2, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fd49924b5f8>: Failed to establish a new connection: [Errno 111] Connection refused',)': /ping
W1003 16:46:14.705081 140552688699200 connectionpool.py:662] Retrying (Retry(total=1, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fd49924b198>: Failed to establish a new connection: [Errno 111] Connection refused',)': /ping
W1003 16:46:14.706547 140552688699200 connectionpool.py:662] Retrying (Retry(total=0, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fd49924b518>: Failed to establish a new connection: [Errno 111] Connection refused',)': /ping


Attaching to tmpczpei0dx_algo-1-wejmj_1
[36malgo-1-wejmj_1  |[0m [33mDEPRECATION: Python 2.7 will reach the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 won't be maintained after that date. A future version of pip will drop support for Python 2.7.[0m
[36malgo-1-wejmj_1  |[0m Processing /opt/ml/code
[36malgo-1-wejmj_1  |[0m Building wheels for collected packages: train
[36malgo-1-wejmj_1  |[0m   Building wheel for train (setup.py) ... [?25ldone
[36malgo-1-wejmj_1  |[0m [?25h  Stored in directory: /tmp/pip-ephem-wheel-cache-o3nM0t/wheels/35/24/16/37574d11bf9bde50616c67372a334f94fa8356bc7164af8ca3
[36malgo-1-wejmj_1  |[0m Successfully built train
[36malgo-1-wejmj_1  |[0m Installing collected packages: train
[36malgo-1-wejmj_1  |[0m Successfully installed train-1.0.0
[36malgo-1-wejmj_1  |[0m [33mYou are using pip version 19.0.3, however version 19.2.3 is available.
[36malgo-1-wejmj_1  |[0m You should consider upgrading via the 'pip

W1003 16:46:19.709921 140552688699200 connectionpool.py:662] Retrying (Retry(total=2, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fd49924bc50>: Failed to establish a new connection: [Errno 111] Connection refused',)': /ping
W1003 16:46:19.711754 140552688699200 connectionpool.py:662] Retrying (Retry(total=1, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fd49924ba58>: Failed to establish a new connection: [Errno 111] Connection refused',)': /ping
W1003 16:46:19.713486 140552688699200 connectionpool.py:662] Retrying (Retry(total=0, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fd4992b0c50>: Failed to establish a new connection: [Errno 111] Connection refused',)': /ping
W1003 16:4

# コンテナのビルドとECRへの登録

In [12]:
# Run the helper script under `container/`; per this section's heading it
# builds the image and registers it in ECR.
!bash container/build_and_push.sh

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded
Sending build context to Docker daemon  33.28kB
Step 1/5 : FROM tensorflow/tensorflow:2.0.0a0
 ---> 2ebc856b5e27
Step 2/5 : RUN pip install sagemaker-containers
 ---> Using cache
 ---> dbe5fb9d9dd5
Step 3/5 : COPY tf_codes /opt/ml/code
 ---> Using cache
 ---> afd31f563dd0
Step 4/5 : ENV SAGEMAKER_PROGRAM train.py
 ---> Using cache
 ---> 8a5cdf3330b5
Step 5/5 : ENV SAGEMAKER_SERVING_MODULE serve.py
 ---> Using cache
 ---> e5475ce1fcbd
Successfully built e5475ce1fcbd
Successfully tagged sagemaker-tf2.0-example:latest
The push refers to repository [815969174475.dkr.ecr.us-east-1.amazonaws.com/sagemaker-tf2.0-example]

[1B0f8a9227: Preparing 
[1Bb1817ceb: Preparing 
[1B409a88ea: Preparing 
[1Bbf629a2c: Preparing 
[1B452db38c: Preparing 
[1Bb6322759: Preparing 
[1B2331cf80: Preparing 
[1B4a53a228: Preparing 
[1Ba0c9a8cd: Preparing 
[1B91ae09b8: Preparing 
[1B8b4c3da7: Preparing 
[2B8b4

ECR に登録したコンテナイメージの URI を取得します。

In [13]:
import boto3

# Repository name of the pushed image.
algorithm_name = 'sagemaker-tf2.0-example'

# Account ID of the current caller, looked up through STS.
account = boto3.client('sts').get_caller_identity()['Account']

# Region configured for the default boto3 session.
region = boto3.session.Session().region_name

# Assemble the fully qualified ECR image URI, using the `latest` tag.
ecr_image = '{}.dkr.ecr.{}.amazonaws.com/{}:latest'.format(
    account,
    region,
    algorithm_name,
)

print(ecr_image)

815969174475.dkr.ecr.us-east-1.amazonaws.com/sagemaker-tf2.0-example:latest


## データのS3へのアップロード

In [14]:
# Use the session's default bucket as the upload target.
bucket_name = sagemaker_session.default_bucket()

# Upload the local `./data` directory under the `dataset/mnist` prefix and keep
# the returned location for the training job.
input_data = sagemaker_session.upload_data(
    path='./data',
    bucket=bucket_name,
    key_prefix='dataset/mnist',
)
print('Training data is uploaded to: {}'.format(input_data))

Training data is uploaded to: s3://sagemaker-us-east-1-815969174475/dataset/mnist


In [15]:
# Estimator for the hosted run: same hyperparameters as the local check, but it
# uses the ECR image and requests one ml.p2.xlarge training instance.
estimator = Estimator(
    image_name=ecr_image,
    role=role,
    base_job_name='tf20-example',
    hyperparameters={'batch_size': 64, 'epochs': 1},
    train_instance_count=1,
    train_instance_type='ml.p2.xlarge',
)

In [17]:
# Start the training job with the uploaded data bound to the `train` channel.
# `input_data` is the location string returned by the upload step, so it is
# passed as-is.
estimator.fit({'train': input_data})

2019-10-03 07:32:57 Starting - Starting the training job...
2019-10-03 07:32:59 Starting - Launching requested ML instances......
2019-10-03 07:34:00 Starting - Preparing the instances for training...
2019-10-03 07:34:55 Downloading - Downloading input data...
2019-10-03 07:35:12 Training - Downloading the training image.....[31m2019-10-03 07:36:05,425 sagemaker-containers INFO     Invoking user script
[0m
[31mTraining Env:
[0m
[31m{
    "module_dir": "/opt/ml/code", 
    "channel_input_dirs": {
        "train": "/opt/ml/input/data/train"
    }, 
    "resource_config": {
        "hosts": [
            "algo-1"
        ], 
        "network_interface_name": "eth0", 
        "current_host": "algo-1"
    }, 
    "num_cpus": 4, 
    "log_level": 20, 
    "output_intermediate_dir": "/opt/ml/output/intermediate", 
    "input_config_dir": "/opt/ml/input/config", 
    "additional_framework_parameters": {}, 
    "output_data_dir": "/opt/ml/output/data", 
    "output_dir": "/opt/ml/output", 

In [None]:
# Deploy the trained model to a single ml.m4.xlarge endpoint instance.
predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
)

In [None]:
%matplotlib inline
import random
import matplotlib.pyplot as plt

# Number of random test digits to display and send to the endpoint.
num_samples = 5

# Draw distinct indices over the whole test set. `range(n)` already stops at
# n - 1, so no extra `- 1` is needed; with it, the last test image could never
# be selected.
indices = random.sample(range(x_test.shape[0]), num_samples)

# Divide the selected images by 255 (presumably scaling 8-bit pixel values into
# [0, 1] -- confirm against the training script) and take the matching labels.
images, labels = x_test[indices]/255, y_test[indices]

# Show the sampled digits side by side, each titled with its true label.
for i in range(num_samples):
    plt.subplot(1, num_samples, i + 1)
    plt.imshow(images[i].reshape(28, 28), cmap='gray')
    plt.title(labels[i])
    plt.axis('off')

# Send the batch, reshaped to (num_samples, 28, 28, 1), to the endpoint.
# NOTE(review): indexing the response with 'predictions' assumes the predictor
# serializes the array and returns a parsed dict -- confirm the predictor's
# serializer/deserializer configuration and serve.py.
prediction = predictor.predict(images.reshape(num_samples, 28, 28, 1))['predictions']
prediction = np.array(prediction)

# The highest-scoring entry along axis 1 is taken as the predicted label.
predicted_label = prediction.argmax(axis=1)
print('The predicted labels are: {}'.format(predicted_label))

In [None]:
# Delete the endpoint behind `predictor` once the predictions are done.
predictor.delete_endpoint()