In [1]:
import os
import pickle, gzip, urllib.request, json
import numpy as np

# Load the dataset
# The archive is downloaded only when it is not already cached locally, so
# re-running the cell does not fetch it again. The download goes to a
# temporary name first and is renamed afterwards, so an interrupted transfer
# never leaves a truncated file under the final name.
mnist_url = "http://deeplearning.net/data/mnist/mnist.pkl.gz"
mnist_file = "mnist.pkl.gz"
if not os.path.exists(mnist_file):
    partial_file = mnist_file + ".part"
    urllib.request.urlretrieve(mnist_url, partial_file)
    os.replace(partial_file, mnist_file)

# NOTE(security): pickle.load can execute arbitrary code contained in the
# file, and the archive is fetched over plain HTTP. Only load it from a
# source you trust.
with gzip.open(mnist_file, 'rb') as f:
    # Each of the three sets is indexed below as (features, labels).
    train_set, valid_set, test_set = pickle.load(f, encoding='latin1')
print(valid_set[1].shape)

(10000,)


In [2]:
%%time

import os
import boto3
import re
import copy
import time
import io
import struct
from time import gmtime, strftime
from sagemaker import get_execution_role

# Execution role reported by the SageMaker SDK for this session.
role = get_execution_role()

# Region of the default boto3 session; used to build the bucket URL below.
region = boto3.Session().region_name

bucket = 'sagemaker-mnist-datasets'  # Replace with your s3 bucket name
prefix = 'tf-mnist'  # Used as part of the path in the bucket where you store data
bucket_path = f'https://s3-{region}.amazonaws.com/{bucket}'  # The URL to access the bucket


def convert_data():
    """Write each MNIST partition to CSV and upload it to S3.

    Reads the module-level ``train_set``, ``valid_set`` and ``test_set``
    (each indexed as ``(features, labels)``) together with ``bucket`` and
    ``prefix``.

    For the 'train' and 'validation' partitions the label is inserted as
    column 0 in front of the feature columns. The 'test' partition is
    written with the feature columns only.

    Every partition is saved to the local file ``data.csv`` (overwritten on
    each iteration) and uploaded to
    ``s3://<bucket>/<prefix>/<partition>/examples``.
    """
    # One bucket handle is enough for all uploads; no need to build a new
    # session and resource for every partition.
    s3_bucket = boto3.Session().resource('s3').Bucket(bucket)

    data_partitions = [('train', train_set), ('validation', valid_set), ('test', test_set)]
    for data_partition_name, data_partition in data_partitions:
        print('{}: {} {}'.format(data_partition_name, data_partition[0].shape, data_partition[1].shape))

        # Stay in NumPy instead of round-tripping every value through nested
        # Python lists. float64 is what the list-based conversion produced
        # for float input, so the text written by savetxt stays the same.
        features = np.asarray(data_partition[0], dtype=np.float64)
        labels = np.asarray(data_partition[1])

        if data_partition_name != 'test':
            # Insert the labels as column 0 of the feature matrix.
            examples = np.insert(features, 0, labels, axis=1)
        else:
            # The test partition is written without its labels.
            # NOTE(review): presumably because it is only used for
            # inference - confirm.
            examples = features

        np.savetxt('data.csv', examples, delimiter=',')

        key = "{}/{}/examples".format(prefix, data_partition_name)
        url = 's3://{}/{}'.format(bucket, key)
        s3_bucket.Object(key).upload_file('data.csv')
        print('Done writing to {}'.format(url))


convert_data()

train: (50000, 784) (50000,)
Done writing to s3://sagemaker-mnist-datasets/tf-mnist/train/examples
validation: (10000, 784) (10000,)
Done writing to s3://sagemaker-mnist-datasets/tf-mnist/validation/examples
test: (10000, 784) (10000,)
Done writing to s3://sagemaker-mnist-datasets/tf-mnist/test/examples
CPU times: user 36.1 s, sys: 11 s, total: 47.1 s
Wall time: 38.2 s


In [3]:
# S3 prefixes that hold the uploaded training and validation CSV files.
train_data = f's3://{bucket}/{prefix}/train'
validation_data = f's3://{bucket}/{prefix}/validation'

# S3 prefix handed to the estimator as its output_path.
s3_output_location = f's3://{bucket}/{prefix}/tf-mninst-output'
print(train_data)

s3://sagemaker-mnist-datasets/tf-mnist/train


In [4]:
# Channel name -> S3 prefix mapping, passed to tuner.fit(inputs=...).
data_channels = dict(train=train_data, validation=validation_data)
print(data_channels)

{'train': 's3://sagemaker-mnist-datasets/tf-mnist/train', 'validation': 's3://sagemaker-mnist-datasets/tf-mnist/validation'}


In [5]:
import sagemaker
from sagemaker.tensorflow import TensorFlow

# Settings for the script-mode TensorFlow estimator.
model_dir = '/opt/ml/model'
# Git repository and branch given to the estimator via git_config.
git_config = dict(
    repo='https://github.com/xzy0223/sagemaker-test.git',
    branch='master',
)
train_instance_type = 'ml.m5.xlarge'
# Hyperparameter values given to the estimator.
hyperparameters = dict(epochs=1, batch_size=128, learning_rate=0.01, other_para=0.1)

tf_estimator = TensorFlow(
    # Training code
    git_config=git_config,
    source_dir='script_model',
    entry_point='my_train.py',
    # Runtime
    framework_version='2.0.0',
    py_version='py3',
    script_mode=True,
    # Infrastructure
    role=sagemaker.get_execution_role(),
    train_instance_count=1,
    train_instance_type=train_instance_type,
    # Job inputs and outputs
    base_job_name='tf-scriptmode-mnist',
    hyperparameters=hyperparameters,
    model_dir=model_dir,
    output_path=s3_output_location,
)

auto tuning的流程：

首先定义超参数的范围 hyperparameter_ranges，可以通过这种方式定义https://docs.aws.amazon.com/zh_cn/sagemaker/latest/dg/automatic-model-tuning-define-ranges.html

也可以按照对象的方式定义，也就是本例中的定义方式（IntegerParameter, CategoricalParameter, ContinuousParameter）：https://sagemaker.readthedocs.io/en/stable/tuner.html

然后定义要监测的算法的metrics，监测metrics的原理是sagemaker会监测训练容器的stdout和stderr，逐条通过正则解析要监控的metrics
https://docs.aws.amazon.com/zh_cn/sagemaker/latest/dg/automatic-model-tuning-define-metrics.html

定义tuning job要监测的目标metrics，只能指定一个

目标metrics的目标是最大化还是最小化，比如准确率是最大化，loss是最小化

In [6]:
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner
from time import gmtime, strftime 

# Search space explored by the tuning job.
hyperparameter_ranges = dict(
    learning_rate=ContinuousParameter(0.001, 0.2, scaling_type="Logarithmic"),
    epochs=IntegerParameter(1, 5),
    batch_size=IntegerParameter(64, 256),
)

# Regex used to extract the metric from the training output; the capture
# group is the metric value.
metric_definitions = [
    dict(Name='accuracy', Regex='accuracy=(.*?);'),
]

# The single objective metric and the direction in which to optimise it.
objective_metric_name = 'accuracy'
objective_type = 'Maximize'

定义tuner：

已经定义好的estimator

目标metric

超参范围，https://docs.aws.amazon.com/zh_cn/sagemaker/latest/dg/automatic-model-tuning-define-ranges.html

metric定义

最多的job数

并发训练的job数

目标metric类型

超参调优工作方式：默认贝叶斯，还可以选择随机搜索

In [8]:
# Up to 15 training jobs in total, at most 5 running at the same time.
tuner = HyperparameterTuner(
    tf_estimator,
    objective_metric_name,
    hyperparameter_ranges,
    metric_definitions=metric_definitions,
    strategy='Bayesian',
    objective_type=objective_type,
    max_parallel_jobs=5,
    max_jobs=15,
)

# The job name is suffixed with the current UTC day and time.
# NOTE(review): SageMaker limits the length of tuning job names (32
# characters, to be confirmed), which is presumably why year and month are
# left out of the suffix.
tuning_job_name = "tf-scriptmode-mnist-{}".format(
    strftime("%d-%H-%M-%S", gmtime())
)
tuner.fit(job_name=tuning_job_name, inputs=data_channels)
tuner.wait()

...........................................................................................................................................................................................................................!


SageMaker有一批Analytics类用来统计各个训练任务的metrics，可以从Analytics的实例中将这些统计信息导出到CSV或者生成pandas的dataframe，用于分析和展示训练的结果。我们可以通过如下的方法得到这些统计信息并展示

In [9]:
# Look up the analytics for the tuning job by its name.
tuner_metrics = sagemaker.HyperparameterTuningJobAnalytics(tuning_job_name)

# NOTE: the sort is ascending, so the rows with the LOWEST objective value
# come first; with a 'Maximize' objective the best job is the last row.
(
    tuner_metrics
    .dataframe()
    .sort_values(by=['FinalObjectiveValue'], ascending=True)
    .head(5)
)

Unnamed: 0,FinalObjectiveValue,TrainingElapsedTimeSeconds,TrainingEndTime,TrainingJobName,TrainingJobStatus,TrainingStartTime,batch_size,epochs,learning_rate
1,0.0967,117.0,2020-02-28 08:54:57+00:00,tf-scriptmode-mnist-28-08-39-48-014-60286034,Completed,2020-02-28 08:53:00+00:00,70.0,1.0,0.094962
3,0.0991,182.0,2020-02-28 08:53:45+00:00,tf-scriptmode-mnist-28-08-39-48-012-069b465a,Completed,2020-02-28 08:50:43+00:00,135.0,4.0,0.162726
0,0.1009,245.0,2020-02-28 08:57:54+00:00,tf-scriptmode-mnist-28-08-39-48-015-859e3921,Completed,2020-02-28 08:53:49+00:00,103.0,5.0,0.12826
4,0.103,194.0,2020-02-28 08:54:15+00:00,tf-scriptmode-mnist-28-08-39-48-011-81e0cd26,Completed,2020-02-28 08:51:01+00:00,165.0,5.0,0.175887
2,0.8845,183.0,2020-02-28 08:55:43+00:00,tf-scriptmode-mnist-28-08-39-48-013-1c1c2ffa,Completed,2020-02-28 08:52:40+00:00,95.0,3.0,0.087233


也可以通过hyperparameter tuning job的实例（tuner）直接返回对应的Analytics实例，然后进行展示

In [12]:
# Get the analytics object straight from the tuner instead of looking it up
# by job name. The former bare `tuner.best_estimator` expression has been
# removed: it only referenced the attribute, without calling or displaying it.
tuner_metric = tuner.analytics()

# Build the dataframe once and reuse it for both the type check and the table.
tuner_metric_df = tuner_metric.dataframe()
print(type(tuner_metric_df))

# Ascending sort: the lowest objective values are listed first.
tuner_metric_df.sort_values(['FinalObjectiveValue'], ascending=True).head(5)

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,FinalObjectiveValue,TrainingElapsedTimeSeconds,TrainingEndTime,TrainingJobName,TrainingJobStatus,TrainingStartTime,batch_size,epochs,learning_rate
1,0.0967,117.0,2020-02-28 08:54:57+00:00,tf-scriptmode-mnist-28-08-39-48-014-60286034,Completed,2020-02-28 08:53:00+00:00,70.0,1.0,0.094962
3,0.0991,182.0,2020-02-28 08:53:45+00:00,tf-scriptmode-mnist-28-08-39-48-012-069b465a,Completed,2020-02-28 08:50:43+00:00,135.0,4.0,0.162726
0,0.1009,245.0,2020-02-28 08:57:54+00:00,tf-scriptmode-mnist-28-08-39-48-015-859e3921,Completed,2020-02-28 08:53:49+00:00,103.0,5.0,0.12826
4,0.103,194.0,2020-02-28 08:54:15+00:00,tf-scriptmode-mnist-28-08-39-48-011-81e0cd26,Completed,2020-02-28 08:51:01+00:00,165.0,5.0,0.175887
2,0.8845,183.0,2020-02-28 08:55:43+00:00,tf-scriptmode-mnist-28-08-39-48-013-1c1c2ffa,Completed,2020-02-28 08:52:40+00:00,95.0,3.0,0.087233


tuner的deploy方法会把最优的estimator训练出来的模型进行部署，并在输出中展示训练时的日志

In [14]:
# Deploy through the tuner onto a single ml.m5.xlarge instance; the training
# log is echoed in the cell output.
tuner_predictor = tuner.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.xlarge',
)

2020-02-28 08:45:13 Starting - Preparing the instances for training
2020-02-28 08:45:13 Downloading - Downloading input data
2020-02-28 08:45:13 Training - Training image download completed. Training in progress.
2020-02-28 08:45:13 Uploading - Uploading generated training model
2020-02-28 08:45:13 Completed - Training job completed[34m2020-02-28 08:42:32,782 sagemaker-containers INFO     Imported framework sagemaker_tensorflow_container.training[0m
[34m2020-02-28 08:42:32,783 sagemaker-containers INFO     Failed to parse hyperparameter _tuning_objective_metric value accuracy to Json.[0m
[34mReturning the value itself[0m
[34m2020-02-28 08:42:32,789 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-02-28 08:42:32,798 sagemaker_tensorflow_container.training INFO     Appending the training job name to model_dir: /opt/ml/model[0m
[34m2020-02-28 08:42:33,033 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[3



--------------------------------------------------------------!

In [15]:
# Delete the endpoint created by tuner.deploy() once it is no longer needed.
tuner_predictor.delete_endpoint()