<font size=5><center><big><b>在SageMaker上训练推荐模型</b></big></center></font>

### 准备训练数据

In [11]:
# S3 bucket and key prefix under which the sample data lives.
bucket = 'video-rec-resources'
prefix = 'data/output/sampledata'

# Fully qualified S3 URIs for the training set, the validation set and
# a location intended for training output.
train_data = f's3://{bucket}/{prefix}/trainingSamples'
validation_data = f's3://{bucket}/{prefix}/testSamples'
s3_output_location = f's3://{bucket}/{prefix}/training_output'

# Channel name -> S3 URI mapping that is handed to estimator.fit().
data_channels = {
    'train': train_data,
    'validation': validation_data,
}
print(data_channels)

{'train': 's3://video-rec-resources/data/output/sampledata/trainingSamples', 'validation': 's3://video-rec-resources/data/output/sampledata/testSamples'}


### 创建TensorFlow Estimator

In [2]:
import sagemaker
from sagemaker.tensorflow import TensorFlow

# Common estimator settings shared by the training job below.
model_dir = '/opt/ml/model'  # passed to the estimator as model_dir
train_instance_type = 'ml.m5.xlarge'
hyperparameters = {'epochs': 1, 'batch_size': 12, 'learning_rate': 0.001}

# Regexes used to extract metrics from the training log. Each pattern
# captures a decimal number such as "0.8123" in group 1.
# Raw strings keep "\s" from being treated as an (invalid) Python escape
# sequence, and "\." matches a literal decimal point instead of any character.
# NOTE(review): the 'auc:' pattern also matches inside e.g. 'val_auc: ...'
# if the training script logs such a line -- confirm that is intended.
metric_definitions = [
    {
        'Name': 'accuracy',
        'Regex': r'accuracy:\s([0-1]\.[0-9]*)'
    },
    {
        'Name': 'roc_auc',
        'Regex': r'auc:\s([0-1]\.[0-9]*)'
    },
    {
        'Name': 'pr_auc',
        'Regex': r'auc_1:\s([0-1]\.[0-9]*)'
    }
]

In [3]:
# Create the TensorFlow estimator for the Wide & Deep training script.
tf_estimator = TensorFlow(
    # Training script and framework runtime
    entry_point='./tf_model/WideNDeep-sm.py',
    framework_version='2.3.0',
    py_version='py37',
    script_mode=True,
    # Job identity and permissions
    base_job_name='tf-scriptmode-rec',
    role=sagemaker.get_execution_role(),
    # Compute resources
    instance_type=train_instance_type,
    instance_count=1,
    # Training configuration
    model_dir=model_dir,
    hyperparameters=hyperparameters,
    # Metrics extracted from the job log via the regexes in metric_definitions
    enable_sagemaker_metrics=True,
    metric_definitions=metric_definitions,
)

### 模型训练

In [4]:
# Start the training job on the 'train' and 'validation' channels; the job log is streamed below.
tf_estimator.fit(inputs=data_channels)

2021-04-21 12:21:59 Starting - Starting the training job...
2021-04-21 12:22:22 Starting - Launching requested ML instancesProfilerReport-1619007719: InProgress
......
2021-04-21 12:23:23 Starting - Preparing the instances for training......
2021-04-21 12:24:23 Downloading - Downloading input data
2021-04-21 12:24:23 Training - Downloading the training image...
2021-04-21 12:24:44 Training - Training image download completed. Training in progress.[34m2021-04-21 12:24:48,345 sagemaker-training-toolkit INFO     Imported framework sagemaker_tensorflow_container.training[0m
[34m2021-04-21 12:24:48,352 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-04-21 12:24:48,774 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-04-21 12:24:48,791 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-04-21 12:24:48,806 sagemaker-training-toolkit INFO     No GPUs 

### 模型部署

In [5]:
# Deploy the trained model to an endpoint backed by a single ml.m5.xlarge instance.
tf_predictor = tf_estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.xlarge',
)

update_endpoint is a no-op in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


-------------!

In [131]:
# Delete the deployed endpoint (and, per the log output below, its endpoint configuration) once it is no longer needed.
tf_predictor.delete_endpoint()

INFO:sagemaker:Deleting endpoint configuration with name: tf-scriptmode-rec-2021-04-19-14-51-10-680
INFO:sagemaker:Deleting endpoint with name: tf-scriptmode-rec-2021-04-19-14-51-10-680


### 通过Experiment评估训练效果

In [14]:
!pip install sagemaker-experiments

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Collecting sagemaker-experiments
  Using cached sagemaker_experiments-0.1.30-py3-none-any.whl (42 kB)
Installing collected packages: sagemaker-experiments
Successfully installed sagemaker-experiments-0.1.30


In [16]:
import time

import boto3

from sagemaker import get_execution_role
from sagemaker.session import Session
from sagemaker.analytics import ExperimentAnalytics

from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
from smexperiments.tracker import Tracker

# boto3 session and low-level SageMaker client used by the Experiment/Trial calls below.
sess = boto3.Session()
sm = sess.client(service_name='sagemaker')

# IAM execution role of this notebook environment.
role = get_execution_role()

In [17]:
# Create the experiment that groups all training trials of the recommendation model.
# The name carries the current Unix timestamp (whole seconds) as a suffix.
rec_model_experiment = Experiment.create(
    sagemaker_boto_client=sm,
    experiment_name=f"rec-model-experiment-{int(time.time())}",
    description="rec-model-experiment",
)

In [24]:
# Settings that do not depend on the learning rate are built once, outside the loop.
model_dir = '/opt/ml/model'
train_instance_type = 'ml.m5.xlarge'

# Regexes used to extract metrics from the training log; group 1 captures the value.
# Raw strings avoid the invalid "\s" escape sequence, and "\." matches a literal
# decimal point instead of any character.
metric_definitions = [
    {
        'Name': 'accuracy',
        'Regex': r'accuracy:\s([0-1]\.[0-9]*)'
    },
    {
        'Name': 'roc_auc',
        'Regex': r'auc:\s([0-1]\.[0-9]*)'
    },
    {
        'Name': 'pr_auc',
        'Regex': r'auc_1:\s([0-1]\.[0-9]*)'
    }
]

# One trial, and one asynchronous training job, per candidate learning rate.
for lr in [0.001, 0.01, 0.1]:

    # Tag derived from the learning rate, e.g. 0.001 -> "001". Taking the last
    # piece ([-1]) also works for values that print without a '.', such as
    # 1e-05, where indexing [1] would raise IndexError.
    lr_tag = str(lr).split('.')[-1]
    trial_name = f"wd-training-trial-{lr_tag}-{int(time.time())}"

    rec_model_trial = Trial.create(
        trial_name=trial_name,
        experiment_name=rec_model_experiment.experiment_name,
        sagemaker_boto_client=sm,
    )

    # Only the learning rate varies between trials.
    hyperparameters = {'epochs': 1, 'batch_size': 12, 'learning_rate': lr}

    wd_estimator_trial = TensorFlow(
        entry_point='./tf_model/WideNDeep-sm.py',
        model_dir=model_dir,
        instance_type=train_instance_type,
        instance_count=1,
        hyperparameters=hyperparameters,
        # Reuse the execution role resolved in the experiment setup cell
        # instead of looking it up again on every iteration.
        role=role,
        base_job_name='tf-scriptmode-rec',
        framework_version='2.3.0',
        py_version='py37',
        enable_sagemaker_metrics=True,
        metric_definitions=metric_definitions,
        script_mode=True,
    )

    # Attach the training job to its trial. wait=False returns right after the
    # job is submitted, so the loop does not block on each training run.
    wd_estimator_trial.fit(
        inputs=data_channels,
        experiment_config={
            "TrialName": rec_model_trial.trial_name,
            "TrialComponentDisplayName": rec_model_trial.trial_name + "-Training",
        },
        wait=False,
    )

    # Short pause between submissions -- presumably to space out the API
    # calls / timestamped job names; TODO confirm.
    time.sleep(2)

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: tf-scriptmode-rec-2021-04-21-13-19-00-799
INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: tf-scriptmode-rec-2021-04-21-13-19-04-215
INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: tf-scriptmode-rec-2021-04-21-13-19-08-181


### 通过Debugger调试模型训练

In [12]:
import sys
# Run pip through the kernel's own interpreter so smdebug is installed (or upgraded, -U)
# in the same environment this notebook executes in.
!{sys.executable} -m pip install -U smdebug

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Collecting smdebug
  Using cached smdebug-1.0.8-py2.py3-none-any.whl (263 kB)
Collecting pyinstrument>=3.1.3
  Using cached pyinstrument-3.4.1-py2.py3-none-any.whl (81 kB)
Collecting pyinstrument-cext>=0.2.2
  Using cached pyinstrument_cext-0.2.4-cp37-cp37m-manylinux2010_x86_64.whl (20 kB)
Installing collected packages: pyinstrument-cext, pyinstrument, smdebug
Successfully installed pyinstrument-3.4.1 pyinstrument-cext-0.2.4 smdebug-1.0.8


In [13]:
from sagemaker.debugger import Rule, ProfilerRule, rule_configs, CollectionConfig

In [14]:

def _collection_every_5_steps(collection_name):
    """Return a CollectionConfig for `collection_name` with save_interval set to "5"."""
    return CollectionConfig(
        name=collection_name,
        parameters={"save_interval": "5"},
    )


# Profiler and debugger rules attached to the training job.
rules = [
    # Built-in profiler rules.
    ProfilerRule.sagemaker(rule_configs.ProfilerReport()),
    ProfilerRule.sagemaker(rule_configs.CPUBottleneck()),
    # exploding_tensor rule restricted to tensors matching ".*gradient".
    Rule.sagemaker(
        base_config=rule_configs.exploding_tensor(),
        rule_parameters={"tensor_regex": ".*gradient", "only_nan": "False"},
        collections_to_save=[_collection_every_5_steps("gradients")],
    ),
    # loss_not_decreasing rule evaluated on the "losses" collection.
    # NOTE(review): num_steps=1 / diff_percent=99 look deliberately aggressive -- confirm.
    Rule.sagemaker(
        base_config=rule_configs.loss_not_decreasing(),
        rule_parameters={
            "tensor_regex": ".*",
            "use_losses_collection": "True",
            "num_steps": "1",
            "diff_percent": "99",
            "increase_threshold_percent": "5",
            "mode": "GLOBAL",
        },
        collections_to_save=[_collection_every_5_steps("losses")],
    ),
    # vanishing_gradient rule with an explicit threshold.
    # NOTE(review): a threshold of 10 looks deliberately loose -- confirm.
    Rule.sagemaker(
        base_config=rule_configs.vanishing_gradient(),
        rule_parameters={"threshold": "10"},
        collections_to_save=[_collection_every_5_steps("gradients")],
    ),
]

In [15]:
from sagemaker.debugger import (ProfilerConfig, 
                                FrameworkProfile, 
                                DetailedProfilingConfig, 
                                DataloaderProfilingConfig, 
                                PythonProfilingConfig)

# profiler_config = ProfilerConfig(
#     system_monitor_interval_millis=500,
#     framework_profile_params=FrameworkProfile(local_path="/opt/ml/output/profiler/", start_step=5, num_steps=2)  
# )

# Framework-level profiling: each profiler is configured for a single step
# (num_steps=1), starting at steps 5, 7 and 9 respectively.
framework_profile = FrameworkProfile(
    detailed_profiling_config=DetailedProfilingConfig(start_step=5, num_steps=1),
    dataloader_profiling_config=DataloaderProfilingConfig(start_step=7, num_steps=1),
    # python_profiler / cprofile_timer are intentionally left unset here
    # (previously tried values: "cProfile" / "total_time").
    python_profiling_config=PythonProfilingConfig(start_step=9, num_steps=1),
)

# Profiler configuration: system monitoring interval of 500 ms plus the
# framework profile defined above.
profiler_config = ProfilerConfig(
    system_monitor_interval_millis=500,
    framework_profile_params=framework_profile,
)

In [16]:
import sagemaker
from sagemaker.tensorflow import TensorFlow

# Settings for the debugger/profiler run: 10 epochs with a learning rate of 0.1.
model_dir = '/opt/ml/model'
train_instance_type = 'ml.m5.xlarge'
hyperparameters = {
    'epochs': 10,
    'batch_size': 12,
    'learning_rate': 0.1,
}

# Same training script as before, now with debugger rules and profiling attached.
# Note that this run uses framework_version 2.3.1 and passes no metric_definitions.
tf_estimator = TensorFlow(
    # Training script and framework runtime
    entry_point='./tf_model/WideNDeep-sm.py',
    framework_version='2.3.1',
    py_version='py37',
    script_mode=True,
    # Job identity and permissions
    base_job_name='tf-scriptmode-rec',
    role=sagemaker.get_execution_role(),
    # Compute resources
    instance_type=train_instance_type,
    instance_count=1,
    # Training configuration
    model_dir=model_dir,
    hyperparameters=hyperparameters,
    enable_sagemaker_metrics=True,
    # Debugger rules and profiler settings
    rules=rules,
    profiler_config=profiler_config,
)

In [10]:
# Start the debugger/profiler-enabled training job; rule statuses and the job log are streamed below.
tf_estimator.fit(inputs=data_channels)

2021-04-24 14:54:00 Starting - Starting the training job...
2021-04-24 14:54:02 Starting - Launching requested ML instancesExplodingTensor: InProgress
LossNotDecreasing: InProgress
VanishingGradient: InProgress
ProfilerReport: InProgress
CPUBottleneck: InProgress
......
2021-04-24 14:55:29 Starting - Preparing the instances for training......
2021-04-24 14:56:29 Downloading - Downloading input data
2021-04-24 14:56:29 Training - Downloading the training image...
2021-04-24 14:57:01 Training - Training image download completed. Training in progress.[34m2021-04-24 14:56:43.566016: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.[0m
[34m2021-04-24 14:56:43.573313: I tensorflow/core/profiler/internal/smprofiler_config_reader.cc:123] PID of the process that is writing to the timeline : 1[0m
[34m2021-04-24 14:56:43.574647: I tensorflow/core/profiler/internal/smprofiler_timeline.cc:121] SageMaker Profiler Timeline Writer read the follow

Job ended with status 'Stopped' rather than 'Completed'. This could mean the job timed out or stopped early for some other reason: Consider checking whether it completed as you expect.


Training seconds: 721
Billable seconds: 721
