In [45]:
import time
import boto3
import sagemaker
from sagemaker.debugger import TensorBoardOutputConfig
from sagemaker.pytorch import PyTorch

In [46]:
# Execution role for this session; passed to the estimator below as `role=`.
role = sagemaker.get_execution_role()

In [47]:
# Timestamp captured when this cell runs, formatted like 20201110T221759.
start_time = time.strftime('%Y%m%dT%H%M%S')

# Bucket name shared by every S3 path built in this notebook.
bucket = 'conversational-summarization'

# S3 prefix handed to the estimator as its `output_path`.
s3_output_location = 's3://' + bucket + '/output'

In [48]:
# Hyperparameters handed to the estimator through its `hyperparameters=` argument.
hparams = {
    'batch-size': 8,
    'learning-rate': 3e-05,
    'model-path': 't5-small',
    # Must be an f-string: without the `f` prefix the literal text
    # "{start_time}" is sent instead of the run timestamp.
    # NOTE(review): this key uses an underscore while the others use hyphens;
    # confirm it matches the argument name expected by the training script.
    'job_name': f't5-small-{start_time}',
    'epochs': 25,
    'gpus': 1
}

In [49]:
# Metric name / regex pairs passed to the estimator as `metric_definitions`.
# Every regex has a single lazy capture group that stops at the next ';'.
# NOTE(review): presumably these match `key: value;` fragments printed by the
# training script; confirm against train_t5_model.py.
metric_definitions = [
    dict(Name='train:loss', Regex='train_loss: (.*?);'),
    dict(Name='validation:loss', Regex='val_loss: (.*?);'),
    dict(Name='test:loss', Regex='test_loss: (.*?);'),
    dict(Name='current_epoch', Regex='current_epoch: (.*?);'),
    dict(Name='global_step', Regex='global_step: (.*?);'),
]

In [50]:
# TensorBoard output settings handed to the estimator below.
tensorboard_output_config = TensorBoardOutputConfig(
    # Path inside the training container.
    container_local_output_path='/opt/tb_logs',
    # S3 location in the notebook's bucket.
    s3_output_path=f's3://{bucket}/tb_logs',
)

In [None]:
# Describe the training job: what code to run, in which runtime, on what
# hardware, and where its outputs are written.
estimator = PyTorch(
    # Training code: `train_t5_model.py` from the `code` directory, plus its
    # hyperparameters.
    source_dir='code',
    entry_point='train_t5_model.py',
    hyperparameters=hparams,
    # Framework / Python runtime selection.
    framework_version='1.6.0',
    py_version='py3',
    # Permissions and compute.
    # NOTE(review): SageMaker Python SDK v2 renamed `train_instance_type` /
    # `train_instance_count` to `instance_type` / `instance_count`; confirm
    # which SDK version this notebook targets before changing them.
    role=role,
    train_instance_type='ml.p3.2xlarge',
    train_instance_count=1,
    # Outputs: S3 output location, TensorBoard config, and metric regexes.
    output_path=s3_output_location,
    tensorboard_output_config=tensorboard_output_config,
    metric_definitions=metric_definitions,
)

# Start the training job with three input channels: train, test and val.
# Channel locations are full `s3://` URIs, matching the other S3 paths in this
# notebook; a bare `bucket/key` string is not a valid S3 URI.
estimator.fit(inputs={
    'train': f's3://{bucket}/data/processed/t5_train_dataset.pt',
    'test': f's3://{bucket}/data/processed/t5_test_dataset.pt',
    'val': f's3://{bucket}/data/processed/t5_val_dataset.pt',
})

text-performance-management-client-20201110T221759
2020-11-10 22:17:59 Starting - Starting the training job...
2020-11-10 22:18:01 Starting - Launching requested ML instances.