## PT DDP Launcher Testing
This notebook tests the following combination:

* image: PyTorch (PT) training Deep Learning Container (DLC) with my changes
* distribution = pytorchddp, backend = nccl

In [1]:
# Optional cleanup (disabled): uncomment to remove the currently installed
# SageMaker SDK before installing the version this notebook needs.
#!pip uninstall -y sagemaker

Found existing installation: sagemaker 2.100.0
Uninstalling sagemaker-2.100.0:
  Successfully uninstalled sagemaker-2.100.0


In [3]:
#Need Sagemaker v2.103.0 
%pip install -U sagemaker

Note: you may need to restart the kernel to use updated packages.


In [24]:
# Confirm which SageMaker SDK version is installed in the kernel's environment.
%pip show sagemaker

Name: sagemaker
Version: 2.103.0
Summary: Open source library for training and deploying models on Amazon SageMaker.
Home-page: https://github.com/aws/sagemaker-python-sdk/
Author: Amazon Web Services
Author-email: 
License: Apache License 2.0
Location: /home/ec2-user/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages
Requires: attrs, boto3, google-pasta, importlib-metadata, numpy, packaging, pandas, pathos, protobuf, protobuf3-to-dict, smdebug-rulesconfig
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [29]:
import sagemaker
from sagemaker.local import LocalSession  # only used by the disabled local-mode lines below

# Session and IAM execution role that the estimators in later cells are given.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()

# Local-mode alternative (disabled): uncomment both lines to replace `sess`
# with a LocalSession configured for local mode.
#sess = LocalSession()
#sess.config = {"local": {"local_mode": True }}
# Echo the resolved role, default bucket, and region as a sanity check.
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")
# TODO: add instructions for a local environment later, if needed.

sagemaker role arn: arn:aws:iam::570106654206:role/Dev
sagemaker bucket: sagemaker-us-west-2-570106654206
sagemaker session region: us-west-2


In [30]:
# Coordinates of the custom training image: AWS region, ECR repository name, and image tag.
region = "us-west-2"
image = "pt-ddp-custom"  # repository name, e.g. pt-smdataparallel-efficientnet-sagemaker
tag = "1.12.0-gpu-py38-cu113-ubuntu20.04-sagemaker-2.6.0-numproc"  # image tag, e.g. latest


In [7]:
# Uncomment and run only when docker push fails with OOM errors
#! docker system prune -af

In [4]:
# Log the local Docker client in to the ECR registry for `region`;
# the password is produced by the AWS CLI and piped to `docker login` via stdin.
! aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin 570106654206.dkr.ecr.{region}.amazonaws.com

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded


In [31]:
from sagemaker.pytorch import PyTorch

# URI of the custom PyTorch training image in ECR (using the image from 08/11).
# It is assembled from the `region`, `image`, and `tag` values set in the
# configuration cell so the URI and those settings cannot drift apart.
# For stock Deep Learning Container URIs per region, see
# https://github.com/aws/deep-learning-containers/blob/master/available_images.md
image_uri = f"570106654206.dkr.ecr.{region}.amazonaws.com/{image}:{tag}"

# Variant that enables MPI (one process per host) alongside `pytorchddp`.
# Not referenced by any estimator in the cells shown here.
distribution_pt_mpi = {'pytorchddp':{ 'enabled': True },
               'mpi':{'enabled':True, 'num_of_processes_per_host':1}}

# Configuration passed as the `distribution` argument of every estimator below:
# it enables the `pytorchddp` launcher (not SageMaker Distributed Data Parallel).
distribution = {'pytorchddp':{ 'enabled': True }}

#### Test an instance type that SMDDP does not support (ml.g5.16xlarge)


In [18]:
# Two-node pytorchddp MNIST job on ml.g5.16xlarge instances.
estimator_g5 = PyTorch(
    # Job identity and training code
    base_job_name="ptddp-mnist-test-g5",
    entry_point="train_ptddp_mnist.py",
    source_dir="../code",
    # Container, Python version, and credentials
    image_uri=image_uri,
    py_version="py38",
    role=role,
    sagemaker_session=sess,
    # Cluster shape: instance_count > 1 makes this a multi-node run
    instance_type="ml.g5.16xlarge",
    instance_count=2,
    # Launch through the `distribution` config defined earlier (pytorchddp enabled)
    distribution=distribution,
    debugger_hook_config=False,
)

In [19]:
# Start the training job without blocking the notebook (wait=False),
# then display the job name so it can be looked up later.
estimator_g5.fit(wait=False)
estimator_g5.latest_training_job.name

'ptddp-mnist-test-g5-2022-08-12-21-01-47-371'

#### Test CPU runs with backend = gloo

In [34]:
# Two-node pytorchddp MNIST job using the gloo variant of the training script.
# NOTE(review): the section heading says "CPU runs", but ml.p4d.24xlarge is a GPU
# instance type; confirm whether a CPU instance type was intended here.
estimator_cpu = PyTorch(
    # Job identity and training code
    base_job_name="ptddp-mnist-test-gloo",
    entry_point="train_ptddp_mnist_gloo.py",
    source_dir="../code",
    # Container, Python version, and credentials
    image_uri=image_uri,
    py_version="py38",
    role=role,
    sagemaker_session=sess,
    # Cluster shape: instance_count > 1 makes this a multi-node run
    instance_type="ml.p4d.24xlarge",
    instance_count=2,
    # Launch through the `distribution` config defined earlier (pytorchddp enabled)
    distribution=distribution,
    debugger_hook_config=False,
)

In [35]:
# Start the training job without blocking the notebook (wait=False),
# then display the job name so it can be looked up later.
estimator_cpu.fit(wait=False)
estimator_cpu.latest_training_job.name

'ptddp-mnist-test-gloo-2022-08-13-18-54-03-474'

#### Test bigger clusters

In [36]:
# Eight-node pytorchddp MNIST job on ml.p4d.24xlarge instances.
estimator_8node = PyTorch(
    # Job identity and training code
    base_job_name="ptddp-mnist-test-8node",
    entry_point="train_ptddp_mnist.py",
    source_dir="../code",
    # Container, Python version, and credentials
    image_uri=image_uri,
    py_version="py38",
    role=role,
    sagemaker_session=sess,
    # Cluster shape: 8 nodes
    instance_type="ml.p4d.24xlarge",
    instance_count=8,
    # Launch through the `distribution` config defined earlier (pytorchddp enabled)
    distribution=distribution,
    debugger_hook_config=False,
)

In [37]:
# Start the training job without blocking the notebook (wait=False),
# then display the job name so it can be looked up later.
estimator_8node.fit(wait=False)
estimator_8node.latest_training_job.name

'ptddp-mnist-test-8node-2022-08-13-19-24-33-584'

In [38]:
# Sixteen-node pytorchddp MNIST job on ml.p4d.24xlarge instances.
estimator_16node = PyTorch(
    # Job identity and training code
    base_job_name="ptddp-mnist-test-16node",
    entry_point="train_ptddp_mnist.py",
    source_dir="../code",
    # Container, Python version, and credentials
    image_uri=image_uri,
    py_version="py38",
    role=role,
    sagemaker_session=sess,
    # Cluster shape: 16 nodes
    instance_type="ml.p4d.24xlarge",
    instance_count=16,
    # Launch through the `distribution` config defined earlier (pytorchddp enabled)
    distribution=distribution,
    debugger_hook_config=False,
)

In [39]:
# Start the training job without blocking the notebook (wait=False),
# then display the job name so it can be looked up later.
estimator_16node.fit(wait=False)
estimator_16node.latest_training_job.name

'ptddp-mnist-test-16node-2022-08-13-19-41-02-914'