## PT DDP Launcher Testing
This notebook tests the following combination:

* image: PyTorch (PT) training DLC (Deep Learning Container) with my changes
* distribution = pytorchddp, backend = nccl
* model = ResNet-50, dataset = CIFAR-10


In [1]:
#Need Sagemaker v2.103.0 
%pip install -U sagemaker

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Show the metadata (including the version) of the sagemaker package that is installed.
%pip show sagemaker

Name: sagemaker
Version: 2.103.0
Summary: Open source library for training and deploying models on Amazon SageMaker.
Home-page: https://github.com/aws/sagemaker-python-sdk/
Author: Amazon Web Services
Author-email: 
License: Apache License 2.0
Location: /home/ec2-user/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages
Requires: attrs, boto3, google-pasta, importlib-metadata, numpy, packaging, pandas, pathos, protobuf, protobuf3-to-dict, smdebug-rulesconfig
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [3]:
# Create the SageMaker session and look up the execution role.
# Both `sess` and `role` are passed to the estimator defined further down.
import sagemaker
sess = sagemaker.Session()
role = sagemaker.get_execution_role()


# Echo the resolved role, default bucket and region as a quick sanity check.
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")
# TODO: add instructions for running in a local environment later, if needed.

sagemaker role arn: arn:aws:iam::570106654206:role/Dev
sagemaker bucket: sagemaker-us-west-2-570106654206
sagemaker session region: us-west-2


In [4]:
# Coordinates of the custom training image in ECR.
region = "us-west-2"
# Repository containing the viskaria dev changes from the SageMaker Python SDK
# and the SageMaker PyTorch training toolkit.
image = "pt-ddp-custom"
# Image tag to use (example of another valid value: latest).
tag = "1.12.0-gpu-py38-cu113-ubuntu20.04-sagemaker-2.6.0-numproc"


In [4]:
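# Uncomment and run only when docker push fails with OOM errors.
# WARNING: `-af` removes ALL unused images, stopped containers, unused networks and build cache without prompting.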
#! docker system prune -af

In [7]:
# Fetch an ECR login password for `region` and pipe it to `docker login`
# for the private registry of account 570106654206 in that region.
! aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin 570106654206.dkr.ecr.{region}.amazonaws.com

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded


In [9]:
from sagemaker.pytorch import PyTorch

# URI of the custom PT training DLC (with extra logging) in this account's ECR registry.
# It is assembled from `region`, `image` and `tag` (set in the configuration cell) so the
# image coordinates live in exactly one place; changing the tag there now takes effect here.
# Stock DLC URIs per region are listed at
# https://github.com/aws/deep-learning-containers/blob/master/available_images.md
image_uri = f"570106654206.dkr.ecr.{region}.amazonaws.com/{image}:{tag}"
print(image_uri)

# Configuration for running distributed training on SageMaker.
# This is the only code change required to leverage PyTorch Distributed Data Parallel.
distribution = {"pytorchddp": {"enabled": True}}


570106654206.dkr.ecr.us-west-2.amazonaws.com/pt-ddp-custom:1.12.0-gpu-py38-cu113-ubuntu20.04-sagemaker-2.6.0-numproc


In [11]:
# 4-node training job definition.
estimator4 = PyTorch(
    # Job identity, permissions and session.
    base_job_name="ptddp-resnet-1-12-p4d-4node",
    role=role,
    sagemaker_session=sess,
    # Training script and the directory it is uploaded from.
    entry_point="resnet50_cifar10.py",
    source_dir="../code",
    # Container image plus the framework / Python versions it corresponds to.
    image_uri=image_uri,
    framework_version="1.12.0",
    py_version="py38",
    # Cluster shape: instance type (e.g. ml.p3dn.24xlarge or ml.p4d.24xlarge)
    # and node count (set > 1 for multi-node distributed training).
    instance_type="ml.p4d.24xlarge",
    instance_count=4,
    # Distributed-training settings from the `distribution` dict defined earlier.
    distribution=distribution,
    debugger_hook_config=False,
)

In [12]:
# Submit the 4-node training job. wait=False is passed so the cell is not expected
# to block until the job finishes.
estimator4.fit(wait=False)
# View logs in the SageMaker console / CloudWatch Logs for this training job (TODO: confirm exact location).

In [12]:
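# Train on 8 instances (currently disabled; uncomment to run).
# NOTE: source_dir and framework_version below differ from the 4-node estimator; align them before re-enabling.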
# estimator8 = PyTorch(
#     base_job_name="ptddp-resnet50-cifar10",
#     source_dir="code",
#     entry_point="resnet50_cifar10.py",
#     role=role,
#     framework_version="1.11.0",
#     py_version="py38",
#     image_uri=image_uri,
#     # For training with multinode distributed training, set this count. Example: 2
#     instance_count=8,
#     # For training with p3dn instance use - ml.p3dn.24xlarge, with p4dn instance use - ml.p4d.24xlarge
#     instance_type="ml.p4d.24xlarge",
#     sagemaker_session=sess,
#     # Distributed training settings (the `distribution` dict, i.e. pytorchddp)
#     distribution=distribution,
#     debugger_hook_config=False,
# )

In [13]:
# estimator8.fit(wait=False)