## PT DDP Launcher Testing
This notebook tests the following combination:

* image: PyTorch (PT) training Deep Learning Container (DLC) with my changes
* distribution = pytorchddp, backend = nccl (a later cell repeats the run with a gloo entry point)

In [20]:
# Optional cleanup (disabled): uncomment to remove the currently installed sagemaker SDK.
#!pip uninstall -y sagemaker

In [3]:
%%time
# Install the development build of the SageMaker Python SDK that this notebook tests.
# The wheel has to be uploaded from the local machine to this notebook instance first.
# Disabled alternative: upgrade to the latest released SDK instead.
#! python3 -m pip install --upgrade sagemaker
# Disabled alternative: an earlier development wheel.
#%pip install ~/SageMaker/sm1.10/sagemaker-2.101.2.dev0-py2.py3-none-any.whl
# Active command: --force-reinstall reinstalls the package even if a version is already present.
%pip install --force-reinstall /home/ec2-user/SageMaker/herring-development/PT\ Launcher/sagemaker-2.101.6.dev0-py2.py3-none-any.whl

Processing ./sagemaker-2.101.6.dev0-py2.py3-none-any.whl
Collecting google-pasta
  Using cached google_pasta-0.2.0-py3-none-any.whl (57 kB)
Collecting pathos
  Using cached pathos-0.2.8-py2.py3-none-any.whl (81 kB)
Collecting boto3<2.0,>=1.20.21
  Using cached boto3-1.23.10-py3-none-any.whl (132 kB)
Collecting pandas
  Using cached pandas-1.1.5-cp36-cp36m-manylinux1_x86_64.whl (9.5 MB)
Collecting importlib-metadata<5.0,>=1.4.0
  Using cached importlib_metadata-4.8.3-py3-none-any.whl (17 kB)
Collecting protobuf3-to-dict<1.0,>=0.1.5
  Using cached protobuf3_to_dict-0.1.5-py3-none-any.whl
Collecting smdebug-rulesconfig==1.0.1
  Using cached smdebug_rulesconfig-1.0.1-py2.py3-none-any.whl (20 kB)
Collecting packaging>=20.0
  Using cached packaging-21.3-py3-none-any.whl (40 kB)
Collecting attrs<22,>=20.3.0
  Using cached attrs-21.4.0-py2.py3-none-any.whl (60 kB)
Collecting numpy<2.0,>=1.9.0
  Using cached numpy-1.19.5-cp36-cp36m-manylinux2010_x86_64.whl (14.8 MB)
Collecting protobuf<4.0,>=3.

In [4]:
# Print the installed sagemaker package metadata to check which SDK version is active.
%pip show sagemaker

Name: sagemaker
Version: 2.101.6.dev0
Summary: Open source library for training and deploying models on Amazon SageMaker.
Home-page: https://github.com/aws/sagemaker-python-sdk/
Author: Amazon Web Services
Author-email: 
License: Apache License 2.0
Location: /home/ec2-user/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages
Requires: attrs, boto3, google-pasta, importlib-metadata, numpy, packaging, pandas, pathos, protobuf, protobuf3-to-dict, smdebug-rulesconfig
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [5]:
import sagemaker

# Create the SageMaker session and look up the execution role; both are
# passed to the estimators built further down.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()

# Echo the role, default bucket and region this session resolves to.
# TODO: add instructions for a local environment later, if needed.
print(
    f"sagemaker role arn: {role}",
    f"sagemaker bucket: {sess.default_bucket()}",
    f"sagemaker session region: {sess.boto_region_name}",
    sep="\n",
)

sagemaker role arn: arn:aws:iam::570106654206:role/Dev
sagemaker bucket: sagemaker-us-west-2-570106654206
sagemaker session region: us-west-2


In [9]:
# Region, repository name and tag of the custom training image.
region = "us-west-2"
image = "pt-ddp-custom"  # Example: pt-smdataparallel-efficientnet-sagemaker
tag = "1.12.0-gpu-py38-cu113-ubuntu20.04-sagemaker"  # Example: latest


In [10]:
# Uncomment and run only when docker push fails with OOM errors
#! docker system prune -af

In [8]:
# Log the docker client in to the private ECR registry: the password printed by
# `aws ecr get-login-password` is piped to `docker login` through stdin.
# The braces around `region` are filled in from the Python variable of that name.
! aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin 570106654206.dkr.ecr.{region}.amazonaws.com

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded


In [11]:
from sagemaker.pytorch import PyTorch

# Custom PyTorch training image (DLC with extra logging added) stored in this account's ECR.
# The URI is assembled from the `region`, `image` and `tag` values set in the
# configuration cell so it cannot drift from them.
# For the stock image URIs per region, refer to
# https://github.com/aws/deep-learning-containers/blob/master/available_images.md#huggingface-training-containers
# Previously used URIs, kept for reference:
#image_uri = '763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:1.9.1-transformers4.12.3-gpu-py38-cu111-ubuntu20.04'
#image_uri = '570106654206.dkr.ecr.us-west-2.amazonaws.com/ptddp-launcher:latest'
image_uri = f"570106654206.dkr.ecr.{region}.amazonaws.com/{image}:{tag}"

# Select the PyTorch DDP launcher under test. Choosing a launcher is a one-line
# change: the disabled alternatives below switch to MPI or to SageMaker
# Distributed Data Parallel.
distribution = {'pytorchddp':{ 'enabled': True }}
#distribution = {"mpi":{"enabled":True, "num_of_processes_per_host":8}}
#distribution = { "smdistributed": { "dataparallel": { "enabled": True } } }


estimator = PyTorch(
    # NOTE(review): the job name says "pt1-10" while the image tag is 1.12.0 - confirm the intended name.
    base_job_name="ptddp-mnist-pt1-10",
    source_dir="code",
    entry_point="train_ptddp_mnist.py",
    role=role,
    py_version="py38",
    image_uri=image_uri,
    # Number of instances; a value above 1 makes this a multi-node run.
    instance_count=2,
    # This run uses a g5 instance; for p3dn use ml.p3dn.24xlarge, for p4d use ml.p4d.24xlarge.
    instance_type="ml.g5.16xlarge",
    sagemaker_session=sess,
    # Launcher configuration chosen above (pytorchddp).
    distribution=distribution,
    debugger_hook_config=False,
)

In [12]:
# Start the training job for the estimator configured above.
# NOTE(review): the recorded output of this cell is
# "RuntimeError: calling _pytorch_distribution_configuration", so this run failed;
# presumably the raise comes from the development SDK wheel - confirm before re-running.
estimator.fit()

RuntimeError: calling _pytorch_distribution_configuration

In [13]:
## Test for backend = gloo
# This cell rebinds `image_uri` and `estimator`; `distribution`, `role` and `sess`
# are reused from the cells above.
image_uri = "570106654206.dkr.ecr.us-west-2.amazonaws.com/pt-ddp-custom:pt1.12"

estimator = PyTorch(
    # Job identity and training code
    base_job_name="ptddp-mnist-gloo",
    entry_point="train_ptddp_mnist_gloo.py",
    source_dir="code",
    # Container, Python version and permissions
    image_uri=image_uri,
    py_version="py38",
    role=role,
    sagemaker_session=sess,
    # Cluster: four p4d instances (for p3dn use ml.p3dn.24xlarge instead)
    instance_type="ml.p4d.24xlarge",
    instance_count=4,
    # Same launcher configuration as the first estimator
    distribution=distribution,
    debugger_hook_config=False,
)

# Submit the job with wait=False.
estimator.fit(wait=False)