In [1]:
import boto3

import sagemaker
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

# One SageMaker session is reused for the S3 upload and the training job in this notebook.
sagemaker_session = sagemaker.Session()

# Destination for the prepared dataset: the session's default bucket plus a fixed key prefix.
bucket = sagemaker_session.default_bucket()
prefix = 'sagemaker/DEMO-pytorch-mnist'

# Execution role that is later handed to the estimator.
role = sagemaker.get_execution_role()

# Region of the default boto3 session; it is interpolated into the ECR image URI further down.
session = boto3.session.Session()
region = session.region_name
print("AWS region:{}".format(region))

AWS region:us-east-1


!aws s3 cp s3://fast-ai-imageclas/mnist_png.tgz . --no-sign-request
!tar -xvzf  mnist_png.tgz

In [19]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import torch
import os

# Local staging folder for the serialized splits; this is the folder later uploaded to S3.
data_dir = 'data'
os.makedirs(data_dir, exist_ok=True)

# Both splits go through the same preprocessing: grayscale -> tensor -> normalization.
mnist_transform = transforms.Compose([
    transforms.Grayscale(),
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])

training_data = datasets.ImageFolder(root='mnist_png/training', transform=mnist_transform)
test_data = datasets.ImageFolder(root='mnist_png/testing', transform=mnist_transform)

# With batch_size equal to the dataset length, the first batch holds every sample,
# so each split is written to disk as a single batch object.
training_data_loader = DataLoader(training_data, batch_size=len(training_data))
training_data_loaded = next(iter(training_data_loader))
training_path = os.path.join(data_dir, 'training.pt')
torch.save(training_data_loaded, training_path)

test_data_loader = DataLoader(test_data, batch_size=len(test_data))
test_data_loaded = next(iter(test_data_loader))
test_path = os.path.join(data_dir, 'test.pt')
torch.save(test_data_loaded, test_path)

In [20]:
# Upload the local 'data' folder to s3://<bucket>/<prefix>; the returned S3 URI feeds the training channel.
inputs = sagemaker_session.upload_data(path='data', bucket=bucket, key_prefix=prefix)
print(f'input spec (in this case, just an S3 path): {inputs}')

input spec (in this case, just an S3 path): s3://sagemaker-us-east-1-815969174475/sagemaker/DEMO-pytorch-mnist


In [5]:
# Repository name and tag of the training image; both are passed to build_and_push.sh
# and combined into the ECR image URI used by the estimator.
image = 'yolov5-sagemaker-109-cu100-3'  # e.g. mask-rcnn-smdataparallel-sagemaker
tag = 'pt1.8'  # e.g. pt1.8

In [12]:
!docker system prune -a -f --volumes

Deleted Containers:
c8bbbc5ab8959f40f4e74a448d31ff8e8096ffa29a6d5aede2e7d25f9af908f5

Deleted Networks:
sagemaker-local

Total reclaimed space: 0B


In [9]:
!docker volume rm "docker volume ls -q -f dangling=true"

"docker volume rm" requires at least 1 argument.
See 'docker volume rm --help'.

Usage:  docker volume rm [OPTIONS] VOLUME [VOLUME...]

Remove one or more volumes


In [13]:
!pygmentize ./Dockerfile

[34mARG[39;49;00m region

[37m# FROM 763104351884.dkr.ecr.${region}.amazonaws.com/pytorch-training:1.8.1-gpu-py36-cu111-ubuntu18.04[39;49;00m

[34mFROM[39;49;00m [33m763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:1.7.1-gpu-py36-cu110-ubuntu18.04[39;49;00m

[34mRUN[39;49;00m pip freeze

[34mRUN[39;49;00m [36mcd[39;49;00m /root && [33m\[39;49;00m
	git clone --recursive https://github.com/tkazusa/yolov5-sagemaker-distributed-data-parallel.git && [33m\[39;49;00m
    chmod -R [34m775[39;49;00m yolov5-sagemaker-distributed-data-parallel && [33m\[39;49;00m
	[36mcd[39;49;00m yolov5-sagemaker-distributed-data-parallel && [33m\[39;49;00m
	pip install -r requirements.txt &&[33m\[39;49;00m
    pip install -U smdebug


In [14]:
%%time
! chmod +x build_and_push.sh; bash build_and_push.sh {region} {image} {tag}

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded
Sending build context to Docker daemon    223MB
Step 1/4 : ARG region
Step 2/4 : FROM 763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:1.7.1-gpu-py36-cu110-ubuntu18.04
1.7.1-gpu-py36-cu110-ubuntu18.04: Pulling from pytorch-training

[1B57c49d0f: Pulling fs layer 
[1B40447d26: Pulling fs layer 
[1B2f862619: Pulling fs layer 
[1B278deddf: Pulling fs layer 
[1B80049843: Pulling fs layer 
[1B556b2329: Pulling fs layer 
[1Ba0c97a55: Pulling fs layer 
[1B78bd0b24: Pulling fs layer 
[1B6c31766d: Pulling fs layer 
[1B71769566: Pulling fs layer 
[1Ba466ffcf: Pulling fs layer 
[1B370b88ac: Pulling fs layer 
[1B1dc34b31: Pulling fs layer 
[1B6e246df6: Pulling fs layer 
[1Bbb5c17af: Pulling fs layer 
[1B45ef5f57: Pulling fs layer 
[1Be708890b: Pulling fs layer 
[1Bfd929695: Pulling fs layer 
[1B928f1b1c: Pulling fs layer 
[1B70564d8a: Pulling fs layer 
[1Bdbb83bc0: Pulli

In [21]:
import os
from sagemaker.pytorch import PyTorch
from sagemaker.local import LocalSession

import time

# Look up the caller's AWS account ID; it is part of the ECR image URI built below.
client = boto3.client("sts")
account = client.get_caller_identity()["Account"]

instance_type = "ml.p3.16xlarge"  # Another supported instance type: ml.p4d.24xlarge
instance_count = 2  # You can use 2, 4, 8 etc.
docker_image = f"{account}.dkr.ecr.{region}.amazonaws.com/{image}:{tag}"  # YOUR_ECR_IMAGE_BUILT_WITH_ABOVE_DOCKER_FILE

# SageMaker training job names must be unique within an account and region, so a fixed
# name makes every re-run of this notebook fail. Append a UTC timestamp to the base name.
job_name = "pytorch-sm-yolo10-" + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

# Local-mode session for SageMaker operations (only relevant when training with instance_type="local").
local_session = LocalSession()

In [22]:
# PyTorch estimator that runs train.py from the current directory inside the custom image.
# To experiment in SageMaker local mode instead, pass instance_type="local" together with
# sagemaker_session=local_session (py_version is intentionally left unset here).
estimator = PyTorch(
    image_uri=docker_image,
    framework_version="1.8.1",
    entry_point="train.py",
    source_dir=".",
    role=role,
    instance_type=instance_type,
    instance_count=instance_count,
    sagemaker_session=sagemaker_session,
)

In [23]:
# Map the "training" channel to the S3 URI in `inputs`
# (the value returned by sagemaker_session.upload_data in an earlier cell).
data_channels = dict(training=inputs)

In [24]:
# Start the training job under the configured name; wait=False keeps the notebook from blocking on it.
estimator.fit(data_channels, wait=False, job_name=job_name)

In [22]:
def check_dataset(dict):
    """Check that the 'val' paths of a dataset config exist, trying to fetch them if not.

    Args:
        dict: Dataset configuration mapping. Two keys are read: 'val' (a path or a
            list of paths) and 'download' (optional; a URL ending in '.zip', a
            command starting with 'bash ', or otherwise Python source to execute).
            The parameter name shadows the builtin; it is kept unchanged so that
            existing callers keep working.

    Returns:
        None.

    Raises:
        Exception: If at least one 'val' path does not exist and no 'download'
            entry is configured.
    """
    # Imported locally so the function does not depend on a later notebook cell
    # having already imported Path (or os) into the global namespace.
    import os
    from pathlib import Path

    val, s = dict.get('val'), dict.get('download')
    if not (val and len(val)):
        return  # no validation path configured, nothing to check

    print([val])  # debug output: raw config value
    val = [Path(x).resolve() for x in (val if isinstance(val, list) else [val])]  # val path
    print(val)  # debug output: resolved absolute paths

    missing = [str(x) for x in val if not x.exists()]
    if not missing:
        return

    print('\nWARNING: Dataset not found, nonexistent paths: %s' % missing)
    if not (s and len(s)):
        raise Exception('Dataset not found.')

    # A download entry is configured: decide how to run it.
    if s.startswith('http') and s.endswith('.zip'):  # URL
        f = Path(s).name  # filename
        print(f'Downloading {s} {f}...')
        # `torch` is expected to be imported at notebook level.
        torch.hub.download_url_to_file(s, f)
        r = os.system(f'unzip -q {f} -d ../ && rm {f}')  # unzip
    elif s.startswith('bash '):  # bash script
        print(f'Running {s} ...')
        r = os.system(s)
    else:  # python script
        # SECURITY: this executes arbitrary code taken from the dataset config.
        # Only use configs from trusted sources.
        r = exec(s)  # return None
    print('Dataset autodownload %s\n' % ('success' if r in (0, None) else 'failure'))  # print result

In [211]:
from pathlib import Path
import yaml

# Parse the dataset description file into a plain dict (safe_load builds only basic Python types).
with open('./data/coco128.yaml') as yaml_file:
    data_dict = yaml.safe_load(yaml_file)

In [212]:
# Check that the 'val' paths listed in the loaded dataset config exist locally.
check_dataset(data_dict)

['../coco128/images/train2017/']
[PosixPath('/home/ec2-user/SageMaker/coco128/images/train2017')]


In [13]:
# Report Docker disk usage per category (images, containers, local volumes, build cache).
!docker system df

TYPE                TOTAL               ACTIVE              SIZE                RECLAIMABLE
Images              0                   0                   0B                  0B
Containers          1                   0                   0B                  0B
Local Volumes       0                   0                   0B                  0B
Build Cache         0                   0                   0B                  0B


In [25]:
# Show a human-readable size summary for each entry under /usr/local, followed by a grand total.
!sudo du -shc /usr/local/*

3.9G	/usr/local/bin
28M	/usr/local/chronicle
0	/usr/local/cuda
4.3G	/usr/local/cuda-10.0
4.5G	/usr/local/cuda-10.1
4.9G	/usr/local/cuda-10.2
4.0K	/usr/local/etc
4.0K	/usr/local/games
7.3M	/usr/local/include
172M	/usr/local/lib
227M	/usr/local/lib64
4.0K	/usr/local/libexec
4.0K	/usr/local/sbin
10M	/usr/local/share
4.0K	/usr/local/src
18G	total


In [28]:
# Clean up: delete everything in the working directory whose name starts with "mnist"
# (this matches the downloaded mnist_png.tgz archive and the extracted mnist_png folder).
!rm -rf ./mnist*