In [1]:
%%time
! python3 -m pip install --upgrade sagemaker
import sagemaker
from sagemaker import get_execution_role
from sagemaker.estimator import Estimator
import boto3

sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

role = (
    get_execution_role()
)  # provide a pre-existing role ARN as an alternative to creating a new role
print(f"SageMaker Execution Role:{role}")

client = boto3.client("sts")
account = client.get_caller_identity()["Account"]
print(f"AWS account:{account}")

session = boto3.session.Session()
region = session.region_name
print(f"AWS region:{region}")

Collecting sagemaker
  Downloading sagemaker-2.75.1.tar.gz (511 kB)
     |████████████████████████████████| 511 kB 34.9 MB/s            
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: sagemaker
  Building wheel for sagemaker (setup.py) ... [?25ldone
[?25h  Created wheel for sagemaker: filename=sagemaker-2.75.1-py2.py3-none-any.whl size=709437 sha256=9944f257dbf4f3d6b9fa68214fe7dff7e1fa68c63af8739fe0d5d23adea76dcf
  Stored in directory: /root/.cache/pip/wheels/04/4e/04/ad25d8866c9738524d659f5e88015dba26ebd28ced7bb8c799
Successfully built sagemaker
Installing collected packages: sagemaker
  Attempting uninstall: sagemaker
    Found existing installation: sagemaker 2.72.0
    Uninstalling sagemaker-2.72.0:
      Successfully uninstalled sagemaker-2.72.0
Successfully installed sagemaker-2.75.1
SageMaker Execution Role:arn:aws:iam::900051432098:role/service-role/AmazonSageMaker-ExecutionRole-20210311T111844
AWS account:900051432098
AWS region:

## Build and Push Docker image to ECR

In [9]:
# Name and tag of the Docker image built in this section.
image, tag = "bert-smdataparallel-sagemaker", "pt1.8"

In [8]:
!wget "https://raw.githubusercontent.com/xjlwi/amazon-sagemaker-examples/main/training/distributed_training/pytorch/data_parallel/bert/train.sh"

--2022-02-12 04:45:32--  https://raw.githubusercontent.com/xjlwi/amazon-sagemaker-examples/main/training/distributed_training/pytorch/data_parallel/bert/train.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 621 [text/plain]
Saving to: ‘train.sh’


2022-02-12 04:45:32 (37.0 MB/s) - ‘train.sh’ saved [621/621]



In [19]:
# Pin jedi to 0.17.2 (the recorded output shows this downgrades jedi 0.18.0 and
# parso 0.8.0).
# NOTE(review): the reason for the pin is not stated in this notebook --
# presumably an IPython tab-completion workaround; confirm before removing.
!python3 -m pip install jedi==0.17.2
# Print the Dockerfile with Pygments syntax highlighting.
!pygmentize ./Dockerfile

Collecting jedi==0.17.2
  Downloading jedi-0.17.2-py2.py3-none-any.whl (1.4 MB)
     |████████████████████████████████| 1.4 MB 39.8 MB/s            
[?25hCollecting parso<0.8.0,>=0.7.0
  Downloading parso-0.7.1-py2.py3-none-any.whl (109 kB)
     |████████████████████████████████| 109 kB 122.8 MB/s            
[?25hInstalling collected packages: parso, jedi
  Attempting uninstall: parso
    Found existing installation: parso 0.8.0
    Uninstalling parso-0.8.0:
      Successfully uninstalled parso-0.8.0
  Attempting uninstall: jedi
    Found existing installation: jedi 0.18.0
    Uninstalling jedi-0.18.0:
      Successfully uninstalled jedi-0.18.0
Successfully installed jedi-0.17.2 parso-0.7.1
[34mARG[39;49;00m[37m [39;49;00mregion

[34mFROM[39;49;00m[37m [39;49;00m[33m763104351884.dkr.ecr.${region}.amazonaws.com/pytorch-training:1.8.1-gpu-py36-cu111-ubuntu18.04[39;49;00m

[34mARG[39;49;00m[37m [39;49;00m[31mWORK_DIR[39;49;00m=[33m"apex_build"[39;49;00m
[34mRUN[39;4

In [20]:
# Print the build-and-push helper script with Pygments syntax highlighting.
!pygmentize ./build_and_push.sh

[37m#!/usr/bin/env bash[39;49;00m
[37m# This script shows how to build the Docker image and push it to ECR to be ready for use[39;49;00m
[37m# by SageMaker.[39;49;00m
[37m# The argument to this script is the image name. This will be used as the image on the local[39;49;00m
[37m# machine and combined with the account and region to form the repository name for ECR.[39;49;00m
[37m# set region[39;49;00m

[31mDIR[39;49;00m=[33m"[39;49;00m[34m$([39;49;00m [36mcd[39;49;00m [33m"[39;49;00m[34m$([39;49;00m dirname [33m"[39;49;00m[33m${[39;49;00m[31mBASH_SOURCE[39;49;00m[0][33m}[39;49;00m[33m"[39;49;00m [34m)[39;49;00m[33m"[39;49;00m && [36mpwd[39;49;00m [34m)[39;49;00m[33m"[39;49;00m

[34mif[39;49;00m [ [33m"[39;49;00m[31m$#[39;49;00m[33m"[39;49;00m -eq [34m3[39;49;00m ]; [34mthen[39;49;00m
    [31mregion[39;49;00m=[31m$1[39;49;00m
    [31mimage[39;49;00m=[31m$2[39;49;00m
    [31mtag[39;49;00m=[31m$3[39;49;00m
[34melse[39;49;

## TensorBoard

In [2]:
import os
import datetime
import tensorflow as tf

In [3]:
# Load the MNIST train/test splits and divide the image arrays by 255.0.
mnist = tf.keras.datasets.mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = x_train / 255.0
x_test = x_test / 255.0

def create_model():
    """Return a new, uncompiled Keras Sequential classifier.

    The stack is: Flatten over (28, 28) inputs, a 512-unit ReLU Dense layer,
    Dropout at rate 0.2, and a 10-unit softmax Dense output layer.
    """
    layer_stack = [
        tf.keras.layers.Flatten(input_shape=(28, 28)),
        tf.keras.layers.Dense(512, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(10, activation='softmax'),
    ]
    return tf.keras.models.Sequential(layer_stack)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [4]:
# Build the path of a timestamped TensorBoard log directory under the current
# working directory. This only computes the path; nothing is created on disk here.
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
LOG_DIR = os.path.join(os.getcwd(), "logs/fit/" + run_stamp)


In [5]:
# Train the model for five epochs, writing TensorBoard logs to LOG_DIR.

# histogram_freq=1 asks the callback to record histograms every epoch.
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=LOG_DIR,
    histogram_freq=1,
)

model = create_model()
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# The test split doubles as the validation set for the per-epoch metrics.
model.fit(
    x=x_train,
    y=y_train,
    epochs=5,
    validation_data=(x_test, y_test),
    callbacks=[tensorboard_callback],
)

[2022-02-12 05:17:42.746 tensorflow-2-3-gpu--ml-g4dn-xlarge-8248464766bb46d9e489eb7b7a82:77 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2022-02-12 05:17:42.771 tensorflow-2-3-gpu--ml-g4dn-xlarge-8248464766bb46d9e489eb7b7a82:77 INFO profiler_config_parser.py:102] Unable to find config at /opt/ml/input/config/profilerconfig.json. Profiler is disabled.
Epoch 1/5
Instructions for updating:
use `tf.profiler.experimental.stop` instead.


Instructions for updating:
use `tf.profiler.experimental.stop` instead.






Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f7eb44ab690>

In [6]:
# Take LOG_DIR without its first path component and without its final
# (timestamp) component, then print the result.
log_dir_parts = LOG_DIR.strip("/").split("/")
EFS_PATH_LOG_DIR = "/".join(log_dir_parts[1:-1])
print(EFS_PATH_LOG_DIR)

sagemaker-r/sm-r/notebooks/logs/fit
