In [1]:
import sagemaker
import boto3
import sys
import os
import glob
import re
import subprocess
import numpy as np
from IPython.display import HTML
import time
from time import gmtime, strftime

sys.path.append("common")
from misc import get_execution_role, wait_for_s3_object
from docker_utils import build_and_push_docker_image
from sagemaker.rl import RLEstimator, RLToolkit, RLFramework

In [2]:
# Show the notebook's working directory; the relative paths used below
# ("common", "src") are resolved against it. `os` is already imported in the
# first cell, so it is not re-imported here.
os.getcwd()

'/home/ec2-user/SageMaker/xxx-rl-mario-ray'

In [3]:
# Open a SageMaker session and look up its default S3 bucket.
sage_session = sagemaker.session.Session()
s3_bucket = sage_session.default_bucket()
# Job output is written directly under the bucket root (passed as output_path to the training estimator).
s3_output_path = "s3://{}/".format(s3_bucket)
print("S3 bucket path: {}".format(s3_output_path))

S3 bucket path: s3://sagemaker-us-east-1-867521064370/


In [4]:
# Descriptive prefix for job names; passed to the estimators as base_job_name.
job_name_prefix = "rl-mario-ray"

# Framework tag; used in the ECR repository name and passed to the Docker build as FRAMEWORK.
framework = "torch"

In [5]:
# Where does training run: on this machine (local mode) or as a SageMaker TrainingJob?
local_mode = False

# "local" selects local mode; otherwise this is the SageMaker instance type to launch.
instance_type = "local" if local_mode else "ml.p3.8xlarge"

# Instance names containing "ml.p" or "ml.g" are treated as GPU hosts, all others as CPU.
cpu_or_gpu = "gpu" if any(family in instance_type for family in ("ml.p", "ml.g")) else "cpu"

In [6]:
# Number of instances for the training job (passed as instance_count to the training estimator).
train_instance_count = 1

In [7]:
# Resolve the IAM role used for the SageMaker jobs. Try the SDK lookup first and
# fall back to the `get_execution_role` helper imported from `misc` if it fails.
try:
    role = sagemaker.get_execution_role()
except Exception:
    # Narrowed from a bare `except:` so that KeyboardInterrupt / SystemExit are
    # no longer swallowed by the fallback.
    role = get_execution_role()

print("Using IAM role arn: {}".format(role))

Using IAM role arn: arn:aws:iam::867521064370:role/service-role/AmazonSageMaker-ExecutionRole-20220923T161812


In [8]:
# Local-mode setup: runs common/setup.sh through bash, and only when local_mode is True.
# Only run this from a SageMaker notebook instance.
if local_mode:
    !/bin/bash ./common/setup.sh

In [9]:
%%time
# Tag suffix of the base image: "py37" for the "tf" framework, "py36" for anything else.
suffix = "py37" if framework == "tf" else "py36"

# Values forwarded to `docker build` as --build-arg entries.
docker_build_args = dict(
    CPU_OR_GPU=cpu_or_gpu,
    AWS_REGION=boto3.Session().region_name,
    FRAMEWORK=framework,
    SUFFIX=suffix,
)

# The repository name encodes both the hardware flavour and the framework.
repository_short_name = "sagemaker-mario-ray-{}-{}".format(cpu_or_gpu, framework)

# Build the image from the local Dockerfile and push it; the helper returns the image name to train with.
custom_image_name = build_and_push_docker_image(
    repository_short_name,
    build_args=docker_build_args,
)
print("Using ECR image %s" % custom_image_name)

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded
Logged into ECR
Building docker image sagemaker-mario-ray-gpu-torch from Dockerfile
$ docker build -t sagemaker-mario-ray-gpu-torch -f Dockerfile . --build-arg CPU_OR_GPU=gpu --build-arg AWS_REGION=us-east-1 --build-arg FRAMEWORK=torch --build-arg SUFFIX=py36
Sending build context to Docker daemon  513.5kB
Step 1/30 : ARG AWS_REGION
Step 2/30 : ARG CPU_OR_GPU
Step 3/30 : ARG SUFFIX
Step 4/30 : ARG VERSION
Step 5/30 : ARG FRAMEWORK
Step 6/30 : FROM 462105765813.dkr.ecr.${AWS_REGION}.amazonaws.com/sagemaker-rl-ray-container:ray-1.6.0-${FRAMEWORK}-${CPU_OR_GPU}-${SUFFIX}
 ---> bad901ccbba6
Step 7/30 : RUN rm /etc/apt/sources.list.d/cuda.list
 ---> Using cache
 ---> c461b889cced
Step 8/30 : RUN rm /etc/apt/sources.list.d/nvidia-ml.list
 ---> Using cache
 ---> b10256ed50d8
Step 9/30 : RUN apt-get update && apt-get install -y --no-install-recommends         libosmesa6-dev         libgl1-mesa-glx   

In [10]:
# Print the training entry-point script with syntax highlighting (shell escape to pygmentize).
!pygmentize src/train-rl-mario-ray.py

[34mimport[39;49;00m [04m[36mgym_super_mario_bros[39;49;00m
[34mfrom[39;49;00m [04m[36mgym_super_mario_bros[39;49;00m[04m[36m.[39;49;00m[04m[36mactions[39;49;00m [34mimport[39;49;00m SIMPLE_MOVEMENT, COMPLEX_MOVEMENT
[34mfrom[39;49;00m [04m[36mnes_py[39;49;00m[04m[36m.[39;49;00m[04m[36mwrappers[39;49;00m [34mimport[39;49;00m JoypadSpace
[34mfrom[39;49;00m [04m[36mray[39;49;00m[04m[36m.[39;49;00m[04m[36mrllib[39;49;00m[04m[36m.[39;49;00m[04m[36menv[39;49;00m[04m[36m.[39;49;00m[04m[36mwrappers[39;49;00m[04m[36m.[39;49;00m[04m[36matari_wrappers[39;49;00m [34mimport[39;49;00m (MonitorEnv,
                                          NoopResetEnv,
                                          WarpFrame,
                                          FrameStack)
[34mimport[39;49;00m [04m[36mjson[39;49;00m
[34mimport[39;49;00m [04m[36mos[39;49;00m

[34mimport[39;49;00m [04m[36mgym[39;49;00m
[34mimport[39;49;00m [04m[36mray[

In [None]:
%%time

# Default metric definitions that the SDK provides for the Ray toolkit.
metric_definitions = RLEstimator.default_metric_definitions(RLToolkit.RAY)

# Everything the training job needs, collected in one place before the estimator is built.
train_estimator_kwargs = dict(
    entry_point="train-rl-mario-ray.py",
    source_dir="src",
    dependencies=["common/sagemaker_rl"],
    image_uri=custom_image_name,
    role=role,
    debugger_hook_config=False,
    instance_type=instance_type,
    instance_count=train_instance_count,
    output_path=s3_output_path,
    base_job_name=job_name_prefix,
    metric_definitions=metric_definitions,
    hyperparameters={},
)
estimator = RLEstimator(**train_estimator_kwargs)

# wait=True keeps this cell blocked while the job runs.
estimator.fit(wait=True)

# Later cells use the job name to locate the job's metrics and model artifact.
job_name = estimator.latest_training_job.job_name
print("Training job: %s" % job_name)

2022-11-09 02:48:38 Starting - Starting the training job...ProfilerReport-1667962118: InProgress
...
2022-11-09 02:49:28 Starting - Preparing the instances for training........
2022-11-09 02:51:06 Downloading - Downloading input data
2022-11-09 02:51:06 Training - Downloading the training image......

In [None]:
%matplotlib inline
from sagemaker.analytics import TrainingJobAnalytics

# We need to wait for the job to start before we can read metrics from CloudWatch,
# so the fetch is retried a bounded number of times.
retry_interval = 30  # seconds to sleep after a failed attempt
retry_max_times = 10  # guarded attempts before the final, unguarded one
current_retry = 0

if not local_mode:
    df = None
    while df is None and current_retry < retry_max_times:
        current_retry += 1
        try:
            df = TrainingJobAnalytics(job_name, ["episode_reward_mean"]).dataframe()
        except Exception:
            # Narrowed from a bare `except:` so that interrupting the kernel
            # actually stops the polling loop.
            print("retry {}".format(current_retry))
            time.sleep(retry_interval)

    if df is None:
        # Every guarded attempt failed: fetch once more without a handler so the
        # underlying error is raised. On success the frame from the loop is
        # reused rather than fetched a second time.
        df = TrainingJobAnalytics(job_name, ["episode_reward_mean"]).dataframe()

    num_metrics = len(df)
    if num_metrics == 0:
        print("No algorithm metrics found in CloudWatch")
    else:
        # The object returned by df.plot is named `ax` rather than `plt`, which
        # conventionally refers to matplotlib.pyplot.
        ax = df.plot(x="timestamp", y="value", figsize=(12, 5), legend=True, style="b-")
        ax.set_ylabel("Mean reward per episode")
        ax.set_xlabel("Training time (s)")
else:
    print("Can't plot metrics in local mode.")

In [None]:
# Key of the model artifact inside the bucket: directly under the job name in
# local mode, under "<job>/output/" otherwise.
if local_mode:
    model_tar_key = "{}/model.tar.gz".format(job_name)
else:
    model_tar_key = "{}/output/model.tar.gz".format(job_name)

tmp_dir = "/tmp/{}".format(job_name)
# os.makedirs(exist_ok=True) replaces `os.system("mkdir ...")`: no shell string is
# built and re-running the cell does not fail on an existing directory.
os.makedirs(tmp_dir, exist_ok=True)
print("Create local folder {}".format(tmp_dir))
local_checkpoint_dir = "{}/model".format(tmp_dir)

# Helper imported from `misc`; the file check below expects it to have placed
# model.tar.gz in tmp_dir.
wait_for_s3_object(s3_bucket, model_tar_key, tmp_dir, training_job_name=job_name)

model_tar_path = "{}/model.tar.gz".format(tmp_dir)
if not os.path.isfile(model_tar_path):
    raise FileNotFoundError("File model.tar.gz not found")

os.makedirs(local_checkpoint_dir, exist_ok=True)
# Argument list (no shell) with check=True: a failed extraction now raises
# CalledProcessError instead of being silently ignored.
subprocess.run(["tar", "-xvzf", model_tar_path, "-C", local_checkpoint_dir], check=True)

print("Checkpoint directory {}".format(local_checkpoint_dir))

In [None]:
# Decide where the evaluation job reads the checkpoint from: the extracted local
# directory in local mode, otherwise a copy uploaded to S3 under the job name.
if local_mode:
    checkpoint_path = "file://{}".format(local_checkpoint_dir)
    print("Local checkpoint file path: {}".format(local_checkpoint_dir))
else:
    checkpoint_path = "s3://{}/{}/checkpoint/".format(s3_bucket, job_name)
    if not os.listdir(local_checkpoint_dir):
        raise FileNotFoundError("Checkpoint files not found under the path")
    # Argument list (no shell) with check=True: a failed upload raises
    # CalledProcessError instead of letting the cell report a path that was
    # never populated.
    subprocess.run(
        ["aws", "s3", "cp", "--recursive", local_checkpoint_dir, checkpoint_path],
        check=True,
    )
    print("S3 checkpoint file path: {}".format(checkpoint_path))

In [None]:
%%time

# Settings for the evaluation job: same image, role and instance type as training,
# but a single instance and the evaluation entry point.
eval_estimator_kwargs = dict(
    entry_point="evaluate-mario-ray.py",
    source_dir="src",
    dependencies=["common/sagemaker_rl"],
    image_uri=custom_image_name,
    role=role,
    instance_type=instance_type,
    instance_count=1,
    base_job_name=job_name_prefix + "-evaluation",
    hyperparameters={"evaluate_episodes": 10, "algorithm": "IMPALA", "env": 'SuperMarioBros-v0'},
)
estimator_eval = RLEstimator(**eval_estimator_kwargs)

# The checkpoint location is handed to the job as the "model" input channel.
estimator_eval.fit({"model": checkpoint_path})

# NOTE(review): this overwrites `job_name`, which held the training job's name.
# Re-running the earlier metrics/download cells after this one would target the
# evaluation job instead - confirm whether that is intended.
job_name = estimator_eval.latest_training_job.job_name
print("Evaluation job: %s" % job_name)