# Example of SageMaker Debugging
Use the `conda_pytorch_p36` kernel for this notebook.

# Preparation: environment
1. Create an S3 bucket.  
S3 bucket: work-sagemaker-debugger (Ohio)  
2. Create a SageMaker Notebook Instance.  
SageMaker Notebook Instance: ml.t2.medium (Ohio)  
3. Run this notebook.  
4. Run 2_tensorboard.ipynb.  
5. Access TensorBoard.  

In [1]:
!python -m pip install smdebug

Collecting smdebug
[?25l  Downloading https://files.pythonhosted.org/packages/bf/2d/d8a1692b06701b6f00ca21931c5546b6abff21423855d923e45333f71f7c/smdebug-0.5.0.post0-py2.py3-none-any.whl (149kB)
[K    100% |████████████████████████████████| 153kB 6.8MB/s ta 0:00:01
[?25hCollecting protobuf>=3.6.0 (from smdebug)
[?25l  Downloading https://files.pythonhosted.org/packages/ca/ac/838c8c8a5f33a58132dd2ad2a30329f6ae1614a9f56ffb79eaaf71a9d156/protobuf-3.11.2-cp36-cp36m-manylinux1_x86_64.whl (1.3MB)
[K    100% |████████████████████████████████| 1.3MB 10.4MB/s ta 0:00:01
[31mfastai 1.0.59 requires nvidia-ml-py3, which is not installed.[0m
Installing collected packages: protobuf, smdebug
  Found existing installation: protobuf 3.5.2
    Uninstalling protobuf-3.5.2:
      Successfully uninstalled protobuf-3.5.2
Successfully installed protobuf-3.11.2 smdebug-0.5.0.post0
[33mYou are using pip version 10.0.1, however version 20.0.2 is available.
You should consider upgrading via the 'pip insta

## Download the sample PyTorch SageMaker Debugger script from awslabs

https://github.com/awslabs/amazon-sagemaker-examples/blob/714652a4f96fd764a247a4cee30425293db86c59/sagemaker-debugger/pytorch_custom_container/scripts/pytorch_mnist.py

In [2]:
!wget https://github.com/awslabs/amazon-sagemaker-examples/raw/714652a4f96fd764a247a4cee30425293db86c59/sagemaker-debugger/pytorch_custom_container/scripts/pytorch_mnist.py


--2020-01-28 03:24:15--  https://github.com/awslabs/amazon-sagemaker-examples/raw/714652a4f96fd764a247a4cee30425293db86c59/sagemaker-debugger/pytorch_custom_container/scripts/pytorch_mnist.py
Resolving github.com (github.com)... 192.30.253.113
Connecting to github.com (github.com)|192.30.253.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/awslabs/amazon-sagemaker-examples/714652a4f96fd764a247a4cee30425293db86c59/sagemaker-debugger/pytorch_custom_container/scripts/pytorch_mnist.py [following]
--2020-01-28 03:24:15--  https://raw.githubusercontent.com/awslabs/amazon-sagemaker-examples/714652a4f96fd764a247a4cee30425293db86c59/sagemaker-debugger/pytorch_custom_container/scripts/pytorch_mnist.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.248.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.248.133|:443... connected.
HTTP request sent, awaiting response... 200 O

In [3]:
from sagemaker.debugger import CollectionConfig, DebuggerHookConfig

# --- User settings: where the debugger saves tensors ----
bucket = "work-sagemaker-debugger"  # replace with the name of your own bucket
prefix_for_tensors = "your_s3_prefix_to_save_tensors"  # may be left unchanged
# --------------------------------------------------------

# Debugger hook configuration: a single collection named "all", written to
# s3://<bucket>/<prefix_for_tensors>.
hook_config = DebuggerHookConfig(
    collection_configs=[CollectionConfig(name="all")],
    s3_output_path=f"s3://{bucket}/{prefix_for_tensors}",
)

In [4]:
from sagemaker.debugger import TensorBoardOutputConfig

# --- User settings: where TensorBoard logs are written --
prefix_for_tensorboard = "your_s3_prefix_for_tensorboard_logs"  # may be left unchanged
# --------------------------------------------------------

# `bucket` comes from the cell that builds hook_config, so that cell must run first.
tb_config = TensorBoardOutputConfig(
    s3_output_path=f"s3://{bucket}/{prefix_for_tensorboard}"
)

# Configure debugging rules
https://github.com/awslabs/sagemaker-debugger/blob/master/docs/sagemaker.md#rules  
https://github.com/awslabs/sagemaker-debugger/blob/master/docs/sagemaker.md#built-in-rules  
https://github.com/awslabs/amazon-sagemaker-examples/blob/master/sagemaker-debugger/tensorflow_keras_custom_rule/tf-keras-custom-rule.ipynb

In [5]:
from sagemaker.debugger import Rule, rule_configs

# Wrap each built-in rule configuration in a SageMaker-managed Rule.
# The order below is the order in which the rules are passed to the estimator.
rules = [
    Rule.sagemaker(make_rule_config())
    for make_rule_config in (
        rule_configs.exploding_tensor,
        rule_configs.vanishing_gradient,
        rule_configs.weight_update_ratio,
        rule_configs.loss_not_decreasing,
    )
]

### Set learning_rate to a large value and observe how the rules react

In [6]:
from sagemaker import get_execution_role
from sagemaker.pytorch import PyTorch

# Entry-point script (downloaded earlier in this notebook) and the directory
# passed to it as its "data_dir" hyperparameter.
script_path = "pytorch_mnist.py"
training_dir = "/tmp/pytorch-smdebug"

# Execution role as reported by the SageMaker SDK; handed to the estimator below.
role = get_execution_role()

# Hyperparameters forwarded to the training script. The learning rate (1e-1) is
# intentionally large for this experiment.
hyperparameters = {
    "random_seed": True,
    "epochs": 5,
    "learning_rate": 1e-1,
    "data_dir": training_dir,
}

# PyTorch estimator wired up with the debugger hook, TensorBoard output and
# rules configured in the previous cells.
# NOTE(review): train_instance_count / train_instance_type are the SageMaker
# Python SDK v1 argument names; confirm which SDK version this kernel provides.
estimator = PyTorch(
    role=role,
    entry_point=script_path,
    hyperparameters=hyperparameters,
    framework_version="1.3.1",
    py_version="py3",
    train_instance_type="ml.c5.xlarge",
    train_instance_count=1,
    rules=rules,
    debugger_hook_config=hook_config,
    tensorboard_output_config=tb_config,
)

### start training

In [7]:
# Start the training job. wait=False is passed so that this cell is expected to
# return without blocking until the job finishes; the following cells poll the
# rule status instead. NOTE(review): confirm the wait semantics against the SDK docs.
estimator.fit(wait=False)

### Print the evaluation status of each debugger rule

In [8]:
# Print one "<rule name>: <evaluation status>" line per rule attached to the latest training job.
for rule_summary in estimator.latest_training_job.rule_job_summary():
    rule_name = rule_summary["RuleConfigurationName"]
    rule_status = rule_summary["RuleEvaluationStatus"]
    print(f"{rule_name}: {rule_status}")


ExplodingTensor: InProgress
VanishingGradient: InProgress
WeightUpdateRatio: InProgress
LossNotDecreasing: InProgress


### About 10 minutes later...
Run the cell below repeatedly until every rule reports a final status.

In [10]:
# Re-query the rule summaries and print one "<rule name>: <evaluation status>" line per rule.
for rule_summary in estimator.latest_training_job.rule_job_summary():
    rule_name = rule_summary["RuleConfigurationName"]
    rule_status = rule_summary["RuleEvaluationStatus"]
    print(f"{rule_name}: {rule_status}")

ExplodingTensor: Error
VanishingGradient: IssuesFound
WeightUpdateRatio: IssuesFound
LossNotDecreasing: Error


### When the status is displayed as below, proceed to the next cell.
ExplodingTensor: Error  
VanishingGradient: IssuesFound  
WeightUpdateRatio: IssuesFound  
LossNotDecreasing: Error  

# ----- displayed above? -----

In [11]:
from smdebug.trials import create_trial

# Ask the estimator where the latest job's debugger artifacts are stored, then
# open that location as an smdebug trial.
debugger_artifacts_path = estimator.latest_job_debugger_artifacts_path()
trial = create_trial(path=debugger_artifacts_path)

[2020-01-28 03:34:39.544 ip-172-16-28-203:6000 INFO s3_trial.py:42] Loading trial debug-output at path s3://work-sagemaker-debugger/your_s3_prefix_to_save_tensors/pytorch-training-2020-01-28-03-24-24-400/debug-output


### get tensor names

In [12]:
# Show the names of the tensors recorded in this trial (weights, biases, their
# gradients and the loss output, per the list displayed below the cell).
trial.tensor_names()

[2020-01-28 03:34:42.932 ip-172-16-28-203:6000 INFO trial.py:197] Training has ended, will refresh one final time in 1 sec.
[2020-01-28 03:34:43.952 ip-172-16-28-203:6000 INFO trial.py:209] Loaded all steps


['CrossEntropyLoss_output_0',
 'Net_conv1.bias',
 'Net_conv1.weight',
 'Net_conv2.bias',
 'Net_conv2.weight',
 'Net_fc1.bias',
 'Net_fc1.weight',
 'Net_fc2.bias',
 'Net_fc2.weight',
 'gradient/Net_conv1.bias',
 'gradient/Net_conv1.weight',
 'gradient/Net_conv2.bias',
 'gradient/Net_conv2.weight',
 'gradient/Net_fc1.bias',
 'gradient/Net_fc1.weight',
 'gradient/Net_fc2.bias',
 'gradient/Net_fc2.weight']

### Print the tensor values at each saved step

In [13]:
# Display the saved values of the fc1 weight gradient. The result shown below
# the cell is a dict keyed by step number (0, 500, 1000, ...) with one float32
# array per step.
trial.tensor("gradient/Net_fc1.weight").values()

{0: array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [-0.02159601, -0.00996249, -0.02803487, ..., -0.05145397,
         -0.03079144, -0.0200104 ],
        ...,
        [-0.07065905, -0.03657869, -0.08962931, ..., -0.15103655,
         -0.09744397, -0.06175173],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.05386365,  0.02484792,  0.06992311, ...,  0.12833382,
          0.07679842,  0.0499089 ]], dtype=float32),
 500: array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32),
 1000: array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
  

# Visualize with TensorBoard
Go to 2_tensorboard.ipynb.  
Use the `conda_tensorflow_p36` kernel there and copy the logdir value printed below.

In [14]:
# Emit a ready-to-paste `logdir = "..."` assignment pointing at the TensorBoard log prefix.
print(f'logdir = "s3://{bucket}/{prefix_for_tensorboard}/"')

logdir = "s3://work-sagemaker-debugger/your_s3_prefix_for_tensorboard_logs/"
