# Manage and Observe a Training Job

As you train and experiment with models using Training Jobs, it is important to understand how to manage and observe your job.

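**Note**: For this notebook, select the `PyTorch 1.12 Python 3.8 CPU Optimized` image and the `ml.c5.large` instance type. This is the notebook's own environment; the training job launched below runs on the instance type configured on the estimator.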


## Retrieve Variables

In [None]:
# Restore variables from IPython's %store database into this kernel
# (presumably persisted by an earlier notebook in this series -- confirm
# they have been stored before running this cell).
%store -r default_bucket
%store -r base_prefix
%store -r data_s3_uri
%store -r role

In [None]:
# Echo each restored variable, one per line, to confirm the restore worked.
for restored_value in (default_bucket, base_prefix, data_s3_uri, role):
    print(restored_value)

## Create a new model training script
We will create a new training script that uses TensorBoard's `SummaryWriter` to log the train and test loss and accuracy after every epoch.

In [None]:
%%writefile ./script/mnist_tb.py

import argparse
import json
import logging
import sys
import time
import os
from os.path import join
import boto3
import torch
from torchvision import datasets, transforms
from torch.utils.tensorboard import SummaryWriter

# Module logger: emit every record (DEBUG and above) to stdout.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
stdout_handler = logging.StreamHandler(stream=sys.stdout)
logger.addHandler(stdout_handler)


# Directory inside the training container that receives TensorBoard event
# files. It must match the `container_local_output_path` given to the
# notebook's TensorBoardOutputConfig.
tensorboard_log_dir = "/opt/ml/output/tensorboard"
writer = SummaryWriter(log_dir=tensorboard_log_dir)

# Based on https://github.com/pytorch/examples/blob/master/mnist/main.py
class Net(torch.nn.Module):
    """Two-convolution CNN classifier producing log-probabilities for 10 classes.

    Args:
        hidden_channels: number of output channels of the first convolution.
        kernel_size: kernel size shared by both convolutions.
        drop_out: drop probability of the 2-D dropout applied after the
            second convolution.
    """

    def __init__(self, hidden_channels, kernel_size, drop_out):
        super().__init__()
        nn = torch.nn
        # Layers are created in a fixed order so that seeded initialisation
        # is reproducible.
        self.conv1 = nn.Conv2d(1, hidden_channels, kernel_size=kernel_size)
        self.conv2 = nn.Conv2d(hidden_channels, 20, kernel_size=kernel_size)
        self.conv2_drop = nn.Dropout2d(p=drop_out)
        # 320 = 20 channels x 4 x 4; this holds for 28x28 inputs with
        # kernel_size=5 (assumes MNIST-sized images -- TODO confirm for
        # other kernel sizes).
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images ``x``."""
        F = torch.nn.functional

        # First conv stage: convolution -> 2x2 max-pool -> ReLU.
        first_pooled = F.max_pool2d(self.conv1(x), 2)
        x = F.relu(first_pooled)

        # Second conv stage, with channel dropout applied before pooling.
        second_features = self.conv2_drop(self.conv2(x))
        x = F.relu(F.max_pool2d(second_features, 2))

        # Flatten to (batch, 320) for the fully connected head.
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        # Functional dropout must be told explicitly whether we are training.
        x = F.dropout(x, training=self.training)
        logits = self.fc2(x)
        return F.log_softmax(logits, dim=1)


def log_performance(model, data_loader, device, epoch, metric_type="Test"):
    """Evaluate ``model`` on ``data_loader`` and record loss and accuracy.

    The model is switched to eval mode and run without gradient tracking.
    The per-sample average negative log-likelihood loss and the accuracy
    (in percent) are written to the module-level TensorBoard ``writer`` and
    to ``logger``.

    Args:
        model: network whose outputs are per-class log-probabilities.
        data_loader: loader yielding ``(data, target)`` batches.
        device: device the batches are moved to before the forward pass.
        epoch: step value attached to the TensorBoard scalars.
        metric_type: label prefixed to the metric names (e.g. "Train", "Test").
    """
    nll_loss = torch.nn.functional.nll_loss

    model.eval()
    total_loss = 0
    n_correct = 0
    with torch.no_grad():
        for inputs, labels in data_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            log_probs = model(inputs)
            # Sum (not mean) per batch so that dividing by the dataset size
            # below yields a true per-sample average.
            total_loss += nll_loss(log_probs, labels, reduction="sum").item()
            # Index of the largest log-probability is the predicted class.
            _, predicted = log_probs.max(1, keepdim=True)
            matches = predicted.eq(labels.view_as(predicted))
            n_correct += matches.sum().item()

    n_samples = len(data_loader.dataset)
    loss = total_loss / n_samples
    accuracy = 100.0 * n_correct / n_samples

    # Record the metrics in TensorBoard and in the text log.
    writer.add_scalar(f"{metric_type} average loss", loss, epoch)
    writer.add_scalar(f"{metric_type} accuracy", accuracy, epoch)
    summary = "{} Average loss: {:.4f}, {} Accuracy: {:.4f}%;\n".format(
        metric_type, loss, metric_type, accuracy
    )
    logger.info(summary)


def train_model(
    train_set, test_set, optimizer="sgd", epochs=10, hidden_channels=10
):
    """
    Train the CNN classifier on the MNIST digits and return the trained model.

    After every epoch the model is evaluated on both the train and the test
    set through ``log_performance``.

    Args:
        train_set (torchvision.datasets.mnist.MNIST): train dataset
        test_set (torchvision.datasets.mnist.MNIST): test dataset
        optimizer (str): the optimization algorithm to use for training;
                         "sgd" selects SGD with momentum, any other value
                         selects Adam
        epochs (int): number of complete passes over the training dataset
        hidden_channels (int): number of hidden channels in the model

    Returns:
        torch.nn.DataParallel: the trained ``Net`` wrapped in ``DataParallel``
    """
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # Seed the global RNG before the model is built so runs are repeatable.
    torch.manual_seed(42)

    make_loader = torch.utils.data.DataLoader
    train_loader = make_loader(train_set, batch_size=64, shuffle=True)
    test_loader = make_loader(test_set, batch_size=1000, shuffle=True)

    # Report how much of each dataset the samplers will visit.
    n_train_sampled = len(train_loader.sampler)
    n_train_total = len(train_loader.dataset)
    logger.info(
        "Processes {}/{} ({:.0f}%) of train data".format(
            n_train_sampled,
            n_train_total,
            100.0 * n_train_sampled / n_train_total,
        )
    )

    n_test_sampled = len(test_loader.sampler)
    n_test_total = len(test_loader.dataset)
    logger.info(
        "Processes {}/{} ({:.0f}%) of test data".format(
            n_test_sampled,
            n_test_total,
            100.0 * n_test_sampled / n_test_total,
        )
    )

    net = Net(hidden_channels, kernel_size=5, drop_out=0.5).to(device)
    model = torch.nn.DataParallel(net)

    learning_rate = 0.01
    sgd_momentum = 0.5
    log_every = 100  # emit a progress line every `log_every` batches

    if optimizer == "sgd":
        optim = torch.optim.SGD(
            model.parameters(), lr=learning_rate, momentum=sgd_momentum
        )
    else:
        optim = torch.optim.Adam(model.parameters(), lr=learning_rate)

    n_batches = len(train_loader)
    nll_loss = torch.nn.functional.nll_loss

    for epoch in range(1, epochs + 1):
        print("Training Epoch:", epoch)
        model.train()
        # Batches are numbered from 1 so the progress figures are human-friendly.
        for step, (inputs, labels) in enumerate(train_loader, 1):
            inputs = inputs.to(device)
            labels = labels.to(device)

            optim.zero_grad()
            batch_loss = nll_loss(model(inputs), labels)
            batch_loss.backward()
            optim.step()

            if step % log_every != 0:
                continue
            logger.info(
                "Train Epoch: {} [{}/{} ({:.0f}%)], Train Loss: {:.6f};".format(
                    epoch,
                    step * len(inputs),
                    n_train_sampled,
                    100.0 * step / n_batches,
                    batch_loss.item(),
                )
            )

        # End-of-epoch evaluation on both splits.
        log_performance(model, train_loader, device, epoch, "Train")
        log_performance(model, test_loader, device, epoch, "Test")

    return model


def save_model(model, model_dir):
    """Move ``model`` to the CPU and write its state dict to ``model_dir``.

    The weights are stored as ``model.pth`` inside ``model_dir``.

    NOTE(review): ``train_model`` returns the ``DataParallel`` wrapper, so the
    saved keys presumably carry DataParallel's ``module.`` prefix -- confirm
    that the loading code expects that.
    """
    logger.info("Saving the model.")
    destination = os.path.join(model_dir, "model.pth")
    # Saving the state dict rather than the whole module is the recommended way:
    # http://pytorch.org/docs/master/notes/serialization.html
    cpu_model = model.cpu()
    weights = cpu_model.state_dict()
    torch.save(weights, destination)


if __name__ == "__main__":
    # Script entry point: parse hyperparameters and container paths, load the
    # MNIST data from the training channel, train, and save the model.
    parser = argparse.ArgumentParser()

    # Hyperparameters
    parser.add_argument(
        "--epochs",
        type=int,
        default=10,
        metavar="N",
        help="number of epochs to train (default: 10)",
    )
    parser.add_argument("--optimizer", type=str, default="sgd", help="optimizer for training.")
    parser.add_argument(
        "--hidden_channels",
        type=int,
        default=10,
        help="number of channels in hidden conv layer",
    )

    # Container environment. SageMaker sets these variables inside the
    # training container. The fallbacks are the equivalent container paths, so
    # building the parser no longer raises KeyError when the script is run
    # elsewhere with explicit command-line arguments.
    parser.add_argument(
        "--model-dir",
        type=str,
        default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"),
    )
    parser.add_argument(
        "--num-gpus",
        type=int,
        default=os.environ.get("SM_NUM_GPUS", 0),
    )
    parser.add_argument(
        "--data-dir",
        type=str,
        default=os.environ.get("SM_CHANNEL_TRAINING", "/opt/ml/input/data/training"),
    )

    args = parser.parse_args()

    # Both splits share the same preprocessing: convert to tensor, then
    # normalise with the same mean and standard deviation.
    mnist_transform = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
    )

    # download=False: the data must already be present in the data directory.
    train_set = datasets.MNIST(
        args.data_dir,
        train=True,
        transform=mnist_transform,
        download=False,
    )

    test_set = datasets.MNIST(
        args.data_dir,
        train=False,
        transform=mnist_transform,
        download=False,
    )

    try:
        model = train_model(
            train_set,
            test_set,
            optimizer=args.optimizer,
            epochs=args.epochs,
            hidden_channels=args.hidden_channels,
        )
    finally:
        # SummaryWriter buffers events and flushes them periodically. Closing
        # it makes sure the last epochs' metrics reach disk before the process
        # exits, even if training fails.
        writer.close()
    save_model(model, args.model_dir)

We will also create a `requirements.txt` with `tensorboard` as a dependency.

In [None]:
%%writefile ./script/requirements.txt
tensorboard==2.13.0

We will create a `TensorBoardOutputConfig` to map the local TensorBoard log directory to S3.

In [None]:
from sagemaker.debugger import TensorBoardOutputConfig

# S3 prefix under which the TensorBoard logs will be stored.
tensorboard_log_s3_uri = f"s3://{default_bucket}/{base_prefix}/tensorboard"
print(tensorboard_log_s3_uri)

# The local path is the directory the training script's SummaryWriter
# writes to (`tensorboard_log_dir` in mnist_tb.py).
tensorboard_output_config = TensorBoardOutputConfig(
    s3_output_path=tensorboard_log_s3_uri,
    container_local_output_path="/opt/ml/output/tensorboard",
)

From the System terminal, run the following commands, replacing `<tensorboard_log_s3_uri>` with the S3 URI printed by the previous cell. You must run this from the `/home/sagemaker-user` root directory.

```bash
pip install tensorboard boto3

tensorboard --logdir <tensorboard_log_s3_uri>
```

To launch TensorBoard, copy your Studio URL and replace `lab?` with `proxy/6006/` as follows. You must include the trailing `/` character.

https://<YOUR_URL>.studio.region.sagemaker.aws/jupyter/default/proxy/6006/


Create the `PyTorch` estimator

In [None]:
from sagemaker.pytorch import PyTorch

# Where the trained model artifacts are uploaded.
model_output_s3_uri = f"s3://{default_bucket}/{base_prefix}/models"

# Passed to mnist_tb.py as --epochs / --hidden_channels / --optimizer.
training_hyperparameters = {"epochs": 200, "hidden_channels": 5, "optimizer": "adam"}

estimator = PyTorch(
    # Training code
    entry_point="mnist_tb.py",
    source_dir="script",
    hyperparameters=training_hyperparameters,
    # Container
    framework_version="1.12",
    py_version="py38",
    # Infrastructure
    role=role,
    instance_type="ml.c5.xlarge",
    instance_count=1,
    volume_size=10,
    # Data in / artifacts out
    input_mode="File",
    output_path=model_output_s3_uri,
    tensorboard_output_config=tensorboard_output_config,
)

Create a unique training job name and start the training job. We will also set wait to `False` (non-blocking).

In [None]:
import time

# A nanosecond timestamp keeps the job name unique across runs.
job_name = f"training-job-{time.time_ns()}"
print(job_name)

# The "training" channel is what the script reads through SM_CHANNEL_TRAINING.
# wait=False returns immediately instead of blocking until the job ends.
estimator.fit(inputs={"training": data_s3_uri}, job_name=job_name, wait=False)

Let's describe the training job using the AWS CLI

In [None]:
# Shell escape: IPython substitutes {job_name} with the Python variable's value.
! aws sagemaker describe-training-job --training-job-name {job_name}

We can also do the same using `boto3`

In [None]:
import boto3

# Low-level SageMaker API client; it is reused later to stop the job.
sm_client = boto3.client(service_name="sagemaker")

# Left as the last expression so the notebook displays the response.
sm_client.describe_training_job(TrainingJobName=job_name)

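We can use `Session.logs_for_job` to tail the logs of our training job. With `wait=True` the call keeps streaming new log entries until the job completes, so interrupt the kernel when you want to stop tailing; the training job itself keeps running.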

In [None]:
from sagemaker import Session

session = Session()

# wait=True keeps polling for new log entries until the job completes.
session.logs_for_job(job_name, wait=True)

To stop the job, let's use the AWS CLI

In [None]:
# Shell escape: ask SageMaker to stop the job named by the Python variable job_name.
! aws sagemaker stop-training-job --training-job-name {job_name}

Here's the equivalent with `boto3`. Run either this cell or the CLI command above; the job only needs to be stopped once.

In [None]:
# Request a stop of the training job through the SageMaker client created earlier.
sm_client.stop_training_job(TrainingJobName=job_name)

Let's describe the training job again to make sure it has been stopped

In [None]:
# Describe the job again; check the status fields in the output to confirm the stop.
! aws sagemaker describe-training-job --training-job-name {job_name}