## SageMaker Training using Weight & Bias



In [6]:
import os
import json

import sagemaker
from sagemaker.pytorch import PyTorch

session = sagemaker.Session()
region = session.boto_region_name

iam_role = "arn:aws:iam::835880313890:role/sagemaker-execution-role"

training_job_output = "s3://sagemaker-wandb-samples/training-jobs/"


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ubuntu/.config/sagemaker/config.yaml


In [7]:
import boto3
import json
import os

from botocore.exceptions import ClientError

from dotenv import load_dotenv
load_dotenv("../../.env")

wandb_secret_name = "weights_and_bias_secret"



# Initialize the Secrets Manager client
secretsmanager = boto3.client('secretsmanager')

# Create the secret with WANDB_API_KEY
secret_value = {"WANDB_API_KEY": os.environ["WANDB_API_KEY"]}
secret_string = json.dumps(secret_value)

# Create the secret in AWS Secrets Manager
try:
    # Try to get the secret to check if it exists
    secretsmanager.get_secret_value(SecretId=wandb_secret_name)
    
    # Secret exists, update it
    response = secretsmanager.update_secret(
        SecretId=wandb_secret_name,
        SecretString=secret_string
    )
    print(f"Secret updated successfully: {wandb_secret_name}")
    
except ClientError as e:
    if e.response['Error']['Code'] == 'ResourceNotFoundException':
        # Secret doesn't exist, create it
        try:
            response = secretsmanager.create_secret(
                Name=wandb_secret_name,
                SecretString=secret_string
            )
            print(f"Secret created successfully: {response['ARN']}")
        except Exception as create_error:
            print(f"Error creating secret: {str(create_error)}")
    else:
        # Other error occurred
        print(f"Error accessing secret: {str(e)}")

Secret updated successfully: weights_and_bias_secret


### Set hyperparameters

In [8]:
import os

os.environ.keys()

KeysView(environ({'USER': 'ubuntu', 'SSH_CLIENT': '127.0.0.1 51516 22', 'XDG_SESSION_TYPE': 'tty', 'SHLVL': '1', 'LD_LIBRARY_PATH': '/opt/amazon/efa/lib:/opt/amazon/openmpi/lib:/opt/amazon/ofi-nccl/lib:/usr/local/cuda/lib:/usr/local/cuda:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda/targets/x86_64-linux/lib:/usr/local/lib:/usr/lib:/opt/amazon/efa/lib:/opt/amazon/openmpi/lib:/opt/amazon/ofi-nccl/lib:/usr/local/cuda/lib:/usr/local/cuda:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda/targets/x86_64-linux/lib:/usr/local/lib:/usr/lib', 'MOTD_SHOWN': 'pam', 'HOME': '/home/ubuntu', 'MODULES_CMD': '/usr/lib/x86_64-linux-gnu/modulecmd.tcl', 'SSL_CERT_FILE': '/usr/lib/ssl/certs/ca-certificates.crt', 'DBUS_SESSION_BUS_ADDRESS': 'unix:path=/run/user/1000/bus', 'LOGNAME': 'ubuntu', '_': '/home/ubuntu/workspace/deep-learning/sagemaker-training-job-wandb-samples/.venv/bin/python', 'XDG_SESSION_CLASS': 'user', 'XDG_SESSION_ID': '1', 'VSCODE_CLI_REQUI

In [None]:
instance_type = 'ml.g6.xlarge'
image_uri = "835880313890.dkr.ecr.us-east-1.amazonaws.com/mnist-training:latest"

estimator = PyTorch(
    entry_point="train.py",
    source_dir="code",
    role=iam_role,
    instance_type="ml.g5.xlarge", 
    instance_count=1,
    volume_size=50,
    output_path=training_job_output,
    hyperparameters={
        "epochs": 5
    }, 
    environment={
        "WANDB_PROJECT_NAME": "MNIST",
        "WANDB_SECRET_NAME": wandb_secret_name,
        "AWS_REGION": "us-east-1" # for secret resources access.

        # "WANDB_CHECKPOINT_NAME": "", # if provided, will download the checkpoint for ML training
        # "WANDB_CHECKPOINT_TAG": "" # if not provided, will use 'latest'
    },
    image_uri=image_uri
)

In [12]:
estimator.fit()

INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: mnist-training-2025-08-28-20-36-23-370


2025-08-28 20:36:23 Starting - Starting the training job
2025-08-28 20:36:23 Pending - Training job waiting for capacity....................................
2025-08-28 20:42:31 Downloading - Downloading input data...
2025-08-28 20:42:41 Downloading - Downloading the training image.................................
2025-08-28 20:48:25 Training - Training image download completed. Training in progress...bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
Skipping CUDA compat setup as package not found
sed: can't read changehostname.c: No such file or directory
cc1: fatal error: changehostname.c: No such file or directory
compilation terminated.
/usr/bin/ld: cannot find changehostname.o: No such file or directory
collect2: error: ld returned 1 exit status
ERROR: ld.so: object '/libchangehostname.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/libchangehostname.so' from LD_P