In [4]:
import logging
from datetime import datetime

In [5]:
import boto3
import sagemaker
from sagemaker.session import TrainingInput
from sagemaker import image_uris
from sagemaker import hyperparameters

In [6]:
import os
from dotenv import load_dotenv

# Pull settings from a local .env file (if one is found) into the environment.
load_dotenv()

# Bucket, key prefix, region and the local execution role come from the environment.
S3_BUCKET = os.getenv("S3_BUCKET")
S3_PREFIX = os.getenv("S3_PREFIX")
REGION = os.getenv("REGION")
SAGE_MAKER_LOCAL_ROLE = os.getenv("SAGE_MAKER_LOCAL_ROLE")

# Fail fast: every S3 URI in this notebook is built from these two values, and
# an unset variable would otherwise be formatted as the literal string "None".
missing_vars = [
    name
    for name, value in (("S3_BUCKET", S3_BUCKET), ("S3_PREFIX", S3_PREFIX))
    if not value
]
if missing_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_vars)
    )

print(f"S3_BUCKET: {S3_BUCKET}")
print(f"REGION: {REGION}")
print(f"SAGE_MAKER_LOCAL_ROLE: {SAGE_MAKER_LOCAL_ROLE}")

S3_BUCKET: sgmkr-thangtran3112
REGION: us-west-2
SAGE_MAKER_LOCAL_ROLE: arn:aws:iam::654654352356:role/service-role/AmazonSageMaker-ExecutionRole-20250111T085887


In [7]:
# Attach a stream handler to the "botocore.credentials" logger at WARNING level.
boto3.set_stream_logger("botocore.credentials", logging.WARNING)

In [8]:
# Take the region from a default SageMaker session rather than hard-coding it.
default_session = sagemaker.Session()
region = default_session.boto_region_name
print(region)

us-west-2


Create a SageMaker execution role with the `AmazonSageMakerFullAccess` policy attached.

In [9]:
import os

# Pick the execution role: looked up from SageMaker when the SM_CURRENT_HOST
# variable is present, otherwise taken from the local environment setting.
if "SM_CURRENT_HOST" in os.environ:
    print("Running in SageMaker Studio")
    # Only meaningful inside a SageMaker Studio notebook.
    role_arn = sagemaker.get_execution_role()
else:
    print("Not running in SageMaker Studio. Using custom role for local computer")
    # On a local machine the role ARN is supplied through the environment.
    role_arn = SAGE_MAKER_LOCAL_ROLE

# Fail fast: without a role the Estimator built later cannot start a job, and
# the error raised at that point is much harder to trace back to this cell.
if not role_arn:
    raise RuntimeError(
        "No execution role ARN available; set SAGE_MAKER_LOCAL_ROLE when running locally"
    )

print(role_arn)

Not running in SageMaker Studio. Using custom role for local computer
arn:aws:iam::654654352356:role/service-role/AmazonSageMaker-ExecutionRole-20250111T085887


In [10]:
# List what is stored directly under the project prefix in the bucket.
!aws s3 ls {S3_BUCKET}/{S3_PREFIX}/

                           PRE batch_transform/
                           PRE data/


In [11]:
# Recursively list the objects under the data/ prefix.
!aws s3 ls {S3_BUCKET}/{S3_PREFIX}/data/ --recursive

2025-01-11 11:34:19        900 iris/data/iris_test.csv
2025-01-11 11:34:19       1800 iris/data/iris_train.csv


In [12]:
# Object keys of the training and validation CSVs, relative to S3_PREFIX.
train_file, valid_file = "data/iris_train.csv", "data/iris_test.csv"

# Full S3 URIs for both files.
train_file_uri = f"s3://{S3_BUCKET}/{S3_PREFIX}/{train_file}"
valid_file_uri = f"s3://{S3_BUCKET}/{S3_PREFIX}/{valid_file}"

print("train file uri:", train_file_uri)
print("valid file uri:", valid_file_uri)

train file uri: s3://sgmkr-thangtran3112/iris/data/iris_train.csv
valid file uri: s3://sgmkr-thangtran3112/iris/data/iris_test.csv


In [13]:
# Training channel: the train CSV on S3, declared with content type "csv".
train_ip = TrainingInput(s3_data=train_file_uri, content_type="csv")
print(train_ip)

<sagemaker.inputs.TrainingInput object at 0x746c0cca59d0>


In [14]:
# Validation channel: the held-out CSV on S3, declared with content type "csv".
valid_ip = TrainingInput(s3_data=valid_file_uri, content_type="csv")
print(valid_ip)

<sagemaker.inputs.TrainingInput object at 0x746c0be19580>


In [15]:
# S3 location passed to the estimator as its output_path (model artifacts land here).
model_op = f"s3://{S3_BUCKET}/{S3_PREFIX}/model"
print(model_op)

s3://sgmkr-thangtran3112/iris/model


In [16]:
# Look up the XGBoost container image URI for the session's region.
# NOTE(review): the "latest" tag is kept as-is; confirm whether an explicit,
# pinned XGBoost version should be requested instead.
model_img = image_uris.retrieve(framework="xgboost", region=region, version="latest")
print(model_img)

433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest


In [17]:
# Prefix for training job names; a timestamp is appended when the job name is built.
base_job_name = "iris-xgboost-"

In [18]:
# Generic Estimator wrapping the XGBoost image. A single instance is requested,
# which is ample for the tiny Iris training file.
xgb_model = sagemaker.estimator.Estimator(
    # what to run and under which identity
    image_uri=model_img,
    role=role_arn,
    sagemaker_session=sagemaker.Session(),
    # job naming and where the model artifact is written
    base_job_name=base_job_name,
    output_path=model_op,
    # training hardware
    instance_type="ml.m4.xlarge",
    instance_count=1,
    volume_size=5,  # storage volume size in GB
)

In [19]:

xgb_model.set_hyperparameters(
    objective="multi:softmax",  # multiclass objective; used together with num_class
    num_class=3,  # the Iris dataset has exactly three classes
    num_round=10,  # number of boosting rounds (iterations) for training the model
    max_depth=5,  # cap on tree depth, to limit overfitting
)

In [20]:
# Job name = shared prefix + current local time, formatted down to the second.
job_name = f"{base_job_name}{datetime.now():%Y-%m-%d-%H-%M-%S}"
print(job_name)

iris-xgboost-2025-01-11-18-03-55


In [21]:
# Launch the training job with both data channels and block until it finishes.
xgb_model.fit(
    inputs={"train": train_ip, "validation": valid_ip},
    job_name=job_name,
    wait=True,
)

2025-01-12 02:03:58 Starting - Starting the training job...
2025-01-12 02:04:11 Starting - Preparing the instances for training...
2025-01-12 02:04:38 Downloading - Downloading input data...
2025-01-12 02:05:08 Downloading - Downloading the training image...
2025-01-12 02:05:59 Training - Training image download completed. Training in progress...Arguments: train
[2025-01-12:02:06:11:INFO] Running standalone xgboost training.
[2025-01-12:02:06:11:INFO] File size need to be processed in the node: 0.0mb. Available memory size in the node: 8455.9mb
[2025-01-12:02:06:11:INFO] Determined delimiter of CSV input is ','
[02:06:11] S3DistributionType set as FullyReplicated
[02:06:11] 100x4 matrix with 400 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,
[2025-01-12:02:06:11:INFO] Determined delimiter of CSV input is ','
[02:06:11] S3DistributionType set as FullyReplicated
[02:06:11] 50x4 matrix with 200 entries loaded from /opt/ml/input/data/validation?format=cs

In [22]:
# List the contents of the model/ output prefix.
!aws s3 ls {S3_BUCKET}/{S3_PREFIX}/model/

                           PRE iris-xgboost-2025-01-11-18-03-55/


In [23]:
# List what is stored under this training job's own prefix.
!aws s3 ls {S3_BUCKET}/{S3_PREFIX}/model/{job_name}/

                           PRE debug-output/
                           PRE output/
                           PRE profiler-output/
