In [22]:
import os
import logging
from datetime import datetime
import boto3
import sagemaker
import tarfile
import pickle
from sagemaker.session import TrainingInput
from sagemaker import image_uris
from sagemaker import hyperparameters

SETTINGS

In [23]:
# S3 folder where the training job writes its output (model artifacts).
model_op = "s3://sagemaker-bucket-ds/training-jobs/model/"
# S3 object holding the training data.
train_file = "s3://sagemaker-bucket-ds/training-jobs/data/train/iris_train.csv"

BASIC OBJECTS

In [24]:
# A single SageMaker session is created here and reused by the cells below.
sg_session = sagemaker.Session()
# Region is read from the session rather than hard-coded.
region = sg_session.boto_region_name
# Pass the existing session explicitly so get_execution_role() reuses it
# instead of constructing a second, implicit Session.
role_arn = sagemaker.get_execution_role(sagemaker_session=sg_session)
# Low-level S3 client for direct bucket access.
s3 = boto3.client('s3')

print(region)
print(role_arn)

eu-west-1
arn:aws:iam::211125740051:role/service-role/AmazonSageMaker-ExecutionRole-20240607T130532


WE NEED TO GET THE DOCKER IMAGE FOR XGBOOST

In [25]:
# Look up the URI of the XGBoost Docker image for the current region.
# NOTE(review): "latest" is an unpinned tag — consider requesting an explicit
# version for reproducibility; confirm any downstream model-loading code first.
model_img = image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="latest",
)
print(model_img)

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


685385470294.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest


CREATE OBJECT WITH TRAINING DATA

It is necessary because XGBoost expects a different file format.

Here you can specify the format as CSV.

In [26]:
# Wrap the S3 training file in a TrainingInput so its content type is declared explicitly.
train_ip = TrainingInput(
    s3_data=train_file,
    content_type="csv",
)

CREATE ESTIMATOR OBJECT

In [27]:
# Prefix used for the names of training jobs started from this notebook.
base_job_name = "xgboost-07-"

# Generic estimator that runs the retrieved XGBoost image as a training job.
xgb_model = sagemaker.estimator.Estimator(
    # --- container, permissions and session ---
    image_uri=model_img,          # Docker image that will be used on the instance
    role=role_arn,                # role used to execute the job
    sagemaker_session=sg_session,
    # --- job naming and output location ---
    base_job_name=base_job_name,  # name prefix of the job
    output_path=model_op,         # where the model output is saved
    # --- compute resources ---
    instance_type="ml.m5.large",  # size of the instance
    instance_count=1,             # how many instances are spawned
    volume_size=5,                # volume (in GB) assigned to the instance
)

SET HYPERPARAMETERS

In [28]:
# Collect the hyperparameters in one mapping, then hand them to the estimator.
xgb_hyperparameters = {
    "num_class": 3,
    "max_depth": 3,
    "num_round": 10,
    "objective": "multi:softprob",
    # Learning rate
    "eta": 0.3,
    # Subsample ratio of the training instances
    "subsample": 1,
    # Subsample ratio of columns when constructing each tree
    "colsample_bytree": 1,
    # Minimum sum of instance weight (hessian) needed in a child
    "min_child_weight": 1,
    # Minimum loss reduction required to make a further partition on a leaf node
    "gamma": 0,
    # L1 regularization term on weights
    "alpha": 0,
    "eval_metric": 'mlogloss',
}
xgb_model.set_hyperparameters(**xgb_hyperparameters)

In [29]:
# Build the job name from the shared prefix plus a timestamp (second resolution).
job_name = f"{base_job_name}{datetime.today():%Y-%m-%d-%H-%M-%S}"
print(job_name)

# Start training on the "train" channel; wait=True blocks until the job ends.
xgb_model.fit(
    inputs={"train": train_ip},
    wait=True,
    job_name=job_name,
)

INFO:sagemaker:Creating training-job with name: xgboost-07-2024-06-27-14-06-46


xgboost-07-2024-06-27-14-06-46
2024-06-27 14:06:47 Starting - Starting the training job...
2024-06-27 14:07:02 Starting - Preparing the instances for training...
2024-06-27 14:07:30 Downloading - Downloading input data...
2024-06-27 14:08:15 Downloading - Downloading the training image.....[34mArguments: train[0m
[34m[2024-06-27:14:08:54:INFO] Running standalone xgboost training.[0m
[34m[2024-06-27:14:08:54:INFO] Path /opt/ml/input/data/validation does not exist![0m
[34m[2024-06-27:14:08:54:INFO] File size need to be processed in the node: 0.0mb. Available memory size in the node: 249.62mb[0m
[34m[2024-06-27:14:08:54:INFO] Determined delimiter of CSV input is ','[0m
[34m[14:08:54] S3DistributionType set as FullyReplicated[0m
[34m[14:08:54] 67x4 matrix with 268 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[14:08:54] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2 extra nodes, 0 pruned nodes, max_depth=1[0m
[34m[14: