In [8]:
import os
import sagemaker

from sagemaker.xgboost import XGBoost
from sagemaker import get_execution_role

CREATE FOLDER TO SAVE TRAINING CODE

In [9]:
# Make sure the local folder that will hold the training script exists.
# exist_ok=True keeps this cell safe to re-run.
os.makedirs(name="training_code_04", exist_ok=True)

CREATE TRAINING ENTRY POINT

In [15]:
%%writefile training_code_04/train.py

from __future__ import print_function

import argparse
import joblib
import os
import pandas as pd
import xgboost as xgb

def parse_hyperparameters(argv=None):
    """Parse model hyperparameters from the command line.

    The defaults match the values that used to be hardcoded, so running the
    script without any arguments trains exactly the same model as before.

    Args:
        argv: List of argument strings; ``None`` means ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with ``max_depth``, ``n_estimators`` and
        ``learning_rate`` attributes.
    """
    # allow_abbrev=False so an unrelated argument such as "--max" is never
    # silently interpreted as "--max_depth".
    parser = argparse.ArgumentParser(allow_abbrev=False)
    parser.add_argument("--max_depth", type=int, default=5)
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--learning_rate", type=float, default=0.1)

    # parse_known_args: ignore any extra arguments the launcher may pass
    # instead of aborting the training job.
    known_args, _unknown = parser.parse_known_args(argv)
    return known_args


def find_training_file(train_dir):
    """Return the path of the training file inside ``train_dir``.

    Only regular, non-hidden files are considered. If several are present the
    alphabetically first one is used, because ``os.listdir`` returns entries
    in arbitrary order and the choice must be deterministic.

    Args:
        train_dir: Local folder holding the data of the "train" channel.

    Returns:
        Full path of the selected training file.

    Raises:
        FileNotFoundError: If the folder contains no usable file.
    """
    candidates = sorted(
        name
        for name in os.listdir(train_dir)
        if not name.startswith(".")
        and os.path.isfile(os.path.join(train_dir, name))
    )
    if not candidates:
        raise FileNotFoundError(
            "No training file found in {!r}".format(train_dir)
        )
    return os.path.join(train_dir, candidates[0])


if __name__ == '__main__':
    hyperparameters = parse_hyperparameters()

    model_dir = os.environ['SM_MODEL_DIR'] # Folder where model must be saved
    train_dir = os.environ['SM_CHANNEL_TRAIN'] # Folder where train data is stored

    # Let's assume there is only one training file
    train_file_path = find_training_file(train_dir)

    train_data = pd.read_csv(train_file_path, header=None, engine="python")

    # Labels are in the first column, features in the remaining ones
    train_y = train_data.iloc[:, 0]
    train_X = train_data.iloc[:, 1:]

    # Train the model
    # Hyperparameters come from the command line (defaults: 5 / 100 / 0.1)
    clf = xgb.XGBClassifier(
        max_depth=hyperparameters.max_depth,
        n_estimators=hyperparameters.n_estimators,
        learning_rate=hyperparameters.learning_rate,
    )
    clf = clf.fit(train_X, train_y)

    # Save model object
    joblib.dump(clf, os.path.join(model_dir, "model.joblib"))

Overwriting training_code_04/train.py


CREATE NECESSARY OBJECTS

In [16]:
# Session object handed to the estimator below.
sagemaker_session = sagemaker.Session()

# Execution role of this notebook; the training job will run under it.
role = get_execution_role()

CREATE ESTIMATOR OBJECT

In [17]:
# Estimator that runs our own training script.
xgboost = XGBoost(
    # --- who runs the job ---
    role=role,                            # role used during execution
    sagemaker_session=sagemaker_session,
    # --- what code is run ---
    source_dir='training_code_04',        # folder with the training code
    entry_point='train.py',               # training script inside source_dir
    framework_version='1.3-1',            # XGBoost version to use
    # --- where it is run ---
    instance_count=1,                     # number of training instances
    instance_type='ml.m5.large',          # type of the training instance(s)
    # Prefix of the training job name; a timestamp is appended as suffix.
    base_job_name='training-job-xgboost',
)

INFO:sagemaker.image_uris:Ignoring unnecessary Python version: py3.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: ml.m5.large.


FIT THE MODEL
- The fit function has one required parameter, given in the form of a dictionary.
- Each key is the name of a data "channel". Typical names are train, test and valid, but the names can be arbitrary.
- Each value is the path to the S3 folder that contains the data for that channel.

The data from all channels will be copied to the training instance.

To obtain the LOCAL path to the data, use os.environ['SM_CHANNEL_<CHANNEL>'], replacing <CHANNEL> with the actual channel name in upper case.

Example:

If we pass the dictionary {"train": "s3://sagemaker-bucket-ds/training-jobs/data/train/"}, then inside the training script the local path to the training data is available as os.environ['SM_CHANNEL_TRAIN'].


In [18]:
# Start the training job. The "train" channel points at the S3 folder with the
# training data.
xgboost.fit(
    inputs={"train": "s3://sagemaker-bucket-ds/training-jobs/data/train/"},
)

INFO:sagemaker:Creating training-job with name: training-job-xgboost-2024-06-27-07-57-48-049


2024-06-27 07:57:48 Starting - Starting the training job...
2024-06-27 07:58:03 Starting - Preparing the instances for training...
2024-06-27 07:58:34 Downloading - Downloading input data...
2024-06-27 07:59:14 Downloading - Downloading the training image.....[34m[2024-06-27 07:59:59.702 ip-10-0-146-110.eu-west-1.compute.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2024-06-27 07:59:59.730 ip-10-0-146-110.eu-west-1.compute.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2024-06-27:07:59:59:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2024-06-27:07:59:59:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2024-06-27:07:59:59:INFO] Invoking user training script.[0m
[34m[2024-06-27:07:59:59:INFO] Module train does not provide a setup.py. [0m
[34mGenerating setup.py[0m
[34m[2024-06-27:07:59:59:INFO] Generating setup.cfg[0m
[34m[2024-06-27:07:59:59:INFO] Generating MANIFEST.in[0