In [1]:
pip install sagemaker -U

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Standard library
import os
import re
from time import strftime, gmtime

# Third-party
import boto3
import sagemaker
from IPython.display import display
from sagemaker.inputs import TrainingInput
from sagemaker.serializers import CSVSerializer
from sagemaker.session import Session
from sagemaker.xgboost.estimator import XGBoost
from sklearn import preprocessing

# Build a single SageMaker session and derive the region and default bucket
# from it, rather than constructing a new Session for every lookup.
session = Session()
role = sagemaker.get_execution_role()
region = session.boto_region_name
bucket = session.default_bucket()

# S3 key prefix under which this notebook stores its datasets and outputs.
prefix = "sagemaker/DEMO-xgboost-dist-algo"



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


## 2. Data Preparation and Visualization

In [5]:
# Download the synthetic churn dataset from the region-specific example
# bucket into the working directory as churn.txt.
dataset_bucket = f"sagemaker-example-files-prod-{region}"
dataset_key = "datasets/tabular/synthetic/churn.txt"
local_dataset_file = "churn.txt"

s3 = boto3.client("s3")
s3.download_file(dataset_bucket, dataset_key, local_dataset_file)

In [3]:
import pandas as pd

# Allow up to 500 columns in rich DataFrame output so wide frames are not truncated.
pd.set_option("display.max_columns", 500)

# Read the downloaded dataset and preview its first five rows.
churn = pd.read_csv("./churn.txt")
churn.head(5)

Unnamed: 0,State,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,Eve Mins,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?
0,PA,163,806,403-2562,no,yes,300,8.162204,3,7.579174,3.933035,4,6.508639,4.065759,100,5.111624,4.92816,6,5.673203,3,True.
1,SC,15,836,158-8416,yes,no,0,10.018993,4,4.226289,2.325005,0,9.972592,7.14104,200,6.436188,3.221748,6,2.559749,8,False.
2,MO,131,777,896-6253,no,yes,300,4.70849,3,4.76816,4.537466,3,4.566715,5.363235,100,5.142451,7.139023,2,6.254157,4,False.
3,WY,75,878,817-5729,yes,yes,700,1.268734,3,2.567642,2.528748,5,2.333624,3.773586,450,3.814413,2.245779,6,1.080692,6,False.
4,WY,146,878,450-4942,yes,no,0,2.696177,3,5.908916,6.015337,3,3.670408,3.751673,250,2.796812,6.905545,4,7.134343,6,True.


In [4]:
# Stack 5000 copies of the table on top of each other purely to inflate the
# amount of training data. Re-running this cell multiplies the frame again.
replication_factor = 5000
churn = pd.concat([churn for _ in range(replication_factor)], ignore_index=True)

In [30]:
# The `target` column is only created by a later cell (from "Churn?"), so on a
# fresh top-to-bottom run it does not exist yet. Inspect whichever label
# column is currently present instead of raising a KeyError.
label_column = "target" if "target" in churn.columns else "Churn?"
churn[label_column].unique()

array([1, 0])

In [34]:
# NOTE(review): the Parquet exports below are disabled, yet later cells upload
# "train.parquet", "validation.parquet" and "test.parquet". Also, `val` and
# `test` are only defined by the train/validation/test split cell further down,
# so these lines cannot simply be re-enabled at this position.
# churn.to_parquet('train.parquet', index=False)

# val.to_parquet('validation.parquet')
# test.to_parquet('test.parquet')

In [5]:
# Remove the "Phone" column and store "Area Code" with object dtype
# (presumably so the later one-hot encoding treats it as categorical — confirm).
churn = churn.drop(columns=["Phone"])
churn = churn.astype({"Area Code": object})

In [6]:
# Encode the textual "Churn?" label as a numeric `target` column (1 = churned,
# 0 = not churned) and discard the original text column.
label_encoding = {"True.": 1, "False.": 0}
churn = churn.assign(target=churn["Churn?"].map(label_encoding)).drop(columns=["Churn?"])

In [7]:
# Move `target` to the first column. The CSV files are later written without
# headers, so column position is the only thing that identifies the label.
# Columns are selected by name rather than by position, so re-running the cell
# is a no-op instead of duplicating `target` and dropping the last feature.
feature_columns = [column for column in churn.columns if column != "target"]
churn = churn[["target"] + feature_columns]

In [8]:
# One Hot Encode Cat Variables
churn=pd.get_dummies(churn, dtype=int)
# churn

In [33]:
from sklearn.model_selection import train_test_split

# Both splits hold out 30% and use the same seed, stratified on the label:
# 70% train, then the remaining 30% is divided into validation (70% of it)
# and test (30% of it).
holdout_fraction = 0.3
split_seed = 42

train, val_n_test = train_test_split(
    churn,
    test_size=holdout_fraction,
    random_state=split_seed,
    stratify=churn["target"],
)

val, test = train_test_split(
    val_n_test,
    test_size=holdout_fraction,
    random_state=split_seed,
    stratify=val_n_test["target"],
)

In [9]:
# Persist each split locally. The CSV files carry no header and no index, so
# the label is identified purely by being the first column.
train.to_csv("train.csv", header=False, index=False)
val.to_csv("validation.csv", header=False, index=False)
test.to_csv("test.csv", header=False, index=False)

# Also write Parquet copies: the upload cells below send "train.parquet",
# "validation.parquet" and "test.parquet" to S3, and no other active cell
# creates those files.
train.to_parquet("train.parquet", index=False)
val.to_parquet("validation.parquet", index=False)
test.to_parquet("test.parquet", index=False)

For demonstration purposes, to include multiple files under the training channel, we simply duplicate the training data multiple times, as shown below.

Because the training logic uses `xgboost.rabit()` for distributed training, it is recommended to split the training set into multiple chunks so that they can be distributed across training nodes. For single-node training, however, this is not required.

In [49]:
from tqdm import tqdm

# Create the bucket handle once instead of building a new boto3 session and
# resource on every iteration.
s3_bucket = boto3.Session().resource("s3").Bucket(bucket)

# Upload the same local Parquet file under ten different keys so that the
# training channel contains multiple files. S3 keys always use "/" as the
# separator, so the key is built with an f-string rather than os.path.join.
for i in tqdm(range(10)):
    s3_bucket.Object(f"{prefix}/train_xgb_micro_pq/data_{i}.parquet").upload_file(
        "train.parquet"
    )

100%|██████████| 10/10 [00:15<00:00,  1.51s/it]


In [37]:
# Upload the local validation Parquet file to its own prefix in the bucket.
validation_key = os.path.join(prefix, "validation_xgb_large_pq/data.parquet")
validation_object = boto3.Session().resource("s3").Bucket(bucket).Object(validation_key)
validation_object.upload_file("validation.parquet")

In [38]:
# Upload the local test Parquet file to its own prefix in the bucket.
test_key = os.path.join(prefix, "test_xgb_large_pq/data.parquet")
test_object = boto3.Session().resource("s3").Bucket(bucket).Object(test_key)
test_object.upload_file("test.parquet")

In [14]:
# S3 locations of the CSV channels consumed by the script-mode training job,
# plus a separate output location.
# NOTE(review): no cell visible in this notebook uploads data to the
# train_xgb_single/ or validation_xgb/ prefixes — confirm they are populated.
output_prefix = "xgboost-perf-training"

training_dataset_s3_path = "s3://{}/{}/train_xgb_single/".format(bucket, prefix)
validation_dataset_s3_path = "s3://{}/{}/validation_xgb/".format(bucket, prefix)
s3_output_location = "s3://{}/{}/output_xgb".format(bucket, output_prefix)

(training_dataset_s3_path, validation_dataset_s3_path)

('s3://sagemaker-us-east-1-715253196401/sagemaker/DEMO-xgboost-dist-algo/train_xgb_single/',
 's3://sagemaker-us-east-1-715253196401/sagemaker/DEMO-xgboost-dist-algo/validation_xgb/')

## Create an XGBoost training script

SageMaker can now run an XGBoost script using the XGBoost estimator. When run on SageMaker, a number of helpful environment variables are available to access properties of the training environment, such as:

- `SM_MODEL_DIR`: A string representing the path to the directory to write model artifacts to. Any artifacts saved in this folder are uploaded to S3 for model hosting after the training job completes.
- `SM_OUTPUT_DIR`: A string representing the filesystem path to write output artifacts to. Output artifacts may include checkpoints, graphs, and other files to save, not including model artifacts. These artifacts are compressed and uploaded to S3 to the same S3 prefix as the model artifacts.

When two input channels, `train` and `validation`, are used in the call to the XGBoost estimator's `fit()` method, the following environment variables are set, following the format `SM_CHANNEL_[channel_name]`:

- `SM_CHANNEL_TRAIN`: A string representing the path to the directory containing data in the 'train' channel.
- `SM_CHANNEL_VALIDATION`: Same as above, but for the 'validation' channel.

A typical training script loads data from the input channels, configures training with hyperparameters, trains a model, and saves a model to the `model_dir` so that it can be hosted later. Hyperparameters are passed to your script as arguments and can be retrieved with an `argparse.ArgumentParser` instance. For example, the script that we run in this notebook is provided as the accompanying file (`abalone.py`) and also shown below:

In [37]:
# Print the training script (code/abalone.py) with syntax highlighting;
# `-g` lets Pygments guess the lexer from the file.
!pygmentize -g code/abalone.py

[37m#  Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.[39;49;00m[37m[39;49;00m
[37m#[39;49;00m[37m[39;49;00m
[37m#  Licensed under the Apache License, Version 2.0 (the "License").[39;49;00m[37m[39;49;00m
[37m#  You may not use this file except in compliance with the License.[39;49;00m[37m[39;49;00m
[37m#  A copy of the License is located at[39;49;00m[37m[39;49;00m
[37m#[39;49;00m[37m[39;49;00m
[37m#      http://www.apache.org/licenses/LICENSE-2.0[39;49;00m[37m[39;49;00m
[37m#[39;49;00m[37m[39;49;00m
[37m#  or in the "license" file accompanying this file. This file is distributed[39;49;00m[37m[39;49;00m
[37m#  on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either[39;49;00m[37m[39;49;00m
[37m#  express or implied. See the License for the specific language governing[39;49;00m[37m[39;49;00m
[37m#  permissions and limitations under the License.[39;49;00m[37m[39;49;00m
[34mfrom[39;49;00m [04m[36m_

Because the container imports your training script, always put your training code in a main guard `(if __name__=='__main__':)` so that the container does not inadvertently run your training code at the wrong point in execution.

For more information about training environment variables, please visit the [SageMaker Training Toolkit](https://github.com/aws/sagemaker-training-toolkit).

In [66]:
# Training instance type, S3 location for model artifacts, and the format of
# the dataset files.
instance_type = "ml.m5.xlarge"
output_path = f"s3://{bucket}/{prefix}/algo-dist-xgb/output"
content_type = "csv"  # dataset extension

# Hyperparameters handed to the training job. All values are given as strings,
# and the dataset format is forwarded alongside the XGBoost settings.
hyperparams = dict(
    max_depth="5",
    eta="0.2",
    gamma="4",
    min_child_weight="6",
    subsample="0.7",
    objective="binary:logistic",
    num_round="50",
    verbosity="3",
    eval_metric="auc",
    content_type=content_type,
)

---
## Train the XGBoost model

After setting training parameters, we kick off training, and poll for status until training is complete.

To run our training script on SageMaker, we construct a sagemaker.xgboost.estimator.XGBoost estimator, which accepts several constructor arguments:

* __entry_point__: The path to the Python script that SageMaker runs for training and prediction.
* __role__: Role ARN
* __instance_type__ *(optional)*: The type of SageMaker instances for training.
* __sagemaker_session__ *(optional)*: The session used to train on SageMaker.
* __hyperparameters__ *(optional)*: A dictionary passed to the train function as hyperparameters.

SageMaker Training Directory Setup for Script Mode:

- Create a root project directory.
- Place main training script (e.g., train.py) in root.
- Add other Python modules/scripts to root or subdirectories.
- Include a `requirements.txt` for dependencies. SageMaker automatically installs all libraries listed in this file.

Example structure:
```
project/
    ├── train.py
    ├── requirements.txt
    ├── utils.py
```

SageMaker estimator setup:
```
estimator = Estimator(
    entry_point='train.py',
    source_dir='path/to/project',
    ...
)
```
Key points:

- Include all necessary code files.
- List dependencies in `requirements.txt`.
- SageMaker packages the entire directory content.

---

In [None]:
# Open-source XGBoost, distributed script mode
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput
from sagemaker.xgboost.estimator import XGBoost

session = Session()
script_path = "abalone.py"
instance_count = 2  # for distributed training, set the instance count greater than 1

# Your MLflow tracking server ARN (exposed to the training container as an
# environment variable).
tracking_server_arn = "arn:aws:sagemaker:us-east-1:1234567890:mlflow-tracking-server/test"

# Shard the training files across instances only when more than one instance
# is used; a single instance receives the full dataset.
train_distribution = "ShardedByS3Key" if instance_count > 1 else "FullyReplicated"

xgb_script_mode_estimator = XGBoost(
    source_dir="code",  # parent folder of the training logic and its dependencies
    entry_point=script_path,  # training script inside source_dir
    framework_version="1.7-1",  # note: framework_version is mandatory
    hyperparameters=hyperparams,
    role=role,
    volume_size=50,
    instance_count=instance_count,
    instance_type=instance_type,
    output_path=output_path,
    environment={"MLFLOW_TRACKING_ARN": tracking_server_arn},
    # Keep instances warm to avoid cold starts between quick experiments.
    # Warm instances incur charges.
    keep_alive_period_in_seconds=1000,
)

train_input = TrainingInput(
    training_dataset_s3_path,
    content_type="text/csv",
    distribution=train_distribution,
)
validation_input = TrainingInput(validation_dataset_s3_path, content_type="text/csv")

channels = {"train": train_input, "validation": validation_input}
xgb_script_mode_estimator.fit(channels)

In [None]:
# Ad-hoc peek at one CSV object in S3. The bucket comes from the session's
# default bucket instead of a hard-coded, account-specific name, and only the
# first rows are displayed rather than the whole frame.
# NOTE(review): the DEMO-churn-dt/train2 prefix is not written by this notebook — confirm it exists.
pd.read_csv(f"s3://{bucket}/sagemaker/DEMO-churn-dt/train2/data_106.csv").head()

In [24]:
# NOTE(review): ad-hoc arithmetic with no stated purpose — label the two
# quantities (or remove this cell) before sharing the notebook.
0.04*96

3.84

In [None]:
import os
import boto3
import re
import sagemaker
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput
from sagemaker.xgboost.estimator import XGBoost

# One shared session; the region is read from it instead of constructing a
# second Session.
session = Session()
role = sagemaker.get_execution_role()
region = session.boto_region_name

# `bucket` and `prefix` are defined in the setup cell; override them here if needed.
# bucket = "<Specify S3 Bucket>"
# prefix = "<Specify S3 prefix>"
output_path = "s3://{}/{}/{}/output".format(bucket, prefix, "algo-dist-xgb")

# Hyperparameters for the built-in XGBoost container: GPU histogram tree
# method with Dask-based GPU training enabled.
hyperparams = {
    "objective": "binary:logistic",
    "num_round": "500",
    "verbosity": "3",
    "tree_method": "gpu_hist",
    "eval_metric": "auc",
    "use_dask_gpu_training": "true",
}

content_type = "application/x-parquet"
instance_type = "ml.g5.16xlarge"

# Built-in XGBoost image for this region, driven through the generic Estimator.
xgboost_container = sagemaker.image_uris.retrieve("xgboost", region, "1.7-1")
xgb_script_mode_estimator = sagemaker.estimator.Estimator(
    image_uri=xgboost_container,
    hyperparameters=hyperparams,
    role=role,
    instance_count=10,
    instance_type=instance_type,
    output_path=output_path,
    max_run=7200,
    volume_size=500,
    keep_alive_period_in_seconds=1000,
)

# Point the channels at the Parquet prefixes populated by the upload cells
# above. They are built from `bucket` and `prefix` rather than a hard-coded,
# account-specific bucket name, so the cell works in any account and region.
train_input = TrainingInput(
    f"s3://{bucket}/{prefix}/train_xgb_micro_pq/", content_type=content_type
)

validation_input = TrainingInput(
    f"s3://{bucket}/{prefix}/validation_xgb_large_pq/", content_type=content_type
)

xgb_script_mode_estimator.fit({"train": train_input, "validation": validation_input})

## Deploy the XGBoost model
Once the training is done, SageMaker packages your model artifacts along with any dependencies used for training, including the `inference.py` script, which we will use as our inference logic. It is also possible to pass separate inference logic if you wish.
After training, we deploy the model using the Estimator object and point to our inference script for serving the model. Here we have defined the SageMaker-specific functions `model_fn` and `predict_fn` to load the model and make predictions with it.

In [68]:
# Deploy the trained estimator to a single-instance real-time endpoint,
# serving it with the inference script from the model package.
endpoint_options = dict(
    initial_instance_count=1,
    instance_type="ml.m5.xlarge",
    entry_point="inference.py",  # path to the inference script within the model package
)
predictor = xgb_script_mode_estimator.deploy(**endpoint_options)

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-10-28-03-37-11-315
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2024-10-28-03-37-11-315
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2024-10-28-03-37-11-315


------!

In [85]:
# Load test Data
import pandas as pd
features=pd.read_csv("test.csv").iloc[:,1:]
num_examples=len(features)

In [86]:
# MIME type of the request payload; query_endpoint() below passes it as the
# ContentType of each invocation.
content_type = "text/csv"
import boto3
import json
import numpy as np

# Serialize request payloads as CSV when calling predictor.predict().
predictor.serializer = sagemaker.serializers.CSVSerializer()

def query_endpoint(encoded_tabular_data, endpoint_name):
    """Invoke a SageMaker endpoint with an already-encoded payload.

    Args:
        encoded_tabular_data: Request body to send to the endpoint.
        endpoint_name: Name of the endpoint to invoke.

    Returns:
        The raw response returned by the runtime client's ``invoke_endpoint``.

    The request's content type is taken from the module-level ``content_type``.
    """
    runtime_client = boto3.client("runtime.sagemaker")
    return runtime_client.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType=content_type,
        Body=encoded_tabular_data,
    )

# Query the endpoint in fixed-size batches so that a large test set is not
# sent in a single request; the predictions are accumulated in order.
batch_size = 1500
predict_prob = []
for batch_start in range(0, num_examples, batch_size):
    batch = features.iloc[batch_start : batch_start + batch_size, :]
    payload = batch.to_csv(header=False, index=False).strip()
    predict_prob.extend(predictor.predict(payload))

In [None]:
# Preview the first few predictions instead of dumping the entire list.
predict_prob[:10]

# Automatic model Tuning

Amazon SageMaker automatic model tuning, also known as hyperparameter tuning, finds the best version of a model by running many training jobs on your dataset using the algorithm and ranges of hyperparameters that you specify. It then chooses the hyperparameter values that result in the model that performs best, as measured by a metric that you choose. For example, suppose that you want to solve a binary classification problem on this churn dataset. Your goal is to maximize the area under the curve (AUC) metric by training an XGBoost model. You don't know which values of the `eta`, `alpha`, `min_child_weight`, and `max_depth` hyperparameters to use to train the best model. To find the best values for these hyperparameters, you can specify ranges of values that Amazon SageMaker hyperparameter tuning searches to find the combination of values that results in the training job that performs best, as measured by the objective metric that you chose. Hyperparameter tuning launches training jobs that use hyperparameter values in the ranges that you specified, and returns the training job with the highest AUC.

In [87]:
from sagemaker.tuner import (
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
    IntegerParameter,
)

# Metric the tuning job optimizes.
objective_metric_name = "validation:auc"

# Search space explored by the tuner, one range per hyperparameter.
hyperparameter_ranges = dict(
    eta=ContinuousParameter(0, 1),
    min_child_weight=ContinuousParameter(1, 10),
    alpha=ContinuousParameter(0, 2),
    max_depth=IntegerParameter(1, 10),
)

In [89]:
# Configure the tuning job around the existing estimator and launch it on the
# same train/validation channels.
tuner = HyperparameterTuner(
    estimator=xgb_script_mode_estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    max_jobs=6,  # maximum number of candidate training jobs
    max_parallel_jobs=3,  # number of jobs to run in parallel
)

tuning_channels = {"train": train_input, "validation": validation_input}
tuner.fit(tuning_channels)

INFO:sagemaker:Creating hyperparameter tuning job with name: sagemaker-xgboost-241028-0353


.............................................................................!


In [None]:
inference_instance_type = "ml.m5.xlarge"

# Deploy through the tuner from the previous step to a single-instance
# SageMaker endpoint, served with the inference script.
tuned_endpoint_options = dict(
    initial_instance_count=1,
    instance_type=inference_instance_type,
    entry_point="inference.py",
)
predictor = tuner.deploy(**tuned_endpoint_options)

In [91]:
# Load test Data
import pandas as pd
features=pd.read_csv("test.csv").iloc[:,1:]
num_examples=len(features)

In [92]:
import boto3
import json
import numpy as np

content_type = "text/csv"

# Serialize request payloads as CSV when calling predictor.predict().
predictor.serializer = sagemaker.serializers.CSVSerializer()

# Query the endpoint in fixed-size batches so that a large test set is not
# sent in a single request; the predictions are accumulated in order.
batch_size = 1500
predict_prob = []
for batch_start in range(0, num_examples, batch_size):
    batch = features.iloc[batch_start : batch_start + batch_size, :]
    payload = batch.to_csv(header=False, index=False).strip()
    predict_prob.extend(predictor.predict(payload))

In [None]:
# Preview the first few predictions instead of dumping the entire list.
predict_prob[:10]