In [25]:
import sagemaker, boto3, json
from sagemaker import get_execution_role

# Name of the IAM role handed to the SageMaker Estimator further down.
aws_role = "sagemaker_developer"

# Region comes from the default boto3 session configuration; it is printed
# so the reader can see which regional JumpStart assets will be used.
boto_session = boto3.Session()
aws_region = boto_session.region_name
print(aws_region)

# SageMaker session used later to look up the default output bucket.
sess = sagemaker.Session()

us-east-1


In [26]:
from sagemaker import image_uris, model_uris, script_uris
# Not every JumpStart model supports fine-tuning, so the model is selected
# by hand: the LightGBM tabular classification model, any version ("*"),
# retrieved for the "training" scope.
train_model_id = "lightgbm-classification-model"
train_model_version = "*"
train_scope = "training"
training_instance_type = "ml.m5.xlarge"

# Model id/version pair shared by every retrieve() call below.
train_model_selector = dict(model_id=train_model_id, model_version=train_model_version)

# Docker image used to run the training job on the chosen instance type.
train_image_uri = image_uris.retrieve(
    region=None,
    framework=None,
    image_scope=train_scope,
    instance_type=training_instance_type,
    **train_model_selector,
)

# Tarball containing the training script.
train_source_uri = script_uris.retrieve(script_scope=train_scope, **train_model_selector)

# Pre-trained model tarball that training starts from.
train_model_uri = model_uris.retrieve(model_scope=train_scope, **train_model_selector)

In [27]:
# Show the resolved image, script and model locations, one per line.
print(train_image_uri, train_source_uri, train_model_uri, sep="\n")

763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:1.9.0-cpu-py38
s3://jumpstart-cache-prod-us-east-1/source-directory-tarballs/lightgbm/transfer_learning/classification/v2.0.2/sourcedir.tar.gz
s3://jumpstart-cache-prod-us-east-1/lightgbm-training/train-lightgbm-classification-model.tar.gz


In [28]:
# Input: sample training data lives in the regional JumpStart cache bucket.
training_data_bucket = f"jumpstart-cache-prod-{aws_region}"
training_data_prefix = "training-datasets/tabular_multiclass/"
training_dataset_s3_path = f"s3://{training_data_bucket}/{training_data_prefix}"

# Output: training results go under a fixed prefix in the session's default bucket.
output_prefix = "jumpstart-example-tabular-training"
output_bucket = sess.default_bucket()
s3_output_location = f"s3://{output_bucket}/{output_prefix}/output"

In [29]:
# The module is imported under an alias because the result below is stored in
# a variable named `hyperparameters` (later cells read it under that name).
# Without the alias, that assignment would replace the module with a dict and
# any further `hyperparameters.retrieve_default(...)` call would fail.
from sagemaker import hyperparameters as jumpstart_hyperparameters

# Retrieve the default hyper-parameters for fine-tuning the model
hyperparameters = jumpstart_hyperparameters.retrieve_default(
    model_id=train_model_id, model_version=train_model_version
)

# [Optional] Override default hyperparameters with custom values.
# The same hyperparameter is named "iterations" for CatBoost.
hyperparameters["num_boost_round"] = "500"
print(hyperparameters)

{'num_boost_round': '500', 'early_stopping_rounds': '30', 'metric': 'auto', 'learning_rate': '0.009', 'num_leaves': '67', 'feature_fraction': '0.74', 'bagging_fraction': '0.53', 'bagging_freq': '5', 'max_depth': '11', 'min_data_in_leaf': '26', 'max_delta_step': '0.0', 'lambda_l1': '0.0', 'lambda_l2': '0.0', 'boosting': 'gbdt', 'min_gain_to_split': '0.0', 'scale_pos_weight': '1.0', 'tree_learner': 'serial', 'feature_fraction_bynode': '1.0', 'is_unbalance': 'False', 'max_bin': '255', 'num_threads': '0', 'verbosity': '1', 'use_dask': 'False'}


In [30]:
from sagemaker.tuner import ContinuousParameter, IntegerParameter, HyperparameterTuner

# Set to False to run a single training job instead of a hyperparameter tuning job.
use_amt = True

# Search space handed to HyperparameterTuner, chosen by model id.
# Exactly one branch runs, so `hyperparameter_ranges` is always (re)defined
# by this cell and can never be a leftover from an earlier execution.
if train_model_id == "lightgbm-classification-model":
    # NOTE(review): LightGBM documents feature_fraction and bagging_fraction as
    # strictly greater than 0; the lower bound of 0 is kept from the original
    # search space -- confirm whether it should be raised.
    hyperparameter_ranges = {
        "learning_rate": ContinuousParameter(1e-4, 1, scaling_type="Logarithmic"),
        "num_boost_round": IntegerParameter(2, 30),
        "early_stopping_rounds": IntegerParameter(2, 30),
        "num_leaves": IntegerParameter(10, 50),
        "feature_fraction": ContinuousParameter(0, 1),
        "bagging_fraction": ContinuousParameter(0, 1),
        "bagging_freq": IntegerParameter(1, 10),
        "max_depth": IntegerParameter(5, 30),
        "min_data_in_leaf": IntegerParameter(5, 50),
    }
elif train_model_id == "catboost-classification-model":
    hyperparameter_ranges = {
        "learning_rate": ContinuousParameter(0.00001, 0.1, scaling_type="Logarithmic"),
        "iterations": IntegerParameter(50, 1000),
        "early_stopping_rounds": IntegerParameter(1, 10),
        "depth": IntegerParameter(1, 10),
        "l2_leaf_reg": IntegerParameter(1, 10),
        "random_strength": ContinuousParameter(0.01, 10, scaling_type="Logarithmic"),
    }
else:
    # No search space is known for this model. Clear any stale value first so
    # a previous run's ranges cannot be picked up by accident, then fail early
    # with a clear message if tuning was requested.
    hyperparameter_ranges = None
    if use_amt:
        raise ValueError(
            f"No tuning ranges are defined for model id {train_model_id!r}; "
            "add a search space for it or set use_amt = False."
        )

In [31]:
from sagemaker.estimator import Estimator
from sagemaker.utils import name_from_base
# Unique job name derived from the model id.
training_job_name = name_from_base(f"jumpstart-{train_model_id}-training")

# Estimator describing a single training job: which container, script and
# starting model to use, on what hardware, and where to write the output.
tabular_estimator = Estimator(
    # what to run
    image_uri=train_image_uri,
    source_dir=train_source_uri,
    entry_point="transfer_learning.py",
    model_uri=train_model_uri,
    hyperparameters=hyperparameters,
    # how to run it
    role=aws_role,
    instance_type=training_instance_type,
    instance_count=1,
    max_run=360000,
    # where results go
    output_path=s3_output_location,
)

# Single "training" channel pointing at the S3 prefix of the training data.
training_channels = {"training": training_dataset_s3_path}

if not use_amt:
    # Plain training job with the hyperparameters given to the estimator.
    tabular_estimator.fit(training_channels, logs=True, job_name=training_job_name)
else:
    # Tuning job: explore `hyperparameter_ranges`, minimizing the
    # multi_logloss value matched in the training logs by the regex below.
    tuner = HyperparameterTuner(
        estimator=tabular_estimator,
        objective_metric_name="multi_logloss",
        hyperparameter_ranges=hyperparameter_ranges,
        metric_definitions=[{"Name": "multi_logloss", "Regex": "multi_logloss: ([0-9\\.]+)"}],
        objective_type="Minimize",
        max_jobs=10,
        max_parallel_jobs=2,
        base_tuning_job_name=training_job_name,
    )
    tuner.fit(training_channels, logs=True)

In [None]:
# Inspect two estimator attributes, one per line.
print(tabular_estimator.base_job_name, tabular_estimator.checkpoint_local_path, sep="\n")

pytorch-training
None


In [None]:
inference_instance_type = "ml.m5.large"

# Container image for serving the model on the chosen instance type.
deploy_image_uri = image_uris.retrieve(
    region=None,
    framework=None,
    model_id=train_model_id,
    model_version=train_model_version,
    image_scope="inference",
    instance_type=inference_instance_type,
)

# Tarball containing the inference script.
deploy_source_uri = script_uris.retrieve(
    model_id=train_model_id,
    model_version=train_model_version,
    script_scope="inference",
)

# Unique endpoint name; the query cell later invokes the endpoint by this name.
endpoint_name = name_from_base(f"jumpstart-example-{train_model_id}-")

# Deploy through the tuner when tuning was used, otherwise through the estimator.
if use_amt:
    trained_model_source = tuner
else:
    trained_model_source = tabular_estimator

predictor = trained_model_source.deploy(
    initial_instance_count=1,
    instance_type=inference_instance_type,
    endpoint_name=endpoint_name,
    image_uri=deploy_image_uri,
    source_dir=deploy_source_uri,
    entry_point="inference.py",
)


2023-02-02 02:09:40 Starting - Found matching resource for reuse
2023-02-02 02:09:40 Downloading - Downloading input data
2023-02-02 02:09:40 Training - Training image download completed. Training in progress.
2023-02-02 02:09:40 Uploading - Uploading generated training model
2023-02-02 02:09:40 Completed - Resource reused by training job: jumpstart-lightgbm-c-230202-1004-007-cd1ca357


In [None]:
# Show the resolved inference image and script locations, one per line.
print(deploy_image_uri, deploy_source_uri, sep="\n")

763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-inference:1.9.0-cpu-py38
s3://jumpstart-cache-prod-us-east-1/source-directory-tarballs/lightgbm/inference/classification/v1.2.1/sourcedir.tar.gz


In [None]:
# Location of the held-out test file in the regional JumpStart cache bucket.
jumpstart_assets_bucket = f"jumpstart-cache-prod-{aws_region}"
test_data_prefix = "training-datasets/tabular_multiclass/test"
test_data_file_name = "data.csv"

# Download the file into the working directory under the same file name.
s3_client = boto3.client("s3")
s3_client.download_file(
    Bucket=jumpstart_assets_bucket,
    Key=f"{test_data_prefix}/{test_data_file_name}",
    Filename=test_data_file_name,
)

In [None]:
# Text helpers for printed messages: a line break and the ANSI escape
# sequences that switch bold rendering on and off.
newline = "\n"
bold = "\033[1m"
unbold = "\033[0m"

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

# Load the header-less CSV and name its columns: the first column is the
# target, the remaining ones are numbered features.
test_data = pd.read_csv(test_data_file_name, header=None)
feature_names = [f"Feature_{i}" for i in range(1, test_data.shape[1])]
test_data.columns = ["Target", *feature_names]

num_examples = test_data.shape[0]
num_columns = test_data.shape[1]
print(
    f"{bold}The test dataset contains {num_examples} examples and {num_columns} columns.{unbold}\n"
)

# Split into the ground-truth target (kept as a one-column frame) and the
# feature columns that are sent to the endpoint.
ground_truth_label = test_data.iloc[:, :1]
features = test_data.iloc[:, 1:]

print(f"{bold}The first 5 observations of the data: {unbold} \n")
test_data.head(5)

[1mThe test dataset contains 10000 examples and 785 columns.[0m

[1mThe first 5 observations of the data: [0m 



Unnamed: 0,Target,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,...,Feature_775,Feature_776,Feature_777,Feature_778,Feature_779,Feature_780,Feature_781,Feature_782,Feature_783,Feature_784
0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
# MIME type of the request body sent to the endpoint.
content_type = "text/csv"


def query_endpoint(encoded_tabular_data):
    """Invoke the deployed endpoint with one encoded payload.

    Args:
        encoded_tabular_data: Request body, sent with the module-level
            ``content_type``.

    Returns:
        The raw ``invoke_endpoint`` response for the endpoint named by the
        module-level ``endpoint_name``.
    """
    runtime_client = boto3.client("runtime.sagemaker")
    return runtime_client.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType=content_type,
        Body=encoded_tabular_data,
    )

def parse_response(query_response):
    """Extract the predicted probabilities from an endpoint response.

    Args:
        query_response: Mapping whose ``"Body"`` entry has a ``read()``
            method returning a JSON document with a ``"probabilities"`` key.

    Returns:
        A NumPy array built from the ``"probabilities"`` value.
    """
    raw_payload = query_response["Body"].read()
    return np.array(json.loads(raw_payload)["probabilities"])


# The test set is too large to send in one request, so it is sent to the
# endpoint in consecutive row batches.
batch_size = 1500
predict_prob = []
for batch_start in range(0, num_examples, batch_size):
    batch_features = features.iloc[batch_start : batch_start + batch_size, :]
    batch_payload = batch_features.to_csv(header=False, index=False).encode("utf-8")
    # prediction probabilities for this batch
    predict_prob.append(parse_response(query_endpoint(batch_payload)))

# Stack the per-batch results row-wise and take the most probable class per row.
predict_prob = np.concatenate(predict_prob, axis=0)
predict_label = np.argmax(predict_prob, axis=1)
# Note. For binary classification, the model returns an array-like of shape (num_examples, 1),
# where each row is the probability of the positive label 1, assuming the target has a
# positive label (encoded as 1) and a negative label (encoded as 0).
# To get the probability for both label 0 and 1, execute the following code:
# predict_prob = np.vstack((1.0 - predict_prob, predict_prob)).transpose()

ValidationError: An error occurred (ValidationError) when calling the InvokeEndpoint operation: Endpoint jumpstart-example-lightgbm-classificati-2023-02-02-02-25-14-191 of account 083839308414 not found.