# SageMakerの組み込みのXGBoost利用
参考URL  
https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html  
https://sagemaker-examples.readthedocs.io/en/latest/introduction_to_amazon_algorithms/xgboost_abalone/xgboost_abalone.html  
スクリプト不要でXGBoostを使える

In [None]:
import pandas as pd
import numpy as np
import boto3
import sagemaker
from sagemaker.xgboost.estimator import XGBoost
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput
import sklearn.model_selection

In [33]:
# AWSの設定
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
s3_client = boto3.client("s3")

# SageMakerの設定
# AWSが作成したxgboost専用のDockerImageのURIを取得
container = sagemaker.image_uris.retrieve("xgboost", region, "1.5-1")
client = boto3.client("sagemaker", region_name=region)
deploy_amt_model = True

In [43]:
# S3の設定
# 入力データパス
data_bucket = f"sagemaker-ap-northeast-1-237716927536"
data_prefix = "sample_data"
data_bucket_path = f"s3://{data_bucket}"

# S3 bucket for saving code and model artifacts.
# Feel free to specify a different bucket and prefix
output_bucket = sagemaker.Session().default_bucket()
output_prefix = "sagemaker_studio/output"
output_bucket_path = f"s3://{output_bucket}"

In [None]:
# irisデータを読み込む
iris = pd.read_csv(f'{data_bucket_path}/{data_prefix}/iris.csv')
iris.columns

X = iris[['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width']].values# 特徴量
y = iris[ 'Species' ].values # 目的変数

# yの値を数値に変換するための辞書を作成
label_dict = {'setosa': 0, 'versicolor': 1, 'virginica': 2}

# yの値を辞書に従って置き換える
y = np.array([label_dict[label] for label in y])

# データを学習用と評価用とテスト用に8:2で分割するために、まず0.4でわける
X_train, X_rem, y_train, y_rem = sklearn.model_selection.train_test_split(X, y, test_size=0.4, random_state=0)
X_val, X_test, y_val, y_test = sklearn.model_selection.train_test_split(X_rem, y_rem, test_size=0.5, random_state=0)

In [None]:
# 訓練・評価データをS3にアップロードする。
for name, df in {"X_train":X_train, "X_val":X_val, "y_train":y_train, "y_val":y_val}.items() :
    print(name,df.shape)
#     pandasにしてアップロード
    pd.DataFrame(df).to_csv(f"{output_bucket_path}/{output_prefix}/{name}.csv")

In [30]:
# チューニングの設定
tuning_job_config = {
    "ParameterRanges": {
        "CategoricalParameterRanges": [],
        "ContinuousParameterRanges": [
            {
                "MaxValue": "0.5",
                "MinValue": "0.1",
                "Name": "eta",
            }
        ],
        "IntegerParameterRanges": [
            {
                "MaxValue": "10",
                "MinValue": "0",
                "Name": "max_depth",
            }
        ],
    },
    # 探索インスタンスの設定
    # https://docs.aws.amazon.com/sagemaker/latest/dg/automatic-model-tuning-limits.html
    "ResourceLimits": {
        # Jobの最大数。多いほうが探索範囲が多い。
        "MaxNumberOfTrainingJobs": 20,
        # 同時に探索する数。あまり多いとベイズ的な最適化ではなくランダムサーチになる
        "MaxParallelTrainingJobs": 2
    },
    "Strategy": "Bayesian",
    "HyperParameterTuningJobObjective": {"MetricName": "validation:rmse", "Type": "Minimize"},
}

In [40]:
# チューニングジョブ名
from time import gmtime, strftime, sleep
tuning_job_name = "DEMO-xgboost-reg-" + strftime("%d-%H-%M-%S", gmtime())

# チューニングのジョブの設定
training_job_definition = {
    "AlgorithmSpecification": {"TrainingImage": container, "TrainingInputMode": "File"},
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": f"{output_bucket_path}/{output_prefix}/train",
                    "S3DataDistributionType": "FullyReplicated",
                }
            },
            "ContentType": "csv",
            "CompressionType": "None",
        },
        {
            "ChannelName": "validation",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": f"{output_bucket_path}/{output_prefix}/validation",
                    "S3DataDistributionType": "FullyReplicated",
                }
            },
            "ContentType": "csv",
            "CompressionType": "None",
        },
    ],
    "OutputDataConfig": {"S3OutputPath": f"{output_bucket_path}/{output_prefix}/single-xgboost"},
    "ResourceConfig": {"InstanceCount": 1, "InstanceType": "ml.m5.2xlarge", "VolumeSizeInGB": 5},
    "RoleArn": role,
    "StaticHyperParameters": {
        "objective": "reg:linear",
        "verbosity": "2",
    },
    "StoppingCondition": {"MaxRuntimeInSeconds": 43200},
}

In [41]:
print(f"Creating a tuning job with name: {tuning_job_name}. It will take between 12 and 17 minutes to complete.")
client.create_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuning_job_name,
    HyperParameterTuningJobConfig=tuning_job_config,
    TrainingJobDefinition=training_job_definition,
)

Creating a tuning job with name: DEMO-xgboost-reg-16-13-26-39. It will take between 12 and 17 minutes to complete.


ClientError: An error occurred (ValidationException) when calling the CreateHyperParameterTuningJob operation: No S3 objects found under S3 URL "s3://sagemaker-ap-northeast-1-237716927536/sagemaker_studio/output/train" given in input data source. Please ensure that the bucket exists in the selected region (ap-northeast-1), that objects exist under that S3 prefix, and that the role "arn:aws:iam::237716927536:role/service-role/AmazonSageMaker-ExecutionRole-20221124T205608" has "s3:ListBucket" permissions on bucket "sagemaker-ap-northeast-1-237716927536".

In [2]:
# パラメータ設定
hyperparameters = {
        "max_depth":"5",
        "eta":"0.2",
        "gamma":"4",
        "min_child_weight":"6",
        "subsample":"0.7",
        "verbosity":"1",
        "objective":"reg:squarederror",
        "num_round":"50"}

# set an output path where the trained model will be saved
bucket = sagemaker.Session().default_bucket()
prefix = 'DEMO-xgboost-as-a-framework'
output_path = 's3://{}/{}/{}/output'.format(bucket, prefix, 'abalone-xgb-framework')

# construct a SageMaker XGBoost estimator
# specify the entry_point to your xgboost training script
estimator = XGBoost(entry_point = "your_xgboost_abalone_script.py", 
                    framework_version='1.5-1',
                    hyperparameters=hyperparameters,
                    role=sagemaker.get_execution_role(),
                    instance_count=1,
                    instance_type='ml.m5.2xlarge',
                    output_path=output_path)

# define the data type and paths to the training and validation datasets
content_type = "libsvm"
train_input = TrainingInput("s3://{}/{}/{}/".format(bucket, prefix, 'train'), content_type=content_type)
validation_input = TrainingInput("s3://{}/{}/{}/".format(bucket, prefix, 'validation'), content_type=content_type)

# execute the XGBoost training job
estimator.fit({'train': train_input, 'validation': validation_input})

ModuleNotFoundError: No module named 'transformers'