In [None]:
import logging
import os

import boto3
import numpy as np
import pandas as pd
import sagemaker
from sagemaker import get_execution_role
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.serializers import CSVSerializer
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

logging.getLogger('sagemaker').setLevel(logging.ERROR)

## preprocess data
* Put the features and the label into a single CSV file.
* The first column must be the label, followed by the feature columns.
* The file must not contain a header row.

https://docs.aws.amazon.com/sagemaker/latest/dg/ex1-preprocess-data.html#ex1-preprocess-data-transform

In [None]:
# Load the iris features (x) and their class labels (y).
iris = load_iris()
x = iris.data
y = iris.target

# Split data into training and validation sets
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)

# DataFrame.to_csv does not create missing parent directories, so make sure
# the local output folder exists before writing (e.g. on a fresh checkout).
os.makedirs('iris_data', exist_ok=True)

# Write label-first, header-less CSV files: the label is inserted as column 0
# and the feature columns follow.
train_df = pd.DataFrame(x_train, columns=iris.feature_names)
train_df.insert(0, 'species', y_train)
train_df.to_csv('iris_data/train.csv', index=False, header=False)

val_df = pd.DataFrame(x_val, columns=iris.feature_names)
val_df.insert(0, 'species', y_val)
val_df.to_csv('iris_data/val.csv', index=False, header=False)

# save to s3
sess = sagemaker.Session()
bucket = sess.default_bucket()
# Build the bucket handle once and reuse it for both uploads.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
s3_bucket.Object('data/train.csv').upload_file('iris_data/train.csv')
s3_bucket.Object('data/val.csv').upload_file('iris_data/val.csv')

## define model

In [None]:
# Resolve the SageMaker execution role when running locally (outside SageMaker)
# https://github.com/aws/sagemaker-python-sdk/issues/300
# https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html
def resolve_sm_role():
    """Look up a SageMaker execution role among the account's IAM roles.

    Iterates over every page of the IAM ``list_roles`` result and returns the
    first role whose name starts with ``AmazonSageMaker-ExecutionRole-``.

    Returns:
        str: The ARN of the first matching role.

    Raises:
        Exception: If no role with that name prefix exists.
    """
    client = boto3.client('iam', region_name=boto3.Session().region_name)
    # list_roles is paginated; a single call only returns the first page, so a
    # matching role could be missed in accounts with many roles.
    paginator = client.get_paginator('list_roles')
    for page in paginator.paginate(PathPrefix='/'):
        for role in page['Roles']:
            if role['RoleName'].startswith('AmazonSageMaker-ExecutionRole-'):
                return role['Arn']
    raise Exception('need to create sagemaker execution role from aws console first')

# First ask the SageMaker SDK for the execution role; when that lookup raises
# ValueError, fall back to searching the account's IAM roles instead.
try:
    execution_role_ARN = get_execution_role()
except ValueError:
    execution_role_ARN = resolve_sm_role()

In [None]:
# Look up the container image for the 'xgboost' framework in the region of the
# default boto3 session.
# NOTE(review): 'latest' is passed as the image version; consider pinning an
# explicit version so reruns use the same container — confirm against the
# SageMaker XGBoost documentation.
xgb_image_uri = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=boto3.Session().region_name,
    version='latest',
)

# Estimator that runs the training job on a single ml.m4.xlarge instance and
# writes its output under the session's default bucket.
xgb = Estimator(
    image_uri=xgb_image_uri,
    role=execution_role_ARN,
    sagemaker_session=sess,
    instance_type="ml.m4.xlarge",
    instance_count=1,
    volume_size=5,
    output_path=f"s3://{bucket}/output",
)

# https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost_hyperparameters.html
# Collect the training settings in one mapping so individual values are easy
# to tweak, then hand them to the estimator in a single call.
xgb_hyperparameters = {
    'num_class': 3,
    'num_round': 10,
    'eta': 0.2,
    'gamma': 4,
    'max_depth': 5,
    'min_child_weight': 6,
    'objective': 'multi:softmax',  # needs to be changed for binary classification
    'verbose': 2,
}
xgb.set_hyperparameters(**xgb_hyperparameters)

## train
https://docs.aws.amazon.com/sagemaker/latest/dg/ex1-train-model.html

In [None]:
# Both CSV files were uploaded under the same key prefix of the default bucket.
data_prefix = f's3://{bucket}/data'
train = TrainingInput(s3_data=f'{data_prefix}/train.csv', content_type='csv')
val = TrainingInput(s3_data=f'{data_prefix}/val.csv', content_type='csv')

# Start the training job with one channel per dataset and block until it ends.
xgb.fit(inputs={"train": train, "validation": val}, wait=True)

## deploy
https://docs.aws.amazon.com/sagemaker/latest/dg/ex1-model-deployment.html

In [None]:
# Requests are sent to the endpoint as CSV.
csv_serializer = CSVSerializer()

# Deploy the trained model behind an endpoint backed by one ml.m4.xlarge instance.
xgb_predictor = xgb.deploy(
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
    serializer=csv_serializer,
)

In [None]:
# Show the endpoint name of the predictor returned by the deploy cell.
xgb_predictor.endpoint_name

## test
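`endpoint_name` is the value displayed by `xgb_predictor.endpoint_name` in the deploy section above; use it to reconnect to an already-deployed endpoint without redeploying.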

In [None]:
# Name of a previously deployed endpoint. It is only used when this kernel has
# not deployed an endpoint itself (e.g. after a restart); replace it with the
# name of your own endpoint.
FALLBACK_ENDPOINT_NAME = "xgboost-2024-06-14-05-56-13-766"

try:
    # Prefer the endpoint created in this session so a full top-to-bottom run
    # does not depend on a stale, hard-coded name.
    endpoint_name = xgb_predictor.endpoint_name
except NameError:
    # No predictor exists in this kernel: attach to the recorded endpoint.
    endpoint_name = FALLBACK_ENDPOINT_NAME

# Attach a predictor to the endpoint; requests are serialized as CSV.
xgb_predictor = sagemaker.predictor.Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sagemaker.Session(),
    serializer=sagemaker.serializers.CSVSerializer()
)

In [None]:
# Three hand-written samples with four feature values each (no label column).
test = np.array([[4.6, 3.6, 1.0, 0.2],
                 [6.7, 3.1, 4.4, 1.4],
                 [5.8, 2.7, 5.1, 1.9]
])

# The endpoint returns raw bytes; decode them into text first.
raw_pred = xgb_predictor.predict(test).decode('utf-8')

# Accept both comma- and newline-separated responses and skip empty tokens
# (e.g. from a trailing separator), then convert values such as "1.0" to int.
# NOTE(review): the exact separator depends on the serving container version — confirm.
pred = [
    int(float(token))
    for token in raw_pred.replace('\n', ',').split(',')
    if token.strip()
]
print(pred)