In [1]:
from pathlib import Path

import pandas as pd
from sklearn.datasets import load_iris

In [2]:
# Load the Iris dataset a single time and pull both the feature frame and
# the label series out of the same result, instead of loading it twice.
iris_bunch = load_iris(as_frame=True)
iris = iris_bunch["data"]
target = iris_bunch["target"]
# Column-wise concat with the label first, followed by the feature columns.
data = pd.concat([target, iris], axis=1)

In [3]:
# Preview the first rows: `target` should be the first column, then the features.
data.head()

Unnamed: 0,target,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,0,5.1,3.5,1.4,0.2
1,0,4.9,3.0,1.4,0.2
2,0,4.7,3.2,1.3,0.2
3,0,4.6,3.1,1.5,0.2
4,0,5.0,3.6,1.4,0.2


In [4]:
# Feature-only CSV (no label column), written without index or header row.
filepath_predictors = "../data/predictors.csv"
# Make sure the output directory exists so a fresh checkout without ../data
# does not fail on the write below.
Path(filepath_predictors).parent.mkdir(parents=True, exist_ok=True)
iris.to_csv(filepath_predictors, index=False, header=False)

In [5]:
# Training CSV: label in the first column followed by the features, written
# without index or header row.
# NOTE(review): presumably this is the layout the built-in XGBoost container
# expects for text/csv input; confirm against the algorithm docs.
filepath_data = "../data/data.csv"
# Make sure the output directory exists so a fresh checkout without ../data
# does not fail on the write below.
Path(filepath_data).parent.mkdir(parents=True, exist_ok=True)
data.to_csv(filepath_data, index=False, header=False)

In [6]:
import sagemaker
from sagemaker.image_uris import retrieve
from sagemaker.inputs import TrainingInput
from sagemaker.estimator import Estimator

In [35]:
# SageMaker session object; later cells use it for the region, the default
# bucket, the data upload, and the estimator.
session = sagemaker.Session()

In [36]:
# Region the session is bound to; used below to look up the container image.
region_name = session.boto_region_name
region_name

'us-east-2'

In [37]:
# Look up the XGBoost container image URI for the session's region.
# NOTE(review): version='latest' resolves to the ":latest" image tag (see the
# displayed URI); consider pinning an explicit version for reproducibility.
# Confirm the supported version strings in the SageMaker docs.
container_image = retrieve(framework='xgboost', region=region_name, version='latest')
container_image

'825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest'

In [38]:
# Session's default S3 bucket; used for both the uploaded training data and
# the estimator's output path.
bucket = session.default_bucket()
bucket

'sagemaker-us-east-2-735421146733'

In [39]:
# Push the local training CSV to the bucket under the 'training' key prefix
# and keep the returned S3 location for the training input.
uploaded_data = session.upload_data(
    bucket=bucket,
    key_prefix='training',
    path=filepath_data,
)
uploaded_data

's3://sagemaker-us-east-2-735421146733/training/data.csv'

In [40]:
# Describe the uploaded S3 object as a CSV training input for the estimator.
input_data = TrainingInput(s3_data=uploaded_data, content_type='text/csv')

In [41]:
# IAM role passed to the estimator below.
# NOTE(review): the ARN (including the account ID) is hardcoded; consider
# reading it from configuration or an environment variable so the notebook
# is portable across accounts.
role = "arn:aws:iam::735421146733:role/AmazonSageMaker-ExecutionRole"

In [42]:
# Collect the estimator configuration in one place, then build the estimator.
# Model artifacts are written under the "output" prefix of the session bucket.
estimator_settings = {
    "image_uri": container_image,
    "role": role,
    "instance_type": 'ml.m5.large',
    "instance_count": 1,
    "output_path": f"s3://{bucket}/output",
    "sagemaker_session": session,
}
xgboost = Estimator(**estimator_settings)

In [43]:
# The label column holds the Iris class ids (three classes), so request a
# multi-class objective explicitly instead of relying on the container's
# default objective, which is not a multi-class one per the XGBoost docs.
# num_class must match the number of distinct labels.
xgboost.set_hyperparameters(num_round=5,
                            max_depth=5,
                            objective='multi:softmax',
                            num_class=3)

In [44]:
# Start the training job with the uploaded CSV as the 'train' channel.
# No 'validation' channel is supplied; the job log reports that the
# validation path does not exist.
xgboost.fit({'train': input_data})

2021-07-10 07:38:27 Starting - Starting the training job...
2021-07-10 07:38:49 Starting - Launching requested ML instancesProfilerReport-1625902699: InProgress
...
2021-07-10 07:39:32 Starting - Preparing the instances for training.........
2021-07-10 07:41:10 Downloading - Downloading input data...
2021-07-10 07:41:36 Training - Training image download completed. Training in progress.
2021-07-10 07:41:36 Uploading - Uploading generated training model[34mArguments: train[0m
[34m[2021-07-10:07:41:31:INFO] Running standalone xgboost training.[0m
[34m[2021-07-10:07:41:31:INFO] Path /opt/ml/input/data/validation does not exist![0m
[34m[2021-07-10:07:41:31:INFO] File size need to be processed in the node: 0.0mb. Available memory size in the node: 161.93mb[0m
[34m[2021-07-10:07:41:31:INFO] Determined delimiter of CSV input is ','[0m
[34m[07:41:31] S3DistributionType set as FullyReplicated[0m
[34m[07:41:31] 150x4 matrix with 600 entries loaded from /opt/ml/input/data/train?forma

In [45]:
# Name of the training job that the fit call above launched.
xgboost.latest_training_job.job_name

'xgboost-2021-07-10-07-38-19-937'

In [46]:
# S3 location of the trained model artifact (model.tar.gz) under the output path.
xgboost.model_data

's3://sagemaker-us-east-2-735421146733/output/xgboost-2021-07-10-07-38-19-937/output/model.tar.gz'