In [1]:
import pandas as pd

dataset = pd.read_csv('housing.csv')

In [12]:
print(dataset.shape)
dataset.iloc[:5,:5]

(506, 13)


Unnamed: 0,crim,zn,indus,chas,nox
0,0.00632,18.0,2.31,0,0.538
1,0.02731,0.0,7.07,0,0.469
2,0.02729,0.0,7.07,0,0.469
3,0.03237,0.0,2.18,0,0.458
4,0.06905,0.0,2.18,0,0.458


In [13]:
# Move 'medv' column to front
dataset = pd.concat([dataset['medv'], dataset.drop(['medv'], axis=1)], axis=1)

In [14]:
from sklearn.model_selection import train_test_split
training_dataset, validation_dataset = train_test_split(dataset, test_size=0.1)

print(training_dataset.shape)
print(validation_dataset.shape)

(455, 13)
(51, 13)


In [15]:
training_dataset.to_csv('training_dataset.csv', index=False, header=False)
validation_dataset.to_csv('validation_dataset.csv', index=False, header=False)

In [18]:
import sagemaker

print(sagemaker.__version__)

sess = sagemaker.Session()
bucket = sess.default_bucket()

prefix = 'boston-housing'
training_data_path = sess.upload_data(path='training_dataset.csv', key_prefix=prefix + '/input/training')
validation_data_path = sess.upload_data(path='validation_dataset.csv', key_prefix=prefix + '/input/validation')

print(training_data_path)
print(validation_data_path)

2.72.3
s3://sagemaker-ap-south-1-362794302980/boston-housing/input/training/training_dataset.csv
s3://sagemaker-ap-south-1-362794302980/boston-housing/input/validation/validation_dataset.csv


In [38]:
import boto3
from sagemaker import image_uris

region = boto3.Session().region_name    
container = image_uris.retrieve('xgboost', region, version='latest')

from sagemaker import get_execution_role
role = "arn:aws:iam::362794302980:role/sagemakerRole"
    
xgb_estimator = Estimator(container,
    role=role, 
    instance_count=1,
    instance_type='ml.m5.large',
    output_path='s3://{}/{}/output'.format(bucket, prefix),
    train_use_spot_instances = True,
    train_max_run = 100,
    train_max_wait = 600
)

xgb_estimator.set_hyperparameters(objective='reg:linear',
                                 num_round=200,
                                 early_stopping_rounds=10)

train_max_run has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_use_spot_instances has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_max_wait has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [39]:
training_data_channel   = sagemaker.TrainingInput(s3_data=training_data_path, content_type='text/csv')
validation_data_channel = sagemaker.TrainingInput(s3_data=validation_data_path, content_type='text/csv')
xgb_data = {'train': training_data_channel, 'validation': validation_data_channel}

In [40]:
xgb_estimator.fit(xgb_data)

2022-01-11 09:45:28 Starting - Starting the training job...
2022-01-11 09:45:54 Starting - Launching requested ML instancesProfilerReport-1641894327: InProgress
...
2022-01-11 09:46:23 Starting - Preparing the instances for training............
2022-01-11 09:48:35 Downloading - Downloading input data
2022-01-11 09:48:35 Training - Downloading the training image..[34mArguments: train[0m
[34m[2022-01-11:09:48:48:INFO] Running standalone xgboost training.[0m
[34m[2022-01-11:09:48:48:INFO] File size need to be processed in the node: 0.03mb. Available memory size in the node: 132.64mb[0m
[34m[2022-01-11:09:48:48:INFO] Determined delimiter of CSV input is ','[0m
[34m[09:48:48] S3DistributionType set as FullyReplicated[0m
[34m[09:48:48] 455x12 matrix with 5460 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2022-01-11:09:48:48:INFO] Determined delimiter of CSV input is ','[0m
[34m[09:48:48] S3DistributionType set as FullyReplicated[0m


2022-01-11 09:49:00 Uploading - Uploading generated training model
2022-01-11 09:49:00 Completed - Training job completed
Training seconds: 39
Billable seconds: 12
Managed Spot Training savings: 69.2%


In [41]:
from time import strftime, gmtime
timestamp = strftime('%d-%H-%M-%S', gmtime())

endpoint_name = 'xgb-demo-'+timestamp
print(endpoint_name)

xgb-demo-11-09-55-03


In [42]:
xgb_predictor = xgb_estimator.deploy(endpoint_name=endpoint_name, 
                        initial_instance_count=1, 
                        instance_type='ml.t2.medium')

--------!

In [43]:
test_sample = '0.00632,18.00,2.310,0,0.5380,6.5750,65.20,4.0900,1,296.0,15.30,4.98'

In [44]:
xgb_predictor.serializer = sagemaker.serializers.CSVSerializer()
xgb_predictor.deserializer = sagemaker.deserializers.CSVDeserializer()

response = xgb_predictor.predict(test_sample)
print(response)

[['24.138395309448242']]


In [45]:
test_samples = ['0.00632,18.00,2.310,0,0.5380,6.5750,65.20,4.0900,1,296.0,15.30,4.98',
                '0.02731,0.00,7.070,0,0.4690,6.4210,78.90,4.9671,2,242.0,17.80,9.14']

response = xgb_predictor.predict(test_samples)
print(response)

[['24.138395309448242', '21.528553009033203']]


In [46]:
runtime = boto3.Session().client(service_name='runtime.sagemaker') 

response = runtime.invoke_endpoint(EndpointName=endpoint_name, 
                                  ContentType='text/csv', 
                                  Body=test_sample)

print(response['Body'].read())

b'24.138395309448242'


In [47]:
xgb_predictor.delete_endpoint()