In [1]:
import pandas as pd
dataset = pd.read_csv('Boston.csv')

In [2]:
print(dataset.shape)

(506, 15)


In [3]:
dataset[:5]

Unnamed: 0.1,Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,1,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,2,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,3,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,4,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,5,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [4]:
dataset = pd.concat([dataset['medv'],
                     dataset.drop(['medv'], axis=1)],
                    axis=1)

In [5]:
from sklearn.model_selection import train_test_split
training_dataset, validation_dataset = train_test_split(dataset, test_size=0.1)

In [6]:
training_dataset.to_csv('training_dataset.csv',index=False, header=False)
validation_dataset.to_csv('validation_dataset.csv',index=False, header=False)

In [7]:
import sagemaker
sess = sagemaker.Session()
bucket = sess.default_bucket()

In [8]:
prefix = 'boston-housing'
training_data_path = sess.upload_data(
    path='training_dataset.csv',
    key_prefix=prefix + '/input/training')
validation_data_path = sess.upload_data(
    path='validation_dataset.csv',
    key_prefix=prefix + '/input/validation')
print(training_data_path)
print(validation_data_path)

s3://sagemaker-us-east-2-664224523979/boston-housing/input/training/training_dataset.csv
s3://sagemaker-us-east-2-664224523979/boston-housing/input/validation/validation_dataset.csv


In [9]:
import boto3
from sagemaker import image_uris

In [10]:
region = boto3.Session().region_name
container = image_uris.retrieve('xgboost', region, version='latest')

In [12]:
from sagemaker.estimator import Estimator

xgb_estimator = Estimator(
    container,
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path='s3://{}/{}/output'.format(bucket,
                                           prefix))

In [13]:
xgb_estimator.set_hyperparameters(
    objective='reg:linear',
    num_round=200,
    early_stopping_rounds=10)

In [14]:
training_data_channel = sagemaker.TrainingInput(
    s3_data=training_data_path,
    content_type='text/csv')
validation_data_channel = sagemaker.TrainingInput(
    s3_data=validation_data_path,
    content_type='text/csv')

In [15]:
xgb_estimator.fit({'train': training_data_channel,'validation': validation_data_channel})

2021-04-04 12:36:42 Starting - Starting the training job...
2021-04-04 12:36:44 Starting - Launching requested ML instancesProfilerReport-1617539802: InProgress
......
2021-04-04 12:37:58 Starting - Preparing the instances for training...
2021-04-04 12:38:40 Downloading - Downloading input data...
2021-04-04 12:39:01 Training - Downloading the training image..[34mArguments: train[0m
[34m[2021-04-04:12:39:17:INFO] Running standalone xgboost training.[0m
[34m[2021-04-04:12:39:17:INFO] File size need to be processed in the node: 0.04mb. Available memory size in the node: 7928.28mb[0m
[34m[2021-04-04:12:39:17:INFO] Determined delimiter of CSV input is ','[0m
[34m[12:39:17] S3DistributionType set as FullyReplicated[0m
[34m[12:39:17] 455x14 matrix with 6370 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2021-04-04:12:39:17:INFO] Determined delimiter of CSV input is ','[0m
[34m[12:39:17] S3DistributionType set as FullyReplicated[0m


In [16]:
from time import strftime, gmtime
timestamp = strftime('%d-%H-%M-%S', gmtime())
endpoint_name = 'xgb-demo'+'-'+timestamp
xgb_predictor = xgb_estimator.deploy(
    endpoint_name=endpoint_name,
    initial_instance_count=1,
    instance_type='ml.t2.medium')

---------------!

In [19]:
test_sample = '0.00632, 18, 2.31, 0, 0.538, 6.575, 65.2, 4.09, 1, 296, 15.3, 396.9, 4.98, 24'
#xgb_predictor.content_type = 'text/csv'
xgb_predictor.serializer =sagemaker.serializers.CSVSerializer()
xgb_predictor.deserializer =sagemaker.deserializers.CSVDeserializer()
response = xgb_predictor.predict(test_sample)
print(response)

[['24.393402099609375']]


In [20]:
xgb_predictor.delete_endpoint()