In [None]:
import json
import os

import boto3
import pandas as pd
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri

In [None]:
# IAM execution role of this notebook instance; it is handed to every
# estimator and model created further down.
role = get_execution_role()

# Single SageMaker session reused by the estimators and the predictor.
sagemaker_session = sagemaker.Session()

## S3 bucket and prefix

In [None]:
# Bucket and key prefix that the S3 locations in this notebook are built from.
bucket, prefix = 'krypton-data', 'ml-data'

In [None]:
# Raw CSV inputs. The URIs are derived from `bucket` and `prefix` so that they
# stay in sync with the other S3 locations built from the same two values.
s3_train_dir = f"s3://{bucket}/{prefix}/train/train.csv"
# NOTE(review): the test file lives under the `train/` key - confirm this is intended.
s3_test_dir = f"s3://{bucket}/{prefix}/train/test.csv"

## Set up SageMaker to use a custom transformer

In [None]:

from sagemaker.sklearn.estimator import SKLearn

# Entry-point script executed by the scikit-learn container.
script_path = 'preprocessor.py'

# Scikit-learn estimator for the preprocessing step; 'transformers.py' is
# shipped alongside the entry point as an extra dependency.
sklearn_preprocessor = SKLearn(
    entry_point=script_path,
    dependencies=['transformers.py'],
    role=role,
    sagemaker_session=sagemaker_session,
    train_instance_type='ml.c4.xlarge',
)


In [None]:
# Fit the preprocessor; the 'train' channel points at the raw training CSV.
sklearn_preprocessor.fit(inputs={'train': s3_train_dir})

In [None]:
# Batch-transform handle for the fitted preprocessor: output is requested as
# CSV and assembled line by line.
transformer = sklearn_preprocessor.transformer(
    instance_type='ml.m4.xlarge',
    instance_count=1,
    accept='text/csv',
    assemble_with='Line',
)

In [None]:
# Run the fitted preprocessor over the raw training CSV as a batch transform job.
transformer.transform(s3_train_dir, content_type='text/csv')
print(f'Waiting for transform job: {transformer.latest_transform_job.job_name}')
# Block until the job has finished before reading its output location.
transformer.wait()
preprocessed_train = transformer.output_path

SageMaker models assume that the first column is the target variable. Ideally, this column ordering should be handled during the ETL process.

In [None]:
# Object holding the transformed train.csv inside the transform output location.
file = f"{preprocessed_train}/train.csv.out"

In [None]:
# Load the preprocessed features; header=None means columns get integer labels.
data = pd.read_csv(filepath_or_buffer=file, header=None)

In [None]:
# Label column from the raw training file, kept as a one-column DataFrame.
target = pd.read_csv(s3_train_dir).loc[:, ['SalePrice']]

In [None]:
# Put the label first, followed by the preprocessed features (joined column-wise).
train_data = pd.concat(objs=[target, data], axis='columns')

In [None]:
# Preview the combined frame; the first column should be SalePrice.
train_data.head()

In [None]:
# Low-level S3 client (the same client is created again right before the upload).
s3 = boto3.client(service_name='s3')

In [None]:
# Write the training set locally without header or index, so the label is the
# first value on every line. `header=False` is the documented way to omit the
# header row (the previous `header=None` only worked because None is falsy).
train_data.to_csv('train_data.csv', header=False, index=False)

In [None]:
# Read the local file back (no header row) to eyeball what will be uploaded.
check_df = pd.read_csv('train_data.csv', header=None)
check_df.head()

In [None]:
# S3 client used for the upload in the next cell.
s3 = boto3.client(service_name='s3')

In [None]:
# Upload the combined training file. Bucket and key are derived from `bucket`
# and `prefix` so the destination always matches the S3 URI built from the
# same two values for training.
s3.upload_file(
    Filename='train_data.csv',
    Bucket=bucket,
    Key=f'{prefix}/train/train_data.csv',
)

In [None]:
# S3 URI of the combined (label-first) training file uploaded in the previous cell.
s3_train_data_dir = f"s3://{bucket}/{prefix}/train/train_data.csv"

In [None]:
# Show the resolved training-data URI.
s3_train_data_dir

# Set up for training

In [None]:
# Look up the XGBoost container image (version 0.90-1) for the session's region.
xgb_image = get_image_uri(
    region_name=boto3.Session().region_name,
    repo_name='xgboost',
    repo_version='0.90-1',
)


In [None]:
# Key segment for XGBoost training output, nested under the shared prefix.
s3_xgb_output_key_prefix = "xgb_training_output"
# Full S3 URI passed to the estimator as its output path.
s3_xgb_output_location = f"s3://{bucket}/{prefix}/{s3_xgb_output_key_prefix}/xgboost_model"

In [None]:

# Generic estimator wrapping the XGBoost container: one ml.m4.xlarge instance,
# File input mode, artifacts written to the XGBoost output location.
xgb_estimator = sagemaker.estimator.Estimator(
    image_name=xgb_image,
    role=role,
    train_instance_type='ml.m4.xlarge',
    train_instance_count=1,
    input_mode='File',
    output_path=s3_xgb_output_location,
    sagemaker_session=sagemaker_session,
)

In [None]:
# Regression hyperparameters for XGBoost. 'reg:squarederror' is the current
# name of the squared-error objective; the previous 'reg:linear' value is a
# deprecated alias for the same loss.
xgb_estimator.set_hyperparameters(
    objective="reg:squarederror",
    seed=42,
    num_round=100,
    gamma=0.01,
    eta=0.1,
    max_depth=5,
    alpha=5,
    subsample=0.85,
    colsample_bytree=0.95,
    min_child_weight=3,
)

In [None]:
# Show the hyperparameters currently set on the estimator.
xgb_estimator.hyperparameters()

In [None]:
# Describe the training channel: the uploaded label-first CSV, declared as text/csv.
xgb_train_data = sagemaker.session.s3_input(
    s3_data=s3_train_data_dir,
    content_type='text/csv',
)

In [None]:
# Train XGBoost on the single 'train' channel and stream the job logs.
data_channels = dict(train=xgb_train_data)
xgb_estimator.fit(data_channels, logs=True)

# Build pipeline

In [None]:
from sagemaker.model import Model
from sagemaker.pipeline import PipelineModel
from time import gmtime, strftime

# UTC timestamp used as a suffix for both the model name and the endpoint name.
timestamp_prefix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model_name = 'inference-pipeline-' + timestamp_prefix
endpoint_name = 'inference-pipeline-ep-' + timestamp_prefix

# Turn each trained estimator into a deployable model.
scikit_learn_inference_model = sklearn_preprocessor.create_model()
xgb_learner_model = xgb_estimator.create_model()

# Chain the two models: preprocessing first, then the XGBoost learner.
sm_model = PipelineModel(
    models=[scikit_learn_inference_model, xgb_learner_model],
    role=role,
    name=model_name,
)

# Deploy the pipeline behind a single real-time endpoint.
sm_model.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
    endpoint_name=endpoint_name,
)

In [69]:
# Show the generated endpoint name; later cells use it to invoke and delete the endpoint.
endpoint_name

'inference-pipeline-ep-2020-05-15-21-52-39'

# Deploy Model

In [None]:
from sagemaker.predictor import csv_serializer, json_deserializer, RealTimePredictor
from sagemaker.content_types import CONTENT_TYPE_CSV, CONTENT_TYPE_JSON

In [None]:
# Client for the deployed endpoint: requests are sent as CSV, responses are
# requested as JSON. No serializer is attached, so payloads must already be
# encoded by the caller.
predictor = RealTimePredictor(
    endpoint=endpoint_name,
    content_type=CONTENT_TYPE_CSV,
    accept=CONTENT_TYPE_JSON,
    serializer=None,
    sagemaker_session=sagemaker_session,
)

In [71]:
# NOTE(review): this cell builds the same predictor as the cell before it.
# CSV in, JSON out, no serializer attached.
predictor = RealTimePredictor(
    endpoint=endpoint_name,
    content_type=CONTENT_TYPE_CSV,
    accept=CONTENT_TYPE_JSON,
    serializer=None,
    sagemaker_session=sagemaker_session,
)

In [None]:
# One raw record to send to the endpoint.
# NOTE(review): `single_data` was not defined anywhere in this notebook, so the
# cell failed on a fresh kernel. Taking the first row of the test CSV is an
# assumption - confirm which sample was originally intended.
single_data = pd.read_csv(s3_test_dir).head(1)
single_data

In [None]:
# The predictor has no serializer, so build the request body by hand:
# CSV text (index dropped) encoded as UTF-8 bytes.
single = bytes(single_data.to_csv(index=False), 'utf-8')

In [72]:
# Invoke the endpoint with the CSV payload and display the response as returned.
predictor.predict(single)

b'{"predictions": [{"score": 100495.609375}]}'

# Delete endpoint

In [81]:
# Tear down the endpoint through the low-level SageMaker client of the
# session's boto session; the API response is displayed.
sm_client = sagemaker_session.boto_session.client(service_name='sagemaker')
sm_client.delete_endpoint(EndpointName=endpoint_name)

{'ResponseMetadata': {'RequestId': 'aa283897-4520-45d4-bf58-c77422cf6119',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'aa283897-4520-45d4-bf58-c77422cf6119',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Sat, 16 May 2020 03:33:24 GMT'},
  'RetryAttempts': 0}}