In [None]:
# Optional one-off setup: upgrade the `stepfunctions` SDK using the kernel's
# own interpreter. The command is wrapped in a triple-quoted string literal,
# so as written the cell only evaluates a string.
# NOTE(review): presumably disabled on purpose -- remove the quotes to run it.
'''
import sys
!{sys.executable} -m pip install --upgrade stepfunctions
'''

In [None]:
# import libraries
import boto3
import logging
import sagemaker
import stepfunctions

from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.estimator import Estimator
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.transformer import Transformer

from stepfunctions.inputs import ExecutionInput
from stepfunctions import steps
from stepfunctions.steps import Parallel
from stepfunctions.steps.sagemaker import TrainingStep, ModelStep, TransformStep
from stepfunctions.workflow import Workflow

In [None]:
# Notebook-wide configuration: IAM role, logging, session info, buckets and
# the fixed names used by the cells below.

# define boto3 clients
#s3_client = boto3.client('s3')

# define catch-all execution role (passed to the estimators and the workflow)
hbomax_datascience_service_role = 'arn:aws:iam::613630599026:role/hbomax-datascience-service-role'

# set logging
stepfunctions.set_stream_logger(level=logging.INFO)

# collect session info
region = boto3.Session().region_name
account_id = boto3.client('sts').get_caller_identity().get('Account')
# Backward-compatible alias for the original (misspelled) variable name.
acount_id = account_id

# define s3 buckets
resources_bucket = 'hbomax-datascience-deployment-dev'
resources_bucket2 = 'datascience-hbo-users'

# sagemaker session
sagemaker_session = sagemaker.Session(default_bucket=resources_bucket)

# featurizer model name
featurizer_model_name = 'FTFeaturizer'

# name the inference pipeline model
pipeline_model_name = 'FTPipeLineModel'

# name the Stepfunctions pipeline
pipeline_name = 'FTInferenceRoutine'

# built-in XGBoost image
xgboost_image = get_image_uri(region, 'xgboost', repo_version='latest')

In [None]:
# S3 locations for the free-trial propensity model, all under `resources_bucket`.

# output prefix
s3_out_data = f's3://{resources_bucket}/lifecycle/free-trial-propensity-model/output'

# input prefixes: raw train / raw test / new data
s3_train_data = f's3://{resources_bucket}/lifecycle/free-trial-propensity-model/train/raw'
s3_test_data = f's3://{resources_bucket}/lifecycle/free-trial-propensity-model/test/raw'
s3_new_data = f's3://{resources_bucket}/lifecycle/free-trial-propensity-model/new'

In [None]:
# Runtime inputs supplied with each execution. SageMaker expects unique names
# for each job, model and endpoint, so every name below is a string placeholder.
execution_input = ExecutionInput(
    schema=dict.fromkeys(
        (
            'SKLearnFeaturizerJobName',
            'TransformTrainJobName',
            'TransformTestJobName',
            'FeaturizerModelName',
            'XGBModelName',
        ),
        str,
    )
)

In [None]:
# SKLearn estimator whose entry point is the featurizer.py preprocessing script.
sklearn_featurizer = SKLearn(
    sagemaker_session=sagemaker_session,
    # alternative role source: sagemaker.get_execution_role()
    role=hbomax_datascience_service_role,
    entry_point='featurizer.py',
    train_instance_type="ml.c4.xlarge",
    output_kms_key='alias/aws/s3',
)

In [None]:
# Training step that fits the SKLearn featurizer on the raw training data.
# The job name comes from the execution input so each run can use a fresh one.
fit_featurizer_step = TrainingStep(
    'Fit Featurizer',
    job_name=execution_input['SKLearnFeaturizerJobName'],
    estimator=sklearn_featurizer,
    data={'train': s3_train_data},
    # tags={'task': 'tune', 'algorithm': 'linlearner'},
)

In [None]:
# Model step that registers the model expected from the featurizer training
# step, under the model name supplied in the execution input.
_expected_featurizer_model = fit_featurizer_step.get_expected_model()

create_featurizer_model_step = ModelStep(
    'Create Featurizer Model',
    model_name=execution_input['FeaturizerModelName'],
    model=_expected_featurizer_model,
)

In [None]:
# Batch-transform configuration for featurizing the raw training data.
# The Transformer is built with the static `featurizer_model_name`; the model
# actually used at run time is the one given to the TransformStep below.
train_transformer = Transformer(
    model_name=featurizer_model_name,
    instance_count=1,
    instance_type='ml.m4.2xlarge',
    strategy='SingleRecord',
    assemble_with='Line',
    #output_kms_key='alias/aws/s3',
    output_path=f's3://{resources_bucket2}/lifecycle/free-trial-propensity-model/train/transformed'
)

# Transform step for the training data.
# The model name is taken from the execution input so that it always matches
# the model registered by `create_featurizer_model_step`; a hard-coded name
# would point at a different (missing or stale) model whenever a unique model
# name is supplied for the run.
transform_train_step = TransformStep(
    'Transform Training Data',
    transformer=train_transformer,
    job_name=execution_input['TransformTrainJobName'],
    model_name=execution_input['FeaturizerModelName'],
    data=s3_train_data,
    content_type='text/csv',
    split_type='Line',
    wait_for_completion=True
)

In [None]:
# Batch-transform configuration for featurizing the raw test data.
# The Transformer is built with the static `featurizer_model_name`; the model
# actually used at run time is the one given to the TransformStep below.
test_transformer = Transformer(
    model_name=featurizer_model_name,
    instance_count=1,
    instance_type='ml.m4.2xlarge',
    strategy='SingleRecord',
    assemble_with='Line',
    #output_kms_key='alias/aws/s3',
    output_path=f's3://{resources_bucket2}/lifecycle/free-trial-propensity-model/test/transformed'
)

# Transform step for the test data.
# The model name is taken from the execution input so that it always matches
# the model registered by `create_featurizer_model_step`; a hard-coded name
# would point at a different (missing or stale) model whenever a unique model
# name is supplied for the run.
transform_test_step = TransformStep(
    'Transform Test Data',
    transformer=test_transformer,
    job_name=execution_input['TransformTestJobName'],
    model_name=execution_input['FeaturizerModelName'],
    data=s3_test_data,
    content_type='text/csv',
    split_type='Line',
    wait_for_completion=True
)

In [None]:
# Parallel state holding one branch per featurizer transform.
parallel_transform_step = Parallel(state_id="Branch Transformations")

# Branches are registered test first, then train.
for _transform_branch in (transform_test_step, transform_train_step):
    parallel_transform_step.add_branch(_transform_branch)

In [None]:
# XGBoost estimator on the built-in image; artifacts are written to
# `s3_out_data` and the hyperparameters are fixed values.
xgboost_estimator = Estimator(
    image_name=xgboost_image,
    role=hbomax_datascience_service_role,
    train_instance_count=1,
    train_instance_type='ml.m4.4xlarge',
    output_path=s3_out_data,
    output_kms_key='alias/aws/s3',
    hyperparameters={
        'eval_metric': 'auc',
        'alpha': 1.218487609,
        'eta': 0.225242353,
        'max_depth': 10,
        'min_child_weight': 2.284773815,
        'num_round': 100,
        'objective': 'binary:logistic',
        'rate_drop': 0.3,
        'tweedie_variance_power': 1.4,
    },
    sagemaker_session=sagemaker_session,
)

In [None]:
# Assemble the state machine: fit the featurizer, register its model, then
# run the train/test transforms in the parallel state.
workflow_definition = steps.Chain([
    fit_featurizer_step,
    create_featurizer_model_step,
    parallel_transform_step,
])
# Transform-only variant, kept disabled (a leading `$` instead of `#` is a
# Python syntax error and would stop this cell from running).
# workflow_definition = steps.Chain([parallel_transform_step])

workflow = Workflow(
    name=pipeline_name,
    definition=workflow_definition,
    role=hbomax_datascience_service_role,
    execution_input=execution_input
)

# create() registers the state machine; update() then pushes the current
# definition.
# NOTE(review): presumably update() is here so that re-running the cell
# against an already-existing state machine refreshes it -- confirm.
workflow.create()
workflow.update(workflow_definition)