In [None]:
# Optional one-time setup: upgrades the `stepfunctions` package in the kernel's
# environment. It is disabled by being wrapped in a string literal; remove the
# surrounding triple quotes to run it.
'''
import sys
!{sys.executable} -m pip install --upgrade stepfunctions
'''

In [None]:
# import libraries
import boto3
from botocore.client import Config
import logging
import sagemaker
import stepfunctions

from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.estimator import Estimator
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.s3 import S3Uploader
from sagemaker.transformer import Transformer

from stepfunctions.inputs import ExecutionInput
from stepfunctions import steps
from stepfunctions.steps import Parallel
from stepfunctions.steps.sagemaker import TrainingStep, ModelStep, TransformStep
from stepfunctions.steps.compute import LambdaStep
from stepfunctions.workflow import Workflow

import os
import zipfile

In [None]:
# define boto3 clients
# S3 client configured for Signature Version 4 request signing.
# NOTE(review): not referenced by any live code in the visible cells.
s3_client = boto3.client('s3', config=Config(signature_version='s3v4'))

# define catch-all execution role
# This single role ARN is passed to the Lambda function, the SageMaker
# estimators and the Step Functions workflow defined in this notebook.
# The ARN embeds a fixed AWS account id.
hbomax_datascience_service_role = 'arn:aws:iam::613630599026:role/hbomax-datascience-service-role'

# set logging
# Emit the Step Functions SDK's log messages at INFO level.
stepfunctions.set_stream_logger(level=logging.INFO)

# collect session info
region = boto3.Session().region_name
# NOTE(review): `acount_id` is presumably a misspelling of `account_id`; it is
# not referenced in the visible cells -- confirm no other cell uses it before renaming.
acount_id = boto3.client('sts').get_caller_identity().get('Account')

# define s3 bucket
# Bucket holding the training/test/new data and model output used by the pipeline.
resources_bucket = 'hbomax-datascience-deployment-dev'

# sagemaker session
# The resources bucket is also the session's default bucket.
sagemaker_session = sagemaker.Session(default_bucket=resources_bucket)

# name the Stepfunctions pipeline
pipeline_name = 'FTInferenceRoutine'

# built-in XGBoost image
# NOTE(review): `repo_version='latest'` does not pin a specific XGBoost
# release -- confirm which container version this resolves to.
xgboost_image = get_image_uri(region, 'xgboost', repo_version='latest')

In [None]:
# S3 data locations used by the pipeline. Every path lives under one shared
# model prefix inside the resources bucket.
model_s3_prefix = f's3://{resources_bucket}/lifecycle/free-trial-propensity-model'

# raw CSV inputs (before featurization)
s3_train_raw = sagemaker.s3_input(
    s3_data=f'{model_s3_prefix}/train/raw',
    content_type='text/csv',
)
s3_test_raw = sagemaker.s3_input(
    s3_data=f'{model_s3_prefix}/test/raw',
    content_type='text/csv',
)

# featurized CSV data (written by the batch transform steps)
s3_train_transformed = sagemaker.s3_input(
    s3_data=f'{model_s3_prefix}/train/transformed',
    content_type='text/csv',
)
s3_test_transformed = sagemaker.s3_input(
    s3_data=f'{model_s3_prefix}/test/transformed',
    content_type='text/csv',
)

# new data location, and the plain S3 URI used as the XGBoost estimator's output path
s3_new_data = sagemaker.s3_input(
    s3_data=f'{model_s3_prefix}/new',
    content_type='text/csv',
)
s3_out_data = f'{model_s3_prefix}/output'

In [None]:
def writeLambda(function_name, role, description,
                bucket='datascience-hbo-users', key_prefix='lambda_code'):
    """Package a local Lambda source file and (re)create the Lambda function.

    The file ``lambda_code/<function_name>.py`` is zipped, uploaded to
    ``s3://<bucket>/<key_prefix>/<function_name>.zip`` and used as the code
    for a Lambda function named ``function_name``. An existing function with
    that name is deleted first so the call can be repeated.

    Args:
        function_name: Name of the Lambda function; also the base name of the
            source file and the module part of the handler
            (``<function_name>.lambda_handler``).
        role: ARN of the IAM role assigned to the Lambda function.
        description: Description stored on the Lambda function.
        bucket: S3 bucket receiving the zipped code. Defaults to the bucket
            that was previously hard-coded.
        key_prefix: S3 key prefix under which the archive is uploaded.

    Returns:
        The response dictionary returned by ``create_function``.

    Raises:
        FileNotFoundError: if the local source file does not exist.
        Any error raised by the S3 upload or the Lambda API calls propagates
        unchanged; the temporary zip archive is removed in every case.
    """
    zip_name = f'{function_name}.zip'
    lambda_source_code = f'lambda_code/{function_name}.py'

    try:
        # The context manager closes the archive even if writing fails.
        with zipfile.ZipFile(zip_name, mode='w') as zf:
            zf.write(lambda_source_code, arcname=os.path.basename(lambda_source_code))

        S3Uploader.upload(local_path=zip_name,
                          desired_s3_uri=f's3://{bucket}/{key_prefix}',
                          session=sagemaker_session)

        lambda_client = boto3.client('lambda')

        # Delete the existing function if there is one. Deleting by name and
        # tolerating "not found" avoids scanning list_functions(), which only
        # returns one page of results per call and can miss the function.
        try:
            lambda_client.delete_function(FunctionName=function_name)
        except lambda_client.exceptions.ResourceNotFoundException:
            pass

        return lambda_client.create_function(
            FunctionName=function_name,
            Runtime='python3.7',
            Role=role,
            Handler=f'{function_name}.lambda_handler',
            Code={
                'S3Bucket': bucket,
                'S3Key': f'{key_prefix}/{zip_name}'
            },
            Description=description,
            Timeout=15,
            MemorySize=128
        )
    finally:
        # Always clean up the local archive, including on failure.
        if os.path.exists(zip_name):
            os.remove(zip_name)

In [None]:
# Lambda functions to package and deploy; each entry gives the function name
# and the description stored on it.
lambda_specs = [
    {'name': 'free_trial_create_pipeline_model', 'description': 'Create an Inference Pipeline Model'},
]

for spec in lambda_specs:
    writeLambda(
        function_name=spec['name'],
        role=hbomax_datascience_service_role,
        description=spec['description'],
    )

In [None]:
# Runtime inputs supplied per execution. SageMaker expects unique names for
# each job, model and endpoint, so they are placeholders rather than constants.
# Every field is declared as a string.
execution_input = ExecutionInput(schema=dict.fromkeys(
    (
        'SKLearnFeaturizerJobName',
        'TransformTrainJobName',
        'TransformTestJobName',
        'FeaturizerModelName',
        'XGBModelName',
        'TrainXGBoostJobName',
        'PipelineModelName',
    ),
    str,
))

In [None]:
# SKLearn estimator for the preprocessing (featurizer) stage; its training
# entry point is the local script `featurizer.py`.
sklearn_featurizer = SKLearn(
    sagemaker_session=sagemaker_session,
    role=hbomax_datascience_service_role,
    entry_point='featurizer.py',
    train_instance_type="ml.c4.xlarge",
    output_kms_key='alias/aws/s3',
)

In [None]:
# Workflow step that runs the featurizer training job on the raw training
# data. The job name is supplied at execution time.
fit_featurizer_step = TrainingStep(
    'Fit Featurizer',
    job_name=execution_input['SKLearnFeaturizerJobName'],
    estimator=sklearn_featurizer,
    data={'train': s3_train_raw.config['DataSource']['S3DataSource']['S3Uri']},
    tags={'model': 'free_trial_sklearn_featurizer'},
)

In [None]:
# Workflow step that registers the model expected from the featurizer
# training step, under a name supplied at execution time.
create_featurizer_model_step = ModelStep(
    'Create Featurizer Model',
    model_name=execution_input['FeaturizerModelName'],
    model=fit_featurizer_step.get_expected_model(),
)

In [None]:
# Batch transformer that applies the featurizer model to the raw training
# data and writes CSV output to the transformed-training S3 location.
train_transformer = Transformer(
    model_name=execution_input['FeaturizerModelName'],
    instance_type='ml.m4.2xlarge',
    instance_count=3,
    strategy='MultiRecord',
    accept='text/csv',
    assemble_with='Line',
    output_path=s3_train_transformed.config['DataSource']['S3DataSource']['S3Uri'],
    output_kms_key='alias/aws/s3',
)

# Workflow step wrapping the transformer above; input is read as CSV and
# split by line, and the step waits for the transform job to finish.
transform_train_step = TransformStep(
    'Transform Training Data',
    transformer=train_transformer,
    model_name=execution_input['FeaturizerModelName'],
    job_name=execution_input['TransformTrainJobName'],
    data=s3_train_raw.config['DataSource']['S3DataSource']['S3Uri'],
    split_type='Line',
    content_type='text/csv',
    wait_for_completion=True,
)

In [None]:
# Batch transformer that applies the featurizer model to the raw test data
# and writes CSV output to the transformed-test S3 location.
test_transformer = Transformer(
    model_name=execution_input['FeaturizerModelName'],
    instance_type='ml.m4.2xlarge',
    instance_count=3,
    strategy='MultiRecord',
    accept='text/csv',
    assemble_with='Line',
    output_path=s3_test_transformed.config['DataSource']['S3DataSource']['S3Uri'],
    output_kms_key='alias/aws/s3',
)

# Workflow step wrapping the transformer above; input is read as CSV and
# split by line, and the step waits for the transform job to finish.
transform_test_step = TransformStep(
    'Transform Test Data',
    transformer=test_transformer,
    model_name=execution_input['FeaturizerModelName'],
    job_name=execution_input['TransformTestJobName'],
    data=s3_test_raw.config['DataSource']['S3DataSource']['S3Uri'],
    split_type='Line',
    content_type='text/csv',
    wait_for_completion=True,
)

In [None]:
# Parallel state holding the two transform steps as separate branches
# (test branch added first, then train).
parallel_transform_step = Parallel(state_id="Branch Transformations")

for branch_step in (transform_test_step, transform_train_step):
    parallel_transform_step.add_branch(branch_step)

In [None]:
# Estimator for the XGBoost model, using the built-in XGBoost image and
# writing model artifacts to the pipeline's output location.
# NOTE(review): `num_round` of 2 looks like a smoke-test value -- confirm.
# NOTE(review): `rate_drop` and `tweedie_variance_power` are documented by
# XGBoost for the dart booster and the tweedie objective respectively;
# confirm they are intended alongside `binary:logistic`.
xgboost_estimator = Estimator(
    image_name=xgboost_image,
    role=hbomax_datascience_service_role,
    train_instance_count=1,
    train_instance_type='ml.m4.4xlarge',
    output_path=s3_out_data,
    output_kms_key='alias/aws/s3',
    hyperparameters={
        'eval_metric': 'auc',
        'alpha': 1.218487609,
        'eta': 0.225242353,
        'max_depth': 10,
        'min_child_weight': 2.284773815,
        'num_round': 2,
        'objective': 'binary:logistic',
        'rate_drop': 0.3,
        'tweedie_variance_power': 1.4,
    },
    sagemaker_session=sagemaker_session,
)

In [None]:
# Workflow step that trains XGBoost on the transformed training data, with
# the transformed test data as the validation channel. The job name is
# supplied at execution time.
train_xgboost_step = TrainingStep(
    'Train XGBoost',
    job_name=execution_input['TrainXGBoostJobName'],
    estimator=xgboost_estimator,
    data={
        'train': s3_train_transformed,
        'validation': s3_test_transformed,
    },
    tags={'model': 'free_trial_xgboost'},
)

In [None]:
# Workflow step that invokes the `free_trial_create_pipeline_model` Lambda,
# passing it the pipeline model name (supplied at execution time) and the
# execution role.
# NOTE(review): the payload carries no featurizer/XGBoost model or job names;
# presumably the Lambda resolves those itself -- confirm against its source.
create_pipeline_model_step = LambdaStep(
    'Create Pipeline Model',
    parameters={
        'FunctionName': 'free_trial_create_pipeline_model',
        'Payload': {
            'PipelineModelName': execution_input['PipelineModelName'],
            'Role': hbomax_datascience_service_role,
        },
    },
)

In [None]:
# Chain the stages in execution order. To debug a single stage, chain only
# that step instead, e.g. steps.Chain([train_xgboost_step]).
workflow_definition = steps.Chain([
    fit_featurizer_step,
    create_featurizer_model_step,
    parallel_transform_step,
    train_xgboost_step,
    create_pipeline_model_step,
])

workflow = Workflow(
    name=pipeline_name,
    role=hbomax_datascience_service_role,
    definition=workflow_definition,
    execution_input=execution_input,
)

# Create the workflow, then push the current definition to it.
workflow.create()
workflow.update(workflow_definition)