# Triggers for QA Pipeline

## Scheduled Trigger

**Parameters**

In [9]:
import uuid

rule_name = 'DailyQAPipelineTrigger'
pipeline_id = f"Id{uuid.uuid1()}"
pipeline_name = 'qa-pipeline-16322955241632295524'

In [14]:
import boto3

sm = boto3.Session().client('sagemaker')
pipeline = sm.describe_pipeline(PipelineName=pipeline_name)
pipeline_arn = pipeline['PipelineArn']
pipeline_role_arn = pipeline['RoleArn']

print(f"pipeline_arn:\t\t {pipeline_arn}")
print(f"run_pipeline_role_arn:\t {pipeline_role_arn}") #"arn:aws:iam::093729152554:role/service-role/AmazonSageMakerServiceCatalogProductsUseRole"

pipeline_arn:		 arn:aws:sagemaker:us-east-1:093729152554:pipeline/qa-pipeline-16322955241632295524
run_pipeline_role_arn:	 arn:aws:iam::093729152554:role/service-role/AWSNeptuneNotebookRole-NepTestRole


Create a rule that executes the pipeline every 1 day:

In [15]:
import boto3

events = boto3.client('events')

# use the same name will update the trigger
events.put_rule(
    Name=rule_name,
    ScheduleExpression='rate(1 day)',
    State='DISABLED',
    Description='Daily re-run the question answering pipeline',
    EventBusName='default'
)

{'RuleArn': 'arn:aws:events:us-east-1:093729152554:rule/DailyQAPipelineTrigger',
 'ResponseMetadata': {'RequestId': '63aab4c5-f291-493d-ba19-e9676cb23252',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '63aab4c5-f291-493d-ba19-e9676cb23252',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '79',
   'date': 'Fri, 24 Sep 2021 09:22:38 GMT'},
  'RetryAttempts': 0}}

Add the qa code pipeline as a target:

In [16]:
events.put_targets(
    Rule=rule_name,
    EventBusName='default',
    Targets=[
        {
            "Id": pipeline_id,
            "Arn": pipeline_arn,
            "RoleArn": pipeline_role_arn
        }
    ]
)

{'FailedEntryCount': 0,
 'FailedEntries': [],
 'ResponseMetadata': {'RequestId': '676de8ef-7ff4-430f-9ce6-4984a426cd26',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '676de8ef-7ff4-430f-9ce6-4984a426cd26',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '41',
   'date': 'Fri, 24 Sep 2021 09:23:33 GMT'},
  'RetryAttempts': 0}}

Check events with aws cli

In [17]:
!echo 'Rule description:'
!aws events describe-rule --name $rule_name
!echo 'Targets associated:'
!aws events list-targets-by-rule --rule $rule_name

Rule description:
{
    "Name": "DailyQAPipelineTrigger",
    "Arn": "arn:aws:events:us-east-1:093729152554:rule/DailyQAPipelineTrigger",
    "ScheduleExpression": "rate(1 day)",
    "State": "DISABLED",
    "Description": "Daily re-run the question answering pipeline",
    "EventBusName": "default",
    "CreatedBy": "093729152554"
}
Targets associated:
{
    "Targets": [
        {
            "Id": "Id7835a714-1d18-11ec-a471-5ed24fdeea0c",
            "Arn": "arn:aws:sagemaker:us-east-1:093729152554:pipeline/qa-pipeline-16322955241632295524",
            "RoleArn": "arn:aws:iam::093729152554:role/service-role/AWSNeptuneNotebookRole-NepTestRole"
        }
    ]
}


## S3 Trigger

In [21]:
watched_bucket = 'sm-nlp-data'
watched_prefix = 'nlu/data/processed/' # set watchted prefix to '' if you want to watch the whole bucket
trail_name = 'WatchQAInputData'
s3_rule_name = 'QA-S3-Trigger'
s3_rule_description = 'Run question answering pipeline every time new data uploaded to specified location.'

import sagemaker
# We use default_bucket to save logs as it's better to store logs in a different bucket to avoid spin triggering.
default_bucket = sagemaker.Session().default_bucket()
default_bucket

'sagemaker-us-east-1-093729152554'

### 1. Attach policy to S3 bucket to receive the log files 


    
Check [permission for cloudtrail](https://docs.aws.amazon.com/awscloudtrail/latest/userguide/create-s3-bucket-policy-for-cloudtrail.html?icmpid=docs_cloudtrail_console) for more detail.

You can delete bucket policy to revoke permission using:</br>
`$ aws s3api delete-bucket-policy --bucket [your-bucket]`

Note: you might want to keep original policy statements. But for the sake of simplicity, here I just overwrite the bucket policy. You can check the original policy with the following code.

In [22]:
import boto3
import json
from pprint import pprint

# Retrieve the original policy of the specified bucket (this would be overwritten by new policies)
s3 = boto3.client('s3')
result = s3.get_bucket_policy(Bucket=default_bucket)
pprint(json.loads(result['Policy']))

{'Statement': [{'Action': 's3:GetBucketAcl',
                'Effect': 'Allow',
                'Principal': {'Service': 'cloudtrail.amazonaws.com'},
                'Resource': 'arn:aws:s3:::sagemaker-us-east-1-093729152554',
                'Sid': 'AWSCloudTrailAclCheck20150319'},
               {'Action': 's3:PutObject',
                'Condition': {'StringEquals': {'s3:x-amz-acl': 'bucket-owner-full-control'}},
                'Effect': 'Allow',
                'Principal': {'Service': 'cloudtrail.amazonaws.com'},
                'Resource': 'arn:aws:s3:::sagemaker-us-east-1-093729152554/AWSLogs/093729152554/*',
                'Sid': 'AWSCloudTrailWrite20150319'}],
 'Version': '2012-10-17'}


In [27]:
account_id = boto3.client('sts').get_caller_identity().get('Account')
log_bucket_policy = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Sid": "AWSCloudTrailAclCheck20150319",
            "Effect": "Allow",
            "Principal": {"Service": "cloudtrail.amazonaws.com"},
            "Action": "s3:GetBucketAcl",
            "Resource": f"arn:aws:s3:::{default_bucket}"
        },
        {
            "Sid": "AWSCloudTrailWrite20150319",
            "Effect": "Allow",
            "Principal": {"Service": "cloudtrail.amazonaws.com"},
            "Action": "s3:PutObject",
            "Resource": f"arn:aws:s3:::{default_bucket}/AWSLogs/{account_id}/*",
            "Condition": {"StringEquals": {"s3:x-amz-acl": "bucket-owner-full-control"}}
        }
    ]
}

log_bucket_policy = json.dumps(log_bucket_policy)
s3.put_bucket_policy(Bucket=default_bucket, Policy=log_bucket_policy)

{'ResponseMetadata': {'RequestId': '9XKT7VP8CDS04H2K',
  'HostId': '5TwBv38p4T+iT/iVLSNH8Mdu82Rx96tLLShqB1oAYVYyFW82ZsDni6jTFwuoZlKu4GW4rpIa2jE=',
  'HTTPStatusCode': 204,
  'HTTPHeaders': {'x-amz-id-2': '5TwBv38p4T+iT/iVLSNH8Mdu82Rx96tLLShqB1oAYVYyFW82ZsDni6jTFwuoZlKu4GW4rpIa2jE=',
   'x-amz-request-id': '9XKT7VP8CDS04H2K',
   'date': 'Fri, 24 Sep 2021 10:14:46 GMT',
   'server': 'AmazonS3'},
  'RetryAttempts': 0}}

### 2. Create a trail to log S3 events 

A trail captures API calls and related events in your account and then delivers the log files to an S3 bucket that you specify.

You can delete a previously defined trail by running (the default trail name we defined is `WatchKGInputDataset`):</br>
`$ aws cloudtrail delete-trail --name [your-trail-name]`

In [28]:
cloudtrail = boto3.client('cloudtrail')

In [32]:
cloudtrail.create_trail(
    Name=trail_name,
    S3BucketName=default_bucket, # this specifies the bucket to save logs
    TagsList=[
        {
            'Key': 'event',
            'Value': 'qa-dataset-update'
        }
    ]
)

{'Name': 'WatchQAInputData',
 'S3BucketName': 'sagemaker-us-east-1-093729152554',
 'IncludeGlobalServiceEvents': True,
 'IsMultiRegionTrail': False,
 'TrailARN': 'arn:aws:cloudtrail:us-east-1:093729152554:trail/WatchQAInputData',
 'LogFileValidationEnabled': False,
 'IsOrganizationTrail': False,
 'ResponseMetadata': {'RequestId': '11c6968c-e9b8-4a21-b72b-9177531c4427',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '11c6968c-e9b8-4a21-b72b-9177531c4427',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '277',
   'date': 'Fri, 24 Sep 2021 10:17:01 GMT'},
  'RetryAttempts': 0}}

### 3. Define event selector for CloudTrail

Use event selectors or advanced event selectors to specify management and data event settings for your trail. For each trail, if the event matches any event selector, the trail processes and logs the event.

In [33]:
watched_s3_resource_arn = "arn:aws:s3:::{}/{}".format(watched_bucket, watched_prefix)
event_selector = [
    { 
        "ReadWriteType": "WriteOnly", 
        "IncludeManagementEvents":False, 
        "DataResources": 
            [
                { 
                    "Type": "AWS::S3::Object", 
                    "Values": [watched_s3_resource_arn] 
                }
            ]
    }
]

In [34]:
cloudtrail.put_event_selectors(
    TrailName=trail_name,
    EventSelectors=event_selector
)

{'TrailARN': 'arn:aws:cloudtrail:us-east-1:093729152554:trail/WatchQAInputData',
 'EventSelectors': [{'ReadWriteType': 'WriteOnly',
   'IncludeManagementEvents': False,
   'DataResources': [{'Type': 'AWS::S3::Object',
     'Values': ['arn:aws:s3:::sm-nlp-data/nlu/data/processed/']}],
   'ExcludeManagementEventSources': []}],
 'ResponseMetadata': {'RequestId': 'de1acf58-653a-4eb4-a030-8e2cb1e5288c',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'de1acf58-653a-4eb4-a030-8e2cb1e5288c',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '298',
   'date': 'Fri, 24 Sep 2021 10:18:24 GMT'},
  'RetryAttempts': 0}}

In [35]:
cloudtrail.start_logging(
    Name=trail_name
)

{'ResponseMetadata': {'RequestId': 'ef37b715-d1db-4588-93e1-ff611b89da55',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'ef37b715-d1db-4588-93e1-ff611b89da55',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '2',
   'date': 'Fri, 24 Sep 2021 10:18:32 GMT'},
  'RetryAttempts': 0}}

### 4. Create EventBridge rule that can trigger SageMaker pipeline.

In [36]:
pattern = {
    "source": ["aws.s3"],
    "detail-type": ["AWS API Call via CloudTrail"],
    "detail": {
        "eventSource": ["s3.amazonaws.com"],
        "eventName": ["PutObject", "CompleteMultipartUpload", "CopyObject"],
        "requestParameters": {"bucketName": ["{}".format(watched_bucket)]},
    },
}

pattern_json = json.dumps(pattern)
pprint(pattern)

{'detail': {'eventName': ['PutObject', 'CompleteMultipartUpload', 'CopyObject'],
            'eventSource': ['s3.amazonaws.com'],
            'requestParameters': {'bucketName': ['sm-nlp-data']}},
 'detail-type': ['AWS API Call via CloudTrail'],
 'source': ['aws.s3']}


In [38]:
import boto3

events = boto3.client('events')

response = events.put_rule(
    Name=s3_rule_name,
    EventPattern=pattern_json,
    State="ENABLED",
    Description=s3_rule_description,
    EventBusName="default",
    Tags=[
        {
            'Key': 'event',
            'Value': 'qa-dataset-update'
        },
    ],
)
response

{'RuleArn': 'arn:aws:events:us-east-1:093729152554:rule/QA-S3-Trigger',
 'ResponseMetadata': {'RequestId': '5209885c-9bff-4761-8138-c03e8ac80c3e',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '5209885c-9bff-4761-8138-c03e8ac80c3e',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '70',
   'date': 'Fri, 24 Sep 2021 10:22:27 GMT'},
  'RetryAttempts': 0}}

In [39]:
rule_arn = response["RuleArn"]
print(rule_arn)

arn:aws:events:us-east-1:093729152554:rule/QA-S3-Trigger


### 5. Add pipeline as target to the rule

In [40]:
response = events.put_targets(
    Rule=s3_rule_name,
    EventBusName='default',
    Targets=[
        {
            "Id": pipeline_id,
            "Arn": pipeline_arn,
            "RoleArn": pipeline_role_arn
        }
    ]
)
response

{'FailedEntryCount': 0,
 'FailedEntries': [],
 'ResponseMetadata': {'RequestId': '1c4dc75f-feb0-41dc-86ba-f3e12ab7734e',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '1c4dc75f-feb0-41dc-86ba-f3e12ab7734e',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '41',
   'date': 'Fri, 24 Sep 2021 10:23:10 GMT'},
  'RetryAttempts': 0}}

### 6. Trigger pipeline by writing to the watched location