# Create event-based trigger with Cloud Functions

Create a Cloud Function that runs the pipeline whenever new data is inserted into a BigQuery table

> Specifically, we use an Eventarc trigger to invoke the function whenever a `google.cloud.bigquery.v2.JobService.InsertJob` event occurs

> refer to BQ resource as: `projects/hybrid-vertex/datasets/mlops/tables/chicago`

For more information, see [Eventarc triggers](https://cloud.google.com/functions/docs/calling/eventarc) and [supported event types](https://cloud.google.com/eventarc/docs/reference/supported-events)

# Create Cloud Function trigger

## Follow these steps in the console:

### [1] In the Google Cloud console, go to the [Cloud Run functions](https://console.cloud.google.com/functions)

### [2] Click the Create Function button. In the Configuration page:

  * Select **2nd gen** as your environment.
  * For Function name, use **mlops**.
  * For **Region**, select the same region as your Cloud Storage bucket and Artifact Registry repository.
  * For **Trigger**, select **Other trigger**. The Eventarc Trigger pane opens.
    * For **Trigger Type**, choose **Google Sources**.
    * For **Event Provider**, choose **BigQuery**.
    * For **Event type**, choose `google.cloud.bigquery.v2.JobService.InsertJob`
    * For **Resource**, choose Specific resource and specify the **BigQuery table:**
    
        > `projects/hybrid-vertex/datasets/mlops/tables/chicago`
    
    * In the Region field, select a location for the Eventarc trigger, if applicable. See [Trigger location](https://cloud.google.com/functions/docs/calling/eventarc#trigger-location) for more information.
    
    * Click **Save Trigger**
    
If you are asked to grant roles to service account(s), click **Grant All**

### [3] Click **Next** to go to the **Code page**. In the **Code page**:

* Set the **Runtime** to **Python 3.12**
* Set the **Entry point** to `mlops_entrypoint`
* Open the file `requirements.txt` and replace the contents with the following:

> ```text
> requests==2.31.0
> google-auth==2.25.1
> ```

* With the **Inline Editor**, open the file `main.py` and replace the contents with the script below

In [None]:
# TODO: 
## update vars: `PROJECT_ID`,`REGION`,`BUCKET_NAME`
## update script:

# import json
# import functions_framework
# import requests
# import google.auth
# import google.auth.transport.requests
# # CloudEvent function to be triggered by an Eventarc Cloud Audit Logging trigger
# # Note: this is NOT designed for second-party (Cloud Audit Logs -> Pub/Sub) triggers!
# @functions_framework.cloud_event
# def mlops_entrypoint(cloudevent):
#     # Print out the CloudEvent's (required) `type` property
#     # See https://github.com/cloudevents/spec/blob/v1.0.1/spec.md#type
#     print(f"Event type: {cloudevent['type']}")

#     # Print out the CloudEvent's (optional) `subject` property
#     # See https://github.com/cloudevents/spec/blob/v1.0.1/spec.md#subject
#     if 'subject' in cloudevent:
#         # CloudEvent objects don't support `get` operations.
#         # Use the `in` operator to verify `subject` is present.
#         print(f"Subject: {cloudevent['subject']}")

#     # Print out details from the `protoPayload`
#     # This field encapsulates a Cloud Audit Logging entry
#     # See https://cloud.google.com/logging/docs/audit#audit_log_entry_structure

#     payload = cloudevent.data.get("protoPayload")
#     if payload:
#         print(f"API method: {payload.get('methodName')}")
#         print(f"Resource name: {payload.get('resourceName')}")
#         print(f"Principal: {payload.get('authenticationInfo', dict()).get('principalEmail')}")
#         row_count = payload.get('metadata', dict()).get('tableDataChange',dict()).get('insertedRowsCount')
#         print(f"No. of rows: {row_count} !!")
#         if row_count:
#             if int(row_count) > 0:
#                 print ("Pipeline trigger Condition met !!")
#                 submit_pipeline_job()
#         else:
#             print ("No pipeline triggered !!!")

# def submit_pipeline_job():
#     PROJECT_ID = 'PROJECT_ID'
#     REGION = 'REGION'
#     BUCKET_NAME = "BUCKET_NAME"
#     DATASET_NAME = "mlops"
#     TABLE_NAME = "chicago"

#     base_output_dir = BUCKET_NAME
#     BUCKET_URI = "gs://{}".format(BUCKET_NAME)
#     PIPELINE_ROOT = "{}/pipeline_root/chicago-taxi-pipe".format(BUCKET_URI)
#     PIPELINE_NAME = "vertex-mlops-pipeline-tutorial"
#     EXPERIMENT_NAME = PIPELINE_NAME + "-experiment"
#     REPO_NAME ="mlops"
#     TEMPLATE_NAME="custom-model-training-evaluation-pipeline"
#     TRAINING_JOB_DISPLAY_NAME="taxifare-prediction-training-job"
#     worker_pool_specs = [{
#                         "machine_spec": {"machine_type": "e2-highmem-2"},
#                         "replica_count": 1,
#                         "python_package_spec":{
#                                 "executor_image_uri": "us-docker.pkg.dev/vertex-ai/training/sklearn-cpu.1-0:latest",
#                                 "package_uris": [f"{BUCKET_URI}/trainer-0.1.tar.gz"],
#                                 "python_module": "trainer.task",
#                                 "args":["--project-id",PROJECT_ID,"--training-dir",f"/gcs/{BUCKET_NAME}","--bq-source",f"{PROJECT_ID}.{DATASET_NAME}.{TABLE_NAME}"]
#                         },
#     }]

#     parameters = {
#         "project": PROJECT_ID,
#         "location": REGION,
#         "training_job_display_name": "taxifare-prediction-training-job",
#         "worker_pool_specs": worker_pool_specs,
#         "base_output_dir": BUCKET_URI,
#         "prediction_container_uri": "us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-0:latest",
#         "model_display_name": "taxifare-prediction-model",
#         "batch_prediction_job_display_name": "taxifare-prediction-batch-job",
#         "target_field_name": "fare",
#         "test_data_gcs_uri": [f"{BUCKET_URI}/test_no_target.csv"],
#         "ground_truth_gcs_source": [f"{BUCKET_URI}/test.csv"],
#         "batch_predictions_gcs_prefix": f"{BUCKET_URI}/batch_predict_output",
#         "existing_model": False
#     }
#     TEMPLATE_URI = f"https://{REGION}-kfp.pkg.dev/{PROJECT_ID}/{REPO_NAME}/{TEMPLATE_NAME}/latest"
#     print("TEMPLATE URI: ", TEMPLATE_URI)
#     request_body = {
#         "name": PIPELINE_NAME,
#         "displayName": PIPELINE_NAME,
#         "runtimeConfig":{
#             "gcsOutputDirectory": PIPELINE_ROOT,
#             "parameterValues": parameters,
#         },
#         "templateUri": TEMPLATE_URI
#     }
#     # Vertex AI endpoints are regional: the host must match the `locations/{REGION}` segment of the path
#     pipeline_url = f"https://{REGION}-aiplatform.googleapis.com/v1/projects/{PROJECT_ID}/locations/{REGION}/pipelineJobs"
#     creds, project = google.auth.default()
#     auth_req = google.auth.transport.requests.Request()
#     creds.refresh(auth_req)
#     headers = {
#     'Authorization': 'Bearer {}'.format(creds.token),
#     'Content-Type': 'application/json; charset=utf-8'
#     }
#     response = requests.request("POST", pipeline_url, headers=headers, data=json.dumps(request_body))
#     print(response.text)

### [4] Click **Deploy** to deploy the function

# Add new data to trigger pipeline

In [1]:
from google.cloud import bigquery

# Show which version of the BigQuery client library was imported.
print(f'bigquery SDK version: {bigquery.__version__}')

bigquery SDK version: 3.25.0


In [2]:
PROJECT_ID = "hybrid-vertex"
REGION = "us-central1"

bq_client = bigquery.Client(
    project=PROJECT_ID,
    location=REGION
)

# Set the project id
! gcloud config set project {PROJECT_ID}

Updated property [core/project].


In [3]:
# INSERT statement that appends rows to `{PROJECT_ID}.mlops.chicago`.
# Source rows come from `{PROJECT_ID}.mlops.taxi_trips`, restricted to 2022 trips
# with non-null pickup/dropoff coordinates and positive miles, seconds and fare.
# The outer SELECT adds derived columns (month / day / day-of-week / hour,
# 0.1-snapped pickup and dropoff grid cells, their concatenation, the
# pickup-to-dropoff distance, and a tip_bin flag set when tips/fare >= 0.2)
# and is capped at 1,000,000 rows.
QUERY = f"""
INSERT INTO `{PROJECT_ID}.mlops.chicago`
(
    WITH
      taxitrips AS (
      SELECT
        trip_start_timestamp,
        trip_end_timestamp,
        trip_seconds,
        trip_miles,
        payment_type,
        pickup_longitude,
        pickup_latitude,
        dropoff_longitude,
        dropoff_latitude,
        tips,
        tolls,
        fare,
        pickup_community_area,
        dropoff_community_area,
        company,
        unique_key
      FROM
        `{PROJECT_ID}.mlops.taxi_trips`
      WHERE pickup_longitude IS NOT NULL
      AND pickup_latitude IS NOT NULL
      AND dropoff_longitude IS NOT NULL
      AND dropoff_latitude IS NOT NULL
      AND trip_miles > 0
      AND trip_seconds > 0
      AND fare > 0
      AND EXTRACT(YEAR FROM trip_start_timestamp) = 2022
    )

    SELECT
      trip_start_timestamp,
      EXTRACT(MONTH from trip_start_timestamp) as trip_month,
      EXTRACT(DAY from trip_start_timestamp) as trip_day,
      EXTRACT(DAYOFWEEK from trip_start_timestamp) as trip_day_of_week,
      EXTRACT(HOUR from trip_start_timestamp) as trip_hour,
      trip_seconds,
      trip_miles,
      payment_type,
      ST_AsText(
          ST_SnapToGrid(ST_GeogPoint(pickup_longitude, pickup_latitude), 0.1)
      ) AS pickup_grid,
      ST_AsText(
          ST_SnapToGrid(ST_GeogPoint(dropoff_longitude, dropoff_latitude), 0.1)
      ) AS dropoff_grid,
      ST_Distance(
          ST_GeogPoint(pickup_longitude, pickup_latitude),
          ST_GeogPoint(dropoff_longitude, dropoff_latitude)
      ) AS euclidean,
      CONCAT(
          ST_AsText(ST_SnapToGrid(ST_GeogPoint(pickup_longitude,
              pickup_latitude), 0.1)),
          ST_AsText(ST_SnapToGrid(ST_GeogPoint(dropoff_longitude,
              dropoff_latitude), 0.1))
      ) AS loc_cross,
      IF((tips/fare >= 0.2), 1, 0) AS tip_bin,
      tips,
      tolls,
      fare,
      pickup_longitude,
      pickup_latitude,
      dropoff_longitude,
      dropoff_latitude,
      pickup_community_area,
      dropoff_community_area,
      company,
      unique_key,
      trip_end_timestamp
    FROM
      taxitrips
    LIMIT 1000000
)
"""

# Print the rendered SQL, with PROJECT_ID substituted, for inspection.
print(QUERY)


INSERT INTO `hybrid-vertex.mlops.chicago`
(
    WITH
      taxitrips AS (
      SELECT
        trip_start_timestamp,
        trip_end_timestamp,
        trip_seconds,
        trip_miles,
        payment_type,
        pickup_longitude,
        pickup_latitude,
        dropoff_longitude,
        dropoff_latitude,
        tips,
        tolls,
        fare,
        pickup_community_area,
        dropoff_community_area,
        company,
        unique_key
      FROM
        `hybrid-vertex.mlops.taxi_trips`
      WHERE pickup_longitude IS NOT NULL
      AND pickup_latitude IS NOT NULL
      AND dropoff_longitude IS NOT NULL
      AND dropoff_latitude IS NOT NULL
      AND trip_miles > 0
      AND trip_seconds > 0
      AND fare > 0
      AND EXTRACT(YEAR FROM trip_start_timestamp) = 2022
    )

    SELECT
      trip_start_timestamp,
      EXTRACT(MONTH from trip_start_timestamp) as trip_month,
      EXTRACT(DAY from trip_start_timestamp) as trip_day,
      EXTRACT(DAYOFWEEK from trip_start_

In [4]:
# Submit the INSERT statement and block until the job has finished.
job = bq_client.query(query=QUERY)
job.result()

# Time between the job's start and end timestamps, in seconds (shown as the cell output).
elapsed = job.ended - job.started
elapsed.total_seconds()

8.227