# Environment Setup for triggering Vertex Pipelines

## Install Packages
Install the packages listed in `requirements.txt` with pip, either (1) in the notebook cell below or (2) in a notebook terminal window.

In [1]:
!pwd

/home/jupyter/trigger-vertex-pipelines


In [2]:
import os

# The Vertex AI Workbench Notebook product has specific requirements,
# so record whether this kernel appears to be running on it
IS_WORKBENCH_NOTEBOOK = os.getenv("DL_ANACONDA_HOME")
IS_USER_MANAGED_WORKBENCH_NOTEBOOK = os.path.exists("/opt/deeplearning/metadata/env_version")

# Vertex AI Notebook requires dependencies to be installed with '--user';
# everywhere else the flag stays empty
USER_FLAG = "--user" if IS_WORKBENCH_NOTEBOOK else ""

# !pip install --no-cache-dir -r ./requirements.txt --user -q

#### check package installs

In [3]:
# Print the installed KFP SDK version, then show the aiplatform entries from `pip3 freeze`.
! python3 -c "import kfp; print('KFP SDK version: {}'.format(kfp.__version__))"
! pip3 freeze | grep aiplatform

KFP SDK version: 2.3.0
google-cloud-aiplatform==1.34.0


# Set vars

In [4]:
# Every cloud resource created here is named from this version-tagged prefix
VERSION        = "v1"                         # TODO - @ param {type:"string"}
PREFIX         = "pipe-triggers-" + VERSION   # TODO - @ param {type:"string"}

print("PREFIX =", PREFIX)

PREFIX = pipe-triggers-v1


### GCP project

See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)

In [5]:
# Resolve the active GCP project and derive the service account used by later cells.
# creds, PROJECT_ID = google.auth.default()
# `!cmd` assignment captures the command output as a list of lines; the first line is used.
# NOTE(review): if gcloud prints a warning first, or no project is configured,
# element [0] will not be a valid project id — confirm before running downstream cells.
GCP_PROJECTS            = !gcloud config get-value project
PROJECT_ID              = GCP_PROJECTS[0]

# PROJECT_NUM first holds the captured output lines, then is rebound to the first line.
PROJECT_NUM             = !gcloud projects describe $PROJECT_ID --format="value(projectNumber)"
PROJECT_NUM             = PROJECT_NUM[0]

# Built as `<project-number>-compute@developer.gserviceaccount.com`
# (presumably the project's Compute Engine default service account — verify it exists).
SERVICE_ACCOUNT         = f'{PROJECT_NUM}-compute@developer.gserviceaccount.com'

print(f"PROJECT_ID      = {PROJECT_ID}")
print(f"PROJECT_NUM     = {PROJECT_NUM}")
print(f"SERVICE_ACCOUNT = {SERVICE_ACCOUNT}")

PROJECT_ID      = hybrid-vertex
PROJECT_NUM     = 934903580331
SERVICE_ACCOUNT = 934903580331-compute@developer.gserviceaccount.com


In [6]:
# Set PROJECT_ID as the project in the local gcloud configuration (core/project)
! gcloud config set project {PROJECT_ID}

Updated property [core/project].


#### enable GCP services

In [7]:
# !gcloud services enable pubsub.googleapis.com
# !gcloud services enable run.googleapis.com

### Edit these values

In [8]:
# gcp IAM & Networking: name of the VPC network to use
VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network"

# region for cloud resources; the BigQuery location is the upper-cased
# text before the first "-" of the region
REGION                   = "us-central1"
BQ_REGION                = REGION.partition("-")[0].upper()

print("VPC_NETWORK_NAME =", VPC_NETWORK_NAME)
print("REGION           =", REGION)
print("BQ_REGION        =", BQ_REGION)

VPC_NETWORK_NAME = ucaip-haystack-vpc-network
REGION           = us-central1
BQ_REGION        = US


### Resource names for this tutorial

In [9]:
# Cloud Storage bucket, named from the resource prefix and the project id
BUCKET_NAME      = "-".join([PREFIX, PROJECT_ID])
BUCKET_URI       = "gs://" + BUCKET_NAME

# fully-qualified resource path of the VPC network
VPC_NETWORK_FULL = "projects/{}/global/networks/{}".format(PROJECT_NUM, VPC_NETWORK_NAME)

# PubSub topic - for triggering
TOPIC_ID         = PREFIX + "-topic"

print("BUCKET_URI       :", BUCKET_URI)
print("VPC_NETWORK_FULL :", VPC_NETWORK_FULL)
print("TOPIC_ID         :", TOPIC_ID)

BUCKET_URI       : gs://pipe-triggers-v1-hybrid-vertex
VPC_NETWORK_FULL : projects/934903580331/global/networks/ucaip-haystack-vpc-network
TOPIC_ID         : pipe-triggers-v1-topic


In [10]:
# bigquery: dataset ids use underscores, so dashes in the prefix are replaced
DATASET_ID       = ("census_" + PREFIX).replace("-", "_")  # The Data Set ID where the view sits
VIEW_NAME        = "census_data"                           # BigQuery view you create for input data
BQ_LOG_TABLE     = "census_training_table"

# `project.dataset.table` identifier of the training table
BQ_LOG_DATA_URI  = ".".join((PROJECT_ID, DATASET_ID, BQ_LOG_TABLE))

print("DATASET_ID      :", DATASET_ID)
print("VIEW_NAME       :", VIEW_NAME)
print("BQ_LOG_DATA_URI :", BQ_LOG_DATA_URI)

DATASET_ID      : census_pipe_triggers_v1
VIEW_NAME       : census_data
BQ_LOG_DATA_URI : hybrid-vertex.census_pipe_triggers_v1.census_training_table


In [11]:
# pipelines: display name derived from the resource prefix
PIPELINE_DISPLAY_NAME  = "census-" + PREFIX

# The pipeline root in the bucket is where all pipeline artifacts are sent;
# the compiled spec file lives under its pipeline-spec/ folder
PIPELINE_YAML_FILENAME = "pipeline.yaml"
PIPELINE_ROOT_PATH     = BUCKET_URI + "/census_pipeline_root"
PIPELINES_FILEPATH     = "/".join([PIPELINE_ROOT_PATH, "pipeline-spec", PIPELINE_YAML_FILENAME])

print("PIPELINE_DISPLAY_NAME  :", PIPELINE_DISPLAY_NAME)
print("PIPELINE_ROOT_PATH     :", PIPELINE_ROOT_PATH)
print("PIPELINE_YAML_FILENAME :", PIPELINE_YAML_FILENAME)
print("PIPELINES_FILEPATH     :", PIPELINES_FILEPATH)

PIPELINE_DISPLAY_NAME  : census-pipe-triggers-v1
PIPELINE_ROOT_PATH     : gs://pipe-triggers-v1-hybrid-vertex/census_pipeline_root
PIPELINE_YAML_FILENAME : pipeline.yaml
PIPELINES_FILEPATH     : gs://pipe-triggers-v1-hybrid-vertex/census_pipeline_root/pipeline-spec/pipeline.yaml


# Create Cloud Resources

#### GCS bucket

In [13]:
! gsutil mb -l $REGION -p $PROJECT_ID $BUCKET_URI

Creating gs://pipe-triggers-v1-hybrid-vertex/...


In [14]:
# Grant SERVICE_ACCOUNT two roles on the bucket via `gsutil iam ch`.

# roles/storage.objectCreator on BUCKET_URI
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectCreator $BUCKET_URI

# roles/storage.objectViewer on BUCKET_URI
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectViewer $BUCKET_URI

#### BigQuery dataset and tables

In [15]:
!bq mk --location=$BQ_REGION --dataset $PROJECT_ID:$DATASET_ID

Dataset 'hybrid-vertex:census_pipe_triggers_v1' successfully created.


In [12]:
import pandas as pd
from google.cloud import bigquery

In [13]:
# Public census table that is copied from in the cells below
BQ_PUBLIC_DS_URI = 'bigquery-public-data.ml_datasets.census_adult_income'

# BigQuery client bound to this project and the BigQuery location
bq_client = bigquery.Client(project=PROJECT_ID, location=BQ_REGION)

In [26]:
# SQL that (re)creates the training table from the first 1000 rows returned by the public table.
# NOTE(review): LIMIT without ORDER BY — which 1000 rows are copied is presumably not
# guaranteed to be the same on every run; confirm if reproducibility matters.
query = f"""
CREATE OR REPLACE TABLE `{BQ_LOG_DATA_URI}` AS (
   SELECT * FROM `{BQ_PUBLIC_DS_URI}` 
   LIMIT 1000
)
"""
print(query)


CREATE OR REPLACE TABLE `hybrid-vertex.census_pipe_triggers_v1.census_training_table` AS (
   SELECT * FROM `bigquery-public-data.ml_datasets.census_adult_income` 
   LIMIT 1000
)



In [27]:
%%time
# Submit the CREATE OR REPLACE TABLE statement built above and wait for its result.
bq_client.query(query).result()

CPU times: user 8 ms, sys: 3.19 ms, total: 11.2 ms
Wall time: 2.64 s


<google.cloud.bigquery.table._EmptyRowIterator at 0x7f7a6d004fd0>

In [28]:
# Fetch the table's metadata and display how many rows it currently holds
data_table   = bq_client.get_table(BQ_LOG_DATA_URI)
current_rows = data_table.num_rows

current_rows

1000

### Create PubSub Topic

* see [Create a Topic](https://cloud.google.com/pubsub/docs/create-topic#create_a_topic_2) for details

In [16]:
from google.cloud import pubsub_v1

In [17]:
publisher_client = pubsub_v1.PublisherClient()

TOPIC_PATH       = publisher_client.topic_path(PROJECT_ID, TOPIC_ID)

# Keep this cell safe to re-run: create_topic fails when the topic already exists,
# which would leave PUBSUB_TOPIC undefined for the config cell further down.
existing_topic_names = {
    topic.name
    for topic in publisher_client.list_topics(request={"project": f"projects/{PROJECT_ID}"})
}

if TOPIC_PATH in existing_topic_names:
    # Reuse the topic created by an earlier run
    PUBSUB_TOPIC = publisher_client.get_topic(request={"topic": TOPIC_PATH})
    print(f"Topic already exists: {PUBSUB_TOPIC.name}")
else:
    PUBSUB_TOPIC = publisher_client.create_topic(request={"name": TOPIC_PATH})
    print(f"Created topic: {PUBSUB_TOPIC.name}")

In [51]:
# ! gcloud pubsub topics list

### Create PubSub subscription

> Note: Cloud Functions [automatically creates push subscriptions](https://cloud.google.com/functions/docs/tutorials/pubsub) that forward messages from the specified topic to your function, even within a VPC Service Controls environment.

* see [Create a Push Subscription](https://cloud.google.com/pubsub/docs/create-push-subscription) for more details

In [None]:
# from google.cloud import pubsub_v1

# # TODO(developer)
# # project_id = "your-project-id"
# # topic_id = "your-topic-id"
# # subscription_id = "your-subscription-id"
# # endpoint = "https://my-test-project.appspot.com/push"

# publisher = pubsub_v1.PublisherClient()
# subscriber = pubsub_v1.SubscriberClient()
# topic_path = publisher.topic_path(project_id, topic_id)
# subscription_path = subscriber.subscription_path(project_id, subscription_id)

# push_config = pubsub_v1.types.PushConfig(push_endpoint=endpoint)

# # Wrap the subscriber in a 'with' block to automatically call close() to
# # close the underlying gRPC channel when done.
# with subscriber:
#     subscription = subscriber.create_subscription(
#         request={
#             "name": subscription_path,
#             "topic": topic_path,
#             "push_config": push_config,
#         }
#     )

# print(f"Push subscription created: {subscription}.")
# print(f"Endpoint for subscription is: {endpoint}")

## Save Notebook Configuration Data

Save these values to a config file so that you don't have to re-enter them in other notebooks.

In [19]:
# Render every setting as a Python assignment so the text can be saved as a module.
# Double quotes need no escaping inside a triple-quoted f-string.
config = f"""
PROJECT_ID             = "{PROJECT_ID}"
PROJECT_NUM            = "{PROJECT_NUM}"

REGION                 = "{REGION}"
BQ_REGION              = "{BQ_REGION}"
BQ_PUBLIC_DS_URI       = "{BQ_PUBLIC_DS_URI}"
DATASET_ID             = "{DATASET_ID}"
VIEW_NAME              = "{VIEW_NAME}"
BQ_LOG_DATA_URI        = "{BQ_LOG_DATA_URI}"

VPC_NETWORK_NAME       = "{VPC_NETWORK_NAME}"
VPC_NETWORK_FULL       = "{VPC_NETWORK_FULL}"

SERVICE_ACCOUNT        = "{SERVICE_ACCOUNT}"

VERSION                = "{VERSION}"
PREFIX                 = "{PREFIX}"

BUCKET_NAME            = "{BUCKET_NAME}"
BUCKET_URI             = "{BUCKET_URI}"

TOPIC_PATH             = "{TOPIC_PATH}"
PUBSUB_TOPIC           = "{PUBSUB_TOPIC.name}"

PIPELINE_DISPLAY_NAME  = "{PIPELINE_DISPLAY_NAME}"
PIPELINE_ROOT_PATH     = "{PIPELINE_ROOT_PATH}"
PIPELINE_YAML_FILENAME = "{PIPELINE_YAML_FILENAME}"
PIPELINES_FILEPATH     = "{PIPELINES_FILEPATH}"
"""
print(config)


PROJECT_ID             = "hybrid-vertex"
PROJECT_NUM            = "934903580331"

REGION                 = "us-central1"
BQ_REGION              = "US"
BQ_PUBLIC_DS_URI       = "bigquery-public-data.ml_datasets.census_adult_income"
DATASET_ID             = "census_pipe_triggers_v1"
VIEW_NAME              = "census_data"
BQ_LOG_DATA_URI        = "hybrid-vertex.census_pipe_triggers_v1.census_training_table"

VPC_NETWORK_NAME       = "ucaip-haystack-vpc-network"
VPC_NETWORK_FULL       = "projects/934903580331/global/networks/ucaip-haystack-vpc-network"

SERVICE_ACCOUNT        = "934903580331-compute@developer.gserviceaccount.com"

VERSION                = "v1"
PREFIX                 = "pipe-triggers-v1"

BUCKET_NAME            = "pipe-triggers-v1-hybrid-vertex"
BUCKET_URI             = "gs://pipe-triggers-v1-hybrid-vertex"

TOPIC_PATH             = "projects/hybrid-vertex/topics/pipe-triggers-v1-topic"
PUBSUB_TOPIC           = "projects/hybrid-vertex/topics/pipe-triggers-v1-topic"

PIPELI

In [20]:
# Pipe the rendered config text through stdin to gsutil, storing it as config/notebook_env.py in the bucket.
# NOTE(review): the text is wrapped in single quotes for the shell, so a config value
# containing a single quote would break this command — confirm values stay quote-free.
!echo '{config}' | gsutil cp - {BUCKET_URI}/config/notebook_env.py

Copying from <STDIN>...
/ [1 files][    0.0 B/    0.0 B]                                                
Operation completed over 1 objects.                                              


In [21]:
!gsutil ls $BUCKET_URI

gs://pipe-triggers-v1-hybrid-vertex/census_pipeline_root/
gs://pipe-triggers-v1-hybrid-vertex/config/


#### write config to py file

In [40]:
!pwd

/home/jupyter/pipe-triggers


In [None]:
CF_DIR = "cloud_function"

# Create the package directory up front so writing env_config.py into it cannot
# fail on a fresh checkout; exist_ok keeps this cell safe to re-run.
os.makedirs(CF_DIR, exist_ok=True)

In [41]:
# Save the rendered config text as env_config.py inside the CF_DIR folder
config_file_path = f'{CF_DIR}/env_config.py'
with open(config_file_path, 'w') as config_file:
    config_file.write(config)

In [42]:
# Sanity check: the generated module imports and exposes the saved value
from cloud_function import env_config

env_config.PIPELINE_DISPLAY_NAME

'census-pipe-triggers-v1'

### git ignore

In [30]:
%%writefile .gitignore
*.cpython-310.pyc
*checkpoint*
*.ipynb_checkpoints/*
*WIP*
*keep-local*
# .gcloudignore
# .git
# .github
*__pycache__
# *cpython-37.pyc
# .gitignore
# .DS_Store

Overwriting .gitignore


### gcloud ignore

In [23]:
! gcloud config set gcloudignore/enabled true

Updated property [gcloudignore/enabled].


In [28]:
%%writefile .gcloudignore
*__pycache__
*.ipynb_checkpoints/*
*.ipynb
.gcloudignore
*.git
*.gitignore
*.md

Overwriting .gcloudignore


In [29]:
# check eligible files: list what gcloud would upload given the ignore rules
!gcloud meta list-files-for-upload

pipeline.yaml
requirements.txt
cloud_function/env_config.py
cloud_function/main.py
cloud_function/requirements.txt


### Delete `__pycache__` directories

In [54]:
# Raw string keeps the regex backslashes literal without invalid-escape warnings.
LIST_CMD   = r'find . | grep -E "(/__pycache__$|\.pyc$|\.pyo$)"'
# The delete command is the list command piped into rm, so the two can never drift apart.
DELETE_CMD = f"{LIST_CMD} | xargs rm -rf"

# The commands are only printed, not executed, so they can be reviewed before deleting anything.
print("copy these commands into terminal:\n")
print(LIST_CMD)
print(DELETE_CMD)

copy these commands into terminal:

find . | grep -E "(/__pycache__$|\.pyc$|\.pyo$)"
find . | grep -E "(/__pycache__$|\.pyc$|\.pyo$)" | xargs rm -rf


**Finished**