## Orchestrate RecSys workflow with Vertex AI Pipelines

In [1]:
PROJECT_ID = 'hybrid-vertex'  # <--- TODO: CHANGE THIS
LOCATION = 'us-central1'

!gcloud config set project {PROJECT_ID}

Updated property [core/project].


### pip & package

In [2]:
# !pip install google-cloud-aiplatform==1.17.0 --upgrade
# !pip install google-cloud-pipeline-components==1.0.19 --upgrade
# !pip install kfp==1.8.13 --upgrade

In [3]:
# Report the installed versions of the KFP SDK, google-cloud-pipeline-components and
# the Vertex AI SDK. Each check runs in its own `python3 -c` subprocess.
! python3 -c "import kfp; print('KFP SDK version: {}'.format(kfp.__version__))"
! python3 -c "import google_cloud_pipeline_components; print('google_cloud_pipeline_components version: {}'.format(google_cloud_pipeline_components.__version__))"
! python3 -c "import google.cloud.aiplatform; print('aiplatform SDK version: {}'.format(google.cloud.aiplatform.__version__))"

KFP SDK version: 1.8.13
google_cloud_pipeline_components version: 1.0.19
aiplatform SDK version: 1.17.0


In [4]:
import json
import os
import time
import pandas as pd
import numpy as np
import sys

# Pipelines
from typing import Any, Callable, Dict, NamedTuple, Optional, List
from google_cloud_pipeline_components import aiplatform as gcc_aip
from google_cloud_pipeline_components.types import artifact_types

# Kubeflow SDK
import kfp
from kfp.v2 import dsl
import kfp.v2.dsl
from kfp.v2 import compiler
from kfp.v2.google.client import AIPlatformClient
# from kfp.v2.google import client as pipelines_client
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,
                        OutputPath, component, Metrics)

# GCP
from google.cloud import aiplatform as vertex_ai
from google.cloud.aiplatform import pipeline_jobs
# from google.cloud import bigquery
from google.cloud import storage

### Setup clients

In [5]:
# ====================================================
# Setup Clients
# ====================================================
# Publish the project id through the environment in addition to passing it explicitly.
os.environ["GOOGLE_CLOUD_PROJECT"] = PROJECT_ID

# colab_auth.authenticate_user()

# Cloud Storage client
storage_client = storage.Client(project=PROJECT_ID)

# KFP helper client for Vertex AI Pipelines
pipeline_client = AIPlatformClient(project_id=PROJECT_ID, region=LOCATION)

# Default project/location for later Vertex AI SDK calls
vertex_ai.init(project=PROJECT_ID, location=LOCATION)



### Pipeline job vars

In [6]:
# Timestamp that makes each run name unique
TIMESTAMP = time.strftime("%Y%m%d-%H%M%S")

# ====================================================
# Job vars
# (consistent with manual train job execution)
# ====================================================
APP = "sp"
MODEL_TYPE = "2tower"
FRAMEWORK = "tfrs"
MODEL_VERSION = "v15"
PIPELINE_VERSION = "v3"
MODEL_ROOT_NAME = "-".join([APP, MODEL_TYPE, FRAMEWORK, MODEL_VERSION, PIPELINE_VERSION])

# custom identifier for organizing experiments
EXPERIMENT_PREFIX = "pipe-dev"
EXPERIMENT_NAME = "-".join([EXPERIMENT_PREFIX, MODEL_TYPE, FRAMEWORK, MODEL_VERSION])
RUN_NAME = "run-" + TIMESTAMP

# TODO:
DATA_REGIME = "small-jt-tfrecord"
IMAGE_URI = "gcr.io/hybrid-vertex/sp-2tower-tfrs-v15-v0-training"

# ====================================================
# Pipeline output repo
# ====================================================
OUTPUT_BUCKET = "jt-tfrs-test"
STAGING_BUCKET = "gs://" + OUTPUT_BUCKET

DOCKERNAME_TRAIN = "Dockerfile.tfrs"
REPO_DOCKER_PATH_PREFIX = "src"

# Stores pipeline executions for each run
PIPELINE_ROOT_PATH = f"{STAGING_BUCKET}/{MODEL_ROOT_NAME}/pipeline_root"
# PIPELINE_ROOT_PATH = f'gs://{OUTPUT_BUCKET}/{EXPERIMENT_NAME}/pipeline_root' # TODO: in future version
print(f"PIPELINE_ROOT_PATH: {PIPELINE_ROOT_PATH}")

# Optional: save and load pipeline definition
PIPELINES_FILEPATH = f'{PIPELINE_ROOT_PATH}/pipelines.json'
print("PIPELINES_FILEPATH:", PIPELINES_FILEPATH)

# PIPELINES_FILEPATH is normally a gs:// URI. os.path.isfile()/open() only understand
# local paths, so GCS locations are read and written through the storage client.
_PIPELINES_ON_GCS = PIPELINES_FILEPATH.startswith('gs://')
_pipelines_blob = (
    storage.Blob.from_string(PIPELINES_FILEPATH, client=storage_client)
    if _PIPELINES_ON_GCS
    else None
)

if _PIPELINES_ON_GCS and _pipelines_blob.exists():
    PIPELINES = json.loads(_pipelines_blob.download_as_bytes())
elif not _PIPELINES_ON_GCS and os.path.isfile(PIPELINES_FILEPATH):
    with open(PIPELINES_FILEPATH) as f:
        PIPELINES = json.load(f)
else:
    # Nothing saved yet: start with an empty registry
    PIPELINES = {}

def save_pipelines():
    """Write the PIPELINES dict as JSON to PIPELINES_FILEPATH (GCS object or local file)."""
    if _PIPELINES_ON_GCS:
        _pipelines_blob.upload_from_string(
            json.dumps(PIPELINES), content_type='application/json'
        )
    else:
        with open(PIPELINES_FILEPATH, 'w') as f:
            json.dump(PIPELINES, f)

PIPELINE_ROOT_PATH: gs://jt-tfrs-test/sp-2tower-tfrs-v15-v3/pipeline_root
PIPELINES_FILEPATH: gs://jt-tfrs-test/sp-2tower-tfrs-v15-v3/pipeline_root/pipelines.json


## Create Pipeline Components

In [7]:
# Recreate an empty `pipelines` folder under the repo source dir; the %%writefile
# cells below write the pipeline component modules into it.
! rm -rf {REPO_DOCKER_PATH_PREFIX}/pipelines
! mkdir {REPO_DOCKER_PATH_PREFIX}/pipelines
# ! touch {REPO_DOCKER_PATH_PREFIX}/pipelines/__init__.py

### TODO: Build custom train image

In [8]:
!export PWD=pwd
!export DOCKERNAME_TRAIN=DOCKERNAME_TRAIN
!export STAGING_BUCKET=STAGING_BUCKET
!export MODEL_VERSION=MODEL_VERSION
!export PIPELINE_VERSION=PIPELINE_VERSION
!export DATA_REGIME=DATA_REGIME
! echo $PWD
! echo $DOCKERNAME_TRAIN
! echo $STAGING_BUCKET
! echo $MODEL_VERSION
! echo $PIPELINE_VERSION
! echo $DATA_REGIME

/home/jupyter/spotify-tfrs
Dockerfile.tfrs
gs://jt-tfrs-test
v15
v3
small-jt-tfrecord


In [9]:
# Root GCS folder for this experiment's artifacts
BASE_OUTPUT_DIR = f'gs://{OUTPUT_BUCKET}/{MODEL_ROOT_NAME}/{EXPERIMENT_NAME}'

# (label, value, suffix) rows; a "\n" suffix leaves a blank line after that entry
_summary_rows = [
    ("APP", APP, ""),
    ("MODEL_TYPE", MODEL_TYPE, ""),
    ("FRAMEWORK", FRAMEWORK, ""),
    ("MODEL_VERSION", MODEL_VERSION, ""),
    ("PIPELINE_VERSION", PIPELINE_VERSION, "\n"),
    ("MODEL_ROOT_NAME", MODEL_ROOT_NAME, ""),
    ("OUTPUT_BUCKET", OUTPUT_BUCKET, ""),
    ("IMAGE_URI", IMAGE_URI, ""),
    ("EXPERIMENT_NAME", EXPERIMENT_NAME, "\n"),
    # ("RUN_NAME", RUN_NAME, ""),
    ("BASE_OUTPUT_DIR   ", BASE_OUTPUT_DIR, ""),
    ("PIPELINE_ROOT_PATH", PIPELINE_ROOT_PATH, ""),
]

for _label, _value, _suffix in _summary_rows:
    print(f"{_label}: {_value}{_suffix}")

APP: sp
MODEL_TYPE: 2tower
FRAMEWORK: tfrs
MODEL_VERSION: v15
PIPELINE_VERSION: v3

MODEL_ROOT_NAME: sp-2tower-tfrs-v15-v3
OUTPUT_BUCKET: jt-tfrs-test
IMAGE_URI: gcr.io/hybrid-vertex/sp-2tower-tfrs-v15-v0-training
EXPERIMENT_NAME: pipe-dev-2tower-tfrs-v15

BASE_OUTPUT_DIR   : gs://jt-tfrs-test/sp-2tower-tfrs-v15-v3/pipe-dev-2tower-tfrs-v15
PIPELINE_ROOT_PATH: gs://jt-tfrs-test/sp-2tower-tfrs-v15-v3/pipeline_root


#### TODO: parameterize...

In [45]:
# # # copy training Dockerfile
# ! gsutil cp $PWD/src/Dockerfile.tfrs $BUCKET_URI/$VERSION/src
!gsutil cp $PWD/src/Dockerfile.tfrs gs://jt-tfrs-test/sp-2tower-tfrs-v15-v3/pipe-dev-2tower-tfrs-v15/src/
!gsutil cp $PWD/src/cloudbuild.yaml gs://jt-tfrs-test/sp-2tower-tfrs-v15-v3/pipe-dev-2tower-tfrs-v15/src/

# # # copy training application code
! gsutil -m cp -r $PWD/src/trainer/* gs://jt-tfrs-test/sp-2tower-tfrs-v15-v3/pipe-dev-2tower-tfrs-v15/src/trainer

print(f"Copied training application code and Dockerfile to {BASE_OUTPUT_DIR}/src")

Copying file:///home/jupyter/spotify-tfrs/src/Dockerfile.tfrs [Content-Type=application/octet-stream]...
/ [1 files][  270.0 B/  270.0 B]                                                
Operation completed over 1 objects/270.0 B.                                      
Copying file:///home/jupyter/spotify-tfrs/src/cloudbuild.yaml [Content-Type=application/octet-stream]...
/ [1 files][  178.0 B/  178.0 B]                                                
Operation completed over 1 objects/178.0 B.                                      
Copying file:///home/jupyter/spotify-tfrs/src/trainer/__init__.py [Content-Type=text/x-python]...
Copying file:///home/jupyter/spotify-tfrs/src/trainer/_data.py [Content-Type=text/x-python]...
Copying file:///home/jupyter/spotify-tfrs/src/trainer/_model.py [Content-Type=text/x-python]...
Copying file:///home/jupyter/spotify-tfrs/src/trainer/interactive_train.py [Content-Type=text/x-python]...
Copying file:///home/jupyter/spotify-tfrs/src/trainer/requirements.t

In [10]:
# # # list copied files from GCS location
! gsutil ls -Rl gs://jt-tfrs-test/sp-2tower-tfrs-v15-v3/pipe-dev-2tower-tfrs-v15/src/

       178  2022-09-23T16:36:46Z  gs://jt-tfrs-test/sp-2tower-tfrs-v15-v3/pipe-dev-2tower-tfrs-v15/src
gs://jt-tfrs-test/sp-2tower-tfrs-v15-v3/pipe-dev-2tower-tfrs-v15/src/:
       270  2022-09-23T16:38:51Z  gs://jt-tfrs-test/sp-2tower-tfrs-v15-v3/pipe-dev-2tower-tfrs-v15/src/Dockerfile.tfrs
       178  2022-09-23T16:38:53Z  gs://jt-tfrs-test/sp-2tower-tfrs-v15-v3/pipe-dev-2tower-tfrs-v15/src/cloudbuild.yaml

gs://jt-tfrs-test/sp-2tower-tfrs-v15-v3/pipe-dev-2tower-tfrs-v15/src/trainer/:
         0  2022-09-23T16:38:56Z  gs://jt-tfrs-test/sp-2tower-tfrs-v15-v3/pipe-dev-2tower-tfrs-v15/src/trainer/__init__.py
      5651  2022-09-23T16:38:56Z  gs://jt-tfrs-test/sp-2tower-tfrs-v15-v3/pipe-dev-2tower-tfrs-v15/src/trainer/_data.py
     30991  2022-09-23T16:38:56Z  gs://jt-tfrs-test/sp-2tower-tfrs-v15-v3/pipe-dev-2tower-tfrs-v15/src/trainer/_model.py
        46  2022-09-23T16:38:56Z  gs://jt-tfrs-test/sp-2tower-tfrs-v15-v3/pipe-dev-2tower-tfrs-v15/src/trainer/interactive_train.py
       166  

In [11]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/pipelines/build_custom_train_image.py

import kfp
from typing import Any, Callable, Dict, NamedTuple, Optional, List
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,
                        OutputPath, component, Metrics)

@kfp.v2.dsl.component(
    base_image="gcr.io/google.com/cloudsdktool/cloud-sdk:latest",
    packages_to_install=["google-cloud-build"],
    # output_component_file="./pipelines/build_custom_train_image.yaml",
)
def build_custom_train_image(
    project: str, 
    gcs_train_script_path: str,   # TRAIN_APP_CODE_PATH = f"{BUCKET_URI}/{VERSION}/src/" # jt-tfrs-test/pipev1/src
    training_image_uri: str,      # TRAIN_IMAGE_URI = f"gcr.io/{PROJECT_ID}/multiworker:2tower-pipe-{VERSION}"
    train_dockerfile_name: str,   # Dockerfile.tfrs
) -> NamedTuple("Outputs", [("training_image_uri", str)]):

    # TODO: make output Artifact for image_uri
    """
    custom pipeline component to build custom training image using
    Cloud Build and the training application code and dependencies
    defined in the Dockerfile
    """

    import logging
    import os
    import time

    from google.cloud.devtools import cloudbuild_v1 as cloudbuild
    from google.protobuf.duration_pb2 import Duration

    # initialize client for cloud build
    logging.getLogger().setLevel(logging.INFO)
    build_client = cloudbuild.services.cloud_build.CloudBuildClient()
    
    logging.info(f"train_dockerfile_name: {train_dockerfile_name}")

    # parse step inputs to get path to Dockerfile and training application code
    gcs_dockerfile_path = os.path.join(gcs_train_script_path, f"{train_dockerfile_name}")   # two-tower-pipes/2tower-recsys/vertex_train
    # gcs_cloudbuild_path = os.path.join(gcs_train_script_path, f"cloudbuild.yaml")
    gcs_train_script_dir = os.path.join(gcs_train_script_path, "trainer/")  # TRAIN_APP_CODE_PATH = f"{BUCKET_URI}/{APP_NAME}/{VERSION}/vertex_train/"
    
    logging.info(f"gcs_dockerfile_path: {gcs_dockerfile_path}")
    # logging.info(f"gcs_cloudbuild_path: {gcs_cloudbuild_path}")
    logging.info(f"gcs_train_script_dir: {gcs_train_script_dir}")
    
    logging.info(f"training_image_uri: {training_image_uri}") 
     

    start_time = time.time()

    # define build steps to pull the training code and Dockerfile
    # and build/push the custom training container image
    build = cloudbuild.Build()
    build.steps = [
        {
            "name": "gcr.io/cloud-builders/gsutil",
            "args": ["cp", "-r", gcs_train_script_dir, "."],
        },
        {
            "name": "gcr.io/cloud-builders/gsutil",
            "args": ["cp", gcs_dockerfile_path, f"{train_dockerfile_name}"],
        },
        # enabling Kaniko cache in a Docker build that caches intermediate
        # layers and pushes image automatically to Container Registry
        # https://cloud.google.com/build/docs/kaniko-cache
        # {
        #     "name": "gcr.io/kaniko-project/executor:latest",
        #     # "name": "gcr.io/kaniko-project/executor:v1.8.0",        # TODO; downgraded to avoid error in build
        #     # "args": [f"--destination={training_image_uri}", "--cache=true"],
        #     "args": [f"--destination={training_image_uri}", "--cache=false"],
        # },
        {
            "name": "gcr.io/cloud-builders/docker",
            "args": ['build','-t', f'{training_image_uri}', '.'],
        },
        {
            "name": "gcr.io/cloud-builders/docker",
            "args": ['push', f'{training_image_uri}'], 
        },
    ]
    # override default timeout of 10min
    timeout = Duration()
    timeout.seconds = 7200
    build.timeout = timeout

    # create build
    operation = build_client.create_build(project_id=project, build=build)
    logging.info("IN PROGRESS:")
    logging.info(operation.metadata)
    
    end_time = time.time()
    elapsed_time = end_time - start_time

    logging.info(f"Elapsed timefor build: {elapsed_time}")

    # get build status
    result = operation.result()
    logging.info("RESULT:", result.status)
    
    logging.info(f"training_image_uri: {training_image_uri}")

    # return step outputs
    return (
        training_image_uri,
    )

Writing src/pipelines/build_custom_train_image.py


### Create Managed TensorBoard resource

In [12]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/pipelines/create_tensorboard.py

import kfp
from typing import Any, Callable, Dict, NamedTuple, Optional, List
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,
                        OutputPath, component, Metrics)

@kfp.v2.dsl.component(
    base_image='python:3.9',
    packages_to_install=[
        'google-cloud-aiplatform==1.17.0',
    ],
    # output_component_file="./pipelines/create_tensorboard.yaml",
)
def create_tensorboard(
    project: str,
    location: str,
    model_version: str,
    pipeline_version: str,
    gcs_bucket_name: str,
    model_display_name: str,
    create_tb_resource: bool,
) -> NamedTuple('Outputs', [
                            ('tensorboard', Artifact),
                            ('tensorboard_resource_name', str),
]):

    import google.cloud.aiplatform as vertex_ai
    from datetime import datetime
    import logging

    # TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

    vertex_ai.init(
        project=project,
        location=location,
    )

    TENSORBOARD_DISPLAY_NAME = f"tb-{model_display_name}-{model_version}"

    if create_tb_resource==True:
        logging.info(f"TENSORBOARD_DISPLAY_NAME: {TENSORBOARD_DISPLAY_NAME}")

        tensorboard = vertex_ai.Tensorboard.create(display_name=TENSORBOARD_DISPLAY_NAME)

        tensorboard_resource_name = tensorboard.resource_name # projects/934903580331/locations/us-central1/tensorboards/6275818857298919424

        logging.info(f"Created tensorboard_resource_name: {tensorboard_resource_name}")

    else:
        logging.info(f"Searching for Existing TB: {TENSORBOARD_DISPLAY_NAME}")

        _tb_resource = vertex_ai.TensorboardExperiment.list(
            filter=f'display_name="{TENSORBOARD_DISPLAY_NAME}"'
        )[0]

        # retrieve endpoint uri
        tensorboard_resource_name = _tb_resource.resource_name
        logging.info(f"Found existing TB resource: {tensorboard_resource_name}")

        tensorboard = vertex_ai.Tensorboard(f'{tensorboard_resource_name}')

    return (
        tensorboard,
        f'{tensorboard_resource_name}',
    )

Writing src/pipelines/create_tensorboard.py


### Build vocabs and stats

**TODO:**
* parallel tasks for `candidates` and `playlist` data
* separate calcs for `train` and `valid` splits

In [13]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/pipelines/build_vocabs_stats.py

import kfp
from typing import Any, Callable, Dict, NamedTuple, Optional, List
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,
                        OutputPath, component, Metrics)
@kfp.v2.dsl.component(
    base_image='python:3.9',
    packages_to_install=[
        'google-cloud-aiplatform==1.17.0',
        'tensorflow==2.9.2',
        'tensorflow-recommenders==0.7.0',
        'numpy',
        'google-cloud-storage',
    ],
    # output_component_file="./pipelines/build_vocabs_stats.yaml",
)
def build_vocabs_string_lookups(
    project: str,
    location: str,
    # version: str,
    model_version: str,
    pipeline_version: str,
    train_output_gcs_bucket: str,
    train_dir: str,
    train_dir_prefix: str,
    valid_dir: str,
    valid_dir_prefix: str,
    candidate_file_dir: str,
    candidate_files_prefix: str,
    experiment_name: str,
    experiment_run: str, 
    # gcs_bucket_name: str,
    # path_to_train_dir: str,
    # path_to_valid_dir: str,
    # path_to_candidate_dir: str,
    path_to_vocab_file: str,
    vocab_file_name: str,
    generate_vocabs_stats: bool,
    max_padding: int = 375,
    split_names: list = ['train', 'valid'],
) -> NamedTuple('Outputs', [
                            ('vocab_dict', Artifact),
                            ('vocab_gcs_filename', str),
                            ('vocab_gcs_sub_dir', str),
                            ('vocab_gcs_uri', str),
]):
    
    from google.cloud import aiplatform as vertex_ai
    import json
    import logging
    import json
    import tensorflow as tf
    import tensorflow_recommenders as tfrs
    import numpy as np
    import pickle as pkl
    from google.cloud import storage
    from datetime import datetime
    import numpy as np
    import string

    # TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")
    # logging.info(f'TIMESTAMP: {TIMESTAMP}')
    
    MAX_PLAYLIST_LENGTH = max_padding       # 375
    logging.info(f'MAX_PLAYLIST_LENGTH: {MAX_PLAYLIST_LENGTH}')

    vertex_ai.init(
        project=project,
        location=location,
    )
    
    ################################################################################
    # Helper Functions for feature parsing
    ################################################################################
    
    candidate_features = {
        'track_name_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        'artist_name_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        'album_name_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        'track_uri_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        'artist_uri_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        'album_uri_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        'duration_ms_can': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        'track_pop_can': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        'artist_pop_can': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        'artist_genres_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        'artist_followers_can': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    }

    all_features = {
        'track_name_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        'artist_name_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        'album_name_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        'track_uri_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        'artist_uri_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        'album_uri_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        'duration_ms_can': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        'track_pop_can': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        'artist_pop_can': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        'artist_genres_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        'artist_followers_can': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        'pos_seed_track': tf.io.FixedLenFeature(dtype=tf.int64, shape=()),
        'track_name_seed_track': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        'artist_name_seed_track': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        'album_name_seed_track': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        'track_uri_seed_track': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        'artist_uri_seed_track': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        'album_uri_seed_track': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        'duration_seed_track': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        'track_pop_seed_track': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        'artist_pop_seed_track': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        'artist_genres_seed_track': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        'artist_followers_seed_track': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        'pid': tf.io.FixedLenFeature(dtype=tf.int64, shape=()),
        'name': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        'collaborative': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        'duration_ms_seed_pl': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        'n_songs_pl': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        'num_artists_pl': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        'num_albums_pl': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        'description_pl': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        ###ragged
        'track_name_pl': tf.io.RaggedFeature(tf.string),
        'artist_name_pl': tf.io.RaggedFeature(tf.string),
        'album_name_pl': tf.io.RaggedFeature(tf.string),
        'track_uri_pl': tf.io.RaggedFeature(tf.string),
        'duration_ms_songs_pl': tf.io.RaggedFeature(tf.float32),
        'artist_pop_pl': tf.io.RaggedFeature(tf.float32),
        'artists_followers_pl': tf.io.RaggedFeature(tf.float32),
        'track_pop_pl': tf.io.RaggedFeature(tf.float32),
        'artist_genres_pl': tf.io.RaggedFeature(tf.string),
    }
    
    def parse_candidate_tfrecord_fn(example, feature_dict=candidate_features):
        example = tf.io.parse_single_example(
            example, 
            features=feature_dict
        )
        return example
    
    def parse_tfrecord_fn(example, feature_dict=all_features): # =all_features
        example = tf.io.parse_single_example(
            example, 
            features=feature_dict
        )
        return example

    def pad_up_to(t, max_in_dims=[1 ,MAX_PLAYLIST_LENGTH], constant_value=''):
        s = tf.shape(t)
        paddings = [[0, m-s[i]] for (i,m) in enumerate(max_in_dims)]
        return tf.pad(t, paddings, 'CONSTANT', constant_values=constant_value)

    def return_padded_tensors(data):

        a = pad_up_to(tf.reshape(data['track_name_pl'], shape=(1,-1)) , constant_value='')
        b = pad_up_to(tf.reshape(data['artist_name_pl'], shape=(1,-1)) , constant_value='')
        c = pad_up_to(tf.reshape(data['album_name_pl'], shape=(1,-1)) , constant_value='')
        d = pad_up_to(tf.reshape(data['track_uri_pl'], shape=(1, -1,)) , constant_value='')
        e = pad_up_to(tf.reshape(data['duration_ms_songs_pl'], shape=(1,-1)) , constant_value=-1.)
        f = pad_up_to(tf.reshape(data['artist_pop_pl'], shape=(1,-1)) , constant_value=-1.)
        g = pad_up_to(tf.reshape(data['artists_followers_pl'], shape=(1,-1)) , constant_value=-1.)
        h = pad_up_to(tf.reshape(data['track_pop_pl'], shape=(1,-1)) , constant_value=-1.)
        i = pad_up_to(tf.reshape(data['artist_genres_pl'], shape=(1,-1)) , constant_value='')

        padded_data = data.copy()
        padded_data['track_name_pl'] = a
        padded_data['artist_name_pl'] = b
        padded_data['album_name_pl'] = c
        padded_data['track_uri_pl'] = d
        padded_data['duration_ms_songs_pl'] = e
        padded_data['artist_pop_pl'] = f
        padded_data['artists_followers_pl'] = g
        padded_data['track_pop_pl'] = h
        padded_data['artist_genres_pl'] = i

        return padded_data
    
    storage_client = storage.Client()
    
    if generate_vocabs_stats == False:
        VOCAB_FILENAME = f'{vocab_file_name}'
        VOCAB_GCS_SUB_DIR = f'{path_to_vocab_file}'
        VOCAB_GCS_URI = f'gs://{train_output_gcs_bucket}/{VOCAB_GCS_SUB_DIR}/{VOCAB_FILENAME}'
        
        logging.info(f"Loading Vocab File: {VOCAB_FILENAME} from {VOCAB_GCS_URI}")
        
        with open(f'{VOCAB_FILENAME}', 'wb') as file_obj:
            storage_client.download_blob_to_file(
                f'gs://{VOCAB_GCS_URI}', file_obj)


        with open(f'{VOCAB_FILENAME}', 'rb') as pickle_file:
            vocab_dict = pkl.load(pickle_file)
        
    else:
        # ========================================================================
        # Candidate dataset
        # ========================================================================

        candidate_files = []
        for blob in storage_client.list_blobs(f'{candidate_file_dir}', prefix=f'{candidate_files_prefix}', delimiter="/"):
            candidate_files.append(blob.public_url.replace("https://storage.googleapis.com/", "gs://"))

        raw_candidate_dataset = tf.data.TFRecordDataset(candidate_files)
        parsed_candidate_dataset = raw_candidate_dataset.map(parse_candidate_tfrecord_fn)

        '''
        Get vocabularies of unique values for strings
        '''
        logging.info(f"Getting unqiue values for `artist_name_can`...")
        v_artist_name_can = np.unique(
          np.concatenate(
              list(
                  parsed_candidate_dataset.map(
                      lambda x: x['artist_name_can']
                  )
                  .batch(1000)
              )
          )
        )
        v_artist_name_can_cleaned = [str(z).replace("b'","").translate(str.maketrans('', '', string.punctuation)) for z in v_artist_name_can]
    
        # v_track_uri_can = np.unique(
        #   np.concatenate(
        #       list(
        #           parsed_candidate_dataset.map(
        #               lambda x: x['track_uri_can']
        #           )
        #           .batch(1000)
        #       )
        #   )
        # )
        # v_track_uri_can_cleaned = [str(z).replace("b'","").translate(str.maketrans('', '', string.punctuation)) for z in v_track_uri_can]
        
        logging.info(f"Getting unqiue values for `track_name_can`...") 
        v_track_name_can = np.unique(
          np.concatenate(
              list(
                  parsed_candidate_dataset.map(
                      lambda x: x['track_name_can']
                  )
                  .batch(1000)
              )
          )
        )
        v_track_name_can_cleaned = [str(z).replace("b'","").translate(str.maketrans('', '', string.punctuation)) for z in v_track_name_can]
    
        logging.info(f"Getting unqiue values for `album_name_can`...")
        v_album_name_can = np.unique(
          np.concatenate(
              list(
                  parsed_candidate_dataset.map(
                      lambda x: x['album_name_can']
                  )
                  .batch(1000)
              )
          )
        )
        v_album_name_can_cleaned = [str(z).replace("b'","").translate(str.maketrans('', '', string.punctuation)) for z in v_album_name_can]

        logging.info(f"Getting unqiue values for `artist_genres_can`...")
        v_artist_genres_can = np.unique(
          np.concatenate(
              list(
                  parsed_candidate_dataset.map(
                      lambda x: x['artist_genres_can']
                  )
                  .batch(1000)
              )
          )
        )
        v_artist_genres_can_cleaned = [str(z).replace("b'","").translate(str.maketrans('', '', string.punctuation)) for z in v_artist_genres_can]

        # ========================================================================
        # TODO: parameterize / automate
        # ========================================================================
        avg_duration_ms_seed_pl = 13000151.68
        var_duration_ms_seed_pl = 133092900971233.58

        avg_n_songs_pl = 55.21
        var_n_songs_pl = 2317.54

        avg_n_artists_pl = 30.56
        var_n_artists_pl = 769.26

        avg_n_albums_pl = 40.25
        var_n_albums_pl = 1305.54

        avg_artist_pop = 16.08
        var_artist_pop = 300.64

        avg_duration_ms_songs_pl = 234823.14
        var_duration_ms_songs_pl = 5558806228.41

        avg_artist_followers = 43337.77
        var_artist_followers = 377777790193.57

        avg_track_pop = 10.85
        var_track_pop = 202.18

        # ========================================================================
        # Create Vocab Dict
        # ========================================================================
    
        vocab_dict = {
            'artist_name_can': v_artist_name_can_cleaned,
            'track_name_can': v_track_name_can_cleaned,
            'album_name_can': v_album_name_can_cleaned,
            'artist_genres_can': v_artist_genres_can_cleaned,
            'avg_duration_ms_seed_pl': avg_duration_ms_seed_pl,
            'var_duration_ms_seed_pl': var_duration_ms_seed_pl,
            'avg_n_songs_pl': avg_n_songs_pl,
            'var_n_songs_pl': var_n_songs_pl,
            'avg_n_artists_pl': avg_n_artists_pl,
            'var_n_artists_pl': var_n_artists_pl,
            'avg_n_albums_pl': avg_n_albums_pl,
            'var_n_albums_pl': var_n_albums_pl,
            'avg_artist_pop': avg_artist_pop,
            'var_artist_pop': var_artist_pop,
            'avg_duration_ms_songs_pl': avg_duration_ms_songs_pl,
            'var_duration_ms_songs_pl': var_duration_ms_songs_pl,
            'avg_artist_followers': avg_artist_followers,
            'var_artist_followers': var_artist_followers,
            'avg_track_pop': avg_track_pop,
            'var_track_pop': var_track_pop,
        }
        VOCAB_FILENAME = f'vocab_dict_{experiment_run}.pkl'
        VOCAB_GCS_SUB_DIR = f'{experiment_name}/{experiment_run}/vocabs_stats'
        VOCAB_GCS_URI = f'gs://{train_output_gcs_bucket}/{VOCAB_GCS_SUB_DIR}/{VOCAB_FILENAME}'
        
        logging.info(f"Saving Vocab File: {VOCAB_FILENAME} to {VOCAB_GCS_URI}")
        
        # Upload vocab_dict to GCS
        bucket = storage_client.bucket(train_output_gcs_bucket)
        blob = bucket.blob(f'{VOCAB_GCS_SUB_DIR}/{VOCAB_FILENAME}')
        pickle_out = pkl.dumps(vocab_dict)
        blob.upload_from_string(pickle_out)
        
        return (
            vocab_dict,
            f'{VOCAB_FILENAME}',
            f'{VOCAB_GCS_SUB_DIR}',
            f'{VOCAB_GCS_URI}',
        )

Writing src/pipelines/build_vocabs_stats.py


### Train Custom Two-Tower model

In [14]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/pipelines/train_custom_model.py

import kfp
from typing import Any, Callable, Dict, NamedTuple, Optional, List
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,
                        OutputPath, component, Metrics)
@kfp.v2.dsl.component(
    base_image='python:3.9',
    packages_to_install=[
        'google-cloud-aiplatform==1.17.0',
        'tensorflow==2.9.2',
        'tensorflow-recommenders==0.7.0',
        'numpy',
        'google-cloud-storage',
    ],
    # output_component_file="./pipelines/train_custom_model.yaml",
)
def train_custom_model(
    project: str,
    # version: str,
    model_version: str,
    pipeline_version: str,
    model_name: str, 
    worker_pool_specs: dict,
    vocab_dict_uri: str, 
    train_output_gcs_bucket: str,                         # change to workdir?
    training_image_uri: str,
    tensorboard_resource_name: str,
    service_account: str,
    experiment_name: str,
    experiment_run: str,
) -> NamedTuple('Outputs', [
    ('query_tower_dir_uri', str),
    ('candidate_tower_dir_uri', str),
    # ('candidate_index_dir_uri', str),
]):
    
    from google.cloud import aiplatform as vertex_ai
    import logging
    import numpy as np
    
    vertex_ai.init(
        project=project,
        location='us-central1',
    )
    
    JOB_NAME = f'train-{model_name}'
    logging.info(f'JOB_NAME: {JOB_NAME}')
    
    BASE_OUTPUT_DIR = f'gs://{train_output_gcs_bucket}/{experiment_name}/{experiment_run}'
    logging.info(f'BASE_OUTPUT_DIR: {BASE_OUTPUT_DIR}')
    
    logging.info(f'vocab_dict_uri: {vocab_dict_uri}')
    
    logging.info(f'tensorboard_resource_name: {tensorboard_resource_name}')
    logging.info(f'service_account: {service_account}')
    logging.info(f'worker_pool_specs: {worker_pool_specs}')
  
    job = vertex_ai.CustomJob(
        display_name=JOB_NAME,
        worker_pool_specs=worker_pool_specs,
        staging_bucket=BASE_OUTPUT_DIR,
    )
    
    logging.info(f'Submitting train job to Vertex AI...')

    job.run(
        tensorboard=tensorboard_resource_name,
        service_account=f'{service_account}',
        restart_job_on_worker_restart=False,
        enable_web_access=True,
        sync=False,
    )
    
    # TODO: this is hardcoded, but created in train job
    query_tower_dir_uri = f"gs://{output_dir_gcs_bucket_name}/{experiment_name}/{experiment_run}/model-dir/query_tower" 
    candidate_tower_dir_uri = f"gs://{output_dir_gcs_bucket_name}/{experiment_name}/{experiment_run}/model-dir/candidate_tower"
    # candidate_index_dir_uri = f"gs://{output_dir_gcs_bucket_name}/{experiment_name}/{experiment_run}/candidate-index"
    
    return (
        f'{query_tower_dir_uri}',
        f'{candidate_tower_dir_uri}',
        # f'{candidate_index_dir_uri}',
    )

Writing src/pipelines/train_custom_model.py


### Find Model Endpoint

> TODO: inspect component behavior for different scenarios

In [15]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/pipelines/find_model_endpoint_test.py

import kfp
from typing import Any, Callable, Dict, NamedTuple, Optional, List
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,
                        OutputPath, component, Metrics)

@kfp.v2.dsl.component(
    base_image='python:3.9',
    packages_to_install=['google-cloud-aiplatform==1.17.0'],
    # output_component_file="./pipelines/find_model_endpoint.yaml",
)
def find_model_endpoint_test(
    project: str,
    location: str,
    endpoint_name: str,
) -> NamedTuple('Outputs', [
                            ('create_new_endpoint', str),
                            ('existing_endpoint_uri', str),
                            ('deployed_models_count', int),
                            ('undeploy_model_needed', str),
                            ('deployed_model_list', list),
                            ('endpoint_traffic_split', str),
]):
    """Check whether a Vertex AI endpoint with the given display name exists.

    When an endpoint is found, the first match is inspected and its resource
    name, deployed-model IDs and traffic split are reported so later pipeline
    steps can decide whether to reuse it and whether models must be undeployed.
    When none is found, placeholder values are returned and
    `create_new_endpoint` is "True".

    Flags are returned as the strings "True"/"False" because the pipeline
    compares them against string literals in its conditional steps.

    Args:
        project: GCP project used to initialise the Vertex AI SDK.
        location: Vertex AI region that is searched for the endpoint.
        endpoint_name: Endpoint display name to match exactly.

    Returns:
        Outputs tuple of:
            create_new_endpoint: "True" if no matching endpoint exists, else "False".
            existing_endpoint_uri: resource name of the matched endpoint, or "N/A".
            deployed_models_count: number of models deployed on the endpoint
                (0 when the endpoint does not exist).
            undeploy_model_needed: "True" when more than one model is deployed,
                "False" otherwise; "N/A" when the endpoint does not exist.
            deployed_model_list: IDs of the currently deployed models.
            endpoint_traffic_split: string form of the endpoint's traffic
                split, or "N/A".
    """

    from google.cloud import aiplatform as vertex_ai
    import logging

    vertex_ai.init(
        project=project,
        location=location,
    )

    logging.info(f"Searching for model endpoint: {endpoint_name}")

    # Query once and reuse the result: listing twice costs an extra API call
    # and could fail with IndexError if the endpoint is deleted in between.
    matching_endpoints = vertex_ai.Endpoint.list(
        filter=f'display_name="{endpoint_name}"'
    )

    if matching_endpoints:
        # Because an existing endpoint was found:
        #   (1) no new endpoint will be created
        #   (2) the endpoint uri is needed
        #   (3) the list of models deployed on this endpoint is needed
        #   (4) if more than 1 deployed model exists, a subsequent conditional
        #       step is triggered to undeploy all but 1 (latest) model
        logging.info(f"Model endpoint, {endpoint_name}, already exists")
        create_new_endpoint = "False"

        _endpoint = matching_endpoints[0]
        logging.info(f"Parsing details for _endpoint: {_endpoint}")

        # retrieve endpoint uri
        existing_endpoint_uri = _endpoint.resource_name
        logging.info(f"existing_endpoint_uri: {existing_endpoint_uri}")
        _traffic_split = _endpoint.traffic_split

        # retrieve deployed model IDs (empty list when nothing is deployed)
        deployed_models = _endpoint.gca_resource.deployed_models
        deployed_models_count = len(deployed_models)
        logging.info(f"deployed_models_count: {deployed_models_count}")

        deployed_model_list = [model.id for model in deployed_models]

        # arbitrary assumption: no more than 2 (3) models per model_endpoint
        undeploy_model_needed = "True" if deployed_models_count > 1 else "False"

        logging.info(f"Currently deployed_model_list {deployed_model_list}")

    else:
        logging.info(f"Model endpoint, {endpoint_name}, does not exist")

        create_new_endpoint = "True"
        deployed_models_count = 0
        existing_endpoint_uri = "N/A"
        undeploy_model_needed = "N/A"
        _traffic_split = "N/A"
        deployed_model_list = []

        logging.info(f"create_new_endpoint {create_new_endpoint}")
        logging.info(f"existing_endpoint_uri {existing_endpoint_uri}")
        logging.info(f"deployed_models_count {deployed_models_count}")
        logging.info(f"undeploy_model_needed {undeploy_model_needed}")
        logging.info(f"deployed_model_list {deployed_model_list}")
        logging.info(f"_traffic_split {_traffic_split}")

    return (
        f'{create_new_endpoint}',
        f'{existing_endpoint_uri}',
        deployed_models_count,
        f'{undeploy_model_needed}',
        deployed_model_list,
        f'{_traffic_split}',
    )

Writing src/pipelines/find_model_endpoint_test.py


### Generate Candidate Embeddings

In [16]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/pipelines/generate_candidate_embedding_index.py

import kfp
from typing import Any, Callable, Dict, NamedTuple, Optional, List
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,
                        OutputPath, component, Metrics)
@kfp.v2.dsl.component(
    base_image="python:3.9",
    packages_to_install=[
        'google-cloud-aiplatform==1.17.0',
        'tensorflow==2.9.2',
        'google-cloud-storage',
    ],
    # output_component_file="./pipelines/generate_candidate_embedding_index.yaml",
)
def generate_candidate_embedding_index(
    project: str,
    location: str,
    # version: str,
    model_version: str,
    pipeline_version: str,
    train_output_gcs_bucket: str,
    # data_bucket_name: str,
    model_dir: str,
    candidate_file_dir: str,
    candidate_files_prefix: str,
    # candidate_items_prefix: str,
    experiment_name: str,
    experiment_run: str,
    embedding_index_destination_gcs_uri: str,
    uploaded_candidate_model_resources: str,
) -> NamedTuple('Outputs', [('candidate_embedding_index_file_uri', str),
                            ('embedding_index_gcs_bucket', str),
]):
    """Embed every candidate track with the trained candidate tower and upload the result.

    Steps:
        1. Load the candidate-tower SavedModel from `model_dir`.
        2. Read all candidate TFRecord files under
           `gs://{candidate_file_dir}/{candidate_files_prefix}`.
        3. Run each record through the model's `serving_default` signature.
        4. Drop records whose embedding contains NaN.
        5. Write one JSON object per line (`{"id": ..., "embedding": [...]}`)
           and upload the file to GCS.

    Args:
        project: GCP project for the Storage and Vertex AI clients.
        location: Vertex AI region.
        model_version: Used in the name of the embeddings file.
        pipeline_version: Not read by this component.
        train_output_gcs_bucket: Bucket (name only) that receives the embeddings file.
        model_dir: URI of the candidate-tower SavedModel directory.
        candidate_file_dir: Bucket (name only) holding the candidate TFRecords.
        candidate_files_prefix: Object prefix of the candidate TFRecords.
        experiment_name: Path segment of the upload location.
        experiment_run: Path segment of the upload location and of the file name.
        embedding_index_destination_gcs_uri: Base URI used to build the
            returned file URI.
        uploaded_candidate_model_resources: Not read by this component; it is
            wired to the candidate model upload step's output in the pipeline.

    Returns:
        Outputs tuple of:
            candidate_embedding_index_file_uri:
                `{embedding_index_destination_gcs_uri}/{file name}`.
            embedding_index_gcs_bucket: GCS directory the file was uploaded to.

    Raises:
        ValueError: if no object ending in 'tfrecords' is found under the
            candidate prefix.
    """

    from google.cloud import storage
    from google.cloud.storage.blob import Blob

    from google.cloud import aiplatform as vertex_ai
    import tensorflow as tf
    import logging
    import os
    import numpy as np

    # initialize clients
    storage_client = storage.Client(project=project)
    vertex_ai.init(project=project, location=location)

    # ========================================================================
    # Helper Functions
    # ========================================================================

    candidate_features = {
        'track_name_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        'artist_name_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        'album_name_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        'track_uri_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        'artist_uri_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        'album_uri_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        'duration_ms_can': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        'track_pop_can': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        'artist_pop_can': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        'artist_genres_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        'artist_followers_can': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    }

    def parse_candidate_tfrecord_fn(example):
        """Parse one serialized tf.Example into a dict of candidate features."""
        return tf.io.parse_single_example(
            example,
            features=candidate_features
        )

    # ========================================================================
    # Load Saved Model
    # ========================================================================
    # TODO: use model uploaded to Vertex Registry

    logging.info(f"loaded_candidate_tower from model_dir: {model_dir}")
    loaded_candidate_tower = tf.saved_model.load(model_dir)

    candidate_predictor = loaded_candidate_tower.signatures["serving_default"]

    # ========================================================================
    # Candidate Dataset
    # ========================================================================

    # Fix: the prefix parameter is `candidate_files_prefix`; the previous code
    # referenced the undefined name `candidate_items_prefix` (NameError).
    candidate_files = []
    for blob in storage_client.list_blobs(
        f'{candidate_file_dir}',
        prefix=f'{candidate_files_prefix}',
        delimiter="/",
    ):
        if blob.name.endswith('tfrecords'):
            # Build the gs:// URI from bucket + object name directly;
            # `public_url` percent-encodes the object name.
            candidate_files.append(f'gs://{blob.bucket.name}/{blob.name}')

    if not candidate_files:
        # NOTE(review): only names ending in 'tfrecords' are accepted - confirm
        # the candidate files are not stored with a '.tfrecord' suffix.
        raise ValueError(
            "No candidate files ending in 'tfrecords' found under "
            f"gs://{candidate_file_dir}/{candidate_files_prefix}"
        )

    raw_dataset = tf.data.TFRecordDataset(candidate_files)
    parsed_candidate_dataset = raw_dataset.map(parse_candidate_tfrecord_fn)

    # ========================================================================
    # Convert candidates to embeddings
    # ========================================================================

    embs_iter = parsed_candidate_dataset.batch(1).map(
        lambda data: candidate_predictor(
            artist_name_can = data["artist_name_can"],
            track_name_can = data['track_name_can'],
            album_name_can = data['album_name_can'],
            track_uri_can = data['track_uri_can'],
            artist_uri_can = data['artist_uri_can'],
            album_uri_can = data['album_uri_can'],
            duration_ms_can = data['duration_ms_can'],
            track_pop_can = data['track_pop_can'],
            artist_pop_can = data['artist_pop_can'],
            artist_followers_can = data['artist_followers_can'],
            artist_genres_can = data['artist_genres_can']
        )
    )

    embs = list(embs_iter)

    print(f"Length of embs: {len(embs)}")

    # each prediction is a batch of one; keep only the embedding vector
    cleaned_embs = [x['output_1'].numpy()[0] for x in embs]

    print(f"Length of cleaned_embs: {len(cleaned_embs)}")

    # ========================================================================
    # candidate track uri (IDs)
    # ========================================================================

    # The dataset is read a second time in the same (unshuffled) order so the
    # IDs line up positionally with `cleaned_embs`.
    track_uris = [x['track_uri_can'].numpy() for x in parsed_candidate_dataset]
    # Decode the raw bytes instead of string-hacking their repr (b'...').
    track_uris_cleaned = [z.decode('utf-8') for z in track_uris]

    logging.info(f"Length of track_uris: {len(track_uris)}")
    logging.info(f"Length of track_uris_cleaned: {len(track_uris_cleaned)}")

    # ========================================================================
    # Remove bad records (e.g., nans)
    # ========================================================================

    nan_value_count = 0          # total NaN values across all embeddings
    bad_record_filter = set()    # positions of embeddings with at least one NaN

    for i, emb in enumerate(cleaned_embs):
        nans_in_emb = int(np.isnan(emb).sum())
        if nans_in_emb:
            nan_value_count += nans_in_emb
            bad_record_filter.add(i)

    logging.info(f"bad_records: {nan_value_count}")
    logging.info(f"bad_record_filter: {len(bad_record_filter)}")

    # ========================================================================
    # zip good and bad records separately
    # ========================================================================

    track_uris_valid = []
    emb_valid = []

    bad_record_ids = []
    bad_record_embs = []

    for i, (t_uri, embed) in enumerate(zip(track_uris_cleaned, cleaned_embs)):
        if i in bad_record_filter:
            bad_record_ids.append(t_uri)
            bad_record_embs.append(embed)
        else:
            track_uris_valid.append(t_uri)
            emb_valid.append(embed)

    logging.info(f"Length of emb_valid             : {len(emb_valid)}")
    logging.info(f"Length of track_uris_valid      : {len(track_uris_valid)}")

    logging.info(f"Length of bad_record_ids        : {len(bad_record_ids)}")
    logging.info(f"Length of bad_record_embs       : {len(bad_record_embs)}")

    # ========================================================================
    # Write json file
    # ========================================================================

    embeddings_index_filename = f'candidate_embeddings_{model_version}_{experiment_run}.json'

    logging.info(f"Writting embedding vectors to file: {embeddings_index_filename}")

    # one JSON object per line: {"id": <track uri>, "embedding": [<floats>]}
    with open(f'{embeddings_index_filename}', 'w') as f:
        for prod, emb in zip(track_uris_valid, emb_valid):
            f.write('{"id":"' + str(prod) + '",')
            f.write('"embedding":[' + ",".join(str(x) for x in list(emb)) + "]}")
            f.write("\n")

    # ========================================================================
    # Upload embeddings file to GCS
    # ========================================================================

    EMBEDDING_INDEX_DIR_GCS_URI = f'gs://{train_output_gcs_bucket}/{experiment_name}/{experiment_run}/candidate-embeddings/corpus-index-dir'

    logging.info(f"Uploading emebdding vectors to : {EMBEDDING_INDEX_DIR_GCS_URI}/{embeddings_index_filename}")
    blob = Blob.from_string(os.path.join(EMBEDDING_INDEX_DIR_GCS_URI, embeddings_index_filename))
    blob.bucket._client = storage_client
    blob.upload_from_filename(embeddings_index_filename)

    # NOTE(review): the file is uploaded under EMBEDDING_INDEX_DIR_GCS_URI, but
    # the URI returned below is built from `embedding_index_destination_gcs_uri`.
    # The two only agree if the caller passes the same location - confirm.
    embedding_index_file_uri = f'{embedding_index_destination_gcs_uri}/{embeddings_index_filename}'
    logging.info(f"Saved embedding vectors for candidate corpus: {embedding_index_file_uri}")

    # Note: TODO: considerations for storing files used to create index and index updates

    return (
        f'{embedding_index_file_uri}',
        f'{EMBEDDING_INDEX_DIR_GCS_URI}',
    )

Writing src/pipelines/generate_candidate_embedding_index.py


## Prepare Job Specs

### Vertex Train: workerpool specs

In [17]:
def prepare_worker_pool_specs(
    image_uri,
    args,
    cmd,
    replica_count=1,
    machine_type="n1-standard-16",
    accelerator_count=1,
    accelerator_type="ACCELERATOR_TYPE_UNSPECIFIED",
    reduction_server_count=0,
    reduction_server_machine_type="n1-highcpu-16",
    reduction_server_image_uri="us-docker.pkg.dev/vertex-ai-restricted/training/reductionserver:latest",
):
    """Build the `worker_pool_specs` list for a Vertex AI custom training job.

    The list always starts with a single-replica chief pool. A worker pool
    with the remaining `replica_count - 1` replicas is appended when
    `replica_count > 1`, and a reduction-server pool is appended when
    `reduction_server_count > 0`.

    Args:
        image_uri: Training container image, used by the chief and workers.
        args: Arguments passed to the training container.
        cmd: Command run in the training container.
        replica_count: Total number of training replicas (chief included).
        machine_type: Machine type for the chief and workers.
        accelerator_count: Accelerators per machine; when 0, no accelerator
            fields are emitted.
        accelerator_type: Accelerator type for the chief and workers.
        reduction_server_count: Number of reduction-server replicas; 0 disables.
        reduction_server_machine_type: Machine type for reduction servers.
        reduction_server_image_uri: Container image for reduction servers.

    Returns:
        A list of worker pool spec dicts, in the order chief, workers,
        reduction servers (absent pools are omitted).
    """
    if accelerator_count > 0:
        machine_spec = {
            "machine_type": machine_type,
            "accelerator_type": accelerator_type,
            "accelerator_count": accelerator_count,
        }
    else:
        machine_spec = {"machine_type": machine_type}

    container_spec = {
        "image_uri": image_uri,
        "args": args,
        "command": cmd,
    }

    chief_spec = {
        "replica_count": 1,
        "machine_spec": machine_spec,
        "container_spec": container_spec,
    }

    worker_pool_specs = [chief_spec]

    if replica_count > 1:
        workers_spec = {
            "replica_count": replica_count - 1,
            "machine_spec": machine_spec,
            "container_spec": container_spec,
        }
        worker_pool_specs.append(workers_spec)

    # Fix: was `> 1`, which silently ignored a request for exactly one
    # reduction server.
    # NOTE(review): when replica_count == 1 this pool lands at index 1 of the
    # list - confirm that matches the pool ordering Vertex AI expects.
    if reduction_server_count > 0:
        reduction_server_spec = {
            "replica_count": reduction_server_count,
            "machine_spec": {
                "machine_type": reduction_server_machine_type,
            },
            "container_spec": {"image_uri": reduction_server_image_uri},
        }
        worker_pool_specs.append(reduction_server_spec)

    return worker_pool_specs

#### Accelerators and Device Strategy

In [18]:
import time

# Hardware / distribution profile for the Vertex AI training job.
# Keep exactly one of the three profiles below uncommented. The values are
# passed to `prepare_worker_pool_specs(...)` and to the trainer's
# `--distribute` flag in the "Training Config" cell.

# --- Active profile: single machine, single GPU ---
WORKER_MACHINE_TYPE = 'a2-highgpu-1g'
REPLICA_COUNT = 1
ACCELERATOR_TYPE = 'NVIDIA_TESLA_A100'
PER_MACHINE_ACCELERATOR_COUNT = 1
REDUCTION_SERVER_COUNT = 0
REDUCTION_SERVER_MACHINE_TYPE = "n1-highcpu-16"
DISTRIBUTE_STRATEGY = 'single'

# --- Alternative profile: single machine, multiple GPUs ---
# WORKER_MACHINE_TYPE = 'a2-highgpu-4g' # a2-ultragpu-4g
# REPLICA_COUNT = 1
# ACCELERATOR_TYPE = 'NVIDIA_TESLA_A100'
# PER_MACHINE_ACCELERATOR_COUNT = 4
# REDUCTION_SERVER_COUNT = 0
# REDUCTION_SERVER_MACHINE_TYPE = "n1-highcpu-16"
# DISTRIBUTE_STRATEGY = 'mirrored'

# --- Alternative profile: multiple machines, 1 GPU per machine ---
# WORKER_MACHINE_TYPE = 'n1-standard-16'
# REPLICA_COUNT = 9
# ACCELERATOR_TYPE = 'NVIDIA_TESLA_T4'
# PER_MACHINE_ACCELERATOR_COUNT = 1
# REDUCTION_SERVER_COUNT = 10
# REDUCTION_SERVER_MACHINE_TYPE = "n1-highcpu-16"
# DISTRIBUTE_STRATEGY = 'multiworker'

In [19]:
# Echo the naming / experiment settings that the rest of the notebook relies on.
# NOTE(review): several of these names (APP, MODEL_TYPE, FRAMEWORK, MODEL_VERSION,
# PIPELINE_VERSION, MODEL_ROOT_NAME, IMAGE_URI, EXPERIMENT_*, RUN_NAME, DATA_REGIME)
# are assigned in the "Training Config" cell further down - confirm they are also
# defined before this cell so it works on a fresh kernel.
print(f"APP: {APP}")
print(f"MODEL_TYPE: {MODEL_TYPE}")
print(f"FRAMEWORK: {FRAMEWORK}")
print(f"MODEL_VERSION: {MODEL_VERSION}")
print(f"PIPELINE_VERSION: {PIPELINE_VERSION}\n")

print(f"MODEL_ROOT_NAME: {MODEL_ROOT_NAME}")
print(f"IMAGE_URI: {IMAGE_URI}\n")

print(f"EXPERIMENT_PREFIX: {EXPERIMENT_PREFIX}")
print(f"EXPERIMENT_NAME: {EXPERIMENT_NAME}")
print(f"RUN_NAME: {RUN_NAME}\n")

print(f"DATA_REGIME: {DATA_REGIME}")
print(f"OUTPUT_BUCKET: {OUTPUT_BUCKET}")
print(f"DOCKERNAME_TRAIN: {DOCKERNAME_TRAIN}")
print(f"PIPELINE_ROOT_PATH: {PIPELINE_ROOT_PATH}")

APP: sp
MODEL_TYPE: 2tower
FRAMEWORK: tfrs
MODEL_VERSION: v15
PIPELINE_VERSION: v3

MODEL_ROOT_NAME: sp-2tower-tfrs-v15-v3
IMAGE_URI: gcr.io/hybrid-vertex/sp-2tower-tfrs-v15-v0-training

EXPERIMENT_PREFIX: pipe-dev
EXPERIMENT_NAME: pipe-dev-2tower-tfrs-v15
RUN_NAME: run-20220923-193149

DATA_REGIME: small-jt-tfrecord
OUTPUT_BUCKET: jt-tfrs-test
DOCKERNAME_TRAIN: Dockerfile.tfrs
PIPELINE_ROOT_PATH: gs://jt-tfrs-test/sp-2tower-tfrs-v15-v3/pipeline_root


#### Training Config

In [20]:
# Training configuration: naming, data locations, hyperparameters and the
# worker pool specs handed to the Vertex AI custom training job.
#
# This cell reads names it does not define: TIMESTAMP and OUTPUT_BUCKET, plus
# the accelerator settings (REPLICA_COUNT, WORKER_MACHINE_TYPE, ACCELERATOR_TYPE,
# PER_MACHINE_ACCELERATOR_COUNT, REDUCTION_SERVER_*, DISTRIBUTE_STRATEGY).

# TIMESTAMP = time.strftime("%Y%m%d-%H%M%S")

# Superseded naming scheme, kept for reference:
# PREFIX = '2tower-pipe'
# FRAMEWORK = 'tfrs'
# model_VERSION = 'v14'
# MODEL_NAME = f'{PREFIX}-{FRAMEWORK}-{model_VERSION}'

# =================================================
# GCP project
# =================================================
# NOTE(review): PROJECT holds the same value as PROJECT_ID set at the top of the
# notebook - keep the two in sync.
PROJECT= 'hybrid-vertex'
REGION='us-central1'
# NOTE(review): hard-coded service account; change together with the project.
VERTEX_SA = '934903580331-compute@developer.gserviceaccount.com'

# =================================================
# recsys jobs
# =================================================
APP='sp'
MODEL_TYPE='2tower'
FRAMEWORK = 'tfrs'
MODEL_VERSION = 'v15'
PIPELINE_VERSION = 'v3'
MODEL_ROOT_NAME = f'{APP}-{MODEL_TYPE}-{FRAMEWORK}-{MODEL_VERSION}-{PIPELINE_VERSION}'

# =================================================
# Vertex Experiment tracking
# =================================================
# timestamp previously calculated; keep everything together
EXPERIMENT_PREFIX = 'pipe-dev'                                                 
EXPERIMENT_NAME=f'{EXPERIMENT_PREFIX}-{MODEL_TYPE}-{FRAMEWORK}-{MODEL_VERSION}'
RUN_NAME=f'run-{TIMESTAMP}'  
DATA_REGIME = 'small-jt-tfrecord'

# deprecated
# EXPERIMENT_NAME=f'data1-small-{model_VERSION}'
# RUN_NAME=f'run-{TIMESTAMP}'

# =================================================
# Data sources
# =================================================
# Example object paths:
# # "gs://spotify-tfrecords-blog/tfrecords_v1/train/output-00000-of-00796.tfrecord"
# # gs://spotify-tfrs-dir/small-dataset/output-00000-of-00796.tfrecord
#
# Each *_DIR is a bucket name (no gs:// scheme) and each *_PREFIX an object prefix.
# NOTE(review): candidate, train and validation all point at the same
# bucket/prefix here - confirm this is intended for the small dev dataset.

CANDIDATE_FILE_DIR = 'spotify-tfrs-dir' #'spotify-tfrecords-blog'
CANDIDATE_PREFIX = 'small-dataset/' # 'tfrecords_v1/train/'

TRAIN_DIR = 'spotify-tfrs-dir' #'spotify-tfrecords-blog'
TRAIN_DIR_PREFIX = 'small-dataset/' # 'tfrecords_v1/train/'

VALID_DIR = 'spotify-tfrs-dir' #'spotify-tfrecords-blog'
VALID_DIR_PREFIX = 'small-dataset/' # 'tfrecords_v1/train/'

# MODEL_DIR='jt-tfrs-test'


# =================================================
# train image
# =================================================
# Existing image URI or name for image to create
IMAGE_URI = 'gcr.io/hybrid-vertex/sp-2tower-tfrs-v15-v0-training'
# IMAGE_NAME = f'{MODEL_NAME}-training'
# IMAGE_URI = f'gcr.io/{PROJECT}/{IMAGE_NAME}'

# =================================================
# train job config
# =================================================
VALID_FREQUENCY = 10
# VALID_SIZE = 20_000

NUM_EPOCHS = 1
BATCH_SIZE = 256
LEARNING_RATE = 0.01

# MAX_PADDING = 375
EMBEDDING_DIM = 32
PROJECTION_DIM = 5

# NOTE(review): DROPOUT_RATE is defined but not forwarded in WORKER_ARGS below.
DROPOUT_RATE = 0.4
# Passed as a string; the trainer receives it verbatim via --layer_sizes.
LAYER_SIZES = '[64,32]'

WORKER_CMD = ["python", "trainer/task.py"]
# WORKER_CMD = ["python", "-m", "trainer.task"]

# Command-line flags handed to the training container.
WORKER_ARGS = [
    f'--project={PROJECT}',
    f'--train_output_gcs_bucket={OUTPUT_BUCKET}',
    f'--train_dir={TRAIN_DIR}',
    f'--train_dir_prefix={TRAIN_DIR_PREFIX}',
    f'--valid_dir={VALID_DIR}',
    f'--valid_dir_prefix={VALID_DIR_PREFIX}',
    f'--candidate_file_dir={CANDIDATE_FILE_DIR}',
    f'--candidate_files_prefix={CANDIDATE_PREFIX}',
    f'--experiment_name={EXPERIMENT_NAME}',
    f'--experiment_run={RUN_NAME}',
    f'--num_epochs={NUM_EPOCHS}',
    f'--batch_size={BATCH_SIZE}',
    f'--embedding_dim={EMBEDDING_DIM}',
    f'--projection_dim={PROJECTION_DIM}',
    f'--layer_sizes={LAYER_SIZES}',
    f'--learning_rate={LEARNING_RATE}',
    f'--valid_frequency={VALID_FREQUENCY}',
    f'--distribute={DISTRIBUTE_STRATEGY}',
    f'--model_version={MODEL_VERSION}',
    f'--pipeline_version={PIPELINE_VERSION}',
    f'--data_regime={DATA_REGIME}',
]

# Combine image, command, args and the accelerator profile into the list of
# worker pool specs consumed by the training component.
WORKER_POOL_SPECS = prepare_worker_pool_specs(
    image_uri=IMAGE_URI,
    args=WORKER_ARGS,
    cmd=WORKER_CMD,
    replica_count=REPLICA_COUNT,
    machine_type=WORKER_MACHINE_TYPE,
    accelerator_count=PER_MACHINE_ACCELERATOR_COUNT,
    accelerator_type=ACCELERATOR_TYPE,
    reduction_server_count=REDUCTION_SERVER_COUNT,
    reduction_server_machine_type=REDUCTION_SERVER_MACHINE_TYPE,
)

# Show the resolved specs for a visual sanity check before submitting.
from pprint import pprint
pprint(WORKER_POOL_SPECS)

[{'container_spec': {'args': ['--project=hybrid-vertex',
                              '--train_output_gcs_bucket=jt-tfrs-test',
                              '--train_dir=spotify-tfrs-dir',
                              '--train_dir_prefix=small-dataset/',
                              '--valid_dir=spotify-tfrs-dir',
                              '--valid_dir_prefix=small-dataset/',
                              '--candidate_file_dir=spotify-tfrs-dir',
                              '--candidate_files_prefix=small-dataset/',
                              '--experiment_name=pipe-dev-2tower-tfrs-v15',
                              '--experiment_run=run-20220923-193149',
                              '--num_epochs=1',
                              '--batch_size=256',
                              '--embedding_dim=32',
                              '--projection_dim=5',
                              '--layer_sizes=[64,32]',
                              '--learning_rate=0.01',
            

In [21]:
!export PWD=pwd
! echo $PWD

/home/jupyter/spotify-tfrs


## Build Pipeline

In [22]:
# Derive the pipeline tag and name from the version constants set in the
# "Training Config" cell.
# PROJECT_ID = 'hybrid-vertex'
# LOCATION = 'us-central1'

# Superseded version variables, kept for reference:
# model_VERSION = model_VERSION                            # Matching Engine & TwoTower Code
# PIPELINE_VERSION = VERSION                                  # pipeline code

PIPELINE_TAG = f'2tower-recsys-{PIPELINE_VERSION}'
print("PIPELINE_TAG:", PIPELINE_TAG)

# Underscores are replaced with hyphens - presumably because pipeline names do
# not allow underscores; TODO confirm against the Vertex AI Pipelines naming rules.
PIPELINE_NAME = f'modeling-{MODEL_VERSION}-{PIPELINE_TAG}'.replace('_', '-')
print("PIPELINE_NAME:", PIPELINE_NAME)

PIPELINE_TAG: 2tower-recsys-v3
PIPELINE_NAME: modeling-v15-2tower-recsys-v3


In [23]:
# from src.pipelines import build_custom_train_image, build_vocabs_stats, create_tensorboard, find_model_endpoint_test, generate_candidate_embedding_index

from src.pipelines import build_custom_train_image, build_vocabs_stats, \
                          create_tensorboard, find_model_endpoint_test, \
                          generate_candidate_embedding_index, train_custom_model

@kfp.v2.dsl.pipeline(
    name=f'{PIPELINE_NAME}'.replace('_', '-')
)
def pipeline(
    project: str,
    project_number: str,
    location: str,
    service_account: str,
    # version:str,
    model_version: str,
    pipeline_version: str,
    pipeline_tag: str,
    train_image_uri: str,
    train_output_gcs_bucket: str,
    embedding_index_destination_gcs_uri: str,
    gcs_train_script_path: str,
    create_tb_resource: bool,
    model_display_name: str,
    train_dockerfile_name: str,
    # data_bucket_name: str,
    # path_to_train_dir: str,
    # path_to_valid_dir: str,
    # path_to_candidate_dir: str,
    # candidate_items_prefix: str,
    train_dir: str,
    train_dir_prefix: str,
    valid_dir: str,
    valid_dir_prefix: str,
    candidate_file_dir: str,
    candidate_files_prefix: str,
    path_to_vocab_file: str,
    vocab_file_name: str,
    experiment_name: str,
    experiment_run: str,
    vpc_network_name: str,
    model_endpoint_name: str,
    generate_vocabs_stats: bool,
    serving_machine_type: str,
    serving_replica_count_min: int = 1,
    serving_replica_count_max: int = 3,
    max_padding: int = 375,
    split_names: list = ['train', 'valid'],
):
    """Two-tower recommender pipeline: vocab -> train -> upload -> embed -> deploy.

    Steps wired below:
        1. Build the vocabulary / string-lookup file.
        2. Submit the custom training job (uses the notebook-level
           WORKER_POOL_SPECS).
        3. Import and upload the query and candidate towers to Vertex AI.
        4. Look up the serving endpoint by display name.
        5. Generate candidate embeddings with the trained candidate tower.
        6. If no endpoint exists, create one and deploy the query tower to it.

    `gcs_train_script_path`, `train_dockerfile_name` and `create_tb_resource`
    are only referenced by the commented-out image-build / TensorBoard steps;
    `project_number`, `pipeline_tag` and `vpc_network_name` are not referenced
    by any step in this function.
    """

    from kfp.v2.components import importer_node
    from google_cloud_pipeline_components.types import artifact_types

    # ========================================================================
    # TODO: data preprocessing components
    # ========================================================================

    # ========================================================================
    # Build Custom Train Image
    # ========================================================================

    # build_custom_train_image_op = (
    #     build_custom_train_image.build_custom_train_image(
    #         project=project,
    #         gcs_train_script_path=gcs_train_script_path,
    #         training_image_uri=train_image_uri,
    #         train_dockerfile_name=train_dockerfile_name,
    #     )
    #     .set_display_name("Build custom train image")
    #     .set_caching_options(False)
    # )

    # create_tensorboard_op = (
    #     create_tensorboard.create_tensorboard(
    #         project=project,
    #         location=location,
    #         version=version,
    #         model_display_name=model_display_name,
    #         gcs_bucket_name=train_output_gcs_bucket,
    #         create_tb_resource=create_tb_resource
    #     )
    #     .set_display_name("Tensorboard Instance")
    #     .set_caching_options(True)
    # )

    # ========================================================================
    # Vocabulary and Training
    # ========================================================================

    build_vocabs_string_lookups_op = (
        build_vocabs_stats.build_vocabs_string_lookups(
            project=project,
            location=location,
            model_version=model_version,
            pipeline_version=pipeline_version,
            # version=version,
            # gcs_bucket_name=data_bucket_name,
            # path_to_train_dir=path_to_train_dir,
            # path_to_valid_dir=path_to_valid_dir,
            # path_to_candidate_dir=path_to_candidate_dir,
            train_output_gcs_bucket=train_output_gcs_bucket,
            train_dir=train_dir,
            train_dir_prefix=train_dir_prefix,
            valid_dir=valid_dir,
            valid_dir_prefix=valid_dir_prefix,
            candidate_file_dir=candidate_file_dir,
            candidate_files_prefix=candidate_files_prefix,
            experiment_name=experiment_name,
            experiment_run=experiment_run,
            path_to_vocab_file=path_to_vocab_file,
            vocab_file_name=vocab_file_name,
            max_padding=max_padding,
            split_names=split_names,
            generate_vocabs_stats=generate_vocabs_stats,
        )
        .set_display_name("Build Vocab File")
        .set_caching_options(True)
        # .after(build_custom_train_image_op)
    )

    run_train_task_op = (
        train_custom_model.train_custom_model(
            project=project,
            model_version=model_version,
            pipeline_version=pipeline_version,
            # version=version,
            model_name=model_display_name,
            # notebook-level constant, resolved when the pipeline is compiled
            worker_pool_specs=WORKER_POOL_SPECS,
            train_output_gcs_bucket=train_output_gcs_bucket,
            vocab_dict_uri=build_vocabs_string_lookups_op.outputs['vocab_gcs_uri'],
            experiment_name=experiment_name,
            experiment_run=experiment_run,
            training_image_uri=train_image_uri,     # build_custom_train_image_op.outputs['training_image_uri'],
            # NOTE(review): hard-coded TensorBoard instance; switch to
            # create_tensorboard_op.outputs['tensorboard_resource_name'] once
            # that step is re-enabled.
            tensorboard_resource_name="projects/934903580331/locations/us-central1/tensorboards/5925030667573264384",
            service_account=service_account,
        )
        .set_display_name("2Tower Training")
        .set_caching_options(True)
        # .after(build_custom_train_image_op)
    )

    # ========================================================================
    # Upload Query and Candidate Towers
    # ========================================================================

    import_unmanaged_query_model_task = (
        importer_node.importer(
            artifact_uri=run_train_task_op.outputs['query_tower_dir_uri'],
            artifact_class=artifact_types.UnmanagedContainerModel,
            metadata={
                'containerSpec': {
                    'imageUri': 'us-docker.pkg.dev/vertex-ai/prediction/tf2-gpu.2-9:latest',
                },
            },
        )
        .set_display_name("Import Query Tower")
        .after(run_train_task_op)
        .set_caching_options(True)
    )

    query_model_upload_op = (
        gcc_aip.ModelUploadOp(
            project=project,
            location=location,
            display_name=f'query-tower-{model_display_name}',
            unmanaged_container_model=import_unmanaged_query_model_task.outputs["artifact"],
            labels={"tower": "query"},
        )
        .after(import_unmanaged_query_model_task)
        .set_display_name("Upload Query Tower")
        .set_caching_options(True)
    )

    import_unmanaged_candidate_model_task = (
        importer_node.importer(
            artifact_uri=run_train_task_op.outputs['candidate_tower_dir_uri'],
            artifact_class=artifact_types.UnmanagedContainerModel,
            metadata={
                'containerSpec': {
                    'imageUri': 'us-docker.pkg.dev/vertex-ai/prediction/tf2-gpu.2-9:latest',
                },
            },
        )
        .set_display_name("Import Candidate Tower")
        .after(run_train_task_op)
        .set_caching_options(True)
    )

    candidate_model_upload_op = (
        gcc_aip.ModelUploadOp(
            project=project,
            location=location,
            display_name=f'candidate-tower-{model_display_name}',
            unmanaged_container_model=import_unmanaged_candidate_model_task.outputs["artifact"],
            labels={"tower": "candidate"},
        )
        # Fix: this step uploads the candidate tower but was labelled
        # "Upload Query Tower to Vertex".
        .set_display_name("Upload Candidate Tower")
        .set_caching_options(True)
    )

    find_model_endpoint_op = (
        find_model_endpoint_test.find_model_endpoint_test(
            project=project,
            location=location,
            endpoint_name=model_endpoint_name,
        )
        .set_display_name("Find Model Endpoint")
        # never cached: the endpoint state can change between runs
        .set_caching_options(False)
    )

    # ========================================================================
    # Scalable ANN Index with Vertex Matching Engine
    # ========================================================================

    # Generate Embeddings
    generate_candidate_embedding_index_op = (
        generate_candidate_embedding_index.generate_candidate_embedding_index(
            project=project,
            location=location,
            model_version=model_version,
            pipeline_version=pipeline_version,
            # version=version,
            # data_bucket_name=data_bucket_name,
            train_output_gcs_bucket=train_output_gcs_bucket,
            model_dir=run_train_task_op.outputs['candidate_tower_dir_uri'],
            # candidate_items_prefix=candidate_items_prefix,
            candidate_file_dir=candidate_file_dir,
            candidate_files_prefix=candidate_files_prefix,
            experiment_name=experiment_name,
            experiment_run=experiment_run,
            embedding_index_destination_gcs_uri=embedding_index_destination_gcs_uri,
            # consuming this output makes the step depend on the candidate upload
            uploaded_candidate_model_resources=candidate_model_upload_op.outputs['gcp_resources'],
        )
        .after(run_train_task_op)
        .set_display_name("Generate Candidate embeddings")
        .set_caching_options(True)
    )

    # ========================================================================
    # Conditional: create New Model Endpoint - True
    # ========================================================================
    with kfp.v2.dsl.Condition(
        find_model_endpoint_op.outputs["create_new_endpoint"] == "True",
        name="Create New Endpoint",
    ):
        endpoint_create_op = (
            gcc_aip.EndpointCreateOp(
                project=project,
                # Fix: create the endpoint in the same region that the lookup
                # and the model uploads use, not the component default.
                location=location,
                display_name=model_endpoint_name, #f'{pipeline_tag}-model-endpoint',
            )
            .after(run_train_task_op)
            .set_display_name("Create New Endpoint | Query Tower")
            .set_caching_options(True)
        )

        model_deploy_op = (
            gcc_aip.ModelDeployOp(
                endpoint=endpoint_create_op.outputs['endpoint'],
                model=query_model_upload_op.outputs['model'],
                deployed_model_display_name=f'deployed-query-tower-{model_display_name}',
                dedicated_resources_machine_type=serving_machine_type,
                dedicated_resources_min_replica_count=serving_replica_count_min,
                dedicated_resources_max_replica_count=serving_replica_count_max,
                # key "0" refers to the model being deployed by this step
                traffic_split={"0": 100},
            )
            .set_display_name("Deploy Query Tower | New Endpoint")
            .set_caching_options(True)
        )


In [24]:
# Resolve the notebook's working directory in Python.
# The earlier `!export PWD=pwd` ran in a throwaway subshell, so it never changed
# anything visible to later cells; and hardcoding '/home/jupyter/...' tied the
# notebook to one machine. `os.getcwd()` yields the same directory when the
# kernel is started from the repository root.
pwd = os.getcwd()
print(pwd)

/home/jupyter/spotify-tfrs


### Compile

In [25]:
! rm -f pipelines/custom_container_pipeline_spec.json

PIPELINE_JSON_SPEC_PATH = "./src/pipelines/custom_container_pipeline_spec.json"

kfp.v2.compiler.Compiler().compile(
    pipeline_func=pipeline, package_path=PIPELINE_JSON_SPEC_PATH,
)

# copy pipeline spec to version location
# !gsutil cp /home/jupyter/vertex-examples/tower_pipes/custom_container_pipeline_spec.json {BUCKET_URI}/{APP_NAME}/{VERSION}/
!gsutil cp $PWD/src/pipelines/custom_container_pipeline_spec.json {BUCKET_URI}/{MODEL_ROOT_NAME}/src/custom_container_pipeline_spec.json



Copying file:///home/jupyter/spotify-tfrs/src/pipelines/custom_container_pipeline_spec.json...
/ [1 files][ 87.9 KiB/ 87.9 KiB]                                                
Operation completed over 1 objects/87.9 KiB.                                     


In [26]:
# Display the current RUN_NAME; the submit cell passes it as 'experiment_run'
# and uses it to build the vocab-file and candidate-index paths.
RUN_NAME

'run-20220923-193149'

### Submit

In [27]:
# ========================================================================
# Run configuration
# ========================================================================
# NOTE: MODEL_VERSION, PIPELINE_VERSION, PIPELINE_TAG, EXPERIMENT_NAME, RUN_NAME,
# MODEL_ROOT_NAME, BASE_OUTPUT_DIR, PIPELINE_ROOT_PATH, IMAGE_URI,
# DOCKERNAME_TRAIN, PIPELINES and pipeline_client are not defined in this cell;
# they must already exist in the kernel from earlier cells.

PROJECT_NUMBER = '934903580331'

# Buckets
OUTPUT_BUCKET = 'jt-tfrs-test'
STAGING_BUCKET = f'gs://{OUTPUT_BUCKET}'

DATA_BUCKET_NAME = 'spotify-tfrs-dir'      # alternative: 'spotify-beam-v3'
SPLIT_NAMES = ['train', 'valid']           # alternative: ['dif_artist', 'dif_artist_valid']

# Training code / image
TRAIN_APP_CODE_PATH = f'{BASE_OUTPUT_DIR}/src/trainer'

# NOTE(review): TRAIN_IMAGE_URI is defined here, but the submit call below passes
# IMAGE_URI (defined outside this cell) as 'train_image_uri' -- confirm which one
# is intended.
TRAIN_IMAGE_URI = 'gcr.io/hybrid-vertex/sp-2tower-tfrs-v15-v0-training'
# NOTE(review): DOCKERNAME_TRAIN is passed below as 'train_dockerfile_name' but its
# assignment here was commented out (previously 'Dockerfile.tfrs') -- verify it is
# set by an earlier cell.

# Data locations (bucket name + object prefix).
# Alternative dataset: bucket 'spotify-tfrecords-blog', prefix 'tfrecords_v1/train/'
CANDIDATE_FILE_DIR = 'spotify-tfrs-dir'
CANDIDATE_PREFIX = 'small-dataset/'

TRAIN_DIR = 'spotify-tfrs-dir'
TRAIN_DIR_PREFIX = 'small-dataset/'

VALID_DIR = 'spotify-tfrs-dir'
VALID_DIR_PREFIX = 'small-dataset/'

# Networking / index output
vpc_network_name = 'ucaip-haystack-vpc-network'
embedding_index_destination_gcs_uri = f'{BASE_OUTPUT_DIR}/{RUN_NAME}/candidate-index'

# Vocabulary / stats file
PATH_TO_VOCAB_FILE = f'{EXPERIMENT_NAME}/{RUN_NAME}/vocabs_stats'
VOCAB_FILENAME = f'vocab_and_stats_{RUN_NAME}.pkl'
GENERATE_VOCABS_AND_STATS = True

MODEL_ENDPOINT_NAME = f'model-endpoint-{MODEL_VERSION}'
CREATE_TENSORBOARD_RESOURCE = ''
# Derived from PROJECT_NUMBER so the two cannot drift apart (same value as before).
SERVICE_ACCOUNT = f'{PROJECT_NUMBER}-compute@developer.gserviceaccount.com'

# Serving
# The pipeline forwards 'serving_machine_type' to
# ModelDeployOp(dedicated_resources_machine_type=...), so this must be a machine
# type. The value previously assigned here was a prediction container image URI:
#   'us-docker.pkg.dev/vertex-ai/prediction/tf2-gpu.2-9:latest'
# NOTE(review): 'n1-standard-4' is a general-purpose default -- adjust to the
# serving footprint you need.
SERVING_MACHINE_TYPE = 'n1-standard-4'


# ========================================================================
# Submit Pipeline Job
# ========================================================================
# Example input files:
#   gs://spotify-tfrecords-blog/tfrecords_v1/train/output-00000-of-00796.tfrecord
#   gs://spotify-tfrs-dir/small-dataset/output-00000-of-00796.tfrecord

# With overwrite=True a run is always submitted, even when PIPELINES already
# holds a 'train' entry from a previous submission.
overwrite = True

if not PIPELINES.get('train') or overwrite:
    response = pipeline_client.create_run_from_job_spec(
        job_spec_path=PIPELINE_JSON_SPEC_PATH,
        enable_caching=False,
        network=f'projects/{PROJECT_NUMBER}/global/networks/{vpc_network_name}',
        # service_account=SERVICE_ACCOUNT,
        parameter_values={
            # Project / versioning
            'project': PROJECT_ID,
            'project_number': PROJECT_NUMBER,
            'location': LOCATION,
            'model_version': MODEL_VERSION,
            'pipeline_version': PIPELINE_VERSION,
            'model_display_name': MODEL_ROOT_NAME,
            'pipeline_tag': PIPELINE_TAG,
            # Training
            'gcs_train_script_path': TRAIN_APP_CODE_PATH,
            'train_image_uri': IMAGE_URI,
            'train_output_gcs_bucket': OUTPUT_BUCKET,
            'train_dockerfile_name': DOCKERNAME_TRAIN,
            # Data
            'train_dir': TRAIN_DIR,
            'train_dir_prefix': TRAIN_DIR_PREFIX,
            'valid_dir': VALID_DIR,
            'valid_dir_prefix': VALID_DIR_PREFIX,
            'candidate_file_dir': CANDIDATE_FILE_DIR,
            'candidate_files_prefix': CANDIDATE_PREFIX,
            'split_names': SPLIT_NAMES,
            # Vocab / stats ('generate_vocabs_stats' was listed twice; one entry kept)
            'path_to_vocab_file': PATH_TO_VOCAB_FILE,
            'vocab_file_name': VOCAB_FILENAME,
            'generate_vocabs_stats': GENERATE_VOCABS_AND_STATS,
            # Index / networking
            'vpc_network_name': vpc_network_name,
            'embedding_index_destination_gcs_uri': embedding_index_destination_gcs_uri,
            # Experiment tracking
            'experiment_name': EXPERIMENT_NAME,
            'experiment_run': RUN_NAME,
            'create_tb_resource': CREATE_TENSORBOARD_RESOURCE,
            # Serving
            'serving_machine_type': SERVING_MACHINE_TYPE,
            'model_endpoint_name': MODEL_ENDPOINT_NAME,
            'service_account': SERVICE_ACCOUNT,
        },
        pipeline_root=f'{PIPELINE_ROOT_PATH}/{PIPELINE_VERSION}',
    )
    # Remember the submitted run's name under the 'train' key.
    PIPELINES['train'] = response['name']

In [None]:
# Pipeline parameters reserved for later use (not yet passed in `parameter_values` when the pipeline is submitted):

            # 'dimensions': dimensions,
            # 'ann_index_display_name': ann_index_display_name,
            # 'approximate_neighbors_count': approximate_neighbors_count,
            # 'distance_measure_type': distance_measure_type,
            # 'leaf_node_embedding_count': leaf_node_embedding_count,
            # 'leaf_nodes_to_search_percent': leaf_nodes_to_search_percent, 
            # 'ann_index_description': ann_index_description,
            # 'ann_index_labels': ann_index_labels,
            # 'brute_force_index_display_name': brute_force_index_display_name,
            # 'brute_force_index_description': brute_force_index_description,
            # 'brute_force_index_labels': brute_force_index_labels,
            # 'deployed_ann_index_name': deployed_ann_index_name,
            # 'deployed_brute_force_index_name': deployed_brute_force_index_name,
            # 'deployed_test_destination_gcs_uri': deployed_test_destination_gcs_uri,
            # "endpoint_resource_uri": "https://us-central1-aiplatform.googleapis.com/v1/",
            # "serving_image": "N/A TODO",
            # "artifact_uri": "N/A TODO",
            # "resource_name":"TODO NONE",
            # 'ann_index_endpoint_display_name': ann_index_endpoint_display_name,
            # 'ann_index_endpoint_description': ann_index_endpoint_description,
            # 'brute_index_endpoint_display_name': brute_index_endpoint_display_name,
            # 'brute_index_endpoint_description': brute_index_endpoint_description,
            # 'app_name': APP_NAME,