# Deploying Merlin Query Tower with Vertex AI

* Create custom prediction routine (CPR)
* Upload query model to Vertex AI Model Registry
* Test registered models predictions

In [1]:
GCP_PROJECTS = !gcloud config get-value project
PROJECT_ID = GCP_PROJECTS[0]
PROJECT_NUM = !gcloud projects list --filter="$PROJECT_ID" --format="value(PROJECT_NUMBER)"
PROJECT_NUM = PROJECT_NUM[0]
LOCATION = 'us-central1'

print(f"PROJECT_ID: {PROJECT_ID}")
print(f"PROJECT_NUM: {PROJECT_NUM}")
print(f"LOCATION: {LOCATION}")

PROJECT_ID: hybrid-vertex
PROJECT_NUM: 934903580331
LOCATION: us-central1


In [2]:
from google.cloud import aiplatform as vertex_ai
import os
import time

BUCKET = 'jt-merlin-scaling'
BUCKET_URI = 'gs://jt-merlin-scaling'

vertex_ai.init(project=PROJECT_ID, location=LOCATION)

In [3]:
# !gcloud auth application-default login

## Build serving app

In [4]:
REPO_DOCKER_PATH_PREFIX = 'src'
SERVING_SUB_DIR = 'serving'
SERVING_APPLICATION_DIR = 'app'
SERVING_DOCKERNAME = 'mm-query-serve'
LOCAL_WORKFLOW_DIR = f'{REPO_DOCKER_PATH_PREFIX}/{SERVING_SUB_DIR}/local_workflow'

### write pred files to local dir

In [5]:
# Make the training subfolder
! rm -rf {REPO_DOCKER_PATH_PREFIX}/{SERVING_SUB_DIR}
! mkdir {REPO_DOCKER_PATH_PREFIX}/{SERVING_SUB_DIR}
! mkdir {REPO_DOCKER_PATH_PREFIX}/{SERVING_SUB_DIR}/{SERVING_APPLICATION_DIR}
! touch {REPO_DOCKER_PATH_PREFIX}/{SERVING_SUB_DIR}/{SERVING_APPLICATION_DIR}/__init__.py

!rm -rf $LOCAL_WORKFLOW_DIR
!mkdir $LOCAL_WORKFLOW_DIR

#### requirements

In [6]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/{SERVING_SUB_DIR}/{SERVING_APPLICATION_DIR}/requirements.txt
uvicorn[standard]==0.15.0
gunicorn==20.1.0
fastapi==0.68.1
# uvloop==0.15.2
# fastapi-utils
google-cloud-aiplatform
merlin-models
nvtabular
gcsfs
google-cloud-storage

Writing src/serving/app/requirements.txt


### dataset to tensors

In [7]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/{SERVING_SUB_DIR}/{SERVING_APPLICATION_DIR}/dataset_to_tensors.py

try:
    import cudf
except ImportError:
    cudf = None
import pandas as pd
import tensorflow as tf
from typing import Dict
from merlin.io import Dataset
import itertools


def cupy_array_to_tensor(array):
    return tf.experimental.dlpack.from_dlpack(array.reshape(-1, 1).toDlpack())

def numpy_array_to_tensor(array):
    return tf.convert_to_tensor(array.reshape(-1, 1))

def cudf_series_to_tensor(col) -> tf.Tensor:
    "Convert a cudf.Series to a TensorFlow Tensor with DLPack"
    if isinstance(col.dtype, cudf.ListDtype):
        values = col.list.leaves.values
        offsets = col.list._column.offsets.values
        row_lengths = offsets[1:] - offsets[:-1]
        return cupy_array_to_tensor(values), cupy_array_to_tensor(row_lengths)
    else:
        return cupy_array_to_tensor(col.values)

def pandas_series_to_tensor(col) -> tf.Tensor:
    if len(col) and pd.api.types.is_list_like(col.values[0]):
        values = pd.Series(itertools.chain(*col)).values
        row_lengths = col.map(len).values
        return numpy_array_to_tensor(values), numpy_array_to_tensor(row_lengths)
    else:
        return numpy_array_to_tensor(col.values)
        
    
def dataset_to_tensors(dataset: Dataset) -> Dict[str, tf.Tensor]:
    """Convert a DataFrame to Dict of Tensors"""
    df = dataset.to_ddf().compute()
    if isinstance(df, pd.DataFrame):
        col_to_tensor = pandas_series_to_tensor
    else:
        col_to_tensor = cudf_series_to_tensor
    return {
        column: col_to_tensor(df[column])
        for column in df.columns
    }

Writing src/serving/app/dataset_to_tensors.py


#### predictor

In [8]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/{SERVING_SUB_DIR}/{SERVING_APPLICATION_DIR}/predictor.py
import nvtabular as nvt
import pandas as pd
import os
import json
import merlin.models.tf as mm
from nvtabular.loader.tf_utils import configure_tensorflow
configure_tensorflow()
import tensorflow as tf
import time
import logging
from dataset_to_tensors import *


# These are helper functions that ensure the dictionary input is in a certain order and types are preserved
# this is to get scalar values to appear first in the dict to not confuse pandas with lists https://github.com/pandas-dev/pandas/issues/46092
reordered_keys = [
    'collaborative', 
    'album_name_pl', 
    'artist_genres_pl', 
    'artist_name_pl', 
    'artist_pop_can', 
    'description_pl', 
    'duration_ms_songs_pl', 
    'n_songs_pl', 
    'name', 
    'num_albums_pl', 
    'num_artists_pl', 
    'track_name_pl', 
    'track_pop_pl', 
    'duration_ms_seed_pl', 
    'pid', 
    'track_uri_pl'
]

float_num_fix = ['n_songs_pl','num_albums_pl','num_artists_pl','duration_ms_seed_pl']
float_list_fix = ['track_pop_pl', 'duration_ms_songs_pl']
    
def fix_list_num_dtypes(num_list):
    "this fixes lists of ints to list of floats converted in json input"
    return [float(x) for x in num_list]

def fix_num_dtypes(num):
    "this fixes ints and casts to floats"
    return float(num)

def fix_types(k, v):
    if k in float_num_fix:
        return fix_num_dtypes(v)
    if k in float_list_fix:
        return fix_list_num_dtypes(v)
    else:
        return v

def create_pandas_instance(inputs):
    """
    Helper function to reorder the input to have a sclar first for pandas
    And fix the types converted when data is imported by fastAPI
    """
    if type(inputs) == list:
        header = inputs[0]
        reordered_header_dict = {k: fix_types(k,header[k]) for k in reordered_keys}
        pandas_instance = pd.DataFrame.from_dict(reordered_header_dict, orient='index').T
        if len(inputs) > 1:
            for ti in inputs[1:]:
                reordered_dict = {k: fix_types(k,ti[k]) for k in reordered_keys}
                pandas_instance = pandas_instance.append(pd.DataFrame.from_dict(reordered_dict, orient='index').T)
    else:
        reordered_dict = {k: fix_types(k,inputs[k]) for k in reordered_keys}
        pandas_instance = pd.DataFrame.from_dict(reordered_dict, orient='index').T
    return pandas_instance

class Predictor():
    """Interface of the Predictor class for Custom Prediction Routines.
    The Predictor is responsible for the ML logic for processing a prediction request.
    Specifically, the Predictor must define:
    (1) How to load all model artifacts used during prediction into memory.
    (2) The logic that should be executed at predict time.
    When using the default PredictionHandler, the Predictor will be invoked as follows:
      predictor.postprocess(predictor.predict(predictor.preprocess(prediction_input)))
    """
    def __init__(self):
        return
    
    def load(self, artifacts_uri):
        """Loads the model artifact.
        Args:
            artifacts_uri (str):
                Required. The value of the environment variable AIP_STORAGE_URI.
        """
        logging.info("loading model and workflow")
        start_init = time.process_time()
        
        #test_bucket = 'gs://jt-merlin-scaling'
        # self.model = tf.keras.models.load_model(os.path.join(artifacts_uri, "query_model_merlin" ))
        # self.workflow = nvt.Workflow.load(os.path.join(artifacts_uri, "workflow/2t-spotify-workflow")) # TODO: parameterize
        self.model = tf.keras.models.load_model(artifacts_uri)
        
        # self.workflow = nvt.Workflow.load(os.path.join(artifacts_uri, "workflow/2t-spotify-workflow"))
        # self.workflow = nvt.Workflow.load('gs://jt-merlin-scaling/nvt-last5-v1full/nvt-analyzed') # TODO: parametrize
        self.workflow = nvt.Workflow.load("/docker_workflow/workflow")
        
        self.workflow = self.workflow.remove_inputs(
            [
                'track_pop_can', 
                'track_uri_can', 
                'duration_ms_can', 
                'track_name_can', 
                'artist_name_can',
                'album_name_can',
                'album_uri_can',
                'artist_followers_can', 
                'artist_genres_can',
                'artist_name_can', 
                'artist_pop_can',
                'artist_pop_pl',
                'artist_uri_can', 
                'artists_followers_pl'
            ]
        )
        return self
        
    def predict(self, prediction_input):
        """Preprocesses the prediction input before doing the prediction.
        Args:
            prediction_input (Any):
                Required. The prediction input that needs to be preprocessed.
        Returns:
            The preprocessed prediction input.
        """
        # handle different input types, can take a dict or list of dicts
        self.n_rows = len(prediction_input)
        start = time.process_time()
        pandas_instance = create_pandas_instance(prediction_input[0])
        #logging.info(f"Pandas conversion took {time.process_time() - start} seconds")
        print(f"Pandas conversion took {time.process_time() - start} seconds")
        start = time.process_time()
        transformed_inputs = nvt.Dataset(pandas_instance)
        #logging.info(f"NVT data loading took {time.process_time() - start} seconds")
        print(f"NVT data loading took {time.process_time() - start} seconds")
        start = time.process_time()
        transformed_instance = self.workflow.transform(transformed_inputs)
        print(f"Workflow transformation took {time.process_time() - start} seconds")

        # def predict(self, instances):
        start = time.process_time()
        
        batch = dataset_to_tensors(transformed_instance)
        print(f"converting to dict_tensors took {time.process_time() - start} seconds")
        start = time.process_time()
        output = self.model(batch)
        print(f"Generating query embeddings took {time.process_time() - start} seconds")
        return transformed_instance, output, batch

Writing src/serving/app/predictor.py


#### main

In [9]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/{SERVING_SUB_DIR}/{SERVING_APPLICATION_DIR}/main.py
from fastapi import FastAPI, Request

import json
import numpy as np
import os
import logging


from google.cloud import storage
from predictor import Predictor

app = FastAPI()

predictor_instance = Predictor()
loaded_predictor = predictor_instance.load(artifacts_uri = os.environ['AIP_STORAGE_URI'])

@app.get(os.environ['AIP_HEALTH_ROUTE'], status_code=200)
def health():
    return {}


@app.post(os.environ['AIP_PREDICT_ROUTE'])
async def predict(request: Request):
    body = await request.json()
    instances = body["instances"]
    outputs = loaded_predictor.predict(instances)

    return {"predictions": outputs[1].numpy().tolist()}

Writing src/serving/app/main.py


In [11]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/{SERVING_SUB_DIR}/{SERVING_APPLICATION_DIR}/prestart.sh
#!/bin/bash
export PORT=$AIP_HTTP_PORT

Writing src/serving/app/prestart.sh


In [12]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/{SERVING_SUB_DIR}/{SERVING_APPLICATION_DIR}/instances.json
{"instances": {"collaborative": "false", "album_name_pl": ["There's Really A Wolf", "Late Nights: The Album", "American Teen", "Crazy In Love", "Pony"], "album_uri_can": "spotify:album:5l83t3mbVgCrIe1VU9uJZR", "artist_followers_can": 4339757.0, "artist_genres_can": "'hawaiian hip hop', 'rap'", "artist_genres_pl": ["'hawaiian hip hop', 'rap'", "'chicago rap', 'dance pop', 'pop', 'pop rap', 'r&b', 'southern hip hop', 'trap', 'urban contemporary'", "'pop', 'pop r&b'", "'dance pop', 'pop', 'r&b'", "'chill r&b', 'pop', 'pop r&b', 'r&b', 'urban contemporary'"], "artist_name_can": "Russ", "artist_name_pl": ["Russ", "Jeremih", "Khalid", "Beyonc\u00c3\u00a9", "William Singe"], "artist_pop_can": 82.0, "artist_pop_pl": [82.0, 80.0, 90.0, 87.0, 65.0], "artist_uri_can": "spotify:artist:1z7b1Pr1rSlvWRzsW3HOrS", "artists_followers_pl": [4339757.0, 5611842.0, 15046756.0, 30713126.0, 603837.0], "description_pl": "", "duration_ms_can": 237322.0, "duration_ms_songs_pl": [237506.0, 217200.0, 219080.0, 226400.0, 121739.0], "n_songs_pl": 8.0, "name": "Lit Tunes ", "num_albums_pl": 8.0, "num_artists_pl": 8.0, "track_name_can": "We Just Havent Met Yet", "track_name_pl": ["Losin Control", "Paradise", "Location", "Crazy In Love - Remix", "Pony"], "track_pop_can": 57.0, "track_pop_pl": [79.0, 58.0, 83.0, 71.0, 57.0], "duration_ms_seed_pl": 51023.1, "pid": 1, "track_uri_can": "spotify:track:0VzDv4wiuZsLsNOmfaUy2W", "track_uri_pl": ["spotify:track:4cxMGhkinTocPSVVKWIw0d", "spotify:track:1wNEBPo3nsbGCZRryI832I", "spotify:track:152lZdxL1OR0ZMW6KquMif", "spotify:track:2f4IuijXLxYOeBncS60GUD", "spotify:track:4Lj8paMFwyKTGfILLELVxt"]}}

Writing src/serving/app/instances.json


In [13]:
# !pwd
# !tree /home/jupyter/merlin-on-vertex

### local nvtabular workflow

In [13]:
WORKFLOW_DIR='gs://jt-merlin-scaling/nvt-last5-latest-12/nvt-processed/workflow'

!gsutil cp -r $WORKFLOW_DIR $LOCAL_WORKFLOW_DIR

Copying gs://jt-merlin-scaling/nvt-last5-latest-12/nvt-processed/workflow/categories/unique.album_name_pl.parquet...
Copying gs://jt-merlin-scaling/nvt-last5-latest-12/nvt-processed/workflow/categories/unique.artist_genres_can.parquet...
Copying gs://jt-merlin-scaling/nvt-last5-latest-12/nvt-processed/workflow/categories/unique.artist_genres_pl.parquet...
Copying gs://jt-merlin-scaling/nvt-last5-latest-12/nvt-processed/workflow/categories/unique.artist_name_can.parquet...
- [4 files][ 23.6 MiB/ 23.6 MiB]                                                
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying gs://jt-merlin-scaling/nvt-last5-latest-12/nvt-processed/workflow/categories/unique.artist_name_pl.parquet...
Copying gs://jt-merlin-scaling/nvt-last5-latest-12/nvt-processed/workflo

### dockerfile

In [112]:
! rm -rf {REPO_DOCKER_PATH_PREFIX}/Dockerfile.{SERVING_DOCKERNAME}

In [113]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/Dockerfile.{SERVING_DOCKERNAME}

FROM nvcr.io/nvidia/merlin/merlin-tensorflow:22.12

# WORKDIR /src
WORKDIR /app

# Copies the serving code to the docker image.
# COPY serving/* serving/ 

# COPY /serving/requirements.txt /requirements.txt
COPY ./serving/app/requirements.txt /requirements.txt
RUN pip install -r /requirements.txt

#DEBUG CHANGES!!
# RUN mkdir /docker_workflow
# RUN mkdir /docker_model
# ADD local_model /docker_model
# ADD /serving/local_workflow /docker_workflow # TODO - jt paramterize
#END DEBUG!
# COPY /serving/local_workflow /docker_workflow #chaned to local_workflow

RUN mkdir /docker_workflow
# ADD ./serving/local_workflow /docker_workflow
COPY ./serving/local_workflow /docker_workflow
# ADD local_workflow /docker_workflow

# COPY /serving/app /app
COPY ./serving/app /app

EXPOSE 80
    
CMD ["sh", "-c", "uvicorn main:app --host 0.0.0.0 --port $AIP_HTTP_PORT"]

Writing src/Dockerfile.mm-query-serve


In [114]:
# !pwd

# !cp /src/serving/local_workflow /docker_workflow

## Copy serving assets to `MODEL_DIR`

In [115]:
# MODEL_DIR_old = 'gs://jt-merlin-scaling/pipes-2tower-merlin-tf-v5/run-v1-20221110-024710/model-dir'
# MODEL_DIR = 'gs://jt-merlin-scaling/pipes-2tower-merlin-tf-v5/run-v1-20221110-024710/model-dir2'

EXPERIMENT_RUN_DIR='gs://jt-merlin-scaling/new9-2tower-merlin-tf-jtv24/run-20230228-170848'
MODEL_DIR=f'{EXPERIMENT_RUN_DIR}/model_dir'

print(f"EXPERIMENT_RUN_DIR: {EXPERIMENT_RUN_DIR}")
print(f"MODEL_DIR: {MODEL_DIR}")

EXPERIMENT_RUN_DIR: gs://jt-merlin-scaling/new9-2tower-merlin-tf-jtv24/run-20230228-170848
MODEL_DIR: gs://jt-merlin-scaling/new9-2tower-merlin-tf-jtv24/run-20230228-170848/model_dir


In [116]:
!gsutil ls $MODEL_DIR

gs://jt-merlin-scaling/new9-2tower-merlin-tf-jtv24/run-20230228-170848/model_dir/candidate_embeddings/
gs://jt-merlin-scaling/new9-2tower-merlin-tf-jtv24/run-20230228-170848/model_dir/candidate_tower/
gs://jt-merlin-scaling/new9-2tower-merlin-tf-jtv24/run-20230228-170848/model_dir/query_tower/


In [90]:
# !gsutil cp -r $MODEL_DIR_old/candidate-embeddings $MODEL_DIR/candidate-embeddings

In [91]:
# !gsutil cp -r $MODEL_DIR_old/query-tower $MODEL_DIR/query-tower
# !gsutil cp -r $MODEL_DIR_old/candidate-tower $MODEL_DIR/candidate-tower
# !gsutil cp -r $MODEL_DIR_old/workflow $MODEL_DIR/workflow

### copy Dockerfile

In [117]:
!gsutil cp ./src/Dockerfile.$SERVING_DOCKERNAME $EXPERIMENT_RUN_DIR/

Copying file://./src/Dockerfile.mm-query-serve [Content-Type=application/octet-stream]...
/ [1 files][  862.0 B/  862.0 B]                                                
Operation completed over 1 objects/862.0 B.                                      


### copy serving application

In [22]:
!gsutil -m cp -r ./$REPO_DOCKER_PATH_PREFIX/$SERVING_SUB_DIR $EXPERIMENT_RUN_DIR/

Copying file://./src/serving/app/requirements.txt [Content-Type=text/plain]...
Copying file://./src/serving/app/dataset_to_tensors.py [Content-Type=text/x-python]...
Copying file://./src/serving/app/__init__.py [Content-Type=text/x-python]...    
Copying file://./src/serving/app/instances.json [Content-Type=application/json]...
Copying file://./src/serving/app/predictor.py [Content-Type=text/x-python]...   
Copying file://./src/serving/app/main.py [Content-Type=text/x-python]...        
Copying file://./src/serving/app/prestart.sh [Content-Type=text/x-sh]...        
Copying file://./src/serving/local_workflow/workflow/workflow.pkl [Content-Type=application/octet-stream]...
Copying file://./src/serving/local_workflow/workflow/metadata.json [Content-Type=application/json]...
Copying file://./src/serving/local_workflow/workflow/categories/unique.track_uri_pl.parquet [Content-Type=application/octet-stream]...
Copying file://./src/serving/local_workflow/workflow/categories/unique.collaborat

### copy workflow

* easier for prediction container if a model's related workflow artifacts are stored in the `model_dir`

In [24]:
# WORKFLOW_DIR = "gs://jt-merlin-scaling/nvt-last5-v1full/nvt-analyzed"

In [25]:
!gsutil -m cp -r $WORKFLOW_DIR $EXPERIMENT_RUN_DIR/workflow

Copying gs://jt-merlin-scaling/nvt-last5-latest-12/nvt-processed/workflow/categories/unique.album_name_pl.parquet [Content-Type=application/octet-stream]...
Copying gs://jt-merlin-scaling/nvt-last5-latest-12/nvt-processed/workflow/categories/unique.artist_genres_can.parquet [Content-Type=application/octet-stream]...
Copying gs://jt-merlin-scaling/nvt-last5-latest-12/nvt-processed/workflow/categories/unique.artist_genres_pl.parquet [Content-Type=application/octet-stream]...
Copying gs://jt-merlin-scaling/nvt-last5-latest-12/nvt-processed/workflow/categories/unique.artist_name_can.parquet [Content-Type=application/octet-stream]...
Copying gs://jt-merlin-scaling/nvt-last5-latest-12/nvt-processed/workflow/categories/unique.artist_name_pl.parquet [Content-Type=application/octet-stream]...
Copying gs://jt-merlin-scaling/nvt-last5-latest-12/nvt-processed/workflow/categories/unique.collaborative.parquet [Content-Type=application/octet-stream]...
Copying gs://jt-merlin-scaling/nvt-last5-latest-

In [119]:
# check experiment run dir
!gsutil ls $EXPERIMENT_RUN_DIR

gs://jt-merlin-scaling/new9-2tower-merlin-tf-jtv24/run-20230228-170848/
gs://jt-merlin-scaling/new9-2tower-merlin-tf-jtv24/run-20230228-170848/Dockerfile.mm-query-serve
gs://jt-merlin-scaling/new9-2tower-merlin-tf-jtv24/run-20230228-170848/logs/
gs://jt-merlin-scaling/new9-2tower-merlin-tf-jtv24/run-20230228-170848/model_dir/
gs://jt-merlin-scaling/new9-2tower-merlin-tf-jtv24/run-20230228-170848/serving/
gs://jt-merlin-scaling/new9-2tower-merlin-tf-jtv24/run-20230228-170848/workflow/


In [118]:
# check model_dir
!gsutil ls $MODEL_DIR

gs://jt-merlin-scaling/new9-2tower-merlin-tf-jtv24/run-20230228-170848/model_dir/candidate_embeddings/
gs://jt-merlin-scaling/new9-2tower-merlin-tf-jtv24/run-20230228-170848/model_dir/candidate_tower/
gs://jt-merlin-scaling/new9-2tower-merlin-tf-jtv24/run-20230228-170848/model_dir/query_tower/


## Build Serving Image

### gcloud ignore (optional)

In [120]:
! gcloud config set gcloudignore/enabled true
# !pwd

Updated property [gcloudignore/enabled].


In [121]:
%%writefile .gcloudignore
.gcloudignore
/archive/
/imgs/
/mm_src/
src/process_pipes/*
src/preprocessor/*
src/train_pipes/*
src/trainer/*
/local_workflow/*
# *.json
*.ipynb
.git
.github
.ipynb_checkpoints/*
*__pycache__
*cpython-37.pyc
README.md
src/Dockerfile.triton-cpr
src/Dockerfile.merlintf-22_12_v4
src/Dockerfile.nvt
src/Dockerfile.train
src/Dockerfile.nvt-133
Dockerfile
/app/*
src/Dockerfile.merlin-retriever
custom_container_pipeline_spec.json
nvt-parquet-full-1a100.json
nvt-parquet-latest-12.json
nvt-parquet-full-4t4.json
nvt-parquet-full-2a100.json

Overwriting .gcloudignore


In [122]:
!gcloud meta list-files-for-upload

.gitignore
src/cloudbuild.yaml
src/Dockerfile.mm-query-serve
src/serving/app/requirements.txt
src/serving/app/dataset_to_tensors.py
src/serving/app/__init__.py
src/serving/app/instances.json
src/serving/app/predictor.py
src/serving/app/main.py
src/serving/app/prestart.sh
src/serving/local_workflow/workflow/metadata.json
src/serving/local_workflow/workflow/workflow.pkl
src/serving/local_workflow/workflow/categories/unique.track_uri_pl.parquet
src/serving/local_workflow/workflow/categories/unique.collaborative.parquet
src/serving/local_workflow/workflow/categories/unique.track_name_can.parquet
src/serving/local_workflow/workflow/categories/unique.artist_name_can.parquet
src/serving/local_workflow/workflow/categories/unique.track_name_pl.parquet
src/serving/local_workflow/workflow/categories/unique.track_uri_can.parquet
src/serving/local_workflow/workflow/categories/unique.artist_genres_can.parquet
src/serving/local_workflow/workflow/categories/unique.album_name_pl.parquet
src/serving/loc

### submit to Cloud Build

In [123]:
# !pwd
SERVING_VERSION = 'v27'

In [124]:
# Docker definitions for training
IMAGE_NAME = f'mm2t-vertex-serv-{SERVING_VERSION}'
IMAGE_URI = f'gcr.io/{PROJECT_ID}/{IMAGE_NAME}'

DOCKERNAME = f'{SERVING_DOCKERNAME}'
MACHINE_TYPE ='e2-highcpu-32'
FILE_LOCATION = './src'

print(f"IMAGE_NAME: {IMAGE_NAME}")
print(f"IMAGE_URI: {IMAGE_URI}")
print(f"DOCKERNAME: {DOCKERNAME}")
print(f"FILE_LOCATION: {FILE_LOCATION}")
print(f"MACHINE_TYPE: {MACHINE_TYPE}")

IMAGE_NAME: mm2t-vertex-serv-v27
IMAGE_URI: gcr.io/hybrid-vertex/mm2t-vertex-serv-v27
DOCKERNAME: mm-query-serve
FILE_LOCATION: ./src
MACHINE_TYPE: e2-highcpu-32


In [125]:
! gcloud builds submit --config src/cloudbuild.yaml \
    --substitutions _DOCKERNAME=$DOCKERNAME,_IMAGE_URI=$IMAGE_URI,_FILE_LOCATION=$FILE_LOCATION \
    --timeout=2h \
    --machine-type=$MACHINE_TYPE

Creating temporary tarball archive of 25 file(s) totalling 284.9 MiB before compression.
Uploading tarball of [.] to [gs://hybrid-vertex_cloudbuild/source/1678202800.337596-85a843a85d4d4760a913dcf6062b93a8.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/hybrid-vertex/locations/global/builds/ff893e96-f561-4a38-b917-de0424012a55].
Logs are available at [ https://console.cloud.google.com/cloud-build/builds/ff893e96-f561-4a38-b917-de0424012a55?project=934903580331 ].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "ff893e96-f561-4a38-b917-de0424012a55"

FETCHSOURCE
Fetching storage object: gs://hybrid-vertex_cloudbuild/source/1678202800.337596-85a843a85d4d4760a913dcf6062b93a8.tgz#1678202905110717
Copying gs://hybrid-vertex_cloudbuild/source/1678202800.337596-85a843a85d4d4760a913dcf6062b93a8.tgz#1678202905110717...
\ [1 files][117.6 MiB/117.6 MiB]                                                
Operation completed over 1 objects/11

In [146]:
IMAGE_URI

'gcr.io/hybrid-vertex/mm2t-vertex-serv-v27'

# Deploy Query Model to Vertex

## Upload to Model Regsitry

* `parent_model` [src](https://github.com/googleapis/python-aiplatform/blob/main/google/cloud/aiplatform/models.py#L2831) code

In [127]:
MODEL_DEPLOY_VERSION = "v27"

In [128]:
MODEL_DISPLAY_NAME=f"mm-qtower-{MODEL_DEPLOY_VERSION}"
MODEL_ARTIFACT_URI = f'{MODEL_DIR}/query_tower'

print(f"MODEL_ARTIFACT_URI: {MODEL_ARTIFACT_URI}")

MODEL_ARTIFACT_URI: gs://jt-merlin-scaling/new9-2tower-merlin-tf-jtv24/run-20230228-170848/model_dir/query_tower


In [129]:
# existing model resource
# MODEL_URI = 'projects/934903580331/locations/us-central1/models/2330386307768909824@1'
# MODEL_URI = "projects/934903580331/locations/us-central1/models/3574305593713754112" # 50 epoch
# model = vertex_ai.Model(MODEL_URI)

# vertex_ai.Model.delete(model)

# model

In [130]:
start = time.process_time()

model = vertex_ai.Model.upload(
        display_name=MODEL_DISPLAY_NAME,
        artifact_uri=MODEL_ARTIFACT_URI,
        serving_container_image_uri=IMAGE_URI,
        serving_container_predict_route='/predict',
        serving_container_health_route='/health',
        serving_container_command=["sh", "-c", "uvicorn main:app --host 0.0.0.0 --port $AIP_HTTP_PORT"],
        serving_container_args='--gpus all',
        # parent_model=PARENT_MODEL,
        sync=True,
    )

print(f"Created model in {round((time.process_time() - start),2)} seconds\n")

Creating Model
Create Model backing LRO: projects/934903580331/locations/us-central1/models/2551484901975130112/operations/4643987264869761024
Model created. Resource name: projects/934903580331/locations/us-central1/models/2551484901975130112@1
To use this Model in another session:
model = aiplatform.Model('projects/934903580331/locations/us-central1/models/2551484901975130112@1')
Created model in 0.26 seconds



## Deploy to Vertex AI Endpoint

### deployment config

In [132]:
# service account for predictions
VERTEX_SA = '934903580331-compute@developer.gserviceaccount.com'

# model and endpoint display names
# MODEL_DISPLAY_NAME=f"mm-retrieval-{VERSION_serving}"
DEPLOYED_MODEL_DISPLAY_NAME=f"nb-deployed-{MODEL_DISPLAY_NAME}"
ENDPOINT_DISPLAY_NAME = f"mm-endpoint-{MODEL_DEPLOY_VERSION}"

# Endpoint resource config
# DEPLOY_COMPUTE="n1-standard-4"
# DEPLOY_GPU="NVIDIA_TESLA_T4"
# DEPLOY_NGPU=1
# MIN_NODES=1
# MAX_NODES=1
# TRAFFIC=100
traffic_percentage = 100
machine_type = "n1-highmem-16"
accelerator_type = "NVIDIA_TESLA_T4"
accelerator_count = 1
min_replica_count = 1
max_replica_count = 1

print(f"DEPLOYED_MODEL_DISPLAY_NAME: {DEPLOYED_MODEL_DISPLAY_NAME}")
print(f"ENDPOINT_DISPLAY_NAME: {ENDPOINT_DISPLAY_NAME}")

DEPLOYED_MODEL_DISPLAY_NAME: nb-deployed-mm-qtower-v27
ENDPOINT_DISPLAY_NAME: mm-endpoint-v27


### create model endpoint

In [133]:
# ENDPOINT_URI = 'projects/934903580331/locations/us-central1/endpoints/2071809760218316800'
# endpoint = vertex_ai.Endpoint(ENDPOINT_URI)
# vertex_ai.Endpoint.delete(endpoint)

In [134]:
start = time.process_time()

endpoint = vertex_ai.Endpoint.create(
    display_name=ENDPOINT_DISPLAY_NAME,
    project=PROJECT_ID,
    location=LOCATION,
)

print(f"Created endpoint in {round((time.process_time() - start),2)} seconds\n")

print(endpoint)

Creating Endpoint
Create Endpoint backing LRO: projects/934903580331/locations/us-central1/endpoints/5183797102731329536/operations/5279648678150144
Endpoint created. Resource name: projects/934903580331/locations/us-central1/endpoints/5183797102731329536
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/934903580331/locations/us-central1/endpoints/5183797102731329536')
Created endpoint in 0.04 seconds

<google.cloud.aiplatform.models.Endpoint object at 0x7f4a03ac4950> 
resource name: projects/934903580331/locations/us-central1/endpoints/5183797102731329536


### deploy model to endpoint

In [135]:
start = time.process_time()

model.deploy(
    endpoint=endpoint,
    deployed_model_display_name=DEPLOYED_MODEL_DISPLAY_NAME,
    machine_type=machine_type,
    min_replica_count=min_replica_count,
    max_replica_count=max_replica_count,
    traffic_percentage=traffic_percentage,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    service_account=VERTEX_SA,
    sync=True,
    # deploy_request_timeout=1800
)

print(f"Deployed model to endpoint in {round((time.process_time() - start),2)} seconds\n")

print(model)

Deploying model to Endpoint : projects/934903580331/locations/us-central1/endpoints/5183797102731329536
Deploy Endpoint model backing LRO: projects/934903580331/locations/us-central1/endpoints/5183797102731329536/operations/8101062928830038016
Endpoint model deployed. Resource name: projects/934903580331/locations/us-central1/endpoints/5183797102731329536
Deployed model to endpoint in 0.66 seconds

<google.cloud.aiplatform.models.Model object at 0x7f49f31c3550> 
resource name: projects/934903580331/locations/us-central1/models/2551484901975130112


#### check endpoint

In [136]:
print(endpoint.gca_resource.deployed_models[0])

id: "3433005155405004800"
model: "projects/934903580331/locations/us-central1/models/2551484901975130112"
display_name: "nb-deployed-mm-qtower-v27"
create_time {
  seconds: 1678203793
  nanos: 200988000
}
dedicated_resources {
  machine_spec {
    machine_type: "n1-highmem-16"
    accelerator_type: NVIDIA_TESLA_T4
    accelerator_count: 1
  }
  min_replica_count: 1
  max_replica_count: 1
}
service_account: "934903580331-compute@developer.gserviceaccount.com"
model_version_id: "1"



## Test Deployed Endpoint

### Define endpoint

In [147]:
ENDPOINT_URI = 'projects/934903580331/locations/us-central1/endpoints/5374777874429509632'
endpoint = vertex_ai.Endpoint(ENDPOINT_URI)

# vertex_ai.Endpoint.delete(endpoint)

In [148]:
print(endpoint.gca_resource.deployed_models[0])

id: "7554924714355851264"
model: "projects/934903580331/locations/us-central1/models/149940400679813120"
display_name: "merlin-query-tower-jtv03"
create_time {
  seconds: 1678239690
  nanos: 44023000
}
dedicated_resources {
  machine_spec {
    machine_type: "n1-standard-4"
    accelerator_type: NVIDIA_TESLA_T4
    accelerator_count: 1
  }
  min_replica_count: 1
  max_replica_count: 1
}
service_account: "934903580331-compute@developer.gserviceaccount.com"
model_version_id: "1"



### create sample test instance

In [149]:
TEST_INSTANCE = {
    'collaborative': 'false',
    'album_name_pl': [
        "There's Really A Wolf", 'Late Nights: The Album','American Teen', 'Crazy In Love', 'Pony'
    ], 
    'artist_genres_pl': [
        "'hawaiian hip hop', 'rap'",
       "'chicago rap', 'dance pop', 'pop', 'pop rap', 'r&b', 'southern hip hop', 'trap', 'urban contemporary'",
       "'pop', 'pop r&b'", "'dance pop', 'pop', 'r&b'",
       "'chill r&b', 'pop', 'pop r&b', 'r&b', 'urban contemporary'"
    ], 
    'artist_name_pl': [
        'Russ', 'Jeremih', 'Khalid', 'Beyonc\xc3\xa9','William Singe'
    ], 
    'artist_pop_can': 82.0, 
    'description_pl': '', 
    'duration_ms_songs_pl': [
        237506.0, 217200.0, 219080.0, 226400.0, 121739.0
    ], 
    'n_songs_pl': 8.0, 
    'name': 'Lit Tunes ', 
    'num_albums_pl': 8.0, 
    'num_artists_pl': 8.0, 
    'track_name_pl': [
        'Losin Control', 'Paradise', 'Location','Crazy In Love - Remix', 'Pony'
    ], 
    'track_pop_pl': [
        79.0, 58.0, 83.0, 71.0, 57.0
    ],
    'duration_ms_seed_pl': 51023.1,
    'pid': 1,
    'track_uri_pl': [
        'spotify:track:4cxMGhkinTocPSVVKWIw0d',
        'spotify:track:1wNEBPo3nsbGCZRryI832I',
        'spotify:track:152lZdxL1OR0ZMW6KquMif',
        'spotify:track:2f4IuijXLxYOeBncS60GUD',
        'spotify:track:4Lj8paMFwyKTGfILLELVxt'
    ]
}

### make prediction request

In [150]:
# endpoint.predict(instances=[[TEST_INSTANCE, TEST_INSTANCE]])

In [151]:
start = time.process_time()

playlist_emb = endpoint.predict(instances=[TEST_INSTANCE])

print(f"query conversion: {round((time.process_time() - start),4)} seconds")
print(f"Vector Dimensions: {len(playlist_emb.predictions[0])}\n")

print(f"embeddings: {playlist_emb.predictions}")

query conversion: 0.1045 seconds
Vector Dimensions: 128

embeddings: [[0.0, 0.3523032665252686, 0.0, 0.0, 0.008577216416597366, 0.0, 0.08329428732395172, 0.02887200936675072, 0.0007585510611534119, 0.0, 0.0, 0.001477033831179142, 0.0, 0.06219078227877617, 0.0, 0.0, 0.0, 0.09305078536272049, 0.1470593810081482, 0.1398964375257492, 0.001969851553440094, 0.05529309809207916, 0.03459690138697624, 0.0, 0.05020562559366226, 0.1369166076183319, 0.2387715429067612, 0.02589188329875469, 0.1098095700144768, 0.02495669946074486, 0.0, 0.1223256438970566, 0.0, 0.155764251947403, 0.0, 0.1598586440086365, 0.0, 0.07261146605014801, 0.0, 0.0, 0.0, 0.2138606607913971, 0.1101328507065773, 0.0, 0.06424976885318756, 0.08508362621068954, 0.1797376871109009, 0.04638246819376945, 0.0, 0.00943107157945633, 0.01935089193284512, 0.0, 0.0, 0.0, 0.0, 0.006165824830532074, 0.04977959021925926, 0.08918123692274094, 0.0, 0.0, 0.0, 0.0, 0.06835342943668365, 0.0237956065684557, 0.0, 0.0, 0.088253915309906, 0.0, 0.0, 0.

### prediction latency

In [152]:
%%timeit
endpoint.predict(instances=[[TEST_INSTANCE, TEST_INSTANCE]])

584 ms ± 21.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [153]:
%%timeit
endpoint.predict(instances=[[TEST_INSTANCE]])

550 ms ± 16.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# endpoint.predict(instances=[[TEST_INSTANCE]])

In [161]:
# playlist_emb.predictions

## write test instance to GCS

In [156]:
import pickle as pkl

LOCAL_INSTANCE_FILE = 'merlin_last5_test_instance.pkl'

filehandler = open(LOCAL_INSTANCE_FILE, 'wb')
pkl.dump(TEST_INSTANCE, filehandler)
filehandler.close()

In [160]:
# !gsutil cp $LOCAL_INSTANCE_FILE gs://BUCKET/PATH