# Train and Deploy Merlin models with Vertex AI

In [2]:
# !pip install kfp
# !pip install google-cloud-pipeline-components 

In [6]:
# ! python3 -c "import kfp; print('KFP SDK version: {}'.format(kfp.__version__))"
# ! python3 -c "import google_cloud_pipeline_components; print('google_cloud_pipeline_components version: {}'.format(google_cloud_pipeline_components.__version__))"
# ! python3 -c "import google.cloud.aiplatform; print('aiplatform SDK version: {}'.format(google.cloud.aiplatform.__version__))"

In [7]:
# Project / region / service-account configuration for this notebook
PROJECT_ID = "hybrid-vertex"
LOCATION = "us-central1"
VERTEX_SA = "934903580331-compute@developer.gserviceaccount.com"

!gcloud config set project {PROJECT_ID}

Updated property [core/project].


In [9]:
import os
import json
from datetime import datetime
from time import time
import pandas as pd
# NOTE: logging.disable(logging.WARNING) below silences WARNING, INFO and DEBUG records everywhere
import logging
import time
from pprint import pprint

logging.disable(logging.WARNING)

from google.cloud import aiplatform as vertex_ai
from google.cloud import storage

# Pipelines
from typing import Any, Callable, Dict, NamedTuple, Optional, List
from google_cloud_pipeline_components import aiplatform as gcc_aip
from google_cloud_pipeline_components.types import artifact_types

# Kubeflow SDK
# TODO: fix these
from kfp.v2 import dsl
import kfp
import kfp.v2.dsl
from kfp.v2.google import client as pipelines_client
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,
                        OutputPath, component)

# Cloud Storage client bound to the configured project
storage_client = storage.Client(
    project=PROJECT_ID,
)

# Initialise the Vertex AI SDK with the configured project and region
vertex_ai.init(
    project=PROJECT_ID,
    location=LOCATION,
)

In [None]:
# Per-run identifiers: TIMESTAMP is taken once and reused in the run name
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"
VERSION = "v1"  # component code
RUN = "pipe-run-" + TIMESTAMP

# Storage locations
PIPELINE_ROOT = "pipelines_root"
# NOTE(review): train_merlin writes to bucket 'jt-merlin-scaling' while this
# cell uses 'jt-scaling-merlin' — confirm which name is intended.
BUCKET = "jt-scaling-merlin"
BUCKET_URI = "gs://" + BUCKET

In [None]:
# Local directories (relative to the working directory) that receive the
# component source files written by the %%writefile cells
REPO_DOCKER_PATH_PREFIX = "src"
PIPELINES_SUB_DIR = "train_pipes"

In [None]:
# Start from an empty component directory on every run.
# `-p` creates the parent directory too, so this cell also works on a fresh
# checkout where {REPO_DOCKER_PATH_PREFIX}/ does not exist yet.
! rm -rf {REPO_DOCKER_PATH_PREFIX}/{PIPELINES_SUB_DIR}
! mkdir -p {REPO_DOCKER_PATH_PREFIX}/{PIPELINES_SUB_DIR}

# Pipeline Components

In [10]:
# Show the current working directory; the %%writefile targets below are
# relative paths, so they are resolved against it.
os.getcwd()

'/home/jupyter/jt-merlin/merlin-on-vertex'

## Build Custom Image

In [None]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/{PIPELINES_SUB_DIR}/build_custom_image.py

import kfp
from typing import Any, Callable, Dict, NamedTuple, Optional, List
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,
                        OutputPath, component, Metrics)
@kfp.v2.dsl.component(
    base_image="gcr.io/google.com/cloudsdktool/cloud-sdk:latest",
    packages_to_install=[
        "google-cloud-build"
    ],
)
def build_custom_image(
    project: str,
    artifact_gcs_path: str,
    docker_name: str,
    app_dir_name: str,
    custom_image_uri: str,
) -> NamedTuple('Outputs', [
    ('custom_image_uri', str),
]):
    """
    Custom pipeline component that builds and pushes a container image with
    Cloud Build, using the training/serving application code and the
    dependencies defined in a Dockerfile, both staged on Cloud Storage.

    Args:
        project: project ID the Cloud Build job is created in.
        artifact_gcs_path: GCS location that holds the Dockerfile and the
            application directory.
        docker_name: file name of the Dockerfile under `artifact_gcs_path`;
            it is copied into the build workspace as `Dockerfile`.
        app_dir_name: name of the application directory under
            `artifact_gcs_path` (e.g. "trainer"); it is copied recursively
            into the build workspace.
        custom_image_uri: image URI used both as the `docker build` tag and
            as the `docker push` target.

    Returns:
        A one-element tuple `(custom_image_uri,)` echoing the input URI so
        downstream steps can consume it as a step output.
    """
    # TODO: make output Artifact for image_uri

    import logging
    import os

    from google.cloud.devtools import cloudbuild_v1 as cloudbuild
    from google.protobuf.duration_pb2 import Duration

    # initialize client for cloud build
    logging.getLogger().setLevel(logging.INFO)
    build_client = cloudbuild.services.cloud_build.CloudBuildClient()

    # parse step inputs to get path to Dockerfile and training application code
    _gcs_dockerfile_path = os.path.join(artifact_gcs_path, f"{docker_name}") # Dockerfile.XXXXX
    _gcs_script_dir_path = os.path.join(artifact_gcs_path, f"{app_dir_name}/") # "trainer/"

    logging.info(f"_gcs_dockerfile_path: {_gcs_dockerfile_path}")
    logging.info(f"_gcs_script_dir_path: {_gcs_script_dir_path}")

    # define build steps to pull the application code and Dockerfile
    # and build/push the custom container image
    build = cloudbuild.Build()
    build.steps = [
        {
            "name": "gcr.io/cloud-builders/gsutil",
            "args": ["cp", "-r", _gcs_script_dir_path, "."],
        },
        {
            "name": "gcr.io/cloud-builders/gsutil",
            "args": ["cp", _gcs_dockerfile_path, "Dockerfile"],
        },
        # A Kaniko-based build (layer caching + automatic push) was tried
        # here previously and disabled in favour of plain docker build/push:
        # https://cloud.google.com/build/docs/kaniko-cache
        {
            "name": "gcr.io/cloud-builders/docker",
            "args": ['build','-t', f'{custom_image_uri}', '.'],
        },
        {
            "name": "gcr.io/cloud-builders/docker",
            "args": ['push', f'{custom_image_uri}'],
        },
    ]
    # override default timeout of 10min
    timeout = Duration()
    timeout.seconds = 7200
    build.timeout = timeout

    # create build
    operation = build_client.create_build(project_id=project, build=build)
    logging.info("IN PROGRESS:")
    logging.info(operation.metadata)

    # block until the build operation finishes, then log its status
    result = operation.result()
    # use a %s placeholder: passing the status as a bare extra argument makes
    # logging report a formatting error instead of printing the status
    logging.info("RESULT: %s", result.status)

    # return step outputs
    return (
        custom_image_uri,
    )

## Train Job

In [None]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/{PIPELINES_SUB_DIR}/train_merlin.py

import kfp
from typing import Any, Callable, Dict, NamedTuple, Optional, List
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,
                        OutputPath, component, Metrics)

@kfp.v2.dsl.component(
    base_image="python:3.9",
    packages_to_install=[
        'google-cloud-aiplatform==1.18.1',
        # 'google-cloud-storage',
    ],
)
def train_merlin(
    project: str,
    location: str,
    version: str,
    train_image_uri: str,     # TODO: Artifact
    tb_resource: str,
    batch_size: str, 
    train_epochs: int,
    train_dir: str,
    valid_dir: str,
    workflow_dir: str,
    experiment_name: str,
    experiment_run: str,
    service_account: str,
    output_bucket: str = 'jt-merlin-scaling',
) -> NamedTuple('Outputs', [
    ('merlin_model_gcs_dir', str),
    ('query_tower_gcs_dir', str),
    ('candidate_tower_gcs_uri', str),
    ('candidate_embeddings_gcs_uri', str),
]):
    """
    Custom pipeline component that submits a Vertex AI CustomJob running
    `python -m trainer.train_task` inside `train_image_uri`, waits for it to
    finish, and returns the GCS locations derived from the run name.

    Args:
        project: project ID used for `vertex_ai.init` and passed to the trainer.
        location: region used for `vertex_ai.init` and passed to the trainer.
        version: suffix of the CustomJob display name.
        train_image_uri: container image the worker pool runs.
        tb_resource: forwarded to the trainer as `--tb_name`.
        batch_size: forwarded to the trainer as `--per_gpu_batch_size`.
        train_epochs: forwarded to the trainer as `--num_epochs`.
        train_dir: forwarded to the trainer as `--train_dir`.
        valid_dir: forwarded to the trainer as `--valid_dir`.
        workflow_dir: forwarded to the trainer as `--workflow_dir`.
        experiment_name: forwarded as `--experiment_name`; also a path
            segment of the staging bucket and of the returned URIs.
        experiment_run: run-name prefix; a timestamp is appended before it is
            forwarded as `--experiment_run`.
        service_account: service account the CustomJob runs as.
        output_bucket: bucket name (no `gs://` prefix) forwarded as
            `--train_output_bucket` and used for staging and the returned
            URIs. Defaults to the previously hard-coded value.

    Returns:
        Tuple of four GCS URIs: model dir, query tower, candidate tower and
        candidate embeddings. They are computed here from the bucket,
        experiment name and run name; this component does not check that the
        training job actually wrote to them.
    """

    import logging
    from google.cloud import aiplatform as vertex_ai
    import time

    # the root logger defaults to WARNING; raise it so the info logs below
    # are emitted (same setup as build_custom_image)
    logging.getLogger().setLevel(logging.INFO)

    vertex_ai.init(
        project=project,
        location=location,
    )

    TIMESTAMP = time.strftime("%Y%m%d-%H%M%S")
    # ====================================================
    # Helper function for workerpool specs
    # ====================================================
    def prepare_worker_pool_specs(
        image_uri,
        # args,
        cmd,
        replica_count=1,
        machine_type="n1-standard-16",
        accelerator_count=1,
        accelerator_type="ACCELERATOR_TYPE_UNSPECIFIED",
        reduction_server_count=0,
        reduction_server_machine_type="n1-highcpu-16",
        reduction_server_image_uri="us-docker.pkg.dev/vertex-ai-restricted/training/reductionserver:latest",
    ):
        """
        Build the `worker_pool_specs` list for a CustomJob: one chief
        replica, an optional pool of `replica_count - 1` workers sharing the
        chief's machine/container spec, and an optional reduction-server pool.
        """

        if accelerator_count > 0:
            machine_spec = {
                "machine_type": machine_type,
                "accelerator_type": accelerator_type,
                "accelerator_count": accelerator_count,
            }
        else:
            machine_spec = {"machine_type": machine_type}

        container_spec = {
            "image_uri": image_uri,
            # "args": args,
            "command": cmd,
        }

        chief_spec = {
            "replica_count": 1,
            "machine_spec": machine_spec,
            "container_spec": container_spec,
        }

        worker_pool_specs = [chief_spec]
        if replica_count > 1:
            workers_spec = {
                "replica_count": replica_count - 1,
                "machine_spec": machine_spec,
                "container_spec": container_spec,
            }
            worker_pool_specs.append(workers_spec)
        # NOTE(review): a count of exactly 1 adds no reduction-server pool,
        # and with replica_count == 1 this pool is appended directly after
        # the chief — confirm both are intended.
        if reduction_server_count > 1:
            workers_spec = {
                "replica_count": reduction_server_count,
                "machine_spec": {
                    "machine_type": reduction_server_machine_type,
                },
                "container_spec": {"image_uri": reduction_server_image_uri},
            }
            worker_pool_specs.append(workers_spec)

        return worker_pool_specs

    # ====================================================
    # Define device strategy
    # ====================================================
    # TODO: parameterize

    WORKER_MACHINE_TYPE = 'a2-highgpu-1g'
    REPLICA_COUNT = 1
    ACCELERATOR_TYPE = 'NVIDIA_TESLA_A100'
    PER_MACHINE_ACCELERATOR_COUNT = 1
    REDUCTION_SERVER_COUNT = 0
    REDUCTION_SERVER_MACHINE_TYPE = "n1-highcpu-16"
    DISTRIBUTE_STRATEGY = 'single'

    # ====================================================
    # # DEFINE ARGS
    # ====================================================
    # TODO: parameterize
    # TODO: a global batch size (`batch_size * 4`?) and layer sizes were
    # defined here but never passed to the trainer; add flags if needed.

    LEARNING_RATE = 0.001

    EXPERIMENT_RUN = f'{experiment_run}-{TIMESTAMP}'

    WORKER_CMD = [
        'sh',
        '-euc',
        f'''pip freeze && python -m trainer.train_task --tb_name={tb_resource} --per_gpu_batch_size={batch_size} \
        --train_output_bucket={output_bucket} --train_dir={train_dir} --valid_dir={valid_dir} --workflow_dir={workflow_dir} \
        --num_epochs={train_epochs} --learning_rate={LEARNING_RATE} --distribute={DISTRIBUTE_STRATEGY} \
        --experiment_name={experiment_name} --experiment_run={EXPERIMENT_RUN} --project={project} --location={location}'''
    ]

    WORKER_POOL_SPECS = prepare_worker_pool_specs(
        image_uri=train_image_uri,
        # args=WORKER_ARGS,
        cmd=WORKER_CMD,
        replica_count=REPLICA_COUNT,
        machine_type=WORKER_MACHINE_TYPE,
        accelerator_count=PER_MACHINE_ACCELERATOR_COUNT,
        accelerator_type=ACCELERATOR_TYPE,
        reduction_server_count=REDUCTION_SERVER_COUNT,
        reduction_server_machine_type=REDUCTION_SERVER_MACHINE_TYPE,
    )
    # ==============================================================================
    # Submit Train Job 
    # ==============================================================================
    STAGING_BUCKET = f'gs://{output_bucket}/{experiment_name}'
    JOB_NAME = f'train-merlin-retrieval-{version}'
    gpu_type = ACCELERATOR_TYPE.lower() # lowercase for labels

    job = vertex_ai.CustomJob(
        display_name=JOB_NAME,
        worker_pool_specs=WORKER_POOL_SPECS,
        staging_bucket=STAGING_BUCKET,
        labels={
            'gpu': f'{gpu_type}',
            'gpu_per_replica' : f'{PER_MACHINE_ACCELERATOR_COUNT}',
            'replica_cnt' : f'{REPLICA_COUNT}',
        }
    )

    # sync=True: block this component until the training job completes
    job.run(
        sync=True, 
        service_account=service_account,
        # tensorboard=EXPERIMENT_TB,
        restart_job_on_worker_restart=False,
        enable_web_access=True,
    )

    # uris set during train script
    WORKING_DIR_GCS_URI = f'gs://{output_bucket}/{experiment_name}/{EXPERIMENT_RUN}'
    MODEL_DIR = f"{WORKING_DIR_GCS_URI}/model-dir"
    QUERY_TOWER_PATH = f"{MODEL_DIR}/query-tower"
    CANDIDATE_TOWER_PATH = f"{MODEL_DIR}/candidate-tower"
    EMBEDDINGS_PATH = f"{MODEL_DIR}/candidate-embeddings"

    logging.info(f'WORKING_DIR_GCS_URI: {WORKING_DIR_GCS_URI}')
    logging.info(f'MODEL_DIR: {MODEL_DIR}')
    logging.info(f'QUERY_TOWER_PATH: {QUERY_TOWER_PATH}')
    logging.info(f'CANDIDATE_TOWER_PATH: {CANDIDATE_TOWER_PATH}')
    logging.info(f'EMBEDDINGS_PATH: {EMBEDDINGS_PATH}')

    return (
        MODEL_DIR,
        QUERY_TOWER_PATH,
        CANDIDATE_TOWER_PATH,
        EMBEDDINGS_PATH,
    )

# Build & Compile Pipeline

### pipe configs

In [None]:
# Version of the pipeline code; it becomes the suffix of the pipeline tag
PIPELINE_VERSION = "v1" # pipeline code
PIPELINE_TAG = "retail-visual-similarity-" + PIPELINE_VERSION
print(f"PIPELINE_TAG: {PIPELINE_TAG}")

## Build pipeline

In [None]:
# NOTE(review): `XXXX` looks like a placeholder rather than a real name, so
# this import presumably fails as written. The component files written
# earlier in this notebook are src/train_pipes/build_custom_image.py and
# src/train_pipes/train_merlin.py — import the components from those modules.
from src.train_pipes import XXXX

# Skeleton of the train-and-deploy pipeline. Only the "build train image"
# step has code; every other section below is an empty placeholder.
@kfp.v2.dsl.pipeline(
    name=f'{VERSION}-{PIPELINE_TAG}'.replace('_', '-')
)
def pipeline(
    project: str,
    project_number: str,
    location: str,
    version:str,
):
    # NOTE(review): none of the four pipeline parameters is used in the body
    # shown here; the build step reads the notebook-level PROJECT_ID instead
    # of `project`.
    
    from kfp.v2.components import importer_node
    from google_cloud_pipeline_components.types import artifact_types
    # ========================================================================
    # TODO: data processing steps
    # ========================================================================
    
    
    
    # ========================================================================
    # Build TRAIN Image
    # ========================================================================
    # NOTE(review): the build_custom_image component defined in this notebook
    # takes (project, artifact_gcs_path, docker_name, app_dir_name,
    # custom_image_uri). The keywords `gcs_train_script_path` and
    # `training_image_uri` below do not match that signature, and the names
    # `gcs_train_script_path` and `TRAIN_IMAGE` are not defined in the cells
    # shown here — confirm and update before compiling.
    build_custom_train_image_op = (
        build_custom_image(
            project=PROJECT_ID,
            gcs_train_script_path=gcs_train_script_path,
            training_image_uri=TRAIN_IMAGE,
        )
        .set_display_name("Build custom train image")
        .set_caching_options(True)
    )
    
    
    
    # ========================================================================
    # Train Merlin Towers
    # ========================================================================
    
    
    
    # ========================================================================
    # Build SERVING Image
    # ========================================================================
    
    
    
    
    # ========================================================================
    # Import Trained Towers to Pipeline DAG
    # ========================================================================
    
    
    
    # ========================================================================
    # Upload Models to Vertex AI Model Registry
    # ========================================================================
    
    
    
    # ========================================================================
    # Deploy Model to Endpoint
    # ========================================================================
    
    
    
    # ========================================================================
    # Vertex Matching Engine Steps
    # ========================================================================

In [None]:
# TRAIN_IMAGE_ID = json.dumps(str(build_custom_train_image_op.outputs['training_image_uri']).replace("'",'"'))

In [None]:
#   import_unmanaged_query_model_task = (
#       importer_node.importer(
#           artifact_uri=QUERY_MODEL_DIR,
#           artifact_class=artifact_types.UnmanagedContainerModel,
#           metadata={
#               'containerSpec': {
#                   'imageUri': 'us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-6:latest',
#               },
#           },
#       )
#       .set_display_name("Import Query Tower")
#       .after(run_train_task_op)
#   )

#   query_model_upload_op = (
#       gcc_aip.ModelUploadOp(
#           project=project,
#           location=location,
#           display_name=QUERY_MODEL_DISPLAY_NAME,
#           unmanaged_container_model=import_unmanaged_query_model_task.outputs["artifact"],
#           labels={"version": VERSION, "tower": "query", "model_endpoint_name": MODEL_ENDPOINT_NAME}, # replace with cfg.MODEL_ENDPOINT_NAME '2tower-recsys-pipe-v4-model-endpoint'
#       )
#       # .after(import_unmanaged_query_model_task)
#       .set_display_name("Upload Query Tower to Vertex")
#   )

## Compile Pipeline

## Submit Pipeline Job