# Train Merlin TwoTower model

### Notebook Steps
* Build custom Vertex training container based on NVIDIA NGC Merlin Training container
* Configure and submit Vertex custom training job
* Configure and submit hyperparameter tuning job
* Evaluate results of hyperparameter tuning job

### Negative Sampling

* Merlin provides scalable negative sampling algorithms for the Item Retrieval Task 
* This example uses the in-batch sampling algorithm, which treats the items interacted with by other users in the same mini-batch as negatives

### Setup

In [2]:
import json
import os
import time

from google.cloud import aiplatform as vertex_ai
from google.cloud.aiplatform import hyperparameter_tuning as hpt

In [3]:
# TODO: set the project and region for your environment.
PROJECT_ID = "hybrid-vertex"  # Change to your project ID.
REGION = "us-central1"  # Change to your region.

# TODO: set the service account address.
# Change to your service account with Vertex AI Admin permissions.
VERTEX_SA = "934903580331-compute@developer.gserviceaccount.com"

### For HugeCTR data access

* must be a `/gcs/BUCKET_NAME/...` path for GCSFuse 

In [4]:
# GCSFuse paths to the per-split file lists.
TRAIN_DATA = "/gcs/spotify-merlin-v1/nvt-preprocessing-spotify-v24/nvt-processed/train/_gcs_file_list.txt"
VALID_DATA = "/gcs/spotify-merlin-v1/nvt-preprocessing-spotify-v24/nvt-processed/valid/_gcs_file_list.txt"

# Schema used by the training pipeline.
SCHEMA_PATH = "/gcs/spotify-merlin-v1/nvt-preprocessing-spotify-v24/nvt-defined/train/schema.pbtxt"

# Merlin Datasets (kept for reference, not executed):
# train = MerlinDataset(output_train_dir + "/*.parquet", schema=schema, part_size="500MB")
# valid = MerlinDataset(output_valid_dir + "/*.parquet", schema=schema, part_size="500MB")

In [5]:
# Bucket that holds the workspace.
BUCKET = 'spotify-merlin-v1'

# Naming components; the display name and workspace URI are derived from them.
VERSION = 'v8'  # changed merlin image from "..:07" to "...:06"
MODEL_NAME = 'twotower'
FRAMEWORK = 'merlin-tf'
MODEL_DISPLAY_NAME = f'vertex-{FRAMEWORK}-{MODEL_NAME}-{VERSION}'
WORKSPACE = f'gs://{BUCKET}/{MODEL_DISPLAY_NAME}'

# Training image name and its gcr.io URI.
IMAGE_NAME = f'{FRAMEWORK}-{MODEL_NAME}-training-{VERSION}'
IMAGE_URI = f'gcr.io/{PROJECT_ID}/{IMAGE_NAME}'

# Docker build settings.
# DOCKERNAME = 'hugectr'
DOCKERNAME = 'merlintf'
MACHINE_TYPE = 'e2-highcpu-8'
FILE_LOCATION = './src'

In [25]:
# Local source layout on the notebook VM.
TRAIN_DIR = '/home/jupyter/spotify-merlin/src/training'
SRC_DIR = '/home/jupyter/spotify-merlin/src'

# Artifact Registry repository name and GCS output location for this version.
REPO_NAME = 'spotify-merlin'
GCS_OUTPUT_DIR = f"gs://{BUCKET}/merlin-testing/{VERSION}"

ENDPOINT = "us-central1-aiplatform.googleapis.com"

# Image pushed to Artifact Registry. The URI is built from NODE_IMAGE_NAME
# (previously FRAMEWORK, which left NODE_IMAGE_NAME unused) so that changing
# the image name actually changes the URI. Both are 'merlin-tf' today, so the
# resulting value is unchanged.
NODE_IMAGE_NAME = 'merlin-tf'
NODE_IMAGE_TAG = 'latest'
NODE_IMAGE_URI = (
    f"us-central1-docker.pkg.dev/{PROJECT_ID}/{REPO_NAME}/"
    f"{NODE_IMAGE_NAME}:{NODE_IMAGE_TAG}"
)

# Export the settings as environment variables so that `%%bash` / `!` cells
# can read them (e.g. ${NODE_IMAGE_URI} in the docker build cell).
os.environ.update(
    {
        'PROJECT_ID': PROJECT_ID,
        'ENDPOINT': ENDPOINT,
        'TRAIN_DIR': TRAIN_DIR,
        'SRC_DIR': SRC_DIR,
        'NODE_IMAGE_URI': NODE_IMAGE_URI,
        'GCS_OUTPUT_DIR': GCS_OUTPUT_DIR,
    }
)

In [26]:
%%writefile {TRAIN_DIR}/training.py
import time

while(True):
    time.sleep(60)

Overwriting /home/jupyter/spotify-merlin/src/training/training.py


### Initialize Vertex AI SDK

In [27]:
# Initialize the SDK for this project/region; staging lives under WORKSPACE.
staging_uri = os.path.join(WORKSPACE, 'staging')
vertex_ai.init(project=PROJECT_ID, location=REGION, staging_bucket=staging_uri)

### Create Train Image

In [28]:
# Show the current working directory of the notebook's shell.
!pwd

/home/jupyter/spotify-merlin/src


In [29]:
# REPO_DOCKER_PATH_PREFIX = 'src'

> `RUN pip install merlin-models==0.6.0`

In [30]:
# %%writefile {REPO_DOCKER_PATH_PREFIX}/Dockerfile.{DOCKERNAME}

# FROM nvcr.io/nvidia/merlin/merlin-tensorflow:22.07

# WORKDIR /src

# RUN pip install -U pip
# RUN pip install google-cloud-bigquery gcsfs cloudml-hypertune
# RUN pip install google-cloud-aiplatform kfp
# RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg  add - && apt-get update -y && apt-get install google-cloud-sdk -y

# COPY training/* ./

# ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/compat/lib.real:/usr/local/hugectr/lib:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib:/repos/dist/lib

## Rob code

### Create Dockerfile

In [34]:
# Dockerfile contents for the training image. This is a plain (non-f) string:
# it has no Python placeholders, and `$LD_LIBRARY_PATH` must reach Docker
# verbatim.
dockerfile = """

FROM nvcr.io/nvidia/merlin/merlin-tensorflow:22.07

WORKDIR /src

RUN pip install -U pip
RUN pip install google-cloud-bigquery gcsfs cloudml-hypertune
RUN pip install google-cloud-aiplatform kfp
RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg  add - && apt-get update -y && apt-get install google-cloud-sdk -y

COPY training/* ./

ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/compat/lib.real:/usr/local/hugectr/lib:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib:/repos/dist/lib
ENTRYPOINT ["python", "training.py"]

"""

dockerfile_path = f'{SRC_DIR}/Dockerfile'
print(f'Writing {dockerfile_path}...')

# Write the text in one call; the `with` block closes the file, so no explicit
# close() is needed, and write-only mode is enough because nothing is read back.
with open(dockerfile_path, "w") as dfile:
    dfile.write(dockerfile)

Writing /home/jupyter/spotify-merlin/src/Dockerfile...


### Push to Artifact Registry

In [35]:
# Switch into the source directory and echo the resulting working directory.
os.chdir(SRC_DIR)
os.getcwd()

'/home/jupyter/spotify-merlin/src'

In [36]:
# Display the image URI that the docker build/push cell will tag and push.
NODE_IMAGE_URI

'us-central1-docker.pkg.dev/hybrid-vertex/spotify-merlin/merlin-tf:latest'

In [37]:
%%bash
# Stop at the first failing command (and on an unset variable) so that a
# failed build is never followed by a push.
set -euo pipefail

# Build the training image from ./Dockerfile, using the current directory as
# the build context.
docker build -f Dockerfile -t "${NODE_IMAGE_URI}" ./

# Push the image to the registry in NODE_IMAGE_URI.
# NOTE(review): a "denied ... (or it may not exist)" error from the push
# suggests the target repository is missing or Docker is not authenticated
# for that registry host -- confirm both before re-running.
docker push "${NODE_IMAGE_URI}"

Sending build context to Docker daemon  629.8kB
Step 1/9 : FROM nvcr.io/nvidia/merlin/merlin-tensorflow:22.07
 ---> b5324e8a331b
Step 2/9 : WORKDIR /src
 ---> Using cache
 ---> cb43a6df9d1b
Step 3/9 : RUN pip install -U pip
 ---> Using cache
 ---> 7ccb69b04e52
Step 4/9 : RUN pip install google-cloud-bigquery gcsfs cloudml-hypertune
 ---> Using cache
 ---> 9696f57b9ab6
Step 5/9 : RUN pip install google-cloud-aiplatform kfp
 ---> Using cache
 ---> 093b8ad48f5a
Step 6/9 : RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg  add - && apt-get update -y && apt-get install google-cloud-sdk -y
 ---> Using cache
 ---> f3056de4944a
Step 7/9 : COPY training/* ./
 ---> Using cache
 ---> 83fbf75da887
Step 8/9 : ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/c

denied: Permission "artifactregistry.repositories.downloadArtifacts" denied on resource "projects/hybrid-vertex/locations/us-central1/repositories/spotify-merlin" (or it may not exist)


CalledProcessError: Command 'b'docker build -f Dockerfile -t ${NODE_IMAGE_URI} ./\ndocker push ${NODE_IMAGE_URI}\n'' returned non-zero exit status 1.

In [23]:
# print(f"DOCKERNAME: {DOCKERNAME}")
# print(f"IMAGE_URI: {IMAGE_URI}")
# print(f"FILE_LOCATION: {FILE_LOCATION}")
# print(f"MACHINE_TYPE: {MACHINE_TYPE}")

DOCKERNAME: merlintf
IMAGE_URI: gcr.io/hybrid-vertex/merlin-tf-twotower-training-v8
FILE_LOCATION: ./src
MACHINE_TYPE: e2-highcpu-8


### Submit a Vertex custom training job

In [24]:
# os.chdir('/home/jupyter/spotify-merlin')
# os.getcwd()

'/home/jupyter/spotify-merlin'

In [25]:
# FILE_LOCATION = './src'
# ! gcloud builds submit --config src/cloudbuild.yaml --substitutions _DOCKERNAME=$DOCKERNAME,_IMAGE_URI=$IMAGE_URI,_FILE_LOCATION=$FILE_LOCATION --timeout=2h --machine-type=$MACHINE_TYPE

Creating temporary tarball archive of 65 file(s) totalling 984.5 KiB before compression.
Uploading tarball of [.] to [gs://hybrid-vertex_cloudbuild/source/1661279816.983557-41b4204d47f04e6384ab4bf03f0883a1.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/hybrid-vertex/locations/global/builds/233a0960-4747-4183-9336-be616d315342].
Logs are available at [https://console.cloud.google.com/cloud-build/builds/233a0960-4747-4183-9336-be616d315342?project=934903580331].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "233a0960-4747-4183-9336-be616d315342"

FETCHSOURCE
Fetching storage object: gs://hybrid-vertex_cloudbuild/source/1661279816.983557-41b4204d47f04e6384ab4bf03f0883a1.tgz#1661279817340400
Copying gs://hybrid-vertex_cloudbuild/source/1661279816.983557-41b4204d47f04e6384ab4bf03f0883a1.tgz#1661279817340400...
/ [1 files][129.6 KiB/129.6 KiB]                                                
Operation completed over 1 objects/129.

In [34]:
# Training parameters
# These three values are read when the worker pool spec is built
# (--num_epochs, --max_iter, --per_gpu_batch_size), so they must be defined
# for the notebook to run top to bottom on a fresh kernel.
NUM_EPOCHS = 2
MAX_ITERATIONS = 25000
PER_GPU_BATCH_SIZE = 2048

# Not currently passed to the training job; kept for reference.
# EVAL_INTERVAL = 1000
# EVAL_BATCHES = 500
# EVAL_BATCHES_FINAL = 2500
# DISPLAY_INTERVAL = 200
# SNAPSHOT_INTERVAL = 0
# LR = 0.001
# DROPOUT_RATE = 0.5
# NUM_WORKERS = 12
# LAYER_SIZES='[1024,512,256]'

In [35]:
# layers = json.dumps([list(f"{LAYER_SIZES}")]).replace(' ','')
# layers

'[["[","1","0","2","4",",","5","1","2",",","2","5","6","]"]]'

## Vertex Training

* See [here](https://cloud.google.com/vertex-ai/docs/training/configure-compute#specifying_gpus) for GPU config options

In [36]:
# Directory-level GCSFuse paths; the `/_gcs_file_list.txt` suffix is dropped
# here, so these replace any earlier TRAIN_DATA / VALID_DATA values.
TRAIN_DATA = "/gcs/spotify-merlin-v1/nvt-preprocessing-spotify-v24/nvt-processed/train"
VALID_DATA = "/gcs/spotify-merlin-v1/nvt-preprocessing-spotify-v24/nvt-processed/valid"

# Workflow directory, given as a gs:// URI under BUCKET.
WORKFLOW_DIR = f'gs://{BUCKET}/nvt-preprocessing-spotify-v24/nvt-analyzed'

In [37]:
# Larger GPU config, kept for reference:
# MACHINE_TYPE = 'a2-highgpu-1g'
# ACCELERATOR_TYPE = 'NVIDIA_TESLA_A100'
# ACCELERATOR_NUM = 1

# Smaller GPU config
MACHINE_TYPE = "n1-standard-16"
ACCELERATOR_TYPE = "NVIDIA_TESLA_T4"
ACCELERATOR_NUM = 1

# GPU indices as a compact JSON list nested in an outer list, e.g. "[[0]]".
gpus = json.dumps([list(range(ACCELERATOR_NUM))], separators=(',', ':'))

# Command-line flags handed to the training container.
# Flags currently disabled: --layer_sizes, --slot_size_array,
# --max_eval_batches, --eval_batches, --dropout_rate, --lr, --num_workers,
# --eval_interval, --snapshot, --display_interval.
container_args = [
    f'--per_gpu_batch_size={PER_GPU_BATCH_SIZE}',
    f'--model_name={MODEL_NAME}',
    f'--train_dir={TRAIN_DATA}',
    f'--valid_dir={VALID_DATA}',
    f'--schema={SCHEMA_PATH}',
    f'--workflow_dir={WORKFLOW_DIR}',
    f'--max_iter={MAX_ITERATIONS}',
    f'--num_epochs={NUM_EPOCHS}',
    f'--gpus={gpus}',
]

# Single worker pool with one replica on the machine/accelerator chosen above.
# NOTE(review): the container image is IMAGE_URI -- confirm that this is the
# image that was actually built and pushed.
machine_spec = {
    "machine_type": MACHINE_TYPE,
    "accelerator_type": ACCELERATOR_TYPE,
    "accelerator_count": ACCELERATOR_NUM,
}
worker_pool_specs = [
    {
        "machine_spec": machine_spec,
        "replica_count": 1,
        "container_spec": {
            "image_uri": IMAGE_URI,
            "command": ["python", "-m", "train_task"],
            "args": container_args,
        },
    }
]

In [38]:
# Pretty-printer from the standard library, used to inspect the nested spec.
from pprint import pprint

# Show the fully rendered worker pool spec (all f-string flags expanded).
pprint(worker_pool_specs)

[{'container_spec': {'args': ['--per_gpu_batch_size=2048',
                              '--model_name=twotower',
                              '--train_dir=/gcs/spotify-merlin-v1/nvt-preprocessing-spotify-v24/nvt-processed/train',
                              '--valid_dir=/gcs/spotify-merlin-v1/nvt-preprocessing-spotify-v24/nvt-processed/valid',
                              '--schema=/gcs/spotify-merlin-v1/nvt-preprocessing-spotify-v24/nvt-defined/train/schema.pbtxt',
                              '--workflow_dir=gs://spotify-merlin-v1/nvt-preprocessing-spotify-v24/nvt-analyzed',
                              '--max_iter=25000',
                              '--num_epochs=2',
                              '--gpus=[[0]]'],
                     'command': ['python', '-m', 'train_task'],
                     'image_uri': 'gcr.io/hybrid-vertex/merlin-tf-twotower-training-v8'},
  'machine_spec': {'accelerator_count': 1,
                   'accelerator_type': 'NVIDIA_TESLA_T4',
          

### Submit and monitor train job

In [39]:
# Timestamped job name; the job's output directory sits under WORKSPACE.
timestamp = time.strftime("%Y%m%d_%H%M%S")
job_name = 'merlin_towers_{}'.format(timestamp)
base_output_dir = os.path.join(WORKSPACE, job_name)

# Define the custom job from the worker pool spec.
job = vertex_ai.CustomJob(
    display_name=job_name,
    worker_pool_specs=worker_pool_specs,
    base_output_dir=base_output_dir,
)

# Launch the job under the configured service account.
run_options = {
    "sync": True,
    "service_account": VERTEX_SA,
    "restart_job_on_worker_restart": False,
    "enable_web_access": True,
}
job.run(**run_options)

Creating CustomJob
CustomJob created. Resource name: projects/934903580331/locations/us-central1/customJobs/1394951776548945920
To use this CustomJob in another session:
custom_job = aiplatform.CustomJob.get('projects/934903580331/locations/us-central1/customJobs/1394951776548945920')
View Custom Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/1394951776548945920?project=934903580331
CustomJob projects/934903580331/locations/us-central1/customJobs/1394951776548945920 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/934903580331/locations/us-central1/customJobs/1394951776548945920 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/934903580331/locations/us-central1/customJobs/1394951776548945920 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/934903580331/locations/us-central1/customJobs/1394951776548945920 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/934903580331/locations/us-central1/customJobs/1394951

RuntimeError: Job failed with:
code: 3
message: "The replica workerpool0-0 exited with a non-zero status of 1. To find out more about why your job exited please check the logs: https://console.cloud.google.com/logs/viewer?project=934903580331&resource=ml_job%2Fjob_id%2F1394951776548945920&advancedFilter=resource.type%3D%22ml_job%22%0Aresource.labels.job_id%3D%221394951776548945920%22"
