# Train Merlin Two-Towers

### pip & package

In [1]:
import os
# import nvtabular as nvt
from time import time
import pandas as pd
# disable INFO and DEBUG logging everywhere
import logging
import time
from pprint import pprint

logging.disable(logging.WARNING)

# from nvtabular.ops import (
#     Categorify,
#     TagAsUserID,
#     TagAsItemID,
#     TagAsItemFeatures,
#     TagAsUserFeatures,
#     AddMetadata,
#     ListSlice
# )
# import nvtabular.ops as ops

# from merlin.schema.tags import Tags

# import merlin.models.tf as mm
# from merlin.io.dataset import Dataset
# from merlin.io.dataset import Dataset as MerlinDataset
# from merlin.models.utils.example_utils import workflow_fit_transform
# import tensorflow as tf

from google.cloud import aiplatform as vertex_ai
# from google.cloud.aiplatform import hyperparameter_tuning as hpt

# for running this example on CPU, comment out the line below
# os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"

### Setup

In [2]:
# Ask the gcloud CLI for the active project; the shell capture returns a list of output lines.
GCP_PROJECTS = !gcloud config get-value project
PROJECT_ID = GCP_PROJECTS[0]
# Look up the project number for PROJECT_ID (again captured as a list of output lines).
PROJECT_NUM = !gcloud projects list --filter="$PROJECT_ID" --format="value(PROJECT_NUMBER)"
PROJECT_NUM = PROJECT_NUM[0]
LOCATION = 'us-central1'

# Echo the resolved values so the run is self-documenting.
print(f"PROJECT_ID: {PROJECT_ID}")
print(f"PROJECT_NUM: {PROJECT_NUM}")
print(f"LOCATION: {LOCATION}")

PROJECT_ID: hybrid-vertex
PROJECT_NUM: 934903580331
LOCATION: us-central1


In [3]:
# TODO: Service Account address
# Defaults to the project's Compute Engine default service account, built from
# PROJECT_NUM (resolved in the setup cell) instead of a hard-coded project number.
# Change to your own service account with Vertex AI Admin permissions if needed.
VERTEX_SA = f'{PROJECT_NUM}-compute@developer.gserviceaccount.com'

In [4]:
# Bucket definitions
# GCS bucket name (without the gs:// prefix) used to build WORKSPACE below.
BUCKET = 'jt-merlin-scaling' # 'spotify-merlin-v1'

# Naming components; bump VERSION to get a fresh display name / workspace path.
VERSION = 'jtv34'
MODEL_NAME = '2tower'
FRAMEWORK = 'merlin-tf'
# Display name is assembled from the components above.
MODEL_DISPLAY_NAME = f'vertex-{FRAMEWORK}-{MODEL_NAME}-{VERSION}'
# GCS URI under BUCKET named after MODEL_DISPLAY_NAME.
WORKSPACE = f'gs://{BUCKET}/{MODEL_DISPLAY_NAME}'

# # Docker definitions for training
# IMAGE_NAME = f'{FRAMEWORK}-{MODEL_NAME}-training-{VERSION}'
# IMAGE_URI = f'gcr.io/{PROJECT_ID}/{IMAGE_NAME}'
# # DOCKERNAME = 'hugectr'
# DOCKERNAME = 'merlintf'
# MACHINE_TYPE ='e2-highcpu-32'
# FILE_LOCATION = './src'

# Training Package

In [5]:
REPO_DOCKER_PATH_PREFIX = 'src'
TRAIN_SUB_DIR = 'trainer'

In [6]:
# Make the training subfolder
! rm -rf {REPO_DOCKER_PATH_PREFIX}/{TRAIN_SUB_DIR}
! mkdir {REPO_DOCKER_PATH_PREFIX}/{TRAIN_SUB_DIR}

In [7]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/{TRAIN_SUB_DIR}/__init__.py
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

Writing src/trainer/__init__.py


## Interactive Train Shell

In [8]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/{TRAIN_SUB_DIR}/interactive_train.py

import time

# Never exits: sleeps in 60-second intervals forever.
# NOTE(review): presumably used to keep the container alive so someone can
# attach an interactive shell to the running job - confirm with the job spec.
while(True):
    time.sleep(60)

Writing src/trainer/interactive_train.py


## Two-Tower Model

In [9]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/{TRAIN_SUB_DIR}/two_tower_model.py

from typing import List, Any

import nvtabular as nvt
# # import nvtabular.ops as ops

# from merlin.models.utils.example_utils import workflow_fit_transform
from merlin.schema.tags import Tags
import merlin.models.tf as mm
from merlin.models.tf.outputs.base import DotProduct, MetricsFn, ModelOutput

import logging

import tensorflow as tf


def create_two_tower(
    train_dir: str,
    valid_dir: str,
    workflow_dir: str,
    layer_sizes: List[Any] = None,
):
    """Build an (uncompiled) Merlin two-tower retrieval model.

    The input schema is read from the NVTabular workflow saved at
    `workflow_dir`. Columns tagged `Tags.USER` feed the query tower and
    columns tagged `Tags.ITEM` feed the candidate tower; both towers use an
    MLP with the same `layer_sizes`. The output is a contrastive dot-product
    head with in-batch negative sampling over the `Tags.ITEM_ID` column.

    Args:
        train_dir: Accepted for interface compatibility; not used here.
        valid_dir: Accepted for interface compatibility; not used here.
        workflow_dir: Location of the saved NVTabular workflow.
        layer_sizes: Hidden-layer widths of each tower's MLP. Defaults to
            [512, 256, 128] when omitted or None.

    Returns:
        The `mm.RetrievalModelV2` instance.
    """
    # Avoid a shared mutable default argument: build the default per call.
    if layer_sizes is None:
        layer_sizes = [512, 256, 128]

    #=========================================
    # get workflow details
    #=========================================
    workflow = nvt.Workflow.load(workflow_dir)
    schema = workflow.output_schema

    user_schema = schema.select_by_tag(Tags.USER)
    item_schema = schema.select_by_tag(Tags.ITEM)

    #=========================================
    # build towers
    #=========================================
    # Each tower gets its own copy of the layer list so neither block can
    # affect the other (or the caller's list) through a shared object.
    user_inputs = mm.InputBlockV2(user_schema)
    query = mm.Encoder(user_inputs, mm.MLPBlock(list(layer_sizes)))

    item_inputs = mm.InputBlockV2(
        item_schema,
    )
    candidate = mm.Encoder(item_inputs, mm.MLPBlock(list(layer_sizes)))

    model = mm.RetrievalModelV2(
        query=query,
        candidate=candidate,
        output=mm.ContrastiveOutput(
            to_call=DotProduct(),
            negative_samplers="in-batch",
            schema=item_schema.select_by_tag(Tags.ITEM_ID),
        )
    )

    return model

Writing src/trainer/two_tower_model.py


## Trainer utils

In [10]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/{TRAIN_SUB_DIR}/train_utils.py

# ====================================================
# Helper functions - moved from train_task.py
# ====================================================

import argparse
import json
import logging
import os
import sys
import time
import pandas as pd

from google.cloud import storage
from google.cloud.storage.bucket import Bucket
from google.cloud.storage.blob import Blob


import glob


# GCS_CLIENT = storage.Client()

def _is_chief(task_type, task_id):
    """Return 'chief' when `task_type` is 'chief', otherwise None.

    `task_id` is accepted but does not influence the result.
    """
    return 'chief' if task_type == 'chief' else None

def get_upload_logs_to_manged_tb_command(tb_resource_name, logs_dir, experiment_name, ttl_hrs, oneshot="false"):
    """Build the `tb-gcp-uploader` command line as a single string.

    The command is only returned, not executed: copy/paste it into a terminal
    to upload the TensorBoard logs in `logs_dir` to the managed TensorBoard
    instance `tb_resource_name`. Pass `oneshot="true"` to upload everything
    at once after training has finished.

    `ttl_hrs` is converted to seconds for `--event_file_inactive_secs`.
    """
    inactive_secs = 60*60*ttl_hrs
    # Flags are separated by nine spaces, matching the single-line layout
    # callers have always received (no embedded newlines).
    separator = " " * 9
    pieces = [
        f"tb-gcp-uploader --tensorboard_resource_name={tb_resource_name}",
        f"--logdir={logs_dir}",
        f"--experiment_name={experiment_name}",
        f"--one_shot={oneshot}",
        f"--event_file_inactive_secs={inactive_secs}",
    ]
    return separator.join(pieces)

def _upload_blob_gcs(gcs_uri, source_file_name, destination_blob_name, project):
    """Upload a local file to `<gcs_uri>/<destination_blob_name>`.

    Args:
        gcs_uri: Destination prefix as a `gs://bucket/path` URI.
        source_file_name: Path of the local file to upload.
        destination_blob_name: Object name appended to `gcs_uri`.
        project: GCP project used to create the storage client.
    """
    storage_client = storage.Client(project=project)
    # Join with "/" explicitly: GCS object names always use forward slashes,
    # and a leading slash on the blob name must not discard the prefix.
    blob_uri = f"{gcs_uri.rstrip('/')}/{destination_blob_name.lstrip('/')}"
    # Bind the client through the public API rather than patching the
    # private `blob.bucket._client` attribute.
    blob = Blob.from_string(blob_uri, client=storage_client)
    blob.upload_from_filename(source_file_name)
    
def get_arch_from_string(arch_string):
    """Parse a layer-size string such as '[512, 256, 128]' into a list of ints.

    Square brackets and spaces are ignored. Empty entries (from '[]', an
    empty string or a trailing comma) are skipped instead of raising.

    Args:
        arch_string: Comma-separated integers, optionally wrapped in brackets.

    Returns:
        List of ints in the order given; an empty list when no sizes are present.

    Raises:
        ValueError: if a non-empty entry is not a valid integer.
    """
    cleaned = arch_string.replace('[', '').replace(']', '').replace(' ', '')
    return [int(token) for token in cleaned.split(',') if token]

def upload_from_directory(
    directory_path: str, 
    dest_bucket_name: str, 
    dest_blob_name: str,
    project: str,
):
    """Recursively upload every file under `directory_path` to a GCS bucket.

    Each object is named `<dest_blob_name>/<local path minus its first
    component>`, joined with forward slashes. For example, with
    `directory_path='./query_tower'` the file `./query_tower/a/b` is stored
    as `<dest_blob_name>/query_tower/a/b`. Directories are skipped.

    Args:
        directory_path: Local directory to walk.
        dest_bucket_name: Name of the destination bucket (no `gs://`).
        dest_blob_name: Object-name prefix inside the bucket.
        project: GCP project used to create the storage client.
    """
    client = storage.Client(project=project)
    candidates = glob.glob(directory_path + '/**', recursive=True)
    target_bucket = client.get_bucket(dest_bucket_name)

    for candidate in candidates:
        if not os.path.isfile(candidate):
            continue
        # Drop the first path component (e.g. the leading '.') and rebuild
        # the rest with '/' so the object name is OS-independent.
        trimmed = "/".join(candidate.split(os.sep)[1:])
        target_bucket.blob(f'{dest_blob_name}/{trimmed}').upload_from_filename(candidate)

Writing src/trainer/train_utils.py


## Train task

In [11]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/{TRAIN_SUB_DIR}/train_task.py

import argparse
import json
import logging
import os
import sys
import time
import pandas as pd
import random
import string

# we can control how much memory to give tensorflow with this environment variable
# IMPORTANT: make sure you do this before you initialize TF's runtime, otherwise
# TF will have claimed all free GPU memory
# os.environ["TF_MEMORY_ALLOCATION"] = "0.3"  # fraction of free memory

# merlin
# from merlin.models.utils.example_utils import workflow_fit_transform
from merlin.io.dataset import Dataset as MerlinDataset
from merlin.models.tf.outputs.base import DotProduct, MetricsFn, ModelOutput
from merlin.schema.tags import Tags
import merlin.models.tf as mm

from merlin.models.utils.dataset import unique_rows_by_features

# nvtabular
import nvtabular as nvt
import nvtabular.ops as ops

# tensorflow
import tensorflow as tf
from tensorflow.python.client import device_lib

# gcp
import google.cloud.aiplatform as vertex_ai
from google.cloud import storage
from google.cloud.storage.bucket import Bucket
from google.cloud.storage.blob import Blob
# import hypertune
import traceback
from google.cloud.aiplatform.training_utils import cloud_profiler

# repo
from .two_tower_model import create_two_tower
from .train_utils import (
    get_upload_logs_to_manged_tb_command, 
    get_arch_from_string, 
    _upload_blob_gcs, 
    upload_from_directory
)

# local
HYPERTUNE_METRIC_NAME = 'AUC'
LOCAL_MODEL_DIR = '/tmp/saved_model'
LOCAL_CHECKPOINT_DIR = '/tmp/checkpoints'

# ====================================================
# arg parser
# ====================================================
    
def parse_args():
    """
    Parses command line arguments
    
    type: int, float, str
          bool() converts empty strings to `False` and non-empty strings to `True`
          see more details here: https://docs.python.org/3/library/argparse.html#type

    `--chkpt_freq` accepts either an integer (e.g. "500") or a string such
    as "epoch"; integer-looking values are returned as `int`, anything else is
    returned unchanged as `str`.

    Returns:
        argparse.Namespace with one attribute per flag.
    """
    def _chkpt_freq(value):
        # Numeric values must reach Keras as int, not str; keep e.g. 'epoch' as-is.
        try:
            return int(value)
        except ValueError:
            return value

    parser = argparse.ArgumentParser()
    parser.add_argument('--experiment_name',type=str,required=False,default='unnamed-experiment')
    parser.add_argument('--experiment_run', type=str, required=False, default='unnamed_run')
    parser.add_argument('--tb_name', type=str, required=False)
    parser.add_argument('--distribute', type=str, required=False, default='single')
    parser.add_argument('--train_output_bucket', type=str, required=True)
    parser.add_argument('--workflow_dir', type=str, required=True)
    parser.add_argument('--train_dir', type=str, required=True)
    parser.add_argument('--valid_dir', type=str, required=True)
    parser.add_argument('--num_epochs', type=int, required=True)
    parser.add_argument('--per_gpu_batch_size', type=int, required=True)
    parser.add_argument('--layer_sizes', type=str, required=False, default='[512, 256, 128]')
    parser.add_argument('--learning_rate', type=float, required=False, default=.001)
    parser.add_argument('--project', type=str, required=True)
    parser.add_argument('--location', type=str, required=True)
    parser.add_argument('--valid_frequency', type=int, required=False)
    parser.add_argument('--epoch_steps', type=int, required=False)
    parser.add_argument('--valid_steps', type=int, required=False)
    parser.add_argument('--chkpt_freq', type=_chkpt_freq, required=True)
    parser.add_argument("--profiler", action='store_true', help="include for True; ommit for False")
    parser.add_argument("--write_embeddings", action='store_true', help="include for True; ommit for False")
    
    return parser.parse_args()
                        
# ====================================================
# TRAINING SCRIPT
# ====================================================
    
def main(args):
    """Run one end-to-end two-tower training job.

    Steps, in order:
      1. Resolve the GCS working directory and TensorBoard log directory.
      2. Initialise Vertex AI and pick a `tf.distribute` strategy.
      3. Load the NVTabular workflow schema and the train/valid parquet data.
      4. Build, compile and fit the model with TensorBoard and checkpoint
         callbacks.
      5. On the chief task, log params/metrics to Vertex AI Experiments and
         upload both towers to GCS.
      6. If `--write_embeddings` is set, write candidate embeddings as JSON
         lines and upload the file from the chief task.

    Args:
        args: Namespace produced by `parse_args()`.

    Raises:
        ValueError: if `args.distribute` is not 'single', 'mirrored' or
            'multiworker'.
    """
    os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"
    TIMESTAMP = time.strftime("%Y%m%d-%H%M%S")
    
    # ====================================================
    # handle train job restarts for experiment runs (no duplicates)
    # ====================================================
    logging.info(f"EXPERIMENT_NAME: {args.experiment_name}\n RUN_NAME: {args.experiment_run}")
    
    SESSION_id = "".join(random.choices(string.ascii_lowercase + string.digits, k=3))
    EXPERIMENT_RUN_THIS = f'{args.experiment_run}-{SESSION_id}'
    
    logging.info(f"Changing: {args.experiment_run} to: {EXPERIMENT_RUN_THIS} to handle job restarts")
    
    # ====================================================
    # Set directories
    # ====================================================
    WORKING_DIR_GCS_URI = f'gs://{args.train_output_bucket}/{args.experiment_name}/{args.experiment_run}'
    logging.info(f"WORKING_DIR_GCS_URI: {WORKING_DIR_GCS_URI}")
    
    # Prefer the directory injected by Vertex AI when a TensorBoard is attached.
    LOGS_DIR = os.environ.get('AIP_TENSORBOARD_LOG_DIR', f'{WORKING_DIR_GCS_URI}/tb_logs')
    logging.info(f'TensorBoard LOGS_DIR: {LOGS_DIR}')
    
    # ====================================================
    # log variables
    # ====================================================
    logging.info(f'TIMESTAMP: {TIMESTAMP}')
    logging.info(f'EXPERIMENT_NAME: {args.experiment_name}')
    logging.info(f'RUN_NAME: {args.experiment_run}')
    logging.info(f'EXPERIMENT_RUN_THIS: {EXPERIMENT_RUN_THIS}')
    logging.info(f'NUM_EPOCHS: {args.num_epochs}')
    logging.info(f'TB_RESOURCE_NAME tb_name: {args.tb_name}')
    logging.info(f'distribute: {args.distribute}')
    logging.info(f'train_output_bucket: {args.train_output_bucket}')
    logging.info(f'workflow_dir: {args.workflow_dir}')
    logging.info(f'train_dir: {args.train_dir}')
    logging.info(f'valid_dir: {args.valid_dir}')
    logging.info(f'num_epochs: {args.num_epochs}')
    logging.info(f'per_gpu_batch_size: {args.per_gpu_batch_size}')
    logging.info(f'layer_sizes: {args.layer_sizes}')
    logging.info(f'learning_rate: {args.learning_rate}')
    logging.info(f'project: {args.project}')
    logging.info(f'location: {args.location}')
    logging.info(f'valid_frequency: {args.valid_frequency}')
    logging.info(f'epoch_steps: {args.epoch_steps}')
    logging.info(f'valid_steps: {args.valid_steps}')
    logging.info(f'chkpt_freq: {args.chkpt_freq}')
    logging.info(f'profiler: {args.profiler}')
    logging.info(f'write_embeddings: {args.write_embeddings}')
    
    LAYER_SIZES = get_arch_from_string(args.layer_sizes)
    logging.info(f'LAYER_SIZES: {LAYER_SIZES}')
    
    # ====================================================
    # Init Clients
    # ====================================================
    vertex_ai.init(
        project=f'{args.project}',
        location=f'{args.location}',
        experiment=f'{args.experiment_name}',
    )
    
    logging.info("vertex_ai initialized...")
    
    # ====================================================
    # Set Device / GPU Strategy
    # ====================================================    
    logging.info("Detecting devices....")
    logging.info(f'Detected Devices {str(device_lib.list_local_devices())}')
    
    logging.info("Setting device strategy...")
    
    # Single Machine, single compute device
    if args.distribute == 'single':
        # An empty list (no GPU visible) is falsy, so this falls back to CPU.
        if tf.config.list_physical_devices('GPU'):
            strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
        else:
            strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
        logging.info("Single device training")
    
    # Single Machine, multiple compute device
    elif args.distribute == 'mirrored':
        strategy = tf.distribute.MirroredStrategy()
        logging.info("Mirrored Strategy distributed training")

    # Multi Machine, multiple compute device
    elif args.distribute == 'multiworker':
        strategy = tf.distribute.MultiWorkerMirroredStrategy()
        logging.info("Multi-worker Strategy distributed training")
        logging.info('TF_CONFIG = {}'.format(os.environ.get('TF_CONFIG', 'Not found')))

    else:
        # Fail with a clear message instead of a NameError on `strategy` below.
        raise ValueError(
            f"Unsupported --distribute value {args.distribute!r}; "
            "expected 'single', 'mirrored' or 'multiworker'"
        )
    
    # set related vars...
    NUM_WORKERS = strategy.num_replicas_in_sync
    GLOBAL_BATCH_SIZE = NUM_WORKERS * args.per_gpu_batch_size
    logging.info(f'NUM_WORKERS = {NUM_WORKERS}')
    logging.info(f'GLOBAL_BATCH_SIZE: {GLOBAL_BATCH_SIZE}')
    
    # set worker vars...
    logging.info(f'Setting task_type and task_id...')
    if args.distribute == 'multiworker':
        task_type, task_id = (
            strategy.cluster_resolver.task_type,
            strategy.cluster_resolver.task_id
        )
    else:
        # Non-multiworker runs have a single process, which acts as chief.
        task_type, task_id = 'chief', None
    
    logging.info(f'task_type = {task_type}')
    logging.info(f'task_id = {task_id}')
        
    # ====================================================
    # Prepare Train and Valid Data
    # ====================================================
    logging.info(f'Loading workflow & schema from : {args.workflow_dir}')
    
    workflow = nvt.Workflow.load(args.workflow_dir)
    schema = workflow.output_schema
    
    train_data = MerlinDataset(os.path.join(args.train_dir, "*.parquet"), schema=schema, part_size="1GB")
    valid_data = MerlinDataset(os.path.join(args.valid_dir, "*.parquet"), schema=schema, part_size="1GB")
    
    # ====================================================
    # Callbacks
    # ====================================================            
    # Same pattern as LOGS_DIR: use the Vertex-provided directory when set,
    # otherwise fall back to a folder under the working directory.
    checkpoint_dir = os.environ.get('AIP_CHECKPOINT_DIR', f'{WORKING_DIR_GCS_URI}/checkpoints')
    logging.info(f'Saving model checkpoints to {checkpoint_dir}')
    
    # Command-line values arrive as str unless the parser converts them;
    # the checkpoint callback needs an int for a batch-count frequency.
    save_freq = args.chkpt_freq
    if isinstance(save_freq, str) and save_freq.isdigit():
        save_freq = int(save_freq)
    
    # model checkpoints - ModelCheckpoint | BackupAndRestore
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_dir + "/cp-{epoch:03d}-loss={loss:.2f}.ckpt",
        save_weights_only=True,
        save_best_only=True,
        monitor='total_loss',
        mode='min',
        save_freq=save_freq,
        verbose=1,
    )

    # The two TensorBoard variants differ only in the profiling arguments.
    tensorboard_kwargs = dict(log_dir=LOGS_DIR, write_graph=True)
    if args.profiler:
        tensorboard_kwargs.update(profile_batch=(25, 30), update_freq='epoch')
        logging.info(f'Tensorboard callback should profile batches...')
    else:
        logging.info(f'Tensorboard callback NOT profiling batches...')
    tensorboard_callback = tf.keras.callbacks.TensorBoard(**tensorboard_kwargs)
    
    # ====================================================
    # Train
    # ==================================================== 
    
    # Initialize profiler (best effort: training continues if it fails)
    logging.info('Initializing profiler ...')
    
    try:
        cloud_profiler.init()
    except Exception:
        ex_type, ex_value, ex_traceback = sys.exc_info()
        print("*** Unexpected:", ex_type.__name__, ex_value)
        traceback.print_tb(ex_traceback, limit=10, file=sys.stdout)
    else:
        logging.info('The profiler initiated...')

    # NOTE(review): the model is built outside `strategy.scope()`, so the
    # strategy above only determines batch size and task type - confirm
    # this is intended.
    model = create_two_tower(
        train_dir=args.train_dir,
        valid_dir=args.valid_dir,
        workflow_dir=args.workflow_dir,
        layer_sizes=LAYER_SIZES
    )

    model.compile(
        optimizer=tf.keras.optimizers.Adagrad(args.learning_rate),
        run_eagerly=False,
        metrics=[
            mm.RecallAt(10), 
            mm.RecallAt(20), 
            mm.NDCGAt(10)
        ],
    )
    logging.info('model compiled...')
        
    logging.info('Starting training loop...')
    
    start_model_fit = time.time()
    
    # `--valid_frequency` is optional; validate every epoch when it is omitted.
    validation_freq = 1 if args.valid_frequency is None else args.valid_frequency
    
    model.fit(
        train_data, 
        validation_data=valid_data,
        validation_freq=validation_freq,
        batch_size=GLOBAL_BATCH_SIZE, 
        epochs=args.num_epochs,
        steps_per_epoch=args.epoch_steps,
        validation_steps=args.valid_steps,
        callbacks=[
            tensorboard_callback, 
            model_checkpoint_callback
        ],
        verbose=2
    )
    
    # capture elapsed time (whole minutes)
    end_model_fit = time.time()
    
    total_train_time = int((end_model_fit - start_model_fit) / 60)
    logging.info(f'Elapsed total_train_time: {total_train_time}')
    
    # ====================================================
    # metaparams & metrics for Vertex Ai Experiments
    # ====================================================
    logging.info('Logging params & metrics for Vertex Experiments')
    
    # last recorded value of every metric in the training history
    metrics_dict = {key: values[-1] for key, values in model.history.history.items()}
    metrics_dict["total_train_time"] = total_train_time 
    
    logging.info(f'metrics_dict: {metrics_dict}')
    
    metaparams = {}
    metaparams["experiment_name"] = f'{args.experiment_name}'
    metaparams["experiment_run"] = f"{args.experiment_run}"
    logging.info(f'metaparams: {metaparams}')
    
    hyperparams = {}
    hyperparams["epochs"] = int(args.num_epochs)
    hyperparams["num_gpus"] = NUM_WORKERS
    hyperparams["per_gpu_batch_size"] = args.per_gpu_batch_size
    hyperparams["global_batch_size"] = GLOBAL_BATCH_SIZE
    hyperparams["learning_rate"] = args.learning_rate
    hyperparams['layers'] = f'{args.layer_sizes}'
    logging.info(f'hyperparams: {hyperparams}')
    
    # ====================================================
    # Experiments
    # ====================================================
    logging.info(f"Creating run: {EXPERIMENT_RUN_THIS}; for experiment: {args.experiment_name}")
    
    if task_type == 'chief':
        logging.info(f" task_type logging experiments: {task_type}")
        logging.info(f" task_id logging experiments: {task_id}")
        logging.info(f" logging data to experiment run: {EXPERIMENT_RUN_THIS}")
    
        # Create experiment
        vertex_ai.init(experiment=args.experiment_name)

        # NOTE(review): the run is started under `args.experiment_run`, not
        # the restart-safe EXPERIMENT_RUN_THIS that the log lines mention.
        # Left unchanged because other code may look the run up by the
        # original name - confirm which name is intended.
        with vertex_ai.start_run(args.experiment_run) as my_run:
            logging.info(f"logging metrics_dict")
            my_run.log_metrics(metrics_dict)

            logging.info(f"logging metaparams")
            my_run.log_params(metaparams)

            logging.info(f"logging hyperparams")
            my_run.log_params(hyperparams)
            
            vertex_ai.end_run()
            logging.info(f"experiment run: {EXPERIMENT_RUN_THIS} has ended")
        
    # =============================================
    # save retrieval (query) tower
    # =============================================
    QUERY_TOWER_LOCAL_DIR = 'query_tower'
    CANDIDATE_TOWER_LOCAL_DIR = 'candidate_tower'
    # set vars...
    MODEL_DIR = f"{WORKING_DIR_GCS_URI}/model_dir"
    logging.info(f'Saving towers to {MODEL_DIR}')
    
    QUERY_TOWER_PATH = f"{MODEL_DIR}/query_tower"
    CANDIDATE_TOWER_PATH = f"{MODEL_DIR}/candidate_tower"
    EMBEDDINGS_PATH = f"{MODEL_DIR}/candidate_embeddings"
    
    # Object-name prefix (inside the output bucket) matching MODEL_DIR.
    model_dir_blob_prefix = f'{args.experiment_name}/{args.experiment_run}/model_dir'
    
    if task_type == 'chief':
        
        # save query tower locally, then copy the directory to GCS
        query_tower = model.query_encoder
        query_tower.save(f'{QUERY_TOWER_LOCAL_DIR}/')
        logging.info(f'Saved query tower locally to {QUERY_TOWER_LOCAL_DIR}')
        upload_from_directory(f'./{QUERY_TOWER_LOCAL_DIR}', args.train_output_bucket, model_dir_blob_prefix, f'{args.project}')
        logging.info(f'Saved query tower to {QUERY_TOWER_PATH}')
        
        # save candidate tower the same way
        candidate_tower = model.candidate_encoder
        candidate_tower.save(f'{CANDIDATE_TOWER_LOCAL_DIR}')
        logging.info(f'Saved candidate tower locally to {CANDIDATE_TOWER_LOCAL_DIR}')
        upload_from_directory(f'./{CANDIDATE_TOWER_LOCAL_DIR}', args.train_output_bucket, model_dir_blob_prefix, f'{args.project}')
        logging.info(f'Saved candidate tower to {CANDIDATE_TOWER_PATH}')

    
    # ====================================================
    # Save embeddings
    # ====================================================
    
    if args.write_embeddings:
        logging.info('Saving candidate embeddings...')
        EMBEDDINGS_FILE_NAME = "candidate_embeddings.json"
        logging.info(f"Saving {EMBEDDINGS_FILE_NAME} to {EMBEDDINGS_PATH}")
    
        # ensure we are only pulling 0-EMBEDDING_DIM cols
        embedding_cols = [str(i) for i in range(LAYER_SIZES[-1])]

        # helper function: one JSON line per item row
        def format_for_matching_engine(data) -> str:
            emb = [data[col] for col in embedding_cols]
            return '{"id":"' + str(data['track_uri_can']) + '","embedding":[' + ",".join(str(x) for x in emb) + ']}'

        # maps the encoded item id back to the original `track_uri_can` value
        item_data = pd.read_parquet(f'{args.workflow_dir}/categories/unique.track_uri_can.parquet')
        lookup_dict = dict(item_data['track_uri_can'])

        # Open the output once in write mode: avoids reopening the file for
        # every row and discards leftovers from a previous run.
        # NOTE(review): items present in both TRAIN and VALID are written
        # twice, as before - confirm the consumer tolerates duplicate ids.
        with open(f"{EMBEDDINGS_FILE_NAME}", 'w') as embeddings_file:
            for split_name, split_data in (('TRAIN', train_data), ('VALID', valid_data)):
                start_embeds = time.time()

                item_features = unique_rows_by_features(split_data, Tags.ITEM, Tags.ID)
                item_embs = model.candidate_embeddings(
                    item_features, 
                    index=item_features.schema['track_uri_can'], 
                    batch_size=10000
                )
                # filling blanks with an epsilon value
                item_emb_pd = item_embs.compute().to_pandas().fillna(1e-10).reset_index()
                item_emb_pd['track_uri_can'] = item_emb_pd['track_uri_can'].apply(lambda l: lookup_dict[l])
                for _, row in item_emb_pd.iterrows():
                    embeddings_file.write(format_for_matching_engine(row))
                    embeddings_file.write("\n")

                # capture elapsed time
                elapsed_time = round(time.time() - start_embeds, 2)
                logging.info(f'Elapsed time writting {split_name} embeddings: {elapsed_time} seconds')
    
        if task_type == 'chief':
            _upload_blob_gcs(
                EMBEDDINGS_PATH, 
                f"{EMBEDDINGS_FILE_NAME}", 
                f"{EMBEDDINGS_FILE_NAME}",
                args.project
            )
            
            logging.info(f"Saved {EMBEDDINGS_FILE_NAME} to {EMBEDDINGS_PATH}")
            
    else:
        logging.info(f"Did not write embeddings JSON...")
    
    logging.info('All done - model saved') #all done

if __name__ == '__main__':
    # Send INFO-and-above records to stdout with a timestamp prefix.
    logging.basicConfig(
        stream=sys.stdout,
        level=logging.INFO,
        format='%(asctime)s - %(message)s',
        datefmt='%d-%m-%y %H:%M:%S',
    )

    cli_args = parse_args()
    logging.info('Args: %s', cli_args)

    # Time the whole run, including model export.
    started_at = time.time()
    logging.info('Starting training')

    main(cli_args)

    seconds_taken = time.time() - started_at
    logging.info('Training completed. Elapsed time: %s', seconds_taken )

Writing src/trainer/train_task.py


### train requirements

In [12]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/{TRAIN_SUB_DIR}/requirements.txt
merlin-models 
gcsfs 
google-cloud-aiplatform>=1.23.0 
fastapi
tensorboard-plugin-profile==2.11.1
google-cloud-aiplatform[cloud_profiler]>=1.23.0

Writing src/trainer/requirements.txt


## Training Image

### versioned image

In [13]:
# Docker definitions for training
MERLIN_VERSION = '2212v16'

# Build settings: Dockerfile suffix, Cloud Build machine type and build context.
DOCKERNAME = 'train'
MACHINE_TYPE = 'e2-highcpu-32'
FILE_LOCATION = './src'

# The image name embeds the Merlin version tag and the model display name.
IMAGE_NAME = '-'.join(['train', MERLIN_VERSION, MODEL_DISPLAY_NAME])
IMAGE_URI = 'gcr.io/' + PROJECT_ID + '/' + IMAGE_NAME

**TODO:**

```
RUN pip install google-cloud-bigquery gcsfs
RUN pip install google-cloud-aiplatform[cloud_profiler] kfp
```

In [14]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/Dockerfile.{DOCKERNAME}

FROM nvcr.io/nvidia/merlin/merlin-tensorflow:22.12

WORKDIR /src

# Copies the trainer code to the docker image.
COPY trainer/* trainer/ 

# RUN pip install -U pip
# RUN pip install merlin-models gcsfs google-cloud-aiplatform fastapi
RUN pip install -r trainer/requirements.txt

RUN apt update && apt -y install nvtop

Overwriting src/Dockerfile.train


# Build Train Image

In [15]:
!pwd

/home/jupyter/merlin-on-vertex-ORIGINAL/merlin-on-vertex


In [16]:
! gcloud config set gcloudignore/enabled true

Updated property [gcloudignore/enabled].


In [17]:
%%writefile .gcloudignore
.gcloudignore
/archive/*
/imgs/*
/mm_src/*
/src/serving/*
/src/train_pipes/*
/src/process_pipes/*
/src/preprocessor/*
/test_app/*
/local_workflow/
README.md
*.pkl
*.png
*.ipynb
.git
.github
.ipynb_checkpoints/*
*__pycache__
*cpython-37.pyc
pip_freeze.txt
custom_container_pipeline_spec.json
# *.json
Dockerfile.triton-cpr
Dockerfile.merlin-retriever
Dockerfile.merlintf-22_12_v4
Dockerfile.nvt
Dockerfile.nvt-133
Dockerfile
src/Dockerfile.mm-query-serve
nvt-parquet-full-1a100.json
nvt-parquet-latest-12.json
nvt-parquet-full-4t4.json
nvt-parquet-full-2a100.json
custom_pipeline_spec.json
spotipy_secret_creds.py
sp_utils.py
.gitignore
.cache

Overwriting .gcloudignore


In [18]:
!gcloud meta list-files-for-upload

utils/train_utils.py
src/cloudbuild.yaml
src/Dockerfile.train
src/trainer/train_task.py
src/trainer/interactive_train.py
src/trainer/requirements.txt
src/trainer/two_tower_model.py
src/trainer/__init__.py
src/trainer/train_utils.py


### `cloudbuild.yaml`

In [19]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/cloudbuild.yaml

steps:
- name: 'gcr.io/cloud-builders/docker'
  args: ['build', '-t', '$_IMAGE_URI', '$_FILE_LOCATION', '-f', '$_FILE_LOCATION/Dockerfile.$_DOCKERNAME']
images:
- '$_IMAGE_URI'

Overwriting src/cloudbuild.yaml


In [20]:
# os.chdir('/home/jupyter/jt-merlin/merlin-on-vertex')
# os.getcwd()
print(f"export DOCKERNAME={DOCKERNAME}")
print(f"export IMAGE_URI={IMAGE_URI}")
print(f"export FILE_LOCATION={FILE_LOCATION}")
print(f"export MACHINE_TYPE={MACHINE_TYPE}")

export DOCKERNAME=train
export IMAGE_URI=gcr.io/hybrid-vertex/train-2212v16-vertex-merlin-tf-2tower-jtv34
export FILE_LOCATION=./src
export MACHINE_TYPE=e2-highcpu-32


In [22]:
! gcloud builds submit --config src/cloudbuild.yaml \
    --substitutions _DOCKERNAME=$DOCKERNAME,_IMAGE_URI=$IMAGE_URI,_FILE_LOCATION=$FILE_LOCATION \
    --timeout=2h \
    --machine-type=$MACHINE_TYPE

# Vertex Train Job

### Prepare `worker_pool_specs`

### Accelerators and Device Strategy

In [23]:
import time  # NOTE: also imported in the notebook's first cell; used later to build TIMESTAMP

# ====================================================
# Single | Single machine, single GPU
# ====================================================
# Hardware settings for the training worker pool. These are handed to
# train_utils.prepare_worker_pool_specs when the worker pool specs are built.
WORKER_MACHINE_TYPE = 'a2-highgpu-1g'
REPLICA_COUNT = 1
ACCELERATOR_TYPE = 'NVIDIA_TESLA_A100'
PER_MACHINE_ACCELERATOR_COUNT = 1
# Reduction-server settings are passed through as well; the count is 0 in this configuration.
REDUCTION_SERVER_COUNT = 0                                                      
REDUCTION_SERVER_MACHINE_TYPE = "n1-highcpu-16"
# Forwarded to the trainer as the --distribute flag.
DISTRIBUTE_STRATEGY = 'single'

## Train Args

### Previously defined Vars

In [24]:
# Recap of values defined earlier in the notebook, shown before the job is configured.
print(
    f"PROJECT: {PROJECT_ID}",
    f"VERSION: {VERSION}",
    f"IMAGE_URI: {IMAGE_URI}",
    f"MODEL_NAME: {MODEL_NAME}",
    f"FRAMEWORK: {FRAMEWORK}",
    f"MODEL_DISPLAY_NAME: {MODEL_DISPLAY_NAME}",
    f"WORKSPACE: {WORKSPACE}",
    sep="\n",
)

PROJECT: hybrid-vertex
VERSION: jtv34
IMAGE_URI: gcr.io/hybrid-vertex/train-2212v16-vertex-merlin-tf-2tower-jtv34
MODEL_NAME: 2tower
FRAMEWORK: merlin-tf
MODEL_DISPLAY_NAME: vertex-merlin-tf-2tower-jtv34
WORKSPACE: gs://jt-merlin-scaling/vertex-merlin-tf-2tower-jtv34


### Vertex Experiments

In [25]:
# Names used for the Vertex AI experiment and for this particular run.
# The timestamp is captured when this cell executes (local time), not when the
# training job starts, so re-running the cell yields a new run name.
TIMESTAMP = time.strftime("%Y%m%d-%H%M%S")
EXPERIMENT_PREFIX = 'mm-vertex'
EXPERIMENT_NAME = f'{EXPERIMENT_PREFIX}-tf-{MODEL_NAME}-{VERSION}'
RUN_NAME_PREFIX = f'run-{TIMESTAMP}' # passed to the trainer as --experiment_run and used in the output path

print(f"EXPERIMENT_NAME: {EXPERIMENT_NAME}")
print(f"RUN_NAME_PREFIX: {RUN_NAME_PREFIX}")

EXPERIMENT_NAME: mm-vertex-tf-2tower-jtv34
RUN_NAME_PREFIX: run-20230321-095142


### Data dirs

In [26]:
# Data and schema produced by the NVTabular pipelines.
DATA_DIR = 'gs://jt-merlin-scaling/nvt-last5-latest-12/nvt-processed'
TRAIN_DATA, VALID_DATA = f'{DATA_DIR}/train', f'{DATA_DIR}/valid'

# Alternative workflow locations (currently unused):
#   gs://jt-merlin-scaling/nvt-last5-latest-12/nvt-analyzed
#   gs://spotify-beam-v3/merlin-processed/workflow/2t-spotify-workflow
WORKFLOW_DIR = 'gs://jt-merlin-scaling/nvt-last5-latest-12/nvt-processed/workflow'

print(
    f"TRAIN_DATA: {TRAIN_DATA}",
    f"VALID_DATA: {VALID_DATA}",
    f"WORKFLOW_DIR: {WORKFLOW_DIR}",
    sep="\n",
)

TRAIN_DATA: gs://jt-merlin-scaling/nvt-last5-latest-12/nvt-processed/train
VALID_DATA: gs://jt-merlin-scaling/nvt-last5-latest-12/nvt-processed/valid
WORKFLOW_DIR: gs://jt-merlin-scaling/nvt-last5-latest-12/nvt-processed/workflow


### Managed TB

In [27]:
# ====================================================
# Managed Tensorboard
# ====================================================

# Option A (disabled): reuse an existing Tensorboard instance by resource name.
# TB_RESOURCE_NAME = 'projects/934903580331/locations/us-central1/tensorboards/6924469145035603968'

# Option B (active): create a new Tensorboard instance.
# NOTE: this runs unconditionally, so every execution of the cell creates another instance.
TENSORBOARD_DISPLAY_NAME = f"{EXPERIMENT_NAME}-v1"
tensorboard = vertex_ai.Tensorboard.create(
    display_name=TENSORBOARD_DISPLAY_NAME,
    project=PROJECT_ID,
    location=LOCATION,
)
TB_RESOURCE_NAME = tensorboard.resource_name

print(
    f"TB_RESOURCE_NAME: {TB_RESOURCE_NAME}",
    f"TB display name: {tensorboard.display_name}",
    sep="\n",
)

TB_RESOURCE_NAME: projects/934903580331/locations/us-central1/tensorboards/387846129628217344
TB display name: mm-vertex-tf-2tower-jtv34-v1


### Worker args

In [28]:
# Build the container command and the worker pool specs for the Vertex AI training job.
from utils import train_utils  # provides prepare_worker_pool_specs, used below

# gcs bucket
OUTPUT_BUCKET = 'jt-merlin-scaling'

# data size
# Hard-coded example counts; presumably they describe TRAIN_DATA / VALID_DATA — TODO confirm.
train_sample_cnt = 8_205_265 # number of training examples
valid_samples_cnt = 82_959

# train config
NUM_EPOCHS = 4
BATCH_SIZE = 4096*4 
LEARNING_RATE = 0.001
VALID_FREQUENCY = 20
# Floor division: a trailing partial batch is not counted in either step count.
VALID_STEPS = valid_samples_cnt // BATCH_SIZE
EPOCH_STEPS = train_sample_cnt // BATCH_SIZE
CHECKPOINT_FREQ='epoch'

# model
# Kept as a string because it is interpolated into the command line as --layer_sizes.
LAYERS = "[512, 256, 128]"

    
# Command run inside the training container.
# Each backslash-newline inside the triple-quoted f-string is a Python line
# continuation, so the whole command becomes a single line (the leading spaces
# of every continued line stay in the string). `\'` is a plain single quote,
# which makes the shell receive LAYERS as one quoted argument.
WORKER_CMD = [
    'sh',
    '-euc',  # -e: stop on the first failing command, -u: error on unset variables, -c: run the next string
    f"""pip freeze && python -m trainer.train_task \
    --per_gpu_batch_size={BATCH_SIZE} \
    --train_output_bucket={OUTPUT_BUCKET} \
    --train_dir={TRAIN_DATA} \
    --valid_dir={VALID_DATA} \
    --workflow_dir={WORKFLOW_DIR} \
    --num_epochs={NUM_EPOCHS} \
    --learning_rate={LEARNING_RATE} \
    --distribute={DISTRIBUTE_STRATEGY} \
    --experiment_name={EXPERIMENT_NAME} \
    --experiment_run={RUN_NAME_PREFIX} \
    --project={PROJECT_ID} \
    --location={LOCATION} \
    --valid_frequency={VALID_FREQUENCY} \
    --epoch_steps={EPOCH_STEPS} \
    --valid_steps={VALID_STEPS} \
    --layer_sizes=\'{LAYERS}\' \
    --chkpt_freq={CHECKPOINT_FREQ} \
    --write_embeddings \
    --profiler"""
    # Flag that is currently NOT passed to the trainer:
    #   --tb_name={TB_RESOURCE_NAME}
]

# ====================================================
# Worker pool specs
# ====================================================
    
# Combine the image, the command and the hardware settings into the worker pool specs.
WORKER_POOL_SPECS = train_utils.prepare_worker_pool_specs(
    image_uri=IMAGE_URI,
    # args=WORKER_ARGS,  (disabled: the full command is supplied through cmd instead)
    cmd=WORKER_CMD,
    replica_count=REPLICA_COUNT,
    machine_type=WORKER_MACHINE_TYPE,
    accelerator_count=PER_MACHINE_ACCELERATOR_COUNT,
    accelerator_type=ACCELERATOR_TYPE,
    reduction_server_count=REDUCTION_SERVER_COUNT,
    reduction_server_machine_type=REDUCTION_SERVER_MACHINE_TYPE,
)

from pprint import pprint  # NOTE: also imported in the notebook's first cell
pprint(WORKER_POOL_SPECS)
# jt-merlin-scaling/nvt-last5-latest-12

[{'container_spec': {'command': ['sh',
                                 '-euc',
                                 'pip freeze && python -m '
                                 'trainer.train_task     '
                                 '--per_gpu_batch_size=16384     '
                                 '--train_output_bucket=jt-merlin-scaling     '
                                 '--train_dir=gs://jt-merlin-scaling/nvt-last5-latest-12/nvt-processed/train     '
                                 '--valid_dir=gs://jt-merlin-scaling/nvt-last5-latest-12/nvt-processed/valid     '
                                 '--workflow_dir=gs://jt-merlin-scaling/nvt-last5-latest-12/nvt-processed/workflow     '
                                 '--num_epochs=4     --learning_rate=0.001     '
                                 '--distribute=single     '
                                 '--experiment_name=mm-vertex-tf-2tower-jtv34     '
                                 '--experiment_run=run-20230321-095142     '
 

In [29]:
# WORKER_POOL_SPECS[0]['container_spec']['command']

## Submit train job

In [30]:
# GCS root for this experiment run, plus the display name of the training job.
BASE_OUTPUT_DIR = f'gs://{OUTPUT_BUCKET}/{EXPERIMENT_NAME}/{RUN_NAME_PREFIX}'
JOB_NAME = f'train-{MODEL_DISPLAY_NAME}'

# Initialize the Vertex AI SDK with the project, region and staging location.
# (experiment=EXPERIMENT_NAME is not passed to init.)
vertex_ai.init(
    project=PROJECT_ID,
    location=LOCATION,
    staging_bucket=f'{BASE_OUTPUT_DIR}/staging',
)

# Hardware details that get attached to the training job as labels.
gpu_type, gpu_per_replica, replica_cnt = (
    ACCELERATOR_TYPE.lower(),
    PER_MACHINE_ACCELERATOR_COUNT,
    REPLICA_COUNT,
)

print(
    f'BASE_OUTPUT_DIR : {BASE_OUTPUT_DIR}',
    f'JOB_NAME : {JOB_NAME}\n',
    f'gpu_type : {gpu_type}',
    f'gpu_per_replica : {gpu_per_replica}',
    f'replica_cnt : {replica_cnt}',
    sep='\n',
)

BASE_OUTPUT_DIR : gs://jt-merlin-scaling/mm-vertex-tf-2tower-jtv34/run-20230321-095142
JOB_NAME : train-vertex-merlin-tf-2tower-jtv34

gpu_type : nvidia_tesla_a100
gpu_per_replica : 1
replica_cnt : 1


In [31]:
# Define the custom training job. Label values are converted to strings;
# an extra label ('mm_image': 'nightly') is currently not set.
job = vertex_ai.CustomJob(
    display_name=JOB_NAME,
    worker_pool_specs=WORKER_POOL_SPECS,
    base_output_dir=BASE_OUTPUT_DIR,
    staging_bucket=f'{BASE_OUTPUT_DIR}/staging',
    labels={
        'gpu': str(gpu_type),
        'gpu_per_replica': str(gpu_per_replica),
        'replica_cnt': str(replica_cnt),
    },
)

# Launch the job under the configured service account with the managed Tensorboard attached.
# sync=False submits the job without blocking the notebook until it finishes.
job.run(
    service_account=VERTEX_SA,
    tensorboard=TB_RESOURCE_NAME,
    enable_web_access=True,
    restart_job_on_worker_restart=False,
    sync=False,
)

In [None]:
# Error recorded from an earlier run (note the single slash after `gs:` in the reported path):
# FileNotFoundError: [Errno 2] No such file or directory: 'gs:/jt-merlin-scaling/latest-2tower-merlin-tf-jtv16/run-20230223-221213/model-dir/query-tower/.merlin'

In [32]:
TB_LOGS_PATH = f'{BASE_OUTPUT_DIR}/logs' # log directory given to %tensorboard --logdir; presumably where the job writes TensorBoard logs — TODO confirm

In [33]:
# Load the TensorBoard notebook extension, which provides the %tensorboard magic.
%load_ext tensorboard
# If the extension is already loaded, use this instead:
# %reload_ext tensorboard

In [34]:
# Open TensorBoard inline, pointed at the log directory stored in TB_LOGS_PATH.
%tensorboard --logdir=$TB_LOGS_PATH