# Train Merlin Two-Towers

### pip & package

In [42]:
import os
import nvtabular as nvt
from time import time
import pandas as pd
# disable INFO and DEBUG logging everywhere
import logging
import time
from pprint import pprint

logging.disable(logging.WARNING)

from nvtabular.ops import (
    Categorify,
    TagAsUserID,
    TagAsItemID,
    TagAsItemFeatures,
    TagAsUserFeatures,
    AddMetadata,
    ListSlice
)
import nvtabular.ops as ops

from merlin.schema.tags import Tags

import merlin.models.tf as mm
# from merlin.io.dataset import Dataset
from merlin.io.dataset import Dataset as MerlinDataset
from merlin.models.utils.example_utils import workflow_fit_transform
import tensorflow as tf

from google.cloud import aiplatform as vertex_ai
from google.cloud.aiplatform import hyperparameter_tuning as hpt

# for running this example on CPU, comment out the line below
# os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"

### Setup

In [43]:
# TODO: Project definitions
# Both values below are environment-specific and must be edited before running.
PROJECT_ID = 'hybrid-vertex' # Change to your project ID.
REGION = 'us-central1' # Change to your region.

# TODO: Service Account address
VERTEX_SA = '934903580331-compute@developer.gserviceaccount.com' # Change to your service account with Vertex AI Admin permissions.

In [44]:
# Bucket definitions
BUCKET = 'jt-merlin-scaling' # 'spotify-merlin-v1'

# Naming components; they are combined into MODEL_DISPLAY_NAME below and
# reused later when the training image name is built.
VERSION = 'jtv1'
MODEL_NAME = '2tower'
FRAMEWORK = 'merlin-tf'
MODEL_DISPLAY_NAME = f'vertex-{FRAMEWORK}-{MODEL_NAME}-{VERSION}'
# GCS prefix derived from the bucket and the display name.
WORKSPACE = f'gs://{BUCKET}/{MODEL_DISPLAY_NAME}'

# # Docker definitions for training
# IMAGE_NAME = f'{FRAMEWORK}-{MODEL_NAME}-training-{VERSION}'
# IMAGE_URI = f'gcr.io/{PROJECT_ID}/{IMAGE_NAME}'
# # DOCKERNAME = 'hugectr'
# DOCKERNAME = 'merlintf'
# MACHINE_TYPE ='e2-highcpu-32'
# FILE_LOCATION = './src'

# Training Package

In [45]:
REPO_DOCKER_PATH_PREFIX = 'src'
TRAIN_SUB_DIR = 'trainer'

In [46]:
# (Re)create the training subfolder from scratch.
# `-p` also creates the parent directory, so the cell works on a fresh
# checkout where {REPO_DOCKER_PATH_PREFIX} does not exist yet.
! rm -rf {REPO_DOCKER_PATH_PREFIX}/{TRAIN_SUB_DIR}
! mkdir -p {REPO_DOCKER_PATH_PREFIX}/{TRAIN_SUB_DIR}

In [47]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/{TRAIN_SUB_DIR}/__init__.py
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

Writing src/trainer/__init__.py


## Interactive Train Shell

In [48]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/{TRAIN_SUB_DIR}/interactive_train.py

import time

# Block forever, waking once a minute; the script never exits on its own.
# NOTE(review): presumably used as the container entry point so the job stays
# alive for an interactive shell session - confirm against the job definition.
while(True):
    time.sleep(60)

Writing src/trainer/interactive_train.py


## Two-Tower Model

In [49]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/{TRAIN_SUB_DIR}/two_tower_model.py

from typing import List, Any

import nvtabular as nvt
# # import nvtabular.ops as ops

# from merlin.models.utils.example_utils import workflow_fit_transform
from merlin.schema.tags import Tags
import merlin.models.tf as mm
from merlin.models.tf.outputs.base import DotProduct, MetricsFn, ModelOutput

import logging

import tensorflow as tf


def create_two_tower(
    train_dir: str,
    valid_dir: str,
    workflow_dir: str,
    layer_sizes: List[Any] = [512, 256, 128],
):
    """Build an (uncompiled) Merlin two-tower retrieval model.

    The feature schema is read from the NVTabular workflow stored at
    `workflow_dir`; columns tagged USER feed the query tower and columns
    tagged ITEM feed the candidate tower. Each tower is an input block
    followed by an MLP with the given `layer_sizes`.

    Args:
        train_dir: accepted for interface compatibility; not read here.
        valid_dir: accepted for interface compatibility; not read here.
        workflow_dir: location of the saved NVTabular workflow.
        layer_sizes: hidden sizes of the MLP used in both towers.

    Returns:
        An `mm.RetrievalModelV2` using a dot-product contrastive output with
        in-batch negative sampling.
    """
    # Output schema of the fitted preprocessing workflow drives both towers.
    full_schema = nvt.Workflow.load(workflow_dir).output_schema

    # --- query (user) tower ---
    user_features = full_schema.select_by_tag(Tags.USER)
    query_tower = mm.Encoder(
        mm.InputBlockV2(user_features),
        mm.MLPBlock(layer_sizes),
    )

    # --- candidate (item) tower ---
    item_features = full_schema.select_by_tag(Tags.ITEM)
    candidate_tower = mm.Encoder(
        mm.InputBlockV2(item_features),
        mm.MLPBlock(layer_sizes),
    )

    # Contrastive objective scored with a dot product between the towers.
    retrieval_output = mm.ContrastiveOutput(
        to_call=DotProduct(),
        negative_samplers="in-batch",
        schema=item_features.select_by_tag(Tags.ITEM_ID),
        candidate_name="item",
    )

    return mm.RetrievalModelV2(
        query=query_tower,
        candidate=candidate_tower,
        output=retrieval_output,
    )

Writing src/trainer/two_tower_model.py


## Train task

In [50]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/{TRAIN_SUB_DIR}/train_task.py

import argparse
import json
import logging
import os
import sys
import time
import pandas as pd

# we can control how much memory to give tensorflow with this environment variable
# IMPORTANT: make sure you do this before you initialize TF's runtime, otherwise
# TF will have claimed all free GPU memory
# os.environ["TF_MEMORY_ALLOCATION"] = "0.3"  # fraction of free memory

# # nvtabular
# import nvtabular as nvt
# import nvtabular.ops as ops

# merlin
# from merlin.models.utils.example_utils import workflow_fit_transform
from merlin.io.dataset import Dataset as MerlinDataset
from merlin.models.tf.outputs.base import DotProduct, MetricsFn, ModelOutput
from merlin.schema.tags import Tags
import merlin.models.tf as mm

from merlin.models.utils.dataset import unique_rows_by_features

# nvtabular
import nvtabular as nvt
import nvtabular.ops as ops

# tensorflow
import tensorflow as tf
from tensorflow.python.client import device_lib

# gcp
import google.cloud.aiplatform as vertex_ai
from google.cloud import storage
from google.cloud.storage.bucket import Bucket
from google.cloud.storage.blob import Blob
# import hypertune
# from google.cloud.aiplatform.training_utils import cloud_profiler

# repo
from .two_tower_model import create_two_tower
# import utils

# local
HYPERTUNE_METRIC_NAME = 'AUC'
LOCAL_MODEL_DIR = '/tmp/saved_model'
LOCAL_CHECKPOINT_DIR = '/tmp/checkpoints'

# ====================================================
# Helper functions - TODO: move to utils?
# ====================================================

def _is_chief(task_type, task_id): 
    ''' Check for primary if multiworker training
    '''
    if task_type == 'chief':
        results = 'chief'
    else:
        results = None
    return results
    # return (task_type == 'chief') or (task_type == 'worker' and task_id == 0) or task_type is None
    # return ((task_type == 'chief' and task_id == 0) or task_type is None)

def get_upload_logs_to_manged_tb_command(tb_resource_name, logs_dir, experiment_name, ttl_hrs, oneshot="false"):
    """Build the `tb-gcp-uploader` shell command for a managed TensorBoard.

    The returned string can be pasted into a terminal (or handed to a shell)
    to upload the TensorBoard logs under `logs_dir` to the managed instance
    `tb_resource_name`. The log directory is per-run, which helps pick the
    right timestamped run in TensorBoard. Pass oneshot="true" to upload
    everything once, e.g. after training has finished.

    Args:
        tb_resource_name: resource name of the managed TensorBoard instance.
        logs_dir: directory holding the TensorBoard event files.
        experiment_name: experiment name passed to the uploader.
        ttl_hrs: hours, converted to seconds for --event_file_inactive_secs.
        oneshot: value passed verbatim to --one_shot.

    Returns:
        The command line as a single string.
    """
    inactive_secs = 60*60*ttl_hrs
    # Flags are separated by a fixed run of nine spaces.
    flag_gap = " " * 9
    pieces = [
        f"tb-gcp-uploader --tensorboard_resource_name={tb_resource_name}",
        f"--logdir={logs_dir}",
        f"--experiment_name={experiment_name}",
        f"--one_shot={oneshot}",
        f"--event_file_inactive_secs={inactive_secs}",
    ]
    return flag_gap.join(pieces)

def _upload_blob_gcs(gcs_uri, source_file_name, destination_blob_name, project):
    """Uploads a local file to `<gcs_uri>/<destination_blob_name>` on GCS.

    Args:
        gcs_uri: `gs://bucket/prefix` under which the object is created.
        source_file_name: path of the local file to upload.
        destination_blob_name: object name appended to `gcs_uri`.
        project: GCP project used to create the storage client.
    """
    client = storage.Client(project=project)
    # Object URIs always use '/', so join explicitly instead of relying on the
    # host OS path separator.
    blob_uri = f"{gcs_uri.rstrip('/')}/{destination_blob_name.lstrip('/')}"
    # Hand the client to the blob through the public API rather than
    # overwriting the bucket's private `_client` attribute.
    blob = Blob.from_string(blob_uri, client=client)
    blob.upload_from_filename(source_file_name)
    
def get_arch_from_string(arch_string):
    """Parse a layer-size spec such as '[512, 256, 128]' into a list of ints.

    Square brackets and any whitespace are ignored. Empty entries (as in
    '[]' or a trailing comma) are skipped instead of raising.

    Args:
        arch_string: comma-separated integers, optionally wrapped in brackets.

    Returns:
        List of ints in the order given; empty list for an empty spec.

    Raises:
        ValueError: if a non-empty entry is not an integer literal.
    """
    # Drop every kind of whitespace, not just literal spaces.
    compact = "".join(arch_string.split())
    compact = compact.replace('[', '').replace(']', '')
    return [int(token) for token in compact.split(',') if token]

# ====================================================
# TRAINING SCRIPT
# ====================================================
    
def main(args):
    """Runs a training loop.

    Steps, in order:
      1. initialise Vertex AI and derive the GCS working directory;
      2. pick a `tf.distribute` strategy from `args.distribute` (used here to
         derive the global batch size and the task type; the model itself is
         built outside of `strategy.scope()`);
      3. load the NVTabular workflow schema and the train/valid parquet data;
      4. build, compile and fit the two-tower model;
      5. on the chief: log metrics/params to Vertex AI Experiments and save
         the query and candidate towers;
      6. export candidate embeddings for TRAIN and VALID items as
         newline-delimited JSON and (on the chief) upload the file to GCS.

    Args:
        args: parsed command-line namespace (see `parse_args`).

    Raises:
        ValueError: if `args.distribute` is not 'single', 'mirrored' or
            'multiworker'.
    """
    # tf.debugging.set_log_device_placement(True) # logs all tf ops and their device placement;
    # os.environ['TF_GPU_THREAD_MODE']='gpu_private'
    # os.environ['TF_GPU_THREAD_COUNT']='1'
    os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"

    TIMESTAMP = time.strftime("%Y%m%d-%H%M%S")

    vertex_ai.init(project=f'{args.project}', location=f'{args.location}')
    logging.info("vertex_ai initialized...")

    EXPERIMENT_NAME = f"{args.experiment_name}"
    RUN_NAME = f"{args.experiment_run}-{TIMESTAMP}" # f"{args.experiment_run}"
    logging.info(f"EXPERIMENT_NAME: {EXPERIMENT_NAME}\n RUN_NAME: {RUN_NAME}")

    WORKING_DIR_GCS_URI = f'gs://{args.train_output_bucket}/{EXPERIMENT_NAME}/{RUN_NAME}'
    logging.info(f"WORKING_DIR_GCS_URI: {WORKING_DIR_GCS_URI}")

    TB_RESOURCE_NAME = f'{args.tb_name}'
    LOGS_DIR = f'{WORKING_DIR_GCS_URI}/tb_logs'
    logging.info(f"tensorboard LOGS_DIR: {LOGS_DIR}")

    # ====================================================
    # Set Device / GPU Strategy
    # ====================================================
    logging.info("Detecting devices....")
    logging.info(f'Detected Devices {str(device_lib.list_local_devices())}')

    logging.info("Setting device strategy...")

    # Single Machine, single compute device
    if args.distribute == 'single':
        # list_physical_devices replaces the deprecated tf.test.is_gpu_available()
        if tf.config.list_physical_devices('GPU'):
            strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
        else:
            strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
        logging.info("Single device training")

    # Single Machine, multiple compute device
    elif args.distribute == 'mirrored':
        strategy = tf.distribute.MirroredStrategy()
        logging.info("Mirrored Strategy distributed training")

    # Multi Machine, multiple compute device
    elif args.distribute == 'multiworker':
        strategy = tf.distribute.MultiWorkerMirroredStrategy()
        logging.info("Multi-worker Strategy distributed training")
        logging.info('TF_CONFIG = {}'.format(os.environ.get('TF_CONFIG', 'Not found')))

    else:
        # Fail here with a clear message instead of a NameError on `strategy` below.
        raise ValueError(
            f"Unsupported --distribute value {args.distribute!r}; "
            "expected 'single', 'mirrored' or 'multiworker'"
        )

    # set related vars...
    NUM_WORKERS = strategy.num_replicas_in_sync
    GLOBAL_BATCH_SIZE = NUM_WORKERS * args.per_gpu_batch_size

    logging.info(f'NUM_WORKERS = {NUM_WORKERS}')
    logging.info(f'GLOBAL_BATCH_SIZE: {GLOBAL_BATCH_SIZE}')

    # set worker vars...
    logging.info('Setting task_type and task_id...')
    if args.distribute == 'multiworker':
        task_type, task_id = (
            strategy.cluster_resolver.task_type,
            strategy.cluster_resolver.task_id
        )
    else:
        # Non-multiworker runs have a single process, which acts as the chief.
        task_type, task_id = 'chief', None

    logging.info(f'task_type = {task_type}')
    logging.info(f'task_id = {task_id}')

    # ====================================================
    # Prepare Train and Valid Data
    # ====================================================
    logging.info(f'Loading workflow & schema from : {args.workflow_dir}')

    workflow = nvt.Workflow.load(args.workflow_dir) # gs://{BUCKET}/..../nvt-analyzed
    schema = workflow.output_schema

    train_data = MerlinDataset(os.path.join(args.train_dir, "*.parquet"), schema=schema, part_size="1GB")
    valid_data = MerlinDataset(os.path.join(args.valid_dir, "*.parquet"), schema=schema, part_size="1GB")

    # ====================================================
    # Callbacks
    # ====================================================
    class UploadTBLogsBatchEnd(tf.keras.callbacks.Callback):
        """Runs a one-shot TensorBoard log upload at the end of every epoch."""

        def on_epoch_end(self, epoch, logs=None):
            os.system(
                get_upload_logs_to_manged_tb_command(
                    tb_resource_name=TB_RESOURCE_NAME,
                    logs_dir=LOGS_DIR,
                    experiment_name=EXPERIMENT_NAME,
                    ttl_hrs = 5,
                    oneshot="true",
                )
            )

    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        log_dir=LOGS_DIR,
        histogram_freq=0,
        write_graph=True,
        # profile_batch=(20,50)
    )

    # ====================================================
    # Train
    # ====================================================
    LAYER_SIZES = get_arch_from_string(args.layer_sizes)
    logging.info(f'LAYER_SIZES: {LAYER_SIZES}')

    # with strategy.scope():
    model = create_two_tower(
        train_dir=args.train_dir,
        valid_dir=args.valid_dir,
        workflow_dir=args.workflow_dir,
        layer_sizes=LAYER_SIZES # args.layer_sizes,
    )

    model.compile(
        optimizer=tf.keras.optimizers.Adagrad(args.learning_rate),
        run_eagerly=False,
        metrics=[mm.RecallAt(1), mm.RecallAt(10), mm.NDCGAt(10)],
    )

    # cloud_profiler.init() # managed TB profiler

    logging.info('Starting training loop...')

    start_model_fit = time.time()

    model.fit(
        train_data,
        validation_data=valid_data,
        batch_size=GLOBAL_BATCH_SIZE,
        epochs=args.num_epochs,
        # steps_per_epoch=20,
        callbacks=[
            tensorboard_callback,
            UploadTBLogsBatchEnd()
        ],
    )

    # capture elapsed time
    elapsed_model_fit = round(time.time() - start_model_fit, 2)
    logging.info(f'Elapsed model_fit: {elapsed_model_fit} seconds')

    # ====================================================
    # metaparams & metrics for Vertex Ai Experiments
    # ====================================================
    logging.info('Logging params & metrics for Vertex Experiments')

    # last recorded value of every metric in the fit history
    metrics_dict = {
        key: values[-1] for key, values in model.history.history.items()
    }
    metrics_dict["elapsed_model_fit"] = elapsed_model_fit

    logging.info(f'metrics_dict: {metrics_dict}')

    metaparams = {}
    metaparams["experiment_name"] = f'{EXPERIMENT_NAME}'
    metaparams["experiment_run"] = f"{RUN_NAME}"

    logging.info(f'metaparams: {metaparams}')

    hyperparams = {}
    hyperparams["epochs"] = int(args.num_epochs)
    hyperparams["num_gpus"] = NUM_WORKERS # num_gpus
    hyperparams["per_gpu_batch_size"] = args.per_gpu_batch_size
    hyperparams["global_batch_size"] = GLOBAL_BATCH_SIZE
    hyperparams["learning_rate"] = args.learning_rate
    hyperparams['layers'] = f'{args.layer_sizes}'

    logging.info(f'hyperparams: {hyperparams}')

    # ====================================================
    # Experiments
    # ====================================================
    logging.info(f"Creating run: {RUN_NAME}; for experiment: {EXPERIMENT_NAME}")

    if task_type == 'chief':
        logging.info(f" task_type logging experiments: {task_type}")
        logging.info(f" task_id logging experiments: {task_id}")

        # Create experiment
        vertex_ai.init(experiment=EXPERIMENT_NAME)

        with vertex_ai.start_run(RUN_NAME) as my_run:
            logging.info("logging metrics_dict")
            my_run.log_metrics(metrics_dict)

            logging.info("logging metaparams")
            my_run.log_params(metaparams)

            logging.info("logging hyperparams")
            my_run.log_params(hyperparams)

    # =============================================
    # save retrieval (query) tower
    # =============================================
    # set vars...
    MODEL_DIR = f"{WORKING_DIR_GCS_URI}/model-dir"
    logging.info(f'Saving towers to {MODEL_DIR}')

    QUERY_TOWER_PATH = f"{MODEL_DIR}/query-tower"
    CANDIDATE_TOWER_PATH = f"{MODEL_DIR}/candidate-tower"
    EMBEDDINGS_PATH = f"{MODEL_DIR}/candidate-embeddings"

    if task_type == 'chief':
        # save query tower
        query_tower = model.query_encoder
        query_tower.save(QUERY_TOWER_PATH)
        logging.info(f'Saved query tower to {QUERY_TOWER_PATH}')

        candidate_tower = model.candidate_encoder
        candidate_tower.save(CANDIDATE_TOWER_PATH)
        logging.info(f'Saved candidate tower to {CANDIDATE_TOWER_PATH}')

    # =============================================
    # save embeddings for ME index
    # =============================================
    EMBEDDINGS_FILE_NAME = "candidate_embeddings.json"
    logging.info(f"Saving {EMBEDDINGS_FILE_NAME} to {EMBEDDINGS_PATH}")

    # The embedding width equals the size of the last MLP layer.
    embedding_dim = LAYER_SIZES[-1]

    # Maps the row index of the stored unique-values file to its track_uri_can
    # value. NOTE(review): presumably encoded category id -> raw track URI - confirm.
    item_data = pd.read_parquet(f'{args.workflow_dir}/categories/unique.track_uri_can.parquet')
    lookup_dict = dict(item_data['track_uri_can'])

    def format_for_matching_engine(data) -> str:
        """Render one embedding row as a single JSON line."""
        emb = [data[i] for i in range(embedding_dim)] # get the embeddings
        return '{"id":"' + str(data['track_uri_can']) + '","embedding":[' + ",".join(str(x) for x in emb) + ']}'

    def write_item_embeddings(dataset, out_file) -> None:
        """Compute candidate embeddings for the unique items of `dataset` and append them to `out_file`."""
        item_features = unique_rows_by_features(dataset, Tags.ITEM, Tags.ID)
        item_embs = model.candidate_embeddings(
            item_features,
            index=item_features.schema['track_uri_can'],
            batch_size=10000,
        )
        item_emb_pd = item_embs.compute().to_pandas().fillna(1e-10).reset_index() #filling blanks with an epsilon value
        item_emb_pd['track_uri_can'] = item_emb_pd['track_uri_can'].apply(lambda l: lookup_dict[l])
        for _, row in item_emb_pd.iterrows():
            out_file.write(format_for_matching_engine(row))
            out_file.write("\n")

    # Open the output once in write mode: the file starts empty on every run
    # (no stale records from an earlier run) and is not reopened per row.
    # NOTE(review): an item present in both TRAIN and VALID is written twice -
    # confirm whether the index build tolerates duplicate ids.
    with open(f"{EMBEDDINGS_FILE_NAME}", 'w') as embeddings_file:
        # item embeds from TRAIN
        start_embeds = time.time()
        write_item_embeddings(train_data, embeddings_file)
        elapsed_time = round(time.time() - start_embeds, 2)
        logging.info(f'Elapsed time writting TRAIN embeddings: {elapsed_time} seconds')

        # item embeds from VALID
        start_embeds = time.time()
        write_item_embeddings(valid_data, embeddings_file)
        elapsed_time = round(time.time() - start_embeds, 2)
        logging.info(f'Elapsed time writting VALID embeddings: {elapsed_time} seconds')

    if task_type == 'chief':
        _upload_blob_gcs(
            EMBEDDINGS_PATH,
            f"{EMBEDDINGS_FILE_NAME}",
            f"{EMBEDDINGS_FILE_NAME}",
            args.project
        )

    logging.info('All done - model saved') #all done
    
# ====================================================
# arg parser
# ====================================================
    
def parse_args():
    """Parses command line arguments.

    Required flags: --tb_name, --train_output_bucket, --workflow_dir,
    --train_dir, --valid_dir, --num_epochs, --per_gpu_batch_size, --project,
    --location. All other flags have defaults.

    Returns:
        argparse.Namespace with one attribute per flag.
    """
    # One (flag, add_argument kwargs) entry per option, in help-output order.
    arg_specs = [
        ('--experiment_name', dict(type=str, default='unnamed-experiment',
                                   help='name of vertex ai experiment')),
        ('--experiment_run', dict(type=str, default='unnamed_run',
                                  help='name of vertex ai experiment run')),
        ('--tb_name', dict(type=str, required=True,
                           help='projects/XXXXXX/locations/us-central1/tensorboards/XXXXXXXX')),
        # Restrict to the strategies the training script actually implements so
        # that a typo is rejected at parse time.
        ('--distribute', dict(type=str, default='single',
                              choices=['single', 'mirrored', 'multiworker'],
                              help='training strategy')),
        ('--train_output_bucket', dict(type=str, required=True,
                                       help='gcs bucket name')),
        ('--workflow_dir', dict(type=str, required=True,
                                help='Path to saved workflow.pkl e.g., nvt-analyzed')),
        # The training script globs "*.parquet" inside these two directories.
        ('--train_dir', dict(type=str, required=True,
                             help='Path to directory with training data parquet files')),
        ('--valid_dir', dict(type=str, required=True,
                             help='Path to directory with validation data parquet files')),
        ('--num_epochs', dict(type=int, required=True, help='num_epochs')),
        ('--per_gpu_batch_size', dict(type=int, required=True,
                                      help='Per GPU Batch size')),
        ('--layer_sizes', dict(type=str, default='[512, 256, 128]',
                               help='layer_sizes')),
        ('--learning_rate', dict(type=float, default=.001,
                                 help='learning_rate')),
        ('--project', dict(type=str, required=True, help='gcp project')),
        ('--location', dict(type=str, required=True, help='gcp location')),
    ]

    parser = argparse.ArgumentParser()
    for flag, options in arg_specs:
        parser.add_argument(flag, **options)

    return parser.parse_args()

if __name__ == '__main__':
    # Send INFO-and-above records to stdout with a timestamp prefix.
    logging.basicConfig(
        stream=sys.stdout,
        level=logging.INFO,
        datefmt='%d-%m-%y %H:%M:%S',
        format='%(asctime)s - %(message)s',
    )

    cli_args = parse_args()
    logging.info('Args: %s', cli_args)

    # Wall-clock time of the whole run, including model export.
    run_started = time.time()
    logging.info('Starting training')

    main(cli_args)

    run_seconds = time.time() - run_started
    logging.info('Training completed. Elapsed time: %s', run_seconds)

Writing src/trainer/train_task.py


## Training Image

### versioned image

In [51]:
# Docker definitions for training
MERLIN_VERSION = '22_09_v2'
IMAGE_NAME = f'{FRAMEWORK}-{MODEL_NAME}-training-{VERSION}-{MERLIN_VERSION}'
IMAGE_URI = f'gcr.io/{PROJECT_ID}/{IMAGE_NAME}'

# Suffix of the Dockerfile written below (Dockerfile.<DOCKERNAME>).
DOCKERNAME = f'merlintf-{MERLIN_VERSION}'
# Machine type passed to `gcloud builds submit --machine-type` (build machine).
MACHINE_TYPE ='e2-highcpu-32'
# Docker build context; substituted for $_FILE_LOCATION in cloudbuild.yaml.
FILE_LOCATION = './src'

* nvtabular==1.5.0
* nvtabular==1.3.3
* cloudml-hypertune

```
RUN pip install google-cloud-bigquery gcsfs
RUN pip install google-cloud-aiplatform[cloud_profiler] kfp
```

In [52]:
# %%writefile {REPO_DOCKER_PATH_PREFIX}/Dockerfile.{DOCKERNAME}

# FROM nvcr.io/nvidia/merlin/merlin-tensorflow:22.09

# WORKDIR /src

# RUN pip install -U pip
# RUN pip install git+https://github.com/NVIDIA-Merlin/models.git@efe4bc91cc7e161f6e1c6dab3ff2a8ef04fd84b5 gcsfs google-cloud-aiplatform[cloud_profiler] kfp
# RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg  add - && apt-get update -y && apt-get install google-cloud-sdk -y

# COPY trainer/* ./

# ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/hugectr/lib:/usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib:/repos/dist/lib:/opt/tritonserver/lib

In [53]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/Dockerfile.{DOCKERNAME}

FROM nvcr.io/nvidia/merlin/merlin-tensorflow:22.09

WORKDIR /

# RUN pip install -U pip
RUN pip install git+https://github.com/NVIDIA-Merlin/models.git@efe4bc91cc7e161f6e1c6dab3ff2a8ef04fd84b5 gcsfs google-cloud-aiplatform fastapi


COPY trainer /trainer
# COPY trainer/* ./

# RUN pip install -r trainer/requirements.txt

# CMD python trainer/task.py

Overwriting src/Dockerfile.merlintf-22_09_v2


In [54]:
# %%writefile {REPO_DOCKER_PATH_PREFIX}/{TRAIN_SUB_DIR}/requirements.txt
# fastapi
# git+https://github.com/NVIDIA-Merlin/models.git@efe4bc91cc7e161f6e1c6dab3ff2a8ef04fd84b5
# gsutil
# gcsfs
# matplotlib
# google-cloud-aiplatform

In [55]:
!tree /home/jupyter/spotify-merlin/{REPO_DOCKER_PATH_PREFIX}/{TRAIN_SUB_DIR}

[01;34m/home/jupyter/spotify-merlin/src/trainer[00m
├── __init__.py
├── interactive_train.py
├── train_task.py
└── two_tower_model.py

0 directories, 4 files


### nightly image

In [56]:
# # Docker definitions for training
# MERLIN_VERSION = 'nightly'
# IMAGE_NAME = f'{FRAMEWORK}-{MODEL_NAME}-training-{VERSION}-{MERLIN_VERSION}'
# IMAGE_URI = f'gcr.io/{PROJECT_ID}/{IMAGE_NAME}'

# DOCKERNAME = f'merlintf-{MERLIN_VERSION}'
# MACHINE_TYPE ='e2-highcpu-32'
# FILE_LOCATION = './src'

In [57]:
# %%writefile {REPO_DOCKER_PATH_PREFIX}/Dockerfile.{DOCKERNAME}

# # FROM nvcr.io/nvidia/merlin/merlin-tensorflow:22.09
# FROM nvcr.io/nvidia/merlin/merlin-tensorflow:nightly

# WORKDIR /src

# RUN pip install -U pip
# RUN pip install git+https://github.com/NVIDIA-Merlin/models.git
# RUN pip install google-cloud-bigquery gcsfs cloudml-hypertune
# RUN pip install google-cloud-aiplatform[cloud_profiler] kfp
# RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg  add - && apt-get update -y && apt-get install google-cloud-sdk -y

# COPY trainer/* ./

# ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/hugectr/lib:/usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib:/repos/dist/lib:/opt/tritonserver/lib


In [58]:
# !tree /home/jupyter/spotify-merlin/{REPO_DOCKER_PATH_PREFIX}/{TRAIN_SUB_DIR}

In [59]:
# IMAGE_URI='gcr.io/hybrid-vertex/merlin-tf-twotower-training-jtv1-nightly'

# Build Train Image

### test locally

In [60]:
os.chdir('/home/jupyter/spotify-merlin')
os.getcwd()

'/home/jupyter/spotify-merlin'

In [61]:
# TIMESTAMP = time.strftime("%Y%m%d-%H%M%S")

# _LOCATION='us-central1'
# _TB_NAME='projects/934903580331/locations/us-central1/tensorboards/5925030667573264384'
# _DATA_DIR='gs://spotify-beam-v3/merlin-processed'
# _TRAIN_DATA=f'{_DATA_DIR}/valid/'
# _VALID_DATA=f'{_DATA_DIR}/valid/'
# _WORKFLOW_DIR=f'{_DATA_DIR}/workflow/2t-spotify-workflow'
# _OUTPUT_BUCKET='jt-merlin-scaling'
# _EXPERIMENT_NAME='local-experiment'
# _EXPERIMENT_RUN=f'run-v1-{TIMESTAMP}'
# _DISTRIBUTE='single'
# _PER_GPU_BATCH_SIZE=4096
# _LAYER_SIZES='[256, 128]'
# _LEARNING_RATE=0.001
# _NUM_EPOCHS=1

# # !cd src/trainer; python3 -m trainer.train_task \
# !cd src/trainer; python3 train_task.py \
#     --project=PROJECT_ID --location=$_LOCATION \
#     --train_output_bucket=$_OUTPUT_BUCKET --tb_name=$_TB_NAME \
#     --workflow_dir=$_WORKFLOW_DIR --train_dir=$_TRAIN_DATA --valid_dir=$_VALID_DATA \
#     --experiment_name=$_EXPERIMENT_NAME --experiment_run=$_EXPERIMENT_RUN \
#     --distribute=$_DISTRIBUTE \
#     --num_epochs=$_NUM_EPOCHS \
#     --per_gpu_batch_size=$_PER_GPU_BATCH_SIZE

### `cloudbuild.yaml`

In [62]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/cloudbuild.yaml

steps:
- name: 'gcr.io/cloud-builders/docker'
  args: ['build', '-t', '$_IMAGE_URI', '$_FILE_LOCATION', '-f', '$_FILE_LOCATION/Dockerfile.$_DOCKERNAME']
images:
- '$_IMAGE_URI'

Overwriting src/cloudbuild.yaml


In [63]:
os.chdir('/home/jupyter/spotify-merlin')
os.getcwd()

'/home/jupyter/spotify-merlin'

In [64]:
! gcloud builds submit --config src/cloudbuild.yaml \
    --substitutions _DOCKERNAME=$DOCKERNAME,_IMAGE_URI=$IMAGE_URI,_FILE_LOCATION=$FILE_LOCATION \
    --timeout=2h \
    --machine-type=$MACHINE_TYPE

Creating temporary tarball archive of 87 file(s) totalling 1.9 MiB before compression.
Uploading tarball of [.] to [gs://hybrid-vertex_cloudbuild/source/1667920095.497321-5d11b67f530e48d080b9d6f937632f29.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/hybrid-vertex/locations/global/builds/db4bbe66-4a28-4476-9d69-e8d7ab9e52d7].
Logs are available at [ https://console.cloud.google.com/cloud-build/builds/db4bbe66-4a28-4476-9d69-e8d7ab9e52d7?project=934903580331 ].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "db4bbe66-4a28-4476-9d69-e8d7ab9e52d7"

FETCHSOURCE
Fetching storage object: gs://hybrid-vertex_cloudbuild/source/1667920095.497321-5d11b67f530e48d080b9d6f937632f29.tgz#1667920096132128
Copying gs://hybrid-vertex_cloudbuild/source/1667920095.497321-5d11b67f530e48d080b9d6f937632f29.tgz#1667920096132128...
/ [1 files][322.1 KiB/322.1 KiB]                                                
Operation completed over 1 objects/322.

# Vertex Train Job

### Prepare `worker_pool_specs`

In [65]:
def prepare_worker_pool_specs(
    image_uri,
    # args,
    cmd,
    replica_count=1,
    machine_type="n1-standard-16",
    accelerator_count=1,
    accelerator_type="ACCELERATOR_TYPE_UNSPECIFIED",
    reduction_server_count=0,
    reduction_server_machine_type="n1-highcpu-16",
    reduction_server_image_uri="us-docker.pkg.dev/vertex-ai-restricted/training/reductionserver:latest",
):
    """Build the `worker_pool_specs` list for a Vertex AI custom training job.

    Pool layout (by list index):
      0 - chief: always exactly one replica;
      1 - workers: `replica_count - 1` replicas, or an empty placeholder `{}`
          when there are no extra workers but reduction servers are requested;
      2 - reduction servers: present only if `reduction_server_count > 0`.

    Args:
        image_uri: training container image.
        cmd: container command (list of strings).
        replica_count: total number of training replicas, chief included.
        machine_type: machine type for chief and workers.
        accelerator_count: accelerators per machine; 0 omits accelerator fields.
        accelerator_type: accelerator type name, used when count > 0.
        reduction_server_count: number of reduction server replicas.
        reduction_server_machine_type: machine type for reduction servers.
        reduction_server_image_uri: reduction server container image.

    Returns:
        List of worker pool spec dicts. Every pool gets its own copy of the
        machine/container dicts, so editing one pool does not affect another.
    """
    machine_spec = {"machine_type": machine_type}
    if accelerator_count > 0:
        machine_spec["accelerator_type"] = accelerator_type
        machine_spec["accelerator_count"] = accelerator_count

    container_spec = {
        "image_uri": image_uri,
        # "args": args,
        "command": cmd,
    }

    def _training_pool(count):
        # Fresh dict copies so the pools do not alias each other.
        return {
            "replica_count": count,
            "machine_spec": dict(machine_spec),
            "container_spec": dict(container_spec),
        }

    worker_pool_specs = [_training_pool(1)]
    if replica_count > 1:
        worker_pool_specs.append(_training_pool(replica_count - 1))

    # A single reduction server is a valid request, so test for > 0.
    if reduction_server_count > 0:
        if len(worker_pool_specs) == 1:
            # Keep the reduction servers at index 2 by leaving the worker
            # slot (index 1) explicitly empty.
            worker_pool_specs.append({})
        worker_pool_specs.append(
            {
                "replica_count": reduction_server_count,
                "machine_spec": {
                    "machine_type": reduction_server_machine_type,
                },
                "container_spec": {"image_uri": reduction_server_image_uri},
            }
        )

    return worker_pool_specs

### Accelerators and Device Strategy

In [66]:
import time

# ====================================================
# Single | Single machine, single GPU
# ====================================================
# Active hardware configuration. The values below are passed to
# `prepare_worker_pool_specs` in the "Worker args" cell; the commented-out
# groups that follow are alternative configurations to swap in.
WORKER_MACHINE_TYPE = 'a2-highgpu-1g'    # machine type for chief and workers
REPLICA_COUNT = 1                        # total training replicas (chief included)
ACCELERATOR_TYPE = 'NVIDIA_TESLA_A100'   # also lower-cased into the job's `gpu` label
PER_MACHINE_ACCELERATOR_COUNT = 1        # accelerators attached to each machine
REDUCTION_SERVER_COUNT = 0  # 0 -> no reduction server pool is added
REDUCTION_SERVER_MACHINE_TYPE = "n1-highcpu-16"
DISTRIBUTE_STRATEGY = 'single'           # forwarded to the trainer as --distribute

# ====================================================
# Mirrored | Single Machine; multiple GPU
# ====================================================
# WORKER_MACHINE_TYPE = 'a2-highgpu-2g'           # a2-ultragpu-4g
# REPLICA_COUNT = 1
# ACCELERATOR_TYPE = 'NVIDIA_TESLA_A100'
# PER_MACHINE_ACCELERATOR_COUNT = 2
# REDUCTION_SERVER_COUNT = 0                                                      
# REDUCTION_SERVER_MACHINE_TYPE = "n1-highcpu-16"
# DISTRIBUTE_STRATEGY = 'mirrored'

# ====================================================
# Multi-Worker | Multiple Machines, 1 GPU per Machine
# ====================================================
# WORKER_MACHINE_TYPE = 'n1-standard-16'
# REPLICA_COUNT = 10
# ACCELERATOR_TYPE = 'NVIDIA_TESLA_T4'
# PER_MACHINE_ACCELERATOR_COUNT = 1
# REDUCTION_SERVER_COUNT = 10                                                      
# REDUCTION_SERVER_MACHINE_TYPE = "n1-highcpu-16"
# DISTRIBUTE_STRATEGY = 'multiworker'

# WORKER_MACHINE_TYPE = 'a2-highgpu-1g'
# REPLICA_COUNT = 2
# ACCELERATOR_TYPE = 'NVIDIA_TESLA_A100'
# PER_MACHINE_ACCELERATOR_COUNT = 1
# REDUCTION_SERVER_COUNT = 4                                                      
# REDUCTION_SERVER_MACHINE_TYPE = "n1-highcpu-16"
# DISTRIBUTE_STRATEGY = 'multiworker'

# WORKER_MACHINE_TYPE = 'a2-highgpu-1g'
# REPLICA_COUNT = 4
# ACCELERATOR_TYPE = 'NVIDIA_TESLA_A100'
# PER_MACHINE_ACCELERATOR_COUNT = 1
# REDUCTION_SERVER_COUNT = 4                                                      
# REDUCTION_SERVER_MACHINE_TYPE = "n1-highcpu-16"
# DISTRIBUTE_STRATEGY = 'multiworker'

## Train Args

### Previously defined Vars

In [67]:
# Echo the settings defined in earlier cells so the values used for this
# run are visible in the cell output (one item per line).
print(
    *[
        f"PROJECT: {PROJECT_ID}",
        f"VERSION: {VERSION}",
        f"IMAGE_URI: {IMAGE_URI}",
        f"MODEL_NAME: {MODEL_NAME}",
        f"FRAMEWORK: {FRAMEWORK}",
        f"MODEL_DISPLAY_NAME: {MODEL_DISPLAY_NAME}",
        f"WORKSPACE: {WORKSPACE}",
        f"IMAGE_URI: {IMAGE_URI}",  # printed a second time, same as before
    ],
    sep="\n",
)

PROJECT: hybrid-vertex
VERSION: jtv1
IMAGE_URI: gcr.io/hybrid-vertex/merlin-tf-2tower-training-jtv1-22_09_v2
MODEL_NAME: 2tower
FRAMEWORK: merlin-tf
MODEL_DISPLAY_NAME: vertex-merlin-tf-2tower-jtv1
WORKSPACE: gs://jt-merlin-scaling/vertex-merlin-tf-2tower-jtv1
IMAGE_URI: gcr.io/hybrid-vertex/merlin-tf-2tower-training-jtv1-22_09_v2


In [68]:
# Experiment and run identifiers
EXPERIMENT_PREFIX = 'test'
EXPERIMENT_NAME = '-'.join([EXPERIMENT_PREFIX, MODEL_NAME, FRAMEWORK, VERSION])
RUN_NAME_PREFIX = 'run-v4' # timestamp assigned during job

# # data and schema from JW 
# DATA_DIR = 'gs://spotify-beam-v3/merlin-processed'
# TRAIN_DATA = f'{DATA_DIR}/train' #/_gcs_file_list.txt'
# VALID_DATA = f'{DATA_DIR}/valid' #/_gcs_file_list.txt'
# # WORKFLOW_DIR = f'gs://{DATA_DIR}/workflow/2t-spotify'
# WORKFLOW_DIR = 'gs://spotify-beam-v3/merlin-processed/workflow/2t-spotify-workflow'

# data and schema from nvtabular pipes
DATA_DIR = 'gs://jt-merlin-scaling/nvt-last5-v1full/nvt-processed'
TRAIN_DATA = DATA_DIR + '/train' #/_gcs_file_list.txt'
VALID_DATA = DATA_DIR + '/valid' #/_gcs_file_list.txt'
WORKFLOW_DIR = 'gs://jt-merlin-scaling/nvt-last5-v1full/nvt-analyzed'

# Show the resolved names and paths, one per line
print(
    *[
        f"EXPERIMENT_NAME: {EXPERIMENT_NAME}",
        f"RUN_NAME_PREFIX: {RUN_NAME_PREFIX}",
        f"TRAIN_DATA: {TRAIN_DATA}",
        f"VALID_DATA: {VALID_DATA}",
        f"WORKFLOW_DIR: {WORKFLOW_DIR}",
    ],
    sep="\n",
)

EXPERIMENT_NAME: test-2tower-merlin-tf-jtv1
RUN_NAME_PREFIX: run-v4
TRAIN_DATA: gs://jt-merlin-scaling/nvt-last5-v1full/nvt-processed/train
VALID_DATA: gs://jt-merlin-scaling/nvt-last5-v1full/nvt-processed/valid
WORKFLOW_DIR: gs://jt-merlin-scaling/nvt-last5-v1full/nvt-analyzed


### Managed TB

In [69]:
# ====================================================
# Managed Tensorboard
# ====================================================
# Either create a new TensorBoard instance (commented-out block) or point at
# an existing one (active assignments). EXPERIMENT_TB is later passed to the
# trainer as --tb_name and to `job.run(tensorboard=...)`.

# create new
# TENSORBOARD_DISPLAY_NAME = f"tb-{EXPERIMENT_NAME}"
# tensorboard = vertex_ai.Tensorboard.create(display_name=TENSORBOARD_DISPLAY_NAME)
# EXPERIMENT_TB = f'{tensorboard.gca_resource.name}'

# use existing
# NOTE(review): this resource name is hard-coded to one project number and
# region — presumably it must be replaced when running elsewhere; confirm.
EXPERIMENT_TB = 'projects/934903580331/locations/us-central1/tensorboards/70659015247396864'
TENSORBOARD_DISPLAY_NAME = 'tb-test-twotower-merlin-tf-jtv1'


print("TensorBoard resource name:", EXPERIMENT_TB)
print("TENSORBOARD_DISPLAY_NAME:", TENSORBOARD_DISPLAY_NAME)
# Alternative: load a different existing instance as an SDK object
# TENSORBOARD= "projects/934903580331/locations/us-central1/tensorboards/7439955380509081600"
# tensorboard = vertex_ai.Tensorboard(f'{TENSORBOARD}')

TensorBoard resource name: projects/934903580331/locations/us-central1/tensorboards/70659015247396864
TENSORBOARD_DISPLAY_NAME: tb-test-twotower-merlin-tf-jtv1


### Worker args

In [70]:
# Training hyperparameters and output location; each is interpolated into
# the trainer command line built below.
OUTPUT_BUCKET = 'jt-merlin-scaling'   # --train_output_bucket (bucket name, no gs:// prefix)
NUM_EPOCHS = 1
BATCH_SIZE = 4096*4      # sent as --per_gpu_batch_size; TODO(jw): confirm the `* 4` multiplier
LEARNING_RATE = 0.001
LAYERS = "[512, 256, 128]"   # currently unused: the --layer_sizes flag is commented out below

# ACCELERATOR_NUM = REPLICA_COUNT * PER_MACHINE_ACCELERATOR_COUNT
# gpus = json.dumps([list(range(ACCELERATOR_NUM))]).replace(' ','')

# Earlier variants of the container command, kept for reference:
# WORKER_CMD = ["python", "trainer/train_task.py"]
# WORKER_CMD = ["python", "-m", "train_task"]
# WORKER_CMD = ['sh','-euc','pip freeze && python -m trainer.train_task']
# WORKER_CMD = ['sh','-euc','pip freeze && python trainer/train_task.py']

# Earlier approach: pass the flags as a separate `args` list instead of
# embedding them in the command string.
# WORKER_ARGS = [
#     f'--project={PROJECT_ID}',
#     f'--location={REGION}',
#     f'--tb_name={EXPERIMENT_TB}',
#     f'--workflow_dir={WORKFLOW_DIR}',
#     f'--train_dir={TRAIN_DATA}',
#     f'--valid_dir={VALID_DATA}',
#     f'--train_output_bucket={OUTPUT_BUCKET}',
#     f'--experiment_name={EXPERIMENT_NAME}',
#     f'--experiment_run={RUN_NAME_PREFIX}',
#     f'--distribute={DISTRIBUTE_STRATEGY}',
#     f'--per_gpu_batch_size={BATCH_SIZE}',
#     # f'--layer_sizes={LAYERS}',
#     f'--learning_rate={LEARNING_RATE}',
#     f'--num_epochs={NUM_EPOCHS}',
# ]

# Notes from earlier attempts: command tried -> error message observed
# python trainer/train_task.py    # python: can't open file 'trainer/train_task.py'
# python -m train_task            # /usr/bin/python: No module named train_task
# python -m trainer.train_task    # /etc/bash.bashrc: line 9: PS1: unbound variable
    
# The container command is one shell string run through `sh -euc`: it lists
# the installed packages (`pip freeze`) and then starts the trainer module with
# all flags inline. The trailing backslashes join the lines of the f-string,
# but each continuation line's 4-space indent stays in the string, which is
# why the printed command contains runs of spaces. Values are interpolated
# unquoted, so they must not contain spaces or shell metacharacters.
WORKER_CMD = [
    'sh',
    '-euc',
    f'''pip freeze && python -m trainer.train_task --tb_name={EXPERIMENT_TB} --per_gpu_batch_size={BATCH_SIZE} \
    --train_output_bucket={OUTPUT_BUCKET} --train_dir={TRAIN_DATA} --valid_dir={VALID_DATA} --workflow_dir={WORKFLOW_DIR} \
    --num_epochs={NUM_EPOCHS} --learning_rate={LEARNING_RATE} --distribute={DISTRIBUTE_STRATEGY} \
    --experiment_name={EXPERIMENT_NAME} --experiment_run={RUN_NAME_PREFIX} --project={PROJECT_ID} --location={REGION}'''
]
    # flag not currently passed to the trainer:
    # --layer_sizes={LAYERS} \

# ====================================================
# Worker pool specs
# ====================================================
# Combine the image, the command above and the hardware settings from the
# accelerator/strategy cell into the list that is handed to the custom job.
    
WORKER_POOL_SPECS = prepare_worker_pool_specs(
    image_uri=IMAGE_URI,
    # args=WORKER_ARGS,
    cmd=WORKER_CMD,
    replica_count=REPLICA_COUNT,
    machine_type=WORKER_MACHINE_TYPE,
    accelerator_count=PER_MACHINE_ACCELERATOR_COUNT,
    accelerator_type=ACCELERATOR_TYPE,
    reduction_server_count=REDUCTION_SERVER_COUNT,
    reduction_server_machine_type=REDUCTION_SERVER_MACHINE_TYPE,
)

# Pretty-print the resulting specs for a visual check before submitting
from pprint import pprint
pprint(WORKER_POOL_SPECS)

[{'container_spec': {'command': ['sh',
                                 '-euc',
                                 'pip freeze && python -m trainer.train_task '
                                 '--tb_name=projects/934903580331/locations/us-central1/tensorboards/70659015247396864 '
                                 '--per_gpu_batch_size=16384     '
                                 '--train_output_bucket=jt-merlin-scaling '
                                 '--train_dir=gs://jt-merlin-scaling/nvt-last5-v1full/nvt-processed/train '
                                 '--valid_dir=gs://jt-merlin-scaling/nvt-last5-v1full/nvt-processed/valid '
                                 '--workflow_dir=gs://jt-merlin-scaling/nvt-last5-v1full/nvt-analyzed     '
                                 '--num_epochs=1 --learning_rate=0.001 '
                                 '--distribute=single     '
                                 '--experiment_name=test-2tower-merlin-tf-jtv1 '
                                 '--exp

## Submit train job

In [71]:
# GCS path handed to the Vertex AI SDK as its staging bucket
STAGING_BUCKET = f'gs://{OUTPUT_BUCKET}/{EXPERIMENT_NAME}'

# initialize vertex sdk
vertex_ai.init(project=PROJECT_ID, location=REGION, staging_bucket=STAGING_BUCKET)

# display name of the training job
job_prefix = '2209-tb'
JOB_NAME = job_prefix + '-train-' + MODEL_DISPLAY_NAME

# labels for train job
gpu_type = ACCELERATOR_TYPE.lower()
gpu_per_replica = PER_MACHINE_ACCELERATOR_COUNT
replica_cnt = REPLICA_COUNT

# Summary of what is about to be submitted; the "\n" on the JOB_NAME entry
# leaves a blank line before the label values.
print(
    *[
        f'STAGING_BUCKET : {STAGING_BUCKET}',
        f'JOB_NAME : {JOB_NAME}\n',
        f'gpu_type : {gpu_type}',
        f'gpu_per_replica : {gpu_per_replica}',
        f'replica_cnt : {replica_cnt}',
    ],
    sep="\n",
)

STAGING_BUCKET : gs://jt-merlin-scaling/test-2tower-merlin-tf-jtv1
JOB_NAME : 2209-tb-train-vertex-merlin-tf-2tower-jtv1

gpu_type : nvidia_tesla_a100
gpu_per_replica : 1
replica_cnt : 1


In [None]:
# Define the custom training job from the prepared worker pool specs; the
# labels record the hardware configuration as strings.
job = vertex_ai.CustomJob(
    display_name=JOB_NAME,
    worker_pool_specs=WORKER_POOL_SPECS,
    staging_bucket=STAGING_BUCKET,
    labels=dict(
        # mm_image='nightly',
        gpu=str(gpu_type),
        gpu_per_replica=str(gpu_per_replica),
        replica_cnt=str(replica_cnt),
    ),
)

# Submit the job; sync=True keeps this cell blocked until run() returns.
job.run(
    sync=True,
    service_account=VERTEX_SA,
    tensorboard=EXPERIMENT_TB,
    restart_job_on_worker_restart=False,
    enable_web_access=True,
)