# Distributed Training Notebook

In [1]:
from absl import app
from absl import flags
from absl import logging
from google.cloud import aiplatform as vertex_ai
import os
import time

import json

# import tensorflow as tf
# import tensorflow_recommenders as tfrs
# import tensorflow_io as tfio

# from google.cloud import storage

import numpy as np
import pickle as pkl
from pprint import pprint

## Setup

In [2]:
# PREFIX = 'spotify-2tower'
APP='sp'
MODEL_TYPE='2tower'
FRAMEWORK = 'tfrs'
MODEL_VERSION = 'v15'
PIPELINE_VERSION = 'v0'
MODEL_ROOT_NAME = f'{APP}-{MODEL_TYPE}-{FRAMEWORK}-{MODEL_VERSION}-{PIPELINE_VERSION}'

PROJECT= 'hybrid-vertex'
REGION='us-central1'
# BUCKET_NAME='spotify-tfrecords-blog'
OUTPUT_BUCKET = 'jt-tfrs-test'
STAGING_BUCKET =f'gs://{OUTPUT_BUCKET}'
VERTEX_SA = '934903580331-compute@developer.gserviceaccount.com'

# Docker definitions for training
IMAGE_NAME = f'{MODEL_ROOT_NAME}-training'
IMAGE_URI = f'gcr.io/{PROJECT}/{IMAGE_NAME}'

DOCKERNAME = 'tfrs'
REPO_DOCKER_PATH_PREFIX = 'src'
MACHINE_TYPE ='e2-highcpu-32'
FILE_LOCATION = './src'

print(f"IMAGE_URI: {IMAGE_URI}")

IMAGE_URI: gcr.io/hybrid-vertex/sp-2tower-tfrs-v15-v0-training


### Create TensorBoard resource

In [3]:
# initialize vertex sdk
vertex_ai.init(
    project=PROJECT,
    location=REGION,
    staging_bucket=STAGING_BUCKET
)

In [4]:
TENSORBOARD_DISPLAY_NAME = f"{MODEL_ROOT_NAME}"

In [31]:
tensorboard = vertex_ai.Tensorboard.create(display_name=TENSORBOARD_DISPLAY_NAME)

tensorboard_resource_name = tensorboard.gca_resource.name
print("TensorBoard resource name:", tensorboard_resource_name)

Creating Tensorboard
Create Tensorboard backing LRO: projects/934903580331/locations/us-central1/tensorboards/5925030667573264384/operations/5508265526708666368
Tensorboard created. Resource name: projects/934903580331/locations/us-central1/tensorboards/5925030667573264384
To use this Tensorboard in another session:
tb = aiplatform.Tensorboard('projects/934903580331/locations/us-central1/tensorboards/5925030667573264384')
TensorBoard resource name: projects/934903580331/locations/us-central1/tensorboards/5925030667573264384


In [5]:
# TENSORBOARD = 'projects/934903580331/locations/us-central1/tensorboards/4842196432167370752'
# TENSORBOARD = 'projects/934903580331/locations/us-central1/tensorboards/5764308455871479808'

TENSORBOARD= "projects/934903580331/locations/us-central1/tensorboards/5925030667573264384"

# tb = aiplatform.Tensorboard('projects/934903580331/locations/us-central1/tensorboards/2710867908514283520')

## Perepare Vertex Training Package

### Create repo for training package

In [6]:
!pwd

/home/jupyter/spotify-tfrs


In [7]:
# Make folder for Python training script

# Make folder for Python training script
# ! rm -rf {REPO_DOCKER_PATH_PREFIX}
# ! mkdir {REPO_DOCKER_PATH_PREFIX}

# Add package information
# ! touch {REPO_DOCKER_PATH_PREFIX}/README.md

# Make the training subfolder
! rm -rf {REPO_DOCKER_PATH_PREFIX}/trainer
! mkdir {REPO_DOCKER_PATH_PREFIX}/trainer
! touch {REPO_DOCKER_PATH_PREFIX}/trainer/__init__.py

### interactive training shell in Vertex AI Training

In [8]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/trainer/interactive_train.py

import time

while(True):
    time.sleep(60)

Writing src/trainer/interactive_train.py


### Dockerfile

```
gcloud compute images list \
        --project deeplearning-platform-release \
        --no-standard-images
```


```
gcloud compute images describe-from-family IMAGE_FAMILY \
        --project deeplearning-platform-release
```

In [9]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/Dockerfile.{DOCKERNAME}

FROM tensorflow/tensorflow:2.9.2-gpu

WORKDIR /src

# Copies the trainer code to the docker image.
COPY trainer/* trainer/ 

RUN pip install -r trainer/requirements.txt

# # Sets up the entry point to invoke the trainer.
# # ENTRYPOINT ["python", "-m", "trainer.task"]

Overwriting src/Dockerfile.tfrs


### `cloudbuild.yaml`

In [10]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/cloudbuild.yaml

steps:
- name: 'gcr.io/cloud-builders/docker'
  args: ['build', '-t', '$_IMAGE_URI', '$_FILE_LOCATION', '-f', '$_FILE_LOCATION/Dockerfile.$_DOCKERNAME']
images:
- '$_IMAGE_URI'

Overwriting src/cloudbuild.yaml


### requirements.txt

* TODO: for profiling, install `google-cloud-aiplatform[cloud_profiler]`

In [11]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/trainer/requirements.txt

google-cloud-aiplatform==1.17.0
tensorflow-recommenders==0.7.0
tensorboard==2.9.1
tensorboard-data-server==0.6.1
tensorboard-plugin-profile==2.5.0
cloudml-hypertune

Writing src/trainer/requirements.txt


In [12]:
# google-cloud-aiplatform==1.17.0
# tensorflow==2.9.2
# tensorflow-cloud==0.1.16
# tensorflow-datasets==4.4.0
# tensorflow-estimator==2.9.0
# tensorflow-hub==0.12.0
# tensorflow-io==0.23.1
# tensorflow-io-gcs-filesystem==0.27.0
# tensorflow-metadata==1.10.0
# tensorflow-recommenders==0.7.0
# tensorflow-serving-api==2.10.0
# tensorflow-transform==1.10.1
# tensorboard==2.9.1
# tensorboard-data-server==0.6.1
# tensorboard-plugin-profile==2.5.0
# cloudml-hypertune

### data.py

In [13]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/trainer/_data.py

import tensorflow as tf

import train_config as cfg

MAX_PLAYLIST_LENGTH = cfg.MAX_PADDING # 375

def pad_up_to(t, max_in_dims=[1 ,MAX_PLAYLIST_LENGTH], constant_value=''):
    s = tf.shape(t)
    paddings = [[0, m-s[i]] for (i,m) in enumerate(max_in_dims)]
    return tf.pad(t, paddings, 'CONSTANT', constant_values=constant_value)

def return_padded_tensors(data):
    
    a = pad_up_to(tf.reshape(data['track_name_pl'], shape=(1,-1)) , constant_value='')
    b = pad_up_to(tf.reshape(data['artist_name_pl'], shape=(1,-1)) , constant_value='')
    c = pad_up_to(tf.reshape(data['album_name_pl'], shape=(1,-1)) , constant_value='')
    d = pad_up_to(tf.reshape(data['track_uri_pl'], shape=(1, -1,)) , constant_value='')
    e = pad_up_to(tf.reshape(data['duration_ms_songs_pl'], shape=(1,-1)) , constant_value=-1.)
    f = pad_up_to(tf.reshape(data['artist_pop_pl'], shape=(1,-1)) , constant_value=-1.)
    g = pad_up_to(tf.reshape(data['artists_followers_pl'], shape=(1,-1)) , constant_value=-1.)
    h = pad_up_to(tf.reshape(data['track_pop_pl'], shape=(1,-1)) , constant_value=-1.)
    i = pad_up_to(tf.reshape(data['artist_genres_pl'], shape=(1,-1)) , constant_value='')
        
    padded_data = data.copy()
    padded_data['track_name_pl'] = a
    padded_data['artist_name_pl'] = b
    padded_data['album_name_pl'] = c
    padded_data['track_uri_pl'] = d
    padded_data['duration_ms_songs_pl'] = e
    padded_data['artist_pop_pl'] = f
    padded_data['artists_followers_pl'] = g
    padded_data['track_pop_pl'] = h
    padded_data['artist_genres_pl'] = i
        
    return padded_data

all_features = {
    'track_name_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'artist_name_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'album_name_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'track_uri_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'artist_uri_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'album_uri_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'duration_ms_can': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    'track_pop_can': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    'artist_pop_can': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    'artist_genres_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'artist_followers_can': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    'pos_seed_track': tf.io.FixedLenFeature(dtype=tf.int64, shape=()),
    'track_name_seed_track': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'artist_name_seed_track': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'album_name_seed_track': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'track_uri_seed_track': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'artist_uri_seed_track': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'album_uri_seed_track': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'duration_seed_track': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    'track_pop_seed_track': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    'artist_pop_seed_track': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    'artist_genres_seed_track': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'artist_followers_seed_track': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    'pid': tf.io.FixedLenFeature(dtype=tf.int64, shape=()),
    'name': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'collaborative': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'duration_ms_seed_pl': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    'n_songs_pl': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    'num_artists_pl': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    'num_albums_pl': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    'description_pl': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    ###ragged
    'track_name_pl': tf.io.RaggedFeature(tf.string),
    'artist_name_pl': tf.io.RaggedFeature(tf.string),
    'album_name_pl': tf.io.RaggedFeature(tf.string),
    'track_uri_pl': tf.io.RaggedFeature(tf.string),
    'duration_ms_songs_pl': tf.io.RaggedFeature(tf.float32),
    'artist_pop_pl': tf.io.RaggedFeature(tf.float32),
    'artists_followers_pl': tf.io.RaggedFeature(tf.float32),
    'track_pop_pl': tf.io.RaggedFeature(tf.float32),
    'artist_genres_pl': tf.io.RaggedFeature(tf.string),
}

def parse_tfrecord_fn(example, feature_dict=all_features): # =all_features
    example = tf.io.parse_single_example(
        example, 
        features=feature_dict
    )
    return example

candidate_features = {
    'track_name_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'artist_name_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'album_name_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'track_uri_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'artist_uri_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'album_uri_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'duration_ms_can': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    'track_pop_can': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    'artist_pop_can': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    'artist_genres_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'artist_followers_can': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
}

def parse_candidate_tfrecord_fn(example, feature_dict=candidate_features):
    example = tf.io.parse_single_example(
        example, 
        features=feature_dict
    )
    return example

Writing src/trainer/_data.py


### model.py

In [14]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/trainer/_model.py

import tensorflow as tf
import tensorflow_recommenders as tfrs
import numpy as np

import train_config as cfg
# ====================================================
# Playlist (query) Tower
# ====================================================

# TODO: parameterize

EMBEDDING_DIM = cfg.EMBEDDING_DIM       # 32
PROJECTION_DIM = cfg.PROJECTION_DIM     # 5
SEED = cfg.SEED                         # 1234
USE_CROSS_LAYER = cfg.USE_CROSS_LAYER   # True
DROPOUT = cfg.USE_DROPOUT               # 'False'
DROPOUT_RATE = cfg.DROPOUT_RATE         # '0.33'
MAX_PLAYLIST_LENGTH = cfg.MAX_PADDING # 375

class Playlist_Model(tf.keras.Model):
    def __init__(self, layer_sizes, vocab_dict):
        super().__init__()

        # ========================================
        # non-sequence playlist features
        # ========================================
        
        # Feature: playlist name
        self.pl_name_text_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.TextVectorization(
                    max_tokens=len(vocab_dict["name"]), # not needed if passing vocab
                    # vocabulary=vocab_dict['name'], 
                    name="pl_name_txt_vectorizer", 
                    ngrams=2
                ),
                tf.keras.layers.Embedding(
                    input_dim=len(vocab_dict["name"]) + 1,
                    output_dim=EMBEDDING_DIM,
                    mask_zero=False,
                    name="pl_name_emb_layer",
                ),
                tf.keras.layers.GlobalAveragePooling1D(name="pl_name_pooling"),
            ], name="pl_name_emb_model"
        )
        
        # Feature: collaborative
        collaborative_vocab = np.array([b'false', b'true'])
        
        self.pl_collaborative_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.StringLookup(
                    vocabulary=collaborative_vocab, 
                    mask_token=None, 
                    name="pl_collaborative_lookup", 
                    output_mode='int'
                ),
                tf.keras.layers.Embedding(
                    input_dim=len(collaborative_vocab) + 1,
                    output_dim=EMBEDDING_DIM,
                    mask_zero=False,
                    name="pl_collaborative_emb_layer",
                ),
            ], name="pl_collaborative_emb_model"
        )
        
        # Feature: pid
        self.pl_pid_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.IntegerLookup(
                    vocabulary=tf.constant(vocab_dict['unique_pids']), 
                    mask_token=None, 
                    name="pl_pid_lookup", 
                    output_mode='int'
                ),
                tf.keras.layers.Embedding(
                    input_dim=len(vocab_dict['unique_pids']),
                    output_dim=EMBEDDING_DIM,
                    mask_zero=False,
                    name="pl_pid_emb_layer",
                ),
            ], name="pl_pid_emb_model"
        )
        # JW Change 9/21/2022
        # self.pl_track_uri_embedding = tf.keras.Sequential(
        #     [
        #         tf.keras.layers.StringLookup(
        #             vocabulary=tf.constant(vocab_dict['track_uri_can']), 
        #             mask_token=None, 
        #             name="pl_track_uri_lookup", 
        #         ),
        #         tf.keras.layers.Embedding(
        #             input_dim=len(vocab_dict['track_uri_can'])+1,
        #             output_dim=EMBEDDING_DIM,
        #             mask_zero=False,
        #             name="pl_track_uri_layer",
        #         ),
        #     ], name="pl_track_uri_emb_model"
        # )

        # Feature: description_pl
        # self.pl_description_text_embedding = tf.keras.Sequential(
        #     [
        #         tf.keras.layers.TextVectorization(
        #             max_tokens=len(vocab_dict["description_pl"]), # not needed if passing vocab
        #             # vocabulary=tf.constant(vocab_dict['description_pl']), 
        #             name="description_pl_vectorizer", 
        #             ngrams=2,
        #         ),
        #         tf.keras.layers.Embedding(
        #             input_dim=len(vocab_dict["description_pl"]) + 1,
        #             output_dim=EMBEDDING_DIM,
        #             mask_zero=False,
        #             name="description_pl_emb_layer",
        #         ),
        #         tf.keras.layers.GlobalAveragePooling1D(name="description_pl_pooling"),
        #     ], name="pl_description_emb_model"
        # )
        
        # Feature: duration_ms_seed_pl                      
        # TODO: Noramlize or Descritize?
        duration_ms_seed_pl_buckets = np.linspace(
            vocab_dict['min_duration_ms_seed_pl'], 
            vocab_dict['max_duration_ms_seed_pl'], 
            num=1000
        )
        self.duration_ms_seed_pl_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.Discretization(duration_ms_seed_pl_buckets.tolist()),
                tf.keras.layers.Embedding(
                    input_dim=len(duration_ms_seed_pl_buckets) + 1, 
                    output_dim=EMBEDDING_DIM, 
                    name="duration_ms_seed_pl_emb_layer",
                )
            ], name="duration_ms_seed_pl_emb_model"
        )
        # self.duration_ms_seed_pl_normalization = tf.keras.layers.Normalization(
        #     mean=vocab_dict['avg_duration_ms_seed_pl'],
        #     variance=vocab_dict['var_duration_ms_seed_pl'],
        #     axis=None
        # )
        
        # Feature: n_songs_pl
        # TODO: Noramlize or Descritize?
        n_songs_pl_buckets = np.linspace(
            vocab_dict['min_n_songs_pl'], 
            vocab_dict['max_n_songs_pl'], 
            num=100
        )
        self.n_songs_pl_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.Discretization(n_songs_pl_buckets.tolist()),
                tf.keras.layers.Embedding(
                    input_dim=len(n_songs_pl_buckets) + 1, 
                    output_dim=EMBEDDING_DIM, 
                    name="n_songs_pl_emb_layer",
                )
            ], name="n_songs_pl_emb_model"
        )
        # self.n_songs_pl_normalization = tf.keras.layers.Normalization(
        #     mean=vocab_dict['avg_n_songs_pl'],
        #     variance=vocab_dict['var_n_songs_pl'],
        #     axis=None
        # )
        
        # Feature: num_artists_pl
        # TODO: Noramlize or Descritize?
        n_artists_pl_buckets = np.linspace(
            vocab_dict['min_n_artists_pl'], 
            vocab_dict['max_n_artists_pl'], 
            num=100
        )
        self.n_artists_pl_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.Discretization(n_artists_pl_buckets.tolist()),
                tf.keras.layers.Embedding(
                    input_dim=len(n_artists_pl_buckets) + 2, 
                    output_dim=EMBEDDING_DIM, 
                    name="n_artists_pl_emb_layer",
                    mask_zero=True
                )
            ], name="n_artists_pl_emb_model"
        )
        # self.n_artists_pl_normalization = tf.keras.layers.Normalization(
        #     mean=vocab_dict['avg_n_artists_pl'],
        #     variance=vocab_dict['var_n_artists_pl'],
        #     axis=None
        # )
        
        # Feature: num_albums_pl
        n_albums_pl_buckets = np.linspace(
            vocab_dict['min_n_albums_pl'], 
            vocab_dict['max_n_albums_pl'],
            num=100
        )
        self.n_albums_pl_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.Discretization(n_albums_pl_buckets.tolist()),
                tf.keras.layers.Embedding(
                    input_dim=len(n_albums_pl_buckets) + 1, 
                    output_dim=EMBEDDING_DIM, 
                    name="n_albums_pl_emb_layer",
                )
            ], name="n_albums_pl_emb_model"
        )
        # self.n_albums_pl_normalization = tf.keras.layers.Normalization(
        #     mean=vocab_dict['avg_n_albums_pl'],
        #     variance=vocab_dict['var_n_albums_pl'],
        #     axis=None
        # )
        
        # ========================================
        # sequence playlist features
        # ========================================
        
        # Feature: artist_name_pl
        self.artist_name_pl_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.Flatten(),
                tf.keras.layers.StringLookup(
                    vocabulary=vocab_dict['artist_name_pl'], mask_token=''),
                tf.keras.layers.Embedding(
                    input_dim=len(vocab_dict['artist_name_pl']) + 1, 
                    output_dim=EMBEDDING_DIM,
                    name="artist_name_pl_emb_layer",
                    mask_zero=False
                ),
                # tf.keras.layers.GlobalAveragePooling1D(name="artist_name_pl_1d"),
                tf.keras.layers.GRU(EMBEDDING_DIM, name="artist_name_gru"),
            ], name="artist_name_pl_emb_model"
        )
        
        # Feature: track_uri_pl
        # 2.2M unique
        self.track_uri_pl_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.Flatten(),
                tf.keras.layers.Hashing(num_bins=200_000),
                tf.keras.layers.Embedding(
                    input_dim=len(vocab_dict['track_uri_pl']) + 1, 
                    output_dim=EMBEDDING_DIM,
                    name="track_uri_pl_emb_layer",
                    mask_zero=False
                ),
                # tf.keras.layers.GlobalAveragePooling1D(name="track_uri_1d"),
                tf.keras.layers.GRU(EMBEDDING_DIM, name="track_uri_gru"),
            ], name="track_uri_pl_emb_model"
        )
        
        # Feature: track_name_pl
        self.track_name_pl_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.Flatten(),
                tf.keras.layers.StringLookup(
                    vocabulary=vocab_dict['track_name_pl'], 
                    name="track_name_pl_lookup",
                    output_mode='int',
                    mask_token=''
                ),
                tf.keras.layers.Embedding(
                    input_dim=len(vocab_dict['track_name_pl']) + 2, 
                    output_dim=EMBEDDING_DIM,
                    name="track_name_pl_emb_layer",
                    mask_zero=False
                ),
                # tf.keras.layers.GlobalAveragePooling1D(name="track_name_pl_1d"),
                tf.keras.layers.GRU(EMBEDDING_DIM, name="track_name_gru"),
            ], name="track_name_pl_emb_model"
        )
        
        Feature: duration_ms_songs_pl
        duration_ms_songs_pl_buckets = np.linspace(
            vocab_dict['min_duration_ms_songs_pl'], 
            vocab_dict['max_duration_ms_songs_pl'], 
            num=100
        )
        self.duration_ms_songs_pl_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.Flatten(),
                tf.keras.layers.Discretization(duration_ms_songs_pl_buckets.tolist()),
                tf.keras.layers.Embedding(
                    input_dim=len(duration_ms_songs_pl_buckets) + 1, 
                    output_dim=EMBEDDING_DIM,
                    name="duration_ms_songs_pl_emb_layer",
                    mask_zero=False
                ),
                # tf.keras.layers.GlobalAveragePooling1D(name="duration_ms_songs_pl_emb_layer_pl_1d"),
                tf.keras.layers.GRU(EMBEDDING_DIM, name="duration_ms_songs_gru"),
            ], name="duration_ms_songs_pl_emb_model"
        )
        
        # Feature: album_name_pl
        self.album_name_pl_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.Flatten(),
                tf.keras.layers.StringLookup(
                    vocabulary=tf.constant(vocab_dict['album_name_pl']), mask_token='', name="album_name_pl_lookup"),
                tf.keras.layers.Embedding(
                    input_dim=len(vocab_dict['album_name_pl']) + 2, 
                    output_dim=EMBEDDING_DIM,
                    name="album_name_pl_emb_layer",
                    mask_zero=False
                ),
                # tf.keras.layers.GlobalAveragePooling1D(name="album_name_pl_emb_layer_1d"),
                tf.keras.layers.GRU(EMBEDDING_DIM, name="album_name_gru"),
            ], name="album_name_pl_emb_model"
        )
        
        # Feature: artist_pop_pl
        artist_pop_pl_buckets = np.linspace(
            vocab_dict['min_artist_pop'], 
            vocab_dict['max_artist_pop'], 
            num=10
        )
        self.artist_pop_pl_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.Flatten(),
                tf.keras.layers.Discretization(artist_pop_pl_buckets.tolist()),
                tf.keras.layers.Embedding(
                    input_dim=len(artist_pop_pl_buckets) + 1, 
                    output_dim=EMBEDDING_DIM,
                    name="artist_pop_pl_emb_layer",
                    mask_zero=False
                ),
                # tf.keras.layers.GlobalAveragePooling1D(EMBEDDING_DIM, name="artist_pop_conv1d"),
                tf.keras.layers.GRU(EMBEDDING_DIM, name="artist_pop_gru"),
            ], name="artist_pop_pl_emb_model"
        )
        
        # Feature: artists_followers_pl
        artists_followers_pl_buckets = np.linspace(
            vocab_dict['min_artist_followers'], 
            vocab_dict['max_artist_followers'], 
            num=10
        )
        self.artists_followers_pl_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.Flatten(),
                tf.keras.layers.Discretization(artists_followers_pl_buckets.tolist()),
                tf.keras.layers.Embedding(
                    input_dim=len(artists_followers_pl_buckets) + 1, 
                    output_dim=EMBEDDING_DIM,
                    name="artists_followers_pl_emb_layer",
                    mask_zero=False
                ),
                # tf.keras.layers.GlobalAveragePooling1D(name="artists_followers_pl_1d"),
                tf.keras.layers.GRU(EMBEDDING_DIM, name="artist_followers_gru"),
            ], name="artists_followers_pl_emb_model"
        )
        
        # Feature: track_pop_pl
        track_pop_pl_buckets = np.linspace(
            vocab_dict['min_track_pop'], 
            vocab_dict['max_track_pop'], 
            num=10
        )
        self.track_pop_pl_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.Flatten(dtype=tf.float32),
                tf.keras.layers.Discretization(track_pop_pl_buckets.tolist()),
                tf.keras.layers.Embedding(
                    input_dim=len(track_pop_pl_buckets) + 1, 
                    output_dim=EMBEDDING_DIM,
                    name="track_pop_pl_emb_layer",
                    mask_zero=False
                ),
                # tf.keras.layers.GlobalAveragePooling1D(name="track_pop_pl_1d"),
                tf.keras.layers.GRU(EMBEDDING_DIM, name="track_pop_gru"),
            ], name="track_pop_pl_emb_model"
        )
        
        # Feature: artist_genres_pl
        self.artist_genres_pl_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.Flatten(),
                tf.keras.layers.StringLookup(
                    vocabulary=vocab_dict['artist_genres_pl'], mask_token=''),
                tf.keras.layers.Embedding(
                    input_dim=len(vocab_dict['artist_genres_pl'])+2, 
                    output_dim=EMBEDDING_DIM,
                    name="artist_genres_pl_emb_layer",
                    mask_zero=False
                ),
                # tf.keras.layers.GlobalAveragePooling1D(name="artist_genres_pl_1d"),
                tf.keras.layers.GRU(EMBEDDING_DIM, name="artist_genres_gru"),
            ], name="artist_genres_pl_emb_model"
        )
        
        # ========================================
        # dense and cross layers
        # ========================================

        # Cross Layers
        if USE_CROSS_LAYER:
            self._cross_layer = tfrs.layers.dcn.Cross(
                projection_dim=PROJECTION_DIM,
                kernel_initializer="glorot_uniform", 
                name="pl_cross_layer"
            )
        else:
            self._cross_layer = None
            
        # Dense Layers
        self.dense_layers = tf.keras.Sequential(name="pl_dense_layers")
        initializer = tf.keras.initializers.GlorotUniform(seed=SEED)
        
        # Use the ReLU activation for all but the last layer.
        for layer_size in layer_sizes[:-1]:
            self.dense_layers.add(
                tf.keras.layers.Dense(
                    layer_size, 
                    activation="relu", 
                    kernel_initializer=initializer,
                )
            )
            if DROPOUT:
                self.dense_layers.add(tf.keras.layers.Dropout(DROPOUT_RATE))
                
        # No activation for the last layer
        for layer_size in layer_sizes[-1:]:
            self.dense_layers.add(
                tf.keras.layers.Dense(
                    layer_size, 
                    kernel_initializer=initializer
                )
            )
        ### ADDING L2 NORM AT THE END
        self.dense_layers.add(
            tf.keras.layers.Lambda(
                lambda x: tf.nn.l2_normalize(
                    x, 1, epsilon=1e-12, name="normalize_dense"
                )
            )
        )
        
    # ========================================
    # call
    # ========================================
    def call(self, data):
        '''
        The call method defines what happens when
        the model is called
        '''
        
        all_embs = tf.concat(
            [
                self.pl_name_text_embedding(data['name']),
                self.pl_collaborative_embedding(data['collaborative']),
                self.pl_pid_embedding(data["pid"]),
                # self.pl_track_uri_embedding(data["track_uri_can"]),
                # self.pl_description_text_embedding(data['description_pl']),
                self.duration_ms_seed_pl_embedding(data["duration_ms_seed_pl"]),
                # tf.reshape(self.duration_ms_seed_pl_normalization(data["duration_ms_seed_pl"]), (-1, 1))      # Normalize or Discretize?
                self.n_songs_pl_embedding(data["n_songs_pl"]),
                # tf.reshape(self.n_songs_pl_normalization(data["n_songs_pl"]), (-1, 1))                        # Normalize or Discretize?
                self.n_artists_pl_embedding(data['num_artists_pl']),
                # tf.reshape(self.n_artists_pl_normalization(data["num_artists_pl"]), (-1, 1))                  # Normalize or Discretize?
                self.n_albums_pl_embedding(data["num_albums_pl"]),
                # tf.reshape(self.n_albums_pl_normalization(data["num_albums_pl"]), (-1, 1))                    # Normalize or Discretize?
                
                # sequence features
                # data["pos_pl"],
                self.artist_name_pl_embedding(data["artist_name_pl"]),
                self.track_uri_pl_embedding(data["track_uri_pl"]),
                self.track_name_pl_embedding(data["track_name_pl"]),
                self.duration_ms_songs_pl_embedding(data["duration_ms_songs_pl"]),
                self.album_name_pl_embedding(data["album_name_pl"]),
                self.artist_pop_pl_embedding(data["artist_pop_pl"]),
                self.artists_followers_pl_embedding(data["artists_followers_pl"]),
                self.track_pop_pl_embedding(data["track_pop_pl"]),
                self.artist_genres_pl_embedding(data["artist_genres_pl"]),
            ], axis=1)
        
                # JW 9/22/22
                # self.artist_name_pl_embedding(tf.reshape(data["artist_name_pl"], (-1, MAX_PLAYLIST_LENGTH))),
                # self.track_uri_pl_embedding(tf.reshape(data["track_uri_pl"], (-1, MAX_PLAYLIST_LENGTH))),
                # self.track_name_pl_embedding(tf.reshape(data["track_name_pl"], (-1, MAX_PLAYLIST_LENGTH))),
                # self.duration_ms_songs_pl_embedding(tf.reshape(data["duration_ms_songs_pl"], (-1, MAX_PLAYLIST_LENGTH))),
                # self.album_name_pl_embedding(tf.reshape(data["album_name_pl"], (-1, MAX_PLAYLIST_LENGTH))),
                # self.artist_pop_pl_embedding(tf.reshape(data["artist_pop_pl"], (-1, MAX_PLAYLIST_LENGTH))),
                # self.artists_followers_pl_embedding(tf.reshape(data["artists_followers_pl"], (-1, MAX_PLAYLIST_LENGTH))),
                # self.track_pop_pl_embedding(tf.reshape(data["track_pop_pl"], (-1, MAX_PLAYLIST_LENGTH))),
                # self.artist_genres_pl_embedding(tf.reshape(data["artist_genres_pl"], (-1, MAX_PLAYLIST_LENGTH))),
        
        # Build Cross Network
        if self._cross_layer is not None:
            cross_embs = self._cross_layer(all_embs)
            return self.dense_layers(cross_embs)
        else:
            return self.dense_layers(all_embs)

# ====================================================
# Track (candidate) Tower
# ====================================================
class Candidate_Track_Model(tf.keras.Model):
    def __init__(self, layer_sizes, vocab_dict):
        super().__init__()
        
        # ========================================
        # Candidate features
        # ========================================
        
        # Feature: artist_name_can
        self.artist_name_can_text_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.TextVectorization(
                    vocabulary=vocab_dict["artist_name_can"],
                    name="artist_name_can_txt_vectorizer",
                    ngrams=2,
                ),
                tf.keras.layers.Embedding(
                    input_dim=len(vocab_dict["artist_name_can"]) + 1,
                    output_dim=EMBEDDING_DIM,
                    mask_zero=False,
                    name="artist_name_can_emb_layer",
                ),
                tf.keras.layers.GlobalAveragePooling1D(name="artist_name_can_pooling"),
            ], name="artist_name_can_emb_model"
        )
        
        # Feature: track_name_can
        self.track_name_can_text_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.TextVectorization(
                    vocabulary=vocab_dict["track_name_can"],
                    name="track_name_can_txt_vectorizer",
                    ngrams=2,
                ),
                tf.keras.layers.Embedding(
                    input_dim=len(vocab_dict["track_name_can"]) + 1,
                    output_dim=EMBEDDING_DIM,
                    mask_zero=False,
                    name="track_name_can_emb_layer",
                ),
                tf.keras.layers.GlobalAveragePooling1D(name="track_name_can_pooling"),
            ], name="track_name_can_emb_model"
        )
        
        # Feature: album_name_can
        self.album_name_can_text_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.TextVectorization(
                    vocabulary=vocab_dict["album_name_can"],
                    name="album_name_can_txt_vectorizer",
                    ngrams=2,
                ),
                tf.keras.layers.Embedding(
                    input_dim=len(vocab_dict["album_name_can"]) + 1,
                    output_dim=EMBEDDING_DIM,
                    mask_zero=False,
                    name="album_name_can_emb_layer",
                ),
                tf.keras.layers.GlobalAveragePooling1D(name="album_name_can_pooling"),
            ], name="album_name_can_emb_model"
        )
        
        # Feature: artist_uri_can
        self.artist_uri_can_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.Hashing(num_bins=200_000),
                tf.keras.layers.Embedding(
                    input_dim = len(vocab_dict['artist_uri_can'])+1, # TODO: JW - 200_000 + 1, 
                    output_dim = EMBEDDING_DIM,
                    name="artist_uri_can_emb_layer",
                ),
            ], name="artist_uri_can_emb_model"
        )
        
        # Feature: track_uri_can
        self.track_uri_can_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.Hashing(num_bins=200_000),
                tf.keras.layers.Embedding(
                    input_dim = len(vocab_dict['track_uri_can'])+1, # TODO: JW - 200_000 + 1,  
                    output_dim=EMBEDDING_DIM,
                    name="track_uri_can_emb_layer",
                ),
            ], name="track_uri_can_emb_model"
        )
        
        # Feature: album_uri_can
        self.album_uri_can_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.Hashing(num_bins=200_000),
                tf.keras.layers.Embedding(
                    input_dim = len(vocab_dict['album_uri_can'])+1, # TODO: JW - 200_000 + 1,  
                    output_dim=EMBEDDING_DIM,
                    name="album_uri_can_emb_layer",
                ),
            ], name="album_uri_can_emb_model"
        )
        
        # Feature: duration_ms_can
        self.duration_ms_can_normalized = tf.keras.layers.Normalization(
            mean=vocab_dict['avg_duration_ms_songs_pl'],
            variance=vocab_dict['var_duration_ms_songs_pl'],
            axis=None
        )
        
        # Feature: track_pop_can
        self.track_pop_can_normalized = tf.keras.layers.Normalization(
            mean=vocab_dict['avg_track_pop'],
            variance=vocab_dict['var_track_pop'],
            axis=None
        )
        
        # Feature: artist_pop_can
        self.artist_pop_can_normalized = tf.keras.layers.Normalization(
            mean=vocab_dict['avg_artist_pop'],
            variance=vocab_dict['var_artist_pop'],
            axis=None
        )
        
        # Feature: artist_followers_can
        self.artist_followers_can_normalized = tf.keras.layers.Normalization(
            mean=vocab_dict['avg_artist_followers'],
            variance=vocab_dict['var_artist_followers'],
            axis=None
        )
        
        # Feature: artist_genres_can
        self.artist_genres_can_text_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.TextVectorization(
                    vocabulary=vocab_dict["artist_genres_can"],
                    name="artist_genres_can_txt_vectorizer",
                    ngrams=2,
                ),
                tf.keras.layers.Embedding(
                    input_dim=len(vocab_dict["artist_genres_can"])+1,
                    output_dim=EMBEDDING_DIM,
                    mask_zero=False,
                    name="artist_genres_can_emb_layer",
                ),
                tf.keras.layers.GlobalAveragePooling1D(name="artist_genres_can_pooling"),
            ], name="artist_genres_can_emb_model"
        )
        
        # ========================================
        # Dense & Cross Layers
        # ========================================
        
        # Cross Layers
        if USE_CROSS_LAYER:
            self._cross_layer = tfrs.layers.dcn.Cross(
                projection_dim=PROJECTION_DIM,
                kernel_initializer="glorot_uniform", 
                name="can_cross_layer"
            )
        else:
            self._cross_layer = None
        
        # Dense Layer
        self.dense_layers = tf.keras.Sequential(name="candidate_dense_layers")
        initializer = tf.keras.initializers.GlorotUniform(seed=SEED)
        
        # Use the ReLU activation for all but the last layer.
        for layer_size in layer_sizes[:-1]:
            self.dense_layers.add(
                tf.keras.layers.Dense(
                    layer_size, 
                    activation="relu", 
                    kernel_initializer=initializer,
                )
            )
            if DROPOUT:
                self.dense_layers.add(tf.keras.layers.Dropout(DROPOUT_RATE))
                
        # No activation for the last layer
        for layer_size in layer_sizes[-1:]:
            self.dense_layers.add(
                tf.keras.layers.Dense(
                    layer_size, 
                    kernel_initializer=initializer
                )
            )
            
    # ========================================
    # Call Function
    # ========================================
            
    def call(self, data):
        
        all_embs = tf.concat(
            [
                self.artist_name_can_text_embedding(data['artist_name_can']),
                self.track_name_can_text_embedding(data['track_name_can']),
                self.album_name_can_text_embedding(data['album_name_can']),
                self.artist_uri_can_embedding(data['artist_uri_can']),
                self.track_uri_can_embedding(data['track_uri_can']),
                self.album_uri_can_embedding(data['album_uri_can']),
                tf.reshape(self.duration_ms_can_normalized(data["duration_ms_can"]), (-1, 1)),
                tf.reshape(self.track_pop_can_normalized(data["track_pop_can"]), (-1, 1)),
                tf.reshape(self.artist_pop_can_normalized(data["artist_pop_can"]), (-1, 1)),
                tf.reshape(self.artist_followers_can_normalized(data["artist_followers_can"]), (-1, 1)),
                self.artist_genres_can_text_embedding(data['album_uri_can']),
            ], axis=1
        )
        
        # return self.dense_layers(all_embs)
                # Build Cross Network
        if self._cross_layer is not None:
            cross_embs = self._cross_layer(all_embs)
            return self.dense_layers(cross_embs)
        else:
            return self.dense_layers(all_embs)

# ====================================================
# Combined 2Tower
# ====================================================
class TheTwoTowers(tfrs.models.Model):

    def __init__(self, layer_sizes, vocab_dict_load, parsed_candidate_dataset):
        super().__init__()
        
        self.query_tower = Playlist_Model(layer_sizes, vocab_dict_load)
        
        self.candidate_tower = Candidate_Track_Model(layer_sizes, vocab_dict_load)
        
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=parsed_candidate_dataset.batch(128).cache().map(self.candidate_tower) # TODO: parameterize
            )
        )
        
    def compute_loss(self, data, training=False):
        query_embeddings = self.query_tower(data)
        candidate_embeddings = self.candidate_tower(data)

        return self.task(
            query_embeddings, 
            candidate_embeddings, 
            compute_metrics=not training
        ) # turn off metrics to save time on training


Writing src/trainer/_model.py


### train `task.py`

In [15]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/trainer/task.py

import json
import tensorflow as tf
import tensorflow_recommenders as tfrs
from tensorflow.python.client import device_lib

import argparse
import os
import sys
import logging
import pickle as pkl

from google.cloud import aiplatform as vertex_ai
from google.cloud import storage
import hypertune

import time
import numpy as np

# ====================================================
# Helper functions
# ====================================================

def _is_chief(task_type, task_id): 
    ''' Check for primary if multiworker training
    '''
    return (task_type == 'chief') or (task_type == 'worker' and task_id == 0) or task_type is None

def get_arch_from_string(arch_string):
    q = arch_string.replace(']', '')
    q = q.replace('[', '')
    q = q.replace(" ", "")
    return [int(x) for x in q.split(',')]

# ====================================================
# Main
# ====================================================
import _data as trainer_data
import _model as trainer_model
import train_config as cfg
import time 

TIMESTAMP = time.strftime("%Y%m%d-%H%M%S")

def main(args):
    
    logging.info("Starting training...")
    logging.info('TF_CONFIG = {}'.format(os.environ.get('TF_CONFIG', 'Not found')))
    
    storage_client = storage.Client(
        project=args.project
    )
    
    WORKING_DIR = f'gs://{args.train_output_gcs_bucket}'             # replaced f'gs://{args.model_dir}/{args.version}'
    logging.info(f'Train job output directory: {WORKING_DIR}')
    
    options = tf.data.Options()
    options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA
    
    # AIP_TB_LOGS = args.aip_tb_logs # os.environ.get('AIP_TENSORBOARD_LOG_DIR', 'NA')
    # logging.info(f'AIP TENSORBOARD LOG DIR: {AIP_TB_LOGS}')
    
    # ====================================================
    # Set Device / GPU/TPU Strategy
    # ====================================================
    logging.info("Detecting devices....")
    logging.info(f'Detected Devices {str(device_lib.list_local_devices())}')
    logging.info("Setting device strategy...")
    
    # Single Machine, single compute device
    if args.distribute == 'single':
        if tf.config.list_physical_devices('GPU'): # TODO: replace with - tf.config.list_physical_devices('GPU') | tf.test.is_gpu_available()
            strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
        else:
            strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
        logging.info("Single device training")
    
    # Single Machine, multiple compute device
    elif args.distribute == 'mirrored':
        strategy = tf.distribute.MirroredStrategy()
        logging.info("Mirrored Strategy distributed training")

    # Multi Machine, multiple compute device
    elif args.distribute == 'multiworker':
        strategy = tf.distribute.MultiWorkerMirroredStrategy()
        logging.info("Multi-worker Strategy distributed training")
        logging.info('TF_CONFIG = {}'.format(os.environ.get('TF_CONFIG', 'Not found')))
    
    # Single Machine, multiple TPU devices
    elif args.distribute == 'tpu':
        cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu="local")
        tf.config.experimental_connect_to_cluster(cluster_resolver)
        tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
        strategy = tf.distribute.TPUStrategy(cluster_resolver)
        logging.info("All devices: ", tf.config.list_logical_devices('TPU'))

    
    logging.info('num_replicas_in_sync = {}'.format(strategy.num_replicas_in_sync))
    NUM_REPLICAS = strategy.num_replicas_in_sync
    
    # ====================================================
    # Vocab Files
    # ====================================================

    # TODO: parameterize & configure for adapts vs vocab files

    BUCKET_NAME = 'spotify-v1'
    FILE_PATH = 'vocabs/v1_string_vocabs'
    FILE_NAME = 'string_vocabs_v1_20220705-202905.txt'
    DESTINATION_FILE = 'downloaded_vocabs.txt'     # TODO: args.vocab_file

    with open(f'{DESTINATION_FILE}', 'wb') as file_obj:
        storage_client.download_blob_to_file(
            f'gs://{BUCKET_NAME}/{FILE_PATH}/{FILE_NAME}', file_obj)


    with open(f'{DESTINATION_FILE}', 'rb') as pickle_file:
        vocab_dict_load = pkl.load(pickle_file)


    # TODO: include as a preprocessing step 
    avg_duration_ms_seed_pl = 13000151.68
    var_duration_ms_seed_pl = 133092900971233.58
    vocab_dict_load['avg_duration_ms_seed_pl']=avg_duration_ms_seed_pl
    vocab_dict_load['var_duration_ms_seed_pl']=var_duration_ms_seed_pl

    avg_n_songs_pl = 55.21
    var_n_songs_pl = 2317.54
    vocab_dict_load['avg_n_songs_pl']=avg_n_songs_pl
    vocab_dict_load['var_n_songs_pl']=var_n_songs_pl

    avg_n_artists_pl = 30.56
    var_n_artists_pl = 769.26
    vocab_dict_load['avg_n_artists_pl']=avg_n_artists_pl
    vocab_dict_load['var_n_artists_pl']=var_n_artists_pl

    avg_n_albums_pl = 40.25
    var_n_albums_pl = 1305.54
    vocab_dict_load['avg_n_albums_pl']=avg_n_albums_pl
    vocab_dict_load['var_n_albums_pl']=var_n_albums_pl

    avg_artist_pop = 16.08
    var_artist_pop = 300.64
    vocab_dict_load['avg_artist_pop']=avg_artist_pop
    vocab_dict_load['var_artist_pop']=var_artist_pop

    avg_duration_ms_songs_pl = 234823.14
    var_duration_ms_songs_pl = 5558806228.41
    vocab_dict_load['avg_duration_ms_songs_pl']=avg_duration_ms_songs_pl
    vocab_dict_load['var_duration_ms_songs_pl']=var_duration_ms_songs_pl

    avg_artist_followers = 43337.77
    var_artist_followers = 377777790193.57
    vocab_dict_load['avg_artist_followers']=avg_artist_followers
    vocab_dict_load['var_artist_followers']=var_artist_followers

    avg_track_pop = 10.85
    var_track_pop = 202.18
    vocab_dict_load['avg_track_pop']=avg_track_pop
    vocab_dict_load['var_track_pop']=var_track_pop

    # ====================================================
    # TRAIN dataset - Parse & Pad
    # ====================================================

    # logging.info(f'Getting train data from bucket: {args.train_dir}')
    # logging.info(f'args.train_dir_prefix: {args.train_dir_prefix}')
    
    logging.info(f'Path to TRAIN files: gs://{args.train_dir}/{args.train_dir_prefix}')
    
    train_files = []
    for blob in storage_client.list_blobs(f'{args.train_dir}', prefix=f'{args.train_dir_prefix}', delimiter="/"):
        train_files.append(blob.public_url.replace("https://storage.googleapis.com/", "gs://"))
        
    # Parse train dataset
    raw_train_ds = tf.data.TFRecordDataset(train_files)
    parsed_train_ds = raw_train_ds.map(trainer_data.parse_tfrecord_fn) # _data
    parsed_padded_train_ds = parsed_train_ds.map(trainer_data.return_padded_tensors) # _data
    
    # ====================================================
    # VALID dataset - Parse & Pad 
    # ====================================================
    
    # logging.info(f'args.valid_dir: {args.valid_dir}')                   # TODO: args.valid_dir
    # logging.info(f'args.valid_dir_prefix: {args.valid_dir_prefix}')     # TODO: args.valid_dir_prefix
    
    logging.info(f'Path to VALID files: gs://{args.valid_dir}/{args.valid_dir_prefix}')
    
    valid_files = []
    for blob in storage_client.list_blobs(f'{args.valid_dir}', prefix=f'{args.valid_dir_prefix}', delimiter="/"):
        valid_files.append(blob.public_url.replace("https://storage.googleapis.com/", "gs://"))
        
    # Parse train dataset
    raw_valid_ds = tf.data.TFRecordDataset(valid_files)
    parsed_valid_ds = raw_valid_ds.map(trainer_data.parse_tfrecord_fn) # _data
    parsed_padded_valid_ds = parsed_valid_ds.map(trainer_data.return_padded_tensors) # _data
    
    # ====================================================
    # Parse candidates dataset
    # ====================================================

    # logging.info(f'args.candidate_file_dir: {args.candidate_file_dir}')
    # logging.info(f'args.candidate_files_prefix: {args.candidate_files_prefix}')
    
    logging.info(f'Path to CANDIDATE files: gs://{args.candidate_file_dir}/{args.candidate_files_prefix}')

    candidate_files = []
    for blob in storage_client.list_blobs(f'{args.candidate_file_dir}', prefix=f'{args.candidate_files_prefix}', delimiter="/"):
        candidate_files.append(blob.public_url.replace("https://storage.googleapis.com/", "gs://"))
        
    raw_candidate_dataset = tf.data.TFRecordDataset(candidate_files)
    parsed_candidate_dataset = raw_candidate_dataset.map(trainer_data.parse_candidate_tfrecord_fn) # _data
    
    # ====================================================
    # Prepare Train and Valid Data
    # ====================================================
    logging.info(f'preparing train and valid splits...')
    tf.random.set_seed(42)
    
    # TRAIN
    shuffled_parsed_train_ds = parsed_padded_train_ds.shuffle(10_000, seed=42, reshuffle_each_iteration=False)
    cached_train = shuffled_parsed_train_ds.batch(args.batch_size * strategy.num_replicas_in_sync).prefetch(tf.data.AUTOTUNE)
    
    # VALID
    # shuffled_parsed_train_ds = parsed_padded_valid_ds.shuffle(10_000, seed=42, reshuffle_each_iteration=False)
    cached_valid = parsed_padded_valid_ds.batch(args.batch_size * strategy.num_replicas_in_sync).cache().prefetch(tf.data.AUTOTUNE)
    
    logging.info(f'TRAIN and VALID prepped...')

    # train_data = shuffled_parsed_ds.take(80_000).batch(128)
    # valid_data = shuffled_parsed_ds.skip(80_000).take(20_000).batch(128)
    
    # valid_size = 20_000 # cfg.VALID_SIZE # 20_000 # args.valid_size
    # valid = shuffled_parsed_ds.take(valid_size)
    # train = shuffled_parsed_ds.skip(valid_size)
    # cached_train = train.batch(args.batch_size * strategy.num_replicas_in_sync).prefetch(tf.data.AUTOTUNE)
    # cached_valid = valid.batch(args.batch_size * strategy.num_replicas_in_sync).cache().prefetch(tf.data.AUTOTUNE)
    
    # ====================================================
    # metaparams for Vertex Ai Experiments
    # ====================================================
    logging.info('Logging metaparams & hyperparams for Vertex Experiments')
    
    EXPERIMENT_NAME = f"{args.experiment_name}"
    RUN_NAME = f"{args.experiment_run}"
    logging.info(f"EXPERIMENT_NAME: {EXPERIMENT_NAME}\n RUN_NAME: {RUN_NAME}")
    
    metaparams = {}
    metaparams["experiment_name"] = f'{EXPERIMENT_NAME}'
    metaparams["experiment_run"] = f"{RUN_NAME}"
    metaparams["model_version"] = f"{args.model_version}"
    metaparams["pipe_version"] = f"{args.pipeline_version}"
    metaparams["data_regime"] = f"{args.data_regime}"
    metaparams["distribute"] = f'{args.distribute}'
    
    hyperparams = {}
    hyperparams["epochs"] = int(args.num_epochs)
    hyperparams["batch_size"] = int(args.batch_size)
    hyperparams["embedding_dim"] = args.embedding_dim
    hyperparams["projection_dim"] = args.projection_dim
    hyperparams["use_cross_layer"] = cfg.USE_CROSS_LAYER # args.use_cross_layer
    hyperparams["use_dropout"] = cfg.USE_DROPOUT # args.use_dropout
    hyperparams["dropout_rate"] = args.dropout_rate
    hyperparams['layer_sizes'] = args.layer_sizes
    
    logging.info(f"Creating run: {RUN_NAME}; for experiment: {EXPERIMENT_NAME}")
    
    # Create experiment
    vertex_ai.init(experiment=EXPERIMENT_NAME)
    # vertex_ai.start_run(RUN_NAME,resume=True) # RUN_NAME
    
    with vertex_ai.start_run(RUN_NAME) as my_run:
        logging.info(f"logging metaparams")
        my_run.log_params(metaparams)
        
        logging.info(f"logging hyperparams")
        my_run.log_params(hyperparams)
        
    # ====================================================
    # Compile, Adapt, and Train model
    # ====================================================
    logging.info('Setting model adapts and compiling the model')
    
    LAYER_SIZES = get_arch_from_string(args.layer_sizes)
    logging.info(f'LAYER_SIZES: {LAYER_SIZES}')
    
    logging.info(f'adapting layers: {cfg.NEW_ADAPTS}') # args.new_adapts | cfg.NEW_ADAPTS
    
    # Wrap variable creation within strategy scope
    with strategy.scope():

        model = trainer_model.TheTwoTowers(LAYER_SIZES, vocab_dict_load, parsed_candidate_dataset)
        
        model.query_tower.pl_name_text_embedding.layers[0].adapt(shuffled_parsed_train_ds.map(lambda x: x['name']).batch(args.batch_size)) # TODO: use cached_train or shuffled_parsed_train_ds ?
        # artist_name_can
        # track_name_can
        # album_name_can
        # artist_genres_can
        
        # if cfg.NEW_ADAPTS:
            # model.query_tower.pl_name_text_embedding.layers[0].adapt(shuffled_parsed_ds.map(lambda x: x['name']).batch(args.batch_size)) # TODO: adapts on full dataset or train onl
            
        model.compile(optimizer=tf.keras.optimizers.Adagrad(args.learning_rate))
        
    if cfg.NEW_ADAPTS:
        vocab_dict_load['name'] = model.query_tower.pl_name_text_embedding.layers[0].get_vocabulary()
        bucket = storage_client.bucket(args.train_output_gcs_bucket)                               # TODO: args.train_output_gcs_bucket # replaced args.model_dir
        blob = bucket.blob(f'{EXPERIMENT_NAME}/{RUN_NAME}/vocabs_stats/vocab_dict_{RUN_NAME}.txt') # replaced f'{args.version}/vocabs_stats/vocab_dict_{RUN_NAME}.txt'
        pickle_out = pkl.dumps(vocab_dict_load)
        blob.upload_from_string(pickle_out)
    
    logging.info('Adapts finish - training next')
        
    tf.random.set_seed(args.seed)
    
    logs_dir = f'gs://{args.train_output_gcs_bucket}/{EXPERIMENT_NAME}/{RUN_NAME}/tb-logs'         # replaced f"{WORKING_DIR}/tb-logs-{RUN_NAME}" 
    AIP_LOGS = os.environ.get('AIP_TENSORBOARD_LOG_DIR', f'{logs_dir}')
    logging.info(f'TensorBoard logdir: {AIP_LOGS}')
    
    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        log_dir=AIP_LOGS,
        histogram_freq=0, 
        write_graph=True, 
        profile_batch = '500,520'
    )
    # if os.environ.get('AIP_TENSORBOARD_LOG_DIR', 'NA') is not 'NA':
    #     tensorboard_callback = tf.keras.callbacks.TensorBoard(
    #         log_dir=os.environ['AIP_TENSORBOARD_LOG_DIR'],
    #         histogram_freq=0, write_graph=True, profile_batch = '500,520')
    # else:
    #     os.mkdir('/tb_logs')
    #     tensorboard_callback = tf.keras.callbacks.TensorBoard(
    #         log_dir='/tb_logs',
    #         histogram_freq=0)
        
    logging.info('Training starting')
    layer_history = model.fit(
        cached_train,
        validation_data=cached_valid,
        validation_freq=args.valid_frequency,
        callbacks=tensorboard_callback,
        epochs=args.num_epochs,
        verbose=2
    )
    
    # Determine type and task of the machine from the strategy cluster resolver
    if args.distribute == 'multiworker':
        task_type, task_id = (strategy.cluster_resolver.task_type,
                              strategy.cluster_resolver.task_id)
    else:
        task_type, task_id = None, None
    
    # ====================================================
    # Eval Metrics
    # ====================================================
    logging.info('Getting evaluation metrics')

    val_metrics = model.evaluate(cached_valid, return_dict=True) #check performance
    
    logging.info('Validation metrics below:')
    logging.info(val_metrics)
    
    with vertex_ai.start_run(RUN_NAME,resume=True) as my_run:
        logging.info(f"logging metrics to experiment run {RUN_NAME}")
        my_run.log_metrics(val_metrics)
    
    # logging.info(f"Ending experiment run: {RUN_NAME}")
    # vertex_ai.end_run()
    
    # ====================================================
    # Save Towers
    # ====================================================
    
    # logging.info(f'Saving models to {args.model_dir}')                                        # TODO: f'gs://args.train_output_gcs_bucket/{EXPERIMENT_NAME}/{RUN_NAME}/model-dir
    MODEL_DIR_GCS_URI = f'gs://{args.train_output_gcs_bucket}/{EXPERIMENT_NAME}/{RUN_NAME}/model-dir'
    logging.info(f'Saving models to {MODEL_DIR_GCS_URI}')

    query_dir_save = f"{MODEL_DIR_GCS_URI}/query_tower/"                                      # replaced: f"gs://{args.model_dir}/{args.version}/{RUN_NAME}/query_tower/" 
    candidate_dir_save = f"{MODEL_DIR_GCS_URI}/candidate_tower/"                              # replaced: f"gs://{args.model_dir}/{args.version}/{RUN_NAME}/candidate_tower/"
    logging.info(f'Saving chief query model to {query_dir_save}')
    
    # save model from primary node in multiworker
    if _is_chief(task_type, task_id):
        tf.saved_model.save(model.query_tower, query_dir_save)
        logging.info(f'Saved chief query model to {query_dir_save}')
        tf.saved_model.save(model.candidate_tower, candidate_dir_save)
        logging.info(f'Saved chief candidate model to {candidate_dir_save}')
    else:
        worker_dir_query = query_dir_save + '/workertemp_query_/' + str(task_id)
        tf.io.gfile.makedirs(worker_dir_query)
        tf.saved_model.save(model.query_tower, worker_dir_query)
        logging.info(f'Saved worker: {task_id} query model to {worker_dir_query}')

        worker_dir_can = candidate_dir_save + '/workertemp_can_/' + str(task_id)
        tf.io.gfile.makedirs(worker_dir_can)
        tf.saved_model.save(model.candidate_tower, worker_dir_can)
        logging.info(f'Saved worker: {task_id} candidate model to {worker_dir_can}')

    if not _is_chief(task_type, task_id):
        tf.io.gfile.rmtree(worker_dir_can)
        tf.io.gfile.rmtree(worker_dir_query)

    logging.info('All done - model saved') #all done
    
def parse_args():
    """
    Parses command line arguments
    
    type: int, float, str
          bool() converts empty strings to `False` and non-empty strings to `True`
          see more details here: https://docs.python.org/3/library/argparse.html#type
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_dir',
                        default=os.getenv('AIP_MODEL_DIR'), type=str, help='Model dir', required=False) # TODO: sunset this arg
    
    parser.add_argument('--train_output_gcs_bucket',
                        default=os.getenv('AIP_MODEL_DIR'), type=str, help='bucket for train job output', required=False) # TODO: use this
    
    parser.add_argument('--train_dir', 
                        type=str, help='bucket holding training files', required=True)
    
    parser.add_argument('--train_dir_prefix', 
                        type=str, help='file path under GCS bucket', required=True)
    
    parser.add_argument('--valid_dir', 
                        type=str, help='bucket holding valid files', required=True)
    
    parser.add_argument('--valid_dir_prefix', 
                        type=str, help='file path under GCS bucket', required=True)
    
    parser.add_argument('--candidate_file_dir', 
                        type=str, help='bucket holding candidate files', required=True)

    parser.add_argument('--candidate_files_prefix', 
                        type=str, help='file path under GCS bucket', required=True)

    parser.add_argument('--project', 
                        type=str, help='project', required=True)

    parser.add_argument('--max_padding', 
                        default=375, type=int, help='max_padding', required=False)

    parser.add_argument('--experiment_name', 
                        type=str, help='#TODO', required=True)

    parser.add_argument('--experiment_run', 
                        type=str, help='#TODO', required=True)

    parser.add_argument('--num_epochs', 
                        default=1, type=int, help='#TODO', required=False)

    parser.add_argument('--batch_size', 
                        default=128, type=int, help='#TODO', required=False)

    parser.add_argument('--embedding_dim', 
                        default=32, type=int, help='#TODO', required=False)

    parser.add_argument('--projection_dim', 
                        default=5, type=int, help='#TODO', required=False)

    parser.add_argument('--seed', 
                        default=1234, type=str, help='#TODO', required=False)

#     parser.add_argument('--use_cross_layer', 
#                         default=True, type=bool, help='#TODO', required=False)

#     parser.add_argument('--use_dropout', 
#                         default=False, type=bool, help='#TODO', required=False)

    parser.add_argument('--dropout_rate', 
                        default=0.4, type=float, help='#TODO', required=False)

    parser.add_argument('--layer_sizes', 
                        default='[64,32]', type=str, help='#TODO', required=False)

    # parser.add_argument('--aip_tb_logs', 
    #                     default=os.getenv('AIP_TENSORBOARD_LOG_DIR'), type=str, help='#TODO', required=False)

    # parser.add_argument('--new_adapts', 
    #                     default=False, type=bool, help='#TODO', required=False)

    parser.add_argument('--learning_rate', 
                        default=0.01, type=float, help='learning rate', required=False)

    # parser.add_argument('--valid_size', 
    #                     default='#TODO', type=str, help='number of records in valid split', required=False)

    parser.add_argument('--valid_frequency', 
                        default=10, type=int, help='number of epochs per metrics val calculation', required=False)

    parser.add_argument('--distribute', 
                        default='single', type=str, help='TF strategy: single, mirrored, multiworker, tpu', required=False)

    # parser.add_argument('--version', 
    #                     type=str, help='version of train code; for tracking', required=True)
    
    parser.add_argument('--model_version', 
                        type=str, help='version of model train code', required=True)
    
    parser.add_argument('--pipeline_version', 
                        type=str, help='version of pipeline code; v0 for non-pipeline execution', required=True)
    
    parser.add_argument('--data_regime', 
                        type=str, help='id for tracking different datasets', required=True)


    # args = parser.parse_args()
    return parser.parse_args()
    
if __name__ == '__main__':
    logging.basicConfig(
        format='%(asctime)s - %(message)s',
        level=logging.INFO, 
        datefmt='%d-%m-%y %H:%M:%S',
        stream=sys.stdout
    )

    parsed_args = parse_args()

    logging.info('Args: %s', parsed_args)
    start_time = time.time()
    logging.info('Starting jobs main() script')

    main(parsed_args)

    end_time = time.time()
    elapsed_time = end_time - start_time
    logging.info('Training completed. Elapsed time: %s', elapsed_time )

Writing src/trainer/task.py


In [16]:
!tree /home/jupyter/spotify-tfrs/src
#/vertex_train/trainer

[01;34m/home/jupyter/spotify-tfrs/src[00m
├── Dockerfile.tfrs
├── README.md
├── cloudbuild.yaml
├── downloaded_vocabs.txt
├── [01;34mpipelines[00m
│   ├── [01;34m__pycache__[00m
│   │   ├── build_custom_train_image.cpython-37.pyc
│   │   ├── build_vocabs_stats.cpython-37.pyc
│   │   ├── create_tensorboard.cpython-37.pyc
│   │   ├── find_model_endpoint_test.cpython-37.pyc
│   │   ├── generate_candidate_embedding_index.cpython-37.pyc
│   │   └── train_custom_model.cpython-37.pyc
│   ├── build_custom_train_image.py
│   ├── build_vocabs_stats.py
│   ├── create_tensorboard.py
│   ├── custom_container_pipeline_spec.json
│   ├── find_model_endpoint_test.py
│   ├── generate_candidate_embedding_index.py
│   └── train_custom_model.py
└── [01;34mtrainer[00m
    ├── __init__.py
    ├── _data.py
    ├── _model.py
    ├── interactive_train.py
    ├── requirements.txt
    └── task.py

3 directories, 23 files


## Prepare Worker Pool Specs

In [17]:
def prepare_worker_pool_specs(
    image_uri,
    args,
    cmd,
    replica_count=1,
    machine_type="n1-standard-16",
    accelerator_count=1,
    accelerator_type="ACCELERATOR_TYPE_UNSPECIFIED",
    reduction_server_count=0,
    reduction_server_machine_type="n1-highcpu-16",
    reduction_server_image_uri=b"us-docker.pkg.dev/vertex-ai-restricted/training/reductionserver:latest",
):

    if accelerator_count > 0:
        machine_spec = {
            "machine_type": machine_type,
            "accelerator_type": accelerator_type,
            "accelerator_count": accelerator_count,
        }
    else:
        machine_spec = {"machine_type": machine_type}

    container_spec = {
        "image_uri": image_uri,
        "args": args,
        "command": cmd,
    }

    chief_spec = {
        "replica_count": 1,
        "machine_spec": machine_spec,
        "container_spec": container_spec,
    }

    worker_pool_specs = [chief_spec]
    if replica_count > 1:
        workers_spec = {
            "replica_count": replica_count - 1,
            "machine_spec": machine_spec,
            "container_spec": container_spec,
        }
        worker_pool_specs.append(workers_spec)
    if reduction_server_count > 1:
        workers_spec = {
            "replica_count": reduction_server_count,
            "machine_spec": {
                "machine_type": reduction_server_machine_type,
            },
            "container_spec": {"image_uri": reduction_server_image_uri},
        }
        worker_pool_specs.append(workers_spec)

    return worker_pool_specs

### Acclerators and Device Strategy

In [18]:
import time

# # Single machine, single GPU
WORKER_MACHINE_TYPE = 'a2-highgpu-1g'
REPLICA_COUNT = 1
ACCELERATOR_TYPE = 'NVIDIA_TESLA_A100'
PER_MACHINE_ACCELERATOR_COUNT = 1
REDUCTION_SERVER_COUNT = 0                                                      
REDUCTION_SERVER_MACHINE_TYPE = "n1-highcpu-16"
DISTRIBUTE_STRATEGY = 'single'

# # # Single Machine; multiple GPU
# WORKER_MACHINE_TYPE = 'a2-highgpu-4g' # a2-ultragpu-4g
# REPLICA_COUNT = 1
# ACCELERATOR_TYPE = 'NVIDIA_TESLA_A100'
# PER_MACHINE_ACCELERATOR_COUNT = 4
# REDUCTION_SERVER_COUNT = 0                                                      
# REDUCTION_SERVER_MACHINE_TYPE = "n1-highcpu-16"
# DISTRIBUTE_STRATEGY = 'mirrored'

# # # Multiple Machines, 1 GPU per Machine
# WORKER_MACHINE_TYPE = 'n1-standard-16'
# REPLICA_COUNT = 9
# ACCELERATOR_TYPE = 'NVIDIA_TESLA_T4'
# PER_MACHINE_ACCELERATOR_COUNT = 1
# REDUCTION_SERVER_COUNT = 10                                                      
# REDUCTION_SERVER_MACHINE_TYPE = "n1-highcpu-16"
# DISTRIBUTE_STRATEGY = 'multiworker'

## Job Configs

### Write `train_config.py`

In [19]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/trainer/train_config.py

PROJECT_ID = 'hybrid-vertex'

NEW_ADAPTS = True
USE_CROSS_LAYER = True
USE_DROPOUT = True

SEED = 1234

MAX_PADDING = 375

EMBEDDING_DIM = 32
PROJECTION_DIM = 5
SEED = 1234
DROPOUT_RATE = 0.4

Writing src/trainer/train_config.py


#### Previously defined VARs

In [20]:
print(f"PROJECT: {PROJECT}")

print(f"APP: {APP}")
print(f"MODEL_TYPE: {MODEL_TYPE}")
print(f"FRAMEWORK: {FRAMEWORK}")
print(f"MODEL_VERSION: {MODEL_VERSION}")
print(f"PIPELINE_VERSION: {PIPELINE_VERSION}\n")
print(f"MODEL_ROOT_NAME: {MODEL_ROOT_NAME}")
print(f"OUTPUT_BUCKET: {OUTPUT_BUCKET}")
print(f"IMAGE_URI: {IMAGE_URI}")

PROJECT: hybrid-vertex
APP: sp
MODEL_TYPE: 2tower
FRAMEWORK: tfrs
MODEL_VERSION: v15
PIPELINE_VERSION: v0

MODEL_ROOT_NAME: sp-2tower-tfrs-v15-v0
OUTPUT_BUCKET: jt-tfrs-test
IMAGE_URI: gcr.io/hybrid-vertex/sp-2tower-tfrs-v15-v0-training


In [21]:
import time 
# from trainer import train_config as config

TIMESTAMP = time.strftime("%Y%m%d-%H%M%S")

# # "gs://spotify-tfrecords-blog/tfrecords_v1/train/output-00000-of-00796.tfrecord"
# # gs://spotify-tfrs-dir/small-dataset/output-00000-of-00796.tfrecord

# GCS buckets & paths to source data
CANDIDATE_FILE_DIR = 'spotify-tfrs-dir' #'spotify-tfrecords-blog'
CANDIDATE_PREFIX = 'small-dataset/' # 'tfrecords_v1/train/'

TRAIN_DIR = 'spotify-tfrs-dir' #'spotify-tfrecords-blog'
TRAIN_DIR_PREFIX = 'small-dataset/' # 'tfrecords_v1/train/'

VALID_DIR = 'spotify-tfrs-dir' #'spotify-tfrecords-blog'
VALID_DIR_PREFIX = 'small-dataset/' # 'tfrecords_v1/train/'

# MODEL_DIR='spotify-tfrs-dir'  
# OUTPUT_BUCKET = 'jt-tfrs-test' # replaced MODEL_DIR='spotify-tfrs-dir' 

EXPERIMENT_PREFIX = 'dev'                                   # custom identifier for organizing experiments
EXPERIMENT_NAME=f'{EXPERIMENT_PREFIX}-{MODEL_TYPE}-{FRAMEWORK}-{MODEL_VERSION}'
RUN_NAME=f'run-{TIMESTAMP}'
DATA_REGIME = 'small-jt-tfrecord'

VALID_FREQUENCY = 10
# VALID_SIZE = 20_000

NUM_EPOCHS = 1
BATCH_SIZE = 256
LEARNING_RATE = 0.01

MAX_PADDING = 375
EMBEDDING_DIM = 32
PROJECTION_DIM = 5

DROPOUT_RATE = 0.4
LAYER_SIZES = '[64,32]'

WORKER_CMD = ["python", "trainer/task.py"]
# WORKER_CMD ["python", "-m", "trainer.task"]

WORKER_ARGS = [
    f'--project={PROJECT}',
    f'--train_output_gcs_bucket={OUTPUT_BUCKET}',
    f'--train_dir={TRAIN_DIR}',
    f'--train_dir_prefix={TRAIN_DIR_PREFIX}',
    f'--valid_dir={VALID_DIR}',
    f'--valid_dir_prefix={VALID_DIR_PREFIX}',
    # f'--model_dir={MODEL_DIR}',
    f'--candidate_file_dir={CANDIDATE_FILE_DIR}',
    f'--candidate_files_prefix={CANDIDATE_PREFIX}',
    f'--experiment_name={EXPERIMENT_NAME}',
    f'--experiment_run={RUN_NAME}',
    f'--num_epochs={NUM_EPOCHS}',
    f'--batch_size={BATCH_SIZE}',
    f'--embedding_dim={EMBEDDING_DIM}',
    f'--projection_dim={PROJECTION_DIM}',
    f'--layer_sizes={LAYER_SIZES}',
    f'--learning_rate={LEARNING_RATE}',
    f'--valid_frequency={VALID_FREQUENCY}',
    f'--distribute={DISTRIBUTE_STRATEGY}',
    f'--model_version={MODEL_VERSION}',
    f'--pipeline_version={PIPELINE_VERSION}',
    f'--data_regime={DATA_REGIME}',
]

# deprecated model args
    # f'--valid_size={VALID_SIZE}',
    # f'--new_adapts={new_adapts}',
    # f'--use_cross_layer={use_cross_layer}',
    # f'--use_dropout={use_dropout}',

    
WORKER_POOL_SPECS = prepare_worker_pool_specs(
    image_uri=IMAGE_URI,
    args=WORKER_ARGS,
    cmd=WORKER_CMD,
    replica_count=REPLICA_COUNT,
    machine_type=WORKER_MACHINE_TYPE,
    accelerator_count=PER_MACHINE_ACCELERATOR_COUNT,
    accelerator_type=ACCELERATOR_TYPE,
    reduction_server_count=REDUCTION_SERVER_COUNT,
    reduction_server_machine_type=REDUCTION_SERVER_MACHINE_TYPE,
)

from pprint import pprint
pprint(WORKER_POOL_SPECS)

[{'container_spec': {'args': ['--project=hybrid-vertex',
                              '--train_output_gcs_bucket=jt-tfrs-test',
                              '--train_dir=spotify-tfrs-dir',
                              '--train_dir_prefix=small-dataset/',
                              '--valid_dir=spotify-tfrs-dir',
                              '--valid_dir_prefix=small-dataset/',
                              '--candidate_file_dir=spotify-tfrs-dir',
                              '--candidate_files_prefix=small-dataset/',
                              '--experiment_name=dev-2tower-tfrs-v15',
                              '--experiment_run=run-20220923-153130',
                              '--num_epochs=1',
                              '--batch_size=256',
                              '--embedding_dim=32',
                              '--projection_dim=5',
                              '--layer_sizes=[64,32]',
                              '--learning_rate=0.01',
                 

### Test Locally

In [73]:
!tree /home/jupyter/spotify-tfrs/src

[01;34m/home/jupyter/spotify-tfrs/src[00m
├── Dockerfile.tfrs
├── README.md
├── cloudbuild.yaml
├── downloaded_vocabs.txt
├── [01;34mpipelines[00m
│   ├── [01;34m__pycache__[00m
│   │   ├── build_custom_train_image.cpython-37.pyc
│   │   ├── build_vocabs_stats.cpython-37.pyc
│   │   ├── create_tensorboard.cpython-37.pyc
│   │   ├── find_model_endpoint_test.cpython-37.pyc
│   │   ├── generate_candidate_embedding_index.cpython-37.pyc
│   │   └── train_custom_model.cpython-37.pyc
│   ├── build_custom_train_image.py
│   ├── build_vocabs_stats.py
│   ├── create_tensorboard.py
│   ├── custom_container_pipeline_spec.json
│   ├── find_model_endpoint_test.py
│   ├── generate_candidate_embedding_index.py
│   └── train_custom_model.py
└── [01;34mtrainer[00m
    ├── __init__.py
    ├── _data.py
    ├── _model.py
    ├── interactive_train.py
    ├── requirements.txt
    ├── task.py
    └── train_config.py

3 directories, 24 files


In [74]:
os.chdir('/home/jupyter/spotify-tfrs')
os.getcwd()

'/home/jupyter/spotify-tfrs'

In [76]:
TIMESTAMP = time.strftime("%Y%m%d-%H%M%S")

_DISTRIBUTE_STRATEGY='single'
_EXPERIMENT_NAME=f'local-testing-{MODEL_VERSION}'
_RUN_NAME=f'run-{TIMESTAMP}'

In [78]:
!cd src; python3 -m trainer.task \
    --project={PROJECT} --train_output_gcs_bucket={OUTPUT_BUCKET} --train_dir={TRAIN_DIR} --train_dir_prefix={TRAIN_DIR_PREFIX} \
    --valid_dir={VALID_DIR} --valid_dir_prefix={VALID_DIR_PREFIX} \
    --candidate_file_dir={CANDIDATE_FILE_DIR} --candidate_files_prefix={CANDIDATE_PREFIX} \
    --experiment_name={_EXPERIMENT_NAME} --experiment_run={_RUN_NAME} \
    --max_padding={MAX_PADDING} \
    --num_epochs={NUM_EPOCHS} --batch_size={BATCH_SIZE} --embedding_dim={EMBEDDING_DIM} --projection_dim={PROJECTION_DIM} \
    --dropout_rate={DROPOUT_RATE} --layer_sizes={LAYER_SIZES} --learning_rate={LEARNING_RATE} \
    --valid_frequency={VALID_FREQUENCY} --distribute={_DISTRIBUTE_STRATEGY} \
    --model_version={MODEL_VERSION} --pipeline_version={PIPELINE_VERSION} \
    --data_regime={DATA_REGIME}

Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/opt/conda/lib/python3.7/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/jupyter/spotify-tfrs/src/trainer/task.py", line 38, in <module>
    import _data as trainer_data
ModuleNotFoundError: No module named '_data'


### Build Custom Train Image

In [22]:
print(f"DOCKERNAME: {DOCKERNAME}")
print(f"IMAGE_URI: {IMAGE_URI}")
print(f"FILE_LOCATION: {FILE_LOCATION}")
print(f"MACHINE_TYPE: {MACHINE_TYPE}")

DOCKERNAME: tfrs
IMAGE_URI: gcr.io/hybrid-vertex/sp-2tower-tfrs-v15-v0-training
FILE_LOCATION: ./src
MACHINE_TYPE: e2-highcpu-32


In [23]:
os.chdir('/home/jupyter/spotify-tfrs')
os.getcwd()

'/home/jupyter/spotify-tfrs'

### Submit to Cloud Build

In [24]:
! gcloud builds submit --config src/cloudbuild.yaml \
    --substitutions _DOCKERNAME=$DOCKERNAME,_IMAGE_URI=$IMAGE_URI,_FILE_LOCATION=$FILE_LOCATION \
    --timeout=2h \
    --machine-type=$MACHINE_TYPE

Creating temporary tarball archive of 38 file(s) totalling 214.6 MiB before compression.
Uploading tarball of [.] to [gs://hybrid-vertex_cloudbuild/source/1663947103.184284-4ae8757944fb463fbc274addee0074dc.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/hybrid-vertex/locations/global/builds/a4f3102a-1ba1-4920-a3c5-4dc8995c6018].
Logs are available at [ https://console.cloud.google.com/cloud-build/builds/a4f3102a-1ba1-4920-a3c5-4dc8995c6018?project=934903580331 ].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "a4f3102a-1ba1-4920-a3c5-4dc8995c6018"

FETCHSOURCE
Fetching storage object: gs://hybrid-vertex_cloudbuild/source/1663947103.184284-4ae8757944fb463fbc274addee0074dc.tgz#1663947189470531
Copying gs://hybrid-vertex_cloudbuild/source/1663947103.184284-4ae8757944fb463fbc274addee0074dc.tgz#1663947189470531...
- [1 files][ 85.5 MiB/ 85.5 MiB]                                                
Operation completed over 1 objects/85

## Submit train job to Vertex

```
Could not load library libcudnn_adv_train.so.8. Error: /opt/conda/bin/../lib/libcudnn_ops_train.so.8: undefined symbol: _Z22cudnnGenericOpTensorNdILi3EE13cudnnStatus_tP12cudnnContext16cudnnGenericOp_t21cudnnNanPropagation_tPKdPKvPK17cudnnTensorStructS8_S8_SB_S8_S8_SB_Pv, version libcudnn_ops_infer.so.8

```

In [25]:
MODEL_ROOT_NAME

'sp-2tower-tfrs-v15-v0'

In [26]:
JOB_NAME = f'train-{MODEL_ROOT_NAME}' #-{TIMESTAMP}'

# e.g., MODEL_DIR_GCS_URI = f'gs://{args.train_output_gcs_bucket}/{EXPERIMENT_NAME}/{RUN_NAME}/model-dir'
BASE_OUTPUT_DIR = f'gs://{OUTPUT_BUCKET}/{MODEL_ROOT_NAME}/{EXPERIMENT_NAME}/{RUN_NAME}'

print(f'JOB_NAME:{JOB_NAME}')
print(f'BASE_OUTPUT_DIR:{BASE_OUTPUT_DIR}')

JOB_NAME:train-sp-2tower-tfrs-v15-v0
BASE_OUTPUT_DIR:gs://jt-tfrs-test/dev-2tower-tfrs-v15/run-20220923-153130


In [27]:
job = vertex_ai.CustomJob(
    display_name=JOB_NAME,
    worker_pool_specs=WORKER_POOL_SPECS,
    staging_bucket=BASE_OUTPUT_DIR,
    # labels={'gpu':f'{ACCELERATOR_TYPE}'}
)
job.run(sync=False, 
        service_account=VERTEX_SA,
        tensorboard=TENSORBOARD,
        restart_job_on_worker_restart=False,
        enable_web_access=True,
)


Creating CustomJob
CustomJob created. Resource name: projects/934903580331/locations/us-central1/customJobs/971634852052860928
To use this CustomJob in another session:
custom_job = aiplatform.CustomJob.get('projects/934903580331/locations/us-central1/customJobs/971634852052860928')
View Custom Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/971634852052860928?project=934903580331
View Tensorboard:
https://us-central1.tensorboard.googleusercontent.com/experiment/projects+934903580331+locations+us-central1+tensorboards+5925030667573264384+experiments+971634852052860928
CustomJob projects/934903580331/locations/us-central1/customJobs/971634852052860928 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/934903580331/locations/us-central1/customJobs/971634852052860928 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/934903580331/locations/us-central1/customJobs/971634852052860928 current state:
JobState.JOB_STATE_PENDING
CustomJob proj

# Local Testing

## Loading SavedModels

In [507]:
import tensorflow as tf

query_tower_uri = 'gs://spotify-tfrs-dir/v2/run-20220920-210334/query_tower'
candidate_tower_uri = 'gs://spotify-tfrs-dir/v2/run-20220920-210334/candidate_tower'
loaded_query_model = tf.saved_model.load(query_tower_uri)
loaded_candidate_model = tf.saved_model.load(candidate_tower_uri)

2022-09-20 21:28:45.720839: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64
2022-09-20 21:28:45.720904: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-09-20 21:28:45.720932: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (jt-tfrs-spotify-sept-v2): /proc/driver/nvidia/version does not exist
2022-09-20 21:28:45.721353: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Candidate Model

In [512]:
print(list(loaded_candidate_model.signatures.keys()))

['serving_default']


In [513]:
infer = loaded_candidate_model.signatures["serving_default"]
print(infer.structured_outputs)

{'output_1': TensorSpec(shape=(None, 32), dtype=tf.float32, name='output_1')}


In [508]:
predict2 = loaded_candidate_model.signatures['serving_default']
predict2.output_shapes

{'output_1': TensorShape([None, 32])}

In [510]:
loaded_candidate_model.signatures

_SignatureMap({'serving_default': <ConcreteFunction signature_wrapper(*, album_name_can, album_uri_can, artist_followers_can, artist_genres_can, artist_name_can, artist_pop_can, artist_uri_can, duration_ms_can, track_name_can, track_pop_can, track_uri_can) at 0x7F27EA9BBA10>})

In [None]:
embs_iter = parsed_dataset_candidates.batch(1).map(lambda data: predict2(
                artist_name = data["artist_name_can"],
                track_name = data['track_name_can'],
                album_name = data['album_name_can'],
                track_uri = data['track_uri_can'],
                artist_uri = data['artist_uri_can'],
                album_uri = data['album_uri_can'],
                duration_ms = data['duration_ms_can'],
                track_pop = data['track_pop_can'],
                artist_pop = data['artist_pop_can'],
                artist_followers = data['artist_followers_can'],
                artist_genres = data['artist_genres_can']))

    
candidate_features = {
    'track_name_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'artist_name_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'album_name_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'track_uri_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'artist_uri_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'album_uri_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'duration_ms_can': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    'track_pop_can': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    'artist_pop_can': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    'artist_genres_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'artist_followers_can': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
}

# TODOs:

> adapts vs vocab_dict

```
test_playlist_model = Playlist_Model(layer_sizes, vocab_dict_load)
test_playlist_model.pl_name_text_embedding.layers[0].adapt(parsed_dataset_padded.map(lambda x: x['name']).batch(1000))
```

In [None]:
avg_duration_ms_seed_pl = 13000151.68
var_duration_ms_seed_pl = 133092900971233.58
vocab_dict_load['avg_duration_ms_seed_pl']=avg_duration_ms_seed_pl
vocab_dict_load['var_duration_ms_seed_pl']=var_duration_ms_seed_pl

avg_n_songs_pl = 55.21
var_n_songs_pl = 2317.54
vocab_dict_load['avg_n_songs_pl']=avg_n_songs_pl
vocab_dict_load['var_n_songs_pl']=var_n_songs_pl

avg_n_artists_pl = 30.56
var_n_artists_pl = 769.26
vocab_dict_load['avg_n_artists_pl']=avg_n_artists_pl
vocab_dict_load['var_n_artists_pl']=var_n_artists_pl

avg_n_albums_pl = 40.25
var_n_albums_pl = 1305.54
vocab_dict_load['avg_n_albums_pl']=avg_n_albums_pl
vocab_dict_load['var_n_albums_pl']=var_n_albums_pl

avg_artist_pop = 16.08
var_artist_pop = 300.64
vocab_dict_load['avg_artist_pop']=avg_artist_pop
vocab_dict_load['var_artist_pop']=var_artist_pop

avg_duration_ms_songs_pl = 234823.14
var_duration_ms_songs_pl = 5558806228.41
vocab_dict_load['avg_duration_ms_songs_pl']=avg_duration_ms_songs_pl
vocab_dict_load['var_duration_ms_songs_pl']=var_duration_ms_songs_pl

avg_artist_followers = 43337.77
var_artist_followers = 377777790193.57
vocab_dict_load['avg_artist_followers']=avg_artist_followers
vocab_dict_load['var_artist_followers']=var_artist_followers

avg_track_pop = 10.85
var_track_pop = 202.18
vocab_dict_load['avg_track_pop']=avg_track_pop
vocab_dict_load['var_track_pop']=var_track_pop
# vocab_dict_load['unique_pids_string']

### Archived Dockerfiles

In [None]:
# %%writefile {REPO_DOCKER_PATH_PREFIX}/Dockerfile.{DOCKERNAME}

# # Dockerfile-gpu
# FROM gcr.io/deeplearning-platform-release/tf-gpu.2-9

# WORKDIR /src

# # Copies the trainer code to the docker image.
# COPY trainer/* trainer/ 

# RUN pip install -r trainer/requirements.txt

# # Sets up the entry point to invoke the trainer.
# # ENTRYPOINT ["python", "-m", "trainer.task"]