# Feature extraction with the Candidate Tower

After Two-Tower training, the `candidate_tower` is used to convert all candidate items into embeddings. 

The embeddings are indexed and deployed to an endpoint for serving.

In [1]:
# Google Cloud project used by the Storage client and by vertex_ai.init() below.
PROJECT_ID = 'hybrid-vertex'  # <--- TODO: CHANGE THIS
# Region passed to vertex_ai.init().
LOCATION = 'us-central1' 

### pip & package

In [None]:
# pip install tensorflow-recommenders==0.7.0

# pip install google-cloud-aiplatform==1.17.0
# pip install tensorflow-recommenders==0.7.0

# pip install tensorboard==2.9.1
# pip install tensorboard-data-server==0.6.1
# pip install tensorboard-plugin-profile==2.5.0
# pip install cloudml-hypertune
# pip install google-cloud-aiplatform[cloud_profiler]

In [35]:
import json
import tensorflow as tf
import tensorflow_recommenders as tfrs
import tensorflow_io as tfio

from google.cloud import storage
from google.cloud.storage.bucket import Bucket
from google.cloud.storage.blob import Blob

import google.cloud.aiplatform as vertex_ai

import numpy as np
import pickle as pkl
from pprint import pprint

import os

## Load `SavedModel`

In [4]:
# GCS URI of the exported candidate-tower SavedModel directory.
# Alternative run kept for reference:
# candidate_tower_uri = 'gs://spotify-tfrs-dir/v2/run-20220920-210334/candidate_tower' # locally trained
candidate_tower_uri = 'gs://spotify-tfrs-dir/v11/run-20220921-163503/candidate_tower' # vertex trained

# Load the SavedModel directly from the GCS URI.
loaded_candidate_model = tf.saved_model.load(candidate_tower_uri)

# Display the signatures the SavedModel exposes.
loaded_candidate_model.signatures

2022-09-21 17:21:42.186998: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-09-21 17:21:42.291367: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-09-21 17:21:42.293192: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-09-21 17:21:42.295500: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

_SignatureMap({'serving_default': <ConcreteFunction signature_wrapper(*, artist_pop_can, album_name_can, track_pop_can, album_uri_can, artist_genres_can, track_uri_can, artist_uri_can, artist_followers_can, track_name_can, artist_name_can, duration_ms_can) at 0x7F23B93D4F50>})

In [5]:
# List the signature names exposed by the loaded SavedModel.
print(list(loaded_candidate_model.signatures.keys()))

['serving_default']


In [6]:
# Default serving signature; later cells call it with keyword tensors to get embeddings.
candidate_predictor = loaded_candidate_model.signatures["serving_default"]
# Show the structure (name, shape, dtype) of what the signature returns.
print(candidate_predictor.structured_outputs)

{'output_1': TensorSpec(shape=(None, 32), dtype=tf.float32, name='output_1')}


In [7]:
# Output shapes of the serving signature (the last dimension is the embedding size).
candidate_predictor.output_shapes

{'output_1': TensorShape([None, 32])}

## Candidate Dataset

### Helper functions 

In [9]:
# Cloud Storage client bound to PROJECT_ID; reused for the upload and blob listing below.
storage_client = storage.Client(project=PROJECT_ID)

In [10]:
# (feature name, dtype) for every scalar field stored in a candidate tf.train.Example.
_CANDIDATE_FEATURE_DTYPES = (
    ('track_name_can', tf.string),
    ('artist_name_can', tf.string),
    ('album_name_can', tf.string),
    ('track_uri_can', tf.string),
    ('artist_uri_can', tf.string),
    ('album_uri_can', tf.string),
    ('duration_ms_can', tf.float32),
    ('track_pop_can', tf.float32),
    ('artist_pop_can', tf.float32),
    ('artist_genres_can', tf.string),
    ('artist_followers_can', tf.float32),
)

# Parsing spec: each candidate feature is a fixed-length scalar (shape=()).
candidate_features = {
    feature_name: tf.io.FixedLenFeature(shape=(), dtype=feature_dtype)
    for feature_name, feature_dtype in _CANDIDATE_FEATURE_DTYPES
}


def parse_candidate_tfrecord_fn(example):
    """Parse one serialized candidate record.

    Args:
        example: a serialized `tf.train.Example` (one element of a
            `TFRecordDataset`).

    Returns:
        Dict mapping each key of `candidate_features` to its parsed tensor.
    """
    return tf.io.parse_single_example(example, features=candidate_features)

# Candidate Records v1 
* tf-records from JT's pre-Argolis Beam pipeline

In [17]:
# Two shards of the v1 TFRecords, used as a small sample of candidates.
SAMPLE_FILES = ["gs://spotify-tfrecords-blog/tfrecords_v1/train/output-00000-of-00796.tfrecord",
              "gs://spotify-tfrecords-blog/tfrecords_v1/train/output-00002-of-00796.tfrecord"]

# Dataset of raw serialized records; parsing happens in the next cell.
raw_dataset = tf.data.TFRecordDataset(SAMPLE_FILES)

In [19]:
# Decode every serialized record into a dict of candidate feature tensors.
parsed_candidate_dataset_v1 = raw_dataset.map(parse_candidate_tfrecord_fn)

# Print a single parsed record as a schema sanity check.
for parsed_example in parsed_candidate_dataset_v1.take(1):
    pprint(parsed_example)
    print("_______________")

{'album_name_can': <tf.Tensor: shape=(), dtype=string, numpy=b'Glee: The Music, The Complete Season Three'>,
 'album_uri_can': <tf.Tensor: shape=(), dtype=string, numpy=b'spotify:album:082BH67sSIDefLxUp8GgNm'>,
 'artist_followers_can': <tf.Tensor: shape=(), dtype=float32, numpy=1452611.0>,
 'artist_genres_can': <tf.Tensor: shape=(), dtype=string, numpy=b"'glee club', 'hollywood', 'post-teen pop'">,
 'artist_name_can': <tf.Tensor: shape=(), dtype=string, numpy=b'Glee Cast'>,
 'artist_pop_can': <tf.Tensor: shape=(), dtype=float32, numpy=76.0>,
 'artist_uri_can': <tf.Tensor: shape=(), dtype=string, numpy=b'spotify:artist:0SCbttzoZTnLFebDYmAWCm'>,
 'duration_ms_can': <tf.Tensor: shape=(), dtype=float32, numpy=285493.0>,
 'track_name_can': <tf.Tensor: shape=(), dtype=string, numpy=b'You Get What You Give (Glee Cast Version)'>,
 'track_pop_can': <tf.Tensor: shape=(), dtype=float32, numpy=27.0>,
 'track_uri_can': <tf.Tensor: shape=(), dtype=string, numpy=b'spotify:track:6KhJeYLg1AimCQjH6ii1Al

### Generate embeddings
* use `candidate_predictor` to produce embeddings for each candidate item
* store embeddings in list
* zip candidate embeddings and candidate IDs together
* write `json` or `csv` file for ANN Index  

### candidate embedding vectors

In [23]:
# Keyword arguments passed to the candidate tower's serving signature.
_PREDICTOR_INPUT_NAMES = (
    'artist_name_can',
    'track_name_can',
    'album_name_can',
    'track_uri_can',
    'artist_uri_can',
    'album_uri_can',
    'duration_ms_can',
    'track_pop_can',
    'artist_pop_can',
    'artist_followers_can',
    'artist_genres_can',
)

# batch(1) feeds one record per call, so every result is a dict holding a
# (1, 32) tensor under 'output_1'; the next cell relies on that shape.
embs_iter = parsed_candidate_dataset_v1.batch(1).map(
    lambda data: candidate_predictor(
        **{input_name: data[input_name] for input_name in _PREDICTOR_INPUT_NAMES}
    )
)

# Materialize every prediction in memory (one dict per candidate).
embs = list(embs_iter)

print(f"Length of embs: {len(embs)}")
embs[0]

Length of embs: 166827


{'output_1': <tf.Tensor: shape=(1, 32), dtype=float32, numpy=
 array([[ 0.37292254,  0.12035738, -0.5353719 , -0.07446399, -0.01110614,
          0.06166476, -0.41633704, -0.7236851 , -0.3176018 , -0.33124822,
          0.01249374,  0.18589361, -0.02328215, -0.43030298, -0.28714353,
          0.11682993,  0.16103686, -0.13076518,  0.33740237, -0.0402809 ,
         -0.22093898,  0.24394853, -0.39488426, -0.2421201 , -0.26179013,
          0.06631055,  0.09302226,  0.2791449 ,  0.20307502,  0.2394592 ,
         -0.12311258,  0.15563585]], dtype=float32)>}

Clean embedding output...

In [24]:
# Each element of `embs` is {'output_1': tensor}; take row 0 of the batch-of-one
# result so every candidate is represented by a single 1-D numpy vector.
cleaned_embs = [x['output_1'].numpy()[0] for x in embs] # unwrap dict + batch dimension

print(f"Length of cleaned_embs: {len(cleaned_embs)}")
cleaned_embs[0]

Length of cleaned_embs: 166827


array([ 0.37292254,  0.12035738, -0.5353719 , -0.07446399, -0.01110614,
        0.06166476, -0.41633704, -0.7236851 , -0.3176018 , -0.33124822,
        0.01249374,  0.18589361, -0.02328215, -0.43030298, -0.28714353,
        0.11682993,  0.16103686, -0.13076518,  0.33740237, -0.0402809 ,
       -0.22093898,  0.24394853, -0.39488426, -0.2421201 , -0.26179013,
        0.06631055,  0.09302226,  0.2791449 ,  0.20307502,  0.2394592 ,
       -0.12311258,  0.15563585], dtype=float32)

### candidate IDs

In [25]:
# Collect the raw track URI of every candidate as `bytes`, in dataset order,
# so they line up index-for-index with the embeddings.
track_uris = [x['track_uri_can'].numpy() for x in parsed_candidate_dataset_v1]
track_uris[0]

b'spotify:track:6KhJeYLg1AimCQjH6ii1Al'

In [26]:
# Convert the raw `bytes` URIs to `str` by decoding them. Stripping characters
# from the repr (str(b'...')) would drop legitimate apostrophes and leave
# escape sequences such as \xc3 in any non-ASCII value.
track_uris_cleaned = [raw_uri.decode("utf-8") for raw_uri in track_uris]
track_uris_cleaned[0]

'spotify:track:6KhJeYLg1AimCQjH6ii1Al'

In [27]:
# Both lists should have the same length as `cleaned_embs`, since IDs and
# embeddings are zipped together by position further down.
print(f"Length of track_uris: {len(track_uris)}")
print(f"Length of track_uris_cleaned: {len(track_uris_cleaned)}")

Length of track_uris: 166827
Length of track_uris_cleaned: 166827


### Check for bad records

In [45]:
# Reference look at a well-formed embedding before scanning for NaNs.
cleaned_embs[0]

array([ 0.37292254,  0.12035738, -0.5353719 , -0.07446399, -0.01110614,
        0.06166476, -0.41633704, -0.7236851 , -0.3176018 , -0.33124822,
        0.01249374,  0.18589361, -0.02328215, -0.43030298, -0.28714353,
        0.11682993,  0.16103686, -0.13076518,  0.33740237, -0.0402809 ,
       -0.22093898,  0.24394853, -0.39488426, -0.2421201 , -0.26179013,
        0.06631055,  0.09302226,  0.2791449 ,  0.20307502,  0.2394592 ,
       -0.12311258,  0.15563585], dtype=float32)

In [46]:
# One entry per NaN *value*: a row index appears once for every NaN component
# found in that row's embedding.
bad_records = [
    row_idx
    for row_idx, embedding in enumerate(cleaned_embs)
    for is_nan in np.isnan(embedding)
    if is_nan
]

# Distinct row indices that contain at least one NaN.
bad_record_filter = np.unique(bad_records)

print(f"bad_records: {len(bad_records)}")
print(f"bad_record_filter: {len(bad_record_filter)}")

bad_records: 4608
bad_record_filter: 144


In [47]:
# Index of the first row whose embedding contains a NaN.
bad_record_filter[0]

1823

In [48]:
# Keep only the (track URI, embedding) pairs whose row index is not flagged as
# containing NaNs. The flagged indices go into a set so each membership test is
# O(1) instead of a linear scan over the NumPy array for every row.
bad_record_ids = set(bad_record_filter.tolist())

track_uris_valid = []
emb_valid = []

for i, (t_uri, embed) in enumerate(zip(track_uris_cleaned, cleaned_embs)):
    if i in bad_record_ids:
        continue
    track_uris_valid.append(t_uri)
    emb_valid.append(embed)

In [51]:
# First embedding that survived the NaN filter.
emb_valid[0]

array([ 0.37292254,  0.12035738, -0.5353719 , -0.07446399, -0.01110614,
        0.06166476, -0.41633704, -0.7236851 , -0.3176018 , -0.33124822,
        0.01249374,  0.18589361, -0.02328215, -0.43030298, -0.28714353,
        0.11682993,  0.16103686, -0.13076518,  0.33740237, -0.0402809 ,
       -0.22093898,  0.24394853, -0.39488426, -0.2421201 , -0.26179013,
        0.06631055,  0.09302226,  0.2791449 ,  0.20307502,  0.2394592 ,
       -0.12311258,  0.15563585], dtype=float32)

### Write embedding vectors to json file

In [52]:
VERSION = 'local_v2'
TIMESTAMP = '092122'

embeddings_index_filename = f'candidate_embeddings_{VERSION}_{TIMESTAMP}.json'

# Write one JSON object per line: {"id": <track uri>, "embedding": [<floats>]}.
# The id goes through json.dumps so quotes/backslashes/control characters are
# escaped and every line stays valid JSON. Embedding values keep their str()
# form, so the numeric text is unchanged.
with open(embeddings_index_filename, 'w') as f:
    for track_uri, embedding in zip(track_uris_valid, emb_valid):
        vector_text = ",".join(str(value) for value in embedding)
        f.write('{"id":' + json.dumps(str(track_uri)) + ',')
        f.write('"embedding":[' + vector_text + "]}")
        f.write("\n")

### Upload `json` to GCS

In [53]:
BUCKET = 'spotify-tfrs-dir'
PATH_TO_INDEX_DIR = 'v11/run-20220921-163503/candidate-index'
# Directory-style URI (trailing slash) that the index build reads from.
INDEX_GCS_URI = f'gs://{BUCKET}/{PATH_TO_INDEX_DIR}/'

print(f"INDEX_GCS_URI: {INDEX_GCS_URI}")

DESTINATION_BLOB_NAME = embeddings_index_filename
SOURCE_FILE_NAME = embeddings_index_filename

# Build the object URI with plain string formatting: os.path.join is meant for
# local filesystem paths, not URIs.
destination_uri = f"{INDEX_GCS_URI.rstrip('/')}/{DESTINATION_BLOB_NAME}"

# Attach the client through the public `client` argument instead of assigning
# to the private `blob.bucket._client` attribute.
blob = Blob.from_string(destination_uri, client=storage_client)
blob.upload_from_filename(SOURCE_FILE_NAME)

INDEX_GCS_URI: gs://spotify-tfrs-dir/v11/run-20220921-163503/candidate-index/


In [38]:
# CONTENTS_DELTA_URI = 'spotify-tfrs-dir/v11/run-20220921-163503/candidate-index'

# gs://spotify-tfrs-dir/v11/run-20220921-163503/candidate-index/candidate_embeddings_local_v2_092122.json

# import json

# with open(f'{SOURCE_FILE_NAME}', 'r') as f:
#     data = json.load(f)

## Create Indexes

In [39]:
# Initialize the Vertex AI SDK with the project and region set at the top of the notebook.

vertex_ai.init(project=PROJECT_ID, location=LOCATION)

[src code](https://github.com/googleapis/python-aiplatform/blob/main/google/cloud/aiplatform/matching_engine/matching_engine_index.py#L404) for creating index with Matching Engine SDK

In [40]:
# Confirm which VERSION tag will be baked into the index display name and labels.
VERSION

'local_v2'

In [54]:
# ANN index configuration
display_name = f'spotify_candidate_index_{VERSION}'
dimensions = 32  # embedding width reported by candidate_predictor.output_shapes
approximate_neighbors_count = 25
distance_measure_type = "DOT_PRODUCT_DISTANCE"
leaf_node_embedding_count = 500
leaf_nodes_to_search_percent = 7
# update_type='BatchUpdate'
emb_index_gcs_bucket_uri = INDEX_GCS_URI  # directory path to embedding json

# Record the configuration as labels on the index; every value is stringified.
ann_index_labels = {
    label_key: str(label_value)
    for label_key, label_value in (
        ('version', VERSION),
        ('dimensions', dimensions),
        ('approx_neighbors', approximate_neighbors_count),
        ('distance_measure', distance_measure_type),
        ('leaf_node_embedding_count', leaf_node_embedding_count),
        ('leaf_nodes_search_percent', leaf_nodes_to_search_percent),
        # ('update_type', update_type),
    )
}

pprint(ann_index_labels)

{'approx_neighbors': '25',
 'dimensions': '32',
 'distance_measure': 'DOT_PRODUCT_DISTANCE',
 'leaf_node_embedding_count': '500',
 'leaf_nodes_search_percent': '7',
 'version': 'local_v2'}


### Create Index 

* github tutorial notebook for [matching_engine_sdk](https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/matching_engine/sdk_matching_engine_for_indexing.ipynb)
* [Docs](https://cloud.google.com/vertex-ai/docs/matching-engine/configuring-indexes)

> this may take up to 45 minutes...

In [55]:
import time

# All arguments for the tree-AH index, gathered in one place.
_index_request = dict(
    display_name=display_name,
    contents_delta_uri=emb_index_gcs_bucket_uri,
    dimensions=dimensions,
    approximate_neighbors_count=approximate_neighbors_count,
    distance_measure_type=distance_measure_type,
    leaf_node_embedding_count=leaf_node_embedding_count,
    leaf_nodes_to_search_percent=leaf_nodes_to_search_percent,
    description=display_name,
    labels=ann_index_labels,
)

# Wall-clock timing around the create call (the recorded run took ~2620 s).
start_time = time.time()
ann_index = vertex_ai.MatchingEngineIndex.create_tree_ah_index(**_index_request)
end_time = time.time()

elapsed_time = end_time - start_time
# Full resource name; keep it to re-attach to this index in a later session.
ann_index_resource_uri = ann_index.resource_name

print(f"Index created Elapsed time: {elapsed_time}")
print(f"To use in another session: {ann_index_resource_uri}")

Creating MatchingEngineIndex
Create MatchingEngineIndex backing LRO: projects/934903580331/locations/us-central1/indexes/6447588961836597248/operations/3695012522831642624
MatchingEngineIndex created. Resource name: projects/934903580331/locations/us-central1/indexes/6447588961836597248
To use this MatchingEngineIndex in another session:
index = aiplatform.MatchingEngineIndex('projects/934903580331/locations/us-central1/indexes/6447588961836597248')
Index created Elapsed time: 2620.1704621315002
To use in another session: projects/934903580331/locations/us-central1/indexes/6447588961836597248


### Create Index Endpoint

### Deploy Index to Endpoint

### Query Index 

# Candidate Records v2 
* tf-records from JW's recent Beam pipeline

In [56]:
# candidate_dir = 'gs://spotify-beam-v3/v3/candidates'
CANDIDATE_BUCKET = 'spotify-beam-v3'
CANDIDATE_PREFIX = 'v3/candidates/'

# Build gs:// URIs straight from the object names. `public_url` percent-encodes
# the name, so rewriting it would yield a wrong URI for any object containing
# special characters. A comprehension also avoids rebinding the global `blob`
# created by the upload cell.
# delimiter="/" restricts the listing to objects directly under the prefix.
candidate_files = [
    f"gs://{CANDIDATE_BUCKET}/{candidate_blob.name}"
    for candidate_blob in storage_client.list_blobs(
        CANDIDATE_BUCKET, prefix=CANDIDATE_PREFIX, delimiter="/"
    )
]

candidate_files

['gs://spotify-beam-v3/v3/candidates/-00000-of-00008.tfrecords',
 'gs://spotify-beam-v3/v3/candidates/-00001-of-00008.tfrecords',
 'gs://spotify-beam-v3/v3/candidates/-00002-of-00008.tfrecords',
 'gs://spotify-beam-v3/v3/candidates/-00003-of-00008.tfrecords',
 'gs://spotify-beam-v3/v3/candidates/-00004-of-00008.tfrecords',
 'gs://spotify-beam-v3/v3/candidates/-00005-of-00008.tfrecords',
 'gs://spotify-beam-v3/v3/candidates/-00006-of-00008.tfrecords',
 'gs://spotify-beam-v3/v3/candidates/-00007-of-00008.tfrecords']

In [57]:
# Read every v2 candidate shard and parse each record into a feature dict.
raw_candidate_dataset = tf.data.TFRecordDataset(candidate_files)
parsed_candidate_dataset = raw_candidate_dataset.map(parse_candidate_tfrecord_fn)

In [58]:
# Show the dataset's element_spec (feature names, dtypes, shapes).
parsed_candidate_dataset

<MapDataset element_spec={'album_name_can': TensorSpec(shape=(), dtype=tf.string, name=None), 'album_uri_can': TensorSpec(shape=(), dtype=tf.string, name=None), 'artist_followers_can': TensorSpec(shape=(), dtype=tf.float32, name=None), 'artist_genres_can': TensorSpec(shape=(), dtype=tf.string, name=None), 'artist_name_can': TensorSpec(shape=(), dtype=tf.string, name=None), 'artist_pop_can': TensorSpec(shape=(), dtype=tf.float32, name=None), 'artist_uri_can': TensorSpec(shape=(), dtype=tf.string, name=None), 'duration_ms_can': TensorSpec(shape=(), dtype=tf.float32, name=None), 'track_name_can': TensorSpec(shape=(), dtype=tf.string, name=None), 'track_pop_can': TensorSpec(shape=(), dtype=tf.float32, name=None), 'track_uri_can': TensorSpec(shape=(), dtype=tf.string, name=None)}>

In [59]:
# TEST_small = shuffled_parsed_ds.skip(80_000).take(20_000).batch(128)

# Print a single parsed v2 record as a schema sanity check.
for parsed_example in parsed_candidate_dataset.take(1):
    pprint(parsed_example)
    print("_______________")

{'album_name_can': <tf.Tensor: shape=(), dtype=string, numpy=b'The Sound of Everything Rmx'>,
 'album_uri_can': <tf.Tensor: shape=(), dtype=string, numpy=b'spotify:album:4a8tMD6qq6GUuUwNae38VI'>,
 'artist_followers_can': <tf.Tensor: shape=(), dtype=float32, numpy=277649.0>,
 'artist_genres_can': <tf.Tensor: shape=(), dtype=string, numpy=b"'downtempo', 'electronica', 'funk', 'latin alternative', 'nu jazz', 'nu-cumbia', 'trip hop', 'world'">,
 'artist_name_can': <tf.Tensor: shape=(), dtype=string, numpy=b'Quantic'>,
 'artist_pop_can': <tf.Tensor: shape=(), dtype=float32, numpy=64.0>,
 'artist_uri_can': <tf.Tensor: shape=(), dtype=string, numpy=b'spotify:artist:5ZMwoAjeDtLJ0XRwRTgaK8'>,
 'duration_ms_can': <tf.Tensor: shape=(), dtype=float32, numpy=267130.0>,
 'track_name_can': <tf.Tensor: shape=(), dtype=string, numpy=b'The Sound of Everything - Watch TV & Se\xc3\xb1orlobo Remix'>,
 'track_pop_can': <tf.Tensor: shape=(), dtype=float32, numpy=53.0>,
 'track_uri_can': <tf.Tensor: shape=(),

### Generate Embeddings

### Candidate embedding vectors

In [60]:
# Keyword arguments passed to the candidate tower's serving signature.
_PREDICTOR_INPUT_NAMES = (
    'artist_name_can',
    'track_name_can',
    'album_name_can',
    'track_uri_can',
    'artist_uri_can',
    'album_uri_can',
    'duration_ms_can',
    'track_pop_can',
    'artist_pop_can',
    'artist_followers_can',
    'artist_genres_can',
)

# batch(1) feeds one record per call, so every result is a dict holding a
# (1, 32) tensor under 'output_1'; the next cell relies on that shape.
embs_iter = parsed_candidate_dataset.batch(1).map(
    lambda data: candidate_predictor(
        **{input_name: data[input_name] for input_name in _PREDICTOR_INPUT_NAMES}
    )
)

# Materialize every prediction in memory (one dict per candidate).
embs = list(embs_iter)

print(f"Length of embs: {len(embs)}")
embs[0]

Length of embs: 2249561


{'output_1': <tf.Tensor: shape=(1, 32), dtype=float32, numpy=
 array([[ 0.01181636,  0.3014134 , -0.42354333, -0.99971014,  0.24810879,
          0.065205  , -0.44993484, -0.59487194, -0.25165173, -0.8841798 ,
          0.33247462,  0.33106828, -0.1390458 , -0.364905  , -0.51437426,
          0.36792147,  0.14636928,  0.20347618,  0.4378496 , -0.01698672,
         -0.4043599 ,  0.05605474, -0.4586042 ,  0.6386781 , -0.14450541,
          0.02818127, -0.33449343, -0.04326751,  0.20794246, -0.09052821,
          0.18821722, -0.35916686]], dtype=float32)>}

In [61]:
# Each element of `embs` is {'output_1': tensor}; take row 0 of the batch-of-one
# result so every candidate is represented by a single 1-D numpy vector.
cleaned_embs = [x['output_1'].numpy()[0] for x in embs] # unwrap dict + batch dimension

print(f"Length of cleaned_embs: {len(cleaned_embs)}")
cleaned_embs[0]

Length of cleaned_embs: 2249561


array([ 0.01181636,  0.3014134 , -0.42354333, -0.99971014,  0.24810879,
        0.065205  , -0.44993484, -0.59487194, -0.25165173, -0.8841798 ,
        0.33247462,  0.33106828, -0.1390458 , -0.364905  , -0.51437426,
        0.36792147,  0.14636928,  0.20347618,  0.4378496 , -0.01698672,
       -0.4043599 ,  0.05605474, -0.4586042 ,  0.6386781 , -0.14450541,
        0.02818127, -0.33449343, -0.04326751,  0.20794246, -0.09052821,
        0.18821722, -0.35916686], dtype=float32)

### candidate IDs

### Check for bad records

### Write embedding vectors to json file

### Upload `json` to GCS