In [None]:
# !pip install gcsfs gsutil tensorflow

In [1]:
import os
import nvtabular as nvt
from time import time

# Silence log records at WARNING level and below (DEBUG, INFO and WARNING) for
# every logger; only ERROR and CRITICAL records remain visible.
import logging

logging.disable(logging.WARNING)

from nvtabular.ops import (
    Categorify,
    TagAsUserID,
    TagAsItemID,
    TagAsItemFeatures,
    TagAsUserFeatures,
    AddMetadata,
    ListSlice
)
import nvtabular.ops as ops

from merlin.schema.tags import Tags

import merlin.models.tf as mm
from merlin.io.dataset import Dataset
import tensorflow as tf

# Set the TF_GPU_ALLOCATOR environment variable to "cuda_malloc_async".
# For running this example on CPU, comment out the line below.
os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"

ModuleNotFoundError: No module named 'nvtabular'

In [None]:
# --- Configuration ------------------------------------------------------------
BUCKET = "gs://spotify-builtin-2t"
PROJECT = "hybrid-vertex"
LOCATION = "us-central1"

# Length passed to ListSlice for the categorical sequence features in the workflow.
MAX_PADDING = 375

# --- Raw input data -----------------------------------------------------------
# Glob patterns selecting which parquet shards of each split are read.
train_glob = f"{BUCKET}/train_data_parquet/0000000000**.snappy.parquet"
valid_glob = f"{BUCKET}/validation_data_parquet/00000000000*.snappy.parquet"

train = nvt.Dataset(train_glob)
valid = nvt.Dataset(valid_glob)

In [None]:
# Candidate track URI: integer-encoded, tagged as the item ID and given the
# extra "user_item" tag.
item_id = (
    ["track_uri_can"]
    >> Categorify(dtype="int32")
    >> TagAsItemID()
    >> AddMetadata(tags=["user_item"])
)
# Note: the playlist id column ("pid_pos_id") is not categorified or tagged as a
# user ID in this workflow.


# ---- Candidate-track (item) columns ----
item_features_cat = [
    "artist_name_can",
    "track_name_can",
    "artist_genres_can",
]

item_features_cont = [
    "duration_ms_can",
    "track_pop_can",
    "artist_pop_can",
    "artist_followers_can",
]

# ---- Playlist / seed-track (user-side) columns ----
playlist_features_cat = [
    "artist_name_seed_track",
    "artist_uri_seed_track",
    "track_name_seed_track",
    "track_uri_seed_track",
    "album_name_seed_track",
    "album_uri_seed_track",
    "artist_genres_seed_track",
    "description_pl",
    "name",
    "collaborative",
]

playlist_features_cont = [
    "duration_seed_track",
    "track_pop_seed_track",
    "artist_pop_seed_track",
    "artist_followers_seed_track",
    "duration_ms_seed_pl",
    "n_songs_pl",
    "num_artists_pl",
    "num_albums_pl",
]

# ---- Per-playlist sequence columns (subset of features to be tagged) ----
seq_feats_cont = [
    "duration_ms_songs_pl",
    "artist_pop_pl",
    "artists_followers_pl",
    "track_pop_pl",
]

# "track_uri_pl", "pid_pos_id" and "pos_pl" are currently left out of this list.
seq_feats_cat = [
    "artist_name_pl",
    "track_name_pl",
    "album_name_pl",
    "artist_genres_pl",
]

# Column groups handed to `to_parquet` as `cats` / `conts`.
CAT = [*playlist_features_cat, *item_features_cat]
CONT = [*item_features_cont, *playlist_features_cont]

# ---- Item tower inputs ----
# Categorical item columns: fill missing values, integer-encode, tag as item features.
item_feature_cat_node = (
    item_features_cat
    >> ops.FillMissing()
    >> Categorify(dtype="int32")
    >> TagAsItemFeatures()
)

# Continuous item columns: fill missing values, normalize, tag as item features.
item_feature_cont_node = (
    item_features_cont
    >> ops.FillMissing()
    >> ops.Normalize()
    >> TagAsItemFeatures()
)

# ---- Query (playlist) tower inputs ----
playlist_feature_cat_node = (
    playlist_features_cat
    >> ops.FillMissing()
    >> Categorify(dtype="int32")
    >> TagAsUserFeatures()
)

playlist_feature_cont_node = (
    playlist_features_cont
    >> ops.FillMissing()
    >> ops.Normalize()
    >> TagAsUserFeatures()
)

# ---- Sequence columns (additionally tagged Tags.SEQUENCE) ----
# Categorical sequences are sliced/padded to MAX_PADDING entries with pad value 0.
playlist_feature_cat_seq_node = (
    seq_feats_cat
    >> ops.FillMissing()
    >> Categorify(dtype="int32")
    >> ListSlice(MAX_PADDING, pad=True, pad_value=0)
    >> TagAsUserFeatures()
    >> ops.AddTags(Tags.SEQUENCE)
)

# Continuous sequences get no ListSlice step in this pipeline.
playlist_feature_cont_seq_node = (
    seq_feats_cont
    >> ops.FillMissing()
    >> ops.Normalize()
    >> TagAsUserFeatures()
    >> ops.AddTags(Tags.SEQUENCE)
)

# Combine every feature node into one output node and wrap it in a Workflow.
# (No playlist-id node is part of the sum.)
output = (
    item_id
    + item_feature_cat_node
    + item_feature_cont_node
    + playlist_feature_cat_node
    + playlist_feature_cont_node
    + playlist_feature_cont_seq_node
    + playlist_feature_cat_seq_node
)

workflow = nvt.Workflow(output)

In [None]:
# Show the `graph` attribute of the combined workflow node as the cell output.
output.graph

In [None]:
import pandas as pd

# Show full cell contents instead of truncating long text fields.
pd.options.display.max_colwidth = None

# Peek at a single row of the raw validation split.
valid.to_ddf().head(1)

In [None]:
# Everything produced by the ETL step lives under <BUCKET>/merlin-processed.
output_path = os.path.join(BUCKET, "merlin-processed")

# One sub-directory per artifact; each path keeps its trailing slash.
output_train_dir, output_valid_dir, output_workflow_dir = (
    os.path.join(output_path, subdir) for subdir in ("train/", "valid/", "workflow/")
)

print(f"Train data dir: {output_train_dir}\nValid data dir: {output_valid_dir}")

In [None]:
%%time

# Accumulator for the wall-clock time spent in preprocessing.
time_preproc = 0
time_preproc_start = time()

# Fit the workflow on the training split and transform it in the same pass,
# then write the result to `output_train_dir`.
train_transformed = workflow.fit_transform(train)
train_transformed.to_parquet(
    output_path=output_train_dir,
    shuffle=nvt.io.Shuffle.PER_PARTITION,
    cats=CAT,
    conts=CONT,
    output_files=50,  # preserve_files=True keeps the original file sharding
)

time_preproc += time() - time_preproc_start

In [None]:
# Persist the fitted workflow twice: first under the workflow output directory,
# then in the current working directory (used locally for the demo).
for workflow_target in (
    os.path.join(output_workflow_dir, '2t-spotify-workflow'),
    '2t-spotify-workflow',
):
    workflow.save(workflow_target)

In [None]:
# Output schema of the fitted workflow.
schema = workflow.output_schema

# Display it as the cell output.
schema

In [None]:
%%time

time_preproc_start = time()

# Apply the already-fitted workflow to the validation split (no re-fitting)
# and write the result to `output_valid_dir`.
valid_transformed = workflow.transform(valid)
wf_valid_op = valid_transformed.to_parquet(
    output_path=output_valid_dir,
    shuffle=nvt.io.Shuffle.PER_PARTITION,
    cats=CAT,
    conts=CONT,
    output_files=10,
)

time_preproc += time() - time_preproc_start

# Load the processed data into a Merlin Dataset and inspect the transforms

Now that ETL is complete, the fitted workflow has been saved and the processed data has been written under `output_path`.

In [None]:
# Recursively copy the saved workflow directory from GCS into the current
# working directory.
# NOTE(review): the source is the `workflow/` prefix, so the copy presumably
# lands at ./workflow/2t-spotify-workflow, while the next cell loads
# ./2t-spotify-workflow (written by the local `workflow.save`) - confirm.
!gsutil cp -r {output_workflow_dir} .

In [None]:
# Load back the workflow and schema from the local "2t-spotify-workflow"
# directory (a copy is also saved under
# spotify-builtin-2t/merlin-processed/workflow/2t-spotify-workflow).
workflow = nvt.Workflow.load("2t-spotify-workflow")
schema = workflow.output_schema
# Embedding sizes reported by nvtabular.ops.get_embedding_sizes for this workflow.
embeddings = ops.get_embedding_sizes(workflow)
embeddings

In [None]:
from merlin.models.utils.example_utils import workflow_fit_transform

from merlin.schema.tags import Tags

import merlin.models.tf as mm
from merlin.io.dataset import Dataset as MerlinDataset

# `output_train_dir` / `output_valid_dir` already end with "/", so appending
# "/*.parquet" directly would yield ".../train//*.parquet". On GCS the empty
# path segment is significant and the glob may match nothing, so strip the
# trailing slash before building the pattern.
train_glob_processed = output_train_dir.rstrip("/") + "/*.parquet"
valid_glob_processed = output_valid_dir.rstrip("/") + "/*.parquet"

# Re-open the processed parquet files as Merlin datasets carrying the workflow
# output schema. Note that `train` / `valid` now refer to the PROCESSED data.
train = MerlinDataset(train_glob_processed, schema=schema, part_size="500MB")
valid = MerlinDataset(valid_glob_processed, schema=schema, part_size="500MB")

# Use the schema attached to the loaded training dataset from here on.
schema = train.schema

In [None]:
# Columns the two-tower model may consume: item id / item features and
# user id / user features.
two_t_schema = schema.select_by_tag([Tags.ITEM_ID, Tags.ITEM, Tags.USER, Tags.USER_ID])

# Columns tagged as sequences are excluded from the model inputs.
two_t_schema_seq = schema.select_by_tag([Tags.SEQUENCE])
seq_col_names = set(two_t_schema_seq.column_names)

# Filter while preserving the schema's own column order. A plain set difference
# would order the names by string hash, which changes between Python processes
# and makes the model's input ordering non-reproducible.
non_seq_col_names = [
    col_name for col_name in two_t_schema.column_names if col_name not in seq_col_names
]

two_t_schema = two_t_schema[non_seq_col_names]
two_t_schema

### Embeddings

In [None]:
# format embeddings
embeddings_all = embeddings

emb_dims = {}
for k in list(embeddings_all.keys()):
    emb_dims.update({k: embeddings_all[k][1]})
emb_dims

### Model

In [None]:
def _build_tower():
    """Return a fresh 1024-512-256 MLP block with no activation on its last layer."""
    return mm.MLPBlock([1024, 512, 256], no_activation_last_layer=True)


# Two-tower retrieval model: identical MLP architecture for the query and item
# towers, in-batch negative sampling, embedding sizes inferred from the schema.
model = mm.TwoTowerModel(
    two_t_schema,
    query_tower=_build_tower(),
    item_tower=_build_tower(),
    samplers=[mm.InBatchSampler()],
    embedding_options=mm.EmbeddingOptions(infer_embedding_sizes=True),
)

In [None]:
%%time

# Ranking metrics passed to `compile`.
retrieval_metrics = [mm.RecallAt(1), mm.RecallAt(10), mm.NDCGAt(10)]

model.compile(optimizer="adam", run_eagerly=False, metrics=retrieval_metrics)
model.fit(train, validation_data=valid, batch_size=2048, epochs=3)

### Save Query Model

In [None]:
# Destination for the saved model, under the processed-data output path.
# NOTE(review): the heading and artifact name say "query model", but `save` is
# called on the whole `model` object, not on a query tower - confirm this is
# what downstream consumers expect.
artifact_path = os.path.join(output_path, 'spotify-2t-query-model')
model.save(artifact_path) #saves keras model

### Save Track Embeddings

In [None]:
from merlin.models.utils.dataset import unique_rows_by_features

# Unique item rows from the training data (selected by the ITEM features tag
# and the ITEM_ID tag), materialised with a fresh 0..n-1 index.
unique_items = unique_rows_by_features(train, Tags.ITEM, Tags.ITEM_ID)
item_features = unique_items.compute().reset_index(drop=True)

# Pass the unique items to the model's `item_embeddings` in batches of 1024.
item_dataset = MerlinDataset(item_features, schema=schema)
item_embs = model.item_embeddings(item_dataset, batch_size=1024)

# Materialise the embeddings with the synchronous scheduler.
item_embs_df = item_embs.compute(scheduler="synchronous")