In [1]:
import datetime
import os
import pprint
import tempfile
from typing import Dict, Text
import numpy as np
import pandas as pd
import seaborn as sns
import surprise
import tensorflow as tf
import tensorflow_recommenders as tfrs
import tensorflow_datasets as tfds
import mlflow

# Load Data

In [23]:
df_implicit = pd.read_csv('implicit.csv')
df_ratings = pd.read_csv('ratings.csv')
df_content = pd.read_csv('content.csv')

In [21]:
df_ratings

Unnamed: 0,user_id,content_id,engaged_pct
0,000a544834,34caa29b86,0.000000
1,000a544834,3d0e786812,0.000000
2,000a544834,5c7ee2dd80,0.500000
3,000a544834,6f7ca40e85,0.000000
4,000a544834,745115bd62,0.000000
...,...,...,...
197557,fff5c815f7,b11fec4c92,0.000000
197558,fff5c815f7,b2f77c9143,0.666667
197559,fff5c815f7,bed9cfbff5,0.000000
197560,fff5c815f7,c29f53db23,0.000000


In [22]:
df_content

Unnamed: 0,content_id
0,f260f3f3ce
1,bce874342a
2,bbf2753b99
3,a9b6c93c3a
4,0c93cc1338
...,...
578,2ca0fce09f
579,5c57c25574
580,58bb410bc8
581,a842a97d1a


In [24]:
df_implicit

Unnamed: 0,user_id,content_id,count
0,000a544834,5c7ee2dd80,1
1,000a544834,f8e78525fc,1
2,000d2a6006,03bc268791,1
3,000d2a6006,0b3e27a1d1,1
4,000d2a6006,1d39c1c449,3
...,...,...,...
120237,fff07ebc97,5df80dbcc0,2
120238,fff07ebc97,bed9cfbff5,2
120239,fff5c815f7,1c64591529,1
120240,fff5c815f7,7e36cdfc2c,2


# Modeling
Implement both retrieval and ranking models. Track with MlFlow. This system is for: user -> item. Generally calculated using a dot product for matrix factorization. Item -> item is usualy on a product page to recommend similar products. Can get item to item using cosine similarity measure.  Optimizing for a single objective (multi object is avaailable - ratings and clicks)

<strong>Retrieval</strong> - two tower - https://blog.tensorflow.org/2020/09/introducing-tensorflow-recommenders.html
takes a user id and returns a subset of good candidates. Oversimplies model to implicit (engaged implicit +, viewed implicit -). The oversimplification speeds up the model.

<strong>ranking</strong> - https://www.tensorflow.org/recommenders/examples/basic_ranking
takes ad user id and a subset of good candidates and returns the best candidate. Here we use engagement pct as a proxy for an explicit rating. ranking is inherently expensive, so we want to only rank a small subset of possibilities.



## Load data into TensorFlow

In [5]:
# mlflow autolog not working with this model
#mlflow.tensorflow.autolog() #keras autolog is deprecated

In [27]:
#global variables used in getting predictions
USER_ID = "82216a2c7d"
CONTENT_ID = "49a9f915b5"

implicit_raw = tf.data.Dataset.from_tensor_slices(
    dict(df_implicit[["user_id", "content_id"]])
)

ratings_raw = tf.data.Dataset.from_tensor_slices(
    dict(df_ratings[["user_id", "content_id", "engaged_pct"]])
)
content_raw = tf.data.Dataset.from_tensor_slices(dict(df_content[["content_id"]]))
print('implicit: ', implicit_raw.element_spec)
print('ratings: ', ratings_raw.element_spec)
print('content: ', content_raw.element_spec)

implicit:  {'user_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'content_id': TensorSpec(shape=(), dtype=tf.string, name=None)}
ratings:  {'user_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'content_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'engaged_pct': TensorSpec(shape=(), dtype=tf.float64, name=None)}
content:  {'content_id': TensorSpec(shape=(), dtype=tf.string, name=None)}


## Retrieval

In [28]:
#map data
ratings_retrieval = implicit_raw.map(
    lambda x: {"user_id": x["user_id"], "content_id": x["content_id"]}
)

content_retrieval = content_raw.map(lambda x: x["content_id"])

print("users_content length: ", len(ratings_retrieval))
print("content length: ", len(content_retrieval))

users_content length:  120242
content length:  583


In [29]:
# Ratings: Shuffle and split into train and test

tf.random.set_seed(42)
shuffled_ratings = ratings_retrieval.shuffle(
    buffer_size=1_000, seed=42, reshuffle_each_iteration=False
)
print("length of shuffled: ", len(shuffled_ratings))

to_take_ratings = int(len(shuffled_ratings) * 0.75)
remaining_ratings = int(len(shuffled_ratings) - to_take_ratings)

print("training set size: ", to_take_ratings)
print("test set size: ", remaining_ratings)

train_retrieval = shuffled_ratings.take(to_take_ratings)
test_retrieval = shuffled_ratings.skip(to_take_ratings).take(remaining_ratings)

length of shuffled:  120242
training set size:  90181
test set size:  30061


In [30]:
# get uniques

content_ids_retrieval = content_retrieval.batch(1_000)
user_ids_retrieval = ratings_retrieval.batch(1_000).map(lambda x: x["user_id"])

unique_content_ids_retrieval = np.unique(np.concatenate(list(content_ids_retrieval)))
unique_user_ids_retrieval = np.unique(np.concatenate(list(user_ids_retrieval)))

print("number of unique titles: ", len(unique_content_ids_retrieval))
print("number of unique users: ", len(unique_user_ids_retrieval))

# look at top 10
unique_content_ids_retrieval[:5]

number of unique titles:  583
number of unique users:  7066


array([b'00733d9430', b'008c372b81', b'01af663e58', b'024775977e',
       b'02538907fb'], dtype=object)

### Define Model

In [31]:
embedding_dimension = 32

In [32]:
user_retrieval_model = tf.keras.Sequential(
    [
        tf.keras.layers.StringLookup(
            vocabulary=unique_user_ids_retrieval, mask_token=None
        ),
        tf.keras.layers.Embedding(
            len(unique_user_ids_retrieval) + 1, embedding_dimension
        ),
    ]
)

In [33]:
content_retrieval_model = tf.keras.Sequential(
    [
        tf.keras.layers.StringLookup(
            vocabulary=unique_content_ids_retrieval, mask_token=None
        ),
        tf.keras.layers.Embedding(
            len(unique_content_ids_retrieval) + 1, embedding_dimension
        ),
    ]
)

metrics = tfrs.metrics.FactorizedTopK(
    candidates=content_retrieval.batch(128).map(content_retrieval_model)
)
TASK_RETRIEVAL = tfrs.tasks.Retrieval(metrics=metrics)

In [34]:
# defined as the dot product between user and item embeddings. corresponds to matrix factorization.


class RetrievalModel(tfrs.Model):
    def __init__(self, user_model, content_model):
        super().__init__()
        self.content_model: tf.keras.Model = content_model
        self.user_model: tf.keras.Model = user_model
        self.task: tf.keras.layers.Layer = TASK_RETRIEVAL

    def compute_loss(
        self, features: Dict[Text, tf.Tensor], training=False
    ) -> tf.Tensor:
        # We pick out the user features and pass them into the user model.
        user_embeddings = self.user_model(features["user_id"])

        # And pick out the content features and pass them into the coontent model,
        # getting embeddings back.
        positive_content_embeddings = self.content_model(features["content_id"])

        # The task computes the loss and the metrics.
        return self.task(user_embeddings, positive_content_embeddings)

### Fit and Evaluate

In [35]:
model_retrieval = RetrievalModel(user_retrieval_model, content_retrieval_model)
model_retrieval.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [36]:
cached_train_retrieval = (
    train_retrieval.shuffle(buffer_size=100_000).batch(8192).cache()
)
cached_test_retrieval = test_retrieval.batch(4096).cache()

In [37]:
# need shapes for mlflow
cached_train_retrieval.element_spec

{'user_id': TensorSpec(shape=(None,), dtype=tf.string, name=None),
 'content_id': TensorSpec(shape=(None,), dtype=tf.string, name=None)}

In [40]:
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

model_retrieval.fit(cached_train_retrieval, epochs=10, callbacks=[tensorboard_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fa3a74b8370>

In [59]:
#model_retrieval.get_config()
#model_retrieval.summary()
#model_retrieval.buid(input_shape=(None))
#model_retrieval.compute_output_shape(input_shape=((None,))) #set this for mlflow

In [42]:
model_retrieval.evaluate(cached_test_retrieval, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.001330627710558474,
 'factorized_top_k/top_5_categorical_accuracy': 0.009281128644943237,
 'factorized_top_k/top_10_categorical_accuracy': 0.02119024656713009,
 'factorized_top_k/top_50_categorical_accuracy': 0.10252486914396286,
 'factorized_top_k/top_100_categorical_accuracy': 0.19097834825515747,
 'loss': 10042.3583984375,
 'regularization_loss': 0,
 'total_loss': 10042.3583984375}

### Get Content Recommendations - Brute Force
The most straightforward way of retrieving top candidates in response to a query is to do it via brute force: compute user-movie scores for all possible movies, sort them, and pick a couple of top recommendations.

As the number of candidate grows, the amount of time needed grows linearly: with 10 million candidates, serving top candidates would take 250 milliseconds. This is clearly too slow for a live service. This is where approximate mechanisms come in. We can use ScaNN in TFRS to accomplish this via the tfrs.layers.factorized_top_k.ScaNN layer.

In [19]:
# Create BruteForce model that takes in raw user_id features, and
brute_force_index = tfrs.layers.factorized_top_k.BruteForce(model_retrieval.user_model)

# recommends content out of the entire content dataset.
brute_force_index.index_from_dataset(
    tf.data.Dataset.zip(
        (
            content_retrieval.batch(100),
            content_retrieval.batch(100).map(model_retrieval.content_model),
        )
    )
)

# Get recommendations for:
_, content_ids_recommended = brute_force_index(tf.constant([USER_ID]))
print(
    f"content_id recommendations for user {USER_ID}: {content_ids_recommended[0, :5]}"
)

content_id recommendations for user 82216a2c7d: [b'49a9f915b5' b'a533f2f57a' b'20b0745a31' b'487d868f96' b'19e9799800']


## Ranking
* pointwise - candidates are considered independently of each other (currently using)
* listwise - candidates pairs are considered, consider the ordering of the entire list
* can use explicit feedback (engage pct) as a ranking

### Load Data into TensorFlow

In [39]:
ratings_raw.element_spec

{'user_id': TensorSpec(shape=(), dtype=tf.string, name=None),
 'content_id': TensorSpec(shape=(), dtype=tf.string, name=None),
 'engaged_pct': TensorSpec(shape=(), dtype=tf.float64, name=None)}

In [40]:
# use user_rating
ratings_ranking = ratings_raw.map(
    lambda x: {
        "user_id": x["user_id"],
        "content_id": x["content_id"],
        "engaged_pct": x["engaged_pct"],
    }
)

In [42]:
# suffle and train/test split
tf.random.set_seed(42)
shuffled_ranking = ratings_ranking.shuffle(
    buffer_size=1_000, seed=42, reshuffle_each_iteration=False
)
print("length of shuffled: ", len(shuffled_ranking))

to_take_ranking = int(len(shuffled_ranking) * 0.75)
remaining_ranking = int(len(shuffled_ranking) - to_take_ranking)

print("training set size: ", to_take_ranking)
print("test set size: ", remaining_ranking)

train_ranking = shuffled_ranking.take(to_take_ranking)
test_ranking = shuffled_ranking.skip(to_take_ranking).take(remaining_ranking)

shuffled_ranking.element_spec

length of shuffled:  197562
training set size:  148171
test set size:  49391


{'user_id': TensorSpec(shape=(), dtype=tf.string, name=None),
 'content_id': TensorSpec(shape=(), dtype=tf.string, name=None),
 'engaged_pct': TensorSpec(shape=(), dtype=tf.float64, name=None)}

In [43]:
# get uniques. all from ratings
content_ids_ranking = ratings_ranking.batch(1_000_000).map(lambda x: x["content_id"])
user_ids_ranking = ratings_ranking.batch(1_000_000).map(lambda x: x["user_id"])

unique_content_ids_ranking = np.unique(np.concatenate(list(content_ids_ranking)))
unique_user_ids_ranking = np.unique(np.concatenate(list(user_ids_ranking)))

### Define Model

In [47]:
class RankingDefinitionModel(tf.keras.Model):
    def __init__(self):
        super().__init__()
        embedding_dimension = 32

        # Compute embeddings for users.
        self.user_embeddings = tf.keras.Sequential(
            [
                tf.keras.layers.StringLookup(
                    vocabulary=unique_user_ids_ranking, mask_token=None
                ),
                tf.keras.layers.Embedding(
                    len(unique_user_ids_ranking) + 1, embedding_dimension
                ),
            ]
        )

        # Compute embeddings for content_ids.
        self.content_embeddings = tf.keras.Sequential(
            [
                tf.keras.layers.StringLookup(
                    vocabulary=unique_content_ids_ranking, mask_token=None
                ),
                tf.keras.layers.Embedding(
                    len(unique_content_ids_ranking) + 1, embedding_dimension
                ),
            ]
        )

        # Compute predictions.
        self.ratings = tf.keras.Sequential(
            [
                # Learn multiple dense layers.
                tf.keras.layers.Dense(256, activation="relu"),
                tf.keras.layers.Dense(64, activation="relu"),
                # Make rating predictions in the final layer.
                tf.keras.layers.Dense(1),
            ]
        )

    def call(self, inputs):

        user_id, content_id = inputs

        user_embedding = self.user_embeddings(user_id)
        content_embedding = self.content_embeddings(content_id)

        return self.ratings(tf.concat([user_embedding, content_embedding], axis=1))

In [48]:
# user_id, content_id
RankingDefinitionModel()(([USER_ID], [CONTENT_ID]))



<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.00753504]], dtype=float32)>

In [49]:
TASK_RANKING = tfrs.tasks.Ranking(
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.metrics.RootMeanSquaredError()],
)

In [50]:
class RankingsModel(tfrs.models.Model):
    def __init__(self):
        super().__init__()
        self.ranking_model: tf.keras.Model = RankingDefinitionModel()
        self.task: tf.keras.layers.Layer = TASK_RANKING

    def call(self, features: Dict[str, tf.Tensor]) -> tf.Tensor:
        return self.ranking_model((features["user_id"], features["content_id"]))

    def compute_loss(
        self, features: Dict[Text, tf.Tensor], training=False
    ) -> tf.Tensor:
        labels = features.pop("engaged_pct")

        rating_predictions = self(features)

        # The task computes the loss and the metrics.
        return self.task(labels=labels, predictions=rating_predictions)

### Fit and Evaluate

In [51]:
model_ranking = RankingsModel()
model_ranking.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [52]:
cached_train_ranking = train_ranking.shuffle(100_000).batch(8192).cache()
cached_test_ranking = test_ranking.batch(4096).cache()

In [53]:
model_ranking.fit(cached_train_ranking, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fbd2f72eeb0>

In [54]:
model_ranking.evaluate(cached_test_ranking, return_dict=True)



{'root_mean_squared_error': 0.34809935092926025,
 'loss': 0.10958518087863922,
 'regularization_loss': 0,
 'total_loss': 0.10958518087863922}

### Get Content Recommendations - Ranking

In [57]:
final_ratings = {}

content_ids_from_retrieval = content_ids_recommended[0]
print("content from retrieval (unsorted content_ids): ", content_ids_from_retrieval)

for content_id in content_ids_from_retrieval:
    content_id = content_id.numpy()
    final_ratings[content_id] = model_ranking(
        # user | content_id
        {"user_id": np.array([f"{USER_ID}"]), "content_id": np.array([content_id])}
    )

print("----------------")
print("Ranked Content")
print(f"Ratings for user {USER_ID}:")
print("content_id | score")


for content_id, score in sorted(
    final_ratings.items(), key=lambda x: x[1], reverse=True
):
    print(f"{content_id}: {score}")

content from retrieval (unsorted content_ids):  tf.Tensor(
[b'49a9f915b5' b'a533f2f57a' b'20b0745a31' b'487d868f96' b'19e9799800'
 b'a1671b9953' b'9f943715a3' b'25ad9f4d85' b'd7dad09669' b'73e16a0447'], shape=(10,), dtype=string)
----------------
Ranked Content
Ratings for user 82216a2c7d:
content_id | score
b'73e16a0447': [[0.36834288]]
b'a1671b9953': [[0.367317]]
b'487d868f96': [[0.36708573]]
b'd7dad09669': [[0.36350384]]
b'25ad9f4d85': [[0.35258558]]
b'a533f2f57a': [[0.3521912]]
b'49a9f915b5': [[0.3467191]]
b'19e9799800': [[0.33223924]]
b'20b0745a31': [[0.3312234]]
b'9f943715a3': [[0.32872432]]
