In [4]:
import os
import pprint
import tempfile

In [5]:
from typing import Dict, Text

In [6]:
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

In [7]:
import tensorflow_recommenders as tfrs

In [11]:
import pandas as pd

### Preparing the dataset

In [12]:
def load_data_file_cold(file, stats):
    print('loading file:' + file)
    training_df = pd.read_csv(
        file,
        skiprows=[0],
        names=["viewer","broadcaster","viewer_age","viewer_gender","viewer_longitude","viewer_latitude","viewer_lang","viewer_country","broadcaster_age","broadcaster_gender","broadcaster_longitude","broadcaster_latitude","broadcaster_lang","broadcaster_country","duration", "viewer_network", "broadcaster_network", "count"], dtype={
            'viewer': np.unicode,
            'broadcaster': np.unicode,
            'viewer_age': np.single,
            'viewer_gender': np.unicode,
            'viewer_longitude': np.single,
            'viewer_latitude': np.single,
            'viewer_lang': np.unicode,
            'viewer_country': np.unicode,
            'broadcaster_age': np.single,
            'broadcaster_longitude': np.single,
            'broadcaster_latitude': np.single,
            'broadcaster_lang': np.unicode,
            'broadcaster_country': np.unicode,
            'viewer_network': np.unicode,
            'broadcaster_network': np.unicode,
            'count': np.int
        })

    values = {
        'viewer': 'unknown',
        'broadcaster': 'unknown',
        'viewer_age': 30,
        'viewer_gender': 'unknown',
        'viewer_longitude': 0,
        'viewer_latitude': 0,
        'viewer_lang': 'unknown',
        'viewer_country': 'unknown',
        'broadcaster_age': 30,
        'broadcaster_longitude': 0,
        'broadcaster_latitude': 0,
        'broadcaster_lang': 'unknown',
        'broadcaster_country': 'unknown',
        'duration': 0,
        'viewer_network': 'unknown',
        'broadcaster_network': 'unknown',
        'count': 0
    }
    training_df.fillna(value=values, inplace=True)
#     print(training_df.head(10))
#     print(training_df.iloc[-10:])
#     stats.send_stats('data-size', len(training_df.index))

    sampled_df = training_df.sample(frac=0.1)
    print(sampled_df.head(10))
    print(sampled_df.iloc[-10:])
    return sampled_df

def load_training_data_cold(file, stats):
    ratings_df = load_data_file_cold(file, stats)
    print('creating data set')
    training_ds = (
        tf.data.Dataset.from_tensor_slices(
            ({
                "viewer": tf.cast(
                    ratings_df['viewer'].values,
                    tf.string),
                "viewer_gender": tf.cast(
                    ratings_df['viewer_gender'].values,
                    tf.string),
                "viewer_lang": tf.cast(
                    ratings_df['viewer_lang'].values,
                    tf.string),
                "viewer_country": tf.cast(
                    ratings_df['viewer_country'].values,
                    tf.string),
                "viewer_age": tf.cast(
                    ratings_df['viewer_age'].values,
                    tf.int16),
                "viewer_longitude": tf.cast(
                    ratings_df['viewer_longitude'].values,
                    tf.float16),
                "viewer_latitude": tf.cast(
                    ratings_df['viewer_latitude'].values,
                    tf.float16),
                "broadcaster": tf.cast(
                    ratings_df['broadcaster'].values,
                    tf.string),
                "viewer_network": tf.cast(
                    ratings_df['viewer_network'].values,
                    tf.string),
                "broadcaster_network": tf.cast(
                    ratings_df['broadcaster_network'].values,
                    tf.string),
            })))

    return training_ds

In [16]:
def prepare_training_data_cold(train_ds):
    print('prepare_training_data')
    training_ds = train_ds.cache().map(lambda x: {
        "broadcaster": x["broadcaster"],
        "viewer": x["viewer"],
        "viewer_gender": x["viewer_gender"],
        "viewer_lang": x["viewer_lang"],
        "viewer_country": x["viewer_country"],
        "viewer_age": x["viewer_age"],
        "viewer_longitude": x["viewer_longitude"],
        "viewer_latitude": x["viewer_latitude"],
        "viewer_network": x["viewer_network"],
        "broadcaster_network": x["broadcaster_network"],
    }, num_parallel_calls=tf.data.AUTOTUNE,
       deterministic=False)

    print('done prepare_training_data')
    return training_ds

def get_broadcaster_data_set(train_ds):
    broadcasters = train_ds.cache().map(lambda x: x["broadcaster"], num_parallel_calls=tf.data.AUTOTUNE, deterministic=False)
    broadcasters_ds = tf.data.Dataset.from_tensor_slices(
        np.unique(list(broadcasters.as_numpy_iterator())))
    return broadcasters_ds

def get_list(training_data, key):
    return training_data.batch(1_000_000).map(lambda x: x[key], num_parallel_calls=tf.data.AUTOTUNE, deterministic=False)

def get_unique_list(data):
    return np.unique(np.concatenate(list(data)))

In [19]:
ratings = load_training_data_cold(file="a3d86f3b-eb45-4641-b05d-30dff7423e6b.csv", stats="")

loading file:a3d86f3b-eb45-4641-b05d-30dff7423e6b.csv
                   viewer       broadcaster  viewer_age viewer_gender  \
3508724     pof:327647531  meetme:284944392        41.0          male   
2917344     pof:333753109     pof:331124573        43.0          male   
2225781     pof:142085441   skout:173905746        32.0          male   
5015864  meetme:314815889  meetme:177004661        29.0          male   
821050    skout:170536582   skout:169786744        33.0          male   
1659425   skout:134491552  meetme:229602082        26.0        female   
2679288  meetme:318071034  meetme:280218146        22.0        female   
102293    meetme:29821282  meetme:264439298        48.0        female   
568169    skout:105089058   skout:178518892        36.0          male   
1448898  meetme:309480199  meetme:298423614        28.0          male   

         viewer_longitude  viewer_latitude viewer_lang viewer_country  \
3508724        -96.800003        33.000000          en             US

In [20]:
for x in ratings.take(2).as_numpy_iterator():
    pprint.pprint(x)

{'broadcaster': b'meetme:284944392',
 'broadcaster_network': b'meetme',
 'viewer': b'pof:327647531',
 'viewer_age': 41,
 'viewer_country': b'US',
 'viewer_gender': b'male',
 'viewer_lang': b'en',
 'viewer_latitude': 33.0,
 'viewer_longitude': -96.8,
 'viewer_network': b'pof'}
{'broadcaster': b'pof:331124573',
 'broadcaster_network': b'pof',
 'viewer': b'pof:333753109',
 'viewer_age': 43,
 'viewer_country': b'US',
 'viewer_gender': b'male',
 'viewer_lang': b'en',
 'viewer_latitude': 38.3,
 'viewer_longitude': -77.4,
 'viewer_network': b'pof'}


In [None]:
ratings = ratings.map(lambda x: {
    "broadcaster": x["broadcaster"],
    "viewer": x["viewer"],    
})

In [21]:
broadcaster = ratings.map(lambda x: x["broadcaster"])

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'


In [22]:
for x in broadcaster.take(2).as_numpy_iterator():
    pprint.pprint(x)

b'meetme:284944392'
b'pof:331124573'


In [23]:
tf.random.set_seed(42)
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

### a vocabulary that maps a raw feature value to an integer in a contiguous range

In [24]:
broadcaster_id = broadcaster.batch(1_000)
user_ids = ratings.batch(1_000_000).map(lambda x: x["viewer"])

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'


In [25]:
unique_broadcaster_id = np.unique(np.concatenate(list(broadcaster_id)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

In [26]:
unique_broadcaster_id[:10]

array([b'meetme:100081867', b'meetme:100104254', b'meetme:100114731',
       b'meetme:100130022', b'meetme:100190086', b'meetme:100201554',
       b'meetme:100237066', b'meetme:100279809', b'meetme:100300152',
       b'meetme:100345849'], dtype=object)

In [27]:
unique_user_ids[:10]

array([b'meetme:100116030', b'meetme:100142157', b'meetme:100151379',
       b'meetme:10015227', b'meetme:100190086', b'meetme:100196265',
       b'meetme:100197023', b'meetme:100200365', b'meetme:100201554',
       b'meetme:100237066'], dtype=object)

### Implementing a model

### The query tower

In [28]:
embedding_dimension = 32

In [29]:
user_model = tf.keras.Sequential([
  tf.keras.layers.experimental.preprocessing.StringLookup(
      vocabulary=unique_user_ids, mask_token=None),
  # We add an additional embedding to account for unknown tokens.
  tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
])

### The candidate tower

In [30]:
broadcaster_model = tf.keras.Sequential([
  tf.keras.layers.experimental.preprocessing.StringLookup(
      vocabulary=unique_broadcaster_id, mask_token=None),
  tf.keras.layers.Embedding(len(unique_broadcaster_id) + 1, embedding_dimension)
])

### Metrics

In [33]:
metrics = tfrs.metrics.FactorizedTopK(
  candidates=broadcaster.batch(128).map(broadcaster_model)
)

### Loss

In [34]:
task = tfrs.tasks.Retrieval(
  metrics=metrics
)

### The full model

In [58]:
class FinalModel(tfrs.Model):
    def __init__(self, user_model, broadcaster_model):
        super().__init__()
        self.broadcaster_model: tf.keras.Model = broadcaster_model
        self.user_model: tf.keras.Model = user_model
        self.task: tf.keras.layers.Layer = task
    
    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        # We pick out the user features and pass them into the user model.
        user_embeddings = self.user_model(features["viewer"])
        # And pick out the movie features and pass them into the movie model,
        # getting embeddings back.
        positive_broadcaster_embeddings = self.broadcaster_model(features["broadcaster"])
        
        # The task computes the loss and the metrics.
        return self.task(user_embeddings, positive_broadcaster_embeddings)

In [59]:
class NoBaseClassModel ( tf.keras.Model ):
    def __init__ ( self , user_model , broadcaster_model ):
        super ( ).__init__ ( )
        self.broadcaster_model: tf.keras.Model = broadcaster_model
        self.user_model: tf.keras.Model = user_model
        self.task: tf.keras.layers.Layer = task

    def train_step ( self , features: Dict [ Text , tf.Tensor ] ) -> tf.Tensor:
        # Set up a gradient tape to record gradients.
        with tf.GradientTape ( ) as tape:
            # Loss computation.
            user_embeddings = self.user_model ( features [ "viewer" ] )
            positive_broadcaster_embeddings = self.broadcaster_model ( features [ "broadcaster" ] )
            loss = self.task ( user_embeddings , positive_broadcaster_embeddings )

            # Handle regularization losses as well.
            regularization_loss = sum ( self.losses )

            total_loss = loss + regularization_loss

        gradients = tape.gradient ( total_loss , self.trainable_variables )
        self.optimizer.apply_gradients ( zip ( gradients , self.trainable_variables ) )

        metrics = {metric.name: metric.result ( ) for metric in self.metrics}
        metrics [ "loss" ] = loss
        metrics [ "regularization_loss" ] = regularization_loss
        metrics [ "total_loss" ] = total_loss

        return metrics

    def test_step ( self , features: Dict [ Text , tf.Tensor ] ) -> tf.Tensor:
        # Loss computation.
        user_embeddings = self.user_model ( features [ "viewer" ] )
        positive_broadcaster_embeddings = self.broadcaster_model ( features [ "broadcaster" ] )
        loss = self.task ( user_embeddings , positive_broadcaster_embeddings )

        # Handle regularization losses as well.
        regularization_loss = sum ( self.losses )

        total_loss = loss + regularization_loss

        metrics = {metric.name: metric.result ( ) for metric in self.metrics}
        metrics [ "loss" ] = loss
        metrics [ "regularization_loss" ] = regularization_loss
        metrics [ "total_loss" ] = total_loss

        return metrics

### Fitting and evaluating

In [60]:
model = FinalModel(user_model, broadcaster_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [61]:
cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()

In [62]:
model.fit(cached_train, epochs=3)

Epoch 1/3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and at

Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fc4dc3d0950>

In [63]:
model.evaluate(cached_test, return_dict=True)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the f

{'factorized_top_k/top_1_categorical_accuracy': 0.00044999999227002263,
 'factorized_top_k/top_5_categorical_accuracy': 0.00044999999227002263,
 'factorized_top_k/top_10_categorical_accuracy': 0.00044999999227002263,
 'factorized_top_k/top_50_categorical_accuracy': 0.00044999999227002263,
 'factorized_top_k/top_100_categorical_accuracy': 0.0005000000237487257,
 'loss': 29620.759765625,
 'regularization_loss': 0,
 'total_loss': 29620.759765625}

### Making predictions

In [65]:
# Create a model that takes in raw query features, and
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
# recommends movies out of the entire movies dataset.
index.index_from_dataset(
  tf.data.Dataset.zip((broadcaster.batch(100), ratings.batch(100).map(model.broadcaster_model)))
)

# Get recommendations.
_, titles = index(tf.constant(["meetme:100116030"]))
print(f"Recommendations for user meetme:100116030: {titles[0, :3]}")

Consider rewriting this model with the Functional API.


  [n for n in tensors.keys() if n not in ref_input_names])


Recommendations for user meetme:100116030: [b'pof:312971369' b'pof:312971369' b'pof:312971369']


### Model serving

In [66]:
# Export the query model.
with tempfile.TemporaryDirectory ( ) as tmp:
    path = os.path.join(tmp , "model" )
    
    # Save the index.
    tf.saved_model.save (index , path )
    
    # Load it back; can also be done in TensorFlow Serving.
    loaded = tf.saved_model.load ( path )
    
    # Pass a user id in, get top predicted movie titles back.
    scores , broadcasters = loaded ( [ "meetme:100116030" ] )
    
    print ( f"Recommendations: {broadcasters [ 0 ] [ :3 ]}" )

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Caus



Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
INFO:tensorflow:Assets written to: /var/folders/pl/p61pv4q90019r6vd80xnfprc0000gn/T/tmpc41cp75_/model/assets


INFO:tensorflow:Assets written to: /var/folders/pl/p61pv4q90019r6vd80xnfprc0000gn/T/tmpc41cp75_/model/assets


Recommendations: [b'pof:312971369' b'pof:312971369' b'pof:312971369']
