### Set up the environment

In [1]:
import boto3
from sagemaker import get_execution_role

In [2]:
!pip install tensorflow==2.5.0



In [3]:
!pip install tensorflow-recommenders==0.5.2



In [4]:
# Execution role reported by the SageMaker SDK (not referenced again in the visible cells).
role = get_execution_role()

# The training CSV lives at s3://<bucket>/<prefix>/<data_key>.
bucket, prefix, data_key = "ling-cold-start-data", "2021-10-05", "2021-10-05.csv"
data_location = f"s3://{bucket}/{prefix}/{data_key}"

In [5]:
import os
import tempfile
from typing import Dict, Text
import pprint 

In [6]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs

### Model definition

In [7]:
class UserModel(tf.keras.Model):
    """Viewer (query) tower: one learned embedding per viewer feature, concatenated.

    Every categorical feature goes through a ``StringLookup`` followed by an
    ``Embedding``; the age is bucketised before being embedded. ``call`` joins
    the six embeddings along axis 1, i.e. 4 + 10 + 10 + 4 + 2 + 2 = 32 values
    per row.

    Args:
        unique_genders: vocabulary for ``viewer_gender``.
        unique_langs: vocabulary for ``viewer_lang``.
        unique_countries: vocabulary for ``viewer_country``.
        viewer_age: accepted for interface compatibility; not read by this class.
        unique_networks: vocabulary for ``viewer_network``.
        unique_clusters: vocabulary for ``viewer_lat_long_cluster``.
    """

    def __init__(self, unique_genders, unique_langs, unique_countries, viewer_age, unique_networks, unique_clusters):
        super().__init__()

        def lookup_then_embed(vocabulary, width):
            # The "+ 1" row is presumably for StringLookup's out-of-vocabulary
            # index (no mask token is configured) -- confirm against the TF docs.
            lookup = tf.keras.layers.experimental.preprocessing.StringLookup(
                vocabulary=vocabulary, mask_token=None)
            embedding = tf.keras.layers.Embedding(len(vocabulary) + 1, width)
            return tf.keras.Sequential([lookup, embedding])

        self.gender_embedding = lookup_then_embed(unique_genders, 4)
        self.lang_embedding = lookup_then_embed(unique_langs, 10)
        self.country_embedding = lookup_then_embed(unique_countries, 10)
        self.network_embedding = lookup_then_embed(unique_networks, 4)

        # Age bucket edges. The embedding has one row per edge; the trailing
        # +inf edge presumably keeps every finite age inside that index range
        # -- TODO confirm against Discretization's bucket numbering.
        age_bucket_edges = np.array([18, 25, 30, 35, 40, 45, 50, 55, 60, 65, float("inf")])
        age_bucketizer = tf.keras.layers.experimental.preprocessing.Discretization(
            age_bucket_edges.tolist())
        self.viewer_age_embedding = tf.keras.Sequential([
            age_bucketizer,
            tf.keras.layers.Embedding(len(age_bucket_edges), 2),
        ])

        self.viewer_lat_long_embedding = lookup_then_embed(unique_clusters, 2)

    def call(self, inputs):
        """Embed each viewer feature in ``inputs`` (a dict) and concatenate along axis 1."""
        per_feature = [
            self.gender_embedding(inputs["viewer_gender"]),
            self.lang_embedding(inputs["viewer_lang"]),
            self.country_embedding(inputs["viewer_country"]),
            self.network_embedding(inputs["viewer_network"]),
            self.viewer_age_embedding(inputs["viewer_age"]),
            self.viewer_lat_long_embedding(inputs["viewer_lat_long_cluster"]),
        ]
        return tf.concat(per_feature, axis=1)

In [8]:
class BroadcasterModel(tf.keras.Model):
    """Broadcaster (candidate) tower: a learned embedding per broadcaster id.

    Args:
        unique_movie_titles: vocabulary of broadcaster ids (the parameter name
            does not match its use here; it is kept for existing callers).
        dims: width of the embedding vectors.
    """

    def __init__(self, unique_movie_titles, dims):
        super().__init__()

        id_lookup = tf.keras.layers.experimental.preprocessing.StringLookup(
            vocabulary=unique_movie_titles, mask_token=None)
        # One more row than the vocabulary has entries.
        id_embedding = tf.keras.layers.Embedding(len(unique_movie_titles) + 1, dims)
        self.broadcaster_embedding = tf.keras.Sequential([id_lookup, id_embedding])

    def call(self, broadcaster):
        """Return the embedding of each id in ``broadcaster``."""
        embedded = self.broadcaster_embedding(broadcaster)
        # A single-element concat, kept as in the original graph.
        return tf.concat([embedded], axis=1)

### Load data

In [9]:
def load_data_file_cold(file, stats):
    """Read the interaction CSV into a DataFrame and fill in missing values.

    The first line of the file is skipped and the 19 columns are named
    positionally. Text columns are read as ``str``, coordinates and ages as
    32-bit floats, ``rank`` as ``int``; ``duration`` is left to pandas' type
    inference. The first and last ten rows are printed.

    Args:
        file: path or URL of the CSV. Must be a ``str`` because it is
            concatenated into the progress message.
        stats: unused; kept so existing callers keep working.

    Returns:
        pandas.DataFrame with missing values replaced: ``'unknown'`` for text
        columns (``'0'`` for ``viewer_lat_long_cluster``), 30 for ages, 0 for
        coordinates and ``duration``, 1 for ``rank``.
    """
    print('loading file:' + file)

    column_names = [
        "viewer",
        "broadcaster",
        "viewer_age",
        "viewer_gender",
        "viewer_longitude",
        "viewer_latitude",
        "viewer_lang",
        "viewer_country",
        "broadcaster_age",
        "broadcaster_gender",
        "broadcaster_longitude",
        "broadcaster_latitude",
        "broadcaster_lang",
        "broadcaster_country",
        "duration",
        "viewer_network",
        "broadcaster_network",
        "viewer_lat_long_cluster",
        "rank",
    ]
    # broadcaster_gender is handled like viewer_gender here; previously it had
    # neither an explicit dtype nor a fill value.
    text_columns = [
        "viewer",
        "broadcaster",
        "viewer_gender",
        "viewer_lang",
        "viewer_country",
        "broadcaster_gender",
        "broadcaster_lang",
        "broadcaster_country",
        "viewer_network",
        "broadcaster_network",
        "viewer_lat_long_cluster",
    ]
    float_columns = [
        "viewer_age",
        "viewer_longitude",
        "viewer_latitude",
        "broadcaster_age",
        "broadcaster_longitude",
        "broadcaster_latitude",
    ]

    # Builtin str/int replace the np.unicode/np.int aliases, which newer NumPy
    # releases no longer provide; the aliases pointed at these same builtins.
    column_dtypes = {name: str for name in text_columns}
    column_dtypes.update({name: np.single for name in float_columns})
    # An integer dtype cannot hold NaN, so read_csv raises on a missing rank
    # before the fill step below ever sees it.
    column_dtypes["rank"] = int

    training_df = pd.read_csv(
        file,
        skiprows=[0],
        names=column_names,
        dtype=column_dtypes,
    )

    fill_values = {name: 'unknown' for name in text_columns}
    fill_values.update({
        'viewer_age': 30,
        'viewer_longitude': 0,
        'viewer_latitude': 0,
        'broadcaster_age': 30,
        'broadcaster_longitude': 0,
        'broadcaster_latitude': 0,
        'duration': 0,
        'viewer_lat_long_cluster': '0',
        'rank': 1,
    })
    training_df = training_df.fillna(value=fill_values)

    print(training_df.head(10))
    print(training_df.iloc[-10:])
    return training_df


def load_training_data_cold(file, stats):
    """Load the CSV at ``file`` and expose selected columns as a tf.data.Dataset.

    Args:
        file: forwarded to ``load_data_file_cold``.
        stats: forwarded to ``load_data_file_cold``.

    Returns:
        A dataset built with ``from_tensor_slices`` over a dict of column
        tensors, keyed by column name.
    """
    frame = load_data_file_cold(file, stats)
    print('creating data set')

    # Tensor dtype each exported column is cast to.
    column_dtypes = {
        "viewer": tf.string,
        "viewer_gender": tf.string,
        "viewer_lang": tf.string,
        "viewer_country": tf.string,
        "viewer_age": tf.int32,
        "viewer_longitude": tf.float16,
        "viewer_latitude": tf.float16,
        "broadcaster": tf.string,
        "viewer_network": tf.string,
        "broadcaster_network": tf.string,
        "viewer_lat_long_cluster": tf.string,
    }
    column_tensors = {
        column: tf.cast(frame[column].values, dtype)
        for column, dtype in column_dtypes.items()
    }
    return tf.data.Dataset.from_tensor_slices(column_tensors)

In [10]:
def prepare_training_data_cold(train_ds):
    """Cache ``train_ds`` and keep only the training feature keys of each element.

    Args:
        train_ds: dataset whose elements are dicts containing at least the
            eleven keys listed below.

    Returns:
        The cached dataset mapped, in parallel and without ordering
        guarantees, to dicts holding exactly those keys.
    """
    print('prepare_training_data')

    feature_keys = (
        "broadcaster",
        "viewer",
        "viewer_gender",
        "viewer_lang",
        "viewer_country",
        "viewer_age",
        "viewer_longitude",
        "viewer_latitude",
        "viewer_network",
        "broadcaster_network",
        "viewer_lat_long_cluster",
    )

    def select_features(row):
        return {key: row[key] for key in feature_keys}

    prepared = train_ds.cache().map(
        select_features,
        num_parallel_calls=tf.data.AUTOTUNE,
        deterministic=False,
    )

    print('done prepare_training_data')
    return prepared

def get_broadcaster_data_set(train_ds):
    """Return a dataset of the distinct ``broadcaster`` values found in ``train_ds``.

    The broadcaster column is materialised in memory, de-duplicated with
    ``np.unique`` and wrapped in a new dataset.
    """
    def pick_broadcaster(row):
        return row["broadcaster"]

    broadcaster_column = train_ds.cache().map(
        pick_broadcaster,
        num_parallel_calls=tf.data.AUTOTUNE,
        deterministic=False,
    )
    all_values = list(broadcaster_column.as_numpy_iterator())
    distinct_values = np.unique(all_values)
    return tf.data.Dataset.from_tensor_slices(distinct_values)

In [11]:
# Read the CSV at data_location and build the raw feature dataset.
# `stats` is only forwarded to the loader, which does not use it, so "" is a placeholder.
training_dataset = load_training_data_cold(file=data_location, stats="")

loading file:s3://ling-cold-start-data/2021-10-05/2021-10-05.csv
                                            viewer  \
0  37 5e 49 40 70 ff b1 a1 51 e1 f0 d6 77 15 d2 32   
1  dd 83 c6 59 b2 3c e3 3c a1 49 47 1c ef 5f 1d 53   
2  d5 e3 54 3c be 21 ee 42 c5 db 19 60 3e 9e b7 72   
3  dd 83 c6 59 b2 3c e3 3c a1 49 47 1c ef 5f 1d 53   
4  08 c7 b8 62 05 d4 b9 80 5b 96 a2 93 0e 48 88 6b   
5  dd 83 c6 59 b2 3c e3 3c a1 49 47 1c ef 5f 1d 53   
6  37 5e 49 40 70 ff b1 a1 51 e1 f0 d6 77 15 d2 32   
7  dd 83 c6 59 b2 3c e3 3c a1 49 47 1c ef 5f 1d 53   
8  37 5e 49 40 70 ff b1 a1 51 e1 f0 d6 77 15 d2 32   
9  dd 83 c6 59 b2 3c e3 3c a1 49 47 1c ef 5f 1d 53   

                                       broadcaster  viewer_age viewer_gender  \
0  34 9a c3 50 0e cf a4 a8 b7 d0 a4 a1 b9 d1 bd f2        43.0        female   
1  b6 cb 91 77 b1 7f 4e 70 be 4d bb 5e 88 3f c2 dd        50.0          male   
2  7f 8c 21 c9 e4 6f a2 fe 01 fa 97 4f 5b 2a 82 3d        22.0        female   
3  73 83 97 f7 28 69

In [12]:
# Cached, feature-selected view of the raw dataset; the feature cells below all read from `train`.
train = prepare_training_data_cold(training_dataset)

prepare_training_data
done prepare_training_data


In [13]:
# Dataset of distinct broadcaster ids; used later as the candidate set for the metric and the index.
broadcasters_data_set = get_broadcaster_data_set(training_dataset)

### Prepare features

In [14]:
def get_list(training_data, key):
    """Return a dataset that yields only feature ``key``, in batches of up to 1,000,000 rows.

    Args:
        training_data: dataset whose elements are dicts of features.
        key: name of the feature to extract from each batch.
    """
    def take_column(batch):
        return batch[key]

    large_batches = training_data.batch(1_000_000)
    return large_batches.map(
        take_column,
        num_parallel_calls=tf.data.AUTOTUNE,
        deterministic=False,
    )


def get_unique_list(data):
    """Concatenate every batch in ``data`` and return the sorted distinct values.

    Args:
        data: iterable of array-like batches.

    Returns:
        numpy.ndarray of the unique values across all batches.
    """
    batches = [batch for batch in data]
    combined = np.concatenate(batches)
    return np.unique(combined)

In [15]:
# Batched dataset holding only the viewer_gender column; reduced to a vocabulary further down.
user_genders = get_list(train, 'viewer_gender')

In [16]:
# Batched dataset holding only the viewer_lang column.
user_langs = get_list(train, 'viewer_lang')

In [17]:
# Batched dataset holding only the viewer_country column.
user_countries = get_list(train, 'viewer_country')

In [18]:
# Batched viewer_age column; later passed to UserModel, which accepts it but does not read it.
viewer_age = get_list(train, 'viewer_age')

In [19]:
# Batched dataset holding only the viewer_network column.
user_networks = get_list(train, 'viewer_network')

In [20]:
# Batched dataset holding only the viewer_lat_long_cluster column.
user_clusters = get_list(train, "viewer_lat_long_cluster")

### Derive input dimensions

In [21]:
# Distinct viewer_gender values; becomes the gender vocabulary of the user tower.
unique_user_genders = get_unique_list(user_genders)

In [22]:
# Gender vocabulary size (the recorded run shows 2).
len(unique_user_genders)

2

In [23]:
# Distinct viewer_lang values; becomes the language vocabulary of the user tower.
unique_user_langs = get_unique_list(user_langs)

In [24]:
# Language vocabulary size (the recorded run shows 67).
len(unique_user_langs)

67

In [25]:
# Distinct viewer_country values; becomes the country vocabulary of the user tower.
unique_user_countries = get_unique_list(user_countries)

In [26]:
# Country vocabulary size (the recorded run shows 196).
len(unique_user_countries)

196

In [27]:
# Distinct viewer_network values; becomes the network vocabulary of the user tower.
unique_user_networks = get_unique_list(user_networks)

In [28]:
# Network vocabulary size (the recorded run shows 4).
len(unique_user_networks)

4

In [29]:
# Distinct viewer_lat_long_cluster labels; becomes the cluster vocabulary of the user tower.
unique_user_clusters = get_unique_list(user_clusters)

In [30]:
# Cluster vocabulary size (the recorded run shows 8).
len(unique_user_clusters)

8

### User model

In [31]:
# NOTE(review): this notebook-level list is not passed to UserModel, which builds its own
# identical boundaries internally; it looks unused in the visible cells -- confirm before removing.
age_boundaries = np.array([18, 25, 30, 35, 40, 45, 50, 55, 60, 65, float("inf")])

In [32]:
# Build the viewer tower from the vocabularies derived above. The fourth argument
# (viewer_age) is accepted by UserModel.__init__ but never read there.
user_model = UserModel(unique_user_genders, unique_user_langs, unique_user_countries, viewer_age, unique_user_networks, unique_user_clusters)

### Broadcaster model

In [33]:
# Batched dataset holding only the broadcaster column.
broadcaster_ids = get_list(train, 'broadcaster')

In [34]:
# Distinct broadcaster ids; becomes the vocabulary of the broadcaster tower.
unique_broadcasters = get_unique_list(broadcaster_ids)

In [35]:
# Broadcaster vocabulary size (the recorded run shows 112919).
len(unique_broadcasters)

112919

In [36]:
# Equals the width of UserModel's concatenated output (4 + 10 + 10 + 4 + 2 + 2 = 32).
# Presumably the two towers have to agree on this width for the retrieval task -- confirm.
broadcaster_embedding_dimension = 32

In [37]:
# Candidate tower: one embedding of width broadcaster_embedding_dimension per known broadcaster id.
broadcaster_model = BroadcasterModel(unique_broadcasters, broadcaster_embedding_dimension)

### Two-tower model

In [38]:
# Top-K metric whose candidates are the embeddings of every distinct broadcaster,
# computed by the broadcaster tower in batches of 128.
metrics = tfrs.metrics.FactorizedTopK(candidates=broadcasters_data_set.batch(128).map(broadcaster_model))

In [39]:
# Retrieval task; TwoTowers calls it with (user embeddings, broadcaster embeddings) to get the loss.
task = tfrs.tasks.Retrieval(
    metrics=metrics
)

In [46]:
class TwoTowers(tf.keras.Model):
    """Two-tower retrieval model pairing a viewer tower with a broadcaster tower.

    Args:
        broadcaster_model: model mapping broadcaster ids to embeddings.
        user_model: model mapping a dict of viewer features to embeddings.
        task: layer called as ``task(user_embeddings, broadcaster_embeddings)``
            that returns the loss.
    """

    def __init__(self, broadcaster_model, user_model, task):
        super().__init__()
        self.broadcaster_model: tf.keras.Model = broadcaster_model
        self.embedding_model = user_model
        self.task: tf.keras.layers.Layer = task

    def _compute_batch_losses(self, features):
        """Run both towers on one batch; return (loss, regularization_loss, total_loss)."""
        viewer_keys = (
            "viewer_gender",
            "viewer_lang",
            "viewer_country",
            "viewer_age",
            "viewer_network",
            "viewer_latitude",
            "viewer_longitude",
            "viewer_lat_long_cluster",
        )
        user_embeddings = self.embedding_model(
            {key: features[key] for key in viewer_keys})
        positive_broadcaster_embeddings = self.broadcaster_model(
            features["broadcaster"])
        loss = self.task(user_embeddings, positive_broadcaster_embeddings)

        # Handle regularization losses as well.
        regularization_loss = sum(self.losses)

        return loss, regularization_loss, loss + regularization_loss

    def _metric_report(self, loss, regularization_loss, total_loss):
        """Collect the current metric results plus the three loss values into one dict."""
        report = {metric.name: metric.result() for metric in self.metrics}
        report["loss"] = loss
        report["regularization_loss"] = regularization_loss
        report["total_loss"] = total_loss
        return report

    def train_step(self, features: Dict[Text, tf.Tensor]) -> Dict[Text, tf.Tensor]:
        """Run one optimisation step on a batch and return the metric/loss dict."""
        # The forward pass has to run inside the tape so gradients can be taken.
        with tf.GradientTape() as tape:
            loss, regularization_loss, total_loss = self._compute_batch_losses(features)

        gradients = tape.gradient(total_loss, self.trainable_variables)
        self.optimizer.apply_gradients(
            zip(gradients, self.trainable_variables))

        return self._metric_report(loss, regularization_loss, total_loss)

    def test_step(self, features: Dict[Text, tf.Tensor]) -> Dict[Text, tf.Tensor]:
        """Evaluate one batch without updating weights and return the metric/loss dict."""
        loss, regularization_loss, total_loss = self._compute_batch_losses(features)
        return self._metric_report(loss, regularization_loss, total_loss)

In [47]:
# Combine the two towers and the retrieval task into the trainable model.
model = TwoTowers(broadcaster_model, user_model, task)

In [48]:
# Learning rate passed to the Adagrad optimizer in the compile cell.
learning_rate = 0.05
# Rows per training batch.
batch_size = 16384
# batch_size = 250
# NOTE(review): `epochs` is not referenced by the visible fit call, which hard-codes
# epochs=1 -- confirm which value is intended.
epochs = 2
# Number of candidates the brute-force index returns per query.
top_k = 1999

In [49]:
# Only an optimizer is configured; the loss is computed inside TwoTowers.train_step.
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=learning_rate))

In [50]:
# Batch the prepared examples and cache the batched result. `train` was already cached
# before its map step in prepare_training_data_cold, so this is a second cache layer.
train_ds = train.batch(batch_size).cache()
# train_ds = train_ds.prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
# Train for a single pass over the batched data.
# NOTE(review): the `epochs` variable defined with the other hyperparameters is not used here.
model.fit(train_ds, epochs=1)



<tensorflow.python.keras.callbacks.History at 0x7fee3ac57c18>

In [None]:
# Display the input path for reference.
data_location

's3://ling-cold-start-data/2021-10-05/2021-10-05.csv'

In [None]:
from datetime import date

In [None]:
# Save location: same bucket and prefix as the data, with today's date as the last path segment.
model_location = f"s3://{bucket}/{prefix}/{date.today()}"

In [None]:
# Display the path the index will be saved to.
model_location

's3://ling-cold-start-data/2021-10-05/2021-10-06'

In [57]:
print("create index")


def _viewer_query(gender, lang, country, age, longitude, latitude, network, cluster):
    """Build a one-row feature dict in the form the index is queried with."""
    return {
        "viewer_gender": tf.constant([gender]),
        "viewer_lang": tf.constant([lang]),
        "viewer_country": tf.constant([country]),
        "viewer_age": tf.constant([age]),
        "viewer_longitude": tf.constant([longitude]),
        "viewer_latitude": tf.constant([latitude]),
        "viewer_network": tf.constant([network]),
        "viewer_lat_long_cluster": tf.constant([cluster]),
    }


# Brute-force top-k index that embeds queries with the viewer tower.
index = tfrs.layers.factorized_top_k.BruteForce(query_model=user_model, k=top_k)

# Candidates: broadcaster embeddings computed in batches of 10000, identified by the raw ids.
broadcaster_embeddings = broadcasters_data_set.batch(10000).map(model.broadcaster_model)
index.index(broadcaster_embeddings, broadcasters_data_set)

# Three hand-written sample viewers as a smoke test of the index.
_, titles = index(
    _viewer_query("male", "en", "US", 38, -74.89611, 40.36393, "meetme", "7"))
print(f"Recommendations for user lam: {titles}")

_, titles = index(
    _viewer_query("male", "en", "US", 28, -118.41625, 34.10313, "pof", "5"))
print(f"Recommendations for user cal: {titles}")

_, titles = index(
    _viewer_query("female", "en", "US", 32, -74.89611, 40.36393, "skout", "7"))
print(f"Recommendations for user 32: {titles}")

# Persist the index (query model plus candidates) to model_location.
index.save(model_location)


create index
Recommendations for user lam: [[b'61 4d 78 09 b1 0a 4a 3d d1 06 f0 6b 70 ed 19 7a'
  b'e1 24 ec d4 81 50 19 b8 2f 37 e0 80 03 f9 4d d7'
  b'88 86 0f fb 4e ca 72 e3 ba c9 fe 88 a1 f2 d0 ed' ...
  b'35 b3 eb d4 84 c0 fd af 1f 3b 5b b7 40 1a 67 fc'
  b'bc e1 bf 5e 26 f3 1a 6f 6b 71 89 c2 a6 ec 89 9b'
  b'da 29 12 06 6f 8b 8a c4 ee 4b 75 76 40 11 18 34']]
Recommendations for user cal: [[b'e2 09 08 09 a0 12 dc b2 b7 d6 66 6d 69 1c 85 b1'
  b'99 5a e2 9e 4b 46 78 87 b8 50 90 f1 e6 4f 0d aa'
  b'58 24 9e 18 c8 d1 eb c2 3d f4 2b e7 43 81 a8 ec' ...
  b'5c 54 ce ef fa f0 eb 89 cc 74 39 33 37 aa 03 1b'
  b'9c 86 e8 b3 15 de 6e 3c cf aa e7 cc 1d 1a 01 d5'
  b'ac 4c 75 13 90 1e 21 d3 3a 8e c2 78 f0 85 43 ca']]
Recommendations for user 32: [[b'a7 2a a0 61 8f 9e 26 e5 7b 10 50 17 20 52 b9 7f'
  b'c8 29 47 f8 24 f1 10 72 d1 78 70 94 7e 80 9f 94'
  b'5f 77 25 ea a2 50 7a 5a 20 6c 10 44 78 56 fc 1d' ...
  b'eb 4d 38 53 88 17 0e 7c 35 12 76 d8 d4 4d 15 b0'
  b'be f8 93 1f ae 7d d4 69 7c 85 



INFO:tensorflow:Assets written to: s3://ling-cold-start-data/2021-10-05/2021-10-06/assets


INFO:tensorflow:Assets written to: s3://ling-cold-start-data/2021-10-05/2021-10-06/assets
