### Set up the environment

In [1]:
import boto3
from sagemaker import get_execution_role

In [2]:
!pip install tensorflow==2.5.0



In [3]:
!pip install tensorflow-recommenders==0.5.2



In [4]:
# Execution role of this notebook session (not referenced again in the visible cells).
role = get_execution_role()

# S3 coordinates of the training CSV: s3://<bucket>/<prefix>/<data_key>
bucket = "ling-cold-start-data"
prefix = "2021-09-23"
data_key = "2021-09-23.csv"
data_location = f"s3://{bucket}/{prefix}/{data_key}"

In [5]:
import os
import tempfile
from typing import Dict, Text
import pprint 

In [6]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs

### Model definition

In [7]:
class UserModel(tf.keras.Model):
    """Viewer (query) tower: embeds categorical viewer features plus a bucketized age.

    Gender, language, country and network each pass through a ``StringLookup``
    followed by an ``Embedding``; age is bucketized by ``Discretization`` and
    then embedded. ``call`` concatenates the five embeddings along axis 1,
    giving 4 + 11 + 11 + 4 + 2 = 32 columns.

    Args:
        unique_genders: vocabulary of ``viewer_gender`` values.
        unique_langs: vocabulary of ``viewer_lang`` values.
        unique_countries: vocabulary of ``viewer_country`` values.
        unique_networks: vocabulary of ``viewer_network`` values.
        viewer_age: not used by the current layers (the Normalization layer
            that would have adapted to it is disabled); kept so existing
            call sites do not break.
        age_bins: optional sequence of age bucket boundaries. When omitted,
            the module-level ``age_boundaries`` is read at construction time,
            exactly as before this parameter existed.
    """

    def __init__(self, unique_genders, unique_langs, unique_countries, unique_networks, viewer_age,
                 age_bins=None):
        super().__init__()

        def lookup_embedding(vocabulary, width):
            # len(vocabulary) + 1 rows: presumably the extra row is for the
            # out-of-vocabulary index that StringLookup emits by default.
            return tf.keras.Sequential([
                tf.keras.layers.experimental.preprocessing.StringLookup(
                    vocabulary=vocabulary, mask_token=None),
                tf.keras.layers.Embedding(len(vocabulary) + 1, width),
            ])

        # Creation order is kept (gender, lang, country, network, age) so that
        # auto-generated layer names stay the same.
        self.gender_embedding = lookup_embedding(unique_genders, 4)
        self.lang_embedding = lookup_embedding(unique_langs, 11)
        self.country_embedding = lookup_embedding(unique_countries, 11)
        self.network_embedding = lookup_embedding(unique_networks, 4)

        # Explicit boundaries win; otherwise fall back to the notebook-level
        # `age_boundaries`, which must be defined before the model is built.
        boundaries = age_boundaries if age_bins is None else age_bins
        boundaries = np.asarray(boundaries).tolist()

        # NOTE(review): the embedding has len(boundaries) rows. This presumably
        # relies on the last boundary (inf in the notebook) never being
        # exceeded, so the top bucket index is never produced -- confirm
        # before passing custom `age_bins`.
        self.viewer_age_embedding = tf.keras.Sequential([
            tf.keras.layers.experimental.preprocessing.Discretization(boundaries),
            tf.keras.layers.Embedding(len(boundaries), 2)
        ])

    def call(self, inputs):
        """Embed one batch of viewer features.

        Args:
            inputs: mapping with keys ``viewer_gender``, ``viewer_lang``,
                ``viewer_country``, ``viewer_network`` and ``viewer_age``.
                Any other keys are ignored.

        Returns:
            The five embeddings concatenated along axis 1.
        """
        return tf.concat([
            self.gender_embedding(inputs["viewer_gender"]),
            self.lang_embedding(inputs["viewer_lang"]),
            self.country_embedding(inputs["viewer_country"]),
            self.network_embedding(inputs["viewer_network"]),
            self.viewer_age_embedding(inputs["viewer_age"])
        ], axis=1)

In [8]:
class BroadcasterModel(tf.keras.Model):
    """Broadcaster (candidate) tower: maps a broadcaster id string to a ``dims``-wide embedding.

    Args:
        unique_movie_titles: vocabulary for the id lookup (the notebook passes
            the unique broadcaster ids here).
        dims: width of the embedding.
    """

    def __init__(self, unique_movie_titles, dims):
        super().__init__()

        preprocessing = tf.keras.layers.experimental.preprocessing
        id_lookup = preprocessing.StringLookup(
            vocabulary=unique_movie_titles, mask_token=None)
        # One more row than the vocabulary has entries.
        id_embedding = tf.keras.layers.Embedding(len(unique_movie_titles) + 1, dims)

        self.broadcaster_embedding = tf.keras.Sequential([id_lookup, id_embedding])

    def call(self, broadcaster):
        """Return the embedding for a batch of broadcaster ids."""
        embedded = self.broadcaster_embedding(broadcaster)
        # Single-element concat, kept as in the original formulation.
        return tf.concat([embedded], axis=1)

### Load data

In [9]:
def load_data_file_cold(file, stats):
    """Read the interaction CSV into a DataFrame and fill missing values.

    The first line of the file is skipped and the 18 column names below are
    applied instead. Text columns are read as Python strings, the age and
    coordinate columns as float32; ``broadcaster_gender`` and ``duration``
    are left to pandas' type inference.

    Args:
        file: path or URL understood by ``pd.read_csv`` (must be a ``str``,
            it is concatenated into the log line).
        stats: not used by this function; kept so existing callers keep working.

    Returns:
        The DataFrame with NaNs replaced by the per-column defaults in
        ``values``. ``broadcaster_gender`` has no default and is left as read.
    """
    print('loading file:' + file)

    column_names = [
        "viewer", "broadcaster", "viewer_age", "viewer_gender",
        "viewer_longitude", "viewer_latitude", "viewer_lang", "viewer_country",
        "broadcaster_age", "broadcaster_gender", "broadcaster_longitude",
        "broadcaster_latitude", "broadcaster_lang", "broadcaster_country",
        "duration", "viewer_network", "broadcaster_network", "count",
    ]
    text_columns = [
        'viewer', 'broadcaster', 'viewer_gender', 'viewer_lang',
        'viewer_country', 'broadcaster_lang', 'broadcaster_country',
        'viewer_network', 'broadcaster_network', 'count',
    ]
    float_columns = [
        'viewer_age', 'viewer_longitude', 'viewer_latitude',
        'broadcaster_age', 'broadcaster_longitude', 'broadcaster_latitude',
    ]

    # `np.unicode` was a deprecated alias of the builtin `str` and no longer
    # exists in current NumPy. `object` keeps these columns as plain Python
    # strings in an object-dtype column, so the mixed fill values below
    # (e.g. 0 for 'count') remain valid.
    column_dtypes = {name: object for name in text_columns}
    column_dtypes.update({name: np.single for name in float_columns})

    training_df = pd.read_csv(
        file,
        skiprows=[0],
        names=column_names,
        dtype=column_dtypes)

    # Per-column replacements for missing values.
    values = {
        'viewer': 'unknown',
        'broadcaster': 'unknown',
        'viewer_age': 30,
        'viewer_gender': 'unknown',
        'viewer_longitude': 0,
        'viewer_latitude': 0,
        'viewer_lang': 'unknown',
        'viewer_country': 'unknown',
        'broadcaster_age': 30,
        'broadcaster_longitude': 0,
        'broadcaster_latitude': 0,
        'broadcaster_lang': 'unknown',
        'broadcaster_country': 'unknown',
        'duration': 0,
        'viewer_network': 'unknown',
        'broadcaster_network': 'unknown',
        'count': 0
    }
    training_df = training_df.fillna(value=values)

    # Quick visual check of the first and last ten rows.
    print(training_df.head(10))
    print(training_df.iloc[-10:])
    return training_df


def load_training_data_cold(file, stats):
    """Load the CSV and wrap the viewer/broadcaster features in a ``tf.data.Dataset``.

    Args:
        file: CSV location, forwarded to ``load_data_file_cold``.
        stats: forwarded to ``load_data_file_cold``.

    Returns:
        A dataset whose elements are dicts with one entry per feature
        listed in ``feature_dtypes``.
    """
    ratings_df = load_data_file_cold(file, stats)
    print('creating data set')

    # (column, tensor dtype) pairs, in the order the features are emitted.
    # NOTE(review): float16 is coarse for coordinates (steps of 0.0625 between
    # 64 and 128). UserModel.call does not read the longitude/latitude
    # features, so this currently has no effect on the model.
    feature_dtypes = [
        ("viewer", tf.string),
        ("viewer_gender", tf.string),
        ("viewer_lang", tf.string),
        ("viewer_country", tf.string),
        ("viewer_age", tf.int16),
        ("viewer_longitude", tf.float16),
        ("viewer_latitude", tf.float16),
        ("broadcaster", tf.string),
        ("viewer_network", tf.string),
        ("broadcaster_network", tf.string),
    ]

    features = {
        column: tf.cast(ratings_df[column].values, dtype)
        for column, dtype in feature_dtypes
    }

    return tf.data.Dataset.from_tensor_slices(features)

In [10]:
def prepare_training_data_cold(train_ds):
    """Cache ``train_ds`` and project every element onto the training feature keys.

    Args:
        train_ds: dataset of feature dicts (as built by ``load_training_data_cold``).

    Returns:
        A dataset of dicts holding exactly the keys in ``kept_keys``. The map
        is non-deterministic, so element order is not guaranteed.
    """
    print('prepare_training_data')

    kept_keys = (
        "broadcaster",
        "viewer",
        "viewer_gender",
        "viewer_lang",
        "viewer_country",
        "viewer_age",
        "viewer_longitude",
        "viewer_latitude",
        "viewer_network",
        "broadcaster_network",
    )

    def select_features(x):
        return {key: x[key] for key in kept_keys}

    cached = train_ds.cache()
    training_ds = cached.map(
        select_features,
        num_parallel_calls=tf.data.AUTOTUNE,
        deterministic=False)

    print('done prepare_training_data')
    return training_ds

def get_broadcaster_data_set(train_ds, batch_size=1_000_000):
    """Build a dataset of the unique broadcaster ids found in ``train_ds``.

    The ids are pulled out in large batches rather than one element at a
    time, so materialising them does not cost one Python-level iteration per
    row. The previous ``.cache()`` is gone: the pipeline is iterated once and
    then discarded, so that cache only held a second copy of the data.

    Args:
        train_ds: dataset of feature dicts containing a ``"broadcaster"`` entry.
        batch_size: number of rows fetched per iteration while collecting ids.

    Returns:
        A ``tf.data.Dataset`` with one element per unique broadcaster id,
        in sorted order (``np.unique``).
    """
    broadcaster_batches = train_ds.map(
        lambda x: x["broadcaster"],
        num_parallel_calls=tf.data.AUTOTUNE,
        deterministic=False,
    ).batch(batch_size)

    chunks = list(broadcaster_batches.as_numpy_iterator())
    # np.concatenate rejects an empty list; an empty array keeps the
    # empty-dataset behaviour of np.unique([]).
    all_ids = np.concatenate(chunks) if chunks else np.array([])

    return tf.data.Dataset.from_tensor_slices(np.unique(all_ids))

def get_list(training_data, key):
    """Return a dataset of the ``key`` feature, in batches of up to 1,000,000 rows."""
    def pick(x):
        return x[key]

    batched = training_data.batch(1_000_000)
    return batched.map(pick, num_parallel_calls=tf.data.AUTOTUNE, deterministic=False)

def get_unique_list(data):
    """Concatenate an iterable of arrays and return its sorted unique values."""
    chunks = [chunk for chunk in data]
    flattened = np.concatenate(chunks)
    return np.unique(flattened)

In [11]:
# Load the CSV at `data_location` into a tf.data.Dataset. The loader does not use `stats`,
# so an empty string is passed as a placeholder.
training_dataset = load_training_data_cold(file=data_location, stats="")

loading file:s3://ling-cold-start-data/2021-09-23/2021-09-23.csv
                                            viewer  \
0  21 fe c6 e0 f1 ec e6 02 19 65 7b 2f 1a 90 56 14   
1  6e 38 e1 5e c5 60 52 e5 81 56 3a 0c 39 39 d2 b7   
2  fd 95 7b f8 44 0f 43 4b ad 7d 07 23 42 81 3a 77   
3  79 4b 04 dd 6b b0 f2 17 b2 f6 c7 91 71 2a 45 d6   
4  6e 8c ac 89 fe b3 49 52 56 9b c8 41 ac 2b d9 a8   
5  ac b9 ba a2 2f 75 0a 12 8d e6 81 a7 0f 39 51 e2   
6  2e 05 82 e5 5a 84 da 98 7a 43 80 ec 76 e7 98 e5   
7  e4 14 b9 ca cd 3e bf c0 81 8c 29 f1 4a 1f 62 23   
8  85 19 a7 10 3c de d3 ab e3 dd f6 c3 bf bf 75 3a   
9  21 60 c6 18 de 29 37 21 72 00 3e 47 73 fc 79 aa   

                                       broadcaster  viewer_age viewer_gender  \
0  0f ff ee a8 b0 e3 0d 77 d7 6c 2a 5a 20 58 76 f0        31.0          male   
1  4b 97 ab 5a a3 09 1b 77 29 da 35 a7 a7 85 72 2f        25.0          male   
2  70 c9 b7 df f0 ae a9 fa 1b fe 91 eb 1a 93 40 b0        34.0        female   
3  ef 43 75 71 4b 1a

In [12]:
# Cached, key-selected view of the dataset; used below for vocabulary extraction and training.
train = prepare_training_data_cold(training_dataset)

prepare_training_data
done prepare_training_data


In [13]:
# Dataset of unique broadcaster ids; used below as retrieval candidates and as index identifiers.
broadcasters_data_set = get_broadcaster_data_set(training_dataset)

### Prepare features

In [14]:
def get_list(training_data, key):
    """Return a dataset of the ``key`` feature, in batches of up to 1,000,000 rows.

    NOTE(review): a function with this name is also defined in an earlier
    cell; this later definition is the one in effect from here on.
    """
    def pick(x):
        return x[key]

    batched = training_data.batch(1_000_000)
    return batched.map(pick, num_parallel_calls=tf.data.AUTOTUNE, deterministic=False)


def get_unique_list(data):
    """Concatenate an iterable of arrays and return its sorted unique values.

    NOTE(review): a function with this name is also defined in an earlier
    cell; this later definition is the one in effect from here on.
    """
    chunks = [chunk for chunk in data]
    flattened = np.concatenate(chunks)
    return np.unique(flattened)

In [15]:
# Batched dataset of viewer_gender values; source for the gender vocabulary.
user_genders = get_list(train, 'viewer_gender')

In [16]:
# Batched dataset of viewer_lang values; source for the language vocabulary.
user_langs = get_list(train, 'viewer_lang')

In [17]:
# Batched dataset of viewer_country values; source for the country vocabulary.
user_countries = get_list(train, 'viewer_country')

In [18]:
# Batched dataset of viewer_age values. It is passed to UserModel below, whose constructor does not use it.
viewer_age = get_list(train, 'viewer_age')

In [19]:
# Batched dataset of viewer_network values; source for the network vocabulary.
user_networks = get_list(train, 'viewer_network')

In [20]:
# Unique viewer_gender values; becomes the StringLookup vocabulary in UserModel.
unique_user_genders = get_unique_list(user_genders)

In [21]:
len(unique_user_genders)

2

In [22]:
# Unique viewer_lang values; becomes the StringLookup vocabulary in UserModel.
unique_user_langs = get_unique_list(user_langs)

In [23]:
len(unique_user_langs)

67

In [24]:
# Unique viewer_country values; becomes the StringLookup vocabulary in UserModel.
unique_user_countries = get_unique_list(user_countries)

In [25]:
len(unique_user_countries)

196

In [26]:
# Unique viewer_network values; becomes the StringLookup vocabulary in UserModel.
unique_user_networks = get_unique_list(user_networks)

In [27]:
len(unique_user_networks)

4

In [28]:
# Bucket boundaries for viewer_age, read by the Discretization layer inside UserModel.
# NOTE(review): the age Embedding there has len(age_boundaries) rows; presumably the trailing
# inf is what keeps the top bucket index within that range -- confirm before editing the list.
age_boundaries = np.array([18, 25, 30, 35, 40, 45, 50, 55, 60, 65, float("inf")])

In [29]:
# Build the viewer (query) tower from the extracted vocabularies.
user_model = UserModel(unique_user_genders, unique_user_langs, unique_user_countries, unique_user_networks, viewer_age)  

In [30]:
# Batched dataset of broadcaster ids; source for the broadcaster vocabulary.
broadcaster_ids = get_list(train, 'broadcaster')

In [None]:
# Unique broadcaster ids; becomes the StringLookup vocabulary in BroadcasterModel.
unique_broadcasters = get_unique_list(broadcaster_ids)

In [None]:
len(unique_broadcasters)

In [None]:
# Width of the broadcaster embedding. The user tower concatenates 4 + 11 + 11 + 4 + 2 = 32
# columns; presumably the two widths have to match for the retrieval task -- confirm before changing.
broadcaster_embedding_dimension = 32

In [None]:
# Broadcaster (candidate) tower over the unique broadcaster ids.
broadcaster_model = BroadcasterModel(unique_broadcasters, broadcaster_embedding_dimension)

In [None]:
# FactorizedTopK metric whose candidates are all unique broadcasters, embedded by the
# candidate tower in batches of 128.
metrics = tfrs.metrics.FactorizedTopK(candidates=broadcasters_data_set.batch(128).map(broadcaster_model))

In [None]:
# Retrieval task configured with the FactorizedTopK metric; TwoTowers calls it with the
# user and broadcaster embeddings to obtain the loss.
task = tfrs.tasks.Retrieval(
    metrics=metrics
)

In [None]:
class TwoTowers(tf.keras.Model):
    """Two-tower retrieval model with custom train and test steps.

    The user tower (``embedding_model``) embeds the viewer features, the
    broadcaster tower embeds the broadcaster id, and ``task`` turns the two
    embedding batches into a loss.

    Args:
        broadcaster_model: candidate tower, called with ``features["broadcaster"]``.
        user_model: query tower, called with a dict of the viewer features.
        task: callable taking (user embeddings, broadcaster embeddings) and
            returning the loss.
    """

    # Feature keys the user tower is fed, for training and evaluation alike.
    _USER_FEATURE_KEYS = (
        "viewer_gender",
        "viewer_lang",
        "viewer_country",
        "viewer_network",
        "viewer_age",
    )

    def __init__(self, broadcaster_model, user_model, task):
        super().__init__()
        self.broadcaster_model: tf.keras.Model = broadcaster_model
        self.embedding_model = user_model
        self.task: tf.keras.layers.Layer = task

    def _compute_losses(self, features):
        """Return ``(loss, regularization_loss, total_loss)`` for one batch."""
        user_embeddings = self.embedding_model(
            {key: features[key] for key in self._USER_FEATURE_KEYS})
        positive_broadcaster_embeddings = self.broadcaster_model(
            features["broadcaster"])
        loss = self.task(user_embeddings, positive_broadcaster_embeddings)

        # Handle regularization losses as well.
        regularization_loss = sum(self.losses)

        return loss, regularization_loss, loss + regularization_loss

    def _collect_metrics(self, loss, regularization_loss, total_loss):
        """Merge the current metric results with the three loss values."""
        metrics = {metric.name: metric.result() for metric in self.metrics}
        metrics["loss"] = loss
        metrics["regularization_loss"] = regularization_loss
        metrics["total_loss"] = total_loss
        return metrics

    def train_step(self, features: Dict[Text, tf.Tensor]) -> Dict[Text, tf.Tensor]:
        """Run one optimisation step on a batch of features.

        Args:
            features: batch dict holding the viewer features and ``"broadcaster"``.

        Returns:
            Metric results plus ``loss``, ``regularization_loss`` and ``total_loss``.
        """
        # Record the forward pass so gradients can be taken on the total loss.
        with tf.GradientTape() as tape:
            loss, regularization_loss, total_loss = self._compute_losses(features)

        gradients = tape.gradient(total_loss, self.trainable_variables)
        self.optimizer.apply_gradients(
            zip(gradients, self.trainable_variables))

        return self._collect_metrics(loss, regularization_loss, total_loss)

    def test_step(self, features: Dict[Text, tf.Tensor]) -> Dict[Text, tf.Tensor]:
        """Evaluate one batch without updating weights.

        The user tower receives the same feature keys as in ``train_step``.
        Feeding it only ``{"viewer": ...}`` failed, because the user tower
        looks up ``viewer_gender`` and the other viewer features.

        Args:
            features: batch dict holding the viewer features and ``"broadcaster"``.

        Returns:
            Metric results plus ``loss``, ``regularization_loss`` and ``total_loss``.
        """
        loss, regularization_loss, total_loss = self._compute_losses(features)
        return self._collect_metrics(loss, regularization_loss, total_loss)


In [None]:
# Assemble the two towers and the retrieval task into one trainable model.
model = TwoTowers(broadcaster_model, user_model, task)

In [None]:
# Training / retrieval hyperparameters.
learning_rate = 0.05  # passed to the Adagrad optimizer in model.compile
batch_size = 16384  # rows per training batch
# batch_size = 250
epochs = 2  # NOTE(review): the model.fit call below passes epochs=1, not this value -- confirm which is intended
top_k = 1999  # `k` of the BruteForce retrieval index built at the end

In [None]:
# Adagrad with the configured learning rate. No loss is passed here because
# TwoTowers.train_step computes the loss itself.
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=learning_rate))

In [None]:
# Batch the training examples and cache the batched dataset.
train_ds = train.batch(batch_size).cache()
# Prefetching is currently disabled:
# train_ds = train_ds.prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
# Train the two-tower model on the batched dataset.
# NOTE(review): epochs is hard-coded to 1 here although the hyperparameter cell sets
# `epochs = 2` -- confirm which is intended.
model.fit(train_ds, epochs=1)



<tensorflow.python.keras.callbacks.History at 0x7fba09bb0cf8>

In [None]:
data_location

's3://ling-cold-start-data/2021-09-23/2021-09-23.csv'

In [None]:
from datetime import date

In [None]:
# Destination of the saved index: a folder named after today's date, under the data prefix.
model_location = f"s3://{bucket}/{prefix}/{date.today()}"

In [None]:
model_location

's3://ling-cold-start-data/2021-09-23/2021-09-29'

In [None]:
print("create index")
# Brute-force retrieval layer: embeds a query with the user tower and returns
# the `top_k` best-scoring broadcasters.
index = tfrs.layers.factorized_top_k.BruteForce(
    query_model=user_model,
    k=top_k,
)

# Index every unique broadcaster: embeddings come from the trained candidate
# tower, the raw ids serve as identifiers.
index.index(
    broadcasters_data_set.batch(10000).map(
        model.broadcaster_model),
    broadcasters_data_set)

# Hand-written viewer profiles used as a smoke test of the index, keyed by
# the label printed with the results. The longitude/latitude entries are
# passed along but not read by the user tower.
sample_viewers = {
    "lam": {
        "viewer_gender": "male",
        "viewer_lang": "en",
        "viewer_country": "US",
        "viewer_age": 38,
        "viewer_longitude": -74.89611,
        "viewer_latitude": 40.36393,
        "viewer_network": "meetme",
    },
    "cal": {
        "viewer_gender": "male",
        "viewer_lang": "en",
        "viewer_country": "US",
        "viewer_age": 28,
        "viewer_longitude": -118.41625,
        "viewer_latitude": 34.10313,
        "viewer_network": "pof",
    },
    "32": {
        "viewer_gender": "female",
        "viewer_lang": "en",
        "viewer_country": "US",
        "viewer_age": 32,
        "viewer_longitude": -74.89611,
        "viewer_latitude": 40.36393,
        "viewer_network": "skout",
    },
}

for label, profile in sample_viewers.items():
    # Each feature becomes a batch of one.
    query = {name: tf.constant([value]) for name, value in profile.items()}
    _, titles = index(query)
    print(f"Recommendations for user {label}: {titles}")

# Persist the index (query tower included) to the dated S3 location.
index.save(model_location)


create index
Recommendations for user lam: [[b'07 3c d8 00 0d a9 92 ad 16 34 43 84 51 35 aa 01'
  b'31 d1 a6 d0 59 03 4e 7e 7b 63 86 7c 47 42 c8 54'
  b'2e ab 65 48 77 87 54 4d 95 21 9f 9e 7c aa 15 78' ...
  b'ce 4d d5 95 4e 51 69 f4 12 df 98 80 c6 4c 81 97'
  b'fe 1b 2a c1 e2 93 29 ee 39 a0 3f 04 39 f4 7d b8'
  b'7a e3 26 6b d1 e3 99 47 d0 16 df e8 30 f1 5e 50']]
Recommendations for user cal: [[b'45 e8 b1 41 ae 77 51 9a e1 69 75 8f 5b 96 5d f1'
  b'bf 35 60 c1 b8 1c f4 d8 3b be fb c4 f0 ad 81 f4'
  b'77 41 e1 0d af 59 17 af 12 b0 62 15 61 d6 46 8e' ...
  b'90 ad 07 e2 31 84 79 eb 7a c0 5a 83 9f ba 3b b4'
  b'6a 6b b4 20 72 24 00 6b f4 cd 05 d9 2c 9e 14 1b'
  b'fe d7 ee ca 01 17 94 68 10 5a ee ac d8 44 98 2e']]
Recommendations for user 32: [[b'49 0f 0d bc 33 8a 7a b1 73 06 bd 2e 53 d3 5c 28'
  b'e9 63 e3 a4 5c ba 61 63 a0 10 e7 65 34 8b d1 4c'
  b'52 f5 2b 82 fa 3a db 39 2d d5 a6 9d d2 f0 b4 f7' ...
  b'02 f8 3e eb 04 52 61 cd 13 9a ed 31 54 36 9b c5'
  b'af e5 45 46 f1 7e d5 64 d7 d6 



INFO:tensorflow:Assets written to: s3://ling-cold-start-data/2021-09-23/2021-09-29/assets


INFO:tensorflow:Assets written to: s3://ling-cold-start-data/2021-09-23/2021-09-29/assets
