In [1]:
import boto3
import s3fs
from sagemaker import get_execution_role
import sagemaker.amazon.common as smac

In [2]:
!pip install tensorflow==2.5.0

Collecting tensorflow==2.5.0
  Downloading tensorflow-2.5.0-cp36-cp36m-manylinux2010_x86_64.whl (454.3 MB)
[K     |████████████████████████████████| 454.3 MB 8.3 kB/s  eta 0:00:01    |█████▍                          | 76.6 MB 53.1 MB/s eta 0:00:08
Collecting termcolor~=1.1.0
  Downloading termcolor-1.1.0.tar.gz (3.9 kB)
Collecting opt-einsum~=3.3.0
  Downloading opt_einsum-3.3.0-py3-none-any.whl (65 kB)
[K     |████████████████████████████████| 65 kB 6.1 MB/s  eta 0:00:01
[?25hCollecting flatbuffers~=1.12.0
  Downloading flatbuffers-1.12-py2.py3-none-any.whl (15 kB)
Collecting keras-preprocessing~=1.1.2
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[K     |████████████████████████████████| 42 kB 1.8 MB/s  eta 0:00:01
[?25hCollecting gast==0.4.0
  Downloading gast-0.4.0-py3-none-any.whl (9.8 kB)
Collecting keras-nightly~=2.5.0.dev
  Downloading keras_nightly-2.5.0.dev2021032900-py2.py3-none-any.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 40

In [3]:
!pip install tensorflow-recommenders==0.5.2

Collecting tensorflow-recommenders==0.5.2
  Downloading tensorflow_recommenders-0.5.2-py3-none-any.whl (85 kB)
[K     |████████████████████████████████| 85 kB 4.9 MB/s  eta 0:00:01


Installing collected packages: tensorflow-recommenders
Successfully installed tensorflow-recommenders-0.5.2
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [4]:
role = get_execution_role()
bucket = "ling-cold-start-data"
prefix = "2021-09-08"
data_key = "a3d86f3b-eb45-4641-b05d-30dff7423e6b.csv"
data_location = "s3://{}/{}/{}".format(bucket, prefix, data_key)

### Loading data

In [5]:
import os
import tempfile

%matplotlib inline
import matplotlib.pyplot as plt

In [6]:
import numpy as np
import pandas as pd
import pprint

In [7]:
import tensorflow as tf
import tensorflow_recommenders as tfrs

In [8]:
def load_data_file_cold(file, stats):
    print('loading file:' + file)
    training_df = pd.read_csv(
        file,
        skiprows=[0],
        names=["viewer","broadcaster","viewer_age","viewer_gender","viewer_longitude","viewer_latitude","viewer_lang","viewer_country","broadcaster_age","broadcaster_gender","broadcaster_longitude","broadcaster_latitude","broadcaster_lang","broadcaster_country","duration", "viewer_network", "broadcaster_network", "count"], dtype={
            'viewer': np.unicode,
            'broadcaster': np.unicode,
            'viewer_age': np.single,
            'viewer_gender': np.unicode,
            'viewer_longitude': np.single,
            'viewer_latitude': np.single,
            'viewer_lang': np.unicode,
            'viewer_country': np.unicode,
            'broadcaster_age': np.single,
            'broadcaster_longitude': np.single,
            'broadcaster_latitude': np.single,
            'broadcaster_lang': np.unicode,
            'broadcaster_country': np.unicode,
            'viewer_network': np.unicode,
            'broadcaster_network': np.unicode,
            'count': np.int
        })

    values = {
        'viewer': 'unknown',
        'broadcaster': 'unknown',
        'viewer_age': 30,
        'viewer_gender': 'unknown',
        'viewer_longitude': 0,
        'viewer_latitude': 0,
        'viewer_lang': 'unknown',
        'viewer_country': 'unknown',
        'broadcaster_age': 30,
        'broadcaster_longitude': 0,
        'broadcaster_latitude': 0,
        'broadcaster_lang': 'unknown',
        'broadcaster_country': 'unknown',
        'duration': 0,
        'viewer_network': 'unknown',
        'broadcaster_network': 'unknown',
        'count': 0
    }
    training_df.fillna(value=values, inplace=True)
    print(training_df.head(10))
    print(training_df.iloc[-10:])
#     stats.send_stats('data-size', len(training_df.index))

#     sampled_df = training_df.sample(frac=0.1)
#     print(sampled_df.head(10))
#     print(sampled_df.iloc[-10:])
#     return sampled_df
    return training_df

def load_training_data_cold(file, stats):
    ratings_df = load_data_file_cold(file, stats)
    print('creating data set')
    training_ds = (
        tf.data.Dataset.from_tensor_slices(
            ({
                "viewer": tf.cast(
                    ratings_df['viewer'].values,
                    tf.string),
                "viewer_gender": tf.cast(
                    ratings_df['viewer_gender'].values,
                    tf.string),
                "viewer_lang": tf.cast(
                    ratings_df['viewer_lang'].values,
                    tf.string),
                "viewer_country": tf.cast(
                    ratings_df['viewer_country'].values,
                    tf.string),
                "viewer_age": tf.cast(
                    ratings_df['viewer_age'].values,
                    tf.int32),
                "viewer_longitude": tf.cast(
                    ratings_df['viewer_longitude'].values,
                    tf.float16),
                "viewer_latitude": tf.cast(
                    ratings_df['viewer_latitude'].values,
                    tf.float16),
                "broadcaster": tf.cast(
                    ratings_df['broadcaster'].values,
                    tf.string),
                "viewer_network": tf.cast(
                    ratings_df['viewer_network'].values,
                    tf.string),
                "broadcaster_network": tf.cast(
                    ratings_df['broadcaster_network'].values,
                    tf.string),
                "duration": tf.cast(
                    ratings_df['duration'].values,
                    tf.float16),
                "count": tf.cast(
                    ratings_df['count'].values,
                    tf.int16),
            })))

    return training_ds

In [9]:
ratings = load_training_data_cold(file=data_location, stats="")

for x in ratings.take(1).as_numpy_iterator():
    pprint.pprint(x)

loading file:s3://ling-cold-start-data/2021-09-08/a3d86f3b-eb45-4641-b05d-30dff7423e6b.csv
             viewer       broadcaster  viewer_age viewer_gender  \
0   meetme:19714617  meetme:242525021        39.0        female   
1   skout:161675320   skout:167570679        46.0          male   
2     pof:333093026      pof:77411971        28.0          male   
3     pof:324991301     pof:207032171        39.0          male   
4   skout:177541297  meetme:316500815        41.0          male   
5  meetme:258247855  meetme:314497940        36.0          male   
6    meetme:8237459  meetme:213050479        52.0          male   
7   meetme:84182876   skout:175969618        32.0          male   
8   skout:176797432  meetme:309441196        51.0          male   
9  meetme:282314746  meetme:226200204        32.0          male   

   viewer_longitude  viewer_latitude viewer_lang viewer_country  \
0       -118.380096        34.093899          en             US   
1        120.963997        14.693000 

### Model definition

In [10]:
def get_list(training_data, key):
    return training_data.batch(1_000_000).map(lambda x: x[key], num_parallel_calls=tf.data.AUTOTUNE, deterministic=False)


def get_unique_list(data):
    return np.unique(np.concatenate(list(data)))

In [11]:
user_genders = get_list(ratings, 'viewer_gender')
for x in user_genders.batch(1).take(1):
    pprint.pprint(x[0:10])

<tf.Tensor: shape=(1, 1000000), dtype=string, numpy=
array([[b'female', b'male', b'male', ..., b'female', b'male', b'male']],
      dtype=object)>


In [12]:
user_langs = get_list(ratings, 'viewer_lang')
for x in user_langs.batch(1).take(1):
    pprint.pprint(x[0, :10])

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b'en', b'en', b'en', b'en', b'en', b'en', b'en', b'en', b'fa',
       b'id'], dtype=object)>


In [13]:
user_countries = get_list(ratings, 'viewer_country')
for x in user_countries.batch(1).take(1).as_numpy_iterator():
    pprint.pprint(x[0, :10])

array([b'US', b'GB', b'US', b'US', b'GB', b'US', b'US', b'US', b'AF',
       b'ID'], dtype=object)


In [14]:
viewer_age = get_list(ratings, 'viewer_age')
for x in viewer_age.batch(1).take(1).as_numpy_iterator():
    pprint.pprint(x[0, :10])

array([39, 46, 28, 39, 41, 36, 52, 32, 51, 32], dtype=int32)


In [15]:
user_networks = get_list(ratings, 'viewer_network')
for x in user_networks.batch(1).take(1).as_numpy_iterator():
    pprint.pprint(x[0, :10])

array([b'meetme', b'skout', b'pof', b'pof', b'skout', b'meetme',
       b'meetme', b'meetme', b'skout', b'meetme'], dtype=object)


In [16]:
unique_user_genders = get_unique_list(user_genders)
print(unique_user_genders)

[b'female' b'male']


In [17]:
unique_user_langs = get_unique_list(user_langs)
print(unique_user_langs)

[b'af' b'ar' b'az' b'bg' b'bn' b'bs' b'ca' b'cs' b'da' b'de' b'el' b'en'
 b'es' b'et' b'eu' b'fa' b'fi' b'fr' b'gl' b'gu' b'he' b'hi' b'hr' b'hu'
 b'id' b'in' b'it' b'iw' b'ja' b'ka' b'km' b'ko' b'ks' b'lo' b'lt' b'lv'
 b'mi' b'mk' b'ml' b'mr' b'ms' b'my' b'nb' b'ne' b'nl' b'pa' b'pl' b'ps'
 b'pt' b'ro' b'ru' b'sd' b'si' b'sk' b'sl' b'sq' b'sr' b'sv' b'ta' b'te'
 b'th' b'ti' b'tl' b'to' b'tr' b'uk' b'ur' b'uz' b'vi' b'zh']


In [18]:
unique_user_countries = get_unique_list(user_countries)
print(unique_user_countries)

[b'419' b'AD' b'AE' b'AF' b'AG' b'AI' b'AL' b'AM' b'AO' b'AQ' b'AR' b'AS'
 b'AT' b'AU' b'AW' b'AX' b'AZ' b'BA' b'BB' b'BD' b'BE' b'BF' b'BG' b'BH'
 b'BI' b'BJ' b'BN' b'BO' b'BR' b'BS' b'BT' b'BV' b'BW' b'BY' b'BZ' b'CA'
 b'CD' b'CH' b'CI' b'CL' b'CM' b'CN' b'CO' b'CP' b'CR' b'CU' b'CV' b'CW'
 b'CY' b'CZ' b'DE' b'DK' b'DO' b'DZ' b'EA' b'EC' b'EE' b'EG' b'EN' b'ER'
 b'ES' b'ET' b'FI' b'FJ' b'FM' b'FR' b'GA' b'GB' b'GD' b'GE' b'GF' b'GH'
 b'GL' b'GM' b'GN' b'GP' b'GR' b'GT' b'GU' b'GW' b'HK' b'HN' b'HR' b'HT'
 b'HU' b'ID' b'IE' b'IL' b'IM' b'IN' b'IO' b'IQ' b'IR' b'IS' b'IT' b'JE'
 b'JM' b'JO' b'JP' b'KE' b'KH' b'KM' b'KR' b'KW' b'KY' b'KZ' b'LA' b'LB'
 b'LC' b'LI' b'LK' b'LT' b'LU' b'LV' b'LY' b'MA' b'MC' b'ME' b'MF' b'MG'
 b'MH' b'MK' b'ML' b'MM' b'MO' b'MP' b'MR' b'MT' b'MU' b'MV' b'MX' b'MY'
 b'MZ' b'NG' b'NI' b'NL' b'NO' b'NP' b'NZ' b'OM' b'PA' b'PE' b'PH' b'PK'
 b'PL' b'PR' b'PS' b'PT' b'PY' b'QA' b'RO' b'RS' b'RU' b'SA' b'SC' b'SD'
 b'SE' b'SG' b'SH' b'SI' b'SJ' b'SK' b'SM' b'SN' b

In [19]:
unique_user_networks = get_unique_list(user_networks)
print(unique_user_networks)

[b'meetme' b'pof' b'skout' b'zoosk']


### Candidate Models

In [20]:
broadcaster_ids = get_list(ratings, 'broadcaster')

In [21]:
unique_broadcasters = get_unique_list(broadcaster_ids)

In [22]:
print(unique_broadcasters)

[b'meetme:100081867' b'meetme:100104254' b'meetme:100114731' ...
 b'zoosk:ff0ba42fa32cddbec949c96694895fe2'
 b'zoosk:ff39ee369909e9bdef9e61bc5bb75155'
 b'zoosk:ffd69ee0bb59b722020f374298b9e0b9']


In [23]:
class BroadcasterModel(tf.keras.Model):

    def __init__(self, unique_movie_titles, dims):
        super().__init__()

        self.broadcaster_embedding = tf.keras.Sequential([
            tf.keras.layers.experimental.preprocessing.StringLookup(
                vocabulary=unique_movie_titles, mask_token=None),
            tf.keras.layers.Embedding(len(unique_movie_titles) + 1, dims)
        ])

    def call(self, broadcaster):
        return tf.concat([
            self.broadcaster_embedding(broadcaster),
        ], axis=1)

### Combined model

In [24]:
def get_broadcaster_data_set(train_ds):
    broadcasters = train_ds.cache().map(lambda x: x["broadcaster"], num_parallel_calls=tf.data.AUTOTUNE, deterministic=False)
    broadcasters_ds = tf.data.Dataset.from_tensor_slices(
        np.unique(list(broadcasters.as_numpy_iterator())))
    return broadcasters_ds

In [25]:
broadcasters_data_set = get_broadcaster_data_set(ratings)

In [26]:
tf.random.set_seed(42)
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

cached_train = train.shuffle(100_000).batch(2048)
cached_test = test.batch(4096).cache()

In [27]:
class UserModel(tf.keras.Model):

    def __init__(self, unique_genders, unique_langs, unique_countries, unique_networks, viewer_age):
        super().__init__()

        self.gender_embedding = tf.keras.Sequential([
            tf.keras.layers.experimental.preprocessing.StringLookup(
                vocabulary=unique_genders, mask_token=None),
            tf.keras.layers.Embedding(len(unique_genders) + 1, 4),
        ])
        
        self.lang_embedding = tf.keras.Sequential([
            tf.keras.layers.experimental.preprocessing.StringLookup(
                vocabulary=unique_langs, mask_token=None),
            tf.keras.layers.Embedding(len(unique_langs) + 1, 11),
        ])
        
        self.country_embedding = tf.keras.Sequential([
            tf.keras.layers.experimental.preprocessing.StringLookup(
                vocabulary=unique_countries, mask_token=None),
            tf.keras.layers.Embedding(len(unique_countries) + 1, 11),
        ])
        
        self.network_embedding = tf.keras.Sequential([
            tf.keras.layers.experimental.preprocessing.StringLookup(
                vocabulary=unique_networks, mask_token=None),
            tf.keras.layers.Embedding(len(unique_networks) + 1, 5),
        ])
        
        self.normalized_age = tf.keras.layers.experimental.preprocessing.Normalization(axis = None)
        self.normalized_age.adapt(viewer_age)

    def call(self, inputs):
        return tf.concat([
            self.gender_embedding(inputs["viewer_gender"]),
            self.lang_embedding(inputs["viewer_lang"]),
            self.country_embedding(inputs["viewer_country"]),
            self.network_embedding(inputs["viewer_network"]),
            tf.reshape(self.normalized_age(inputs["viewer_age"]), (-1, 1))
        ], axis=1)

In [28]:
class FinalModel(tfrs.models.Model) :

	def __init__(self, unique_genders, unique_langs, unique_countries, unique_networks, viewer_age, unique_movie_titles, dims) :
		super().__init__()
		self.user_model = tf.keras.Sequential([
			UserModel(unique_genders, unique_langs, unique_countries, unique_networks, viewer_age),
			tf.keras.layers.Dense(32)
		])
        
		self.candidate_model = tf.keras.Sequential([
			BroadcasterModel(unique_movie_titles, dims),
			tf.keras.layers.Dense(32)
		])
        
		self.task = tfrs.tasks.Retrieval(
			metrics = tfrs.metrics.FactorizedTopK(
				candidates=broadcasters_data_set.batch(128).map(self.candidate_model),
			),
		)

	def compute_loss(self, features, training = False) :
		user_embeddings = self.user_model({
				"viewer_gender": features["viewer_gender"],
				"viewer_lang": features["viewer_lang"],
				"viewer_country": features["viewer_country"],
				"viewer_network": features["viewer_network"],
				"viewer_age": features["viewer_age"],
			})
		broadcaster_embeddings = self.candidate_model(features["broadcaster"], 32)

		return self.task(user_embeddings, broadcaster_embeddings)

In [29]:
broadcaster_embedding_dimension = 32

In [30]:
model = FinalModel(unique_user_genders, unique_user_langs, unique_user_countries,  unique_user_networks, viewer_age, unique_broadcasters, broadcaster_embedding_dimension)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

model.fit(cached_train, epochs=1)

train_accuracy = model.evaluate(
    cached_train, return_dict=True)["factorized_top_k/top_100_categorical_accuracy"]
test_accuracy = model.evaluate(
    cached_test, return_dict=True)["factorized_top_k/top_100_categorical_accuracy"]

print(f"Top-100 accuracy (train): {train_accuracy:.2f}.")
print(f"Top-100 accuracy (test): {test_accuracy:.2f}.")

Consider rewriting this model with the Functional API.
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.
Top-100 accuracy (train): 0.10.
Top-100 accuracy (test): 0.06.
