# Imports

In [1]:
#!pip install -q tensorflow-recommenders
#!pip install -q scann

In [2]:
import os
import pprint
import tempfile

from typing import Dict, Text

import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_recommenders as tfrs

2023-07-15 00:17:15.030941: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-15 00:17:15.160853: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-07-15 00:17:15.160868: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-07-15 00:17:15.739620: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

# Data Prep

In [23]:
# Contains User actions (buy/play associated with games - the users game library w/metadata for each game)
df_users = pd.read_csv('final/user_steam_lib_final.csv') 
# Contains Games (unique games by their appid and metadata e.g. price, ratings, genre, tags)
df_games = pd.read_csv('final/games_list_final.csv')     

In [4]:
print(df_users.shape)
print(df_games.shape)

(83921, 22)
(1363, 19)


In [5]:
print(df_users.head())

   Unnamed: 0  user_id                 name   play  purchase     appid  \
0           0     5250           alienswarm    4.9         1     630.0   
1           1     5250       citiesskylines  144.0         1  255710.0   
2           2     5250        counterstrike    0.0         1      10.0   
3           3     5250  counterstrikesource    0.0         1     240.0   
4           4     5250          dayofdefeat    0.0         1      30.0   

  release_date  english            developer            publisher  ...  \
0   2010-07-19      1.0                Valve                Valve  ...   
1   2015-03-10      1.0  Colossal Order Ltd.  Paradox Interactive  ...   
2   2000-11-01      1.0                Valve                Valve  ...   
3   2004-11-01      1.0                Valve                Valve  ...   
4   2003-05-01      1.0                Valve                Valve  ...   

                                          categories               genres  \
0  Single-player;Multi-player;Co-

In [6]:
print(df_games.head())

   Unnamed: 0                 name     appid release_date  english  \
0           0           alienswarm     630.0   2010-07-19      1.0   
1           1       citiesskylines  255710.0   2015-03-10      1.0   
2           2        counterstrike      10.0   2000-11-01      1.0   
3           3  counterstrikesource     240.0   2004-11-01      1.0   
4           4          dayofdefeat      30.0   2003-05-01      1.0   

             developer            publisher          platforms  required_age  \
0                Valve                Valve            windows           0.0   
1  Colossal Order Ltd.  Paradox Interactive  windows;mac;linux           0.0   
2                Valve                Valve  windows;mac;linux           0.0   
3                Valve                Valve  windows;mac;linux           0.0   
4                Valve                Valve  windows;mac;linux           0.0   

                                          categories               genres  \
0  Single-player;Mult

In [7]:
display(df_games.dtypes)

Unnamed: 0            int64
name                 object
appid               float64
release_date         object
english             float64
developer            object
publisher            object
platforms            object
required_age        float64
categories           object
genres               object
steamspy_tags        object
achievements        float64
positive_ratings    float64
negative_ratings    float64
average_playtime    float64
median_playtime     float64
owners               object
price               float64
dtype: object

In [8]:
display(df_users.dtypes)

Unnamed: 0            int64
user_id               int64
name                 object
play                float64
purchase              int64
appid               float64
release_date         object
english             float64
developer            object
publisher            object
platforms            object
required_age        float64
categories           object
genres               object
steamspy_tags        object
achievements        float64
positive_ratings    float64
negative_ratings    float64
average_playtime    float64
median_playtime     float64
owners               object
price               float64
dtype: object

In [24]:
# converting "appid", "english", and other columns from float to int
df_games = df_games.astype({"name":'string', "appid":'int', "english":'int', "required_age":'int',
                     "achievements": 'int', "positive_ratings":'int', "negative_ratings":'int',
                     "average_playtime":'int', "median_playtime":'int'}) 
df_users = df_users.astype({"name":'string',"appid":'int', "purchase":'int',
                     "english": 'int', "required_age":'int',
                     "achievements": 'int', "positive_ratings":'int', "negative_ratings":'int',
                     "average_playtime":'int', "median_playtime":'int'}) 

In [10]:
display(df_games.dtypes)

Unnamed: 0            int64
name                 string
appid                 int64
release_date         object
english               int64
developer            object
publisher            object
platforms            object
required_age          int64
categories           object
genres               object
steamspy_tags        object
achievements          int64
positive_ratings      int64
negative_ratings      int64
average_playtime      int64
median_playtime       int64
owners               object
price               float64
dtype: object

In [11]:
display(df_users.dtypes)

Unnamed: 0            int64
user_id               int64
name                 string
play                float64
purchase              int64
appid                 int64
release_date         object
english               int64
developer            object
publisher            object
platforms            object
required_age          int64
categories           object
genres               object
steamspy_tags        object
achievements          int64
positive_ratings      int64
negative_ratings      int64
average_playtime      int64
median_playtime       int64
owners               object
price               float64
dtype: object

In [12]:
df_games['appid'].nunique()

1363

In [25]:
# Convert dataframe to tensors
ds_users = tf.data.Dataset.from_tensor_slices(dict(df_users))
ds_games = tf.data.Dataset.from_tensor_slices(dict(df_games))

list(ds_games.as_numpy_iterator())[0]

{'Unnamed: 0': 0,
 'name': b'alienswarm',
 'appid': 630,
 'release_date': b'2010-07-19',
 'english': 1,
 'developer': b'Valve',
 'publisher': b'Valve',
 'platforms': b'windows',
 'required_age': 0,
 'categories': b'Single-player;Multi-player;Co-op;Steam Achievements;Captions available;Steam Cloud;Stats;Includes level editor',
 'genres': b'Action',
 'steamspy_tags': b'Free to Play;Co-op;Action',
 'achievements': 66,
 'positive_ratings': 17435,
 'negative_ratings': 941,
 'average_playtime': 371,
 'median_playtime': 83,
 'owners': b'2000000-5000000',
 'price': 0.0}

In [79]:
list(ds_users.as_numpy_iterator())[0]

{'Unnamed: 0': 0,
 'user_id': 5250,
 'name': b'alienswarm',
 'play': 4,
 'purchase': 1,
 'appid': 630,
 'release_date': b'2010-07-19',
 'english': 1,
 'developer': b'Valve',
 'publisher': b'Valve',
 'platforms': b'windows',
 'required_age': 0,
 'categories': b'Single-player;Multi-player;Co-op;Steam Achievements;Captions available;Steam Cloud;Stats;Includes level editor',
 'genres': b'Action',
 'steamspy_tags': b'Free to Play;Co-op;Action',
 'achievements': 66,
 'positive_ratings': 17435,
 'negative_ratings': 941,
 'average_playtime': 371,
 'median_playtime': 83,
 'owners': b'2000000-5000000',
 'price': 0.0}

In [26]:
# Select fields
ds_users = ds_users.map(lambda x: {
    'user_id': tf.strings.as_string(x['user_id']),
    'appid': tf.strings.as_string(x['appid'])
})
#'play': tf.strings.as_string(x['play']),
#'purchase': tf.string.as_string(x['purchase'])

ds_games = ds_games.batch(32).map(lambda x: tf.strings.as_string(x['appid']))

# Get all the user IDs
user_ids = ds_users.batch(1000000).map(lambda x: x["user_id"])
#unique_user_ids = np.unique(np.concatenate(list(user_ids))).astype(str)

# Get all the product IDs
product_ids = ds_users.batch(1000000).map(lambda x: x["appid"])
#unique_product_ids = np.unique(np.concatenate(list(product_ids))).astype(str)

In [27]:
unique_product_ids = np.unique(np.concatenate(list(product_ids))).astype(str)

In [28]:
unique_user_ids = np.unique(np.concatenate(list(user_ids))).astype(str)

In [29]:
print("user_ids :", list(user_ids.as_numpy_iterator()))

print("product_ids :", list(product_ids.as_numpy_iterator()))

user_ids : [array([b'5250', b'5250', b'5250', ..., b'309812026', b'309824202',
       b'309903146'], dtype=object)]
product_ids : [array([b'630', b'255710', b'10', ..., b'301520', b'570', b'570'],
      dtype=object)]


In [30]:
print("ds_users :", list(ds_users.as_numpy_iterator())[0])

print("user_ids :", list(user_ids.as_numpy_iterator()))

print("product_ids :", list(product_ids.as_numpy_iterator()))

ds_users : {'user_id': b'5250', 'appid': b'630'}
user_ids : [array([b'5250', b'5250', b'5250', ..., b'309812026', b'309824202',
       b'309903146'], dtype=object)]
product_ids : [array([b'630', b'255710', b'10', ..., b'301520', b'570', b'570'],
      dtype=object)]


In [31]:
print(len(unique_user_ids))
print(len(unique_product_ids))

11413
1363


# Two Towers (IDs only)

In [37]:
# User and Product models.
class UserModel(tf.keras.Model):

  def __init__(self, unique_user_ids):
    super().__init__()

    self.user_embedding = tf.keras.Sequential([
      tf.keras.layers.experimental.preprocessing.StringLookup(
        vocabulary=unique_user_ids, mask_token=None),
      tf.keras.layers.Embedding(len(unique_user_ids) + 1, 32),
    ])

  def call(self, inputs):
    return self.user_embedding(inputs) #shape: (len(inputs), 32)

class ProductModel(tf.keras.Model):

  def __init__(self, unique_product_ids):
    super().__init__()

    self.product_embedding = tf.keras.Sequential([
      tf.keras.layers.experimental.preprocessing.StringLookup(
        vocabulary=unique_product_ids, mask_token=None),
      tf.keras.layers.Embedding(len(unique_product_ids) + 1, 32)
    ])

  def call(self, inputs):
    return self.product_embedding(inputs)

In [38]:
# Define the two-tower model.
class TwoTowerModel(tfrs.models.Model):

  def __init__(self, user_model, product_model, task):
    super().__init__()
    self.user_model = user_model
    self.product_model = product_model
    self.task = task

  def compute_loss(self, features, training=False):
    user_embeddings = self.user_model(features["user_id"])
    positive_product_embeddings = self.product_model(features["appid"])

    return self.task(user_embeddings, positive_product_embeddings)

In [39]:
# Instantiate and compile the model.
user_model = UserModel(unique_user_ids)
product_model = ProductModel(unique_product_ids)

# Calculate embeddings for all products.
product_embeddings = tf.data.Dataset.from_tensor_slices(unique_product_ids).batch(128).map(product_model)

emb1 = np.vstack(list(product_embeddings))
emb2 = product_model.call(unique_product_ids)
print(np.all(emb1 == emb2))

# Specify the task.
task = tfrs.tasks.Retrieval(
    metrics=tfrs.metrics.FactorizedTopK(candidates=product_embeddings)
)

model = TwoTowerModel(user_model, product_model, task)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

True


In [40]:
# Train for 5 epochs.
model.fit(ds_users.batch(50), epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fad65fe45e0>

In [43]:
K = 10

# Use brute-force search to set up retrieval using the trained representations.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model, k=K)

# Populate the index with the embeddings from the product model. And, perform this
# operation in batch with a size of 32 observations for efficiency.
product_embeddings_to_index = ds_games.map(lambda id: model.product_model(id))
index.index_from_dataset(product_embeddings_to_index)

# # Get some recommendations.
user_id = "1000000000"
_, products = index(np.array([user_id]))
print(f"Top 3 recommendations for user {user_id}: {products[0, :K]}")

Top 3 recommendations for user 1: [ 24  23 206  20 222 219 448  21  64 175]
