# Imports

In [1]:
#!pip install -q tensorflow-recommenders
#!pip install -q scann

In [52]:
import os
import pprint
import tempfile

from typing import Dict, Text

import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_recommenders as tfrs

# Data Prep

In [76]:
# Interaction table: one row per (user, game) with the user's play/purchase
# actions plus that game's metadata (the user's game library).
users_csv_path = 'final2/user_steam_lib_final.csv'
df_users = pd.read_csv(users_csv_path)

# Game catalogue: one row per unique appid with its metadata
# (e.g. price, ratings, genres, tags).
games_csv_path = 'final2/games_list_final.csv'
df_games = pd.read_csv(games_csv_path)

In [77]:
# (rows, columns) of the interaction table first, then of the game catalogue.
for frame in (df_users, df_games):
    print(frame.shape)

(83921, 23)
(1363, 20)


In [78]:
# Preview the first interaction rows. Rich display renders a single table,
# whereas print() wraps the wide frame into several text chunks and elides
# the middle columns with "...".
display(df_users.head())

   Unnamed: 0  user_id                 name   play  purchase  \
0           0     5250           alienswarm    4.9         1   
1           1     5250       citiesskylines  144.0         1   
2           2     5250        counterstrike    0.0         1   
3           3     5250  counterstrikesource    0.0         1   
4           4     5250          dayofdefeat    0.0         1   

                 og_name     appid release_date  english            developer  \
0            Alien Swarm     630.0   2010-07-19      1.0                Valve   
1        Cities Skylines  255710.0   2015-03-10      1.0  Colossal Order Ltd.   
2         Counter-Strike      10.0   2000-11-01      1.0                Valve   
3  Counter-Strike Source     240.0   2004-11-01      1.0                Valve   
4          Day of Defeat      30.0   2003-05-01      1.0                Valve   

   ...                                         categories  \
0  ...  Single-player;Multi-player;Co-op;Steam Achieve...   
1  ...

In [79]:
# Preview the first catalogue rows. Rich display renders a single table,
# whereas print() wraps the wide frame into several text chunks.
display(df_games.head())

   Unnamed: 0                 name                og_name     appid  \
0           0           alienswarm            Alien Swarm     630.0   
1           1       citiesskylines        Cities Skylines  255710.0   
2           2        counterstrike         Counter-Strike      10.0   
3           3  counterstrikesource  Counter-Strike Source     240.0   
4           4          dayofdefeat          Day of Defeat      30.0   

  release_date  english            developer            publisher  \
0   2010-07-19      1.0                Valve                Valve   
1   2015-03-10      1.0  Colossal Order Ltd.  Paradox Interactive   
2   2000-11-01      1.0                Valve                Valve   
3   2004-11-01      1.0                Valve                Valve   
4   2003-05-01      1.0                Valve                Valve   

           platforms  required_age  \
0            windows           0.0   
1  windows;mac;linux           0.0   
2  windows;mac;linux           0.0   
3  win

In [80]:
# Column dtypes of the game catalogue as loaded by read_csv, before any casting.
display(df_games.dtypes)

Unnamed: 0            int64
name                 object
og_name              object
appid               float64
release_date         object
english             float64
developer            object
publisher            object
platforms            object
required_age        float64
categories           object
genres               object
steamspy_tags        object
achievements        float64
positive_ratings    float64
negative_ratings    float64
average_playtime    float64
median_playtime     float64
owners               object
price               float64
dtype: object

In [81]:
# Column dtypes of the interaction table as loaded by read_csv, before any casting.
display(df_users.dtypes)

Unnamed: 0            int64
user_id               int64
name                 object
play                float64
purchase              int64
og_name              object
appid               float64
release_date         object
english             float64
developer            object
publisher            object
platforms            object
required_age        float64
categories           object
genres               object
steamspy_tags        object
achievements        float64
positive_ratings    float64
negative_ratings    float64
average_playtime    float64
median_playtime     float64
owners               object
price               float64
dtype: object

In [82]:
# Normalise column dtypes. Text columns become pandas 'string'; the numeric
# columns that read_csv produced as float64 (appid, english, required_age,
# achievements, ratings, playtimes) become integers; price stays float64.
# Both frames share these columns, so the mapping is written once.
shared_dtypes = {
    "name": 'string',
    "og_name": 'string',
    "appid": 'int',
    "release_date": 'string',
    "english": 'int',
    "developer": 'string',
    "publisher": 'string',
    "platforms": 'string',
    "required_age": 'int',
    "categories": 'string',
    "genres": 'string',
    "steamspy_tags": 'string',
    "achievements": 'int',
    "positive_ratings": 'int',
    "negative_ratings": 'int',
    "average_playtime": 'int',
    "median_playtime": 'int',
    "owners": 'string',
    "price": 'float64',
}
df_games = df_games.astype(shared_dtypes)

# The interaction table has three extra columns on top of the shared ones.
# NOTE(review): casting "play" to int drops the fractional part (the first
# row's 4.9 becomes 4) - confirm that truncation is intended.
user_only_dtypes = {"user_id": 'string', "play": 'int', "purchase": 'int'}
df_users = df_users.astype({**user_only_dtypes, **shared_dtypes})

In [83]:
# Re-inspect the catalogue dtypes to confirm the astype casts took effect.
display(df_games.dtypes)

Unnamed: 0            int64
name                 string
og_name              string
appid                 int64
release_date         string
english               int64
developer            string
publisher            string
platforms            string
required_age          int64
categories           string
genres               string
steamspy_tags        string
achievements          int64
positive_ratings      int64
negative_ratings      int64
average_playtime      int64
median_playtime       int64
owners               string
price               float64
dtype: object

In [84]:
# Re-inspect the interaction-table dtypes to confirm the astype casts took effect.
display(df_users.dtypes)

Unnamed: 0            int64
user_id              string
name                 string
play                  int64
purchase              int64
og_name              string
appid                 int64
release_date         string
english               int64
developer            string
publisher            string
platforms            string
required_age          int64
categories           string
genres               string
steamspy_tags        string
achievements          int64
positive_ratings      int64
negative_ratings      int64
average_playtime      int64
median_playtime       int64
owners               string
price               float64
dtype: object

In [85]:
# Number of distinct appid values in the catalogue; compare against the row
# count from df_games.shape to see whether appid is unique per row.
df_games['appid'].nunique()

1363

In [86]:
# Number of distinct original game names in the catalogue; compare against the
# row count from df_games.shape to see whether og_name is unique per row.
df_games['og_name'].nunique()

1363

In [87]:
# Convert dataframe to tensors
# dict(df) maps each column name to its Series, so every dataset element is a
# dict holding one scalar per column.
ds_users = tf.data.Dataset.from_tensor_slices(dict(df_users))
ds_games = tf.data.Dataset.from_tensor_slices(dict(df_games))

# Peek at the first catalogue record. Pulling a single element from the
# iterator avoids materialising the whole dataset into a Python list just to
# index element 0.
next(iter(ds_games.as_numpy_iterator()))

{'Unnamed: 0': 0,
 'name': b'alienswarm',
 'og_name': b'Alien Swarm',
 'appid': 630,
 'release_date': b'2010-07-19',
 'english': 1,
 'developer': b'Valve',
 'publisher': b'Valve',
 'platforms': b'windows',
 'required_age': 0,
 'categories': b'Single-player;Multi-player;Co-op;Steam Achievements;Captions available;Steam Cloud;Stats;Includes level editor',
 'genres': b'Action',
 'steamspy_tags': b'Free to Play;Co-op;Action',
 'achievements': 66,
 'positive_ratings': 17435,
 'negative_ratings': 941,
 'average_playtime': 371,
 'median_playtime': 83,
 'owners': b'2000000-5000000',
 'price': 0.0}

In [14]:
# Peek at the first interaction record. Pulling a single element from the
# iterator avoids materialising the whole dataset into a Python list just to
# index element 0.
next(iter(ds_users.as_numpy_iterator()))

{'Unnamed: 0': 0,
 'user_id': b'5250',
 'name': b'alienswarm',
 'play': 4,
 'purchase': 1,
 'og_name': b'Alien Swarm',
 'appid': b'630.0',
 'release_date': b'2010-07-19',
 'english': 1,
 'developer': b'Valve',
 'publisher': b'Valve',
 'platforms': b'windows',
 'required_age': 0,
 'categories': b'Single-player;Multi-player;Co-op;Steam Achievements;Captions available;Steam Cloud;Stats;Includes level editor',
 'genres': b'Action',
 'steamspy_tags': b'Free to Play;Co-op;Action',
 'achievements': 66,
 'positive_ratings': 17435,
 'negative_ratings': 941,
 'average_playtime': 371,
 'median_playtime': 83,
 'owners': b'2000000-5000000',
 'price': 0.0}

In [88]:
# Select fields
# Keep only the two ID features consumed by the ID-only two-tower model.
# user_id was already cast to string in the dataframe; appid is an integer
# column there, so it is rendered as a decimal string here (630 -> b'630').
#
# NOTE: this cell rebinds ds_users and ds_games in place, so it is not
# idempotent. After it runs, ds_games elements are batched string tensors
# rather than per-column dicts; re-run the cell that builds the datasets from
# the dataframes before running this one again.
ds_users = ds_users.map(lambda x: {
    'user_id': x['user_id'],
    'appid': tf.strings.as_string(x['appid']) # need to cast int into tf string
})

# Candidate stream: appids as strings, in batches of 32.
ds_games = ds_games.batch(32).map(lambda x: tf.strings.as_string(x['appid'])) # need to cast int into tf string

# Get all the user IDs
# The batch size exceeds the 83,921-row interaction table, so this yields a
# single batch holding every user_id (duplicates included).
user_ids = ds_users.batch(1000000).map(lambda x: x["user_id"])

# Get all the product IDs
# Same single-batch trick for the appid of every interaction row.
product_ids = ds_users.batch(1000000).map(lambda x: x["appid"])

In [89]:
# Flatten the batched appid tensors into one array, de-duplicate it
# (np.unique also sorts), and cast to a str array for use as a vocabulary.
all_product_ids = np.concatenate(list(product_ids))
unique_product_ids = np.unique(all_product_ids).astype(str)

In [90]:
# Flatten the batched user_id tensors into one array, de-duplicate it
# (np.unique also sorts), and cast to a str array for use as a vocabulary.
all_user_ids = np.concatenate(list(user_ids))
unique_user_ids = np.unique(all_user_ids).astype(str)

In [91]:
# Show the batched id arrays; a blank line separates the two printouts.
user_id_batches = list(user_ids.as_numpy_iterator())
print("user_ids :", user_id_batches)
print()
product_id_batches = list(product_ids.as_numpy_iterator())
print("product_ids :", product_id_batches)

user_ids : [array([b'5250', b'5250', b'5250', ..., b'309812026', b'309824202',
       b'309903146'], dtype=object)]
product_ids : [array([b'630', b'255710', b'10', ..., b'301520', b'570', b'570'],
      dtype=object)]


In [44]:
# First element of the field-selected interaction dataset. Taking one element
# from the iterator avoids building a list of every row just to index [0].
print("ds_users :", next(iter(ds_users.as_numpy_iterator())))

# The id datasets are batched, so listing them yields one array per batch.
print("user_ids :", list(user_ids.as_numpy_iterator()))

print("product_ids :", list(product_ids.as_numpy_iterator()))

ds_users : {'user_id': b'5250', 'appid': b'630'}
user_ids : [array([b'5250', b'5250', b'5250', ..., b'309812026', b'309824202',
       b'309903146'], dtype=object)]
product_ids : [array([b'630', b'255710', b'10', ..., b'301520', b'570', b'570'],
      dtype=object)]


In [92]:
# Vocabulary sizes: distinct users first, then distinct products.
for vocabulary in (unique_user_ids, unique_product_ids):
    print(len(vocabulary))

11413
1363


# Two Towers (IDs only)

In [93]:
# User and Product models.
class UserModel(tf.keras.Model):
  """Query tower: maps raw user-id strings to dense embedding vectors.

  Args:
    unique_user_ids: sequence of known user-id strings, used as the lookup
      vocabulary.
    embedding_dim: width of each embedding vector. Defaults to 32, the value
      that was previously hard-coded.
  """

  def __init__(self, unique_user_ids, embedding_dim=32):
    super().__init__()

    self.user_embedding = tf.keras.Sequential([
      # Stable (non-experimental) location of the layer; the
      # `layers.experimental.preprocessing` alias is deprecated.
      tf.keras.layers.StringLookup(
        vocabulary=unique_user_ids, mask_token=None),
      # One extra row on top of the vocabulary for ids that are not in it
      # (the lookup layer's out-of-vocabulary index).
      tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dim),
    ])

  def call(self, inputs):
    """Returns the embeddings for a batch of user-id strings."""
    return self.user_embedding(inputs) #shape: (len(inputs), embedding_dim)

class ProductModel(tf.keras.Model):
  """Candidate tower: maps raw product-id strings to dense embedding vectors.

  Args:
    unique_product_ids: sequence of known product-id strings, used as the
      lookup vocabulary.
    embedding_dim: width of each embedding vector. Defaults to 32, the value
      that was previously hard-coded.
  """

  def __init__(self, unique_product_ids, embedding_dim=32):
    super().__init__()

    self.product_embedding = tf.keras.Sequential([
      # Stable (non-experimental) location of the layer; the
      # `layers.experimental.preprocessing` alias is deprecated.
      tf.keras.layers.StringLookup(
        vocabulary=unique_product_ids, mask_token=None),
      # One extra row on top of the vocabulary for ids that are not in it
      # (the lookup layer's out-of-vocabulary index).
      tf.keras.layers.Embedding(len(unique_product_ids) + 1, embedding_dim)
    ])

  def call(self, inputs):
    """Returns the embeddings for a batch of product-id strings."""
    return self.product_embedding(inputs) #shape: (len(inputs), embedding_dim)

In [94]:
# Define the two-tower model.
class TwoTowerModel(tfrs.models.Model):
  """Retrieval model that pairs a user tower with a product tower.

  Args:
    user_model: callable mapping the "user_id" feature to query embeddings.
    product_model: callable mapping the "appid" feature to candidate embeddings.
    task: callable that turns (query embeddings, candidate embeddings) into
      the training loss.
  """

  def __init__(self, user_model, product_model, task):
    super().__init__()
    # Assignment order is kept as-is so the sub-models are tracked in the
    # same order as before.
    self.user_model = user_model
    self.product_model = product_model
    self.task = task

  def compute_loss(self, features, training=False):
    """Computes the task loss for one batch of interactions.

    `features` must provide "user_id" and "appid" entries. `training` is
    accepted for interface compatibility but is not used by this
    implementation.
    """
    query_vectors = self.user_model(features["user_id"])
    candidate_vectors = self.product_model(features["appid"])

    loss = self.task(query_vectors, candidate_vectors)
    return loss

In [95]:
# Instantiate and compile the model.
user_model = UserModel(unique_user_ids)
product_model = ProductModel(unique_product_ids)

# Calculate embeddings for all products.
product_embeddings = tf.data.Dataset.from_tensor_slices(unique_product_ids).batch(128).map(product_model)

# Sanity check: embedding the vocabulary in batches of 128 must match embedding
# it in a single call. The model is invoked through __call__ (not .call()
# directly) so Keras runs its usual input handling around the forward pass.
emb1 = np.vstack(list(product_embeddings))
emb2 = product_model(unique_product_ids)
print(np.all(emb1 == emb2))

# Specify the task.
# The top-K metric ranks against the embeddings of every known product.
task = tfrs.tasks.Retrieval(
    metrics=tfrs.metrics.FactorizedTopK(candidates=product_embeddings)
)

model = TwoTowerModel(user_model, product_model, task)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

True


In [96]:
# Train for 4 epochs.
#
# batch(50) groups 50 (user_id, appid) interactions into one training step.
# The retrieval task contrasts each pair with the other candidates in the same
# batch, which serve as its negatives. The interaction rows appear to be
# grouped by user (see the head() preview), so unshuffled batches would be
# dominated by a single user's library and that user's own games would be
# scored as negatives. Shuffle across the full table before batching; the
# buffer covers every row, and the seed fixes the shuffle order.
train_batches = ds_users.shuffle(len(df_users), seed=42).batch(50)
model.fit(train_batches, epochs=4)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f97c616e140>

In [97]:
# How many recommendations
K = 5

# Use brute-force search to set up retrieval using the trained representations.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model, k=K)

# Populate the index with the embeddings from the product model. ds_games is
# already batched (32 appids per element), so the embeddings are computed one
# batch at a time.
product_embeddings_to_index = ds_games.map(lambda app_ids: model.product_model(app_ids))
index.index_from_dataset(product_embeddings_to_index)

# Get some recommendations.
# Only embeddings were indexed (no identifiers), so the values printed below
# are positions of the candidates in indexing order, not appids.
user_id = "76767"
_, products = index(np.array([user_id]))
print(f"Top {K} recommendations for user {user_id}: {products[0, :K]}")

Top 3 recommendations for user 76767: [351  24 458 238  25]


In [98]:
# How many recommendations
K = 5

# Create a retrieval layer that takes raw user ids as queries and recommends
# games out of the entire games dataset.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
# Index (identifier, embedding) pairs: each batch of appid strings is zipped
# with its product-model embeddings, so queries return appids.
index.index_from_dataset(
  tf.data.Dataset.zip((ds_games, ds_games.map(model.product_model)))
)

# Get recommendations.
# `titles` holds appid strings here (not game titles); only the first K are shown.
my_user_id = '76767'
_, titles = index(tf.constant([my_user_id]))
print(f"Recommendations for user {my_user_id}: {titles[0, :K]}")

Recommendations for user 76767: [b'301520' b'10180' b'363970' b'50130' b'10090']
