# Imports

In [None]:
#!pip install -q tensorflow-recommenders
#!pip install -q scann

In [1]:
import os
import pprint
import tempfile

from typing import Dict, Text

import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_recommenders as tfrs

2023-07-29 00:22:47.493177: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-29 00:22:47.531064: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-29 00:22:47.531597: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Data Prep

In [2]:
# Directory holding both cleaned CSV exports.
DATA_DIR = 'clean_datasets/final2'

# User actions (buy/play associated with games): each user's game library, with metadata for each game.
df_users = pd.read_csv(f'{DATA_DIR}/user_steam_lib_final.csv')
# Games: unique games by appid, with metadata such as price, ratings, genre and tags.
df_games = pd.read_csv(f'{DATA_DIR}/games_list_final.csv')

In [3]:
# Explicit dtypes for every column (the original note says "appid", "english" and other
# numeric columns arrive as float and are converted to int here).

# Per-game metadata columns present in both frames.
game_column_dtypes = {
    "name": 'string',
    "og_name": 'string',
    "appid": 'int',
    "release_date": 'string',
    "english": 'int',
    "developer": 'string',
    "publisher": 'string',
    "platforms": 'string',
    "required_age": 'int',
    "categories": 'string',
    "genres": 'string',
    "steamspy_tags": 'string',
    "achievements": 'int',
    "positive_ratings": 'int',
    "negative_ratings": 'int',
    "average_playtime": 'int',
    "median_playtime": 'int',
    "owners": 'string',
    "price": 'float64',
}

# Columns that are only cast on the user-library frame.
user_only_dtypes = {"user_id": 'string', "play": 'int', "purchase": 'int'}

df_games = df_games.astype(game_column_dtypes)
df_users = df_users.astype({**user_only_dtypes, **game_column_dtypes})

In [4]:
# Convert each dataframe to a tf.data.Dataset: dict(df) gives a column-name -> column
# mapping, which from_tensor_slices turns into one feature dict per row.
user_columns = dict(df_users)
game_columns = dict(df_games)
ds_users = tf.data.Dataset.from_tensor_slices(user_columns)
ds_games = tf.data.Dataset.from_tensor_slices(game_columns)

# list(ds_games.as_numpy_iterator())[0]

In [5]:
# list(ds_users.as_numpy_iterator())[0]

In [6]:
# Select fields: keep only the two ID features that are fed to the model.
def _select_id_features(row):
    """Return the user id and the appid (cast from int to a tf string) of one row."""
    return {
        'user_id': row['user_id'],
        'appid': tf.strings.as_string(row['appid']),
    }


ds_users = ds_users.map(_select_id_features)

# Candidate games: batches of 32 appids, cast from int to tf string.
ds_games = ds_games.batch(32).map(lambda batch: tf.strings.as_string(batch['appid']))

# Batch size used when collecting the ID columns below.
ID_BATCH_SIZE = 1_000_000

# Get all the user IDs
user_ids = ds_users.batch(ID_BATCH_SIZE).map(lambda batch: batch["user_id"])

# Get all the product IDs (read from the user-interaction dataset, not from ds_games)
product_ids = ds_users.batch(ID_BATCH_SIZE).map(lambda batch: batch["appid"])

In [7]:
# Materialise the batched appid column, then reduce it to a sorted array of unique str ids.
product_id_batches = list(product_ids.as_numpy_iterator())
unique_product_ids = np.unique(np.concatenate(product_id_batches)).astype(str)

In [8]:
# Materialise the batched user_id column, then reduce it to a sorted array of unique str ids.
user_id_batches = list(user_ids.as_numpy_iterator())
unique_user_ids = np.unique(np.concatenate(user_id_batches)).astype(str)

In [9]:
# print("user_ids :", list(user_ids.as_numpy_iterator()))

# print("product_ids :", list(product_ids.as_numpy_iterator()))

In [10]:
# print("ds_users :", list(ds_users.as_numpy_iterator())[0])

# print("user_ids :", list(user_ids.as_numpy_iterator()))

# print("product_ids :", list(product_ids.as_numpy_iterator()))

In [11]:
# print(len(unique_user_ids))
# print(len(unique_product_ids))

# Preprocess the input: map a list of appids to the most similar existing user_id
# Similarity between game lists (given as appids) is computed with difflib

In [12]:
import difflib

In [13]:
# Sample inputs: game libraries expressed as lists of appid strings.
input_games = ['630', '255710', '10', '240', '30', '40', '540']
input_games2 = list(input_games)  # separate list holding the same appids
teamfort = ['440']

# Name/appid lookup frame; appid is cast to int first and then to string.
df_appid = (
    df_games[['og_name', 'appid']]
    .astype({"appid": 'int'})
    .astype({"og_name": 'string', "appid": 'string'})
)
df_appid

Unnamed: 0,og_name,appid
0,Alien Swarm,630
1,Cities Skylines,255710
2,Counter-Strike,10
3,Counter-Strike Source,240
4,Day of Defeat,30
...,...,...
1358,60 Seconds!,368360
1359,Five Nights at Freddy's 3,354140
1360,Victim of Xen,300220
1361,Metal War Online Retribution,412470


In [14]:
# (user_id, appid) pairs; appid is cast to int first and then to string.
df_user_lib = (
    df_users[["user_id", "appid"]]
    .astype({"appid": 'int'})
    .astype({"user_id": 'string', "appid": 'string'})
)
df_user_lib

Unnamed: 0,user_id,appid
0,5250,630
1,5250,255710
2,5250,10
3,5250,240
4,5250,30
...,...,...
83916,309554670,389570
83917,309812026,273110
83918,309812026,301520
83919,309824202,570


In [15]:
# Distinct user ids taken from the pandas frame, in order of first appearance.
uniq_user_ids = df_user_lib["user_id"].unique()
uniq_user_ids

<StringArray>
[     '5250',     '76767',     '86540',    '103360',    '144736',    '181212',
    '229911',    '298950',    '299153',    '381543',
 ...
 '309228590', '309255941', '309262440', '309265377', '309404240', '309434439',
 '309554670', '309812026', '309824202', '309903146']
Length: 11413, dtype: string

In [16]:
# Build the lookup used by the similarity search:
# key = user_id, value = list of the appids in that user's library.
appids_by_user = df_user_lib.groupby('user_id')['appid']
d = appids_by_user.apply(list).to_dict()
d

{'100053304': ['570', '335820', '206480', '24240', '620', '212500'],
 '100057229': ['219640', '17390', '24720', '72850'],
 '100070732': ['33900', '33930', '255710', '222750', '58610', '251060'],
 '100096071': ['113400',
  '24200',
  '9050',
  '570',
  '227300',
  '22380',
  '13520',
  '220240',
  '233270',
  '206210',
  '70',
  '220',
  '320',
  '380',
  '420',
  '340',
  '130',
  '360',
  '50',
  '280',
  '8190',
  '24400',
  '249130',
  '211500',
  '301520',
  '20',
  '72850',
  '267530',
  '304930'],
 '100168166': ['440'],
 '100267049': ['8930'],
 '100311267': ['113400',
  '346900',
  '203290',
  '57300',
  '266430',
  '304030',
  '8870',
  '21680',
  '10090',
  '275490',
  '287100',
  '730',
  '273110',
  '223710',
  '24200',
  '224600',
  '70000',
  '333930',
  '220440',
  '317360',
  '206480',
  '253980',
  '377160',
  '265630',
  '4000',
  '227940',
  '287120',
  '254060',
  '253900',
  '392950',
  '226720',
  '283370',
  '39000',
  '109600',
  '224260',
  '218620',
  '24240',
 

In [17]:
# Library (list of appid strings) of sample user 76767; None when the user is not in d.
u1 = d.get('76767', None)
# u1

In [18]:
# Library (list of appid strings) of sample user 5250; None when the user is not in d.
u0 = d.get('5250', None)
# u0

In [19]:
# display(u1)
# display(input_games)
# display(teamfort)

In [20]:
# test the SequenceMatcher similarity calculator for 2 sample lists of appids
# sm = difflib.SequenceMatcher(None, u1, input_games)
# sm.ratio()

In [21]:
# iterate through the unique_user_ids list, compute similarity with the user_input_list
# return a list where the first element is the most similar user ID, 2nd element is the similarity score
def find_similarity(user_lib_dict, user_input_list, unique_user_ids):
    """Find the known user whose game library is most similar to a list of appids.

    Similarity is ``difflib.SequenceMatcher(...).ratio()`` with the user's library
    as the first sequence and ``user_input_list`` as the second, so the score
    depends on the order of the appids and not only on which ones are shared.

    Args:
        user_lib_dict: Mapping of user id -> list of appids in that user's library.
        user_input_list: List of appids that every library is compared against.
        unique_user_ids: Iterable of user ids to consider.

    Returns:
        ``[most_similar_user_id, similarity]``. If no candidate scores above 0
        the result is ``['dummy', 0]``. On ties the first user encountered wins.
    """
    most_id = 'dummy'
    most_sim = 0
    # SequenceMatcher caches its analysis of the second sequence, so build the
    # matcher once and only swap the first sequence for each user.
    matcher = difflib.SequenceMatcher(None, [], user_input_list)
    for user_id in unique_user_ids:
        library = user_lib_dict.get(user_id)
        if library is None:
            # Id without a library: there is nothing to compare. Passing None to
            # SequenceMatcher would raise a TypeError when the ratio is computed.
            continue
        matcher.set_seq1(library)
        similarity = matcher.ratio()
        if similarity > most_sim:
            most_id = user_id
            most_sim = similarity
    return [most_id, most_sim]

In [22]:
# Most similar existing user for the sample appid list, shown as [user_id, similarity].
display(find_similarity(d, input_games, uniq_user_ids))

['49319563', 0.6153846153846154]

In [23]:
# Show the library (list of appids) stored for user 49319563.
display(d.get('49319563'))

['10', '80', '240', '30', '40', '60']

In [24]:
# Number of distinct user ids collected from the tf.data pipeline
# (unique_user_ids, as opposed to the pandas-derived uniq_user_ids).
len(unique_user_ids)

11413

# Two Towers (IDs only)

In [25]:
# User and Product models.
class UserModel(tf.keras.Model):
  """Query tower: turns raw user-id strings into dense embedding vectors.

  Args:
    unique_user_ids: Vocabulary of known user ids handed to the StringLookup layer.
    embedding_dim: Width of each embedding vector. Defaults to 32, the value that
      used to be hard-coded. NOTE(review): the retrieval task compares this tower's
      output with the product tower's, so both are presumably expected to share
      the same width -- confirm before overriding only one of them.
  """

  def __init__(self, unique_user_ids, embedding_dim=32):
    super().__init__()

    self.user_embedding = tf.keras.Sequential([
      # String id -> integer index. Uses the stable tf.keras.layers.StringLookup
      # path instead of the deprecated experimental.preprocessing alias.
      tf.keras.layers.StringLookup(
        vocabulary=unique_user_ids, mask_token=None),
      # One extra row on top of the vocabulary for ids the lookup does not know.
      tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dim),
    ])

  def call(self, inputs):
    """Return one embedding vector of width ``embedding_dim`` per input id."""
    return self.user_embedding(inputs)

class ProductModel(tf.keras.Model):
  """Candidate tower: turns raw product-id (appid) strings into dense embedding vectors.

  Args:
    unique_product_ids: Vocabulary of known product ids handed to the StringLookup layer.
    embedding_dim: Width of each embedding vector. Defaults to 32, the value that
      used to be hard-coded. NOTE(review): the retrieval task compares this tower's
      output with the user tower's, so both are presumably expected to share
      the same width -- confirm before overriding only one of them.
  """

  def __init__(self, unique_product_ids, embedding_dim=32):
    super().__init__()

    self.product_embedding = tf.keras.Sequential([
      # String id -> integer index. Uses the stable tf.keras.layers.StringLookup
      # path instead of the deprecated experimental.preprocessing alias.
      tf.keras.layers.StringLookup(
        vocabulary=unique_product_ids, mask_token=None),
      # One extra row on top of the vocabulary for ids the lookup does not know.
      tf.keras.layers.Embedding(len(unique_product_ids) + 1, embedding_dim)
    ])

  def call(self, inputs):
    """Return one embedding vector of width ``embedding_dim`` per input id."""
    return self.product_embedding(inputs)

In [26]:
# Define the two-tower model.
class TwoTowerModel(tfrs.models.Model):
  """Two-tower retrieval model: a user tower, a product tower and the task that scores them."""

  def __init__(self, user_model, product_model, task):
    """Store the two towers and the task object that turns their outputs into a loss."""
    super().__init__()
    self.user_model = user_model
    self.product_model = product_model
    self.task = task

  def compute_loss(self, features, training=False):
    """Embed the "user_id" and "appid" features and return the task's loss for the pairs.

    ``training`` is accepted but not used by this implementation.
    """
    query_vectors = self.user_model(features["user_id"])
    candidate_vectors = self.product_model(features["appid"])
    return self.task(query_vectors, candidate_vectors)

In [27]:
# Instantiate and compile the model.
# Each tower is built from the ID vocabulary computed earlier in the notebook.
user_model = UserModel(unique_user_ids)
product_model = ProductModel(unique_product_ids)

# Calculate embeddings for all products.
# This is a dataset that runs the unique product ids through product_model in batches of 128.
product_embeddings = tf.data.Dataset.from_tensor_slices(unique_product_ids).batch(128).map(product_model)

# Sanity check: stacking the dataset's batches (emb1) and calling the tower directly on
# all ids (emb2) should give the same values; prints True when every element is equal.
emb1 = np.vstack(list(product_embeddings))
emb2 = product_model.call(unique_product_ids)
print(np.all(emb1 == emb2))

# Specify the task.
# The FactorizedTopK metric receives the product-embedding dataset as its candidates.
task = tfrs.tasks.Retrieval(
    metrics=tfrs.metrics.FactorizedTopK(candidates=product_embeddings)
)

# Combine both towers with the task and compile with the Adagrad optimizer (argument 0.1).
model = TwoTowerModel(user_model, product_model, task)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

True


In [28]:
# Train for 4 epochs.
# The interaction rows are grouped by user, and the retrieval task treats the other
# rows of a batch as negatives. Shuffle before batching so that a batch of 50 is not
# dominated by games the same user actually owns.
# The buffer spans the whole dataset (a full shuffle); the seed fixes the shuffle order.
model.fit(
    ds_users.shuffle(buffer_size=len(df_users), seed=42).batch(50),
    epochs=4,
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.src.callbacks.History at 0x7f01586b7f90>

In [29]:
# How many recommendations
K = 5

# Use brute-force search to set up retrieval using the trained representations.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model, k=K)

# Populate the index with the embeddings from the product model.
# ds_games is already batched (32 appids per batch), so each map call embeds one batch.
product_embeddings_to_index = ds_games.map(
    lambda appid_batch: model.product_model(appid_batch))
index.index_from_dataset(product_embeddings_to_index)

# Get some recommendations.
# No identifiers were indexed alongside the embeddings, so `products` holds integer
# positions of the candidates rather than appids.
user_id = "76767"
_, products = index(np.array([user_id]))
# The message reports K instead of a hard-coded "3" that did not match the K items shown.
print(f"Top {K} recommendations for user {user_id}: {products[0, :K]}")

Top 3 recommendations for user 76767: [  67  206 1193  448  222]


In [30]:
# How many recommendations
K = 5

# Create an index that takes raw user ids as queries and recommends games out of the
# whole games dataset; each embedding batch is zipped with its appid batch so that
# the index returns appids.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
candidates_with_ids = tf.data.Dataset.zip((ds_games, ds_games.map(model.product_model)))
index.index_from_dataset(candidates_with_ids)

# Get recommendations.
my_user_id = '76767'
_, titles = index(tf.constant([my_user_id]))

# The ids come back as bytes; decode them to str and print one per line.
byte_string = titles[0].numpy().tolist()
new_string = list(map(bytes.decode, byte_string))
for appid in new_string:
    print(appid)
print(f"Recommendations for user {my_user_id}: {titles[0]}")

346110
304930
335330
107410
236390
317470
10090
301520
339280
259130
Recommendations for user 76767: [b'346110' b'304930' b'335330' b'107410' b'236390' b'317470' b'10090'
 b'301520' b'339280' b'259130']


In [31]:
# Export the query model to a fixed directory.
path = 'tf_models/model2'

# Save the index.
tf.saved_model.save(obj=index, export_dir=path)

# Load it back; can also be done in TensorFlow Serving.
loaded = tf.saved_model.load(export_dir=path)

# Pass a user id in, get the top predicted appids back.
scores, titles = loaded(["76767"])

print(f"Recommendations: {titles[0][:10]}")









INFO:tensorflow:Assets written to: tf_models/model2/assets


INFO:tensorflow:Assets written to: tf_models/model2/assets


Recommendations: [b'346110' b'304930' b'335330' b'107410' b'236390' b'317470' b'10090'
 b'301520' b'339280' b'259130']


In [32]:
# Show the directory the index was exported to.
path

'tf_models/model2'

# ScaNN Model

In [36]:
# install SCANN
!pip install -q scann

[31mERROR: Could not find a version that satisfies the requirement scann (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for scann[0m[31m
[0m

In [34]:
# Use the TFRS ScaNN layer to build an index.
# Each batch of product embeddings is zipped with its batch of appids, which serve as identifiers.
scann_index = tfrs.layers.factorized_top_k.ScaNN(model.user_model)
scann_candidates = tf.data.Dataset.zip((ds_games, ds_games.map(model.product_model)))
scann_index.index_from_dataset(scann_candidates)

ImportError: The scann library is not present. Please install it using `pip install scann` to use the ScaNN layer.

In [None]:
# Get recommendations: query the ScaNN index with a one-element batch of user ids.
my_user_id = '76767'
query_batch = tf.constant([my_user_id])
_, titles = scann_index(query_batch)
print(f"Recommendations for user {my_user_id}: {titles[0, :K]}")

In [None]:
path2 = 'tf_models/scann_model2'

# Save the index.
# The ScaNN layer relies on custom ops in the "Scann" namespace; the TFRS tutorial
# whitelists that namespace through SaveOptions when exporting a ScaNN index.
tf.saved_model.save(
    scann_index,
    path2,
    options=tf.saved_model.SaveOptions(namespace_whitelist=["Scann"]),
)

# Load it back; can also be done in TensorFlow Serving.
loaded2 = tf.saved_model.load(path2)

In [None]:
# K: how many recommendations to show (at most 10); my_user_id: the user to query for.
K, my_user_id = 5, '76767'

# Pass a user id in, get top predicted game titles back.
query_batch = [my_user_id]
scores, titles = loaded2(query_batch)

print(f"Recommendations for user_id {my_user_id}: {titles[0][:K]}")