In [84]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import random
from sklearn.metrics import roc_curve, auc, average_precision_score

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

#from subprocess import check_output
#print(check_output(["ls", "../input"]).decode("utf8"))

# Any results you write to the current directory are saved as output.

In [2]:
#path = '../input/steam-200k.csv'
path = 'steam-200k.csv'
df = pd.read_csv(path, header = None,
                 names = ['UserID', 'Game', 'Action', 'Hours', 'Not Needed'])
df.head()

Unnamed: 0,UserID,Game,Action,Hours,Not Needed
0,151603712,The Elder Scrolls V Skyrim,purchase,1.0,0
1,151603712,The Elder Scrolls V Skyrim,play,273.0,0
2,151603712,Fallout 4,purchase,1.0,0
3,151603712,Fallout 4,play,87.0,0
4,151603712,Spore,purchase,1.0,0


In [3]:
df['Hours_Played'] = df['Hours'].astype('float32')

In [4]:
df.loc[(df['Action'] == 'purchase') & (df['Hours'] == 1.0), 'Hours_Played'] = 0

In [5]:
df.UserID = df.UserID.astype('int')
df = df.sort_values(['UserID', 'Game', 'Hours_Played'])

In [6]:
clean_df = df.drop_duplicates(['UserID', 'Game'], keep = 'last').drop(['Action', 'Hours', 'Not Needed'], axis = 1)
clean_df.head()

Unnamed: 0,UserID,Game,Hours_Played
65430,5250,Alien Swarm,4.9
65424,5250,Cities Skylines,144.0
65435,5250,Counter-Strike,0.0
65436,5250,Counter-Strike Source,0.0
65437,5250,Day of Defeat,0.0


In [7]:
n_users = len(clean_df.UserID.unique())
n_games = len(clean_df.Game.unique())

print('There are {0} users and {1} games in the data'.format(n_users, n_games))

There are 12393 users and 5155 games in the data


In [8]:
sparsity = clean_df.shape[0] / float(n_users * n_games)
print('{:.2%} of the user-item matrix is filled'.format(sparsity))

0.20% of the user-item matrix is filled


In [9]:
from collections import Counter

user_counter = Counter()
for user in clean_df.UserID.tolist():
    user_counter[user] +=1

game_counter = Counter()
for game in clean_df.Game.tolist():
    game_counter[game] += 1

In [10]:
user2idx = {user: i for i, user in enumerate(user_counter.keys())}
idx2user = {i: user for user, i in user2idx.items()}

game2idx = {game: i for i, game in enumerate(game_counter.keys())}
idx2game = {i: game for game, i in game2idx.items()}

In [11]:
user_idx = clean_df['UserID'].apply(lambda x: user2idx[x]).values
game_idx = clean_df['gameIdx'] = clean_df['Game'].apply(lambda x: game2idx[x]).values
pref = np.repeat([1], clean_df.shape[0])
hours = clean_df['Hours_Played'].values

In [12]:
#from scipy.sparse import csr_matrix
#user_game_matrix = csr_matrix((pref, (user_idx, game_idx)), shape = (n_users, n_games))
#interactions_matrix = csr_matrix((hours, (user_idx, game_idx)), shape = (n_users, n_games))
zero_matrix = np.zeros(shape = (n_users, n_games))
user_game_pref = zero_matrix.copy()
user_game_pref[user_idx, game_idx] = 1

user_game_interactions = zero_matrix.copy()
user_game_interactions[user_idx, game_idx] = hours + 1

In [169]:
k = 5

# Count the number of purchases for each user
purchase_counts = np.apply_along_axis(np.bincount, 1, user_game_pref.astype(int))
buyers_idx = np.where(purchase_counts[:, 1] >= 2 * k)[0] #find the users who purchase 2 * k games
print('{0} users bought {1} or more games'.format(len(buyers_idx), 2 * k))

2189 users bought 10 or more games


In [170]:
test_frac = 0.4
test_users_idx = np.random.choice(buyers_idx,
                                  size = int(np.ceil(len(buyers_idx) * test_frac)),
                                  replace = False)

In [171]:
val_users_idx = test_users_idx[:int(len(test_users_idx) / 2)]
test_users_idx = test_users_idx[int(len(test_users_idx) / 2):]

In [176]:
train_matrix = user_game_pref.copy()
test_matrix = zero_matrix.copy()
for user in test_users_idx:
    purchases = np.where(user_game_pref[user, :] == 1)[0]
    test_purchases = np.random.choice(purchases, size = k, replace = False)
    
    train_matrix[user, test_purchases] = 0
    test_matrix[user, test_purchases] = user_game_pref[user, test_purchases]

In [173]:
test_matrix[test_users_idx[0], test_matrix[test_users_idx[0], :].nonzero()[0]]

array([ 1.,  1.,  1.,  1.,  1.])

In [174]:
train_matrix[test_users_idx[0], test_matrix[test_users_idx[0], :].nonzero()[0]]

array([ 0.,  0.,  0.,  0.,  0.])

In [13]:
#nonzero_users, nonzero_games = user_game_pref.nonzero()

In [14]:
#nonzero_pairs = list(zip(nonzero_users, nonzero_games))

In [15]:
#np.random.shuffle(nonzero_pairs)

In [16]:
#test_idx = int(np.ceil(len(nonzero_pairs) * 0.4))
#val_idx = int(np.ceil(test_idx * 0.5))

In [17]:
#val_pairs = nonzero_pairs[:val_idx]
#test_pairs = nonzero_pairs[val_idx:test_idx]

In [18]:
#training_matrix = user_game_pref.copy()
#for pair in val_pairs + test_pairs:
#    training_matrix[pair[0], pair[1]] = 0

In [39]:
#val_users_idx = np.unique(np.asarray(val_pairs)[:, 0])
#test_users_idx = np.unique(np.asarray(test_pairs)[:, 0])

In [238]:
tf.reset_default_graph()
pref = tf.placeholder(tf.float32, (n_users, n_games))
interactions = tf.placeholder(tf.float32, (n_users, n_games))
users_idx = tf.placeholder(tf.int32, (None))

In [239]:
n_features = 30
X = tf.Variable(tf.random_normal([n_users, n_features], mean = 0, stddev = 0.1))
Y = tf.Variable(tf.random_normal([n_games, n_features], mean = 0, stddev = 0.1))
conf_alpha = tf.Variable(tf.random_uniform([1], 0, 1))

In [240]:
pred_pref = tf.matmul(X, Y, transpose_b=True)
conf = 1 + conf_alpha * interactions

In [241]:
cost = tf.reduce_sum(tf.multiply(conf, tf.square(tf.subtract(pref, pred_pref))))
l2_sqr = tf.nn.l2_loss(X) + tf.nn.l2_loss(Y)
lambda_c = 0.01
loss = cost + lambda_c * l2_sqr

In [242]:
optimize = tf.train.AdamOptimizer(0.02).minimize(loss)

In [243]:
def k_precision(pred, train, test, k, user_idx, training = False):
    precisions = []
    
    
    for user in user_idx:
        rec = np.argsort(-pred[user, :])
        if not training:
            purchase_history = np.where(train[user, :] != 0)[0]
            mat = test
            
            rec = rec[~np.in1d(rec, purchase_history)]
        else:
            mat = train
    
        top_k = rec[:k]
        labels = np.where(mat[user, :] != 0)[0]
        
        precision = len(set(top_k) & set(labels)) / float(k)
        precisions.append(precision)
    return np.mean(precisions)

In [244]:
iterations = 70
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    for i in range(iterations):
        sess.run(optimize, feed_dict = {pref: train_matrix,
                                        interactions: user_game_interactions})
        
        if i % 10 == 0:
            mod_loss = sess.run(loss, feed_dict = {pref: train_matrix,
                                                   interactions: user_game_interactions})            
            mod_pred = pred_pref.eval()
            train_precision = k_precision(mod_pred, train_matrix, test_matrix, k, test_users_idx, training = True)
            test_precision = k_precision(mod_pred, train_matrix, test_matrix, k, test_users_idx, training = False)
            
            print('Iterations {0}...'.format(i),
                  'Training Loss {:.2f}...'.format(mod_loss),
                  'Train Precision {:.3f}'.format(train_precision),
                  'Test Precision {:.3f}'.format(test_precision)
                )

    rec = pred_pref.eval()

Iterations 0... Training Loss 62212232.00... Train Precision 0.021 Test Precision 0.002
Iterations 10... Training Loss 13916562.00... Train Precision 0.380 Test Precision 0.019
Iterations 20... Training Loss 5316009.00... Train Precision 0.329 Test Precision 0.027
Iterations 30... Training Loss 2672846.00... Train Precision 0.315 Test Precision 0.033
Iterations 40... Training Loss 1970542.25... Train Precision 0.352 Test Precision 0.030
Iterations 50... Training Loss 1620559.25... Train Precision 0.375 Test Precision 0.037
Iterations 60... Training Loss 1438968.00... Train Precision 0.383 Test Precision 0.038


In [245]:
users = np.random.choice(test_users_idx, size = 10, replace = False)
rec_games = np.argsort(-rec)

In [246]:
for user in users:
    print('Recommended Games for {0} are ...'.format(idx2user[user]))
    purchase_history = np.where(train_matrix[user, :] != 0)[0]
    recommendations = rec_games[user, :]

    
    new_recommendations = recommendations[~np.in1d(recommendations, purchase_history)][:k]
    
    #ground_truth = clean_df[clean_df['UserID'] == idx2user[user]]['gameIdx'].values
    print('User bought these games')
    print(', '.join([idx2game[purchase] for purchase in purchase_history.tolist()]))
    print('\n')
    print('We recommend these games')
    print(', '.join([idx2game[game] for game in new_recommendations]))
    print('\n')
    print('The games that the user actually purchased are ...')
    print(', '.join([idx2game[game] for game in np.where(test_matrix[user, :] != 0)[0]]))
    print('\n')
    print('Precision of {0}'.format(len(set(new_recommendations) & set(np.where(test_matrix[user, :] != 0)[0])) / float(k)))
    print('--------------------------------------')
    print('\n')

Recommended Games for 86912006 are ...
User bought these games
Nosgoth, Divine Souls, World of Guns Gun Disassembly, Combat Arms, Canyon Capers, The Darkness II, Orborun, Uriel's Chasm, Famaze, Dungeon Party, Mabinogi, EverQuest II, Retro/Grade IGF Demo, Mount & Blade, Saints Row The Third, Dino D-Day, Coil, Arma Gold Edition, Gumboy Crazy Adventures, Make it indie!, Space Hack, S.K.I.L.L. - Special Force 2, Age of Empires Online, The Incredible Adventures of Van Helsing, War Thunder, Heavy Fire Afghanistan, Tropico 3 - Steam Special Edition, Grimoire Manastorm, CrimeCraft GangWars, Windosill, Puzzle Pirates, Sky Battles, Galactic Civilizations II Ultimate Edition, Villagers and Heroes, Heroes & Generals, Star Trek Online, PAYDAY The Web Series - Episode 1, Tropico 4, Super Crate Box, The Ship Single Player, Global Agenda, The Great Jitters Pudding Panic, Robotex, Crystal Rift, Conquest of Champions, Starlaxis Supernova Edition, Loadout, Grass Simulator, Tiny and Big Grandpa's Leftover