In [82]:
import pandas as pd
import numpy as np

# 1. Data Preparation

In [1]:
max_item_id = 1000		# use as a way to constrict memory errors, increase as needed

## 1.1 Playtime Matrix

Source file: User_items.json 

{'user_id': '76561197970982479',
 'items_count': 277,
 'steam_id': '76561197970982479',
 'user_url': 'http://steamcommunity.com/profiles/76561197970982479',
 'items': [{'item_id': '10',
 'item_name': 'Counter-Strike',
 'playtime_forever': 6,
 'playtime_2weeks': 0},
 {'item_id': '20',
 'item_name': 'Team Fortress Classic',
 'playtime_forever': 0,
 'playtime_2weeks': 0},}
 
 
 We reorganize it as a user_item matrix. This is the main input layer.

## To do
    
1. Instead of Setting limit on games, want to filter out games that are played by less than n people.

In [2]:
import json
import pandas as pd
import ast
import numpy as np
from collections import OrderedDict
from sklearn.preprocessing import MinMaxScaler

filepath = './user_items.json'
writefilepath = './user_items_playtime.csv'

data = []
game_ids_valid = set()	# We only want to have game ids that appear in any user's game list

num_games_considered = 0 # Just for profiling
num_valid_users = 0		# Also profiling

with open(filepath,encoding='gb18030',errors='ignore') as f:
	for line in f:
		l = ast.literal_eval(line)	# Convert single quotes in json to proper double quotes

		# Don't consider a user that doesn't own any games
		if int(l["items_count"]) > 0:		
			games = l["items"]
			games_filter = []
			for g in games:
				playtime = g["playtime_forever"]
				if playtime > 0:	# Also don't consider games not played
					game_id = int(g["item_id"])
					if game_id <=  max_item_id: 		# Set limit on games due to space
						games_filter.append({game_id: playtime})
						game_ids_valid.add(game_id)
						num_games_considered += 1

			if len(games_filter) >= 1:		
				num_valid_users += 1	
				data.append({l["user_id"]: games_filter})

print(max_item_id)
print(num_games_considered)
print(len(game_ids_valid))
print(num_valid_users)

1000
215786
24
56810


In [4]:
data

[{'76561197970982479': [{10: 6},
   {30: 7},
   {300: 4733},
   {240: 1853},
   {220: 696},
   {340: 37},
   {380: 168},
   {400: 173},
   {420: 323},
   {500: 513},
   {550: 1474},
   {620: 887},
   {730: 23532}]},
 {'js41637': [{300: 220},
   {240: 62},
   {220: 750},
   {340: 21},
   {380: 181},
   {400: 169},
   {420: 295},
   {620: 3464},
   {550: 83},
   {730: 265}]},
 {'evcentric': [{220: 1323},
   {340: 90},
   {380: 234},
   {400: 113},
   {420: 507},
   {550: 82},
   {620: 1485},
   {730: 1569}]},
 {'Riot-Punch': [{300: 67},
   {240: 2304},
   {220: 92},
   {400: 1234},
   {420: 119},
   {500: 103},
   {550: 330},
   {620: 5138},
   {730: 4167}]},
 {'doctr': [{300: 1131},
   {20: 89},
   {50: 178},
   {70: 108},
   {130: 313},
   {10: 93},
   {30: 16},
   {40: 4},
   {80: 13},
   {100: 47},
   {220: 89},
   {320: 187},
   {340: 19},
   {380: 491},
   {400: 167},
   {420: 233},
   {550: 9216},
   {240: 164},
   {620: 802},
   {730: 19800}]},
 {'MinxIsBetterThanPotatoes': [{50:

In [5]:
# Now with format as data (list of nested JSON objects) = 	[	{user_id: [	{item_id: playtime_forever}		]	}	]
# Convert to dataframe where
#		item_id		0		..		max_item_id		
#  user_id 			
#	x1				playtime_forever
#	...
#	xn


# Create list of game_id's only based on the valid game ids
game_list = list(game_ids_valid)
game_list = sorted(game_list)
print(*game_list)
games_dict = OrderedDict()

df = pd.DataFrame(columns = list(game_list))
items_list = np.zeros(len(game_list))


for user in data:
	
	# Clear entries of sorted Dictionary (and creates the first set of key-value pairs)
	for k in game_list:	
		games_dict[k] = 0

	key = next(iter(user))	# Note there's only one key. User is {user_id: [...]}
	df.loc[key] = 0
	items = user[key]
	for i in items:
		[(k, v)] = i.items()
		games_dict[k] = v

	# Make the list based on the sorted values in dict
	val = list(games_dict.values())
	for i in range(len(game_list)):
		df.loc[key][game_list[i]] = val[i]

print(df)

scaler = MinMaxScaler(feature_range=(0,10))
scaled_values = scaler.fit_transform(df)
df.loc[:,:] = scaled_values

print(df)

df.to_csv(writefilepath, index=False)

10 20 30 40 50 60 70 80 100 130 220 240 280 300 320 340 360 380 400 420 500 550 620 730
                           10   20   30   40   50   60    70   80   100  130  \
76561197970982479            6    0    7    0    0    0     0    0    0    0   
js41637                      0    0    0    0    0    0     0    0    0    0   
evcentric                    0    0    0    0    0    0     0    0    0    0   
Riot-Punch                   0    0    0    0    0    0     0    0    0    0   
doctr                       93   89   16    4  178    0   108   13   47  313   
MinxIsBetterThanPotatoes     0    0    0    0  256    0     0    0    0    0   
NitemarePK                   0    0    0    0    0    0     0    0    0    0   
themanwich                   0    0    0    0    0    0    32    0    0    0   
maplemage                    0   21    0    0    0    0    10    0    0    0   
corrupted_soul             108    0    0    0   45    0   227  245   37    0   
jorellpogi                   0  

  return self.partial_fit(X, y)


                               10        20        30        40        50   \
76561197970982479         0.000168  0.000000  0.000478  0.000000  0.000000   
js41637                   0.000000  0.000000  0.000000  0.000000  0.000000   
evcentric                 0.000000  0.000000  0.000000  0.000000  0.000000   
Riot-Punch                0.000000  0.000000  0.000000  0.000000  0.000000   
doctr                     0.002611  0.006291  0.001092  0.000541  0.024064   
MinxIsBetterThanPotatoes  0.000000  0.000000  0.000000  0.000000  0.034609   
NitemarePK                0.000000  0.000000  0.000000  0.000000  0.000000   
themanwich                0.000000  0.000000  0.000000  0.000000  0.000000   
maplemage                 0.000000  0.001484  0.000000  0.000000  0.000000   
corrupted_soul            0.003032  0.000000  0.000000  0.000000  0.006084   
jorellpogi                0.000000  0.000000  0.000000  0.000000  0.000000   
cadmusthreepointoh        0.000000  0.000000  0.000000  0.000000

In [6]:
df_matrix = pd.read_csv(writefilepath,engine='python')
df_matrix

Unnamed: 0,10,20,30,40,50,60,70,80,100,130,...,320,340,360,380,400,420,500,550,620,730
0,0.000168,0.000000,0.000478,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.090509,0.0,0.066643,0.085605,0.127577,0.027028,0.042998,0.075009,0.625407
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.051370,0.0,0.071800,0.083626,0.116518,0.000000,0.002421,0.292931,0.007043
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.220157,0.0,0.092824,0.055916,0.200253,0.000000,0.002392,0.125578,0.041699
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.610619,0.047002,0.005427,0.009626,0.434492,0.110746
4,0.002611,0.006291,0.001092,0.000541,0.024064,0.000000,0.014044,0.000643,0.079499,0.042322,...,0.003787,0.046477,0.0,0.194772,0.082636,0.092029,0.000000,0.268841,0.067821,0.526222
5,0.000000,0.000000,0.000000,0.000000,0.034609,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.074404,0.070939,0.0,0.067436,0.136078,0.214867,0.303522,0.324995,0.101393,1.968708
6,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.001721,0.178571,0.0,0.000000,0.207333,0.000000,0.615000,0.222109,0.273059,0.860347
7,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.004161,0.000000,0.000000,0.000000,...,0.000081,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.010210,0.043974,0.312890
8,0.000000,0.001484,0.000000,0.000000,0.000000,0.000000,0.001300,0.000000,0.000000,0.000000,...,0.012171,0.039139,0.0,0.000793,0.238013,0.000395,0.000000,0.065635,0.243968,0.370880
9,0.003032,0.000000,0.000000,0.000000,0.006084,0.000000,0.029519,0.012115,0.062585,0.000000,...,0.003504,0.080724,0.0,0.078940,0.117769,0.110988,0.132979,0.376541,0.274327,0.249105


## 1.2 Game names

In [7]:
import ast
import json
import pandas as pd

filepath = './user_items.json'
writefilepath = './user_game_names.csv'

# max_game_id = 99920 for 100k cap
# 				= 19990 for 20k cap
# total cap is 530720

df = pd.DataFrame(columns = ['userid', 'game_names'])

with open(filepath,encoding='gb18030',errors='ignore') as f:
	i = 0
	for line in f:
		l = ast.literal_eval(line)	# Convert single quotes in json to proper double quotes
	
		# Don't consider a user that doesn't own any games
		if int(l["items_count"]) > 0:
			games = l["items"]
			games_filter = []
			for g in games:
				playtime = g["playtime_forever"]
				if playtime > 0:	# Also don't consider games not played
					game_id = int(g["item_id"])
					if game_id <= max_item_id: 		# Set limit on games due to space
						games_filter.append(g["item_name"])
			
			if len(games_filter) >= 1:
				df.loc[i] = pd.Series({'userid': l["user_id"], 'game_names': games_filter})
				i += 1

df.to_csv(writefilepath, index=False)

In [53]:
df_user_gamenames = pd.read_csv(writefilepath)
df_user_gamenames

Unnamed: 0,userid,game_names
0,76561197970982479,"['Counter-Strike', 'Day of Defeat', 'Day of De..."
1,js41637,"['Day of Defeat: Source', 'Counter-Strike: Sou..."
2,evcentric,"['Half-Life 2', 'Half-Life 2: Lost Coast', 'Ha..."
3,Riot-Punch,"['Day of Defeat: Source', 'Counter-Strike: Sou..."
4,doctr,"['Day of Defeat: Source', 'Team Fortress Class..."
5,MinxIsBetterThanPotatoes,"['Half-Life: Opposing Force', 'Counter-Strike:..."
6,NitemarePK,"['Counter-Strike: Source', 'Half-Life 2: Death..."
7,themanwich,"['Half-Life 2', 'Half-Life 2: Deathmatch', 'Po..."
8,maplemage,"['Counter-Strike: Source', 'Day of Defeat: Sou..."
9,corrupted_soul,"['Counter-Strike', 'Half-Life: Opposing Force'..."


In [33]:
#transform games names from word to int, using one_hot

from keras.preprocessing.text import one_hot

str = "Day of Defeat, Half-Life, Counter-Strike Source"

vocab_size   = 100
encoded_gamenames = [one_hot(d, vocab_size) for d in df_user_gamenames.game_names]

In [35]:
df_user_gamenames_encoded = pd.DataFrame(data=encoded_gamenames)    # values

In [52]:
df_user_gamenames_encoded

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,71,72,73,74,75,76,77,78,79,80
0,9,83.0,90.0,23.0,13.0,90.0,23.0,76.0,64.0,9.0,...,,,,,,,,,,
1,90,23.0,76.0,64.0,9.0,44.0,64.0,89.0,55.0,53.0,...,,,,,,,,,,
2,89,55.0,53.0,89.0,55.0,27.0,54.0,8.0,89.0,55.0,...,,,,,,,,,,
3,90,23.0,76.0,64.0,9.0,44.0,64.0,89.0,55.0,53.0,...,,,,,,,,,,
4,90,23.0,76.0,64.0,55.0,21.0,52.0,89.0,55.0,2.0,...,,,,,,,,,,
5,89,55.0,2.0,50.0,9.0,44.0,64.0,89.0,55.0,27.0,...,,,,,,,,,,
6,9,44.0,64.0,89.0,55.0,27.0,7.0,89.0,55.0,27.0,...,,,,,,,,,,
7,89,55.0,53.0,89.0,55.0,27.0,7.0,53.0,53.0,89.0,...,,,,,,,,,,
8,9,44.0,64.0,90.0,23.0,76.0,64.0,89.0,55.0,27.0,...,,,,,,,,,,
9,9,83.0,89.0,55.0,2.0,50.0,89.0,69.0,9.0,44.0,...,,,,,,,,,,


## 1.3 Other game info

## 1.4 User review

# 2. Model Compilation & Training

In [23]:
#From https://github.com/marlesson/recsys_autoencoders/blob/master/notebooks/DeepAutoEncoderContent%20-%20Simple%20Train.ipynb
from keras.optimizers import Adam
from keras.layers import Input, Dense, Dropout, Embedding, Flatten, add
from keras.models import Model

def autoEncoder(X):
    '''
    Autoencoder for Collaborative Filter Model
    '''
    users_items_matrix, content_info = X
    
    # Input
    input_layer   = Input(shape=(users_items_matrix.shape[1],), name='UserScore')
    input_content = Input(shape=(content_info.shape[1],), name='Itemcontent')
    
    # Encoder
    # -----------------------------
    enc = Dense(512, activation='selu', name='EncLayer1')(input_layer)

    # Content Information
    x_content = Embedding(100, 256, input_length=content_info.shape[1])(input_content)
    x_content = Flatten()(x_content)
    x_content = Dense(256, activation='selu', 
                                name='ItemLatentSpace')(x_content)
    
    # Latent Space
    # -----------------------------
    lat_space = Dense(256, activation='selu', name='UserLatentSpace')(enc)
    
    lat_space= add([lat_space, x_content], name='LatentSpace')
    lat_space = Dropout(0.8, name='Dropout')(lat_space) # Dropout

    # Decoder
    # -----------------------------
    dec = Dense(512, activation='selu', name='DecLayer1')(lat_space)

    # Output
    output_layer = Dense(users_items_matrix.shape[1], activation='linear', name='UserScorePred')(dec)

    # this model maps an input to its reconstruction
    model = Model([input_layer, input_content], output_layer)    
    #model = Model(input_layer, output_layer) 
    
    return model

In [48]:
# input
X = [df_matrix.values, df_user_gamenames_encoded.values]

#X = df_matrix.values
y = df_matrix.values

In [49]:
X

[array([[1.68436655e-04, 0.00000000e+00, 4.77867890e-04, ...,
         4.29982060e-02, 7.50086679e-02, 6.25406958e-01],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         2.42120156e-03, 2.92931258e-01, 7.04287115e-03],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         2.39203045e-03, 1.25578209e-01, 4.16991126e-02],
        ...,
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 7.93851175e-02],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 1.21110807e-01]]),
 array([[ 9., 83., 90., ..., nan, nan, nan],
        [90., 23., 76., ..., nan, nan, nan],
        [89., 55., 53., ..., nan, nan, nan],
        ...,
        [ 9., 44., 33., ..., nan, nan, nan],
        [89., 55., 53., ..., nan, nan, nan],
        [ 9., 44., 33., ...

In [50]:
# Build model
model = autoEncoder(X)

model.compile(optimizer = Adam(lr=0.001), loss='mse')
    
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Itemcontent (InputLayer)        (None, 81)           0                                            
__________________________________________________________________________________________________
UserScore (InputLayer)          (None, 24)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 81, 256)      25600       Itemcontent[0][0]                
__________________________________________________________________________________________________
EncLayer1 (Dense)               (None, 512)          12800       UserScore[0][0]                  
__________________________________________________________________________________________________
flatten_1 

In [51]:
model.fit(x=X, y=y,
                  epochs=50,
                  batch_size=64,
                  shuffle=True,
                  validation_split=0.1)

ValueError: All input arrays (x) should have the same number of samples. Got array shapes: [(56273, 24), (56810, 81)]

# 3. Predition and Recommendation

In [71]:
pred = model.predict(X)
pred

array([[ 0.02221716,  0.05469583,  0.05352887, ..., -0.03056788,
        -0.02662914, -0.01671449],
       [ 0.00067504,  0.0147235 , -0.00884739, ...,  0.01214318,
         0.13862145,  0.03182004],
       [-0.03920986,  0.10399526, -0.24858133, ...,  0.04826691,
        -0.19889656,  0.06890316],
       ...,
       [ 0.01545891,  0.01747779,  0.14518891, ..., -0.11534646,
         0.16622132,  0.27535713],
       [-0.24956739,  0.0381196 ,  0.22388308, ...,  0.05052831,
        -0.01953029,  0.11004142],
       [-0.09816835, -0.10622425, -0.10666654, ..., -0.09350001,
         0.12592831, -0.19054885]], dtype=float32)

In [70]:
# Set played game to zero, so that we dont recommend games played
mix_matrix = model.predict(X) * (X[0] == 0)
mix_matrix

array([[ 0.02221716,  0.05469583,  0.05352887, ..., -0.03056788,
        -0.02662914, -0.01671449],
       [ 0.00067504,  0.0147235 , -0.00884739, ...,  0.01214318,
         0.13862145,  0.03182004],
       [-0.03920986,  0.10399526, -0.24858133, ...,  0.04826691,
        -0.19889656,  0.06890316],
       ...,
       [ 0.01545891,  0.01747779,  0.14518891, ..., -0.11534646,
         0.16622132,  0.27535713],
       [-0.24956739,  0.0381196 ,  0.22388308, ...,  0.05052831,
        -0.01953029,  0.11004142],
       [-0.09816835, -0.10622425, -0.10666654, ..., -0.09350001,
         0.12592831, -0.19054885]], dtype=float32)

In [73]:
# converting the reconstructed matrix back to a Pandas dataframe
new_users_items_matrix_df  = pd.DataFrame(new_matrix, 
                                          columns = users_items_matrix_df.columns, 
                                          index   = users_items_matrix_df.index)
new_users_items_matrix_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,0.022217,0.054696,0.053529,-0.018703,-0.001404,0.050948,-0.0014,0.069496,-0.00035,-0.002506,...,0.038535,-0.028338,0.02416,-0.003957,-0.001961,-0.020634,-0.022172,-0.030568,-0.026629,-0.016714
1,0.000675,0.014724,-0.008847,0.003281,0.000425,0.002,0.027687,0.045296,-0.028931,0.030006,...,0.002335,0.00962,-0.00078,-0.042696,-0.014151,-0.011042,0.022031,0.012143,0.138621,0.03182
2,-0.03921,0.103995,-0.248581,0.042018,0.010824,-0.020828,0.028183,0.01744,0.100516,0.04619,...,0.035234,-0.049639,-0.026626,-0.07226,-0.010683,0.235524,0.036035,0.048267,-0.198897,0.068903
3,-0.005842,0.000468,0.012859,0.016692,0.009812,0.009096,-0.015284,0.052233,-0.019719,-0.05414,...,0.044339,-0.020546,-0.03746,-0.108308,0.059574,0.074207,0.015598,0.028231,0.028365,0.069245
4,0.001504,-0.005938,0.100381,0.010601,0.086651,0.008334,0.023461,-0.051387,-0.028575,0.015412,...,0.115842,0.078342,0.023092,-0.006845,0.040746,0.027805,0.046079,0.012721,-0.017885,0.007794


In [74]:
##TO DO:
#Match back to games to report recommend game names
#Match back to games and playtime of this user to report the history, so that we check if recommend make sense
def recommendor(user_id, interact_matrix, df_content, topn = 10):
    '''
    Recommender Games
    '''
    pred_scores = interact_matrix.loc[user_id].values

    df_scores   = pd.DataFrame({'content_id': list(users_items_matrix_df.columns), 
                               'score': pred_scores})

    df_rec      = df_scores.set_index('content_id')\
                    .join(df_content.set_index('content_id'))\
                    .sort_values('score', ascending=False)\
                    .head(topn)[['score', 'game']]
    
    return df_rec[df_rec.score > 0]

In [75]:
recommendor(user_id         = 1011, 
                     interact_matrix = users_items_matrix_df, 
                     df_content      = df_game)

NameError: name 'df_game' is not defined