In [1]:
import pandas as pd
import numpy as np

# 1. Data Preparation

## 1.1 Playtime Matrix

Source file: user_items.json

{'user_id': '76561197970982479',
 'items_count': 277,
 'steam_id': '76561197970982479',
 'user_url': 'http://steamcommunity.com/profiles/76561197970982479',
 'items': [{'item_id': '10',
 'item_name': 'Counter-Strike',
 'playtime_forever': 6,
 'playtime_2weeks': 0},
 {'item_id': '20',
 'item_name': 'Team Fortress Classic',
 'playtime_forever': 0,
 'playtime_2weeks': 0},
  ...]}
 
 
We reorganize it as a user × item matrix of playtimes (one row per user, one column per game). This matrix is the main input to the model.

## To do
    
1. Instead of setting a hard limit on game ids, filter out games that have been played by fewer than n users.

In [2]:
import ast

filepath = './user_items.json'
#writefilepath = './clean_user_items.json'
writefilepath = './user_items_matrix.csv'


# Parse the raw dump into `data`: one {user_id: [{item_id: playtime}, ...]} entry
# per user, while tracking the largest item id that was kept.
data = []
max_item_id = 0
with open(filepath, encoding='gb18030', errors='ignore') as f:
    for line in f:
        # Each line is a Python dict literal (single-quoted), not strict JSON,
        # so it is parsed with ast.literal_eval rather than a JSON parser.
        record = ast.literal_eval(line)

        # Users who own no games are skipped entirely.
        if not int(record["items_count"]) > 0:
            continue

        kept_games = []
        for game in record["items"]:
            playtime = game["playtime_forever"]
            if not playtime > 0:      # games that were never played are dropped
                continue
            game_id = int(game["item_id"])
            if game_id > 1000:        # cap on game ids due to space
                continue
            max_item_id = max(max_item_id, game_id)
            kept_games.append({game_id: playtime})
        data.append({record["user_id"]: kept_games})

print(max_item_id)

KeyboardInterrupt: 

In [None]:
# Peek at the first few parsed users only; echoing the whole list would flood the output.
data[:5]

In [None]:
#Transform to matrix
##will be out of memory

# Intermediate data written out (not much use without a little more profiling; we still need max_item_id)
#with open(writefilepath, 'w') as outfile:
#    json.dump(data, outfile)

# Now with format as data (list of nested JSON objects) =     [    {user_id: [    {item_id: playtime_forever}        ]    }    ]
# Convert to dataframe where
#        item_id        0        ..        max_item_id        
#  user_id             
#    x1                playtime_forever
#    ...
#    xn

# Build the user x item playtime matrix described above: one row per user_id,
# one column per item_id in 0..max_item_id, zero where nothing was recorded.

# Column count kept in a local name so that re-running this cell does not keep
# incrementing max_item_id.
n_items = max_item_id + 1

rows = {}
for user in data:
    # Each entry has exactly one key: {user_id: [{item_id: playtime}, ...]}
    [(user_id, items)] = user.items()
    playtimes = np.zeros(n_items, dtype=np.int64)
    for item in items:
        [(item_id, playtime)] = item.items()
        playtimes[item_id] = playtime
    # A repeated user_id replaces the earlier entry.
    rows[user_id] = playtimes

# Create the frame once from all rows (users as the index) instead of inserting
# one column per user into an existing frame.
df = pd.DataFrame.from_dict(rows, orient='index', columns=list(range(n_items)))
df.index.name = 'user_id'

df.to_csv(writefilepath)
df.head(10)

In [58]:
#generate dummy data

import json
import pandas as pd
import ast
import numpy as np
import random

writefilepath = './dummy_playtime.csv'

# Synthetic playtime matrix used to smoke-test the model:
#   - N_USERS users, N_GAMES sequential game ids
#   - each user plays between 1 and MAX_GAMES_PER_USER randomly chosen games
#   - each played game gets a random playtime between 1 and MAX_PLAYTIME
N_USERS = 100
N_GAMES = 1000
MAX_GAMES_PER_USER = 10
MAX_PLAYTIME = 5

random.seed(42)    # fixed seed so the dummy data is reproducible across kernel restarts

users = list(range(N_USERS))
game_ids = list(range(N_GAMES))

playtime_matrix = np.zeros((N_USERS, N_GAMES), dtype=np.int64)
for i in users:
    # random.randint includes both endpoints, so the upper bound is the maximum itself
    num_games = random.randint(1, MAX_GAMES_PER_USER)
    random_game_ids = random.sample(game_ids, num_games)

    for j in random_game_ids:
        playtime_matrix[i, j] = random.randint(1, MAX_PLAYTIME)

# Build the frame in one step from the filled matrix (integer dtype throughout).
df = pd.DataFrame(playtime_matrix, index=users, columns=game_ids)

In [61]:
# Persist the dummy matrix to `writefilepath`; index=False leaves the row index out of the file.
df.to_csv(writefilepath, index=False)
# np.save(writefilepath, df.values)

In [5]:
%cd E:\课程\2019 spr\ML\project

E:\课程\2019 spr\ML\project


In [62]:
# Reload the dummy playtime matrix; the path is relative to the current working directory.
df = pd.read_csv("dummy_playtime.csv",engine='python')

In [63]:
# Preview the first rows of the reloaded matrix.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 1.2 Game names

In [None]:
# Named `game_names` rather than `str`: assigning to `str` would shadow the
# built-in type for every later cell in this kernel.
game_names = "Day of Defeat, Half-Life, Counter-Strike Source"

from keras.preprocessing.text import one_hot
# one_hot hashes each word to an integer index below 100, stripping punctuation
# such as "," and "-"; distinct words are not guaranteed distinct indexes.
one_hot(game_names, 100)

## 1.3 Other game info

## 1.4 User review

# 2. Model Compilation & Training

In [17]:
#From https://github.com/marlesson/recsys_autoencoders/blob/master/notebooks/DeepAutoEncoderContent%20-%20Simple%20Train.ipynb
from keras.optimizers import Adam
from keras.layers import Input, Dense, Dropout, Embedding, Flatten, add
from keras.models import Model

def autoEncoder(X, enc_units=512, latent_units=256, dropout_rate=0.8):
    '''
    Autoencoder for Collaborative Filter Model

    Builds (without compiling or training) a dense autoencoder whose input and
    output widths both equal the number of columns of X.

    Parameters
    ----------
    X : 2-D array-like users x items matrix; only X.shape[1] is read here.
    enc_units : width of the encoder layer and of the mirrored decoder layer.
    latent_units : width of the latent-space layer.
    dropout_rate : dropout rate applied to the latent representation.

    Returns
    -------
    A Keras Model mapping the 'UserScore' input to the 'UserScorePred' output.

    The defaults reproduce the previously hard-coded architecture (512-256-512
    with a 0.8 dropout on the latent layer).
    '''
    ##users_items_matrix, content_info = X
    users_items_matrix = X
    n_items = users_items_matrix.shape[1]

    # Input
    input_layer   = Input(shape=(n_items,), name='UserScore')
    ##input_content = Input(shape=(content_info.shape[1],), name='Itemcontent')

    # Encoder
    # -----------------------------
    enc = Dense(enc_units, activation='selu', name='EncLayer1')(input_layer)

    ## Content Information (disabled: item-content branch not wired in yet)
    ##x_content = Embedding(100, 256, input_length=content_info.shape[1])(input_content)
    ##x_content = Flatten()(x_content)
    ##x_content = Dense(256, activation='selu', 
    ##                            name='ItemLatentSpace')(x_content)

    # Latent Space
    # -----------------------------
    lat_space = Dense(latent_units, activation='selu', name='UserLatentSpace')(enc)

    ##lat_space= add([lat_space, x_content], name='LatentSpace')
    lat_space = Dropout(dropout_rate, name='Dropout')(lat_space) # Dropout

    # Decoder
    # -----------------------------
    dec = Dense(enc_units, activation='selu', name='DecLayer1')(lat_space)

    # Output: linear activation, one unit per item column
    output_layer = Dense(n_items, activation='linear', name='UserScorePred')(dec)

    # this model maps an input to its reconstruction
    ##model = Model([input_layer, input_content], output_layer)    
    model = Model(input_layer, output_layer)

    return model

In [64]:
# input
##X = [users_items_matrix_df.values, padded_docs]
# The target is the input itself: X and y are both taken from the values of the
# same user x item frame.
users_items_matrix_df = df
X = users_items_matrix_df.values
y = users_items_matrix_df.values

In [65]:
# Inspect the input matrix.
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [68]:
# Build model
model = autoEncoder(X)

# Compile with the Adam optimizer and a mean-squared-error loss.
# NOTE(review): newer Keras releases name this argument `learning_rate` instead of
# `lr` — confirm against the installed version before changing it.
model.compile(optimizer = Adam(lr=0.001), loss='mse')
    
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
UserScore (InputLayer)       (None, 1000)              0         
_________________________________________________________________
EncLayer1 (Dense)            (None, 512)               512512    
_________________________________________________________________
UserLatentSpace (Dense)      (None, 256)               131328    
_________________________________________________________________
Dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
DecLayer1 (Dense)            (None, 512)               131584    
_________________________________________________________________
UserScorePred (Dense)        (None, 1000)              513000    
Total params: 1,288,424
Trainable params: 1,288,424
Non-trainable params: 0
_________________________________________________________________


In [69]:
# Train the autoencoder to reconstruct its input (y holds the same values as X);
# validation_split=0.1 sets aside a tenth of the rows for validation.
model.fit(x=X, y=y,
                  epochs=50,
                  batch_size=64,
                  shuffle=True,
                  validation_split=0.1)

Train on 90 samples, validate on 10 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x25d24db1a90>

# 3. Prediction and Recommendation

In [71]:
# Reconstructed scores from the trained model: one row per row of X, one column per item.
pred = model.predict(X)
pred

array([[ 0.02221716,  0.05469583,  0.05352887, ..., -0.03056788,
        -0.02662914, -0.01671449],
       [ 0.00067504,  0.0147235 , -0.00884739, ...,  0.01214318,
         0.13862145,  0.03182004],
       [-0.03920986,  0.10399526, -0.24858133, ...,  0.04826691,
        -0.19889656,  0.06890316],
       ...,
       [ 0.01545891,  0.01747779,  0.14518891, ..., -0.11534646,
         0.16622132,  0.27535713],
       [-0.24956739,  0.0381196 ,  0.22388308, ...,  0.05052831,
        -0.01953029,  0.11004142],
       [-0.09816835, -0.10622425, -0.10666654, ..., -0.09350001,
         0.12592831, -0.19054885]], dtype=float32)

In [70]:
# Set played game to zero, so that we dont recommend games played
mix_matrix = model.predict(X) * (X[0] == 0)
mix_matrix

array([[ 0.02221716,  0.05469583,  0.05352887, ..., -0.03056788,
        -0.02662914, -0.01671449],
       [ 0.00067504,  0.0147235 , -0.00884739, ...,  0.01214318,
         0.13862145,  0.03182004],
       [-0.03920986,  0.10399526, -0.24858133, ...,  0.04826691,
        -0.19889656,  0.06890316],
       ...,
       [ 0.01545891,  0.01747779,  0.14518891, ..., -0.11534646,
         0.16622132,  0.27535713],
       [-0.24956739,  0.0381196 ,  0.22388308, ...,  0.05052831,
        -0.01953029,  0.11004142],
       [-0.09816835, -0.10622425, -0.10666654, ..., -0.09350001,
         0.12592831, -0.19054885]], dtype=float32)

In [73]:
# converting the reconstructed matrix back to a Pandas dataframe
new_users_items_matrix_df  = pd.DataFrame(new_matrix, 
                                          columns = users_items_matrix_df.columns, 
                                          index   = users_items_matrix_df.index)
new_users_items_matrix_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,0.022217,0.054696,0.053529,-0.018703,-0.001404,0.050948,-0.0014,0.069496,-0.00035,-0.002506,...,0.038535,-0.028338,0.02416,-0.003957,-0.001961,-0.020634,-0.022172,-0.030568,-0.026629,-0.016714
1,0.000675,0.014724,-0.008847,0.003281,0.000425,0.002,0.027687,0.045296,-0.028931,0.030006,...,0.002335,0.00962,-0.00078,-0.042696,-0.014151,-0.011042,0.022031,0.012143,0.138621,0.03182
2,-0.03921,0.103995,-0.248581,0.042018,0.010824,-0.020828,0.028183,0.01744,0.100516,0.04619,...,0.035234,-0.049639,-0.026626,-0.07226,-0.010683,0.235524,0.036035,0.048267,-0.198897,0.068903
3,-0.005842,0.000468,0.012859,0.016692,0.009812,0.009096,-0.015284,0.052233,-0.019719,-0.05414,...,0.044339,-0.020546,-0.03746,-0.108308,0.059574,0.074207,0.015598,0.028231,0.028365,0.069245
4,0.001504,-0.005938,0.100381,0.010601,0.086651,0.008334,0.023461,-0.051387,-0.028575,0.015412,...,0.115842,0.078342,0.023092,-0.006845,0.040746,0.027805,0.046079,0.012721,-0.017885,0.007794


In [74]:
##TO DO:
#Match back to games to report the names of the recommended games
#Match back to this user's games and playtime to report their history, so that we can check whether the recommendations make sense
def recommendor(user_id, interact_matrix, df_content, topn = 10):
    '''
    Recommender Games

    Return the highest-scoring games for one user.

    Parameters
    ----------
    user_id : label of the user's row in `interact_matrix`.
    interact_matrix : DataFrame of scores, one row per user and one column per
        content id.
    df_content : DataFrame with a `content_id` column (joined against the
        columns of `interact_matrix`) and a `game` column.
    topn : number of top-scoring rows considered.

    Returns
    -------
    DataFrame indexed by content_id with columns `score` and `game`, sorted by
    descending score. Rows whose score is not > 0 are dropped after the top
    `topn` are taken, so fewer than `topn` rows may be returned.

    Raises
    ------
    KeyError if `user_id` is not a row label of `interact_matrix`.
    '''
    pred_scores = interact_matrix.loc[user_id].values

    # Take the content ids from the matrix that was passed in, not from a
    # notebook-level global, so the ids always line up with `pred_scores`.
    df_scores   = pd.DataFrame({'content_id': list(interact_matrix.columns), 
                               'score': pred_scores})

    df_rec      = df_scores.set_index('content_id')\
                    .join(df_content.set_index('content_id'))\
                    .sort_values('score', ascending=False)\
                    .head(topn)[['score', 'game']]
    
    return df_rec[df_rec.score > 0]

In [75]:
# NOTE(review): this call currently fails with NameError because `df_game` is not
# defined in any visible cell; recommendor expects a frame with `content_id` and
# `game` columns there.
# NOTE(review): `interact_matrix` receives the raw playtime frame rather than the
# predicted scores (`new_users_items_matrix_df`) — confirm which one is intended.
# NOTE(review): confirm that user_id 1011 is a row label of the matrix passed in.
recommendor(user_id         = 1011, 
                     interact_matrix = users_items_matrix_df, 
                     df_content      = df_game)

NameError: name 'df_game' is not defined