In [1]:
import pandas as pd
import numpy as np

# 1. Data Preparation

## 1.1 Playtime Matrix

Source file: `user_items.json` (one user record per line). Example record:

{'user_id': '76561197970982479',
 'items_count': 277,
 'steam_id': '76561197970982479',
 'user_url': 'http://steamcommunity.com/profiles/76561197970982479',
 'items': [{'item_id': '10',
 'item_name': 'Counter-Strike',
 'playtime_forever': 6,
 'playtime_2weeks': 0},
 {'item_id': '20',
 'item_name': 'Team Fortress Classic',
 'playtime_forever': 0,
 'playtime_2weeks': 0},
 ...]}
 
 
 We reorganize it into a user × item matrix of `playtime_forever` values (one row per user, one column per `item_id`). This matrix is the main input to the model.

## To do
    
1. Instead of setting a hard limit on game ids, filter out games that have been played by fewer than n people.

In [19]:
import ast

filepath = './user_items.json'
#writefilepath = './clean_user_items.json'
writefilepath = './user_items_matrix.csv'

# Games whose item_id exceeds this bound are dropped to limit the matrix width (space).
MAX_GAME_ID = 100000

# Result format: [{user_id: [{item_id: playtime_forever}, ...]}, ...]
data = []
max_item_id = 0    # largest item_id kept; needed later to size the matrix columns
with open(filepath, encoding='gb18030', errors='ignore') as f:
    for line in f:
        # A blank line (e.g. a trailing newline) would make ast.literal_eval raise.
        if not line.strip():
            continue

        # Each line is a single-quoted Python dict literal rather than strict JSON,
        # so it is parsed with ast.literal_eval instead of json.loads.
        record = ast.literal_eval(line)

        # Don't consider a user that doesn't own any games
        if int(record["items_count"]) <= 0:
            continue

        games_filter = []
        for game in record["items"]:
            playtime = game["playtime_forever"]
            if playtime <= 0:    # Also don't consider games not played
                continue
            game_id = int(game["item_id"])
            if game_id > MAX_GAME_ID:
                continue
            max_item_id = max(max_item_id, game_id)
            games_filter.append({game_id: playtime})

        # NOTE(review): a user whose games were all filtered out is still appended
        # with an empty list (an all-zero row later) - confirm this is wanted.
        data.append({record["user_id"]: games_filter})

print(max_item_id)

99920


In [22]:
# Preview only the first few users; displaying the whole list floods the notebook output.
data[:3]

[{'76561197970982479': [{10: 6},
   {30: 7},
   {300: 4733},
   {240: 1853},
   {3830: 333},
   {2630: 75},
   {3900: 338},
   {3920: 2},
   {6400: 286},
   {6910: 2685},
   {7670: 633},
   {220: 696},
   {340: 37},
   {380: 168},
   {400: 173},
   {420: 323},
   {9340: 692},
   {7940: 1185},
   {4700: 477},
   {12900: 115},
   {15700: 28},
   {17330: 31},
   {22000: 195},
   {500: 513},
   {4560: 1061},
   {17460: 1613},
   {10500: 186},
   {22200: 271},
   {26800: 445},
   {1250: 10006},
   {3590: 4413},
   {23120: 53},
   {35700: 199},
   {10140: 25},
   {35010: 570},
   {29180: 940},
   {15520: 110},
   {32370: 5},
   {37700: 782},
   {6020: 77},
   {24860: 437},
   {39530: 503},
   {550: 1474},
   {8980: 3061},
   {41500: 536},
   {20900: 139},
   {10180: 1886},
   {10190: 1784},
   {17450: 4431},
   {3170: 1021},
   {25900: 1389},
   {31410: 95},
   {24980: 5001},
   {8850: 1504},
   {46000: 11},
   {33230: 1636},
   {20820: 46},
   {47700: 85},
   {24960: 5716},
   {43110: 834},

In [31]:
# Preview the first rows only; the full frame is far too wide and long to display.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99910,99911,99912,99913,99914,99915,99916,99917,99918,99919


In [None]:
# Transform the nested `data` list into a dense user x item playtime matrix.
# WARNING: the dense matrix holds (number of users) * (max_item_id + 1) cells and
# may run out of memory on the full dataset.

# Intermediate data written out (not much use without a little more profiling; we still need max_item_id)
#with open(writefilepath, 'w') as outfile:
#    json.dump(data, outfile)

# Input format (list of nested objects):  [ {user_id: [ {item_id: playtime_forever} ] } ]
# Target dataframe layout:
#        item_id        0        ..        max_item_id
#  user_id
#    x1                playtime_forever
#    ...
#    xn

# item_ids are used directly as column positions, so max_item_id itself needs a
# slot: the width is max_item_id + 1 (a width of max_item_id raises IndexError
# for the game with the largest id).
n_items = max_item_id + 1

# One row per distinct user_id, in first-seen order.
user_ids = list(dict.fromkeys(next(iter(user)) for user in data))
row_of = {user_id: row for row, user_id in enumerate(user_ids)}

# Filling one pre-allocated array avoids inserting into the DataFrame once per user.
# int32 halves the footprint versus int64 - assumes playtimes fit in 32 bits; TODO confirm.
playtime_matrix = np.zeros((len(user_ids), n_items), dtype=np.int32)

for user in data:
    user_id = next(iter(user))    # Note there's only one key. User is {user_id: [...]}
    row = row_of[user_id]
    playtime_matrix[row] = 0      # a repeated user_id replaces its earlier record
    for game in user[user_id]:
        [(item_id, playtime)] = game.items()
        playtime_matrix[row, item_id] = playtime

# Users are the index (rows) and item_ids the columns, matching the layout above.
df = pd.DataFrame(playtime_matrix, index=user_ids, columns=range(n_items))

df.to_csv(writefilepath)
df.head(10)    # last expression of the cell, so the preview is actually displayed

## 1.2 Game names

In [28]:
from keras.preprocessing.text import one_hot

# Named `game_names` rather than `str` so the built-in `str` type is not shadowed
# for the rest of the kernel session.
game_names = "Day of Defeat, Half-Life, Counter-Strike Source"

# one_hot maps each word to an integer (vocabulary size 100 here), ignoring
# punctuation such as "," and "-". Per the Keras docs the mapping is hash-based,
# so two different words may receive the same integer.
one_hot(game_names, 100)

[10, 98, 24, 75, 21, 17, 50, 27]

## 1.3 Other game info

## 1.4 User review

# 2. Model Compilation & Training

In [32]:
#From https://github.com/marlesson/recsys_autoencoders/blob/master/notebooks/DeepAutoEncoderContent%20-%20Simple%20Train.ipynb
from keras.optimizers import Adam
from keras.layers import Input, Dense, Dropout, Embedding, Flatten, add
from keras.models import Model

def autoEncoder(X):
    '''
    Build the autoencoder used as the collaborative-filtering model.

    X is the users x items matrix; only X.shape[1] is read, to size the input
    and output layers. Returns an uncompiled keras Model that maps the
    'UserScore' input to the 'UserScorePred' reconstruction.

    Lines starting with ## are the disabled content-information branch of the
    reference notebook.
    '''
    ##users_items_matrix, content_info = X
    n_items = X.shape[1]

    # Input: one value per item
    scores_in = Input(shape=(n_items,), name='UserScore')
    ##input_content = Input(shape=(content_info.shape[1],), name='Itemcontent')

    # Encoder: n_items -> 512
    hidden = Dense(512, activation='selu', name='EncLayer1')(scores_in)

    ## Content Information
    ##x_content = Embedding(100, 256, input_length=content_info.shape[1])(input_content)
    ##x_content = Flatten()(x_content)
    ##x_content = Dense(256, activation='selu',
    ##                            name='ItemLatentSpace')(x_content)

    # Latent space: 512 -> 256, followed by dropout at rate 0.8
    hidden = Dense(256, activation='selu', name='UserLatentSpace')(hidden)
    ##lat_space= add([lat_space, x_content], name='LatentSpace')
    hidden = Dropout(0.8, name='Dropout')(hidden)

    # Decoder: 256 -> 512
    hidden = Dense(512, activation='selu', name='DecLayer1')(hidden)

    # Output: linear reconstruction with the same width as the input
    scores_out = Dense(n_items, activation='linear', name='UserScorePred')(hidden)

    # The model maps an input row to its reconstruction
    ##model = Model([input_layer, input_content], output_layer)
    return Model(scores_in, scores_out)

In [None]:
# input
##X = [users_items_matrix_df.values, padded_docs]
# Input and target are the same matrix values: the model is fitted with x=X, y=y.
# NOTE(review): users_items_matrix_df is not defined in any cell visible above -
# presumably it is the playtime matrix (users as rows, items as columns) built or
# reloaded from the CSV written earlier; confirm before running on a fresh kernel.
X = users_items_matrix_df.values
y = users_items_matrix_df.values

In [None]:
# Build the autoencoder sized for X, then compile it with the Adam optimizer
# and a mean-squared-error loss.
model = autoEncoder(X)

adam = Adam(lr=0.0001)
model.compile(loss='mse', optimizer=adam)

model.summary()

In [None]:
# Train the model on (X, y) with the settings below; validation_split=0.1
# sets a tenth of the samples aside for validation.
fit_options = dict(epochs=50, batch_size=64, shuffle=True, validation_split=0.1)
model.fit(x=X, y=y, **fit_options)

# 3. Prediction and Recommendation

In [None]:
# Predict the new interaction matrix, then zero the score of every game a user
# has already played so that only unseen games can be recommended.
# The mask is the whole matrix (X == 0). X is a single 2-D array here, so
# X[0] == 0 would be the first user's row only, broadcast onto every user.
new_matrix = model.predict(X) * (X == 0)

In [None]:
# Wrap the reconstructed matrix in a DataFrame that carries the same row and
# column labels as users_items_matrix_df.
new_users_items_matrix_df = pd.DataFrame(
    data=new_matrix,
    index=users_items_matrix_df.index,
    columns=users_items_matrix_df.columns,
)
new_users_items_matrix_df.head()

In [None]:
def recommendor(user_id, interact_matrix, df_content, topn = 10):
    '''
    Recommend the top-scoring games for one user.

    Parameters
    ----------
    user_id : label of the user's row in `interact_matrix`.
    interact_matrix : DataFrame with one row per user and one column per
        content id; the values are the scores to rank by.
    df_content : DataFrame with a 'content_id' column and a 'game' column.
    topn : maximum number of games to return (default 10).

    Returns
    -------
    DataFrame indexed by content_id with columns ['score', 'game'], sorted by
    descending score and restricted to scores > 0 (so it may hold fewer than
    `topn` rows).

    Raises
    ------
    KeyError if `user_id` is not in `interact_matrix.index`.
    '''
    pred_scores = interact_matrix.loc[user_id].values

    # Take the content ids from the matrix that was passed in rather than from
    # a notebook-level global, so ids and scores always come from the same frame.
    df_scores   = pd.DataFrame({'content_id': list(interact_matrix.columns),
                                'score': pred_scores})

    df_rec      = (df_scores.set_index('content_id')
                     .join(df_content.set_index('content_id'))
                     .sort_values('score', ascending=False)
                     .head(topn)[['score', 'game']])

    return df_rec[df_rec.score > 0]

In [None]:
# Recommend from the *predicted* matrix: the observed matrix would only list
# the games this user has already played.
# The previous hard-coded id 1011 does not look like a Steam user id
# (e.g. '76561197970982479'), so take an id that is known to be in the index.
# NOTE(review): df_game is not defined in the visible cells - presumably a
# DataFrame with 'content_id' and 'game' columns; confirm.
example_user_id = new_users_items_matrix_df.index[0]
recommendor(user_id         = example_user_id,
            interact_matrix = new_users_items_matrix_df,
            df_content      = df_game)