### Content based approach wtih Doc2Vec

In [1]:
#importing required libraries
import numpy as np
import pandas as pd
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [2]:
#get the data and create a dataframe
csvPath = "../Datasets/dataWithRatings.csv"
df = pd.read_csv (csvPath)
df = df[df["popular_tags"].notna()].reset_index(drop=True)

Selecting Random User

In [3]:
#select a random user that has played more that minHours #JUST LIKE LAB1
minHours = 500
df['hours'] = df['hours'].str.replace(',', '.')
df['hours'] = df['hours'].astype(float)
users_ratings = df.groupby('user_id').hours.agg(['sum']).reindex(df.user_id).reset_index()
# users_ratings = df.groupby('user_id').total.agg(['sum']).reindex(df.idx).reset_index()
display(users_ratings)
selected = users_ratings['sum'] > minHours

selected_users = df.loc[selected]

random_selected = selected_users.sample() 
select_column_df = random_selected.reset_index()['user_id'] 
selected_user = select_column_df.iloc[0] 
print("Selected user: " + str(selected_user))

Unnamed: 0,user_id,sum
0,151603712,134.5
1,87445402,87.7
2,25096601,208.0
3,211925330,848.0
4,115396529,365.7
...,...,...
36175,154230723,923.1
36176,116564064,489.0
36177,135400225,1203.2
36178,135400225,1203.2


Selected user: 118838369


In [4]:
def tokenization(text):
    return text.split(",")

In [5]:
#create a dataframe with the games and tags
df1 = df.drop_duplicates(subset = ["name"])
df1 = df1[["name","popular_tags"]].reset_index(drop=True)

In [6]:
#tokenize the tags and create a doc2vec model
tags_doc = [TaggedDocument(words=tokenization(tags), tags=[str(num)]) for num, tags in enumerate(df1.popular_tags.values)]
max_epochs = 50
alpha = 0.025

model = Doc2Vec(alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm=0)
  
model.build_vocab(tags_doc)
#training the doc2vec model in 50 epochs
print('Epoch', end = ': ')
for epoch in range(max_epochs):
  print(epoch, end = ' ')
  model.train(tags_doc,
              total_examples=model.corpus_count,
              epochs=model.epochs)
  # decrease the learning rate
  model.alpha -= 0.0002
  # fix the learning rate, no decay
  model.min_alpha = model.alpha

Epoch: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 

In [7]:
# example of how to recommend games based on a game
tags_vectors = model.docvecs
game = "MirrorMoon EP"
game_index = df1[df1["name"]==game].index.values[0]
#print(tags_vectors[game_index])
sims = model.docvecs.most_similar(positive = [game_index], topn = 10)

for i, j in sims:
  print(df1.loc[int(i), "name"].strip())

FRACT OSC
The Beginner's Guide
Kairo
Thirty Flights of Loving
VVVVVV
hocus
Cubot
FOTONICA
Antichamber
Chip's Challenge 2


In [8]:
games_played = df[['user_id','name','Rating M1','Rating M2' ]]
games_played = games_played[games_played['user_id']==selected_user]
games_played = games_played[games_played['Rating M1']>=4]
user_games_played = list(games_played['name'])

In [14]:
games_played = df[['user_id','name','Rating M1','Rating M2' ]]
games_played = games_played[games_played['user_id']==selected_user]
games_played = games_played[games_played['Rating M1']>=4]
user_games_played = list(games_played['name'])
#average vector of games played by the user
user_games_vec = np.zeros(shape = tags_vectors.vector_size)
for game in user_games_played:
    game_index = df1[df1["name"]==game].index.values[0]
    user_games_vec += tags_vectors[game_index]
#normalizing the vector    
user_games_vec = user_games_vec/len(user_games_played)     
#getting the most similar games of a user using the user vector
games = model.docvecs.most_similar(positive = [user_games_vec], topn = 20)  
for i, j in games:
  game = df1.loc[int(i), "name"].strip()
  if game not in user_games_played:
    print(game)  #the name of the game that the user never played
    print(j)
    print("------------")
    




Super Hipster Lumberjack
0.7811200618743896
------------
Mutant Mudds Deluxe
0.7804561853408813
------------
Mighty Gunvolt
0.7466316223144531
------------
Squirreltopia
0.7465349435806274
------------
Slip
0.7412266135215759
------------
Jet Gunner
0.740158200263977
------------
CreaVures
0.7400778532028198
------------
Super Puzzle Platformer Deluxe
0.7374098896980286
------------
A Walk in the Dark
0.7326610088348389
------------
Gateways
0.7315119504928589
------------
La-Mulana
0.727887749671936
------------
Stonerid
0.7241158485412598
------------
Environmental Station Alpha
0.7216774821281433
------------
Starseed Pilgrim
0.7171435952186584
------------
GunWorld
0.7169877290725708
------------
SUPER DISTRO
0.7146975994110107
------------
Toki Tori
0.7143628597259521
------------


In [12]:
# creating random groups
users_ratings = df.groupby(['user_id']).count()

selected = users_ratings['Rating M1'] > 100

selected_users = users_ratings.loc[selected]

random_selected = selected_users.sample(n=5) # sample() returns now n random rows from the dataframe. The returned object is a dataframe with five rows. 

select_column_df = random_selected.reset_index()['user_id'] # reset_index() create a new index, and the userId became a column. Then, we can filter using the column name

group_users = list(select_column_df) # iloc select by index, since our dataframe only has one row we read it from the index 0

group_ratings = df.loc[df['user_id'].isin(group_users)]

all_games = set(df.index.tolist())

num_ratings_df = df.groupby(['name']).count()

considered_games = set(num_ratings_df.loc[num_ratings_df['user_id'] > 10].reset_index()['name'])


group_seen_games = set(group_ratings['name'].tolist())

group_unseen_games = considered_games - group_seen_games



In [13]:
# calculate vector for each user
group_games_vec = np.zeros(shape = tags_vectors.vector_size)
for user in group_users:
  games_played = df[['user_id','name','Rating M1','Rating M2' ]]
  games_played = games_played[games_played['user_id']==user]
  games_played = games_played[games_played['Rating M1']>=4]
  user_games_played = list(games_played['name'])
  #average vector of games played by the user
  user_games_vec = np.zeros(shape = tags_vectors.vector_size)
  for game in user_games_played:
      game_index = df1[df1["name"]==game].index.values[0]
      user_games_vec += tags_vectors[game_index]
  #normalizing the vector    
  user_games_vec = user_games_vec/len(user_games_played)
  group_games_vec += user_games_vec    

#normalize group vector
group_games_vec = group_games_vec/len(group_users)
  
#getting the most similar games of a user using the user vector
games = model.docvecs.most_similar(positive = [group_games_vec], topn = 1514)
preferred_game = ""
preferred_game_similarity = 0
for i, j in games:
  game = df1.loc[int(i), "name"].strip()
  if game not in group_seen_games:
    preferred_game = game
    preferred_game_similarity = j
    break
  

print("The game \"" + preferred_game + "\" has been chosen since it acheives the highest rating score")
   
    




The game "Coniclysm" has been chosen since it acheives the highest rating score
