### Content based approach wtih Doc2Vec

In [239]:
# importing required libraries
import numpy as np
import pandas as pd
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from scipy import spatial

In [78]:
# get the data and create a dataframe
csvPath = "../Datasets/dataWithRatings.csv"
df = pd.read_csv (csvPath)
df = df[df["popular_tags"].notna()].reset_index(drop=True)

Selecting Random User

In [79]:
# select a random user that has played more that minHours #JUST LIKE LAB1
minHours = 500
df['hours'] = df['hours'].str.replace(',', '.')
df['hours'] = df['hours'].astype(float)
users_ratings = df.groupby('user_id').hours.agg(['sum']).reindex(df.user_id).reset_index()
#users_ratings = df.groupby('user_id').total.agg(['sum']).reindex(df.idx).reset_index()
display(users_ratings)
selected = users_ratings['sum'] > minHours

selected_users = df.loc[selected]

random_selected = selected_users.sample() 
select_column_df = random_selected.reset_index()['user_id'] 
selected_user = select_column_df.iloc[0] 
print("Selected user: " + str(selected_user))

test_group = selected_users["user_id"].values
test_group = list(set(test_group))



Unnamed: 0,user_id,sum
0,151603712,134.5
1,87445402,87.7
2,25096601,208.0
3,211925330,848.0
4,115396529,365.7
...,...,...
36175,154230723,923.1
36176,116564064,489.0
36177,135400225,1203.2
36178,135400225,1203.2


Selected user: 138941587


In [5]:
def tokenization(text):
    return text.split(",")

In [80]:
#create a dataframe with the games and tags
df1 = df.drop_duplicates(subset = ["name"])
df1 = df1[["name","popular_tags"]].reset_index(drop=True)

In [302]:
# tokenize the tags and create a doc2vec model
tags_doc = [TaggedDocument(words=tokenization(tags), tags=[str(num)]) for num, tags in enumerate(df1.popular_tags.values)]
max_epochs = 50
alpha = 0.025

model = Doc2Vec(alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm=0)
  
model.build_vocab(tags_doc)

# training the doc2vec model in 50 epochs
print('Epoch', end = ': ')
for epoch in range(max_epochs):

  print(epoch, end = ' ')
  
  model.train(tags_doc,
              total_examples=model.corpus_count,
              epochs=model.epochs)

  # decrease the learning rate
  model.alpha -= 0.0002
  # fix the learning rate, no decay
  model.min_alpha = model.alpha


tags_vectors = model.docvecs  

Epoch: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 

In [306]:

games_played = df[['user_id','name','Rating M1','Rating M2' ]]
games_played = games_played[games_played['user_id']==selected_user]
games_played = games_played[games_played['Rating M1']>=4]
user_games_played = list(games_played['name'])
user_tags = []

#Find tags of games user played 
for game in user_games_played:
  df2 = df[df["name"]==game]
  df2 = df2["popular_tags"]
  user_tags.extend(df2.values[0].split(","))
user_tags = list(set(user_tags))  



# average vector of games played by the user
user_games_vec = np.zeros(shape = tags_vectors.vector_size)
for game in user_games_played:
    game_index = df1[df1["name"]==game].index.values[0]
    user_games_vec += tags_vectors[game_index]

# normalizing the vector    
user_games_vec = user_games_vec/len(user_games_played)     
# getting the most similar games of a user using the user vector
suggested_games = []
similar_games_tags= []
tags_explanation = []
games = model.docvecs.most_similar(positive = [user_games_vec], topn = 15)  
for i, j in games:
  game = df1.loc[int(i), "name"].strip()
  if game not in user_games_played:
    suggested_games.append(game)
    df2 = df[df["name"]==game]
    df2 = df2["popular_tags"]
    tags = df2.values[0].split(",")
    similar_games_tags.extend(tags)
    for tag in tags:
      if tag in user_tags:
        tags_explanation.append(tag) 
  recomm = "This game: " + game + " has been recommended because of the following tags: | "
  for tag in tags:
    recomm+=tag + "|"
  print(recomm)  
           
    
similar_games_tags = list(set(similar_games_tags))

accuracy = 0
for tag in similar_games_tags:
  if tag in user_tags:
    accuracy +=1
acc = accuracy / len(user_tags)    
print("accuracy is:  " + str(acc)) 

This game: Gravity Badgers has been recommended because of the following tags: | Adventure|Point & Click|Puzzle|
This game: One Night has been recommended because of the following tags: | Indie|Adventure|
This game: Big Money! Deluxe has been recommended because of the following tags: | Casual|
This game: Mechanic Escape has been recommended because of the following tags: | Indie|Action|Platformer|Adventure|2D|
This game: The Adventures of Mr. Bobley has been recommended because of the following tags: | Indie|Adventure|Family Friendly|
This game: Chuzzle Deluxe has been recommended because of the following tags: | Casual|Puzzle|Match 3|Singleplayer|Cute|
This game: Dead Mountaineer's Hotel has been recommended because of the following tags: | Adventure|Point & Click|
This game: SUPER DISTRO has been recommended because of the following tags: | Indie|Action|Adventure|Platformer|Difficult|2D|
This game: Fieldrunners 2 has been recommended because of the following tags: | Tower Defense|St

In [307]:
# creating random groups
users_ratings = df.groupby(['user_id']).count()

selected = users_ratings['Rating M1'] > 100

selected_users = users_ratings.loc[selected]

# sample() returns now n random rows from the dataframe. The returned object is a dataframe with five rows 
random_selected = selected_users.sample(n=5) 

# reset_index() create a new index, and the userId became a column. Then, we can filter using the column name
select_column_df = random_selected.reset_index()['user_id'] 

# iloc select by index, since our dataframe only has one row we read it from the index 0
group_users = list(select_column_df) 

group_ratings = df.loc[df['user_id'].isin(group_users)]

all_games = set(df.index.tolist())

num_ratings_df = df.groupby(['name']).count()

considered_games = set(num_ratings_df.loc[num_ratings_df['user_id'] > 10].reset_index()['name'])

group_seen_games = set(group_ratings['name'].tolist())

group_unseen_games = considered_games - group_seen_games

In [263]:
aggf = ['Addition', 'Least_Misery', 'Most_Pleasure', 'Least_Misery+Most_Pleasure']
AggregationResult = pd.DataFrame(index = group_users, columns= games_not_played).fillna(0).astype(float)
for game in games_not_played:
  AggregationResult.at['Addition', game] = sum(new_df[game])
  AggregationResult.at['Least_Misery', game] = min(new_df[game])
  AggregationResult.at['Most_Pleasure', game] = max(new_df[game])
  AggregationResult.at['Least_Misery+Most_Pleasure', game] = min(new_df[game])+max(new_df[game])
  
AggregationResult=AggregationResult[5:]

AggregationResult = AggregationResult.sort_values(by ='Least_Misery+Most_Pleasure', axis=1, ascending=False)
AggregationResult.sort_values(by ='Addition', axis=1, ascending=False)
display(AggregationResult)

Unnamed: 0,3DMark 11,100% Orange Juice,10 Second Ninja,140,"10,000,000"
Addition,3.266651,3.229189,2.904982,2.623846,2.365811
Least_Misery,0.601566,0.552052,0.551453,0.488224,0.313053
Most_Pleasure,0.77842,0.714734,0.669681,0.599936,0.565103
Least_Misery+Most_Pleasure,1.379985,1.266786,1.221134,1.088161,0.878155


In [312]:
# calculate vector for each user
group_games_vec = np.zeros(shape = tags_vectors.vector_size)
group_tags = []
group_members_tags = []
for user in group_users:
  games_played = df[['user_id','name','Rating M1','Rating M2' ]]
  games_played = games_played[games_played['user_id']==user]
  games_played = games_played[games_played['Rating M1']>=4]
  user_games_played = list(games_played['name'])
  # average vector of games played by the user
  user_games_vec = np.zeros(shape = tags_vectors.vector_size)
  for game in user_games_played:
      game_index = df1[df1["name"]==game].index.values[0]
      user_games_vec += tags_vectors[game_index]
      df2 = df[df["name"]==game]
      df2 = df2["popular_tags"]
      group_tags.extend(df2.values[0].split(","))
  # normalizing the vector    
  user_games_vec = user_games_vec/len(user_games_played)
  group_members_tags.append(user_games_vec)
  group_games_vec += user_games_vec
  group_tags = list(set(group_tags))




# normalize group vector
group_games_vec = group_games_vec/len(group_users)
# getting the most similar games of a user using the group vector
cnt = 0
games = model.docvecs.most_similar(positive = [group_games_vec], topn = 10)
preferred_game = ""
preferred_game_similarity = 0
for i, j in games:
  game = df1.loc[int(i), "name"].strip()
  if game not in group_seen_games:
    df2 = df[df["name"]==game]
    df2 = df2["popular_tags"]
    tags = df2.values[0].split(",")
    tagsss= []
    k=0
    for tag in group_tags:
      if tag in tags:
        tagsss.append(tag)
        k+=1
        if k==3:
          break

    preferred_game = game
    preferred_game_similarity = j
    strr = "The game \"" + preferred_game + "\" has been chosen since it achieves the highest rating score and contains the group’s preferred tags: |" 
    tagsss = list(set(tagsss))
    for tag in tagsss:
      strr +=tag
      strr+="|"      
    print(strr)
    cnt+=1
    if cnt==5:
      break
  



The game "Coniclysm" has been chosen since it achieves the highest rating score and contains the group’s preferred tags: |Action|
The game "The Baconing" has been chosen since it achieves the highest rating score and contains the group’s preferred tags: |Local Co-Op|Action RPG|Comedy|
The game "Delta Force 2" has been chosen since it achieves the highest rating score and contains the group’s preferred tags: |Classic|Action|Tactical|
The game "Guild Wars Trilogy" has been chosen since it achieves the highest rating score and contains the group’s preferred tags: |PvP|MMORPG|Massively Multiplayer|
The game "Arma 3" has been chosen since it achieves the highest rating score and contains the group’s preferred tags: |Co-op|Moddable|Tactical|


In [311]:
#least Misery ans Most pleasure
group_members_tags = np.array(group_members_tags)
least_misery = []
most_pleasure = []
for i in range(len(group_members_tags[0])):
    least_misery.append(np.amin(group_members_tags[:,i]))
    most_pleasure.append(np.amax(group_members_tags[:,i]))
least_misery = np.array(least_misery) 
most_pleasure = np.array(most_pleasure)  
lm_mp = least_misery + most_pleasure   
# getting the most similar games of a user using the group least misery vector
games = model.docvecs.most_similar(positive = [lm_mp], topn = 10)
preferred_game = ""
preferred_game_similarity = 0
cnt = 0
for i, j in games:
  game = df1.loc[int(i), "name"].strip()
  if game not in group_seen_games:
    preferred_game = game
    preferred_game_similarity = j   
    k=0
    for tag in group_tags:
      if tag in tags:
        tagsss.append(tag)
        k+=1
        if k==3:
          break

    preferred_game = game
    preferred_game_similarity = j
    strr = "The game \"" + preferred_game + "\" has been chosen since it prevents misery, ensures pleasure and contains the group’s preferred tags: |" 
    tagsss = list(set(tagsss))
    for tag in tagsss:
      strr +=tag
      strr+="|"      
    print(strr)
    cnt+=1
    if cnt==5:
      break
 
  




The game "Coniclysm" has been chosen since it prevents misery, ensures pleasure and contains the group’s preferred tags: |Co-op|Moddable|Tactical|
The game "Arma 3" has been chosen since it prevents misery, ensures pleasure and contains the group’s preferred tags: |Co-op|Moddable|Tactical|
The game "Guild Wars Trilogy" has been chosen since it prevents misery, ensures pleasure and contains the group’s preferred tags: |Co-op|Moddable|Tactical|
The game "Delta Force 2" has been chosen since it prevents misery, ensures pleasure and contains the group’s preferred tags: |Co-op|Moddable|Tactical|
The game "Damnation City of Death" has been chosen since it prevents misery, ensures pleasure and contains the group’s preferred tags: |Co-op|Moddable|Tactical|


In [191]:
#user testing

accuracies = []
ids = []


for user in test_group:
  games_played = df[['user_id','name','Rating M1','Rating M2' ]]
  games_played = games_played[games_played['user_id']==user]
  games_played = games_played[games_played['Rating M1']>=4]
  user_games_played = list(games_played['name'])
  user_tags = []

  #Find tags of games user played 
  for game in user_games_played:
    df2 = df[df["name"]==game]
    df2 = df2["popular_tags"]
    user_tags.extend(df2.values[0].split(","))
  user_tags = list(set(user_tags))  



  # average vector of games played by the user
  user_games_vec = np.zeros(shape = tags_vectors.vector_size)
  for game in user_games_played:
      game_index = df1[df1["name"]==game].index.values[0]
      user_games_vec += tags_vectors[game_index]

  # normalizing the vector    
  user_games_vec = user_games_vec/len(user_games_played)     
  # getting the most similar games of a user using the user vector
  suggested_games = []
  similar_games_tags= []

  games = model.docvecs.most_similar(positive = [user_games_vec], topn = 15)  
  for i, j in games:
    game = df1.loc[int(i), "name"].strip()
    if game not in user_games_played:
      suggested_games.append(game)
      df2 = df[df["name"]==game]
      df2 = df2["popular_tags"]
      similar_games_tags.extend(df2.values[0].split(","))
      
  similar_games_tags = list(set(similar_games_tags))

  accuracy = 0
  for tag in similar_games_tags:
    if tag in user_tags:
      accuracy +=1
  acc = accuracy / len(user_tags)   
  accuracies.append(acc)
  ids.append(user)    
accuracy = sum(accuracies) / len(test_group)
print("The accuracy of the model is: "+ str(accuracy))
df_acc = pd.DataFrame()
df_acc["ids"]= ids
df_acc["accuracies"]= accuracies
df_acc

The accuracy of the model is: 0.8271641178279602


Unnamed: 0,ids,accuracies
0,78309377,0.544304
1,132196353,0.777778
2,67694595,0.806452
3,144412676,1.000000
4,110776325,1.000000
...,...,...
1067,23154676,0.850000
1068,201678836,1.000000
1069,103804924,1.000000
1070,159428605,0.970588
