# PreProcess

In [38]:
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import jaccard_score as jscore
from scipy.spatial.distance import pdist, squareform
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from scipy.sparse.linalg import svds 
from nltk.corpus import stopwords as sw
import chart_studio.plotly as py
import cufflinks as cf
import plotly.express as px
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
cf.go_offline()
%matplotlib inline

# Reviews

In [2]:
reviews_df = pd.read_csv('metacritic_reviews_main.csv').drop(['Unnamed: 0'],axis=1)
reviews_df.head()

Unnamed: 0,ids,name,game,rating,review
0,118429,ThomasR.,Combat Mission: Barbarossa to Berlin,9,Needs a brush-up on the graphics.
1,58304,JoaquinD.,Combat Mission: Barbarossa to Berlin,10,Perfect strategic and tactical WWII wargame. ...
2,78199,MichaelDorosh,Combat Mission: Barbarossa to Berlin,10,"Still the gold standard for WEGO, squad-based,..."
3,143673,dinin70,Combat Mission: Barbarossa to Berlin,9,I don't think it deserves 10/10 for a simple r...
4,14454,BlueFalcon,NHL 2002,6,"I was hyped about this release at the time, b..."


In [3]:
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555661 entries, 0 to 555660
Data columns (total 5 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   ids     555661 non-null  int64 
 1   name    555650 non-null  object
 2   game    555661 non-null  object
 3   rating  555661 non-null  int64 
 4   review  555308 non-null  object
dtypes: int64(2), object(3)
memory usage: 21.2+ MB


In [4]:
reviews_df[reviews_df['name'].isna()]

Unnamed: 0,ids,name,game,rating,review
42046,-1,,Diablo III,4,Stay awhile and listen to my whining. If you t...
65948,-1,,Resident Evil 5,5,Co-op is the only thing that makes this game ...
108234,-1,,The Walking Dead: Episode 1 - A New Day,8,I'm not a big fan of the TV series that goes w...
147609,-1,,Enslaved: Odyssey to the West,6,I haven't finished Enslaved completely and I'l...
148608,-1,,Hitman: Absolution,6,Hitman: Absolution fails to be a great game be...
233870,167115,,Diablo III,4,Stay awhile and listen to my whining. If you t...
257821,82685,,Resident Evil 5,5,Co-op is the only thing that makes this game ...
300222,167115,,The Walking Dead: Episode 1 - A New Day,8,I'm not a big fan of the TV series that goes w...
340348,167115,,Enslaved: Odyssey to the West,6,I haven't finished Enslaved completely and I'l...
341543,167115,,Hitman: Absolution,6,Hitman: Absolution fails to be a great game be...


These are actually duplicate rows but for some reason the ids did not get duplicated. This actually works out since we'll only need the ids anyway. Lets do a little research just to be sure.

In [5]:
# get rid of rows with ids=-1
reviews = reviews_df.sort_values('ids').iloc[5:,:]

In [6]:
reviews = reviews.drop_duplicates().reset_index().iloc[:,1:]
reviews = reviews.iloc[:,[0,2,3,4]].drop(416615).reset_index(drop=True)
reviews = reviews.drop_duplicates(['ids', 'game'])
reviews

Unnamed: 0,ids,game,rating,review
0,0,Rockstar Games presents Table Tennis,9,Amazingly Addicting!
1,0,Disney's Magical Mirror Starring Mickey Mouse,10,this game lets you able to play as mickey mou...
2,1,Deus Ex: Invisible War,1,Yet another disappointment to add to 2003's r...
3,2,Mobile Suit Gundam: Journey to Jaburo,10,I love this Game!
4,3,Dead to Rights: Reckoning,10,This is a really good game for people ...
...,...,...,...,...
416613,186351,Enthusia Professional Racing,10,I think this game is better than GT4 because ...
416614,186352,Crash Bandicoot 2: N-Tranced,10,It's just great. It is very original and you'...
416615,186353,RollerCoaster Tycoon: Loopy Landscapes,10,"Perfect! No blood, no terror. Tüm dünyaca oyn..."
416616,186354,Dominions 3: The Awakening,8,Dominions is a very special 4X fantasy game in...


In [11]:
# reviews.to_csv('metacritic_reviews.csv')

In [7]:
reviews['ids'].value_counts()[reviews['ids'].value_counts()>1]

131893    1122
16496      854
112980     540
128382     512
117211     488
          ... 
185107       2
171310       2
40132        2
21108        2
38220        2
Name: ids, Length: 51431, dtype: int64

In [47]:
game_frequency = reviews['game'].value_counts()
game_frequency[:10]

Grand Theft Auto V                 640
Call of Duty: Ghosts               489
DOOM                               427
Resident Evil 4                    425
Resident Evil 2                    413
Resident Evil 6                    401
The Elder Scrolls V: Skyrim        401
Borderlands 2                      398
Cyberpunk 2077                     392
Assassin's Creed IV: Black Flag    390
Name: game, dtype: int64

In [48]:
reviews['rating'].value_counts()

10    121106
9      68901
8      53499
0      35459
7      31135
6      20805
5      17784
4      14907
3      13171
1      13008
2      11616
Name: rating, dtype: int64

This makes sense as people are more inclined to leave a review if they really like a game or if they hate it with a passion than if they only think the game is decent.

In [49]:
reviews[['game', 'rating']].groupby('game').mean().sort_values('rating', ascending = False)[:10]

Unnamed: 0_level_0,rating
game,Unnamed: 1_level_1
Rack N Ruin,10.0
Dynasty Warriors DS: Fighter's Battle,10.0
Heroes Chronicles: Conquest of the Underworld,10.0
Tiger Woods PGA Tour 2001,10.0
Block Factory,10.0
NHL 2K11,10.0
AlphaBounce,10.0
Wild Earth: African Safari,10.0
NFL GameDay 2003,10.0
Dr. Seuss' The Cat in the Hat,10.0


In [53]:
# number of reviews for Blasters of the Universe
(reviews['game']=='Rack N Ruin').sum()

1

In [54]:
# number of reviews for Super Swing Golf Season 2
(reviews['game']=="Dynasty Warriors DS: Fighter's Battle").sum()

1

Considering we already know what games should be the best, we can see that this grouping put alot of the games at the top that were only played by few people who happened to love the game and give it a 10 so this output is not reliable. Lets incorporate the frequency of the games. 

In [55]:
high_reviewed_games = game_frequency[game_frequency>100].index
high_reviewed_games[:10]

Index(['Grand Theft Auto V', 'Call of Duty: Ghosts', 'DOOM', 'Resident Evil 4',
       'Resident Evil 2', 'Resident Evil 6', 'The Elder Scrolls V: Skyrim',
       'Borderlands 2', 'Cyberpunk 2077', 'Assassin's Creed IV: Black Flag'],
      dtype='object')

In [56]:
frequent_games = reviews[reviews['game'].isin(high_reviewed_games)]
frequent_games[['game', 'rating']].groupby('game').mean().sort_values('rating', ascending = False)[:10].round(3)

Unnamed: 0_level_0,rating
game,Unnamed: 1_level_1
Half-Life,9.81
Astro's Playroom,9.75
Paper Mario: The Thousand-Year Door,9.733
Astral Chain,9.663
Portal 2,9.648
The Witcher 2: Assassins of Kings,9.621
LittleBigPlanet,9.613
DOOM Eternal,9.591
Assassin's Creed Valhalla,9.582
Kirby and the Forgotten Land,9.578


These games definitely look a lot more familiar<br>
Now lets try and pair the games off with games played by the same users

In [21]:
from itertools import permutations
def get_pairs(col):
    pairs = pd.DataFrame(list(permutations(col.values, 2)), columns = ['game1', 'game2'])
    return pairs

In [None]:
# game_pairs = reviews.groupby('ids')['game'].apply(get_pairs)
# game_pairs = game_pairs.reset_index(drop=True)
# game_pairs.head()

In [None]:
# pair_counts = game_pairs.groupby(['game1', 'game2']).size()
# pair_counts

In [None]:
# pairs_df = pair_counts.to_frame(name = 'counts').reset_index()
# pairs_df

In [None]:
# pairs_df = pairs_df.sort_values('counts', ascending=False)

In [28]:
# pairs_unique = pairs_df[pairs_df['game1']!=pairs_df['game2']]
pairs_unique = pd.read_csv('game_pairings_by_counts.csv').drop(['Unnamed: 0'],axis=1)
pairs_unique

Unnamed: 0,game1,game2,counts
0,Perfect Dark Zero,New Super Mario Bros.,141
1,New Super Mario Bros.,Perfect Dark Zero,141
2,F.E.A.R.,New Super Mario Bros.,129
3,New Super Mario Bros.,F.E.A.R.,129
4,Neverwinter Nights,New Super Mario Bros.,113
...,...,...,...
6594881,I Am Bread,Destiny 2: Warmind,1
6594882,I Am Bread,Destiny 2: Forsaken,1
6594883,I Am Bread,Destiny 2: Curse of Osiris,1
6594884,I Am Bread,Depth,1


In [29]:
pairs_unique[pairs_unique['game1']=='Minecraft'][:10]

Unnamed: 0,game1,game2,counts
18056,Minecraft,Rocket League,10
42122,Minecraft,Fortnite,7
46560,Minecraft,Tetris 99,7
49142,Minecraft,Super Mario Maker 2,7
78945,Minecraft,"Snipperclips - Cut it out, together!",6
80086,Minecraft,Thumper,6
96236,Minecraft,Fast RMX,6
96307,Minecraft,Crash Bandicoot N. Sane Trilogy,6
116116,Minecraft,L.A. Noire,5
127769,Minecraft,LEGO City Undercover,5


In [30]:
pairs_unique[pairs_unique['game1']=='Grand Theft Auto V'][:10]

Unnamed: 0,game1,game2,counts
2556,Grand Theft Auto V,Hitman: Absolution,19
2845,Grand Theft Auto V,Mortal Kombat X,18
3715,Grand Theft Auto V,Far Cry 4,17
5000,Grand Theft Auto V,Battlefield Hardline,15
5041,Grand Theft Auto V,Mad Max,15
6728,Grand Theft Auto V,FIFA 15,14
6869,Grand Theft Auto V,The Crew,14
8688,Grand Theft Auto V,Sniper Elite III,13
8850,Grand Theft Auto V,Middle-earth: Shadow of Mordor,13
9159,Grand Theft Auto V,Assassin's Creed Unity,13


In [31]:
pairs_unique[pairs_unique['game1']=='Elden Ring'][:10]

Unnamed: 0,game1,game2,counts
124847,Elden Ring,Tunic,5
129819,Elden Ring,Bloodstained: Ritual of the Night,5
129899,Elden Ring,CrossfireX,5
177731,Elden Ring,Tom Clancy's Ghost Recon: Breakpoint,4
177800,Elden Ring,Metro Exodus,4
177849,Elden Ring,Resident Evil Village,4
177869,Elden Ring,Outriders,4
179840,Elden Ring,Celeste,4
179854,Elden Ring,God of War,4
540285,Elden Ring,Far Cry 6,3


In [96]:
# pairs_unique.to_csv('game_pairings_by_counts.csv')

# Games

In [138]:
games = pd.read_csv('metacritic_games_master2.csv').drop(['Unnamed: 0'], axis = 1)
genre_norepeat = [', '.join(sorted(set(i.split(',')))).strip() for i in games.genre.values]
games['genres'] = genre_norepeat
games.head()

Unnamed: 0,title,release_date,genre,platforms,developer,esrb_rating,ESRBs,metascore,userscore,critic_reviews,user_reviews,num_players,summary,genres
0,Burnout 3: Takedown,"Sep 7, 2004","Driving, Racing, Arcade",Xbox,Criterion Games,T,Mild Language Mild Violence,94,7.7,76,191.0,"1-2 Players, 8 Players Online",Burnout 3 challenges you to crash into (and th...,"Arcade, Racing, Driving"
1,Jet Grind Radio,"Oct 30, 2000","Action, Platformer, 3D",Dreamcast,Smilebit,T,Animated Violence Mild Language,94,8.3,24,105.0,1 Player,"Join a graffiti crew, stamp your territory and...","3D, Platformer, Action"
2,Metal Gear Solid 4: Guns of the Patriots,"Jun 12, 2008","Action Adventure, Modern, General, Modern",PlayStation 3,Kojima Productions,M,Blood Crude Humor Strong Language Suggestive T...,94,8.8,82,4231.0,"1 Player, 16 Players Online Up to 16 Players",Metal Gear Solid 4: Guns of the Patriots featu...,"General, Modern, Action Adventure"
3,Tom Clancy's Splinter Cell Chaos Theory,"Mar 28, 2005","Action Adventure, Modern",Xbox,Ubisoft Montreal,M,Blood Strong Language Violence,94,9.1,70,233.0,"1-4 Players, 4 Players Online","As Sam Fisher, Third Echelon's most skilled Sp...","Modern, Action Adventure"
4,Call of Duty: Modern Warfare 2,"Nov 10, 2009","Action, Shooter, Shooter, First-Person, Modern...",Xbox 360,Infinity Ward,M,Blood Drug Reference Intense Violence Language,94,6.6,100,3055.0,Up to 18 Players,Modern Warfare 2 continues the gripping and he...,"Arcade, First-Person, Modern, Shooter, Action"


In [137]:
games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19317 entries, 0 to 19316
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           19317 non-null  object 
 1   release_date    19317 non-null  object 
 2   genre           19317 non-null  object 
 3   platforms       19317 non-null  object 
 4   developer       19298 non-null  object 
 5   esrb_rating     17202 non-null  object 
 6   ESRBs           7855 non-null   object 
 7   metascore       19317 non-null  int64  
 8   userscore       19317 non-null  object 
 9   critic_reviews  19317 non-null  int64  
 10  user_reviews    17953 non-null  float64
 11  num_players     19304 non-null  object 
 12  summary         19199 non-null  object 
dtypes: float64(1), int64(2), object(10)
memory usage: 1.9+ MB


#### Get Genre Similarities

In [None]:
title_genres = games.iloc[:,[0,13]].drop_duplicates().reset_index(drop=True)
genres_stack = title_genres.genres.str.strip().str.split(', ').apply(pd.Series)
genres_stack.index = title_genres.set_index(['title']).index
title_genre_stack = genres_stack.stack().reset_index(['title'])
title_genre_stack = title_genre_stack.rename(columns={0:'genre'}).drop_duplicates()
title_genre_stack.genre = title_genre_stack.genre.str.strip()
title_genre_stack.head()

In [None]:
title_genre_spread = pd.crosstab(title_genre_stack.title, title_genre_stack.genre)
title_genre_spread = title_genre_spread.iloc[:,1:]
title_genre_spread.head()

In [16]:
darb = title_genre_spread.loc['#IDARB']
xxx = title_genre_spread.loc['xXx']
jscore(darb, xxx)

0.16666666666666666

In [22]:
jdistances = pdist(title_genre_spread.values, metric='jaccard')
squarej = squareform(jdistances)
jsims = 1-squarej
genre_similarities = pd.DataFrame(jsims, index = title_genre_spread.index, columns=title_genre_spread.index)

In [None]:
# genre_similarities.to_csv('genre_similarities.csv')
genre_similarities = pd.read_csv('genre_similarities.csv').set_index('title')
genre_similarities

In [9]:
genre_similarities['Elden Ring'].sort_values(ascending=False)[:10]

title
Conan Chop Chop                             1.0
Sigma Star Saga                             1.0
Akaneiro: Demon Hunters                     1.0
Heroes of Hammerwatch - Ultimate Edition    1.0
Victor Vran: Overkill Edition               1.0
Battle Princess of Arcadias                 1.0
Dark Souls                                  1.0
Dark Souls II                               1.0
Moero Crystal H                             1.0
Dark Souls II: Crown of the Ivory King      1.0
Name: Elden Ring, dtype: float64

#### Summary Comparisons

In [139]:
title_summary = games[['title', 'summary']].dropna().drop_duplicates(subset='title').reset_index(drop=True)
title_summary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12454 entries, 0 to 12453
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    12454 non-null  object
 1   summary  12454 non-null  object
dtypes: object(2)
memory usage: 194.7+ KB


In [140]:
stopwords = sw.words('english')
tfid = TfidfVectorizer(min_df=2, max_df=.7, stop_words = stopwords)
sum_vec = tfid.fit_transform(title_summary.summary)
tfid_summ = pd.DataFrame(sum_vec.toarray(), columns=tfid.get_feature_names())
tfid_summ.index = title_summary.title
tfid_summ.head()

Unnamed: 0_level_0,00,000,007,01,02,03,06,07,08,09,...,zoro,zoya,zulu,zumba,zuntata,zuul,zx,½ll,½s,â½
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Burnout 3: Takedown,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Jet Grind Radio,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Metal Gear Solid 4: Guns of the Patriots,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Tom Clancy's Splinter Cell Chaos Theory,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Call of Duty: Modern Warfare 2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# cos_sim = cosine_similarity(tfid_summ)
# cosine_summ_df = pd.DataFrame(cos_sim, index = tfid_summ.index, columns = tfid_summ.index)
# cosine_summ_df.head()

In [154]:
# cosine_summ_df.to_csv('cosine_summaries.csv')
cosine_summ_df = pd.read_csv('cosine_summaries.csv').set_index('title')
cosine_summ_df.head()

Unnamed: 0,title,Burnout 3: Takedown,Jet Grind Radio,Metal Gear Solid 4: Guns of the Patriots,Tom Clancy's Splinter Cell Chaos Theory,Call of Duty: Modern Warfare 2,NCAA Football 2004,Metal Gear Solid 3: Subsistence,Diablo,Madden NFL 2004,...,Infestation: Survivor Stories (The War Z),Alone in the Dark: Illumination,Ride to Hell: Retribution,SPOGS Racing,Yaris,Double Dragon II: Wander of the Dragons,Vroom in the Night Sky,Family Party: 30 Great Games Obstacle Arcade,Smash T.V.,JoJo's Bizarre Adventure: Eyes of Heaven
0,Burnout 3: Takedown,1.0,0.0,0.003786,0.0,0.018341,0.031566,0.020545,0.0,0.009464,...,0.007152,0.0,0.009921,0.070889,0.042889,0.011889,0.003872,0.0,0.009608,0.010073
1,Jet Grind Radio,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.026917,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Metal Gear Solid 4: Guns of the Patriots,0.003786,0.0,1.0,0.021746,0.024551,0.033788,0.376376,0.0,0.018787,...,0.0,0.006956,0.005481,0.020215,0.017844,0.0,0.001793,0.007688,0.022975,0.045028
3,Tom Clancy's Splinter Cell Chaos Theory,0.0,0.0,0.021746,1.0,0.062034,0.02013,0.039811,0.025631,0.018097,...,0.0,0.007007,0.0,0.00576,0.012193,0.0,0.001254,0.0,0.009208,0.025795
4,Call of Duty: Modern Warfare 2,0.018341,0.0,0.024551,0.062034,1.0,0.069082,0.079231,0.006718,0.0554,...,0.0,0.023957,0.01645,0.042662,0.087567,0.0,0.007685,0.0,0.038884,0.034108


In [158]:
cosine_summ_df.loc['Call of Duty: Warzone'].sort_values(ascending=False)[:10]

Call of Duty: Warzone               1.000000
Call of Duty: Vanguard              0.201527
H1Z1: Battle Royale                 0.177544
Call of Duty: Black Ops Cold War    0.177299
H1Z1                                0.129885
Worldwide Soccer Manager 2009       0.123156
Mercenaries 2: World in Flames      0.118759
Crasher                             0.115666
EA Sports Active 2                  0.112992
Spy Hunter 2                        0.111223
Name: Call of Duty: Warzone, dtype: float64

#### Build user profiles by title

In [8]:
user_titles = pd.DataFrame(reviews[['ids', 'game']].groupby('ids').game.apply(list))
user_titles['game_counts'] = [user_titles.game[i].__len__() for i in range(len(user_titles))]
user_titles

Unnamed: 0_level_0,game,game_counts
ids,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"[Rockstar Games presents Table Tennis, Disney'...",2
1,[Deus Ex: Invisible War],1
2,[Mobile Suit Gundam: Journey to Jaburo],1
3,[Dead to Rights: Reckoning],1
4,[NHL 2K3],1
...,...,...
186351,[Enthusia Professional Racing],1
186352,[Crash Bandicoot 2: N-Tranced],1
186353,[RollerCoaster Tycoon: Loopy Landscapes],1
186354,[Dominions 3: The Awakening],1


In [152]:
user_test = user_titles.loc[0].game
user_games = tfid_summ.reindex(user_test)
user_prof = user_games.mean().values.reshape(1,-1)
non_user_games = tfid_summ.drop(user_test, axis=0)

In [153]:
user_prof_cos = cosine_similarity(user_prof, non_user_games)

In [163]:
# Game similarities for user 0. 
# This person played -->'Rockstar Games presents Table Tennis', "Disney's Magical Mirror Starring Mickey Mouse"
pd.DataFrame(user_prof_cos.T, index = non_user_games.index, columns = ['similarity']).sort_values('similarity', ascending=False)

Unnamed: 0_level_0,similarity
title,Unnamed: 1_level_1
DIRT 5,0.711571
Borderlands Legendary Collection,0.309851
Borderlands: The Pre-Sequel,0.285803
Borderlands: The Handsome Collection,0.275907
Disney's Hide and Sneak,0.273439
...,...
Before the Echo,0.000000
ChromaGun VR,0.000000
Tokyo Twilight Ghost Hunters,0.000000
Skully,0.000000


Clearly our parameters need to be adjusted bc this person's games are no where close to Dirt 5, a full fledged racing game, yet the similarity score is very high.  

#### Collaborate based on user ratings

In [47]:
users_over11 = user_titles[user_titles.game_counts>3].index
len(users_over11)

18344

In [48]:
user_ratings = reviews.set_index('ids').iloc[:,:-1].loc[users_over11]
user_ratings.loc[205]

Unnamed: 0_level_0,game,rating
ids,Unnamed: 1_level_1,Unnamed: 2_level_1
205,PlanetSide 2,10
205,Resogun,10
205,Knack,10
205,Need for Speed: Rivals,8
205,Dragon's Dogma: Dark Arisen,10
205,D4: Dark Dreams Don't Die,10
205,Velocity 2X,10
205,NieR: Automata,10
205,Death Stranding: Director's Cut,10
205,Destiny,0


In [49]:
user_ratings_pivot = user_ratings.pivot(columns='game', values='rating')
user_ratings_pivot.head()

game,#DRIVE,#IDARB,#KILLALLZOMBIES,'Splosion Man,.detuned,.hack//G.U. Last Recode,.hack//G.U. vol. 1//Rebirth,.hack//G.U. vol. 3//Redemption,.hack//Infection Part 1,.hack//Mutation Part 2,...,kill.switch,lilt line,moon,nail'd,oOo: Ascension,rain,theHunter: Call of the Wild,uDraw Studio,void tRrLM(); //Void Terrarium,xXx
ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20,,,,,,,,,,,...,,,,,,,,,,
26,,,,,,,,,,,...,,,,,,,,,,
35,,,,,,,,,,,...,,,,,,,,,,
46,,,,,,,,,,,...,,,,,,,,,,
58,,,,,,,,,,,...,,,,,,,,,,


In [50]:
avg_ratings = user_ratings_pivot.mean(axis=1)
user_ratings_centered = user_ratings_pivot.sub(avg_ratings, axis=0)
user_ratings_centered.loc[58360]

game
#DRIVE                                 NaN
#IDARB                                 NaN
#KILLALLZOMBIES                        NaN
'Splosion Man                          NaN
.detuned                         -8.119048
                                    ...   
rain                                   NaN
theHunter: Call of the Wild            NaN
uDraw Studio                           NaN
void tRrLM(); //Void Terrarium         NaN
xXx                                    NaN
Name: 58360, Length: 11069, dtype: float64

In [51]:
user_ratings_zeroed = user_ratings_centered.fillna(0)
game_ratings_pivot = user_ratings_zeroed.T
game_ratings_pivot.head()

ids,20,26,35,46,58,79,82,95,115,126,...,186204,186221,186262,186290,186304,186311,186314,186316,186326,186341
game,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
#DRIVE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#IDARB,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#KILLALLZOMBIES,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Splosion Man,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.detuned,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [52]:
# Game rating similarity based on users
sim_ratings = cosine_similarity(game_ratings_pivot)
cos_game_ratings = pd.DataFrame(sim_ratings, index=game_ratings_pivot.index, columns=game_ratings_pivot.index)
cos_game_ratings.head()

game,#DRIVE,#IDARB,#KILLALLZOMBIES,'Splosion Man,.detuned,.hack//G.U. Last Recode,.hack//G.U. vol. 1//Rebirth,.hack//G.U. vol. 3//Redemption,.hack//Infection Part 1,.hack//Mutation Part 2,...,kill.switch,lilt line,moon,nail'd,oOo: Ascension,rain,theHunter: Call of the Wild,uDraw Studio,void tRrLM(); //Void Terrarium,xXx
game,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
#DRIVE,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#IDARB,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.004536,0.0,0.0,0.0,0.0
#KILLALLZOMBIES,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Splosion Man,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.087878,0.0,...,0.153819,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.detuned,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [54]:
# games most similarly rated to Elden Ring
cos_game_ratings.loc['Elden Ring'].sort_values(ascending=False)[:10]

game
Elden Ring                                1.000000
POSTAL 4: No Regerts                      0.146831
Encased: a sci-fi post-apocalyptic RPG    0.141375
Magicka: Wizard Wars                      0.121801
ATOM RPG: Post-apocalyptic indie game     0.101070
Book of Demons                            0.099765
Lost Judgment                             0.091209
Dying Light 2 Stay Human                  0.077802
Daytona USA                               0.073790
Dead or Alive 5 Plus                      0.070621
Name: Elden Ring, dtype: float64

In [55]:
# How closely rated are these two similar games
warzone = game_ratings_pivot.loc['Call of Duty: Warzone', :].values.reshape(1, -1)
mw2 = game_ratings_pivot.loc['Call of Duty: Modern Warfare 2', :].values.reshape(1, -1)
cod_sims = cosine_similarity(warzone, mw2)
cod_sims

array([[0.00128156]])

In [56]:
# Comparing users to eachother
sim_users = cosine_similarity(user_ratings_zeroed)
cos_user_ratings = pd.DataFrame(sim_users, index=user_ratings_zeroed.index, columns=user_ratings_zeroed.index)
cos_user_ratings.head()

ids,20,26,35,46,58,79,82,95,115,126,...,186204,186221,186262,186290,186304,186311,186314,186316,186326,186341
ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
46,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
58,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [59]:
ordered_similarities = cos_user_ratings.loc[205].sort_values(ascending=False)
Knearest = ordered_similarities[1:10].index
Knearest

Int64Index([81852, 158258, 101482, 8906, 34756, 184613, 83141, 69640, 1881], dtype='int64', name='ids')

In [60]:
user_ratings_pivot.reindex(Knearest).mean(axis=0).sort_values(ascending=False)[:10]

game
Football Manager 2017                     10.0
Sonic Mania                               10.0
Into the Breach                           10.0
Hellblade: Senua's Sacrifice              10.0
Grand Theft Auto V                        10.0
Star Wars: Knights of the Old Republic    10.0
Lost Odyssey                              10.0
Octopath Traveler                         10.0
Mad Max                                   10.0
It Takes Two                              10.0
dtype: float64

#### Predict User Score

In [61]:
ratings_no_eldenring = user_ratings_zeroed.drop('Elden Ring', axis = 1)
target_user_x = ratings_no_eldenring.loc[[205]]
target_user_x

game,#DRIVE,#IDARB,#KILLALLZOMBIES,'Splosion Man,.detuned,.hack//G.U. Last Recode,.hack//G.U. vol. 1//Rebirth,.hack//G.U. vol. 3//Redemption,.hack//Infection Part 1,.hack//Mutation Part 2,...,kill.switch,lilt line,moon,nail'd,oOo: Ascension,rain,theHunter: Call of the Wild,uDraw Studio,void tRrLM(); //Void Terrarium,xXx
ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
205,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [62]:
other_users_yna = user_ratings_pivot['Elden Ring']

In [63]:
other_users_x = ratings_no_eldenring[other_users_yna.notnull()]
other_users_x

game,#DRIVE,#IDARB,#KILLALLZOMBIES,'Splosion Man,.detuned,.hack//G.U. Last Recode,.hack//G.U. vol. 1//Rebirth,.hack//G.U. vol. 3//Redemption,.hack//Infection Part 1,.hack//Mutation Part 2,...,kill.switch,lilt line,moon,nail'd,oOo: Ascension,rain,theHunter: Call of the Wild,uDraw Studio,void tRrLM(); //Void Terrarium,xXx
ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5722,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13188,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17679,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22206,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26577,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28449,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32983,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34387,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [64]:
other_users_y = other_users_yna.dropna()

In [65]:
user_knn = KNeighborsRegressor(metric='cosine', n_neighbors=9)
user_knn.fit(other_users_x, other_users_y)
target_predict = user_knn.predict(target_user_x)

In [66]:
target_predict

array([5.66666667])

In [67]:
reviews_per_game = user_ratings_pivot.notnull().sum().sort_values()
fig = px.histogram(reviews_per_game)
fig.update_layout(xaxis=dict(title='number of reviews per game'), yaxis=dict(title='frequency'), showlegend=False)

Most games were reviewed very few times. We will attempt to predict the missing review ratings using Matrix Factorization. 

In [68]:
u, sigma, vt = svds(user_ratings_zeroed)
sigma = np.diag(sigma)

In [None]:
recalculated_user_ratings = np.dot(np.dot(u, sigma), vt)
recalculated_user_ratings+=avg_ratings.values.reshape(-1,1)
recalculated_user_ratings=pd.DataFrame(recalculated_user_ratings)

In [77]:
user_rows, user_cols = user_ratings_pivot.shape[0],user_ratings_pivot.shape[1]
percent_grid = .2
top_left_ratings = user_ratings_pivot.iloc[:int(percent_grid*user_rows), :int(percent_grid*user_cols)].values
copy_of_ratings = user_ratings_pivot.copy(deep=True)
copy_of_ratings.iloc[:int(percent_grid*user_rows), :int(percent_grid*user_cols)] = np.nan
predicted_topleft_ratings = recalculated_user_ratings.iloc[:int(percent_grid*user_rows), :int(percent_grid*user_cols)].values
mask_topleft = ~np.isnan(top_left_ratings)
top_left_ratings[mask_topleft]

array([ 8., 10.,  9., ...,  3.,  9.,  8.])

In [78]:
 predicted_topleft_ratings[mask_topleft]

array([8.14073019, 8.25228196, 7.16671277, ..., 7.07130925, 7.07209889,
       7.40644798])

In [79]:
mean_squared_error(top_left_ratings[mask_topleft], predicted_topleft_ratings[mask_topleft], squared=False)

2.4006561117330327

# Kaggle mini set

In [45]:
kagg = pd.read_csv('metacritic_reviews.csv').drop(['Unnamed: 0'],axis=1)

In [60]:
kagg['user'] = kagg.groupby('name').ngroup()
kagg = kagg[['user','name','game','rating','review']]
kagg = kagg.sort_values('name').drop_duplicates()
kagg = kagg.iloc[:,[0,2,3,4]]

In [61]:
kagg[['user']].value_counts()[kagg[['user']].value_counts()>10]

user
397     47
7087    39
6783    16
890     15
6116    13
7762    12
8211    12
6563    12
6656    11
9834    11
dtype: int64

In [55]:
kagg[kagg['user']==9834]

Unnamed: 0,user,name,game,rating,review
1852,9834,wesker2012,It Takes Two,8,It Takes Two is a great triple A puzzle platfo...
2136,9834,wesker2012,Dead Space,10,The new king of survival horror. Like playing...
2399,9834,wesker2012,Dishonored,10,Everything about this game is top notch. The ...
2906,9834,wesker2012,Resident Evil 2,10,Awesome game. 2 whole quests.
2976,9834,wesker2012,The Curse of Monkey Island,10,Classic LucasArts point and click adventure ma...
4386,9834,wesker2012,L.A. Noire,5,"I was looking forward to this game, it looks r..."
4806,9834,wesker2012,Silent Hill 2,7,One of the scariest psychological thrillers of...
6904,9834,wesker2012,Shadow Complex,8,Awesome Metroidvania type of game. One of the ...
7226,9834,wesker2012,What Remains of Edith Finch,7,Unique and interesting game. The visuals were...
8922,9834,wesker2012,BioShock 2,8,Great game.


In [63]:
kagg = kagg.sort_values('user')

In [64]:
# kagg.to_csv('metacritic_reviews.csv')

In [65]:
kagg

Unnamed: 0,user,game,rating,review
1744,0,L.A. Noire,10,The most beautifully crafted game since Heavy ...
7386,1,Sekiro: Shadows Die Twice,10,"A departure from the Soulsborne formula, while..."
11593,2,XCOM 2,5,An RNG fest. A whole level will frequently be ...
9346,3,NieR: Automata,10,NieR: Automata is the biggest sleeper hit of 2...
10890,4,DOOM Eternal,10,the overly negative reviewers of this game get...
...,...,...,...,...
1878,9973,It Takes Two,4,"Actually a great game in a creative setting, w..."
7418,9974,Sekiro: Shadows Die Twice,10,This game is a perfect 10. I have spent upward...
8254,9975,Super Mario Maker,7,Super Mario Maker is a fun tool for making you...
11456,9976,Torchlight II,10,LOVE IT! Worth it since its only $20!! Don't b...
