In [24]:
import numpy as np 
import pandas as pd 
from langdetect import detect
from textblob import TextBlob
import time

# IMPORT

In [2]:
df_info = pd.read_csv('raw_data/metacritic_game_info.csv')
df_info.head(3)

Unnamed: 0.1,Unnamed: 0,Title,Year,Publisher,Genre,Platform,Metascore,Avg_Userscore,No_Players
0,0,The Legend of Zelda: Ocarina of Time,1998,Nintendo,Action Adventure;Fantasy,Nintendo64,99,9.1,1 Player
1,1,Tony Hawk's Pro Skater 2,2000,NeversoftEntertainment,Sports;Alternative;Skateboarding,PlayStation,98,7.4,1-2
2,2,Grand Theft Auto IV,2008,RockstarNorth,Action Adventure;Modern;Modern;Open-World,PlayStation3,98,7.5,1 Player


In [2]:
df_comments = pd.read_csv('raw_data/metacritic_game_user_comments.csv')
df_comments.head(3)

Unnamed: 0.1,Unnamed: 0,Title,Platform,Userscore,Comment,Username
0,0,The Legend of Zelda: Ocarina of Time,Nintendo64,10,"Everything in OoT is so near at perfection, it...",SirCaestus
1,1,The Legend of Zelda: Ocarina of Time,Nintendo64,10,I won't bore you with what everyone is already...,Kaistlin
2,2,The Legend of Zelda: Ocarina of Time,Nintendo64,10,Anyone who gives the masterpiece below a 7 or ...,Jacody


In [4]:
df_info.shape, df_comments.shape

((5000, 9), (283983, 6))

In [5]:
df_comments.Userscore.value_counts(normalize=True)

10    0.396260
9     0.188353
8     0.106594
0     0.070849
7     0.060046
6     0.042383
5     0.035611
4     0.028002
1     0.025769
3     0.025065
2     0.021068
Name: Userscore, dtype: float64

# Metacritic Game Info

In [5]:
# remove the extra index

df_info = df_info.drop(columns='Unnamed: 0')
df_info.head(1)

Unnamed: 0,Title,Year,Publisher,Genre,Platform,Metascore,Avg_Userscore,No_Players
0,The Legend of Zelda: Ocarina of Time,1998,Nintendo,Action Adventure;Fantasy,Nintendo64,99,9.1,1 Player


In [6]:
df_info.nunique()

Title            3438
Year               25
Publisher        1238
Genre             927
Platform           20
Metascore          24
Avg_Userscore      70
No_Players         52
dtype: int64

In [7]:
df_info.duplicated().sum()

7

In [8]:
# remove duplicates

print(f'{df_info.duplicated().sum()} duplicates have been removed.')

df_info.drop_duplicates(inplace=True)

7 duplicates have been removed.


- year

In [15]:
# remove 'not specified' in column Year
# 4 of them

t = df_info[df_info['Year'] == 'not specified'].shape[0]
print(f'{t} nans have been removed.')

drop_index = df_info[df_info['Year'] == 'not specified'].index
df_info.drop(drop_index, inplace=True)

4 nans have been removed.


- No_Players

In [16]:
df_info['No_Players'].value_counts()

not specified            1539
No Online Multiplayer     852
1 Player                  573
1-2                       351
1-4                       334
1-8                       127
2  Online                 123
4  Online                 113
Up to 4                   103
8  Online                  94
Online Multiplayer         81
Up to 8                    78
2                          74
Massively Multiplayer      52
16  Online                 49
1-16                       47
Up to 10                   32
Up to 16                   32
6  Online                  32
Up to 6                    30
Up to 12                   28
32  Online                 26
Up to 18                   24
10  Online                 20
1-32                       18
Up to 22                   17
1-6                        16
12  Online                 15
Up to 24                   10
1-3                         9
Up to 20                    9
24  Online                  8
1-10                        7
Up to 64  

In [25]:
df_info[df_info['No_Players'].isna() == True]

Unnamed: 0,Title,Year,Publisher,Genre,Platform,Metascore,Avg_Userscore,No_Players
663,Shenmue II,2001,SegaAM2,Action Adventure;Modern,Dreamcast,88,9.1,
1171,Lunar 2: Eternal Blue Complete,2000,GameArts,Role-Playing;Console-style RPG,PlayStation,86,8.9,
1952,Ultra Street Fighter IV,2014,Capcom,Fighting;3D;Action;Fighting;2D;3D,Xbox360,84,6.8,
2051,Ultra Street Fighter IV,2014,Capcom,Fighting;3D;Action;Fighting;2D;3D,PlayStation3,83,7.5,
3244,Wreckfest,2018,Bugbear,Driving;General;General;Racing;Arcade;Automobile,PC,81,8.3,
4433,ONRUSH,2018,Codemasters,Racing;Arcade;Automobile,XboxOne,78,6.6,
4441,Donkey Kong Country,2003,RareLtd.,Action;Platformer;2D,GameBoyAdvance,78,8.9,


In [35]:
df_info.sample(5)

Unnamed: 0,Title,Year,Publisher,Genre,Platform,Metascore,Avg_Userscore,No_Players
4696,NHL 2K7,2006,KushGames,Sports;Traditional;Ice Hockey;Sim,Xbox,78,tbd,8 Online
2245,Tekken Tag Tournament 2,2012,NamcoBandaiGames,Action;Fighting;Fighting;3D;3D,Xbox360,83,7.6,Up to 6
1060,GRID,2008,Codemasters,Driving;Racing;Simulation;GT / Street;GT / Str...,Xbox360,87,7.9,12 Online
1137,Unavowed,2018,WadjetEyeGames,Adventure;General,PC,86,8.0,No Online Multiplayer
4811,Shadows of Adam,2017,SomethingClassicGamesLLC,Role-Playing;Japanese-Style,PC,78,8.2,No Online Multiplayer


In [91]:
df_info.columns

Index(['Unnamed: 0', 'Title', 'Year', 'Publisher', 'Genre', 'Platform',
       'Metascore', 'Avg_Userscore', 'No_Players'],
      dtype='object')

# Metacritic Review

In [3]:
# remove the extra index

df_comments = df_comments.drop(columns='Unnamed: 0')
df_comments.head(1)

Unnamed: 0,Title,Platform,Userscore,Comment,Username
0,The Legend of Zelda: Ocarina of Time,Nintendo64,10,"Everything in OoT is so near at perfection, it...",SirCaestus


In [4]:
df_comments.shape

(283983, 5)

In [5]:
df_comments.drop_duplicates(inplace=True)
df_comments.dropna(inplace=True)

In [6]:
df_comments.shape

(282201, 5)

In [27]:
df_comments_5000 = df_comments.sample(5000)

In [28]:
start = time.time()

df_comments_5000['lang'] = df_comments_5000['Title'].apply(detect)

end = time.time()
print(f'this function took {end-start} seconds.')

this function took 45.12115955352783 seconds.


In [29]:
df_comments_5000

Unnamed: 0,Title,Platform,Userscore,Comment,Username,lang
154476,Sins of a Solar Empire,PC,10,A massively epic game in the same vein of Sup...,ChrisB.,en
10697,Half-Life 2,PC,10,"Throughout the years, the originality and the ...",Qrutch,en
38518,BioShock Infinite,PC,10,Easily one of the best games I've ever played....,Juiposa,en
37150,Gears of War,Xbox360,9,"First of all, this game was amazing. I'm not ...",VersemS.,en
9780,Super Mario Odyssey,Switch,10,"""It's too easy""What's with all the morons sayi...",Gnafar,en
...,...,...,...,...,...,...
223635,Shift 2: Unleashed,PC,2,Horrible game. Cars are basically uncontrollab...,Gulskjegg,en
49540,Minecraft,PC,0,I gaveMinecraft a zero not a ten :). So what I...,lliam,ro
186219,Ni no Kuni: Wrath of the White Witch,PlayStation3,9,I loved this game so much! It still reminiscin...,Rustedtruck,en
153258,Subnautica,PC,9,I played this game once when it was in an alph...,arruu,ro


In [30]:
df_comments_5000['lang'].value_counts(normalize=True)

en    0.6050
de    0.0616
es    0.0330
id    0.0308
tl    0.0298
fr    0.0220
it    0.0218
nl    0.0204
cy    0.0194
af    0.0154
so    0.0152
no    0.0144
da    0.0132
pt    0.0132
ca    0.0126
fi    0.0114
sw    0.0104
hu    0.0080
pl    0.0080
ro    0.0068
sl    0.0042
sq    0.0040
tr    0.0034
sv    0.0032
et    0.0032
hr    0.0030
sk    0.0028
vi    0.0018
lt    0.0010
lv    0.0006
cs    0.0004
Name: lang, dtype: float64

In [None]:
start = time.time()

df_comments['lang'] = df_comments['Title'].apply(detect)

end = time.time()
print(f'this function took {end-start} seconds.')

# EXPORT

- NLP

In [7]:
df_comments.columns

Index(['Title', 'Platform', 'Userscore', 'Comment', 'Username'], dtype='object')

In [8]:
col_sel = ['Title', 'Userscore', 'Comment']
df_comments_nlp = df_comments[col_sel]

In [9]:
# replace userscore 0.0 with nan
# assume 0.0 means not rated
# this df is for NLP EDA and modeling, doesn't need missing values

df_comments_nlp.loc[df_comments_nlp['Userscore'] == 0, 'Userscore'] = np.NAN

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_comments_nlp.loc[df_comments_nlp['Userscore'] == 0, 'Userscore'] = np.NAN


In [10]:
df_comments_nlp.shape

(282201, 3)

In [11]:
df_comments_nlp.drop_duplicates(inplace=True)
df_comments_nlp.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_comments_nlp.drop_duplicates(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_comments_nlp.dropna(inplace=True)


In [12]:
df_comments_nlp.shape

(260269, 3)

In [14]:
df_comments_nlp.to_csv(r'processed_df\metacritic_comments_nlp.csv', index = False)