# Data cleaning

In [1]:
# Importing libraries
import pandas as pd
import numpy as np

In [2]:
# Importing the datasets
games = pd.read_csv('Datasets/metacritic_game_info.csv')
comments = pd.read_csv('Datasets/metacritic_game_user_comments.csv')

## Games

In [3]:
# Taking a look at the dataset and dropping this column
games.drop('Unnamed: 0', axis = 1, inplace = True)
games

Unnamed: 0,Title,Year,Publisher,Genre,Platform,Metascore,Avg_Userscore,No_Players
0,The Legend of Zelda: Ocarina of Time,1998,Nintendo,Action Adventure;Fantasy,Nintendo64,99,9.1,1 Player
1,Tony Hawk's Pro Skater 2,2000,NeversoftEntertainment,Sports;Alternative;Skateboarding,PlayStation,98,7.4,1-2
2,Grand Theft Auto IV,2008,RockstarNorth,Action Adventure;Modern;Modern;Open-World,PlayStation3,98,7.5,1 Player
3,SoulCalibur,1999,Namco,Action;Fighting;3D,Dreamcast,98,8.6,1-2
4,Grand Theft Auto IV,2008,RockstarNorth,Action Adventure;Modern;Modern;Open-World,Xbox360,98,7.9,1 Player
...,...,...,...,...,...,...,...,...
4995,Donut County,2018,BenEsposito,Action Adventure;General,PC,77,8.1,No Online Multiplayer
4996,MotorStorm: Apocalypse,2011,EvolutionStudios,Driving;Racing;Simulation;Rally / Offroad;Rall...,PlayStation3,77,7.7,4 Online
4997,The Last Guy,2008,SCEJapanStudio,Action Adventure;Sci-Fi;Sci-Fi;General,PlayStation3,77,6.8,1 Player
4998,Valiant Hearts: The Great War,2014,UbisoftMontpellier,Platformer;2D;Action;Platformer;2D,PlayStation4,77,8.4,not specified


In [4]:
# Checking for nulls
print(games.isna().sum())  # got 7

# Checking which are these 7 rows for curiosity and if I can save them or not
games[games.isnull().any(axis = 1)]

Title            0
Year             0
Publisher        0
Genre            0
Platform         0
Metascore        0
Avg_Userscore    0
No_Players       7
dtype: int64


Unnamed: 0,Title,Year,Publisher,Genre,Platform,Metascore,Avg_Userscore,No_Players
663,Shenmue II,2001,SegaAM2,Action Adventure;Modern,Dreamcast,88,9.1,
1171,Lunar 2: Eternal Blue Complete,2000,GameArts,Role-Playing;Console-style RPG,PlayStation,86,8.9,
1952,Ultra Street Fighter IV,2014,Capcom,Fighting;3D;Action;Fighting;2D;3D,Xbox360,84,6.8,
2051,Ultra Street Fighter IV,2014,Capcom,Fighting;3D;Action;Fighting;2D;3D,PlayStation3,83,7.5,
3244,Wreckfest,2018,Bugbear,Driving;General;General;Racing;Arcade;Automobile,PC,81,8.3,
4433,ONRUSH,2018,Codemasters,Racing;Arcade;Automobile,XboxOne,78,6.6,
4441,Donkey Kong Country,2003,RareLtd.,Action;Platformer;2D,GameBoyAdvance,78,8.9,


In [5]:
# Filling these 7 rows with 'not specified' instead of dropping
games.fillna('not specified', inplace = True)

# Checking again for nulls. All good
print(games.isna().sum())

Title            0
Year             0
Publisher        0
Genre            0
Platform         0
Metascore        0
Avg_Userscore    0
No_Players       0
dtype: int64


In [6]:
# Checking the data types
print(games.dtypes, '\n')  # everything is 'object'. Will change this

# Converting all columns but those 3 to strings
for i in games.columns:
    if i not in ['Year', 'Metascore', 'Avg_Userscore']:
        games[i] = games[i].astype('string')


# List comprehension version to convert all columns to strings except those 3
## Doesn't work cuz I haven't found a way to 'save' or assign games[i] = games[i]... like in the foor loop above
#[games[i].astype('string') for i in games.columns if i not in ['Year', 'Metascore', 'Avg_Userscore']]

Title            object
Year             object
Publisher        object
Genre            object
Platform         object
Metascore        object
Avg_Userscore    object
No_Players       object
dtype: object 



In [7]:
# Checking the types again. All good
print(games.dtypes)

Title            string
Year             object
Publisher        string
Genre            string
Platform         string
Metascore        object
Avg_Userscore    object
No_Players       string
dtype: object


In [8]:
# Checking the year column's values. Want to convert to int
games['Year'].unique()

array(['1998', '2000', '2008', '1999', '2007', '2010', '2014', '2013',
       '2017', '2001', '2002', '2004', '2015', '1997', '2009', '2005',
       '2011', '2006', '2003', '2018', '1996', '2012', '2016',
       'not specified', '1995'], dtype=object)

In [9]:
# Checking what's in here
games[games['Year'] == 'not specified']  # gonna drop all this useless information

Unnamed: 0,Title,Year,Publisher,Genre,Platform,Metascore,Avg_Userscore,No_Players
808,Error 503 Service Unavailable,not specified,not specified,no genre,not specified,not specified,not specified,not specified
1360,Bad Request,not specified,not specified,no genre,not specified,not specified,not specified,not specified
2122,This page isn’t working,not specified,not specified,no genre,not specified,not specified,not specified,not specified
2123,This page isn’t working,not specified,not specified,no genre,not specified,not specified,not specified,not specified
2124,This page isn’t working,not specified,not specified,no genre,not specified,not specified,not specified,not specified
2125,This page isn’t working,not specified,not specified,no genre,not specified,not specified,not specified,not specified
2126,This page isn’t working,not specified,not specified,no genre,not specified,not specified,not specified,not specified
2127,This page isn’t working,not specified,not specified,no genre,not specified,not specified,not specified,not specified
2977,page not found,not specified,not specified,no genre,not specified,not specified,not specified,not specified
4408,Bad Request,not specified,not specified,no genre,not specified,not specified,not specified,not specified


In [10]:
# Dropping those rows mentioned above
games.drop(games[games['Year'] == 'not specified'].index, inplace = True)

# Checking if it worked. All good. Ready to be converted to int
games[games['Year'] == 'not specified']

Unnamed: 0,Title,Year,Publisher,Genre,Platform,Metascore,Avg_Userscore,No_Players


In [19]:
# Checking Year values
games['Year'].unique()  # seems ready to be converted to int

array(['1998', '2000', '2008', '1999', '2007', '2010', '2014', '2013',
       '2017', '2001', '2002', '2004', '2015', '1997', '2009', '2005',
       '2011', '2006', '2003', '2018', '1996', '2012', '2016', '1995'],
      dtype=object)

In [18]:
# Checking Metascore values
games['Metascore'].unique()  # seems ready to be converted to int as well

array(['99', '98', '97', '96', '95', '94', '93', '92', '91', '90', '89',
       '88', '87', '86', '85', '84', '83', '82', '81', '80', '79', '78',
       '77'], dtype=object)

In [20]:
for i in games.columns:
    if i in ['Year', 'Metascore']:
        games[i] = games[i].astype(int)

# Seeing if it worked. All good
games.dtypes

Title            string
Year              int64
Publisher        string
Genre            string
Platform         string
Metascore         int64
Avg_Userscore    object
No_Players       string
dtype: object

In [21]:
# Checking the avg_userscore values
games['Avg_Userscore'].unique()

array(['9.1', '7.4', '7.5', '8.6', '7.9', '9.0', '7.8', '8.3', '6.2',
       '8.5', '8.9', '9.2', '6.4', '7.7', '8.2', '9.3', '8.8', '8.7',
       '6.8', '8.4', '7.0', '8.1', '7.3', '8.0', '5.9', '5.6', '6.5',
       '7.6', '3.4', '5.0', '6.7', '5.8', '6.6', '4.5', '6.3', '7.2',
       '6.9', '5.5', 'tbd', '6.1', '7.1', '5.7', '5.2', '6.0', '4.1',
       '3.2', 'not specified', '4.2', '4.9', '3.1', '5.1', '5.4', '4.3',
       '4.4', '4.7', '4.8', '1.6', '5.3', '3.3', '3.5', '3.7', '3.8',
       '3.6', '2.7', '1.7', '2.1', '2.8', '4.6', '4.0', '2.4'],
      dtype=object)

In [26]:
games[games['Avg_Userscore'] == 'not specified']  # 1 game - could just drop, I guess

games[games['Avg_Userscore'] == 'tbd'].sort_values('Year')  # 75 games - not sure if I drop or fill with smth

Unnamed: 0,Title,Year,Publisher,Genre,Platform,Metascore,Avg_Userscore,No_Players
3146,Flanker 2.0,1999,FlyingLegends,Simulation;Flight;Modern Jet;Modern Jet;Combat,PC,81,tbd,1-16
4366,Swing Away Golf,2000,T&ESoft,Sports;Traditional;Golf;Arcade,PlayStation2,78,tbd,1-4
4831,NHL FaceOff 2001,2000,SolWorks,Sports;Traditional;Ice Hockey;Sim,PlayStation,78,tbd,not specified
2883,NASCAR Heat 2002,2001,MonsterGamesInc.,Driving;Racing;Stock Car,PlayStation2,81,tbd,1-2
2741,NASCAR Thunder 2002,2001,EATiburon,Driving;Racing;Stock Car,Xbox,82,tbd,1-4
...,...,...,...,...,...,...,...,...
2647,The Lion's Song,2018,Mi'pu'miGames,Adventure;Point-and-Click,Switch,82,tbd,No Online Multiplayer
4484,Runner3,2018,ChoiceProvisions,Action;Platformer;2D,PC,78,tbd,No Online Multiplayer
3873,Space Invaders Extreme,2018,TaitoCorporation,Action;Shooter;Shoot-'Em-Up;Vertical,PC,79,tbd,not specified
4809,Legendary Gary,2018,EvanRogers,Strategy;Turn-Based;General,PC,78,tbd,not specified


In [54]:
games_scores = games.groupby('Title', as_index = False).agg(
    {'Metascore': 'mean'}).sort_values('Metascore', ascending = False).reset_index(drop = True)

games_scores

Unnamed: 0,Title,Metascore
0,The Legend of Zelda: Ocarina of Time,99.0
1,NFL 2K1,97.0
2,Super Mario Odyssey,97.0
3,Super Mario Galaxy 2,97.0
4,Super Mario Galaxy,97.0
...,...,...
3429,Tower of Time,77.0
3430,Delta Force: Black Hawk Down,77.0
3431,Deathtrap,77.0
3432,Dead or Alive 5 Ultimate,77.0


## Comments

In [None]:
# Try to see, at a later point, if we can have like a count of words or something for each game
# to see the most used words and help ux/ui better market stuff?

In [42]:
# Taking a look at the dataset and dropping this column
comments.drop('Unnamed: 0', axis = 1, inplace = True)
comments

Unnamed: 0,Title,Platform,Userscore,Comment,Username
0,The Legend of Zelda: Ocarina of Time,Nintendo64,10,"Everything in OoT is so near at perfection, it...",SirCaestus
1,The Legend of Zelda: Ocarina of Time,Nintendo64,10,I won't bore you with what everyone is already...,Kaistlin
2,The Legend of Zelda: Ocarina of Time,Nintendo64,10,Anyone who gives the masterpiece below a 7 or ...,Jacody
3,The Legend of Zelda: Ocarina of Time,Nintendo64,10,I'm one of those people who think that this is...,doodlerman
4,The Legend of Zelda: Ocarina of Time,Nintendo64,10,This game is the highest rated game on Metacr...,StevenA
...,...,...,...,...,...
283978,Etrian Odyssey Untold: The Millennium Girl,3DS,7,"Extremely similar to EO:4, which obviously isn...",RileyWRussell
283979,Etrian Odyssey Untold: The Millennium Girl,3DS,0,Typical overrated Atlus trash. A game i should...,TemplarGR
283980,Etrian Odyssey Untold: The Millennium Girl,3DS,9,While I find the story mode to have annoying c...,midipon
283981,Etrian Odyssey Untold: The Millennium Girl,3DS,8,"Pretty good, but it certainly lacks the visual...",night4


In [43]:
# Checking for nulls
print(comments.isna().sum(), '\n')  # 26

# Dropping the nulls
comments.dropna(inplace = True)

# Checking again. All good
print(comments.isna().sum())

Title         0
Platform      0
Userscore     0
Comment      23
Username      3
dtype: int64 

Title        0
Platform     0
Userscore    0
Comment      0
Username     0
dtype: int64


In [44]:
# Checking the data types
print(comments.dtypes, '\n')

# Converting the 'objects' to strings
for i in comments.columns:
    if i != 'Userscore':
        comments[i] = comments[i].astype('string')

# Checking again. All good
print(comments.dtypes)
comments

Title        object
Platform     object
Userscore     int64
Comment      object
Username     object
dtype: object 

Title        string
Platform     string
Userscore     int64
Comment      string
Username     string
dtype: object


Unnamed: 0,Title,Platform,Userscore,Comment,Username
0,The Legend of Zelda: Ocarina of Time,Nintendo64,10,"Everything in OoT is so near at perfection, it...",SirCaestus
1,The Legend of Zelda: Ocarina of Time,Nintendo64,10,I won't bore you with what everyone is already...,Kaistlin
2,The Legend of Zelda: Ocarina of Time,Nintendo64,10,Anyone who gives the masterpiece below a 7 or ...,Jacody
3,The Legend of Zelda: Ocarina of Time,Nintendo64,10,I'm one of those people who think that this is...,doodlerman
4,The Legend of Zelda: Ocarina of Time,Nintendo64,10,This game is the highest rated game on Metacr...,StevenA
...,...,...,...,...,...
283978,Etrian Odyssey Untold: The Millennium Girl,3DS,7,"Extremely similar to EO:4, which obviously isn...",RileyWRussell
283979,Etrian Odyssey Untold: The Millennium Girl,3DS,0,Typical overrated Atlus trash. A game i should...,TemplarGR
283980,Etrian Odyssey Untold: The Millennium Girl,3DS,9,While I find the story mode to have annoying c...,midipon
283981,Etrian Odyssey Untold: The Millennium Girl,3DS,8,"Pretty good, but it certainly lacks the visual...",night4


In [45]:
games[games['Title'] == "'Splosion Man"]

Unnamed: 0,Title,Year,Publisher,Genre,Platform,Metascore,Avg_Userscore,No_Players
1710,'Splosion Man,2009,TwistedPixelGames,Action;General;Platformer;Platformer;2D;2D,Xbox360,84,7.8,4 Online


In [46]:
avg_user_score = comments.groupby('Title').agg({'Userscore': 'mean'})
avg_user_score

Unnamed: 0_level_0,Userscore
Title,Unnamed: 1_level_1
'Splosion Man,8.687500
007: The World is Not Enough,7.200000
2010 FIFA World Cup South Africa,7.687500
3D After Burner II,8.000000
3D Gunstar Heroes,8.750000
...,...
escapeVektor: Chapter 1,9.000000
flower,7.296774
ilomilo,8.500000
inFamous,8.360269


In [None]:
comments['Comment'][0]