# 1) Metacritic Reviews

###  Scrapping the dataset

To succesfully scrape from Metacritic website we require to follow this steps:
- Extract a list of all game links per console

- Extract each game info (Name, genre, characteristcs)

- Extract game critic and user reviews

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
import time
from time import sleep
from tqdm import tqdm_notebook

#### (A) Define extraction functions

Define a function to extract the soup

In [2]:
def get_soup(url):
    user_agent = {'User-agent': 'Chrome/88.0.4324.150'}
    response = requests.get(url, headers = user_agent)
    return BeautifulSoup(response.text, 'html.parser')

Define a function that extracts the links for every game in the analyzed page

In [3]:
def get_game_link(soup):
    games = soup.find('div', class_='product_condensed') #List of games in the page
    links = []
    for game in games.select('li[class*="product game_product"]'):
        links.append(game.a['href']) #Append the link for each of the games in the page
    return links   

Define a function that extracts the game information for each game page

In [4]:
def get_game_data(soup):
    val_title=soup.title
    if val_title.get_text().strip()[0:3]!='404':
        # Get name of the game
        title = soup.find('div', class_='product_title') 
        if title is not None:
            title = title.a.h1.get_text().strip()
        else:
            title = np.nan # If its NAN
        
        # Get game platform
        platform = soup.find('span', class_='platform')
        if platform is not None:
            platform = platform.a.get_text().strip()
        else:
            platform = np.nan # If its NAN

        # Get summary, summary be in different formats
        summary = soup.find('div',class_='section product_details')
        if summary is not None:
            if summary.li.find('span', class_='blurb_expanded') is not None: # Expanded summary
                summary = summary.li.find('span', class_='blurb_expanded').get_text()
            elif summary.li.find('span', class_='data') is not None:
                summary = summary.li.find('span', class_='data').get_text() # If summary does not require expanding extract normal summary
            else:
                summary = np.nan # If its NAN

        # Release date
        release_date = soup.find('div', class_='product_data')
        if release_date is not None:
            release_date = release_date.find('li', class_='summary_detail release_data').find('span', class_='data').get_text().strip()
        else:
            release_date = np.nan # If its NAN

        # Developer
        developer=soup.find('li', class_='summary_detail developer')
        if developer is not None:
            developer = developer.find('span', class_='data').get_text().strip()
        else:
            developer = np.nan # If its NAN        

        # Genre
        genre=[]
        game_gen=soup.find('li', class_='summary_detail product_genre')
        if game_gen is not None:
            for g in game_gen.find_all('span', class_='data'): 
                genre.append(g.get_text().strip())
        else:
            genre = np.nan

        # Rating
        rating = soup.find('li', class_='summary_detail product_rating')
        if rating is not None: 
            rating = rating.find('span', class_='data').get_text().strip() 
        else:
            rating = np.nan

    else:
        title = np.nan
        platform = np.nan
        summary = np.nan
        release_date = np.nan
        developer = np.nan
        genre = np.nan
        rating = np.nan

    return title, platform, summary, release_date, developer, genre, rating

Define function to extract game reviews summary (scores)

In [5]:
def get_game_review_sum(soup):
    if soup.find('div', class_='summary') is not None:
        overview = soup.find('div', class_='summary')
        if overview is not None:
            overview = overview.find('span', class_='desc').get_text().strip()
        else:
            overview = np.nan
            
        reviews_count = soup.find('div', class_='score_distribution')
        if reviews_count is not None:
            reviews_count = reviews_count.find_all('span', class_='count')
            positive = reviews_count[0].get_text().strip()
            mixed = reviews_count[1].get_text().strip()
            negative = reviews_count[2].get_text().strip()
        else:
            positive = '0'; mixed = '0'; negative = '0'
    else:
        overview = np.nan
        positive = np.nan
        mixed = np.nan
        negative = np.nan
    return overview, positive, mixed, negative

Define function to extract individual reviews for each game

In [6]:
def get_ind_reviews(soup, category):
    names = []; dates = []; scores = []; texts = []
    
    reviews_list = soup.find('ol', class_=f'reviews {category}_reviews')
    if reviews_list is not None:
        for review in reviews_list.select(f'li[class*="review {category}_review"]'):
            if category == 'user':
                names.append(review.find('div', class_='name').get_text().strip())
            else:
                names.append(review.find('div', class_='source').get_text().strip())
            dates.append(review.find('div', class_='date').get_text().strip())
            scores.append(review.find('div', class_='review_grade').get_text().strip())
            exp = review.find('span', class_='blurb blurb_expanded')
            if exp is None:
                texts.append(review.find('div', class_='review_body').get_text().strip())
            else:
                texts.append(exp.get_text().strip())

    return names, dates, scores, texts

*Note:* Sometimes the soup is not correctly extracted leading to breakings in the code. We will skip these games.

#### (B) Run text scrapping

Define: 
- The consoles that will be scrapped
- The letters that will be scrapped

In [7]:
#consoles=['ps4','switch','xboxone']
consoles=['switch']

letters= ['', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l','m',
          'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']

Obtain the link of all the games in the selected consoles

In [8]:
start=time.time()

game_page_links=[]

for console in consoles:
    for letter in letters:
        # Get first page soup
        soup = get_soup(f'http://www.metacritic.com/browse/games/title/{console}/{letter}') 
        game_page_links += get_game_link(soup)
        
        # Obtain the number of pages in the soup (if its not none that means we have more than one page and we need to loop around them)
        pages=soup.find('ul', class_='pages')
        if pages is not None:
            num_pages=len(pages.find_all('li'))
            #Loop around all additional pages
            for i in range(1,num_pages):
                soup = get_soup(f'http://www.metacritic.com/browse/games/title/{console}/{letter}?page={i}') 
                game_page_links += get_game_link(soup)

done=time.time()
elapsed = done - start
print(len(game_page_links),'links extracted')
print('Extraction time:',round(elapsed,2),'seconds')

5333 links extracted
Extraction time: 22.36 seconds


In [9]:
# Subset of links to test the scrapper
game_page_links_cut=game_page_links.copy()
game_page_links_cut=game_page_links_cut[0:100]

Extract the required information per game link

In [10]:
start=time.time()
titles = {}; platforms = {}; summaries = {}; release_dates = {};developers = {}; genres = {}; ratings = {}
meta_overviews = {}; meta_pos = {}; meta_mixed = {};meta_neg = {}
critics_names = {}; critics_dates = {}; critics_scores = {}; critics_texts = {}
user_overviews = {}; user_pos = {}; user_mixed = {}; user_neg = {}
users_names = {}; users_dates = {}; users_scores = {}; users_texts = {}

#for link in tqdm_notebook(game_page_links_cut):
for link in tqdm_notebook(game_page_links):
    soup = get_soup(f'http://www.metacritic.com{link}')

    # Extract game information
    title, platform, summary, release_date, developer, genre, rating= get_game_data(soup)
    titles[link] = title
    platforms[link] = platform
    summaries[link] = summary
    release_dates[link] = release_date
    developers[link] = developer
    genres[link] = genre
    ratings[link] = rating
    
    #Extract game reviews information (summary and individual comments): Critics
    if titles[link]!=np.nan:
        soup = get_soup(f'http://www.metacritic.com{link}/critic-reviews')    

        overview, positive, mixed, negative=get_game_review_sum(soup)
        meta_overviews[link] =  overview
        meta_pos[link] = positive
        meta_mixed[link] = mixed
        meta_neg[link] = negative

        names, dates, scores, texts = get_ind_reviews(soup, 'critic')
        critics_names[link] = names
        critics_dates[link] = dates
        critics_scores[link] = scores
        critics_texts[link] = texts
    else:
        meta_overviews[link] = np.nan
        meta_pos[link] = np.nan
        meta_mixed[link] = np.nan
        meta_neg[link] = np.nan
        critics_names[link] = np.nan
        critics_dates[link] = np.nan
        critics_scores[link] = np.nan
        critics_texts[link] = np.nan
    
    #Extract game reviews information (summary and individual comments): Users
    if titles[link]!=np.nan:
        soup = get_soup(f'http://www.metacritic.com{link}/user-reviews')
        
        overview, positive, mixed, negative=get_game_review_sum(soup)
        user_overviews[link] =  overview
        user_pos[link] = positive
        user_mixed[link] = mixed
        user_neg[link] = negative
        
        names, dates, scores, texts = get_ind_reviews(soup, 'user')
        users_names[link] = names
        users_dates[link] = dates
        users_scores[link] = scores
        users_texts[link] = texts
    else:
        user_overviews[link] = np.nan
        user_pos[link] = np.nan
        user_mixed[link] = np.nan
        user_neg[link] = np.nan
        users_names[link] = np.nan
        users_dates[link] = np.nan
        users_scores[link] = np.nan
        users_texts[link] = np.nan

done=time.time()
elapsed = done - start
print('Extraction time:',round(elapsed,2),'seconds')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for link in tqdm_notebook(game_page_links):


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5333.0), HTML(value='')))


Extraction time: 5056.68 seconds


Dataframe the extracted information for games (without reviews)

In [11]:
df = pd.DataFrame({'title': titles, 'platform': platforms, 'summary': summaries, 
                   'release_date': release_dates, 'developer': developers, 'genre': genres, 'rating': ratings,
                   'meta_overview': meta_overviews,'meta_pos': meta_pos, 'meta_mixed': meta_mixed, 'meta_neg': meta_neg,
                   'user_overview': user_overviews, 'user_pos': user_pos,'user_mixed': user_mixed, 'user_neg': user_neg 
                   }).reset_index(drop = True)
df

Unnamed: 0,title,platform,summary,release_date,developer,genre,rating,meta_overview,meta_pos,meta_mixed,meta_neg,user_overview,user_pos,user_mixed,user_neg
0,#1 Crosswords,Switch,World's Best Crosswords Game. Are you a fan of...,"Feb 12, 2021",Eclipse Games,"[Puzzle, Logic]",,No score yet,0,0,0,No user score yet,0,0,0
1,#Breakforcist Battle,Switch,\n Lucid Sheep Games ...,"Apr 12, 2018",Lucid Sheep Games,"[Puzzle, General]",T,No score yet,1,2,0,No user score yet,1,0,0
2,#DRIVE,Switch,#DRIVE is an endless driving videogame inspire...,"Feb 16, 2021",Dariusz Pietrala,"[Racing, Arcade, Automobile]",,No score yet,1,0,0,No user score yet,1,0,0
3,#Funtime,Switch,#Funtime is an explosive twin-stick shooter wh...,"Jul 16, 2020",One Guy Games,"[Action, Shooter, Shoot-'Em-Up, Top-Down]",,No score yet,0,1,0,No user score yet,0,0,1
4,"#Halloween, Super Puzzles Dream",Switch,#Halloween! is the hashtag for the most terrif...,"Oct 29, 2020",Jorge Biedma Azuar,"[Puzzle, General]",,No score yet,0,0,0,No user score yet,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5321,Zombies ruined my day,Switch,This is the story of how the best day of your ...,"Jul 4, 2020",mancebo,"[Action, General]",,No score yet,0,0,0,No user score yet,0,0,0
5322,Zombillie,Switch,The world has changed. Irrevocably and complet...,"Mar 29, 2018",Forever Entertainment,"[Action, Arcade]",T,No score yet,0,0,0,No user score yet,1,0,0
5323,Zotrix Starglider,Switch,Zotrix is an arcade space shooter inspired by ...,"Jan 14, 2021",Ocean Media,"[Action, General]",,No score yet,0,0,1,No user score yet,1,0,0
5324,Zotrix: Solar Division,Switch,\n ZeroBit Games ...,"Apr 12, 2018",ZeroBit Games,"[Strategy, General]",E,No score yet,0,1,0,No user score yet,1,0,0


Dataframe the extracted critic reviews

In [12]:
critics = []; dates = []; scores = []; texts = []; games = []; plats = []
for i in critics_names:
    critics += critics_names[i]
    dates += critics_dates[i]
    scores += critics_scores[i]
    texts += critics_texts[i]
    games += [titles[i]] * len(critics_names[i])
    plats += [platforms[i]] * len(critics_names[i])

df_critic_rev=pd.DataFrame({'critic': critics, 'date': dates, 'score': scores, 'text': texts, 'title': games, 'platform': plats},
            columns = ['title','platform', 'critic', 'date','score', 'text'])
df_critic_rev

Unnamed: 0,title,platform,critic,date,score,text
0,#Breakforcist Battle,Switch,NintendoWorldReport,"Apr 17, 2018",80,"Despite following a standard, #Breakforcist Ba..."
1,#Breakforcist Battle,Switch,Nintendo Insider,"Jun 5, 2018",70,#Breakforcist Battle might not reinvent the wh...
2,#Breakforcist Battle,Switch,Nintenderos,"Jul 5, 2020",55,#Breakforcist Battle is a new version of the A...
3,#DRIVE,Switch,Finger Guns,"Feb 15, 2021",80,"It has both its calm and frustrating moments, ..."
4,#Funtime,Switch,NintendoWorldReport,"Sep 23, 2020",70,I can’t fault #Funtime for being overly famili...
...,...,...,...,...,...,...
27369,Zumba Burn it Up!,Switch,Pure Nintendo,"Nov 25, 2019",75,Those who are familiar with or already partici...
27370,Zumba Burn it Up!,Switch,GameSpew,"Nov 25, 2019",70,With a few more song options and maybe a way t...
27371,Zumba Burn it Up!,Switch,Nintendo Life,"Nov 23, 2019",70,"Ultimately, Zumba Burn It Up! knows its audien..."
27372,Zumba Burn it Up!,Switch,GamingTrend,"Nov 18, 2019",70,Zumba: Burn it Up! will appeal to the hardcore...


Dataframe the extracted user reviews

In [13]:
critics = []; dates = []; scores = []; texts = []; games = []; plats = []
for i in users_names:
    critics += users_names[i]
    dates += users_dates[i]
    scores += users_scores[i]
    texts += users_texts[i]
    games += [titles[i]] * len(users_names[i])
    plats += [platforms[i]] * len(users_names[i])

df_user_rev=pd.DataFrame({'user': critics, 'date': dates, 'score': scores, 'text': texts, 'title': games, 'platform': plats},
            columns = ['title','platform', 'user', 'date','score', 'text'])
df_user_rev

Unnamed: 0,title,platform,user,date,score,text
0,#KILLALLZOMBIES,Switch,ChrisUnseen,"Feb 13, 2019",9,"#KillAllZombies is a third-person, twin-stick ..."
1,#RaceDieRun,Switch,jens76,"Oct 21, 2020",8,"Einfach gehalten und deswegen nicht schlecht, ..."
2,#RaceDieRun,Switch,Yamanj,"Aug 11, 2019",10,This is a fun game and it's a really hard game...
3,#RaceDieRun,Switch,dep,"Jul 21, 2020",10,Muy buen juego de dificultad elevada pero no p...
4,1-2-Switch,Switch,Oasppp,"Mar 3, 2017",10,An absolute masterpiece! This is the must have...
...,...,...,...,...,...,...
21615,Zotrix Starglider,Switch,TheTword90,"Feb 1, 2021",8,"Shmups are bread and butter to many, and Zotri..."
21616,Zotrix: Solar Division,Switch,KlingonGamerYT,"May 11, 2018",9,a galaga meets tower defence game with online ...
21617,Zumba Burn it Up!,Switch,Ne098,"Dec 4, 2019",0,The game sucks ass don’t buy it I tried it for...
21618,Zumba Burn it Up!,Switch,MarcosBLG,"Jan 28, 2020",0,This game is so bad the game are YouTube video...


In [14]:
df.to_csv("switch_game_info.csv",index=False)
df_critic_rev.to_csv("switch_critic_review.csv",index=False)
df_user_rev.to_csv("switch_user_review.csv",index=False)

###  Game review exploration

(A) Minimum number of reviews

As the dataset we extract includes every game on Metacritic we might be prone to have a lot of game with none or too little reviews to be able to run a text analytics model. We will analyze the frequency per game and define a threeshold to keep only the games with enough information to analyze.

In [36]:
df = pd.read_csv('switch_game_info.csv')
df_critic_rev = pd.read_csv('switch_critic_review.csv')
df_user_rev = pd.read_csv('switch_user_review.csv')

In [37]:
pivot_critic = pd.pivot_table(df_critic_rev, values='text', index=['title']
                                           , aggfunc='count').reset_index()

pivot_user = pd.pivot_table(df_critic_rev, values='text', index=['title']
                                         , aggfunc='count').reset_index()

We define a threshold of x comments and create a unique list of games that comply with it.

In [38]:
th = 15
print(len(pivot_critic[pivot_critic['text']>=th]),'games for critics, representing',round(len(pivot_critic[pivot_critic['text']>=th])/len(pivot_critic)*100,2),'%')
print(len(pivot_user[pivot_user['text']>=th]),'games for users, representing',round(len(pivot_user[pivot_user['text']>=th])/len(pivot_user)*100,2),'%')

merge_critic = pd.merge(left=pivot_critic[pivot_critic['text']>=th]['title'], right=df, how='inner')
merge_critic_list = merge_critic['title'].tolist()

merge_user = pd.merge(left=pivot_user[pivot_user['text']>=th]['title'], right=df, how='inner')
merge_user_list = merge_user['title'].tolist()


unique_list = []

for title in merge_critic_list:
    unique_list.append(title)

for title in merge_user_list:
    if title not in unique_list:
        unique_list.append(title)
print(len(unique_list),'unique games')

449 games for critics, representing 12.82 %
449 games for users, representing 12.82 %
449 unique games


We then use this list to create a filtered review and games dataset

In [39]:
df_filt = df.copy()
df_critic_filt = df_critic_rev.copy()
df_user_filt = df_user_rev.copy()

df_filt = df_filt[df_filt['title'].isin(unique_list)].reset_index()
df_critic_filt = df_critic_filt[df_critic_filt['title'].isin(unique_list)].reset_index()
df_user_filt = df_user_filt[df_user_filt['title'].isin(unique_list)].reset_index()

(B) Genre dimensionality reduction

In [40]:
print(len(df_filt['genre'].unique()),'unique genres in extracted data')

113 unique genres in extracted data


As games can have various genres combinitation we will classify them on the following genre categories: https://en.wikipedia.org/wiki/Video_game_genre

- Action
- Adventure
- Fighting
- Platform
- Puzzle
- Racing
- Role-playing
- Shooter
- Simulation
- Sports
- Strategy
- Miscellaneous

In [41]:
action     = ['Action','Action Adventure','Action RPG','Combat']
adventure  = ['Adventure','Action Adventure']
fighting   = ['Fighting']
platform   = ['Platformer']
puzzle     = ['Puzzle']
racing     = ['Racing']
roleplay   = ['Role-Playing','RPG','Action RPG']
shooter    = ['Shooter']
simulation = ['Simulation','Sim']
sports     = ['Sports']
strategy   = ['Strategy', 'Tactics']
misc       = ['Miscellaneous']

genre_list  = [action, adventure, fighting, platform, puzzle, racing, roleplay, shooter, simulation, sports, strategy, misc]

Additionally we will identify if the game is 2D or 3D is possible

In [42]:
genre_dummies = {}

for i in range(len(df_filt)):
    sep_list=[]
    for word in (df_filt['genre'].iloc[i].split("'")[1:-1]):
        if word!=', ':
            sep_list.append(word)
    
    game_dummy = []
    for genre in genre_list:
        if any(gen in sep_list  for gen in genre)==True:
            game_dummy.append(1) 
        else:
            game_dummy.append(0)
            
    genre_dummies[i] = game_dummy

genre_dummies = pd.DataFrame.from_dict(genre_dummies, orient='index', columns = ['genre_action','genre_adventure','genre_fighting',
                                                                                 'genre_platform','genre_puzzle','genre_racing',
                                                                                 'genre_roleplay','genre_shooter','genre_simulation',
                                                                                 'genre_sports','genre_strategy','genre_misc'])

genre_dummies

Unnamed: 0,genre_action,genre_adventure,genre_fighting,genre_platform,genre_puzzle,genre_racing,genre_roleplay,genre_shooter,genre_simulation,genre_sports,genre_strategy,genre_misc
0,0,0,0,0,0,0,0,0,0,0,0,1
1,0,1,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,1,0,0,0,0,0
4,1,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
444,1,0,0,1,0,0,0,0,0,0,0,0
445,1,0,0,1,0,0,0,0,0,0,0,0
446,1,0,0,1,0,0,0,0,0,0,0,0
447,1,0,0,0,0,0,1,0,0,0,0,0


The genre dummies are incorporated in the final dataset

In [43]:
df_filt_fin = df_filt.merge(genre_dummies, left_index=True, right_index=True)
df_filt_fin = df_filt_fin.iloc[:,1:]

In [44]:
df_filt_fin.to_csv('switch_game_info_filtered.csv')
df_critic_filt.to_csv('switch_critic_review_filtered.csv')
df_user_filt.to_csv('switch_user_review_filtered.csv')

# 2) Nintendo Switch Sales Data

In [45]:
import pandas as pd
import numpy as np

In [46]:
sales = pd.read_csv('vgchartz-7_7_2020.csv')
sales = sales[['title','console','total_shipped','total_sales']]
sales = sales[sales['console']=='NS']

print('Nintendo Switch games in the sales database is:',len(sales))
display(sales)

Nintendo Switch games in the sales database is: 1608


Unnamed: 0,title,console,total_shipped,total_sales
329,Astral Chain,NS,1.08,
330,Marvel Ultimate Alliance 3: The Black Order,NS,1.08,
355,Enter the Gungeon,NS,1.00,
479,Monster Hunter Generations Ultimate,NS,,0.72
516,Diablo III: Eternal Collection,NS,,0.67
...,...,...,...,...
57933,Tlicolity Eyes: Twinkle Snowtime,NS,,
57935,VA-11 HALL-A,NS,,
57939,WILL: A Wonderful World,NS,,
57947,"Yoru, Tomosu",NS,,


In [47]:
sales_filt = sales[(sales['total_shipped'].notnull()) | (sales['total_sales'].notnull())]

print('Nintendo Switch games in the sales database is with information:',round(len(sales_filt)/len(sales)*100,2),'%')
display(sales_filt)

Nintendo Switch games in the sales database is with information: 19.59 %


Unnamed: 0,title,console,total_shipped,total_sales
329,Astral Chain,NS,1.08,
330,Marvel Ultimate Alliance 3: The Black Order,NS,1.08,
355,Enter the Gungeon,NS,1.00,
479,Monster Hunter Generations Ultimate,NS,,0.72
516,Diablo III: Eternal Collection,NS,,0.67
...,...,...,...,...
57744,Cendrillon palikA,NS,,0.00
57747,Hakuoki: Shinkai - Fuukaden,NS,,0.00
57753,Aokana: Four Rhythm Across the Blue,NS,,0.00
57758,"Nora, Princess, and Stray Cat",NS,,0.00


In [48]:
sales_filt=sales_filt.fillna(0)
sales_filt=sales_filt.append({'title':'Pokémon Sword','total_shipped':17.37,'total_sales':0.00}, ignore_index=True)
sales_filt=sales_filt.replace('Pokémon Sword / Shield','Pokémon Shield')
sales_filt['total_sales_USDMM'] = sales_filt['total_shipped'] * 60 + sales_filt['total_sales'] ## Ship are in millions of units (standard price for switch games is $60)
sales_filt=sales_filt[['title','total_sales_USDMM']]
sales_filt.sort_values(by=['total_sales_USDMM'], ascending=False)

Unnamed: 0,title,total_sales_USDMM
194,Mario Kart 8 Deluxe,1486.2
86,Super Smash Bros. Ultimate,1130.4
146,Super Mario Odyssey,1044.6
69,The Legend of Zelda: Breath of the Wild,1044.6
315,Pokémon Sword,1042.2
...,...,...
311,Hakuoki: Shinkai - Fuukaden,0.0
312,Aokana: Four Rhythm Across the Blue,0.0
313,"Nora, Princess, and Stray Cat",0.0
314,Memories Off: Innocent File,0.0


In [49]:
sales_filt.to_csv('sales_switch.csv')