# Web Scraping

In [1]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd, numpy as np
import warnings
warnings.filterwarnings(action='once')
from icecream import ic
from random import randint
from time import sleep
from timeit import default_timer as timer

In [2]:
# Site root; hrefs scraped from pages are appended to this.
url = "https://www.metacritic.com"
# Listing page where scraping starts.
first_page = url + "/browse/games/score/metascore/all/all/filtered?view=condensed"
# Header passed with every request made by the parse_* functions.
user_agent = {'User-agent': 'Mozilla/5.0'}

# Column-oriented accumulators filled in by the parse_* functions.
# Each key gets its own fresh list, so no two columns share storage.
review_dict = {column: [] for column in ('name', 'date', 'game', 'rating', 'review')}
game_dict = {
    column: []
    for column in (
        'game', 'release_date', 'genre', 'platforms', 'developer',
        'esrb_rating', 'ESRBs', 'metascore', 'userscore',
        'critic_reviews', 'user_reviews', 'num_players', 'summary',
    )
}

In [3]:
def get_details(soup, text, col, idx=1):
    """Append one field of the details table to ``game_dict[col]``.

    Finds the ``<th scope="row">`` header whose string equals ``text``,
    gathers the text of every node that follows it under the same parent,
    and stores the ``idx``-th of those with runs of whitespace collapsed to
    single spaces.

    Parameters
    ----------
    soup : element to search (anything exposing a bs4-style ``find``).
    text : exact header label to look for, e.g. ``'Genre(s):'``.
    col : key of ``game_dict`` that receives the value.
    idx : position of the wanted node among the header's following siblings.

    Exactly one value is appended per call. ``None`` is appended when the
    header is absent or has no sibling at ``idx``, so the columns of
    ``game_dict`` stay the same length.
    """
    # Look the header up once; `string=` is the current name of the legacy
    # `text=` filter and matches what the rest of this file already uses.
    header = soup.find('th', scope='row', string=text)
    if header is None:
        game_dict[col].append(None)
        return

    sibling_texts = [sibling.text for sibling in header.next_siblings]
    try:
        raw_value = sibling_texts[idx]
    except IndexError:
        # Fewer siblings than expected: record a gap instead of raising and
        # leaving this game's row half-written.
        game_dict[col].append(None)
        return

    # split() with no arguments also discards leading/trailing whitespace.
    game_dict[col].append(' '.join(raw_value.split()))

In [4]:
def get_platforms(soup):
    """Append the game's platform list to ``game_dict['platforms']``.

    The stripped text of the first ``<span class="platform">`` always comes
    first. When an "Also On:" label is present, the text of every
    ``<a class="hover_none">`` except the first is added after it. The names
    are stored as one ``', '``-separated string.
    """
    names = [soup.find('span', class_="platform").text.strip()]

    also_on_label = soup.find('span', class_="label", string="Also On:")
    if also_on_label is not None:
        # The first hover_none anchor is skipped, as in the original listing.
        names += [anchor.text for anchor in soup.find_all('a', class_="hover_none")[1:]]

    game_dict['platforms'].append(', '.join(names))

In [5]:
def get_summary(soup):
    """Append the game's summary text to ``game_dict['summary']``.

    The summary is read from the ``<span class="data">`` at position 3 when
    an "Also On:" label exists on the page, and at position 2 otherwise.
    """
    also_on_label = soup.find('span', class_="label", string="Also On:")
    position = 2 if also_on_label is None else 3
    data_spans = soup.find_all('span', class_='data')
    game_dict['summary'].append(data_spans[position].text)

In [6]:
def parse_info(soup, timeout=30):
    """Scrape a game's "Details & Credits" page into ``game_dict``.

    Parameters
    ----------
    soup : parsed game page containing the "Details & Credits" action link.
    timeout : seconds handed to ``requests.get`` so a stalled connection
        raises instead of blocking the notebook indefinitely.

    For every ``<div id="main">`` on the details page one value is appended
    to each column of ``game_dict``. If anything fails part-way through a
    game, the values already appended for that game are removed again before
    the exception propagates, so the columns always stay the same length.

    Raises
    ------
    AttributeError
        If the page has no "Details & Credits" link (``find`` returns None).
    """
    # Get to Details Page
    details_anchor = soup.find('a', attrs={'class': 'action'}, string='Details & Credits')
    details_link = url + details_anchor.get('href')
    details_response = requests.get(details_link, headers=user_agent, timeout=timeout)
    details_soup = bs(details_response.text, 'html.parser')

    # Get game Details
    for detail in details_soup.find_all('div', id='main'):
        # Remember where this game's row starts in every column so a failure
        # below can be undone.
        row_start = {col: len(values) for col, values in game_dict.items()}
        try:
            game_dict['game'].append(detail.find('h1').text)
            game_dict['release_date'].append(detail.find_all('span', class_='data')[1].text)
            get_details(detail, 'Genre(s):', 'genre')
            get_platforms(detail)
            get_details(detail, 'Developer:', 'developer')
            get_details(detail, 'Rating:', 'esrb_rating')
            get_details(detail, 'ESRB Descriptors:', 'ESRBs', idx=0)
            game_dict['metascore'].append(detail.find('span', itemprop='ratingValue').text)
            game_dict['userscore'].append(
                detail.find_all('a', class_='metascore_anchor')[1].contents[1].text)

            # The first "count" span carries the critic-review count in its
            # third nested span; the second one carries the user-ratings link.
            count_spans = detail.find_all('span', class_='count')
            game_dict['critic_reviews'].append(count_spans[0].find_all('span')[2].text.strip())
            # Drops the last 8 characters of the link text (presumably a
            # " Ratings" suffix -- confirm against the live page).
            game_dict['user_reviews'].append(count_spans[1].find('a').text[:-8])

            get_details(detail, 'Number of Players:', 'num_players', idx=0)
            get_summary(detail)
        except Exception:
            # Undo the partial row, then let the caller see the real error.
            for col, length in row_start.items():
                del game_dict[col][length:]
            raise

In [7]:
def parse_reviews(soup, timeout=30):
    """Scrape the user reviews linked from a game page into ``review_dict``.

    Parameters
    ----------
    soup : parsed game page; every "User Reviews" action link on it is followed.
    timeout : seconds handed to ``requests.get`` so a stalled connection
        raises instead of blocking the notebook indefinitely.

    Only the page each link points to is fetched; pagination of the review
    list is not followed. Processing of a page stops at the first review
    block that has no name element. All fields of a review are extracted
    before any of them is stored, so an extraction error never leaves the
    columns of ``review_dict`` with different lengths.
    """
    # Extracts link to User Reviews Page from main game page
    for user_review in soup.find_all('a', attrs={'class': 'action'}, string='User Reviews'):
        user_review_link = url + user_review.get('href')

        # Connects to User Review page
        user_review_response = requests.get(user_review_link, headers=user_agent, timeout=timeout)
        user_review_soup = bs(user_review_response.text, 'html.parser')

        # credit: Adeline Ong
        # https://towardsdatascience.com/web-scraping-metacritic-reviews-using-beautifulsoup-63801bbe200e

        # The game title is the same for every review on the page, so look
        # the heading up once instead of once per review.
        title_tag = user_review_soup.find('h1')

        # Loop through User Review Page and Extract Review Data into a dictionary
        for review in user_review_soup.find_all('div', class_='review_content'):
            name_div = review.find('div', class_='name')
            if name_div is None:
                break

            # Prefer the expanded blurb when the review has one; otherwise
            # fall back to the first span of the review body.
            expanded = review.find('span', class_='blurb blurb_expanded')
            if expanded is not None:
                review_text = expanded.text
            else:
                review_text = review.find('div', class_='review_body').find('span').text

            row = {
                'game': title_tag.text,
                # First tag nested inside the name div holds the user name.
                'name': name_div.find(recursive=True).text,
                'date': review.find('div', class_='date').text,
                'rating': review.find('div', class_='review_grade').find_all('div')[0].text,
                'review': review_text,
            }
            # Store the complete row only after every field was extracted.
            for column, value in row.items():
                review_dict[column].append(value)

In [8]:
def parse_games(page_link, next_page, end_page, games_per_page = 3, timeout = 30, delay = 3):
    """Scrape games and their user reviews from consecutive listing pages.

    Parameters
    ----------
    page_link : URL of the listing page to start with.
    next_page : number of the page that FOLLOWS ``page_link`` (so the page
        being parsed is reported as ``next_page - 1``).
    end_page : number of the last listing page to parse.
    games_per_page : how many game links to follow on each listing page.
    timeout : seconds handed to every ``requests.get`` call.
    delay : seconds to sleep between listing pages.

    Returns
    -------
    float
        The ``timer()`` reading taken when scraping stops. The value is
        returned on every path, including after several pages were parsed.

    Pages are walked in a loop rather than by recursion, so the number of
    pages is not limited by the interpreter's recursion depth. Scraping also
    stops, with a message, when the current page has no link to the next page.
    """
    while True:
        print(f'parsing page: {next_page-1}/{end_page}' )

        # request from current page
        response = requests.get(page_link, headers = user_agent, timeout = timeout)
        soup = bs(response.text, 'html.parser')

        # Loop through current page and extract links to game pages
        for game in soup.find_all('a', attrs = {'class':'title'})[:games_per_page]:
            game_link = url + game.get('href')

            # Connects to game page summary
            game_response = requests.get(game_link, headers = user_agent, timeout = timeout)
            game_soup = bs(game_response.text, 'html.parser')

            # Get Game info
            parse_info(game_soup)

            # Get Game Reviews
            parse_reviews(game_soup)

        # `>=` rather than `==` so a start beyond end_page still terminates
        # after the one page that was requested.
        if next_page >= end_page + 1:
            print('Done')
            return timer()
        sleep(delay)

        # Move on to the following page
        next_anchor = soup.find('a', class_='page_num', string = f'{next_page}')
        if next_anchor is None:
            print(f'No link to page {next_page} found; stopping early')
            return timer()
        page_link = url + next_anchor.get('href')
        next_page += 1

In [9]:
# Empty the accumulators in place before scraping, so that re-running this
# cell does not append a second copy of every row. Clearing (rather than
# rebinding) keeps the dict objects the parse_* functions already refer to.
for column in (*game_dict.values(), *review_dict.values()):
    column.clear()

start = timer()
# Small sample run: listing pages 1 and 2, first two games of each.
parse_games(first_page, next_page = 2, end_page = 2, games_per_page = 2)
end = timer()
print(end-start)

parsing page: 1/2
parsing page: 2/2
Done
8.971382600000002


In [10]:
# Tabulate the scraped game details: every key of game_dict becomes a column.
pd.DataFrame.from_dict(game_dict)

Unnamed: 0,game,release_date,genre,platforms,developer,esrb_rating,ESRBs,metascore,userscore,critic_reviews,user_reviews,num_players,summary
0,The Legend of Zelda: Ocarina of Time,"Nov 23, 1998","Action Adventure, Fantasy",Nintendo 64,Nintendo,E,Violence,99,9.1,22,6981,1 Player,"As a young boy, Link is tricked by Ganondorf, ..."
1,Tony Hawk's Pro Skater 2,"Sep 20, 2000","Sports, Alternative, Skateboarding","PlayStation, iPhone/iPad, PC",Neversoft Entertainment,T,Mild Animated Violence Mild Language,98,7.4,19,785,1-2 Players,As most major publishers' development efforts ...
2,Jet Grind Radio,"Oct 30, 2000","Action, Platformer, 3D",Dreamcast,Smilebit,T,Animated Violence Mild Language,94,8.3,24,105,,"Join a graffiti crew, stamp your territory and..."
3,Metal Gear Solid,"Oct 21, 1998","Action Adventure, Modern",PlayStation,KCEJ,M,Animated Blood and Gore Animated Violence Matu...,94,9.2,20,1812,1 Player,"You are Snake, a government agent on a mission..."


In [11]:
# Tabulate the scraped user reviews: every key of review_dict becomes a column.
pd.DataFrame.from_dict(review_dict)

Unnamed: 0,name,date,game,rating,review
0,doodlerman,"Jun 9, 2011",The Legend of Zelda: Ocarina of Time,10,I'm one of those people who think that this is...
1,Jacody,"Nov 25, 2010",The Legend of Zelda: Ocarina of Time,10,Anyone who gives the masterpiece below a 7 or ...
2,Kaistlin,"Apr 25, 2011",The Legend of Zelda: Ocarina of Time,10,I won't bore you with what everyone is already...
3,SirCaestus,"Jun 12, 2011",The Legend of Zelda: Ocarina of Time,10,"Everything in OoT is so near at perfection, it..."
4,StevenA,"Mar 21, 2010",The Legend of Zelda: Ocarina of Time,10,This game is the highest rated game on Metacr...
...,...,...,...,...,...
301,AB,"Jan 3, 2005",Metal Gear Solid,10,"Although the graphics are outdated, this is o..."
302,TheQuietGamer,"Sep 10, 2016",Metal Gear Solid,9,When it comes to storytelling in video games f...
303,BasilZero,"Dec 3, 2018",Metal Gear Solid,10,Game: Metal Gear Solid\rGenre: Action/Stealth\...
304,Vitalque,"Jul 31, 2014",Metal Gear Solid,10,Great! Worth to play any time anywhere by anyo...
