In [1]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import warnings
warnings.filterwarnings(action='once')
from icecream import ic
from random import randint
from time import sleep

In [223]:
url = "https://www.metacritic.com"
first_page = "https://www.metacritic.com/browse/games/score/userscore/all/all/filtered?view=condensed&sort=descc"
user_agent = {'User-agent': 'Mozilla/5.0'}
review_dict = {'name':[], 'date':[], 'game':[], 'rating':[], 'consoles':[], 'review':[]}

In [225]:
def parse_page(page_link, next_page, end_page, games_per_page = 3):  
    next_page = next_page
    print('parsing page:', next_page-1)

#   request from current page
    response = requests.get(page_link, headers = user_agent)
    soup = bs(response.text, 'html.parser')

#     Loop through current page and extract links to game pages     
    for game in soup.find_all('a', attrs = {'class':'title'})[:games_per_page]:
        game_link = url + game.get('href')
        
#         Connects to game page
        game_response = requests.get(game_link, headers = user_agent)
        game_soup = bs(game_response.text, 'html.parser')

#         Extracts link to User Reviews Page from main game page
        for user_review in game_soup.find_all('a', attrs = {'class':'action'}, text = 'User Reviews'):
            user_review_link = url + user_review.get('href')
            
#             Connects to User Review page
            user_review_response = requests.get(user_review_link, headers = user_agent)
            user_review_soup = bs(user_review_response.text, 'html.parser')
            
            
#           credit: Adeline Ong
#           https://towardsdatascience.com/web-scraping-metacritic-reviews-using-beautifulsoup-63801bbe200e

#             Loop through User Review Page and Extract Review Data into a dictionary
            for review in user_review_soup.find_all('div', class_='review_content'):
                if review.find('div', class_='name') == None:
                    break
                get_consoles(user_review_soup)
                review_dict['game'].append(user_review_soup.find('h1').text)
                review_dict['name'].append(review.find('div', class_='name').find(recursive = True).text)
                review_dict['date'].append(review.find('div', class_='date').text)
                review_dict['rating'].append(review.find('div', class_='review_grade').find_all('div')[0].text)
                if review.find('span', class_='blurb blurb_expanded'):
                    review_dict['review'].append(review.find('span', class_='blurb blurb_expanded').text)
                else:
                    review_dict['review'].append(review.find('div', class_='review_body').find('span').text)
    
    
    
    if next_page == (end_page+1):
        print('Done')
        return 
    
#     Recurse through the following pages
    find_next_page = soup.find('a', class_='page_num', string = f'{next_page}')
    nextpage_link = url + find_next_page.get('href')
    next_page += 1
    parse_page(nextpage_link, next_page, end_page, games_per_page)

def get_consoles(user_review_soup):
    if user_review_soup.find('span', class_="label", string = "Also On:")==None:
        review_dict['consoles'].append(user_review_soup.find('span', class_="platform").text.strip())
    else:
        consoles = [user_review_soup.find('span', class_="platform").text.strip()]
        other_consoles = [console.text  for console in user_review_soup.find_all('a', class_="hover_none")[1:]]
        consoles.extend(other_consoles)
        review_dict['consoles'].append(consoles)

In [226]:
parse_page(first_page, next_page = 2, end_page = 2, games_per_page = 2)

parsing page: 1
parsing page: 2
Done


In [229]:
reviews = pd.DataFrame(review_dict) 
reviews.sample(10)

Unnamed: 0,name,date,game,rating,consoles,review
100,RunFish,"Dec 29, 2021",Steins;Gate,10,"[PC, iPhone/iPad, PlayStation 3, PlayStation V...",The game is divine!!!!!!!!!!!!!!!!!!!!!!!!!!!!...
26,EmuChicken,"Jul 20, 2014",Ghost Trick: Phantom Detective,9,"[DS, iPhone/iPad]","Hmn, so this is what the spirit realm is like?..."
35,miccael,"Aug 28, 2018",Ghost Trick: Phantom Detective,10,"[DS, iPhone/iPad]","Such an awesome story, artstyle and cool idea...."
54,Skrapmettle,"Nov 3, 2010",Z.H.P. Unlosing Ranger vs Darkdeath Evilman,9,"[PSP, PC]",It is really amazing to me that a game like th...
70,Vectis99,"Oct 15, 2018",Steins;Gate,10,"[PC, iPhone/iPad, PlayStation 3, PlayStation V...",A gripping masterpiece of a visual novel which...
158,Demonik420,"Nov 22, 2013",The Legend of Zelda: A Link Between Worlds,10,3DS,A Nostalgia paradise! Zelda: A Link Between Wo...
163,PuckMan,"Jan 2, 2014",The Legend of Zelda: A Link Between Worlds,7,3DS,A very solid workman like entry in the Zelda s...
147,robotryz,"Feb 12, 2018",The Legend of Zelda: A Link Between Worlds,9,3DS,A Link Between Worlds was very good!\rI beat i...
65,RunFish12B8,"Dec 28, 2021",Z.H.P. Unlosing Ranger vs Darkdeath Evilman,10,"[PSP, PC]",The game is divine!!!!!!!!!!!!!!!!!!!!!!!!!!!!...
144,Titannumber20,"Mar 15, 2015",The Legend of Zelda: A Link Between Worlds,9,3DS,One of the best Zelda games in recent years. I...


# <br><br><br>Beyond this point was just debugging<br><br><br><br><br>

In [201]:
def testing():
    while True:
        print('done')
        return

In [202]:
testing()

done


In [60]:
# review links for each game title
[link.get('href') for link in soup.find_all('a', attrs = {'class':'title'})][:5]

['/game/ds/ghost-trick-phantom-detective',
 '/game/psp/zhp-unlosing-ranger-vs-darkdeath-evilman',
 '/game/playstation-2/grimgrimoire',
 '/game/xbox-one/superliminal',
 '/game/switch/superliminal']

In [189]:
# get to user reviews page
link = "https://www.metacritic.com/game/ds/ghost-trick-phantom-detective"
response = requests.get(link, headers = user_agent)
soup_game = bs(response.text, 'html.parser')

In [193]:
[link.get('href') for link in soup_game.find_all('a', attrs = {'class':'action'}, text = 'User Reviews')]

['/game/ds/ghost-trick-phantom-detective/user-reviews']

In [141]:
#get game name
link = "https://www.metacritic.com/game/xbox-one/superliminal/user-reviews"
response = requests.get(link, headers = user_agent)
soup = bs(response.text, 'html.parser')

In [142]:
soup.find('h1').text

'Superliminal'

In [143]:
#get consoles 
link = "https://www.metacritic.com/game/xbox-one/superliminal/user-reviews"
response = requests.get(link, headers = user_agent)
soup = bs(response.text, 'html.parser')

In [144]:
soup.find('span', class_="platform").text.strip()

'Xbox One'

In [146]:
soup.find('span', class_="label", string = "Also On:")==None

False

In [152]:
other_consoles = [console.text  for console in soup.find_all('a', class_="hover_none")[1:]]

In [161]:
mylist = ['console1']
mylist.extend([console.text  for console in soup.find_all('a', class_="hover_none")[1:]])

In [162]:
mylist

['console1', 'PC', 'PlayStation 4', 'Switch']

In [206]:
mylist.extend([soup.find('span', class_="platform").text.strip()].extend(other_consoles))

TypeError: 'NoneType' object is not iterable

In [None]:
# # Connect to game reviews
# game_link = "https://www.metacritic.com/game/ds/ghost-trick-phantom-detective/user-reviews"
# response_game = requests.get(game_link, headers = user_agent)
# soup_two = BeautifulSoup(response_game.text, 'html.parser')

# # game reviews --> dict --> dataframe
# review_dict = {'name':[], 'date':[], 'rating':[], 'review':[]}
# for review in soup_two.find_all('div', class_='review_content'):
#         if review.find('div', class_='name') == None:
#                        break 
#         review_dict['name'].append(review.find('div', class_='name').find('a').text)
#         review_dict['date'].append(review.find('div', class_='date').text)
#         review_dict['rating'].append(review.find('div', class_='review_grade').find_all('div')[0].text)
#         if review.find('span', class_='blurb blurb_expanded'):
#             review_dict['review'].append(review.find('span', class_='blurb blurb_expanded').text)
#         else:
#             review_dict['review'].append(review.find('div', class_='review_body').find('span').text)

# sword_reviews = pd.DataFrame(review_dict) 
# sword_reviews.sample()

In [61]:
# page number links
[link.get('href') for link in soup.find_all('a', attrs = {'class': 'page_num'})]

['/browse/games/score/userscore/all/all/filtered?view=condensed&sort=descc&page=1',
 '/browse/games/score/userscore/all/all/filtered?view=condensed&sort=descc&page=2',
 '/browse/games/score/userscore/all/all/filtered?view=condensed&sort=descc&page=3',
 '/browse/games/score/userscore/all/all/filtered?view=condensed&sort=descc&page=4',
 '/browse/games/score/userscore/all/all/filtered?view=condensed&sort=descc&page=5',
 '/browse/games/score/userscore/all/all/filtered?view=condensed&sort=descc&page=6',
 '/browse/games/score/userscore/all/all/filtered?view=condensed&sort=descc&page=7',
 '/browse/games/score/userscore/all/all/filtered?view=condensed&sort=descc&page=8',
 '/browse/games/score/userscore/all/all/filtered?view=condensed&sort=descc&page=179']

In [22]:
# class Metacritic_spider(scrapy.Spider):
    
#     name = "Metacritic"

#     def start_requests(self):
#         urls = ['https://www.metacritic.com/browse/games/score/userscore/all/all/filtered?sort=desc']
#         user_agent = {'User-agent': 'Mozilla/5.0'}
#         for url in urls:
#             yield scrapy.Request(url=url, callback=self.parse_game_links, headers = user_agent)
            
#     def parse_game_links(self, response):
#         # Game Link
#         game_link = response.css( 'a.title::text' )
#         # Extract the links (as a list of strings)        
#         links_to_follow = game_link.extract_first()
        
#         print(links_to_follow)
#         # Follow the links to the next parser
# #         for url in links_to_follow:yield response.follow( url = url,                                   
# #                                                          callback = self.parse_pages )

In [57]:
# process = CrawlerProcess()
# process.crawl(Metacritic_spider)
# process.start()