In [50]:
from bs4 import BeautifulSoup
import requests, pickle
import numpy as np, pandas as pd
from tqdm import notebook
from time import sleep
import random
from random import randint
from urllib.parse import urlparse
from urllib.error import HTTPError

In [51]:
def GET_UA():
    """Pick one browser User-Agent string at random.

    Returns:
        str: One entry of the fixed pool of ten desktop browser User-Agent
        strings below, selected with ``random.choice``.
    """
    user_agent_pool = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10) AppleWebKit/600.1.25 (KHTML, like Gecko) Version/8.0 Safari/600.1.25",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0",
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/600.1.17 (KHTML, like Gecko) Version/7.1 Safari/537.85.10",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
        "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.104 Safari/537.36",
    ]

    chosen = random.choice(user_agent_pool)
    return chosen

In [52]:
def get_soup(url, timeout=30):
    """Download ``url`` and parse it with BeautifulSoup.

    The request is sent with a randomly chosen User-Agent header.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the notebook indefinitely.

    Returns:
        A ``BeautifulSoup`` object when the response declares a ``text/html``
        content type; ``None`` when the content type is anything else
        (or absent) or when fetching/parsing raised an exception.
    """
    headers = {'User-Agent': GET_UA()}
    # Bound up front so the final return is valid even if the request raises.
    soup = None

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # .get() avoids a KeyError for responses without a Content-Type header.
        content_type = response.headers.get('Content-Type', '').lower().strip()

        if 'text/html' in content_type:
            soup = BeautifulSoup(response.content, "html.parser")

    except Exception as e:
        # Best effort: report the failure and let the caller handle None.
        print("Error:", str(e))

    return soup
 
 
def parse_internal_links(soup, current_page):
    """Collect the links on a page that point to the same host as the page.

    Args:
        soup: Parsed page exposing ``find_all('a', href=True)``.
        current_page: URL of the page ``soup`` was parsed from.

    Returns:
        list[str]: Lower-cased, stripped ``href`` values of every anchor whose
        network location equals that of ``current_page``. Hrefs without a
        network location (e.g. relative paths) only match when
        ``current_page`` has none either.
    """
    # The page's own host is loop-invariant, so parse it once.
    page_netloc = urlparse(current_page).netloc
    return [
        anchor['href'].lower().strip()
        for anchor in soup.find_all('a', href=True)
        if urlparse(anchor['href']).netloc == page_netloc
    ]

In [53]:
# Old version of get_soup, kept for reference; it did not work (possibly because of timeouts).

# def get_soup(url):
#     #sleep(1)
#     page = parse_url(url)
    
#     return BeautifulSoup(page.content, 'html.parser')

In [54]:
def get_games_links(soup):
    """Extract the game links from one listing page.

    Args:
        soup: Parsed listing page, or ``None`` if the page could not be
            fetched.

    Returns:
        list[str]: The ``href`` of the first anchor of every
        ``li[class*="product game_product"]`` item inside the
        ``div.product_condensed`` container. Empty when ``soup`` is ``None``
        or the container is absent; items without an anchor or ``href`` are
        skipped instead of raising.
    """
    if soup is None:
        return []

    games_list = soup.find('div', class_='product_condensed')
    if games_list is None:
        return []

    links = []
    for game in games_list.select('li[class*="product game_product"]'):
        anchor = game.a
        href = anchor.get('href') if anchor is not None else None
        if href:
            links.append(href)

    return links

In [55]:
def get_game_info(soup):
    """Read the summary fields of a game page.

    Every field falls back to ``np.nan`` when its element (or the nested
    element holding the text) is not present, or when ``soup`` is ``None``.

    Args:
        soup: Parsed game page, or ``None`` if the page could not be fetched.

    Returns:
        tuple: ``(title, platform, release_date, developer, genre, rating)``.
        ``genre`` is a list of strings when the genre element exists; the
        other fields are strings. ``title`` is returned unstripped.
    """
    def nested_text(parent, *find_args, **find_kwargs):
        # Stripped text of parent.find(...), or NaN if either level is missing.
        if parent is None:
            return np.nan
        child = parent.find(*find_args, **find_kwargs)
        return child.get_text().strip() if child is not None else np.nan

    if soup is None:
        return np.nan, np.nan, np.nan, np.nan, np.nan, np.nan

    # The title text is intentionally left unstripped.
    title_box = soup.find('div', class_='product_title')
    heading = title_box.find('h1') if title_box is not None else None
    title = heading.get_text() if heading is not None else np.nan

    platform = nested_text(soup.find('span', class_='platform'), 'a')
    developer = nested_text(soup.find('li', class_='summary_detail developer'),
                            'span', class_='data')

    genre_item = soup.find('li', class_='summary_detail product_genre')
    if genre_item is not None:
        genre = [g.get_text().strip()
                 for g in genre_item.find_all('span', class_='data')]
    else:
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
        genre = np.nan

    release_date = nested_text(soup.find('li', class_='summary_detail release_data'),
                               'span', class_='data')
    rating = nested_text(soup.find('li', class_='summary_detail product_rating'),
                         'span', class_='data')

    return title, platform, release_date, developer, genre, rating

In [56]:
def get_reviews_overview(soup):
    """Read the review summary line and the score-distribution counts.

    Args:
        soup: Parsed reviews page, or ``None`` if the page could not be
            fetched.

    Returns:
        tuple: ``(overview, pos, mixed, neg)``. ``overview`` is the stripped
        text of the first ``span.desc`` or ``np.nan`` when absent. The three
        counts are the stripped texts of the first three ``span.count``
        elements inside ``div.score_distribution``; they default to ``'0'``
        when that block is missing or holds fewer than three counts.
    """
    overview = np.nan
    pos = mixed = neg = '0'

    if soup is None:
        return overview, pos, mixed, neg

    description = soup.find('span', class_='desc')
    if description is not None:
        overview = description.get_text().strip()

    distribution = soup.find('div', class_='score_distribution')
    if distribution is not None:
        counts = [c.get_text().strip()
                  for c in distribution.find_all('span', class_='count')]
        # With fewer than three counts the buckets cannot be told apart, so
        # keep the '0' defaults instead of raising IndexError.
        if len(counts) >= 3:
            pos, mixed, neg = counts[:3]

    return overview, pos, mixed, neg

In [57]:
def get_reviews(soup, category = 'user'):
    """Collect reviewer, date and score of every review on a reviews page.

    Args:
        soup: Parsed reviews page, or ``None`` if the page could not be
            fetched.
        category: ``'user'`` reads the reviewer from ``div.name``; any other
            value (e.g. ``'critic'``) reads it from ``div.source``. The value
            is also interpolated into the list/item class names searched for.

    Returns:
        tuple[list, list, list]: ``(names, dates, scores)``, three lists of
        equal length with one entry per review. A field whose element is
        missing is recorded as ``np.nan`` so the lists stay aligned. All
        three are empty when ``soup`` is ``None`` or no review list exists.
    """
    names = []; dates = []; scores = []

    if soup is None:
        return names, dates, scores

    reviews_list = soup.find('ol', class_=f'reviews {category}_reviews')
    if reviews_list is None:
        return names, dates, scores

    def field_text(review, css_class):
        # Stripped text of the review's <div class=css_class>, or NaN if absent.
        element = review.find('div', class_=css_class)
        return element.get_text().strip() if element is not None else np.nan

    author_class = 'name' if category == 'user' else 'source'
    for review in reviews_list.select(f'li[class*="review {category}_review"]'):
        names.append(field_text(review, author_class))
        dates.append(field_text(review, 'date'))
        scores.append(field_text(review, 'review_grade'))

    return names, dates, scores

In [58]:
# Suffixes of the alphabetical listing pages: the empty string first, then one
# entry per lowercase letter a-z.
letters = [''] + list('abcdefghijklmnopqrstuvwxyz')


In [59]:
# Platform identifier: interpolated into the Metacritic browse URL and used as
# the prefix of every output file name written by this notebook.
# Leave exactly one assignment active to choose the platform to scrape.
console = 'ps4'
#console = 'ps5'
#console = 'xboxone'
#console = '3DS'
#console = 'switch'

In [60]:
# Walk the alphabetical listing for the selected platform and gather the
# relative link of every game, following the pager of each letter.
games_links = []

for letter in letters:
    listing_url = f'http://www.metacritic.com/browse/games/title/{console}/{letter}'
    soup = get_soup(listing_url)
    if soup is None:
        # get_soup yields None for non-HTML responses; skip this letter
        # instead of crashing on the attribute access below.
        print("Skipping listing (no HTML):", listing_url)
        continue

    games_links += get_games_links(soup)
    p = soup.find('ul', class_='pages')
    if p is not None:
        # NOTE(review): assumes the pager has one <li> per page — confirm that
        # long pagers do not collapse pages (e.g. behind an ellipsis).
        pages_qty = len(p.find_all('li'))
        # The unnumbered URL above is the first page; continue from ?page=1.
        for page_num in range(1, pages_qty):
            page_url = f'{listing_url}?page={page_num}'
            soup = get_soup(page_url)
            if soup is None:
                print("Skipping listing (no HTML):", page_url)
                continue
            games_links += get_games_links(soup)

In [61]:
# Cache the collected links on disk so the listing crawl need not be repeated.
# The context manager guarantees the file is flushed and closed.
with open(f'{console}_games_links', 'wb') as links_file:
    pickle.dump(games_links, links_file)

In [62]:
#games_links = pickle.load(open(f'{console}_games_links', 'rb'))

In [63]:
#titles = {}; platforms = {}; summaries = {}; release_dates = {} 
# Per-game result containers, each keyed by the game's relative link.

# Game summary fields.
titles, platforms, release_dates = {}, {}, {}
developers, genres, ratings = {}, {}, {}

# Critic ("meta") aggregate figures and individual critic reviews.
meta_scores, meta_overviews = {}, {}
meta_pos, meta_mixed, meta_neg = {}, {}, {}
critics_names, critics_dates, critics_scores = {}, {}, {}

# User aggregate figures and individual user reviews.
user_scores, user_overviews = {}, {}
user_pos, user_mixed, user_neg = {}, {}, {}
users_names, users_dates, users_scores = {}, {}, {}

In [64]:
# Display the collected game links as a quick sanity check.
games_links

['/game/playstation-4/buy-the-game-i-have-a-gun--sheesh-man',
 '/game/playstation-4/funtime',
 '/game/playstation-4/killallzombies',
 '/game/playstation-4/sinucaattack',
 '/game/playstation-4/n-verlore-verstand',
 '/game/playstation-4/hackgu-last-recode',
 '/game/playstation-4/0-degrees',
 '/game/playstation-4/10-second-ninja-x',
 '/game/playstation-4/1000-top-rated',
 '/game/playstation-4/1001-spikes',
 '/game/playstation-4/100ft-robot-golf',
 '/game/playstation-4/101-ways-to-die',
 '/game/playstation-4/11-11-memories-retold',
 '/game/playstation-4/11-11-memories-retold---warchild-charity',
 '/game/playstation-4/112th-seed',
 '/game/playstation-4/13-sentinels-aegis-rim',
 '/game/playstation-4/140',
 '/game/playstation-4/16-bit-soccer',
 '/game/playstation-4/18-floors',
 '/game/playstation-4/1971-project-helios',
 '/game/playstation-4/1976---back-to-midway',
 '/game/playstation-4/1979-revolution-black-friday',
 '/game/playstation-4/198x',
 '/game/playstation-4/1993-shenandoah',
 '/game

In [65]:
# Visit every game page plus its critic-review and user-review pages and fill
# the per-game dictionaries, keyed by the game's relative link.
# Iterating over the tqdm wrapper advances the bar once per game (skipped
# games included) and closes it when the loop finishes.
bar = notebook.tqdm(games_links)

for link in bar:
    game_url = f'http://www.metacritic.com{link}'

    # game summary section
    soup = get_soup(game_url)
    if soup is None:
        # get_soup yields None for non-HTML responses; with no page there is
        # nothing to record for this game.
        print("Skipping game (no HTML):", game_url)
        continue

    # game summary info
    title, platform, release_date, developer, genre, rating = get_game_info(soup)
    titles[link] = title
    platforms[link] = platform
    release_dates[link] = release_date
    developers[link] = developer
    genres[link] = genre
    ratings[link] = rating

    # critics reviews section
    soup = get_soup(f'{game_url}/critic-reviews')
    if soup is not None:
        # critics reviews general info
        meta_score = soup.find('span', itemprop='ratingValue')
        if meta_score is not None:
            meta_scores[link] = meta_score.get_text().strip()
        else:
            meta_scores[link] = '0'
        overview, pos, mixed, neg = get_reviews_overview(soup)
        # critics reviews
        names, dates, scores = get_reviews(soup, 'critic')
    else:
        # Page unavailable: use the same placeholders as for missing elements.
        meta_scores[link] = '0'
        overview, pos, mixed, neg = np.nan, '0', '0', '0'
        names, dates, scores = [], [], []
    meta_overviews[link] = overview
    meta_pos[link] = pos
    meta_mixed[link] = mixed
    meta_neg[link] = neg
    critics_names[link] = names
    critics_dates[link] = dates
    critics_scores[link] = scores

    # users reviews section
    soup = get_soup(f'{game_url}/user-reviews')
    if soup is not None:
        # users reviews general info; select() returns a (possibly empty) list,
        # so one truthiness check is sufficient.
        user_score_tags = soup.select('div[class*="metascore_w user large"]')
        if user_score_tags:
            user_scores[link] = user_score_tags[0].get_text().strip()
        else:
            user_scores[link] = '0'
        overview, pos, mixed, neg = get_reviews_overview(soup)
        # users reviews
        names, dates, scores = get_reviews(soup)
    else:
        # Page unavailable: use the same placeholders as for missing elements.
        user_scores[link] = '0'
        overview, pos, mixed, neg = np.nan, '0', '0', '0'
        names, dates, scores = [], [], []
    user_overviews[link] = overview
    user_pos[link] = pos
    user_mixed[link] = mixed
    user_neg[link] = neg
    users_names[link] = names
    users_dates[link] = dates
    users_scores[link] = scores

    # NOTE: only the first page of user reviews is collected; the pagination
    # code below is disabled.
    # p = soup.find('ul', class_='pages')
    # if p is not None: 
    #     pages_qty = len(p.find_all('li'))
    #     for page_num in range(1, pages_qty):
    #         sleep(randint(1,3))
    #         soup = get_soup(f'http://www.metacritic.com{link}/user-reviews?page={page_num}')

    #         names, dates, scores = get_reviews(soup)
    #         users_names[link] += names
    #         users_dates[link] += dates
    #         users_scores[link] += scores

  0%|          | 0/9087 [00:00<?, ?it/s]

In [None]:
# Debugging

# link=games_links[18]
# print(link)
# soup = get_soup(f'http://www.metacritic.com{link}')
# print(soup.select('div[class*="metascore_w user large"]'))
# if soup.select('div[class*="metascore_w user large"]') is not True :
#     print("bou")
# type(soup.select('div[class*="metascore_w user large"]'))

In [None]:
# Was used to debug

# link=games_links[18]
# print(link)
# soup = get_soup(f'http://www.metacritic.com{link}-2015')
# #print(soup.prettify())
# title = soup.find('div', class_='product_title').find('h1').get_text()
# overview = soup.find('span', class_='desc').get_text().strip()

In [None]:
# Assemble the per-game table from the scraped dictionaries (one row per game
# link); the link index is then replaced by a plain 0..n-1 index.
# NOTE: 'summary' is requested in `columns` but no 'summary' entry is supplied
# in the data, so that column carries no scraped values.
df = pd.DataFrame(
    data=dict(
        title=titles,
        platform=platforms,
        release_date=release_dates,
        developer=developers,
        genre=genres,
        rating=ratings,
        meta_score=meta_scores,
        meta_overview=meta_overviews,
        meta_pos=meta_pos,
        meta_mixed=meta_mixed,
        meta_neg=meta_neg,
        user_score=user_scores,
        user_overview=user_overviews,
        user_pos=user_pos,
        user_mixed=user_mixed,
        user_neg=user_neg,
    ),
    columns=[
        'title',
        'platform',
        'developer',
        'genre',
        'rating',
        'release_date',
        'summary',
        'meta_score',
        'meta_overview',
        'meta_pos',
        'meta_mixed',
        'meta_neg',
        'user_score',
        'user_overview',
        'user_pos',
        'user_mixed',
        'user_neg',
    ],
).reset_index(drop = True)

In [None]:
# Write the per-game table to '<console>_games.csv' without the row index.
df.to_csv(f'{console}_games.csv', index=False)

In [None]:
def create_reviews_df(critics_dict, dates_dict, scores_dict,
                      titles_dict=None, platforms_dict=None):
    """Flatten per-game review lists into one row-per-review DataFrame.

    Args:
        critics_dict: Maps a game key to the list of reviewer names.
        dates_dict: Maps the same keys to the list of review dates.
        scores_dict: Maps the same keys to the list of review scores.
        titles_dict: Maps the same keys to the game title. Defaults to the
            notebook-level ``titles`` dictionary.
        platforms_dict: Maps the same keys to the game platform. Defaults to
            the notebook-level ``platforms`` dictionary.

    Returns:
        pandas.DataFrame: Columns ``score``, ``critic``, ``date``, ``title``,
        ``platform``; each game's title and platform are repeated once per
        reviewer of that game.

    Raises:
        KeyError: If a key of ``critics_dict`` is missing from one of the
            other dictionaries.
    """
    # Fall back to the notebook globals so existing three-argument calls work.
    if titles_dict is None:
        titles_dict = titles
    if platforms_dict is None:
        platforms_dict = platforms

    critics = []; dates = []; scores = []; games = []; plats = []
    for k, reviewers in critics_dict.items():
        critics.extend(reviewers)
        dates.extend(dates_dict[k])
        scores.extend(scores_dict[k])
        # Repeat the game-level fields once per review of this game.
        games.extend([titles_dict[k]] * len(reviewers))
        plats.extend([platforms_dict[k]] * len(reviewers))

    return pd.DataFrame({'critic': critics, 'date': dates, 'score': scores, 'title': games, 'platform': plats},
                         columns = ['score', 'critic', 'date', 'title', 'platform'])

In [None]:
# Flatten the collected critic reviews and write them to
# '<console>_meta_reviews.csv' without the row index.
# NOTE: this rebinds `df`, so any table previously held under that name is replaced.
df = create_reviews_df(critics_names, critics_dates, critics_scores)
df.to_csv(f'{console}_meta_reviews.csv', index=False)

In [None]:
# Flatten the collected user reviews and write them to
# '<console>_user_reviews.csv' without the row index.
# NOTE: this rebinds `df`, so any table previously held under that name is replaced.
df = create_reviews_df(users_names, users_dates, users_scores)
df.to_csv(f'{console}_user_reviews.csv', index=False)