In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time

In [8]:
# user_agents = [
#     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
#     'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
#     'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0',
#     # Add more User-Agents if needed
# ]

# HTTP headers passed as `headers=headers` by the request helpers in this notebook.
# NOTE(review): the User-Agent value looks like an unfilled template string
# (platform / gecko-version placeholders) — confirm this is intentional.
headers = {"User-Agent": "Mozilla/5.0 (platform; rv:gecko-version) Gecko/gecko-trail Firefox/132.0.1-1"}

def get_sitemaps(timeout: float = 10) -> list:
    """Return the child sitemap URLs listed in the Metacritic games sitemap index.

    Args:
        timeout: Seconds to wait for the server before the request is aborted.

    Returns:
        One string per ``<sitemap>`` element of the index, in document order.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # URL
    sitemap = "https://www.metacritic.com/games.xml"
    sm_response = requests.get(sitemap, headers=headers, timeout=timeout)
    # Fail loudly on an error status instead of parsing an error page into [].
    sm_response.raise_for_status()
    sm_soup = BeautifulSoup(sm_response.text, 'xml')

    locations = []
    for entry in sm_soup.find_all('sitemap'):
        # Read <loc> specifically so sibling elements (e.g. <lastmod>) cannot
        # be concatenated onto the URL; fall back to the whole entry's text
        # when no <loc> child exists.
        loc = entry.find('loc')
        source = loc if loc is not None else entry
        locations.append(source.get_text(strip=True))
    return locations

In [9]:
def get_urls(sitemap: str) -> list:
    """Download one sitemap and return every URL found in its ``<loc>`` elements.

    Args:
        sitemap: URL of the sitemap XML document to download.

    Returns:
        The stripped text of each ``<loc>`` element, in document order.

    Raises:
        requests.RequestException: If the download fails or takes longer than
            3 seconds.
        ValueError: If the document contains no ``<loc>`` element.
    """
    # Request errors are deliberately NOT swallowed here. Returning the sitemap
    # string on failure made callers that extend a list with the result add the
    # string's individual characters as "URLs" and never retry the sitemap.
    urls_response = requests.get(sitemap, headers=headers, timeout=3)

    urls_soup = BeautifulSoup(urls_response.text, 'xml')
    urls = urls_soup.find_all('loc')
    if not urls:
        raise ValueError('No URLs found in sitemap')

    return [url.get_text(strip=True) for url in urls]

In [10]:
def _field_text(node, tag_name: str, css_class: str, label: str = ''):
    """Return the stripped text of the first matching tag under ``node``, or NaN if there is none."""
    match = node.find(tag_name, class_=css_class)
    if match is None:
        return np.nan
    text = match.get_text(strip=True)
    return text.replace(label, '') if label else text


def get_data(url: str, timeout: float = 10) -> pd.DataFrame:
    """Scrape one game page into a single-row DataFrame.

    Args:
        url: Address of the game page to download.
        timeout: Seconds to wait for the server before the request is aborted.

    Returns:
        A one-row DataFrame with the columns 'Game Title', 'Game Genre',
        'Pricing', 'Publisher', 'Release Date', 'Platform', 'Rating' and
        'Number of Ratings'. A field whose element is not found on the page
        is stored as NaN.

    Raises:
        ValueError: If the response body is empty ('No data found in URL'),
            the page contains the 404 marker ('404 Error'), the user score
            reads 'tbd', or the ratings count cannot be converted to an int.
        requests.RequestException: If the download fails or times out.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # A BeautifulSoup object is always truthy, so emptiness has to be checked
    # on the response body itself rather than on the parsed soup.
    if not response.text.strip():
        raise ValueError('No data found in URL')
    soup = BeautifulSoup(response.text, 'html.parser')
    if soup.find('div', class_='c-error404'):
        raise ValueError('404 Error')

    # Rating — handled first so that 'tbd' pages are rejected before any
    # other field is extracted.
    rating = np.nan
    number_of_ratings = np.nan
    score_box = soup.find('div', class_='c-productScoreInfo u-clearfix')
    if score_box is not None:
        rating = _field_text(
            score_box, 'div',
            'c-siteReviewScore_background c-siteReviewScore_background-user',
        )
        if rating == 'tbd':
            raise ValueError('tbd')
        total = _field_text(score_box, 'span', 'c-productScoreInfo_reviewsTotal u-block')
        if isinstance(total, str):
            number_of_ratings = int(
                total.replace('Based on ', '').replace(' User Ratings', '').replace(',', '')
            )

    # Title
    title = _field_text(
        soup, 'div',
        'c-productHero_title g-inner-spacing-bottom-medium g-outer-spacing-top-medium',
    )

    # Genre
    genre = _field_text(soup, 'li', 'c-genreList_item')

    # Publisher
    publisher = _field_text(
        soup, 'div', 'c-gameDetails_Distributor u-flexbox u-flexbox-row', 'Publisher:'
    )

    # Release Date
    release_date = _field_text(
        soup, 'div', 'c-gameDetails_ReleaseDate u-flexbox u-flexbox-row', 'Initial Release Date:'
    )

    # Platform — all listed platforms joined into one comma-separated string
    platform = np.nan
    platform_box = soup.find('div', class_='c-gameDetails_Platforms u-flexbox u-flexbox-row')
    if platform_box is not None:
        items = platform_box.find_all(
            'li', class_='c-gameDetails_listItem g-color-gray70 u-inline-block'
        )
        platform = ", ".join(item.get_text(strip=True) for item in items)

    # Price
    pricing = np.nan
    purchase_button = soup.find('div', {'data-cy': 'w2w-purchase-button'})
    if purchase_button is not None:
        price_text = _field_text(purchase_button, 'span', 'g-outer-spacing-left-small')
        if isinstance(price_text, str):
            pricing = price_text.replace('(', '').replace(')', '')

    # Every column receives exactly one value, so the frame can always be
    # built (unequal column lengths make the DataFrame constructor raise).
    record = {
        'Game Title': title,
        'Game Genre': genre,
        'Pricing': pricing,
        'Publisher': publisher,
        'Release Date': release_date,
        'Platform': platform,
        'Rating': rating,
        'Number of Ratings': number_of_ratings,
    }
    return pd.DataFrame({column: [value] for column, value in record.items()})

In [11]:
# Fetch the list of child sitemaps once (get_sitemaps() performs an HTTP request).
main_sitemap = get_sitemaps()
# Number of entries returned by the sitemap index.
total_sitemaps = len(main_sitemap)

def get_game_links(sitemaps, retry=0, max_retries=5):
    """Collect URLs from every sitemap, retrying the ones that fail.

    Each round calls ``get_urls`` on the sitemaps still pending. URLs gathered
    in a round are written, one per line, to ``"<round>_metacritic.txt"`` in
    the current working directory; sitemaps that failed are tried again in the
    next round.

    Args:
        sitemaps: Sitemap URLs to process.
        retry: Number of the first round; used in the output file name.
        max_retries: Maximum number of additional rounds after the first one.
            Bounds the work when a sitemap stays unreachable (the previous
            unbounded recursion ended in a RecursionError in that case).

    Returns:
        None. Results are written to files and progress is printed.
    """
    pending = list(sitemaps)

    for attempt in range(retry, retry + max_retries + 1):
        if not pending:
            return

        missed_urls = []
        success_urls = []

        for i, sitemap in enumerate(pending):

            print(f"\rFound {len(success_urls)}; searching at {i}/{len(pending)}", end="")

            try:
                urls = get_urls(sitemap)
            except Exception:
                urls = None

            # Anything other than a list counts as a failure. In particular, a
            # bare string must never be merged in: extending a list with it
            # would add its individual characters as URLs.
            if not isinstance(urls, list):
                missed_urls.append(sitemap)
                continue

            success_urls.extend(urls)

        if success_urls:
            with open(f"{attempt}_metacritic.txt", "w", encoding="utf-8") as file:
                file.write("\n".join(success_urls))

        print(f"\n{len(success_urls)} URLs found, {len(missed_urls)} sitemaps inaccessible")

        pending = missed_urls

    if pending:
        print(f"Giving up on {len(pending)} sitemaps after {max_retries} retries")

In [12]:
# Crawl every sitemap from the index; collected URLs are written to "<round>_metacritic.txt".
get_game_links(main_sitemap)

Found 248999; searching at 249/250
249724 URLs found, 0 sitemaps inaccessible
