In [None]:
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
# HTTP headers passed to the requests.get calls made in this notebook.
# NOTE(review): the User-Agent value still contains template placeholders
# ("platform", "gecko-version", "gecko-trail") — presumably copied from
# documentation; confirm this is the string that should be sent.
headers = {
    'User-Agent': "Mozilla/5.0 (platform; rv:gecko-version) Gecko/gecko-trail Firefox/132.0.1-1"
}

def get_sitemaps(timeout: float = 10) -> list:
    """Download the Metacritic games sitemap index and list its child sitemaps.

    Args:
        timeout: Seconds to wait for the server before giving up.

    Returns:
        One URL string per ``<sitemap>`` entry found in the index document.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # URL
    sitemap = "https://www.metacritic.com/games.xml"
    sm_response = requests.get(sitemap, headers=headers, timeout=timeout)
    # Fail loudly instead of parsing an error page into an empty list.
    sm_response.raise_for_status()
    sm_soup = BeautifulSoup(sm_response.text, 'xml')

    sitemap_urls = []
    for sm in sm_soup.find_all('sitemap'):
        # Read only the <loc> child: an entry may also carry <lastmod>, and
        # get_text() on the whole entry would glue that text onto the URL.
        loc = sm.find('loc')
        source = loc if loc is not None else sm
        sitemap_urls.append(source.get_text(strip=True))
    return sitemap_urls

In [None]:
def get_urls(sitemap: str, timeout: float = 3) -> list:
    """Download one sitemap document and return every URL listed in it.

    Args:
        sitemap: URL of the sitemap XML document to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The stripped text of every ``<loc>`` element in the document.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the document contains no ``<loc>`` elements.
    """
    # Request errors must propagate. Returning the sitemap string here would
    # make a caller doing `list += result` append it one character at a time
    # and never learn that the download failed.
    urls_response = requests.get(sitemap, headers=headers, timeout=timeout)
    urls_response.raise_for_status()

    urls_soup = BeautifulSoup(urls_response.text, 'xml')
    urls = urls_soup.find_all('loc')
    if not urls:
        raise ValueError('No URLs found in sitemap')

    return [url.get_text(strip=True) for url in urls]

In [None]:
def scrape_data(url, timeout: float = 10):
    """Scrape one game page and return its details as a single-row DataFrame.

    Args:
        url: Address of the game page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A one-row ``pd.DataFrame`` with the columns 'Game Title',
        'Game Genre', 'Pricing', 'Developer', 'Release Date', 'Platform',
        'Rating' and 'Number of Ratings'. Every value is a string; 'Pricing'
        is always 'n/a' because no price is read from the page.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the score shown on the page is 'tbd'.
        AttributeError, IndexError: If an expected element is missing from
            the page markup.
    """
    # Request errors propagate unchanged so the caller sees the real cause.
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    info = soup.find('div', class_='c-productHero_score-container u-flexbox u-flexbox-column g-bg-white')

    # rating
    ratings = info.find_all('div', class_='c-productScoreInfo_scoreContent')
    # The second score block is the one used — presumably the user score;
    # confirm against the page layout.
    score_block = ratings[1]

    score = score_block.find('div', class_='c-productScoreInfo_scoreNumber').get_text(strip=True)
    if score == 'tbd':
        raise ValueError('tbd')

    # Keep only the leading number of the review-count text and drop the
    # thousands separators.
    num_rating_text = score_block.find('span', class_='c-productScoreInfo_reviewsTotal').get_text(strip=True)
    num_ratings = re.search(r'(\d[\d,]*)', num_rating_text).group(1).replace(',', '')

    # title
    title = soup.find('div', class_='c-productHero_title').get_text(strip=True)

    # genre
    genres = soup.find('ul', class_='c-genreList').select('li')
    genre_text = ", ".join([genre.get_text(strip=True) for genre in genres])

    # developer
    devs = soup.find('div', class_='c-gameDetails_Developer').select('.c-gameDetails_listItem')
    developer_text = ", ".join([dev.get_text(strip=True) for dev in devs])

    # release date: the second <span> inside the release-date block
    release_date = soup.find('div', class_='c-gameDetails_ReleaseDate').select("span")[1].get_text(strip=True)

    # platforms
    platforms = [li.get_text(strip=True) for li in soup.select(".c-gameDetails_Platforms ul li")]

    # Key order fixes the column order of the resulting frame.
    data = {
        'Game Title': title,
        'Game Genre': genre_text,
        'Pricing': 'n/a',
        'Developer': developer_text,
        'Release Date': release_date,
        'Platform': ", ".join(platforms),
        'Rating': score,
        'Number of Ratings': num_ratings,
    }

    return pd.DataFrame([data])
    

In [None]:
# Fetch the list of child sitemaps once, when this cell runs (this performs
# a network request).
main_sitemap = get_sitemaps()
# Number of sitemaps returned by the index.
total_sitemaps = len(main_sitemap)

def get_game_links(sitemaps, retry=0, max_retries=5):
    """Collect the URLs listed in ``sitemaps``, retrying the ones that fail.

    Each round asks ``get_urls`` for every pending sitemap. The URLs gathered
    in a round are written, one per line, to ``"<retry>_metacritic.txt"`` in
    the working directory, and the sitemaps that failed become the input of
    the next round.

    Args:
        sitemaps: Sitemap URLs to process.
        retry: Number of the first round; it is used in the output file name
            and incremented after every round.
        max_retries: Highest round number that is attempted. Sitemaps still
            failing at that point are reported and abandoned, so a sitemap
            that never becomes reachable cannot cause endless retries.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    pending = list(sitemaps)

    while pending:
        missed_urls = []
        success_urls = []

        for i, sitemap in enumerate(pending, start=1):
            print(f"\rFound {len(success_urls)}; searching at {i}/{len(pending)}", end="")

            try:
                urls = get_urls(sitemap)
            except Exception:
                missed_urls.append(sitemap)
                continue

            # A bare string is not a list of URLs; extending with it would
            # add its characters one by one, so count it as a miss instead.
            if isinstance(urls, str):
                missed_urls.append(sitemap)
                continue

            success_urls.extend(urls)

        if success_urls:
            with open(f"{retry}_metacritic.txt", "w") as file:
                file.write("\n".join(success_urls))

        print(f"\n{len(success_urls)} URLs found, {len(missed_urls)} sitemaps inaccessible")

        if missed_urls and retry >= max_retries:
            print(f"Retry limit ({max_retries}) reached; giving up on {len(missed_urls)} sitemaps")
            return

        pending = missed_urls
        retry += 1

In [None]:
# get_game_links(main_sitemap)

In [None]:
def run(urls: list, part: str):
    """Scrape every URL in ``urls`` and save the results to one CSV file.

    URLs whose scrape raises an exception are skipped; a summary of how many
    were skipped, grouped by exception type, is printed after the loop.

    Args:
        urls: Game page URLs to scrape.
        part: Prefix of the output file, which is written as
            ``"<part>_metacritic_data.csv"``.

    Returns:
        None. When at least one page was scraped the combined frame is
        written to disk and its shape and first rows are printed; otherwise
        no file is written.
    """
    size = len(urls)
    df_list = []
    skipped = {}  # exception type name -> number of URLs that raised it

    for i, url in enumerate(urls, start=1):
        try:
            df_list.append(scrape_data(url))
        except Exception as e:
            # Keep going, but remember why the page was skipped.
            reason = type(e).__name__
            skipped[reason] = skipped.get(reason, 0) + 1
        print(f"\r[{int((i/size)*100)}%] {i}/{size}, scrapped {len(df_list)}", end="")

    # Finish the carriage-return progress line before printing anything else.
    print()
    if skipped:
        print(f"Skipped {sum(skipped.values())} URLs: {skipped}")

    # pd.concat raises on an empty list, so stop here when nothing was scraped.
    if not df_list:
        print(f"Nothing scraped for {part}; no CSV written")
        return

    main_df = pd.concat(df_list, ignore_index=True)
    main_df.to_csv(f"{part}_metacritic_data.csv", index=False)

    print(main_df.shape)
    print(main_df.head())

In [None]:
# Load the URLs to scrape from a local text file, one entry per line.
# NOTE(review): no cell shown here writes "metacritic_urls.txt"
# (get_game_links writes "<retry>_metacritic.txt" files) — presumably the
# file is assembled by hand from those; confirm.
with open("metacritic_urls.txt", "r") as file:
    urls = file.read().splitlines()

# Chạy từng part
> **NẾU CHIA THÀNH NHIỀU RANGE ĐỂ CHẠY THÌ ĐỪNG CHO HÀM `run()` VÀO VÒNG LẶP MÀ HÃY CHẠY RIÊNG MỖI CELL**
### VD:

In [None]:
run(urls[:50000], "part1") # run on the first 50000 links

In [None]:
run(urls[50000:100000], "part2") # the next 50000 links

In [None]:
# Third batch: links at indices 100000 through 149999.
run(urls[100000:150000], "part3")

In [None]:
# Fourth batch: links at indices 150000 through 199999.
run(urls[150000:200000], "part4")

In [None]:
# continue with further batches in the same way
# run(urls[200000:], "part5")
# runs on whatever links remain