#### Scrape data from the Metacritic website
https://www.metacritic.com/browse/game/?releaseYearMin=2015&releaseYearMax=2025&page=1



In [None]:
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from pydantic import BaseModel
from typing import Optional, List
import pandas as pd
import time
import os
import math
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor
import threading
from queue import Queue
import logging
import random


# Configure logging: timestamp, level and message at INFO and above.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger(__name__)

# Thread-safe list wrapper
class ThreadSafeList:
    """A list whose mutations and snapshots are serialized by a lock."""

    def __init__(self):
        self._lock = threading.Lock()
        self._list = []

    def append(self, item):
        """Add a single ``item`` to the end of the list."""
        with self._lock:
            self._list.append(item)

    def extend(self, items):
        """Add every element of the iterable ``items`` to the end of the list."""
        with self._lock:
            self._list.extend(items)

    def get_list(self):
        """Return a shallow copy of the current contents.

        The caller gets its own list object, so changing it does not
        affect the data held by this instance.
        """
        with self._lock:
            snapshot = list(self._list)
        return snapshot

# Data models
class gameDetails(BaseModel):
    """Fields kept from a game's page payload.

    process_game builds this from ``components[0]["data"]["item"]`` of the
    game page response. Every field is optional and defaults to ``None``.
    """
    # NOTE(review): keys of the payload not declared here are presumably
    # dropped by pydantic's default config - confirm for the pinned version.
    id: Optional[int] = None
    title: Optional[str] = None
    releaseDate: Optional[str] = None
    rating: Optional[str] = None
    # List of genre entries; process_game reads a "name" key from each one.
    genres: Optional[list] = None
    description: Optional[str] = None
    # List of platform entries; process_game replaces it with a joined string.
    platforms: Optional[list] = None
    # process_game reads the "companies" list from this dict, then drops it.
    production: Optional[dict] = None

class reviewDetails(BaseModel):
    """Fields kept from a single review item.

    process_game builds this from each entry of ``data.items`` in a reviews
    response. Every field is optional and defaults to ``None``.
    """
    quote: Optional[str] = None
    score: Optional[int] = None
    date: Optional[str] = None
    platform: Optional[str] = None
    author: Optional[str] = None
    # For critic reviews this column is later overwritten with ``author``
    # when the reviews table is assembled in scrape_metacritic_games.
    publicationName: Optional[str] = None

def start_session(url, max_retries=3):
    """Fetch ``url`` with a GET request, retrying on failure.

    A fresh ``requests.Session`` is created for the call and closed before
    returning, so its connection pool is not leaked. The response body is
    downloaded eagerly (no ``stream=True``), so the returned response stays
    usable after the session is closed.

    Args:
        url: Address to request.
        max_retries: Number of attempts made by this function; the same
            value is also passed to the urllib3 ``Retry`` policy mounted
            on the session.

    Returns:
        The ``requests.Response`` when the server answers 200.
        ``None`` when the server answers 404, or when every attempt ends
        in another status code or in a request error.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36"
    }
    retry = Retry(total=max_retries, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)

    with requests.Session() as session:
        session.mount('http://', adapter)
        session.mount('https://', adapter)

        for attempt in range(max_retries):
            is_last_attempt = attempt == max_retries - 1
            try:
                response = session.get(url, headers=headers, timeout=30)
            except requests.RequestException as e:
                if is_last_attempt:
                    logger.error(f"Failed to fetch {url}: {e}")
                    return None
                time.sleep(2 ** attempt)  # exponential backoff
                continue

            if response.status_code == 200:
                return response
            if response.status_code == 404:
                # A missing resource will not appear on retry.
                return None
            if is_last_attempt:
                # Report the failure instead of returning None silently,
                # and skip the pointless sleep after the final attempt.
                logger.error(f"Failed to fetch {url}: HTTP {response.status_code}")
                return None
            time.sleep(2)
    return None

# NOTE(review): the API key is hard-coded in the source; consider loading it
# from configuration or the environment instead.
_METACRITIC_BACKEND = "https://backend.metacritic.com"
_METACRITIC_API_KEY = "1MOZgmNFxvmljaQR1X9KAij9Mo4xAY3u"
_PRODUCT_TYPE = "games"


def _build_game_record(game_details):
    """Flatten a game page payload into (row for the games table, raw platforms)."""
    components = game_details["components"]
    item = components[0]["data"]["item"]
    game_info = dict(gameDetails(**item))

    # Score summaries are read from fixed positions in the component list.
    # NOTE(review): indices 6 (critic) and 8 (user) depend on the page layout
    # returned by the backend - confirm they are stable.
    critic_summary = components[6]["data"]["item"]
    user_summary = components[8]["data"]["item"]
    game_info.update({
        "metascore": critic_summary["score"],
        "metascore_count": critic_summary["reviewCount"],
        "metascore_sentiment": critic_summary["sentiment"],
        "userscore": user_summary["score"],
        "userscore_count": user_summary["reviewCount"],
        "userscore_sentiment": user_summary["sentiment"]
    })

    # Genres: keep only the names, joined into one comma-separated string.
    if game_info.get("genres"):
        game_info["genres"] = ",".join(
            genre["name"] for genre in game_info["genres"] if genre["name"]
        )

    # Platforms: keep only those with a truthy critic score, looking the
    # score up once per platform so names and scores stay aligned.
    platforms_data = item["platforms"]
    scored_platforms = [
        (platform["name"], platform["criticScoreSummary"]["score"])
        for platform in platforms_data
    ]
    scored_platforms = [(name, score) for name, score in scored_platforms if score]
    game_info["platforms"] = ",".join(name for name, _ in scored_platforms)
    game_info["platform_metascores"] = ",".join(str(score) for _, score in scored_platforms)

    # Developers and publishers are told apart by the company's typeName.
    companies = game_info["production"]["companies"]
    game_info["developer"] = ",".join(
        c["name"] for c in companies if "Developer" in c["typeName"] and c["name"]
    )
    game_info["publisher"] = ",".join(
        c["name"] for c in companies if "Publisher" in c["typeName"] and c["name"]
    )

    game_info.pop("production")
    return game_info, platforms_data


def _collect_reviews(game_data, slug_name, platform_slugs, reviews_list):
    """Page through user and critic reviews per platform and append them to reviews_list."""
    review_types = ["user", "critic"]
    review_limits = [100, 100]
    max_review_per_time = 1000  # cap per (review type, platform) pair

    for review_type, review_limit in zip(review_types, review_limits):
        for platform_slug in platform_slugs:
            offset = 0
            total_reviews = 0
            while offset < max_review_per_time:
                review_url = (
                    f"{_METACRITIC_BACKEND}/reviews/metacritic/{review_type}/{_PRODUCT_TYPE}/{slug_name}"
                    f"/platform/{platform_slug}/web?apiKey={_METACRITIC_API_KEY}"
                    f"&offset={offset}&limit={review_limit}&filterBySentiment=all&sort=score"
                )
                review_response = start_session(review_url)
                if not review_response:
                    break

                # Never keep more than the requested page size.
                reviews = review_response.json()["data"]["items"][:review_limit]
                if not reviews:
                    break

                for review in reviews:
                    review_info = dict(reviewDetails(**review))
                    review_info.update({
                        "review_type": review_type,
                        "game_name": game_data["title"],
                        "game_id": game_data["id"]
                    })
                    reviews_list.append(review_info)

                total_reviews += len(reviews)
                if total_reviews >= max_review_per_time:
                    break

                offset += review_limit
                time.sleep(random.uniform(1, 3))  # random delay between pages


def process_game(game_data, games_list, reviews_list):
    """Fetch one game's details and reviews and store them in the shared lists.

    Args:
        game_data: One item of the finder listing; its "slug", "title" and
            "id" keys are read.
        games_list: Collector with an ``append`` method; receives one dict
            per game.
        reviews_list: Collector with an ``append`` method; receives one dict
            per review.

    Returns:
        None. Any error is logged with its traceback and swallowed so that
        one bad game does not abort the rest of the scrape.
    """
    try:
        slug_name = game_data["slug"]

        # Fetch the game's detail page.
        game_url = (
            f"{_METACRITIC_BACKEND}/composer/metacritic/pages/{_PRODUCT_TYPE}/{slug_name}"
            f"/web?filter=all&sort=date&apiKey={_METACRITIC_API_KEY}"
        )
        game_response = start_session(game_url)
        if not game_response:
            return

        game_info, platforms_data = _build_game_record(game_response.json())
        games_list.append(game_info)

        # Fetch the reviews for every platform that has a slug.
        platform_slugs = [p["slug"] for p in platforms_data if p["slug"]]
        _collect_reviews(game_data, slug_name, platform_slugs, reviews_list)

    except Exception as e:
        # Look the slug up defensively: indexing game_data['slug'] here would
        # raise again when the slug itself is what is missing, and that second
        # error would escape to the caller.
        slug = game_data.get("slug", "<unknown>") if isinstance(game_data, dict) else "<unknown>"
        logger.exception(f"Error processing game {slug}: {e}")

def scrape_metacritic_games():
    """Scrape games released since 2015 and their reviews, and save them as CSV.

    The finder listing is walked page by page; the games of each page are
    processed concurrently by ``process_game`` on a thread pool. Results are
    written to ``data/games.csv`` and ``data/games_reviews.csv``.

    Returns:
        ``(games_df, reviews_df)`` on success, or ``None`` when the initial
        listing request fails.
    """
    start_time = time.time()
    logger.info("开始爬取Metacritic游戏数据")

    # Output directory
    os.makedirs("data", exist_ok=True)

    # Collectors shared by the worker threads
    games_list = ThreadSafeList()
    reviews_list = ThreadSafeList()

    # Parameters
    product_type = "games"
    games_limit = 25
    current_year = datetime.now().year
    max_workers = 10  # number of worker threads

    def finder_url(offset):
        # One place to build the listing URL. To test on less data, set
        # releaseYearMin to current_year as well.
        return (
            "https://backend.metacritic.com/finder/metacritic/web"
            f"?sortBy=-metaScore&productType={product_type}&page=2"
            f"&releaseYearMin=2015&releaseYearMax={current_year}"
            f"&offset={offset}&limit={games_limit}"
            "&apiKey=1MOZgmNFxvmljaQR1X9KAij9Mo4xAY3u"
        )

    # First request only serves to learn the total number of games.
    initial_response = start_session(finder_url(0))
    if not initial_response:
        logger.error("初始化失败")
        return

    total_games = initial_response.json()["data"]["totalResults"]
    total_pages = math.ceil(total_games / games_limit)

    logger.info(f"总共发现 {total_games} 个游戏")

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for page in range(total_pages):
            offset = page * games_limit
            logger.info(f"处理第 {page + 1}/{total_pages} 页 ({(page + 1)/total_pages*100:.1f}%)")

            # Pause for 10 seconds after every two pages.
            if page > 0 and page % 2 == 0:
                time.sleep(10)
                logger.info("休息10秒繼續")

            games_response = start_session(finder_url(offset))
            if not games_response:
                continue

            games_data = games_response.json()["data"]["items"]
            futures = [
                executor.submit(process_game, game, games_list, reviews_list)
                for game in games_data
            ]

            # Wait for every game of the current page before moving on.
            for future in futures:
                future.result()

    # Post-process and save the games table.
    games_df = pd.DataFrame(games_list.get_list())
    if not games_df.empty:
        # An empty frame has no "id"/"userscore" columns to operate on.
        games_df.drop_duplicates(subset=["id"], inplace=True)
        games_df["userscore"] = games_df["userscore"].apply(lambda x: x*10 if x is not None else x)
    games_df.to_csv("data/games.csv", index=False)

    # Post-process and save the reviews table.
    reviews_df = pd.DataFrame(reviews_list.get_list())
    reviews_df.rename(columns={'game_id': 'id', 'game_name': 'title'}, inplace=True)

    # Make sure every expected column exists BEFORE any of them is read.
    required_columns = ["quote", "score", "date", "platform", "author", "publicationName", "review_type"]
    for col in required_columns:
        if col not in reviews_df.columns:
            reviews_df[col] = None
            logger.warning(f"列 {col} 不存在，已添加空值列")

    # For critic reviews the publication name is taken from the author
    # field (the original note says the data is misaligned for critics).
    if not reviews_df.empty:
        critic_rows = reviews_df['review_type'] == 'critic'
        reviews_df.loc[critic_rows, 'publicationName'] = reviews_df['author']

    reviews_df.to_csv("data/games_reviews.csv", index=False)

    duration = round((time.time() - start_time)/60, 2)
    logger.info(f"爬取完成,用时 {duration} 分钟")
    logger.info(f"共爬取 {len(games_list.get_list())} 个游戏")
    logger.info(f"共收集 {len(reviews_list.get_list())} 条评论")

    return games_df, reviews_df

# Run the full scrape only when executed as a script, not on import.
if __name__ == "__main__":
    scrape_metacritic_games()


2025-05-22 18:05:13,176 - INFO - 开始爬取Metacritic游戏数据
2025-05-22 18:05:13,278 - INFO - 总共发现 150 个游戏
2025-05-22 18:05:13,278 - INFO - 处理第 1/6 页 (16.7%)
2025-05-22 18:07:13,159 - INFO - 处理第 2/6 页 (33.3%)
2025-05-22 18:09:19,654 - INFO - 处理第 3/6 页 (50.0%)
2025-05-22 18:09:29,657 - INFO - 休息10秒繼續
2025-05-22 18:11:05,724 - INFO - 处理第 4/6 页 (66.7%)
2025-05-22 18:12:14,334 - INFO - 处理第 5/6 页 (83.3%)
2025-05-22 18:12:24,345 - INFO - 休息10秒繼續
2025-05-22 18:14:11,084 - INFO - 处理第 6/6 页 (100.0%)
2025-05-22 18:15:28,286 - INFO - 爬取完成,用时 10.25 分钟
2025-05-22 18:15:28,286 - INFO - 共爬取 150 个游戏
2025-05-22 18:15:28,287 - INFO - 共收集 100985 条评论
