In [1]:
import logging

# Configure the root logger (getLogger() without a name) so that INFO-level messages and above are let through.
# Later cells log through this `logger` object.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [2]:
import requests
import time
from tqdm import tqdm
from bs4 import BeautifulSoup
from fastcore.all import Path

# Root folder for everything this notebook reads and writes
data_path = Path("./data/games")
# Checkpoint file holding one Steam store link per line
steam_links_path = data_path.joinpath("links.txt")

# We can use this to get the links to all Steam games based on the regular Steam search functionality. It is a good starting point
# for scraping the game pages for more information and images
def get_all_steam_games() -> list[str]:
    """Collect store-page links from Steam's search results, 50 results per request.

    After every page the complete list gathered so far is written to
    ``steam_links_path``. If that file already exists when the function is called,
    its lines are loaded first and the scrape continues from there, which makes an
    interrupted run resumable.

    Returns:
        The collected links in the order they were encountered. Links added during
        this run are never duplicates of anything collected before them.

    Raises:
        requests.HTTPError: if any search request returns an error status.
        requests.RequestException: if a request fails or exceeds the timeout.
    """
    steam_url = "https://store.steampowered.com/search/results/?query&start={start}&count=50&dynamic_data=&sort_by=_ASC&supportedlang=english&snr=1_7_7_230_7&infinite=1"
    # Without a timeout a stalled connection would block the scrape forever
    request_timeout_seconds = 30

    first_page_response = requests.get(steam_url.format(start=0), timeout=request_timeout_seconds)
    # Fail quickly on the very first request too, rather than with a confusing JSON decode error
    first_page_response.raise_for_status()
    total_count = first_page_response.json()["total_count"]

    steam_links: list[str] = []
    processed_links: set[str] = set()

    # We save the list of processed links to make scraping resumable. If we have a checkpoint read it, and continue from there
    if steam_links_path.exists():
        steam_links = steam_links_path.read_text().splitlines()
        processed_links = set(steam_links)

    # Make sure the very first checkpoint write does not fail because the data folder is missing
    steam_links_path.parent.mkdir(parents=True, exist_ok=True)

    for start in tqdm(range(len(processed_links), total_count, 50)):
        response = requests.get(steam_url.format(start=start), timeout=request_timeout_seconds)

        # Let's fail quickly here and see why we got an error. That way we can adjust the flow if needed
        response.raise_for_status()

        # Grab all game/content links in the returned response
        page_content = BeautifulSoup(response.json()["results_html"], "html.parser")
        links = page_content.find_all("a", { "data-ds-appid": True })

        # Track seen links incrementally (instead of rebuilding the set from the whole list on every page),
        # which also filters out a link that shows up twice within the same page
        for link in links:
            href = link["href"]
            if href not in processed_links:
                processed_links.add(href)
                steam_links.append(href)

        # And checkpoint. This just overwrites the file every time. It's not the most efficient, but it's simple and easy to understand
        steam_links_path.write_text("\n".join(steam_links))

        # Slightly throttle the requests so we don't get rate limited, but Steam seems to be pretty forgiving
        time.sleep(0.1)

    return steam_links

# When we have a good enough checkpoint, we can just read the list from the file
def read_steam_links_from_file() -> list[str]:
    """Load the checkpointed links from ``steam_links_path``, dropping repeated lines.

    Returns:
        The distinct lines of the file, each kept at the position of its first occurrence.
    """
    # A dict keeps insertion order (guaranteed from Python 3.7), which a set would not.
    # Steam results are returned in order of popularity and we'd like to keep it that way
    first_seen: dict[str, None] = {}
    for link in steam_links_path.read_text().splitlines():
        first_seen.setdefault(link, None)
    return list(first_seen)

# Uncomment the next line to run (or resume) the scrape of the Steam search results.
# steam_links = get_all_steam_games()
# Otherwise the links are loaded from the checkpoint file written by an earlier scrape.
steam_links = read_steam_links_from_file()
# Preview the first ten links
steam_links[:10]

['https://store.steampowered.com/app/553850/HELLDIVERS_2/?snr=1_7_7_230_150_1',
 'https://store.steampowered.com/app/1086940/Baldurs_Gate_3/?snr=1_7_7_230_150_1',
 'https://store.steampowered.com/app/1172470/Apex_Legends/?snr=1_7_7_230_150_1',
 'https://store.steampowered.com/app/1158310/Crusader_Kings_III/?snr=1_7_7_230_150_1',
 'https://store.steampowered.com/app/899770/Last_Epoch/?snr=1_7_7_230_150_1',
 'https://store.steampowered.com/app/2379780/Balatro/?snr=1_7_7_230_150_1',
 'https://store.steampowered.com/app/2670630/Supermarket_Simulator/?snr=1_7_7_230_150_1',
 'https://store.steampowered.com/app/1085660/Destiny_2/?snr=1_7_7_230_150_1',
 'https://store.steampowered.com/app/230410/Warframe/?snr=1_7_7_230_150_1',
 'https://store.steampowered.com/app/1938090/Call_of_Duty/?snr=1_7_7_230_150_1']

In [11]:
import msgspec
from datetime import date, datetime
from typing import Optional, Literal
from urllib.parse import urlparse

# Allowed values for the first path segment of a store URL (e.g. "app" in /app/553850/HELLDIVERS_2/).
# create_steam_game_list validates that segment against this type.
SteamUrlType = Literal["app", "sub"]

class SteamReview(msgspec.Struct):
    """Review summary parsed from one review-summary element of a store page."""
    # Stripped text of the review summary element
    sentiment: str
    # Review count taken from the tooltip ("... of the 643 user reviews"), thousands separators removed
    number_of_reviewers: int
    # Percentage taken from the tooltip ("84% of the ...")
    positive_percentage: int

class SteamPageInformation(msgspec.Struct):
    """Information scraped from the cached HTML of a single store page."""
    # Kind of content. The extraction code in this notebook only ever produces "game" or "dlc".
    type: Literal["game", "dlc", "package"]

    # Stripped text of the page's title element
    title: str
    # Stripped text of the description element, or None when the page has none
    description: Optional[str]
    # Integer value of the `data-price-final` attribute, or None when no matching purchase element was found.
    # NOTE(review): the sample output shows 3999, so this is presumably in minor currency units - confirm.
    price: Optional[int]
    # `src` attributes of the screenshot thumbnails in the highlight strip
    images: list[str]
    genres: list[str]
    tags: list[str]
    developers: list[str]
    publishers: list[str]
    # Release date kept as the raw text shown on the page (not parsed into a date)
    release_date: Optional[str]
    # None when the page has no such review section or the tooltip reports that there are not enough reviews
    recent_reviews: Optional[SteamReview]
    all_reviews: Optional[SteamReview]

class SteamGame(msgspec.Struct):
    """A single entry of the Steam link list, optionally enriched with scraped page data."""
    # Second path segment of the store URL
    id: str
    # Store URL with the query string removed
    url: str
    # Third path segment of the store URL, or None when the URL has no such segment
    raw_name: Optional[str]
    # First path segment of the store URL
    url_type: SteamUrlType
    # 1-based position of the game's first occurrence in the link list
    popularity_rank: int
    # None until the cached page has been parsed
    page_information: Optional[SteamPageInformation]

    def crawled_page_id(self) -> str:
        """Return the identifier used as the file stem of this game's cached HTML page."""
        name_part = self.raw_name or 'Unknown'
        return f"{self.id}_{name_part}"
        
def create_steam_game_list(urls: list[str]) -> list[SteamGame]:
    """Turn store links into ``SteamGame`` records, one per distinct URL path.

    The query string of every link is dropped. The first occurrence of a path decides
    the game's popularity rank (1 for the first distinct path, 2 for the next, ...);
    later links with the same path are ignored.

    Args:
        urls: store links, ordered by popularity.

    Returns:
        The games in order of first appearance, all with ``page_information=None``.

    Raises:
        ValueError: if a link has fewer than two path segments.
        msgspec.ValidationError: if the first path segment is not a valid ``SteamUrlType``.
    """
    games_by_path: dict[str, SteamGame] = {}

    for raw_url in urls:
        parsed_url = urlparse(raw_url)._replace(query=None)
        segments = parsed_url.path.strip('/').split('/')

        if len(segments) < 2:
            raise ValueError(f"Invalid url: {raw_url}")

        # Only the first occurrence of a path is recorded
        if parsed_url.path in games_by_path:
            continue

        kind, game_id, *remaining = segments
        games_by_path[parsed_url.path] = SteamGame(
            id=game_id,
            url=parsed_url.geturl(),
            raw_name=remaining[0] if remaining else None,
            # Decoding the segment as a JSON string validates it against the allowed literals
            url_type=msgspec.json.decode(f'"{kind}"', type=SteamUrlType),
            # Ranks are handed out consecutively, so the next rank is the number of games so far plus one
            popularity_rank=len(games_by_path) + 1,
            page_information=None,
        )

    return list(games_by_path.values())


# Build one SteamGame per distinct store URL path; page_information stays None until the pages are parsed
steam_games = create_steam_game_list(steam_links)
# Preview the first ten games
steam_games[:10]

[SteamGame(id='553850', url='https://store.steampowered.com/app/553850/HELLDIVERS_2/', raw_name='HELLDIVERS_2', url_type='app', popularity_rank=1, page_information=None),
 SteamGame(id='1086940', url='https://store.steampowered.com/app/1086940/Baldurs_Gate_3/', raw_name='Baldurs_Gate_3', url_type='app', popularity_rank=2, page_information=None),
 SteamGame(id='1172470', url='https://store.steampowered.com/app/1172470/Apex_Legends/', raw_name='Apex_Legends', url_type='app', popularity_rank=3, page_information=None),
 SteamGame(id='1158310', url='https://store.steampowered.com/app/1158310/Crusader_Kings_III/', raw_name='Crusader_Kings_III', url_type='app', popularity_rank=4, page_information=None),
 SteamGame(id='899770', url='https://store.steampowered.com/app/899770/Last_Epoch/', raw_name='Last_Epoch', url_type='app', popularity_rank=5, page_information=None),
 SteamGame(id='2379780', url='https://store.steampowered.com/app/2379780/Balatro/', raw_name='Balatro', url_type='app', popular

In [4]:
from requests.adapters import HTTPAdapter, Retry
from tqdm.contrib.concurrent import thread_map

# Folder where the raw HTML of each crawled store page is cached (one "<crawled_page_id>.html" file per game)
steam_game_pages_path = data_path / "pages"

def crawl_game_information_page(game: SteamGame, session: requests.Session) -> tuple[str, bool]:
    """Crawl a single Steam content page with the given session and cache its HTML on disk.

    Args:
        game: the game to crawl. Its ``url`` is requested and ``crawled_page_id()`` names the cache file.
        session: the session used to perform the request.

    Returns:
        ``(game.url, succeeded)``. ``succeeded`` is True when the page is already cached or was
        downloaded and saved, and False when the request failed or returned an error status.
    """
    # We save crawled HTML pages so that we can resume from where we left off. If this game has a corresponding file,
    # there is nothing left to do
    page_content_path = steam_game_pages_path / f"{game.crawled_page_id()}.html"
    if page_content_path.exists():
        return game.url, True

    try:
        # Without a timeout a single stalled connection would block its worker forever
        response = session.get(game.url, timeout=30)
    except requests.RequestException as error:
        # Report network failures (including exhausted retries) as a failed crawl instead of aborting the whole run
        logger.warning(f"Failed to crawl {game.url} due to {error}")
        return game.url, False

    if not response.ok:
        return game.url, False

    # Write to a temporary file first and rename it afterwards, so an interrupted run never leaves a
    # truncated page behind that a resumed run would mistake for a finished one
    page_content_path.parent.mkdir(parents=True, exist_ok=True)
    partial_path = page_content_path.with_name(page_content_path.name + ".part")
    partial_path.write_text(response.text)
    partial_path.replace(page_content_path)
    return game.url, True

def crawl_all_pages(games: list[SteamGame], max_workers: int = 8) -> list[tuple[str, bool]]:
    """Crawl the store pages of all given games with a pool of worker threads.

    Args:
        games: the games whose pages should be crawled.
        max_workers: number of worker threads. Keep it low so Steam doesn't rate limit us.

    Returns:
        One ``(url, succeeded)`` tuple per game, as returned by ``crawl_game_information_page``.
    """
    # Set up a session with automatic retries and connection pooling. The context manager closes the
    # pooled connections once all pages have been processed.
    with requests.Session() as session:
        adapter = HTTPAdapter(
            # Never go below the library default of 10, but grow the pool when more workers are requested
            pool_maxsize=max(10, max_workers),
            max_retries=Retry(total=3, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504]),
        )
        session.mount("https://", adapter)

        # thread_map only returns after every game has been processed, so the session is still open for all requests
        return thread_map(lambda game: crawl_game_information_page(game, session), games, max_workers=max_workers)

# Crawl every page, then show the first ten results together with the number of pages that failed
page_crawl_results = crawl_all_pages(steam_games)
page_crawl_results[:10], sum(1 for _, succeeded in page_crawl_results if not succeeded)

  0%|          | 0/82100 [00:00<?, ?it/s]

([('https://store.steampowered.com/app/553850/HELLDIVERS_2/', True),
  ('https://store.steampowered.com/app/1086940/Baldurs_Gate_3/', True),
  ('https://store.steampowered.com/app/1172470/Apex_Legends/', True),
  ('https://store.steampowered.com/app/1158310/Crusader_Kings_III/', True),
  ('https://store.steampowered.com/app/899770/Last_Epoch/', True),
  ('https://store.steampowered.com/app/2379780/Balatro/', True),
  ('https://store.steampowered.com/app/2670630/Supermarket_Simulator/', True),
  ('https://store.steampowered.com/app/1085660/Destiny_2/', True),
  ('https://store.steampowered.com/app/230410/Warframe/', True),
  ('https://store.steampowered.com/app/1938090/Call_of_Duty/', True)],
 0)

In [12]:
import re
from typing import TypeVar, Callable
from bs4 import ResultSet, Tag
from tqdm.contrib.concurrent import process_map

# Result type of the mapper callbacks passed to extract_one_tag / extract_all_tags
T = TypeVar("T")

def extract_one_tag(tag: Tag|None, mapper: Callable[[Tag], T]) -> T|None:
    """Apply ``mapper`` to ``tag``, passing a missing tag (``None``) through unchanged.

    Args:
        tag: the selected element, or None when nothing was selected.
        mapper: function that converts the element into the desired value.

    Returns:
        ``mapper(tag)``, or None when ``tag`` is None.
    """
    return None if tag is None else mapper(tag)

def extract_all_tags(tags: ResultSet[Tag], mapper: Callable[[Tag], T]) -> list[T]:
    """Apply ``mapper`` to every element of ``tags`` and collect the results in order.

    Args:
        tags: the selected elements.
        mapper: function that converts one element into the desired value.

    Returns:
        A new list with one mapped value per element of ``tags``.
    """
    return list(map(mapper, tags))

# Regex to extract the two numbers from a review tooltip based on this template: "84% of the 643 user reviews".
# Group 1 is the percentage; group 2 is the number of reviews, which may contain commas as thousands separators.
review_regex = re.compile(r"(\d+)% of the ([\d,]+) user reviews")

def create_reviews_from_tag(tag: Tag) -> SteamReview|None:
    """Build a ``SteamReview`` from a review summary element.

    Args:
        tag: element whose ``data-tooltip-html`` attribute holds the review tooltip and whose
            text holds the sentiment.

    Returns:
        The parsed review, or None when the tooltip reports that there are no (or not enough) reviews.

    Raises:
        ValueError: if the tooltip does not match ``review_regex``.
    """
    tooltip = str(tag["data-tooltip-html"])

    # These two tooltips carry no numbers at all
    if tooltip in ("Need more user reviews to generate a score", "No user reviews"):
        return None

    parsed = review_regex.search(tooltip)
    if parsed is None:
        raise ValueError(f"Failed to parse review tooltip {tooltip}")

    percentage_text, reviewers_text = parsed.groups()
    return SteamReview(
        sentiment=tag.text.strip(),
        number_of_reviewers=int(reviewers_text.replace(",", "")),
        positive_percentage=int(percentage_text.replace(",", "")),
    )

def extract_page_information_for_game(game: SteamGame) -> SteamGame:
    """Parse the cached store page of ``game`` and return a copy with ``page_information`` filled in.

    Only "app" pages that were already saved under ``steam_game_pages_path`` are supported;
    both preconditions are checked with assertions.

    Args:
        game: the game whose cached HTML page should be parsed.

    Returns:
        A new ``SteamGame`` with the same identifying fields as ``game`` plus the scraped
        ``SteamPageInformation``.

    Raises:
        AssertionError: if the page was not crawled, the URL is not an "app" URL, or the page has no title.
        Exception: any error raised while converting the selected elements is logged and re-raised.
    """
    page_path = steam_game_pages_path / f"{game.crawled_page_id()}.html"
    assert page_path.exists(), "We should only try to extract page information for games that have been crawled"
    assert game.url_type == "app", "Only crawling app pages is implemented"

    soup = BeautifulSoup(page_path.read_text(), "html.parser")

    def stripped_text(tag: Tag) -> str:
        # Shared mapper for every field that is just the element's text
        return tag.text.strip()

    # Basic properties
    title_tag = soup.select_one("#appHubAppName")
    # The page counts as DLC when the DLC bubble is present, and as a game otherwise
    is_dlc = soup.select_one(".game_area_dlc_bubble") is not None
    assert title_tag is not None, f"Each game on Steam should have at least a title, but {game.url} did not have one"
    description_tag = soup.select_one(".game_area_description")
    release_date_tag = soup.select_one(".release_date .date")

    # Multi-valued properties
    tag_links = soup.select(".glance_tags.popular_tags a.app_tag")
    developer_links = soup.select("#developers_list a")
    publisher_links = soup.select(".dev_row + .dev_row .summary.column a")
    genre_links = soup.select("#genresAndManufacturer span[data-panel] a")

    # Price and reviews
    price_tag = soup.select_one(".game_area_purchase_game_wrapper .game_area_purchase_game:not(.dynamic_bundle_description) div[data-price-final]")
    all_reviews_tag = soup.select_one("#review_histogram_rollup_section .game_review_summary")
    recent_reviews_tag = soup.select_one("#review_histogram_recent_section .game_review_summary")

    # Images
    image_tags = soup.select("#highlight_strip .highlight_strip_screenshot img")

    try:
        page_information = SteamPageInformation(
            title=stripped_text(title_tag),
            description=extract_one_tag(description_tag, stripped_text),
            type="dlc" if is_dlc else "game",
            release_date=extract_one_tag(release_date_tag, stripped_text),
            tags=extract_all_tags(tag_links, stripped_text),
            developers=extract_all_tags(developer_links, stripped_text),
            publishers=extract_all_tags(publisher_links, stripped_text),
            genres=extract_all_tags(genre_links, stripped_text),
            price=extract_one_tag(price_tag, lambda tag: int(str(tag["data-price-final"]))),
            all_reviews=extract_one_tag(all_reviews_tag, create_reviews_from_tag),
            recent_reviews=extract_one_tag(recent_reviews_tag, create_reviews_from_tag),
            images=extract_all_tags(image_tags, lambda tag: str(tag["src"])),
        )
        return SteamGame(
            id=game.id,
            url=game.url,
            raw_name=game.raw_name,
            url_type=game.url_type,
            popularity_rank=game.popularity_rank,
            page_information=page_information,
        )
    except Exception as e:
        # Log which game broke the extraction before letting the error propagate
        logger.error(f"Failed to extract information for game {game.url} due to {e}")
        raise e

# IDs that are left out when extracting page information
excluded_ids = {
    "1675200", # Steam Deck, redirects to a marketing page
    "1696780", # Steam Deck Docking Station - same idea

    # There seem to be some broken IDs. You can spot them because their URLs end in //:
    "657510",
    "1543120",
    "1929910",
    "2313910",
    "2199040",
    "2011531",
    "2011530",
    "2151650",
    "2027270",
    "955790",
    "1936041",
    "1936040",
    "2288250",
    "1999030",
    "2240030",
    "1897163",
}
games_path = data_path / "games.json"

# Parsing every crawled page is slow, so the result is cached in games.json and simply loaded when that
# file exists. The file is only written after all pages were parsed, so its presence means the extraction
# finished. Delete it to force a fresh extraction.
games: list[SteamGame]
if games_path.exists():
    games = msgspec.json.decode(games_path.read_bytes(), type=list[SteamGame])
else:
    games = process_map(
        extract_page_information_for_game,
        # Only "app" pages are supported by the extraction, and a few known-bad IDs are skipped
        [game for game in steam_games
         if game.id not in excluded_ids
         and game.url_type == "app"
        ],
        chunksize=50,
    )
    games_path.write_bytes(msgspec.json.format(msgspec.json.encode(games)))

# Preview the first ten games
games[:10]

  0%|          | 0/81867 [00:00<?, ?it/s]

[SteamGame(id='553850', url='https://store.steampowered.com/app/553850/HELLDIVERS_2/', raw_name='HELLDIVERS_2', url_type='app', popularity_rank=1, page_information=SteamPageInformation(type='game', title='HELLDIVERS™ 2', description='Digital Deluxe Edition Edition includes:‘DP-53\u202fSavior of the Free’ Armor Set.‘Will of the People’ Cape.‘MP-98 Knight’ Weapon.Super Citizen Status.Stratagem Hero Ship Game.‘Steeled Veterans’ Premium Warbond.', price=3999, images=['https://cdn.cloudflare.steamstatic.com/steam/apps/553850/ss_0c79f56fc7be1bd0102f2ca1c92c8f0900daf4fb.116x65.jpg?t=1709666906', 'https://cdn.cloudflare.steamstatic.com/steam/apps/553850/ss_33e684e9cb2517af1599f0ca2b57d65ee82c2e51.116x65.jpg?t=1709666906', 'https://cdn.cloudflare.steamstatic.com/steam/apps/553850/ss_8949ed7dd24a02d5ea13b08fc5c04fab400dc4bd.116x65.jpg?t=1709666906', 'https://cdn.cloudflare.steamstatic.com/steam/apps/553850/ss_50afbbc4d811c38fe9f64c1fc8d7eb9d9da6d24c.116x65.jpg?t=1709666906', 'https://cdn.cloudfl

242675244