In [None]:
from csv import reader, writer
from math import log2
from os import chdir, makedirs
from os.path import exists
from re import search
from time import sleep, time

from IPython.display import clear_output
from requests import get

In [None]:
# Move up two directories, to the project base directory.
# Forward slashes are used because Python accepts them as path separators on
# both Windows and POSIX, whereas "..\\.." only resolves on Windows.
# NOTE: this cell is not idempotent - re-running it without restarting the
# kernel moves the working directory up another two levels.
chdir("../..")

# Defines output directory. The trailing separator is kept because file names
# are appended to this value with plain string concatenation.
output_dir = "data/appids/"

# Makes output directory if it doesn't exist. exist_ok=True turns the call
# into a no-op when the directory is already present, so no separate
# existence check is needed.
makedirs(output_dir, exist_ok=True)

In [None]:
# Loop prep

# Opening in "w" mode creates the app ids file or truncates an existing one,
# so results from an earlier run are discarded before new rows are appended.
with open(output_dir + "appids.csv", "w") as f:
    pass

# Seeds the loop state. status and games_in_last_response start at values
# that let the scraping loop's entry condition pass on its first check.
page_num, games_in_last_response, status = 1, 25, 200

# Reference point for the elapsed-time figure shown in progress messages.
start_time = time()

In [None]:
# Pages through the store search results until a page comes back empty or a
# non-retryable HTTP error occurs. Each page's app ids are appended to
# appids.csv as soon as they are extracted.
while status == 200 and games_in_last_response > 0:

    # Constructs url for current query
    url = (
        "https://store.steampowered.com/search/results/"  # Base search url
        + "?sort_by=Released_DESC"  # Sorts by release date for consistency. Unfortunately sorts newest first; oldest first doesn't work with other filters applied.
        + "&tags=492"  # Filters to indie tag
        + "&category1=998"  # Filters to games only (filters out expansions, other software, etc.)
        + "&ignore_preferences=1"  # Ignores preferences not specified in query
        + "&json=1"  # Ensures output is json-formatted
        + f"&page={page_num}"  # Uses page number, which is augmented on loop iteration.
    )

    # Fetches and parses the page. Both the request and the JSON decoding sit
    # inside the try block, so a connection failure or an undecodable body gets
    # one retry after a minute; a second failure on the same page is re-raised.
    # "except Exception" (rather than a bare except) leaves KeyboardInterrupt
    # free to stop the cell.
    for attempt in (1, 2):
        try:
            response = get(url)
            status = response.status_code

            # If 502 error (bad gateway), waits five seconds and then tries again.
            while status == 502:
                print("502 Error! Retrying in five seconds...")
                sleep(5)
                response = get(url)
                status = response.status_code

            # Only decodes the body on success; an error page is not
            # guaranteed to be JSON.
            if status == 200:
                games = response.json()["items"]
            break
        except Exception:
            if attempt == 2:
                raise
            print("Unexpected request failure! Retrying in one minute...")
            sleep(60)

    # Any other HTTP error ends the loop here, before page_num is augmented,
    # so the failure printout below reports the page that actually failed.
    if status != 200:
        break

    # Instantiates empty list to hold app ids.
    appids = []

    # Gets appid of each game from logo url; appends to app id list.
    # The first 67 characters of the url are skipped and the run of digits that
    # follows (up to the next "/") is taken as the app id.
    # NOTE(review): assumes every logo url shares a 67-character prefix - confirm.
    for game in games:
        logo_url = game["logo"]
        match = search("^[0-9]+/", logo_url[67:])
        if match is None:
            # Fails loudly with the offending url instead of an opaque
            # "'NoneType' object is not subscriptable".
            raise ValueError(f"Could not find an app id in logo url: {logo_url}")
        appids.append([match[0][:-1]])

    # Appends new appids to file
    with open(output_dir + "appids.csv", "a", newline="") as f:
        appid_writer = writer(f)
        appid_writer.writerows(appids)

    # Gets number of games in last response (loop termination criterion)
    games_in_last_response = len(games)

    # Prints acquisition progress whenever the page number is a power of two
    # (1, 2, 4, 8, ...), so output gets sparser as the run gets longer.
    if log2(page_num) % 1 == 0:
        print(
            f"Successfully retrieved app ids through page {page_num} in {time() - start_time:.0f} seconds."
        )

    # Augments page number
    page_num += 1

# Checks request status to see if loop terminated because of HTTP error or because it ran to completion.
if status == 200:
    # Loop success printout
    clear_output()
    print("Got all indie app ids!")
else:
    # Loop failure printout
    print(f"Error!\nHTTP code {status} at page {page_num}")

In [None]:
# Checks for duplicate app ids and rewrites the file without them.
# If the block above executed correctly, there shouldn't be any duplicate app
# ids; this cell removes them if some slipped in anyway.

# Reads app ids. Blank rows are skipped so that a stray empty line in the
# file cannot raise an IndexError.
with open(output_dir + "appids.csv", "r", newline="") as f:
    appid_reader = reader(f)
    appids_dirty = [row[0] for row in appid_reader if row]

# Set of the unique app ids. Sets enforce that all values be unique.
appids_set = set(appids_dirty)

# Builds the list of unique app ids. dict.fromkeys keeps the first occurrence
# of each id in its original position, so the rewritten file has a stable,
# reproducible order (iterating the set directly would not).
appids_clean = [[appid] for appid in dict.fromkeys(appids_dirty)]

# Overwrites csv with list of unique app ids
with open(output_dir + "appids.csv", "w", newline="") as f:
    appid_writer = writer(f)
    appid_writer.writerows(appids_clean)

print(
    f"Successfully removed {len(appids_dirty) - len(appids_clean)} duplicate app ids."
)