In [None]:
# Imports
import json
import re
from ast import literal_eval
from csv import reader
from os import mkdir
from os.path import exists
from time import time, sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.display import clear_output

In [None]:
# Loads appids: keeps the first column of every row in the CSV.
# Blank lines (which csv.reader yields as empty lists) are skipped so that,
# for example, a trailing empty line does not raise IndexError.
with open("../../data/appids/appids.csv", "r", newline="") as f:
    appids = [row[0] for row in reader(f) if row]

In [None]:
# Creates the folder that receives the per-game tag files, unless it is
# already present from an earlier run
tags_dir = "../../data/tags"
if not exists(tags_dir):
    mkdir(tags_dir)

In [4]:
# Loop prep: state shared with the download loop

# Compiled pattern matching one tag record of the form
# {"tagid":...,"name":"...","count":...,"browseable":...}
pattern_tags = re.compile(
    r"{\"tagid\":\w+,\"name\":\"[^\"]+\",\"count\":\w+,\"browseable\":\w+}"
)

# App ids of games whose tags could not be extracted
failed_requests = list()

# Progress counters: whole minutes elapsed and store pages processed so far
min_elapsed, store_pages_seen = 0, 0

# Start time, used as the reference point for measuring elapsed time
timer = time()

In [None]:
# Downloads tags for each game
#
# For every appid: fetch the store page, pull the tag records out of the
# last <script type="text/javascript"> element, and write them to
# ../../data/tags/{appid}.parquet. Games whose tags cannot be extracted are
# collected in failed_requests; an unexpected HTTP status stops the run.

# Minimum number of seconds between consecutive store-page requests.
# 0 means no throttling, which matches how this loop has been running;
# raise it to be gentler on Steam.
REQUEST_DELAY_SECONDS = 0

for appid in appids:
    # Sets url
    url = f"https://store.steampowered.com/app/{appid}/"

    # Makes request
    response = requests.get(url, allow_redirects=True)
    status = response.status_code

    # Retries every five seconds for as long as the server answers 502
    # (Bad Gateway). NOTE(review): the retry is unbounded, as before.
    while status == 502:
        print("502 Error! Retrying in 5 seconds...")
        sleep(5)
        response = requests.get(url, allow_redirects=True)
        status = response.status_code

    # Timestamp of the last completed request, used for throttling below
    request_time = time()

    # Terminates loop if unexpected http error occurs. The for/else at the
    # bottom keeps this message on screen instead of clearing it.
    if status != 200:
        print(f"Error! HTTP response code {status} for appid {appid}")
        break

    # Any parsing or writing problem marks this game as failed and moves on.
    # Exception (not a bare except) so that interrupting the kernel still
    # stops the run.
    try:
        # Parses the HTML with the standard-library parser, named explicitly
        # so results do not depend on which optional parsers are installed
        soup = BeautifulSoup(response.text, "html.parser")

        # Text of the last JavaScript <script> element on the page
        script_text = soup.find_all("script", type="text/javascript")[-1].text

        # Each match is a JSON object, so it is decoded as JSON directly;
        # this handles true/false and string escapes without rewriting the text
        tags = [json.loads(tag) for tag in pattern_tags.findall(script_text)]

        # Turns tag dictionaries to DataFrame. A page with no matching tag
        # records yields an empty frame rather than a failure.
        df_tags = pd.DataFrame(tags)

        # Adds the appid as an integer column
        df_tags["appid"] = int(appid)

        # Writes tags DataFrame to parquet file
        df_tags.to_parquet(f"../../data/tags/{appid}.parquet")
    except Exception:
        # If tag extraction unsuccessful, notes appid of failed game
        failed_requests.append([appid])

    # Augments counter
    store_pages_seen += 1

    # Prints progress to console once per elapsed minute
    minutes_now = (time() - timer) // 60
    if minutes_now > min_elapsed:
        min_elapsed = minutes_now
        clear_output()
        print(
            f"{store_pages_seen}/{len(appids)} ({store_pages_seen/len(appids)*100:.2f}%) store pages scraped in {int(min_elapsed//60)} hours and {int(min_elapsed % 60)} minutes"
        )

    # Waits out the remainder of the configured delay (sleeping rather than
    # spinning the CPU); a no-op while REQUEST_DELAY_SECONDS is 0
    remaining = REQUEST_DELAY_SECONDS - (time() - request_time)
    if remaining > 0:
        sleep(remaining)
else:
    # Reached only when every appid was processed without hitting `break`
    clear_output()
    print(f"Execution finished!\nTags retrieved for all but {len(failed_requests)} games.")