In [None]:
import os
from csv import reader, writer
from json import dump
from os import chdir, makedirs
from os.path import exists
from time import sleep, time
from urllib.parse import quote_plus

import requests
from IPython.display import clear_output
from requests import get

In [None]:
# Move up two directories, to project base directory.
# The resolved absolute path is remembered in the kernel so that re-running
# this cell returns to the same directory instead of climbing two more
# levels on every execution.
if "project_base_dir" not in globals():
    project_base_dir = os.path.abspath("../..")
chdir(project_base_dir)

# Defines output directory (relative to the project base directory)
output_dir = "data/reviews/"

# Makes output directory if it doesn't exist
makedirs(output_dir, exist_ok=True)

In [None]:
# Reads app ids into list. Only the first column of each row is used.
# Blank lines, which the csv reader yields as empty rows, are skipped so
# they cannot raise an IndexError.
with open("data/appids/appids.csv", "r", newline="") as f:
    appids = [row[0] for row in reader(f) if row]

In [None]:
# State shared with the scraping loop

# App ids whose reviews could not be retrieved
failed_requests = list()

# Timestamp of the most recent progress printout. The scraping loop prints
# when `time() - time_of_last_print > 5`; starting from a value this small
# means the very first check passes.
time_of_last_print = 5.1

# Wall-clock time at which scraping begins
loop_start_time = time()

In [None]:
# Downloads every page of reviews for every known app id and writes each page
# to "<output_dir><appid>/<8-digit page number>.json".
#
# Reads:   appids, output_dir, loop_start_time
# Updates: failed_requests (app ids that could not be scraped or that
#          returned no reviews at all), time_of_last_print

# Seconds to wait for the server before treating a request as failed.
# Without a timeout a single stalled connection would hang the whole run.
REQUEST_TIMEOUT_S = 60


def _fetch_reviews_page(url):
    """Requests one page of reviews, retrying on transient failures.

    If the request raises, it is retried once after one minute. While the
    server answers 502 (bad gateway), the request is repeated every five
    seconds.

    Returns:
        The final response (whatever its status code), or None if the request
        raised twice in a row.
    """
    while True:
        try:
            response = get(url, timeout=REQUEST_TIMEOUT_S)
        except requests.RequestException:
            # Only request-level errors are caught, so a KeyboardInterrupt
            # can still stop the scrape.
            print("Unexpected request failure! Retrying in one minute...")
            sleep(60)

            try:
                response = get(url, timeout=REQUEST_TIMEOUT_S)
            except requests.RequestException:
                return None

        if response.status_code != 502:
            return response

        print('502 Error! Retrying in five seconds...')
        sleep(5)


# Initialised before the loop so the summary below also works when `appids`
# is empty.
count_games_scraped = 0

# Loops through all known app ids
for appid in appids:
    # Makes game subdirectory
    output_dir_current = f"{output_dir}{appid}/"
    makedirs(output_dir_current, exist_ok=True)

    # Instantiates cursor with starting cursor
    cursor = '*'

    # Instantiates string to hold previous cursor
    last_cursor = ''

    # Instantiates counter for number of jsons written for current app
    count_jsons_written = 0

    # Records when work on the current game began (used by the printout)
    start_time = time()

    # Gets all reviews for current app; stops once the cursor no longer
    # changes between two consecutive pages.
    while last_cursor != cursor:
        # Populates url template
        url = (
            "https://store.steampowered.com/appreviews/"    # Base query url
            + str(appid)    # Current app id
            + f"?cursor={cursor}"   # Position in review list
            + "&filter=recent"  # Sorts list for coherence
            + "&num_per_page=100"   # Specifies number of reviews per request. 100 is the maximum.
            + "&language=all"   # Removes language filter
            + "&json=1" # Specifies json as output format.
            )

        response = _fetch_reviews_page(url)

        if response is None:
            failed_requests.append(appid)
            print(f'Request unsuccessful for appid {appid}.\n Game appended to list of failed requests.')
            break

        status = response.status_code

        if status != 200:
            print(f'HTTP Error {status}!\nLoop interrupted.')
            failed_requests.append(appid)
            break

        # Parses the body once and reuses it below
        page = response.json()

        # Terminates loop if query is empty. A game whose very first page is
        # empty is recorded as failed.
        if page['query_summary']['num_reviews'] == 0:
            if count_jsons_written == 0:
                failed_requests.append(appid)
            break

        # Updates last cursor
        last_cursor = cursor

        # Updates cursor; encodes for url compatibility.
        cursor = quote_plus(page['cursor'])

        # Dumps current json
        with open(f"{output_dir_current}{count_jsons_written:08d}.json", "w") as file:
            dump(page, file)

        # Updates counter
        count_jsons_written += 1

        # Debug printout, at most once every five seconds
        if time() - time_of_last_print > 5:
            time_of_last_print = time()
            clear_output()
            print(f"game # {count_games_scraped}, appid: {appid}, request # (current game): {count_jsons_written}, processing time (current game, s) = {time()-start_time : .1f}")

    count_games_scraped += 1

# NOTE: the count below covers every game the loop processed, including any
# that were appended to failed_requests.
elapsed_seconds = int(time() - loop_start_time)
elapsed_hours, leftover_seconds = divmod(elapsed_seconds, 3600)
elapsed_minutes = leftover_seconds // 60

print(f"{count_games_scraped} games successfully scraped in {elapsed_hours} hours and {elapsed_minutes} minutes")

In [None]:
# Persists and reports the app ids that could not be scraped, if there are any
if failed_requests:
    failed_requests_path = output_dir + "failed_requests.csv"

    # One app id per row
    with open(failed_requests_path, "w", newline="") as csv_file:
        writer(csv_file).writerows([failed_appid] for failed_appid in failed_requests)

    print(
        f"Retrieval failed for {len(failed_requests)} games.\nApp ids written to '{output_dir}failed_requests.csv'."
    )