<a href="https://colab.research.google.com/github/tbbye/Steam-Data-Collection/blob/main/Steam_Store_Page.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import os
import pandas as pd
import requests
import csv
import time
import numpy as np
import statistics
import datetime as dt

# Display customization: raise pandas' "display.max_columns" option to 100 so
# that wide tables are shown with all their columns.
pd.set_option("display.max_columns", 100)

def get_request(url, parameters=None, max_retries=10, timeout=30):
    """Return the decoded JSON body of a GET request, retrying on failure.

    The request is retried after an SSL error, a timeout, or an unsuccessful
    HTTP status. Retries run in a loop rather than by recursion, so a
    persistently failing URL can no longer exhaust the interpreter's
    recursion limit.

    Args:
        url: Address to request.
        parameters: Optional dict of query-string parameters.
        max_retries: Number of retries after the first attempt. Pass ``None``
            to retry indefinitely.
        timeout: Seconds to wait for the server before the attempt is
            treated as failed, so a stalled connection cannot hang the run.

    Returns:
        The parsed JSON payload, or ``None`` when every attempt failed.
    """
    attempt = 0
    while True:
        response = None
        try:
            response = requests.get(url=url, params=parameters, timeout=timeout)
        except requests.exceptions.SSLError as s:
            print('SSL Error:', s)
        except requests.exceptions.Timeout as t:
            print('Timeout:', t)

        # A requests.Response is truthy only for a successful status code,
        # so this also rejects error responses such as rate limiting.
        if response:
            return response.json()

        if max_retries is not None and attempt >= max_retries:
            print('Giving up on {} after {} attempts.'.format(url, attempt + 1))
            return None
        attempt += 1

        if response is None:
            # The request itself raised: short visible countdown, then retry.
            for i in range(5, 0, -1):
                print('\rWaiting... ({})'.format(i), end='')
                time.sleep(1)
            print('\rRetrying.' + ' '*10)
        else:
            # An error status usually means too many requests; back off longer.
            print('No response, waiting 10 seconds...')
            time.sleep(10)
            print('Retrying.')

# App IDs to download, entered by hand (extend the list as needed)
manual_app_ids = ["730", "570", "440"]  # Add more app IDs as needed

# Tabular form of the IDs: one row per app, with a blank 'name' for each
manual_app_list = pd.DataFrame({
    'appid': manual_app_ids,
    'name': [''] * len(manual_app_ids),
})

def get_app_data(start, stop, parser, pause, app_list=None):
    """Run ``parser`` over the rows at positions ``start`` to ``stop - 1``.

    Args:
        start: Position of the first row to process.
        stop: Position one past the last row to process.
        parser: Callable invoked as ``parser(appid, name)`` for each row; its
            return value is collected as that row's record.
        pause: Seconds to sleep after each parser call, to avoid overloading
            the API with requests.
        app_list: DataFrame with ``appid`` and ``name`` columns. When omitted,
            the module-level ``manual_app_list`` is used, which keeps existing
            four-argument calls working.

    Returns:
        A list holding one parser result per processed row, in row order.
    """
    if app_list is None:
        app_list = manual_app_list

    app_data = []

    # iterate through each row of app_list, confined by start and stop
    for index, row in app_list.iloc[start:stop].iterrows():
        print('Current index: {}'.format(index), end='\r')

        # retrieve app data for the row via the supplied parser
        app_data.append(parser(row['appid'], row['name']))

        time.sleep(pause)  # prevent overloading api with requests

    return app_data


def process_batches(parser, app_list, download_path, data_filename, index_filename,
                    columns, begin=0, end=-1, batchsize=100, pause=1, countdown=3):
    """Download app data in batches, appending each batch to a CSV file.

    After every batch the position of the next unprocessed row is written to
    the index file, so an interrupted run can be resumed from that position.

    Args:
        parser: Callable ``parser(appid, name)`` returning a dict per app.
        app_list: DataFrame with ``appid`` and ``name`` columns to process.
        download_path: Directory containing the data and index files.
        data_filename: Name of the CSV file that rows are appended to.
        index_filename: Name of the file that stores the resume position.
        columns: CSV field names; keys not listed here are dropped.
        begin: Position of the first row to process.
        end: Position one past the last row to process. ``-1`` (or any value
            beyond the list) means "through the end of ``app_list``".
        batchsize: Number of rows fetched between writes.
        pause: Seconds to sleep after each parser call.
        countdown: Number of half-second warning ticks printed before each
            write. ``0`` disables the warning delay.

    Raises:
        ValueError: If ``batchsize`` is smaller than 1.
    """
    if batchsize < 1:
        raise ValueError('batchsize must be at least 1')

    print('Starting at index {}:\n'.format(begin))

    # Never run past the list: an overshooting end would be saved to the
    # index file and make a later resume skip rows.
    total = len(app_list)
    if end == -1 or end > total:
        end = total

    data_path = os.path.join(download_path, data_filename)
    idx_path = os.path.join(download_path, index_filename)
    batch_starts = range(begin, end, batchsize)

    apps_written = 0
    batch_times = []

    for i, start in enumerate(batch_starts):
        start_time = time.time()
        stop = min(start + batchsize, end)

        app_data = get_app_data(start, stop, parser, pause, app_list)

        # Warn before touching the file so the user does not interrupt mid-write.
        for j in range(countdown, 0, -1):
            print("\rAbout to write data, don't stop script! ({})".format(j), end='')
            time.sleep(0.5)

        with open(data_path, 'a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=columns, extrasaction='ignore')
            writer.writerows(app_data)
        print('\rExported lines {}-{} to {}.'.format(start, stop - 1, data_filename), end=' ')

        apps_written += len(app_data)

        # Record where the next run should pick up.
        with open(idx_path, 'w') as f:
            print(stop, file=f)

        time_taken = time.time() - start_time
        batch_times.append(time_taken)
        mean_time = statistics.mean(batch_times)

        # Batches still to run after this one, priced at the running average.
        est_remaining = (len(batch_starts) - i - 1) * mean_time

        remaining_td = dt.timedelta(seconds=round(est_remaining))
        time_td = dt.timedelta(seconds=round(time_taken))
        mean_td = dt.timedelta(seconds=round(mean_time))

        print('Batch {} time: {} (avg: {}, remaining: {})'.format(i, time_td, mean_td, remaining_td))

    print('\nProcessing batches complete. {} apps written'.format(apps_written))

# --- Start of added helper functions ---

def reset_index(download_path, index_filename):
    """Delete the index file if it exists and report what was done."""
    target = os.path.join(download_path, index_filename)

    # Nothing on disk: report it and leave.
    if not os.path.exists(target):
        print(f'Index file {index_filename} not found, nothing to reset.')
        return

    os.remove(target)
    print(f'Index file {index_filename} reset.')

def get_index(download_path, index_filename):
    """Read the saved index from the index file.

    Returns 0 when the file does not exist or its content is not an integer
    (for example when it is empty or corrupted).
    """
    source = os.path.join(download_path, index_filename)

    if os.path.exists(source):
        with open(source, 'r') as handle:
            raw = handle.read()
        try:
            return int(raw.strip())
        except ValueError:
            # Unreadable content: start over from the first row.
            return 0

    return 0

def prepare_data_file(download_path, data_filename, index, columns):
    """Make sure the CSV data file exists and starts with a header row.

    The download directory is created if needed. The file is (re)created with
    a header when ``index`` is 0, and also when a resumed run (``index`` > 0)
    finds the data file missing or empty, so rows appended later are never
    written to a header-less CSV. Otherwise the existing file is left as is.

    Args:
        download_path: Directory that holds the data file.
        data_filename: Name of the CSV file inside ``download_path``.
        index: Resume position; 0 means start from scratch.
        columns: Field names written as the header row.
    """
    os.makedirs(download_path, exist_ok=True)
    rel_path = os.path.join(download_path, data_filename)

    needs_header = (
        index == 0
        or not os.path.exists(rel_path)
        or os.path.getsize(rel_path) == 0
    )

    # 'w' only ever truncates a file that is being restarted or is already
    # empty; existing data is opened in append mode and left untouched.
    with open(rel_path, 'w' if needs_header else 'a', newline='', encoding='utf-8') as f:
        if needs_header:
            writer = csv.DictWriter(f, fieldnames=columns, extrasaction='ignore')
            writer.writeheader()
            print(f'Created/Wiped {data_filename} and wrote headers.')
        else:
            print(f'Appending to {data_filename}.')

def parse_steam_request(appid, name):
    """Fetch and parse the store details of a single app.

    Args:
        appid: App ID to look up; also used as the key into the response.
        name: Name known to the caller. It is used when it is non-empty or
            when the response carries no name of its own, so a blank
            placeholder no longer erases the name returned by the API.

    Returns:
        The app's ``data`` dict from the response, with ``steam_appid`` and
        ``name`` guaranteed to be present. When there is no response or the
        response cannot be parsed, a minimal dict holding only
        ``steam_appid`` and ``name`` is returned instead.
    """
    url = f"http://store.steampowered.com/api/appdetails?appids={appid}"
    response_json = get_request(url)

    # Minimal record, so a failed lookup still yields a row for this app.
    fallback = {'steam_appid': appid, 'name': name}

    if response_json is None:
        print(f"No response for appid {appid}. Skipping.")
        return fallback

    try:
        # The response is expected to be a dict keyed by the appid, with the
        # details under 'data'.
        app_data = response_json[str(appid)]['data']
    except (KeyError, TypeError):
        # Missing keys, or an entry that is not a mapping at all.
        app_data = None

    if not isinstance(app_data, dict):
        print(f"Could not parse data for appid {appid}. Response: {response_json}. Skipping.")
        return fallback

    app_data['steam_appid'] = appid  # Ensure steam_appid is always present
    if name or 'name' not in app_data:
        app_data['name'] = name  # Ensure name is always present
    return app_data

# --- End of added helper functions ---

# File parameters: directory for downloads, plus the names of the CSV data
# file and of the text file that stores the resume index
download_path = '../data/download'
steam_app_data = 'steam_app_data.csv'
steam_index = 'steam_index.txt'

# Columns written to the CSV. The writers are created with
# extrasaction='ignore', so keys of a parsed record that are not listed here
# are dropped.
steam_columns = [
    'type', 'name', 'steam_appid', 'required_age', 'is_free', 'controller_support',
    'dlc', 'detailed_description', 'about_the_game', 'short_description', 'fullgame',
    'supported_languages', 'header_image', 'website', 'pc_requirements', 'mac_requirements',
    'linux_requirements', 'legal_notice', 'drm_notice', 'ext_user_account_notice',
    'developers', 'publishers', 'demos', 'price_overview', 'packages', 'package_groups',
    'platforms', 'metacritic', 'reviews', 'categories', 'genres', 'screenshots',
    'movies', 'recommendations', 'achievements', 'release_date', 'support_info',
    'background', 'content_descriptors',
]

# Deletes the stored index for demonstration, so this run starts from 0.
# Skip this call to keep the stored index and continue across sessions.
reset_index(download_path, steam_index)

# Retrieve the index stored by the last run (0 if there is no index file)
index = get_index(download_path, steam_index)

# Create the download directory if needed; when index is 0, wipe or create
# the data file and write the header row
prepare_data_file(download_path, steam_app_data, index, steam_columns)

# Download from the stored index through the end of the manual app list.
# end and batchsize are set explicitly for demonstration; pause is not
# passed, so its default applies.
process_batches(
    parser=parse_steam_request,
    app_list=manual_app_list,
    download_path=download_path,
    data_filename=steam_app_data,
    index_filename=steam_index,
    columns=steam_columns,
    begin=index,
    end=len(manual_app_list),
    batchsize=5
)


Index file steam_index.txt reset.
Created/Wiped steam_app_data.csv and wrote headers.
Starting at index 0:

Exported lines 0-2 to steam_app_data.csv. Batch 0 time: 0:00:05 (avg: 0:00:05, remaining: 0:00:00)

Processing batches complete. 3 apps written
