### Data Collection for Big Data Final Project
Latifa Tan </br>
11/02/2024 </br>
Run on Python 3.9.12 (Anaconda)

### Data Sources Notes
SteamDB (anti-scraping protection; not used): https://steamdb.info/app/367520/ </br>
Steam API Documentation: </br>
1) API Key Registration: https://steamcommunity.com/dev/apikey </br>
2) https://developer.valvesoftware.com/wiki/Steam_Web_API#GetNewsForApp_.28v0001.29 </br>
3) https://steamcommunity.com/dev </br>
4) Sample return (SteamSpy `appdetails` request): https://steamspy.com/api.php?request=appdetails&appid=10 </br>

Notes:</br>
1. GetGlobalAchievementPercentagesForApp (v0002) can be used to get the achievements of each game together with their global completion percentages.

Steam
SteamSpy
1) Documentation: https://steamspy.com/api.php </br>
2) Sample Return: https://steamspy.com/api.php?request=appdetails&appid=10 </br>

Steam Reviews source: https://www.kaggle.com/datasets/kieranpoc/steam-reviews/data

In [None]:
# all initial settings
import pandas as pd
import csv
import os
import time
import requests
import pandas as pd
import json
from datetime import datetime, timedelta

# Load the app IDs to collect from the 'Game ID' column of unique_game_id.csv
# (the file is read from the current working directory).
app_ids = pd.read_csv('unique_game_id.csv')['Game ID'].tolist()

In [None]:
# Function to split list into chunks of a specified size
def split_list(app_ids, chunk_size=100):
    """Partition ``app_ids`` into consecutive slices of at most ``chunk_size`` items.

    The last slice holds whatever is left over, so it may be shorter than
    ``chunk_size``. An empty input yields an empty list.
    """
    chunks = []
    for start in range(0, len(app_ids), chunk_size):
        stop = start + chunk_size
        chunks.append(app_ids[start:stop])
    return chunks

# Split the app ID list into chunks of 100
app_id_chunks = split_list(app_ids, 100)

# Print to verify
#print(f"Total chunks created: {len(app_id_chunks)}")  # Should be about 182
print(app_id_chunks[45])  # Chunk at index 45 (i.e. the 46th chunk), up to 100 app IDs


[1311660, 1311700, 1311740, 1311830, 1312740, 1312960, 1312990, 1313, 1313020, 1313050, 1313140, 1313310, 1313470, 1313860, 1313940, 13140, 1314000, 1314010, 1314140, 1314160, 1314290, 1314460, 1314470, 1314563, 1314620, 1314630, 1314761, 1314764, 1314765, 1315160, 1315200, 1315210, 1315270, 1315300, 1315350, 1315610, 1315630, 1315750, 1315751, 1315860, 1315980, 1316060, 1316110, 1316160, 1316230, 1316560, 1316700, 1316760, 1316840, 1316910, 1317020, 1317080, 1317120, 1317160, 1317230, 1317250, 1318030, 1318090, 1318130, 1318420, 1318560, 1318690, 1318740, 1318750, 1318790, 1318940, 1318980, 1319460, 1319850, 1320100, 1320550, 1320950, 1321030, 1321040, 1321070, 1321220, 1321230, 1321270, 1321440, 1321450, 1321680, 1321920, 1322170, 1322270, 1322290, 1322300, 1322310, 1322490, 1322540, 1322600, 1322650, 1322770, 1323200, 1323320, 1323420, 1323470, 1323540, 1323620, 1323900, 1324000]


In [None]:
# Shared state for pacing API requests (responses are expected to be JSON).
# Requests are counted in fixed time windows ("buckets"); get_request() and
# reset_bucket() read and update the values below.
bucket_timeframe = timedelta(minutes=5)  # length of one window
bucket_limit = 200  # request budget for one window
# Counter and start time of the window currently in progress.
request_count, bucket_start_time = 0, datetime.now()

def reset_bucket():
    """Begin a new rate-limit window: zero the counter and restart its clock."""
    global request_count, bucket_start_time
    request_count, bucket_start_time = 0, datetime.now()

def _retry_after_seconds(response, default):
    """Return the delay from the ``Retry-After`` header in seconds, or ``default``.

    ``default`` is used when the header is missing or is not a plain
    non-negative integer (the header may also carry an HTTP date, which is
    not parsed here).
    """
    raw_value = response.headers.get("Retry-After")
    try:
        seconds = int(raw_value)
    except (TypeError, ValueError):
        return default
    return seconds if seconds >= 0 else default


def get_request(url, parameters=None, max_retries=5, backoff_factor=10, timeout=30):
    """Make a GET request with rate limit management and return the JSON response.

    Args:
        url: Address to request.
        parameters: Optional query parameters forwarded to ``requests.get``.
        max_retries: Maximum number of attempts before giving up.
        backoff_factor: Base delay in seconds; the wait after the n-th failed
            attempt is ``backoff_factor * n``.
        timeout: Seconds to wait for the server before the attempt is
            treated as failed, so a stalled connection cannot block forever.

    Returns:
        The decoded JSON body, or ``None`` when every attempt failed.
    """
    global request_count

    for attempt in range(1, max_retries + 1):
        # Check if the bucket has expired
        if datetime.now() - bucket_start_time > bucket_timeframe:
            reset_bucket()

        # If request limit is reached, wait until the next bucket
        if request_count >= bucket_limit:
            remaining = (bucket_start_time + bucket_timeframe) - datetime.now()
            # Clamp at zero: time.sleep() rejects negative values.
            wait_seconds = max(0.0, remaining.total_seconds())
            print(f"Rate limit reached. Waiting {wait_seconds} seconds for next bucket...")
            print(request_count)
            time.sleep(wait_seconds)
            reset_bucket()

        delay = backoff_factor * attempt
        # Count every attempt, not only successful ones, so that failed and
        # throttled requests are paced by the bucket as well.
        request_count += 1

        try:
            response = requests.get(url, params=parameters, timeout=timeout)
            response.raise_for_status()  # Raise HTTPError for bad responses (4XX, 5XX)
            return response.json()  # Return JSON if successful
        except requests.exceptions.HTTPError as e:
            if response.status_code == 429:
                # Honour the server's hint instead of adding it on top of the backoff.
                delay = _retry_after_seconds(response, delay)
                print(f"Error: {e}. Too Many Requests. Retrying in {delay} seconds...")
            else:
                print(f"HTTP Error: {e}. Retrying in {delay} seconds...")
        except requests.exceptions.RequestException as e:
            print(f"Request Error: {e}. Retrying in {delay} seconds...")
        except ValueError as e:
            # Body was not valid JSON (older requests versions raise a plain ValueError).
            print(f"Invalid JSON: {e}. Retrying in {delay} seconds...")
        print(request_count)

        # No point in sleeping after the last attempt.
        if attempt < max_retries:
            time.sleep(delay)

    # If max retries reached, log the failure and return None
    print(f"Failed to retrieve data from {url} after {max_retries} attempts.")
    print(request_count)
    return None

In [None]:
# fetch data from APIs
def parse_steam_request(appid):
    """Fetch the Steam store ``appdetails`` entry for ``appid``.

    Returns:
        The dictionary stored under ``str(appid)`` in the response, or an
        empty dict when the request failed (``get_request`` returned
        ``None``), the response is not a JSON object, or the entry is
        missing or null. Callers can therefore always use ``.get`` on the
        result.
    """
    url = f"https://store.steampowered.com/api/appdetails/?appids={appid}"
    json_data = get_request(url)
    if not isinstance(json_data, dict):
        # get_request gives None after exhausting its retries.
        return {}
    return json_data.get(str(appid)) or {}

def parse_steamspy_request(appid):
    """Query the SteamSpy ``appdetails`` request for ``appid``.

    The result of ``get_request`` is passed through unchanged.
    """
    endpoint = f"https://steamspy.com/api.php?request=appdetails&appid={appid}"
    payload = get_request(endpoint)
    return payload

In [None]:
# Load existing JSON data if available, otherwise return an empty dictionary
def load_json_data(filename):
    """Return the parsed JSON content of ``filename``, or ``{}`` if it does not exist."""
    if not os.path.exists(filename):
        return {}
    with open(filename, 'r', encoding='utf-8') as handle:
        return json.load(handle)

# Append new data to an existing JSON file
def save_to_json(new_data, filename):
    """Merge ``new_data`` into the JSON object stored in ``filename``.

    Existing keys are overwritten by ``new_data``; the file is created when
    it does not exist yet.

    The merged content is first written to ``<filename>.tmp`` and then moved
    over the target with ``os.replace``. An interruption during the write
    therefore cannot leave a truncated ``filename`` behind and destroy the
    data collected so far.
    """
    data = load_json_data(filename)  # {} when the file does not exist yet
    data.update(new_data)  # Update with new data

    tmp_filename = f"{filename}.tmp"
    with open(tmp_filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4, ensure_ascii=False)
    os.replace(tmp_filename, filename)

# Main function to process and save data in chunks
def main(chunk):
    """Collect Steam and SteamSpy data for every app ID in ``chunk`` and save it.

    Results are merged into ``steam_data.json`` and ``steamspy_data.json``.
    Saving happens in a ``finally`` clause, so whatever was collected before
    an error or a manual interrupt is still written to disk.

    Args:
        chunk: Iterable of app IDs; each one is converted to ``str`` and used
            as the key in the output files.
    """
    steam_data = {}
    steamspy_data = {}

    try:
        for appid in chunk:
            appid = str(appid)
            print(f"Processing AppID: {appid}")

            # Steam API Data
            steam_info = parse_steam_request(appid)
            # Guard against an empty/None result and a missing "data" key.
            if steam_info and steam_info.get("success") and "data" in steam_info:
                steam_data[appid] = steam_info["data"]

            # SteamSpy API Data
            steamspy_info = parse_steamspy_request(appid)
            # Skip failed requests: storing None would overwrite an entry
            # saved by an earlier run with null.
            if steamspy_info is not None:
                steamspy_data[appid] = steamspy_info
    finally:
        # Append the results from this chunk to the JSON files
        save_to_json(steam_data, 'steam_data.json')
        save_to_json(steamspy_data, 'steamspy_data.json')

In [None]:
# Run main() on one chunk per execution of this cell.
# Original note: 274 chunks in total.
# NOTE(review): the chunking cell estimates about 182 chunks — confirm with len(app_id_chunks).
# Original note: app_id_chunks[0-54] — presumably the range of indices run so far; this call uses index 54.
main(app_id_chunks[54])

Processing AppID: 1395730
Processing AppID: 1395760
Processing AppID: 1395850
Processing AppID: 1396150
Processing AppID: 1396240
Processing AppID: 1396260
Processing AppID: 1396440
Processing AppID: 1396470
Processing AppID: 1396471
Processing AppID: 1396472
Processing AppID: 1396473
Processing AppID: 1396474
Processing AppID: 1396610
Processing AppID: 1396780
Processing AppID: 1396890
Processing AppID: 1396980
Processing AppID: 1397010
Processing AppID: 1397110
Processing AppID: 1397240
Processing AppID: 1397290
Processing AppID: 1397350
Processing AppID: 1397370
Processing AppID: 1397380
Processing AppID: 1397390
Processing AppID: 1397490
Processing AppID: 1397620
Processing AppID: 1397650
Processing AppID: 1397790
Processing AppID: 1397920
Processing AppID: 1398050
Processing AppID: 1398060
Processing AppID: 1398070
Processing AppID: 1398100
Processing AppID: 1398210
Processing AppID: 1398260
Processing AppID: 1398390
Processing AppID: 1398740
Processing AppID: 1398770
Processing A