In [2]:
import os
import sqlite3
import time

import numpy as np
import pandas as pd
import requests

class StopExecution(Exception):
    """Exception raised at the top of a cell to stop the rest of it from running.

    NOTE(review): the `_render_traceback_` hook is presumably what lets
    IPython/Jupyter stop the cell without printing a traceback - confirm
    against the IPython version in use.
    """

    def _render_traceback_(self):
        # Deliberately produces nothing to render.
        return None

### IDs and Names

**Scrapes the unique ids and names of all Steam video games from Steam's API** 

The API URL is 'https://api.steampowered.com/IStoreService/GetAppList/v1/', where the ids and names are stored in the nested dictionary hierarchy response['response']['apps']. A maximum of 50,000 results can be retrieved per response. The API has a parameter called 'last_appid' which resumes data retrieval after the last recorded game (app) id, so the request is repeated until no more results are returned. The 'appid' column is renamed to 'id' (the game id) for a clearer naming convention. 

In [3]:
# Download the id and name of every Steam app, one page of up to 50,000
# results at a time, and assemble them into `idname_df`.

# The Steam Web API key is read from the environment so that the secret is
# never stored in (or committed with) the notebook.
STEAM_API_KEY = os.environ.get('STEAM_API_KEY')
if not STEAM_API_KEY:
    raise RuntimeError("Set the STEAM_API_KEY environment variable to your Steam Web API key")

gameListUrl = 'https://api.steampowered.com/IStoreService/GetAppList/v1/'
last_appid = 0
app_pages = [] # one data frame per API response; concatenated once after the loop

while True:

    params = {
        'key': STEAM_API_KEY,
        'max_results': '50000', # maximum of 50,000 retrievals per response
        'last_appid': last_appid} # will resume data retrieval from this id

    response = requests.get(gameListUrl, params=params, timeout=60)
    response.raise_for_status() # fail loudly on HTTP errors instead of parsing an error body
    payload = response.json().get('response', {})

    # An empty (or app-less) response means every page has been retrieved
    apps = payload.get('apps', [])
    if not apps:
        break

    page_df = pd.DataFrame(apps) # location of ids and names

    # Stop if the API did not move past the id we asked it to resume from;
    # otherwise the same page would be requested forever.
    newest_appid = int(page_df['appid'].max())
    if newest_appid <= last_appid:
        break

    app_pages.append(page_df)
    last_appid = newest_appid

# Build the full frame in one concat (growing a frame inside the loop copies
# all previously fetched rows on every iteration).
if app_pages:
    all_apps_df = pd.concat(app_pages, ignore_index=True)
else:
    all_apps_df = pd.DataFrame(columns=['appid', 'name'])

idname_df = (
    all_apps_df[['appid', 'name']]        # keep only the columns used downstream
    .rename(columns={'appid': 'id'})      # renaming appid to id
    .assign(release_date=np.nan, price=np.nan) # placeholder columns for future data insertion
)
idname_df.head()

Unnamed: 0,id,name,release_date,price
0,10,Counter-Strike,,
1,20,Team Fortress Classic,,
2,30,Day of Defeat,,
3,40,Deathmatch Classic,,
4,50,Half-Life: Opposing Force,,


**Converts the 'idname_df' data frame into a table called 'game' in a SQL file called 'steam_db.sqlite' for permanent storage**

If the SQL database already exists, this will compare the list of ids from the dataframe (just pulled from the API) to the ids from the SQL database to check if there are any new ids. The new ids will be added to the SQL database. 

In [18]:
# Persist `idname_df` into the TABLE 'game' of steam_db.sqlite.
# - First run: the table is created from the data frame.
# - Later runs: only rows whose id is not yet in the table are inserted.
conn = sqlite3.connect('steam_db.sqlite')
cur = conn.cursor()

try:
    # `with conn` commits on success and rolls back if anything below raises;
    # it does not close the connection, hence the surrounding try/finally.
    with conn:
        # Check for the table explicitly instead of treating any exception
        # from to_sql as "table already exists".
        table_exists = cur.execute(
            "SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = 'game'"
        ).fetchone() is not None

        if not table_exists:
            # Converts the 'idname_df' to a TABLE called 'game' in the steam_db.sqlite file
            idname_df.to_sql(name='game', con=conn, index=False)
        else:
            print("Table 'game' already exists")

            # Checking to see if there are differences in the lists of ids from the data frame and database
            steam_db_ids = {row[0] for row in cur.execute('SELECT id FROM game')}
            new_game_ids = set(idname_df['id']) - steam_db_ids
            print(f"Number of new games: {len(new_game_ids)}")

            # Subset of 'idname_df' holding the new ids (one row per id)
            new_game_df = (
                idname_df[idname_df['id'].isin(new_game_ids)]
                .drop_duplicates(subset='id')
            )

            # Missing values are sent as None so they are stored as NULL
            new_rows = [
                (
                    int(game_id),
                    None if pd.isna(name) else name,
                    None if pd.isna(release_date) else release_date,
                    None if pd.isna(price) else price,
                )
                for game_id, name, release_date, price in new_game_df[
                    ['id', 'name', 'release_date', 'price']
                ].itertuples(index=False, name=None)
            ]

            # Insert the id, name, date, and price of the new games into the database file.
            # Columns are named so the insert does not depend on column order.
            cur.executemany(
                "INSERT INTO game (id, name, release_date, price) VALUES (?,?,?,?)",
                new_rows,
            )

        # Create an index on the COLUMN 'id' for quicker data retrieval
        # (done on both paths so a freshly created table is indexed too)
        cur.execute("CREATE UNIQUE INDEX IF NOT EXISTS idx_game_id ON game(id)")
finally:
    conn.close()

Table 'game' already exists
Number of new games: 0


### Release Dates and Prices

**Scrapes the release dates and prices of video games from Steam's API**

The API URL with release dates and prices is 'https://store.steampowered.com/api/appdetails/'. This code chunk will update the rows of the database that have missing values in the release date and price columns. This process will be slow because the API is rate limited: after roughly 200 requests it stops answering normally, and a pause of about 5 minutes is needed before requests succeed again.

In [19]:
# Fill in 'release_date' and 'price' for rows of TABLE 'game' that still have
# NULL values, one appdetails request per game.

APPDETAILS_URL = 'https://store.steampowered.com/api/appdetails/'
RETRY_PAUSE_SECONDS = {429: 30, 403: 10} # status code -> seconds to wait before retrying
MAX_RETRIES = 20        # upper bound on consecutive retries for one game
REQUEST_TIMEOUT = 30    # seconds before a single request is abandoned


def _nested_value(payload, *keys):
    """Return payload[keys[0]][keys[1]]...; -1 if any level is missing or not a dict."""
    current = payload
    for key in keys:
        if not isinstance(current, dict) or key not in current:
            return -1
        current = current[key]
    return current


conn = sqlite3.connect('steam_db.sqlite')
cur = conn.cursor()

try:
    with conn:
        game_pragma = cur.execute('PRAGMA table_info(game)').fetchall()
        game_columns = [col[1] for col in game_pragma]

        # Grab all the data from the SQL TABLE 'game'
        steam_db_game = cur.execute('SELECT * FROM game').fetchall()

        # Converts the SQL TABLE 'game' into a dataframe indexed by the game id
        game_df = pd.DataFrame(steam_db_game, columns=game_columns).set_index('id')

        # Grab a subset of the dataframe that has missing data in any of the features/columns
        incomplete_game_df = game_df[game_df.isnull().any(axis=1)]
        ids = incomplete_game_df.index # game ids of the incomplete rows


        print(f"Number of rows with any NULL values: {len(incomplete_game_df)}")
        entry_count = int(input("How many rows would you like to update?"))

        update_query = 'UPDATE game SET release_date = ?, price = ? WHERE id = ?'

        entries = 1
        for app_id in ids:

            if entries > entry_count:
                break

            params = {'appids': app_id}
            response = requests.get(APPDETAILS_URL, params=params, timeout=REQUEST_TIMEOUT)

            # Pauses the script due to too many requests to the API.
            # The number of retries is bounded so a persistent block cannot hang the cell forever.
            retries = 0
            while response.status_code in RETRY_PAUSE_SECONDS and retries < MAX_RETRIES:
                pause = RETRY_PAUSE_SECONDS[response.status_code]
                print('Status Code', response.status_code, f'-> Pausing for {pause} seconds')
                time.sleep(pause)
                response = requests.get(APPDETAILS_URL, params=params, timeout=REQUEST_TIMEOUT)
                retries += 1

            if response.status_code in RETRY_PAUSE_SECONDS:
                print(f"Status Code {response.status_code} after {MAX_RETRIES} retries -> stopping early")
                break

            # Any other failure is treated as transient: the row is left NULL so a
            # later run retries it, instead of permanently recording -1.
            if response.status_code != 200:
                print(f"App ID: {app_id} -> Status Code {response.status_code}, skipping")
                continue

            ### The JSON file from the API will occasionally encounter key/value errors

            # Parse the body once; an unparseable body is handled like missing data
            try:
                payload = response.json()
            except ValueError:
                payload = None

            # Missing keys fall back to -1 for both fields
            release_date = _nested_value(payload, str(app_id), 'data', 'release_date', 'date')
            price = _nested_value(payload, str(app_id), 'data', 'price_overview', 'final_formatted')

            print(f"Entries: {entries}, App ID: {app_id}, Release Date: {release_date}, Price: {price}")


            # Updating SQL table 'game' with new 'release_date' and 'price' data
            cur.execute(update_query, (release_date, price, app_id))

            entries += 1

            # Commit after every row so progress survives an interruption
            conn.commit()
finally:
    # `with conn` only commits/rolls back; the connection is closed here
    conn.close()


print("---Entry Complete---")

Number of rows with any NULL values: 1112
Entries: 1, App ID: 299080, Release Date: Sep 7, 2023, Price: -1
Entries: 2, App ID: 436780, Release Date: Q3 2024, Price: -1
Entries: 3, App ID: 586290, Release Date: 2024, Price: -1
Entries: 4, App ID: 619530, Release Date: To be announced, Price: -1
Entries: 5, App ID: 648950, Release Date: To be announced, Price: -1
Entries: 6, App ID: 667030, Release Date: Aug 30, 2018, Price: $6.99
Entries: 7, App ID: 703680, Release Date: Coming soon, Price: -1
Entries: 8, App ID: 803130, Release Date: Dec 12, 2018, Price: $4.99
Entries: 9, App ID: 845160, Release Date: October 2023, Price: -1
Entries: 10, App ID: 881020, Release Date: Jan 31, 2024, Price: -1
Entries: 11, App ID: 975630, Release Date: To be announced, Price: -1
Entries: 12, App ID: 1005220, Release Date: Coming soon, Price: -1
Entries: 13, App ID: 1118830, Release Date: To be announced, Price: -1
Entries: 14, App ID: 1147700, Release Date: Coming soon, Price: -1
Entries: 15, App ID: 1154

In [124]:
# Guard: this cell is a one-off copy between two database files. The raise
# stops the cell immediately, so nothing below runs unless it is removed.
raise StopExecution

# NOTE(review): machine-specific absolute paths - adjust before running elsewhere.
SOURCE_DB = r"C:\Users\xuqc0\Documents\XUQC01\WORK\Projects\Steam_Games-Predicting_Success\steam_db.sqlite"
DESTINATION_DB = r"C:\Users\xuqc0\Documents\XUQC01\WORK\Projects\Predicting_the_Success_of_Steam_Games\steam_db.sqlite"

source_conn = sqlite3.connect(SOURCE_DB)
destination_conn = sqlite3.connect(DESTINATION_DB)

try:
    rows = source_conn.execute("SELECT * FROM game").fetchall()

    # Positions 0, 1, 4 and 5 of each source row are copied as
    # (id, name, date, price).
    # NOTE(review): assumes the source table has extra columns at positions
    # 2 and 3 that are not wanted in the destination - confirm.
    data = [(row[0], row[1], row[4], row[5]) for row in rows]

    # Only the destination is written to; `with` commits on success and
    # rolls back if an insert fails.
    with destination_conn:
        destination_conn.executemany("INSERT INTO game VALUES (?, ?, ?, ?)", data)
finally:
    # `with` does not close sqlite3 connections, so close both explicitly
    source_conn.close()
    destination_conn.close()


In [116]:
# Create the (untyped) TABLE 'game' in steam_db.sqlite if it is not there yet.
conn = sqlite3.connect('steam_db.sqlite')
cur = conn.cursor()

try:
    # `with conn` commits on success and rolls back on error.
    with conn:
        # IF NOT EXISTS keeps the cell re-runnable once the table has been created
        cur.execute('CREATE TABLE IF NOT EXISTS game (id, name, release_date, price)')
finally:
    conn.close()