<p style="font-size:21px"><b>Data Acquisition</b></p>

<b>To acquire the data I needed for this project I used the Steam Web API along with the requests library.</b> <br><br> Steam has many APIs, each returning information on a different set of features. To acquire storefront data for each game in the Steam Store I used the appdetails API, which requires an appid as a parameter.

<br>
To retrieve a list of appids from the store I used the ISteamApps API to get each game's id and name, and then used those ids to obtain game information from the Storefront API. <br><br> The data was returned as JSON with 'applist' as the top-level key and 'apps' nested inside it. I extracted this data, placed it into a dataframe, and stored it as a csv to access later.

In [1]:
# library imports
import csv
import datetime as dt
import json
import os
import statistics
import time
import numpy as np
import pandas as pd
import requests

# customisations - ensure tables show all columns
# Use the full option name: the bare "max_columns" pattern also matches
# "styler.render.max_columns" in newer pandas and raises an OptionError.
pd.set_option("display.max_columns", 100)

In [2]:
url = 'https://api.steampowered.com/ISteamApps/GetAppList/v2/'
# This API only returns each app's name and id
# A timeout stops the cell hanging forever on a stalled connection
r = requests.get(url, timeout=60)
# Fail here on an HTTP error rather than on a confusing JSON/KeyError later
r.raise_for_status()
json_ = r.json()

In [6]:
# Peek at one entry to confirm each app is a dict holding 'appid' and 'name'
print(json_['applist']['apps'][50])

{'appid': 1506220, 'name': 'Fantasy Grounds - Sherwood: The Legend of Robin Hood'}


In [None]:
# Collect the appid and name of every app into two parallel lists.
# A missing key is recorded as the string 'None' so both lists stay the same
# length; those placeholder rows are dropped in the next cell.
apps = json_['applist']['apps']

GameID = {
    'appid': [item.get('appid', 'None') for item in apps],
    'name': [item.get('name', 'None') for item in apps],
}

GameIDs = pd.DataFrame(GameID)

In [None]:
# Drop the 'None' placeholder rows BEFORE sorting: a column that mixes ints
# with the string 'None' cannot be sorted and would raise a TypeError.
is_placeholder = (GameIDs['appid'] == 'None') | (GameIDs['name'] == 'None')
GameIDs = GameIDs[~is_placeholder].sort_values(by='appid').reset_index(drop=True)
# Uncomment to (re)write the id list that is read back in as app_list later on
#GameIDs.to_csv('data/gameids.csv', index=False)

There are around 100k appids within the store so it took me a while to scrape the game data. In order to scrape this over a period of time without losing progress I used a few functions that will make an api call and then write the results to csv.

In [2]:
# Defining get request function that will handle exceptions

def get_request(url, parameters=None, timeout=30):
    """Return json-formatted response of a get request using parameters.

    Timeouts, connection errors and unsuccessful responses are retried after
    a pause until a successful response arrives, so this call can block for
    a long time.

    Parameters
    ----------
    url : string
    parameters : {'parameter': 'value'}
    timeout : seconds to wait for the server before the attempt counts as
        timed out (default 30)

    Returns
    -------
    json_data
        json-formatted response (dict-like)

    Raises
    ------
    KeyboardInterrupt
        re-raised so that interrupting the notebook really stops the scrape
    requests.RequestException
        any request error other than a timeout or connection error
    """
    # Loop instead of recursing so a long outage cannot exhaust the call stack
    while True:
        try:
            # Without a timeout requests waits indefinitely and the Timeout
            # handler below could never fire
            response = requests.get(url=url, params=parameters, timeout=timeout)

        except requests.Timeout as errt:
            print('Timeout Error:', errt)
            # too many requests, pause and try again
            print('Waiting 5 seconds')
            time.sleep(5)
            continue

        except requests.ConnectionError as errc:
            print('Connection Error:', errc)
            # connection issue, pause and try again
            print('Waiting 25 seconds')
            time.sleep(25)
            continue

        except KeyboardInterrupt as errk:
            print('Program stopped:', errk)
            raise

        except requests.RequestException as erre:
            # Not known to be transient (e.g. a malformed URL), so surface it
            # instead of retrying forever
            print('General Error:', erre)
            raise

        # A Response is falsy when its status code signals an error (4xx/5xx)
        if response:
            return response.json()

        # an error response usually means too many requests, pause and try again
        print('No response, waiting 5 mins')
        time.sleep(300)
        print('Retrying')

In [4]:
def get_app_data(start, stop, parser, pause, apps=None):
    """Return app data for a slice of the app list, one record per row.

    Parameters
    ----------
    start, stop : row positions bounding the slice (stop is exclusive)
    parser : function called as parser(appid, name) that returns the record
        for one app (e.g. parse_steam_request)
    pause : seconds to sleep after each request
    apps : dataframe with 'appid' and 'name' columns; when omitted the
        notebook-level app_list is used, as before

    Returns
    -------
    app_data
        list with one parsed record per row, in row order
    """
    # Existing callers do not pass apps, so fall back to the global app_list
    source = app_list if apps is None else apps

    app_data = []

    # iterate through each row of the app list, confined by start and stop
    for index, row in source[start:stop].iterrows():
        print('Current index: {}'.format(index), end='\r')

        # The parser owns the request and response handling; the previous
        # inline copy of that logic ignored the parser argument entirely
        app_data.append(parser(row['appid'], row['name']))

        time.sleep(pause)  # prevent overloading api with requests

    return app_data


def process_batches(parser, app_list, download_path, data_filename, index_filename,
                    columns, begin=0, end=-1, batchsize=100, pause=1):
    """Process app data in batches, writing directly to file.

    After every batch the rows are appended to the data file and the index
    file is overwritten with the next index to process, so an interrupted
    run can be resumed from that index.

    parser : custom function to format request
    app_list : dataframe of appid and name
    download_path : path to store data
    data_filename : filename to save app data
    index_filename : filename to store highest index written
    columns : column names for file

    Keyword arguments:

    begin : starting index (get from index_filename, default 0)
    end : index to finish, exclusive (defaults to end of app_list)
    batchsize : number of apps to write in each batch (default 100)
    pause : time to wait after each api request (default 1)

    returns: none
    """
    print('Starting at index {}:\n'.format(begin))

    # by default, process all apps in app_list; end is exclusive, so the
    # length itself is the right bound (len + 1 overshot by one row)
    if end == -1:
        end = len(app_list)

    # generate array of batch begin and end points
    batches = np.arange(begin, end, batchsize)
    batches = np.append(batches, end)

    # both paths are the same for every batch
    rel_path = os.path.join(download_path, data_filename)
    idx_path = os.path.join(download_path, index_filename)

    apps_written = 0
    batch_times = []

    for i, (start, stop) in enumerate(zip(batches[:-1], batches[1:])):
        start_time = time.time()

        app_data = get_app_data(start, stop, parser, pause)

        # writing app data to file
        with open(rel_path, 'a', newline='', encoding='utf-8') as f:
            # extrasaction='ignore' drops response fields not listed in columns
            writer = csv.DictWriter(f, fieldnames=columns, extrasaction='ignore')

            # short countdown so the user can avoid interrupting mid-write
            for j in range(3, 0, -1):
                print("\rAbout to write data, don't stop script! ({})".format(j), end='')
                time.sleep(0.5)

            writer.writerows(app_data)
            print('\rExported lines {}-{} to {}.'.format(start, stop-1, data_filename), end=' ')

        apps_written += len(app_data)

        # writing last index to file
        with open(idx_path, 'w') as f:
            print(stop, file=f)

        # logging time taken
        end_time = time.time()
        time_taken = end_time - start_time

        batch_times.append(time_taken)
        mean_time = statistics.mean(batch_times)

        # batches still to run after this one, at the average pace so far
        est_remaining = (len(batches) - i - 2) * mean_time

        remaining_td = dt.timedelta(seconds=round(est_remaining))
        time_td = dt.timedelta(seconds=round(time_taken))
        mean_td = dt.timedelta(seconds=round(mean_time))

        print('Batch {} time: {} (avg: {}, remaining: {})'.format(i, time_td, mean_td, remaining_td))

    print('\nProcessing batches complete. {} apps written'.format(apps_written))

In [5]:
def get_index(download_path, index_filename):
    """Read the saved index from the index file.

    Returns the integer on the file's first line, or 0 when the file does
    not exist.
    """
    index_file = os.path.join(download_path, index_filename)

    try:
        handle = open(index_file, 'r')
    except FileNotFoundError:
        return 0

    with handle:
        return int(handle.readline())


def prepare_data_file(download_path, filename, index, columns):
    """Create file and write headers if index is 0.

    At index 0 any existing file is overwritten with just the header row and
    the download folder is created if missing. For any other index nothing
    is touched.

    download_path : folder that holds the data file
    filename : name of the data file
    index : index the scrape will start from
    columns : column names written as the header row
    """
    if index != 0:
        return

    # a fresh run should not fail just because the folder is not there yet
    if download_path:
        os.makedirs(download_path, exist_ok=True)

    rel_path = os.path.join(download_path, filename)

    # utf-8 to match the encoding used when rows are appended to this file
    with open(rel_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=columns)
        writer.writeheader()

In [7]:
def parse_steam_request(appid, name):
    """Unique parser to handle data from Steam Store API.

    appid : id of the app to look up
    name : app name, used only for the fallback record

    Returns : json formatted data (dict-like). When the lookup does not
    succeed, a minimal record holding just the name and appid is returned so
    the app still gets a row in the output.
    """
    # https, like the other Steam endpoints used in this notebook
    url = "https://store.steampowered.com/api/appdetails/"
    parameters = {"appids": appid}

    json_data = get_request(url, parameters=parameters)

    # The decoded body may be None (JSON null) or lack this appid; treat both
    # as a failed lookup instead of crashing the whole batch
    json_app_data = (json_data or {}).get(str(appid)) or {}

    if json_app_data.get('success'):
        return json_app_data['data']

    return {'name': name, 'steam_appid': appid}


In [3]:
# Load the saved appid/name list; get_app_data reads this global directly
app_list = pd.read_csv('data/gameids.csv')  

In [8]:
# Check the working directory, since the data paths in this notebook are relative
os.getcwd()

'/Users/alison/Desktop/GA/DSI15-lessons/project/project-capstone/Capstone Steam'

In [11]:
# Folder and file names for the scraped store data and the saved index
download_path = 'data/download'
steam_app_data = 'steam_game_data2.csv'
steam_index = 'steam_index2.txt'

# Column names for the data file, in output order
steam_columns = [
    # identity and basic flags
    'type', 'name', 'steam_appid', 'required_age', 'is_free',
    'controller_support', 'dlc',
    # descriptions and presentation
    'detailed_description', 'about_the_game', 'short_description',
    'fullgame', 'supported_languages', 'header_image', 'website',
    # system requirements and notices
    'pc_requirements', 'mac_requirements', 'linux_requirements',
    'legal_notice', 'drm_notice', 'ext_user_account_notice',
    # people and purchasing
    'developers', 'publishers', 'demos', 'price_overview',
    'packages', 'package_groups',
    # platforms, reception and tags
    'platforms', 'metacritic', 'reviews', 'categories', 'genres',
    # media and remaining fields
    'screenshots', 'movies', 'recommendations', 'achievements',
    'release_date', 'support_info', 'background', 'content_descriptors',
]


In [None]:
# Resume from the index saved on disk (0 when no index file exists yet)
index = get_index(download_path, steam_index)

# At index 0 this wipes/creates the data file and writes the header row
prepare_data_file(download_path, steam_app_data, index, steam_columns)

# For a short demonstration run also pass e.g. end=600 and batchsize=10;
# leaving them out runs through the entire app list
process_batches(
    parse_steam_request,
    app_list,
    download_path,
    steam_app_data,
    steam_index,
    steam_columns,
    begin=index,
)

Starting at index 8260:

Exported lines 8260-8359 to steam_game_data2.csv. Batch 0 time: 0:03:18 (avg: 0:03:18, remaining: 2 days, 7:26:44)
Exported lines 8360-8459 to steam_game_data2.csv. Batch 1 time: 0:03:27 (avg: 0:03:23, remaining: 2 days, 8:35:34)
Exported lines 8460-8559 to steam_game_data2.csv. Batch 2 time: 0:03:34 (avg: 0:03:26, remaining: 2 days, 9:36:20)
Exported lines 8560-8659 to steam_game_data2.csv. Batch 3 time: 0:03:19 (avg: 0:03:25, remaining: 2 days, 9:04:01)
Exported lines 8660-8759 to steam_game_data2.csv. Batch 4 time: 0:03:24 (avg: 0:03:25, remaining: 2 days, 8:58:39)
Current index: 8796

In [None]:
# Check that the number of apps written to file matches the original app list
app_list.info() 

<p style="font-size:21px"><b>Review Scraping</b></p> <br> Due to the vast number of games in the Steam Store and the limited amount of time I had to carry out this project, I decided to focus only on DLCs, which was also a personal point of interest.
<br><br>
I extracted the appids with type DLC and then used these to loop through the reviews API. The number of reviews per game varies widely, and some games have 500k+, so I decided to extract only 100 English-language reviews per DLC, ranked by helpfulness according to users.


In [None]:
#Review API takes the appid midway through the URL so I adjusted the get requests function
def get_rev_request(url, appid, timeout=30):
    """Return json-formatted response of a review get request for one app.

    Timeouts, connection errors and unsuccessful responses are retried after
    a pause until a successful response arrives.

    Parameters
    ----------
    url : string containing a '{}' placeholder that is filled with appid
    appid : id of the app whose reviews are requested
    timeout : seconds to wait for the server before the attempt counts as
        timed out (default 30)

    Returns
    -------
    json_data
        json-formatted response (dict-like); {'success': 0} when the request
        failed with an error that is not worth retrying

    Raises
    ------
    KeyboardInterrupt
        re-raised so that interrupting the notebook really stops the scrape
    """
    # Loop instead of recursing so a long outage cannot exhaust the call stack
    while True:
        try:
            # Without a timeout requests waits indefinitely and the Timeout
            # handler below could never fire
            response = requests.get(url.format(appid), timeout=timeout)

        except requests.Timeout as errt:
            print('Timeout Error:', errt)
            # too many requests, pause and try again
            print('Waiting 5 seconds')
            time.sleep(5)
            continue

        except requests.ConnectionError as errc:
            print('Connection Error:', errc)
            # connection issue, pause and try again
            print('Waiting 25 seconds')
            time.sleep(25)
            continue

        except KeyboardInterrupt as errk:
            print('Program stopped:', errk)
            raise

        except requests.RequestException as erre:
            # print the appid here so I know which ones failed
            print('General Error:', appid)
            # Report the failure in the same shape as an unsuccessful API
            # reply so one bad app does not abort the whole run
            return {'success': 0}

        # A Response is falsy when its status code signals an error (4xx/5xx)
        if response:
            return response.json()

        # an error response usually means too many requests, pause and try again
        print('No response, waiting 5 mins')
        time.sleep(300)
        print('Retrying')

In [None]:
def get_review_data(appid_l, pause=1):
    """Scrape reviews for each app id and append them to the review csv.

    The output file is taken from the notebook-level download_path and
    data_filename variables, which must be set before calling.

    Parameters
    ----------
    appid_l
        dataframe with the app ids in its 'steam_appid' column
    pause
        seconds to sleep after each request

    Returns
    -------
    None; prints the total run time when finished.
    """
    #to check time it took to run
    start = time.time()

    url = 'https://store.steampowered.com/appreviews/{}?json=1&language=english&num_per_page=100'

    # the output file is the same for every app
    rel_path = os.path.join(download_path, data_filename)

    # iterate through each app id
    for appid in appid_l['steam_appid']:

        # The review helper fills the appid into the URL; the generic
        # get_request would send the unformatted '{}' URL instead
        data = get_rev_request(url, appid)

        if (data['success'] == 1):
            if data.get('reviews'): #this checks if there are reviews
                data = data['reviews']
            else:
                data = {'review': ['No reviews']}
        else:
            data = {'review': ['Failed to scrape']}

        # NOTE(review): placeholder frames only have a 'review' column while
        # real review frames carry every field the API returned, so rows
        # appended to the same csv may not line up - confirm downstream parsing
        data = pd.DataFrame.from_dict(data)

        data.insert(0, 'dlc', 1)
        data.insert(0,'app_id', appid)

        time.sleep(pause) # prevent overloading api with requests

        #write df to csv, doesn't write headers if they exist
        with open(rel_path, 'a', encoding='utf-8') as f:
            data.to_csv(f, header=f.tell()==0)

    end = time.time()
    print('\nThe function took {:.2f} s to compute.'.format(end - start))

Read in the game DF to get a list of appids; there are a lot, so I will partition these to process in batches.

In [None]:
# Load the store data scraped above
gamedf = pd.read_csv('data/download/steam_game_data2.csv')

In [None]:
# Remove fully duplicated rows in place, keeping the first occurrence
gamedf.drop_duplicates(keep='first', inplace=True) 

In [None]:
# Keep only the rows whose type is 'dlc'
dlc_df = gamedf[gamedf['type']== 'dlc']
# The review scrape only needs the type and the app id
dlc_l = dlc_df[['type', 'steam_appid']]

In [None]:
# Row count here tells how many DLCs need their reviews scraped
dlc_l.info()

In [None]:
# Work out how many partitions of roughly 1000 rows to run
# (27325 is presumably the row count shown by dlc_l.info() - confirm)
27325/1000

In [None]:
# Split the DLC list into 27 roughly equal chunks so each can be scraped separately
partitions = 27
part_dfs = np.array_split(dlc_l, partitions)
# get_review_data reads these two globals to decide where to write
download_path = 'data/download'
data_filename = 'steam_dlc_reviews2.csv'
# Call the review function on one partition to get dlc review data
#get_review_data(part_dfs[0], 3)

In [None]:
#reading in to inspect dlc review data
# (the first line of this statement was commented out while its continuation
# was not, which is a syntax error and left dlc0_df undefined for the check below)

dlc0_df = pd.read_csv('data/download/steam_dlc_reviews2.csv',
                      usecols = ['app_id', 'dlc', 'recommendationid', 'author'], encoding= 'utf-8')

In [None]:
Upon inspection I noticed some appids were missing, so I created a function to check for missing values.

In [None]:
def checkIfValuesExists(df_main, df_scrape):
    """Return the rows of df_main whose steam_appid is absent from df_scrape.

    The result is a copy of the matching df_main rows with an extra 'appid'
    column, which is 0 for every returned row (0 marks "not scraped").
    """
    was_scraped = df_main.steam_appid.isin(df_scrape.app_id)
    flagged = df_main.assign(appid=was_scraped.astype(int))

    return flagged[flagged['appid'].eq(0)]

In [None]:
# DLC ids that do not appear in the scraped review file yet
remain_l = checkIfValuesExists(dlc_l, dlc0_df)

In [None]:
# Point the review scraper at its output file before re-running it
# NOTE(review): the read cells load 'data/download/steam_dlc_reviews2.csv',
# not the 'review' subfolder set here - confirm which folder is intended
download_path = 'data/download/review'
data_filename = 'steam_dlc_reviews2.csv'

In [None]:
#rerun scrape for anything missed
# get_app_data needs (start, stop, parser, pause) and scrapes store data;
# the review scraper is the function that takes a partition and a pause
get_review_data(part_dfs[3], 4)

In [None]:
# Re-read the review file after the rerun, loading only the columns needed for the check
# NOTE: this rebinds dlc_df, which earlier held the DLC rows of gamedf
dlc_df = pd.read_csv('data/download/steam_dlc_reviews2.csv',  
                      usecols = ['app_id', 'dlc', 'recommendationid', 'author'], encoding= 'utf-8')

In [None]:
# DLC ids still missing after the rerun; report the count of this new result
# (the previous code measured the stale remain_l from before the rerun)
remain_l2 = checkIfValuesExists(dlc_l, dlc_df)
len(remain_l2)