<p style="font-size:21px"><b>Data Acquisition</b></p>

<b>To acquire the data I needed for this project I used the Steam Web API along with the requests library.</b> <br><br> Steam has many APIs, each returning information on a different set of features. To acquire the storefront data for each game in the store I used the appdetails API, which requires an appid as a parameter.

<br>
To retrieve a list of appids from the store I used the ISteamApps API to get each game's id and name, and then used those ids to obtain game information from the Storefront API. <br><br> The data was returned as JSON, with the list of apps stored under the 'apps' key nested inside the top-level 'applist' key. I extracted this data, placed it into a dataframe, and stored it as a csv to access later.

In [1]:
import csv
import json
import os
import time

import numpy as np
import pandas as pd
import requests
from tqdm import tqdm

# Show up to 100 columns when displaying a DataFrame. The fully qualified key is
# used because the short form "max_columns" matches more than one option on
# recent pandas versions and raises OptionError.
pd.set_option("display.max_columns", 100)

In [None]:
url = 'https://api.steampowered.com/ISteamApps/GetAppList/v2/'
# This API only retrieves game names and ids
# A timeout stops the cell from hanging indefinitely if the server never answers
r = requests.get(url, timeout=30)
# Fail here with the HTTP status instead of later with an opaque JSON/KeyError
r.raise_for_status()
json_ = r.json()

In [None]:
# Print one entry (position 50) of the returned app list to inspect its structure
print(json_['applist']['apps'][50])

In [None]:
# Collect the id and name of every app into parallel lists, one entry per app.
GameID = {'appid': [], 'name': []}

for item in json_['applist']['apps']:
    # A missing field is recorded as the string 'None' so both lists stay the
    # same length; these placeholder rows are filtered out afterwards.
    GameID['appid'].append(item.get('appid', 'None'))
    GameID['name'].append(item.get('name', 'None'))

GameIDs = pd.DataFrame(GameID)

In [None]:
# Drop the 'None' placeholder rows first: sorting a column that mixes integer
# appids with the string 'None' raises a TypeError.
GameIDs.drop(GameIDs[(GameIDs.appid == 'None') | (GameIDs.name == 'None')].index, inplace=True)
GameIDs.sort_values(by='appid', inplace=True)
GameIDs.reset_index(drop=True, inplace=True)
# Uncomment to (re)write the id list to disk
#GameIDs.to_csv('data/gameids.csv', index=False)

There are around 100k appids within the store so it took me a while to scrape the game data. In order to scrape this over a period of time without losing progress I used a few functions that will make an api call and then write the results to csv.

In [2]:
def get_request(url, parameters=None, timeout=30):
    """Send a GET request, retrying on transient failures.

    Timeouts and connection errors are retried after a short pause. A falsy
    response (requests treats HTTP status >= 400 as falsy) is retried after
    five minutes, since it usually means too many requests were sent.

    Parameters
    ----------
    url : string
    parameters : {'parameter': 'value'}
    timeout : seconds to wait for the server before treating the request as
        timed out (without a timeout the Timeout handler can never trigger)

    Returns
    -------
    json_formatted data (dict like), or None if the request failed with a
    non-retryable requests error
    """
    # Loop instead of recursing so a long outage cannot exhaust the call stack
    while True:
        try:
            response = requests.get(url=url, params=parameters, timeout=timeout)

        except requests.Timeout as errt:
            print('Timeout Error:', errt)
            # too many requests, pause and try again
            print('Waiting 5 seconds')
            time.sleep(5)
            continue

        except requests.ConnectionError as errc:
            print('Connection Error:', errc)
            # connection issue, pause and try again
            print('Waiting 25 seconds')
            time.sleep(25)
            continue

        except requests.RequestException as erre:
            # Anything else (e.g. a malformed URL) will not be fixed by
            # retrying, and there is no response object to inspect.
            print('General Error:', erre)
            return None

        if response:
            return response.json()

        # an error status usually means too many requests, pause and try again
        print('No response, waiting 5 mins')
        time.sleep(300)
        print('Retrying')

In [3]:
def get_index(download_path, data_file, index_file):
    """
    Return the saved progress index, initialising the data file on a fresh run.

    When the index file exists its contents are parsed as an integer and
    returned. Otherwise the data file is (re)created containing only a CSV
    header built from the module-level ``columns`` list, and 0 is returned.

    Parameters
    ----------
    download_path : data folder
    data_file : filename of saved data
    index_file : filename for progress tracking

    Returns
    -------
    Current index
    """
    progress_file = os.path.join(download_path, index_file)

    if not os.path.isfile(progress_file):
        # No progress recorded yet: start a new data file with just the headers
        output_file = os.path.join(download_path, data_file)
        with open(output_file, 'w', newline='') as handle:
            csv.DictWriter(handle, fieldnames=columns).writeheader()
        return 0

    with open(progress_file) as handle:
        return int(handle.read())

In [12]:
def get_app_data(app_list, batchsize=100, pause=1):
    """
    Retrieves data from steam api in batches and writes it to file

    Progress is resumed from the index returned by ``get_index``. After each
    batch the rows are appended to the data file and the index file is
    overwritten with the position of the next unprocessed row, so an
    interrupted run loses at most one batch.

    Relies on the module-level names ``download_path``, ``data_file``,
    ``index_file`` and ``columns``.

    Parameters
    ----------
    app_list : DataFrame of apps to iterate through ('appid' and 'name' columns)
    batchsize : size of batches to write to file
    pause : sleep timer (seconds) between requests to avoid overloading API

    """
    start_index = get_index(download_path, data_file, index_file)
    print('Current index: {}'.format(start_index))

    total = len(app_list)
    data_path = os.path.join(download_path, data_file)
    index_path = os.path.join(download_path, index_file)
    url = "http://store.steampowered.com/api/appdetails/"

    # Step through the list in batches; the final batch is clamped to the end
    # of the list so a trailing partial batch is still processed.
    for start in tqdm(range(start_index, total, batchsize)):
        stop = min(start + batchsize, total)

        app_data = []

        # iterate through each row of app_list in this batch (by position)
        for _, row in app_list.iloc[start:stop].iterrows():

            appid = row['appid']
            name = row['name']

            # retrieve app data for a row and append to list
            jr = get_request(url, parameters={"appids": appid})

            # The payload is expected to be keyed by the appid as a string;
            # anything else (None, missing key) is treated as a failed lookup.
            json_data = jr.get(str(appid)) if isinstance(jr, dict) else None

            if json_data and json_data.get('success'):
                data = json_data['data']
            else:
                # keep a minimal row so the app still appears in the output
                data = {'name': name, 'steam_appid': appid}

            app_data.append(data)

            time.sleep(pause)

        # writing app data to file; fields not listed in columns are dropped
        with open(data_path, 'a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=columns, extrasaction='ignore')
            print('Writing rows {}-{} '.format(start, stop - 1))
            writer.writerows(app_data)

        # writing last index to file
        with open(index_path, 'w') as f:
            f.write(str(stop))


In [None]:
# Check the current working directory
os.getcwd()

In [10]:
# Set file parameters (read as module-level names by get_index and get_app_data)
download_path = 'data'  # folder holding the data and index files
index_file = 'steam_index.txt'  # progress tracker written after each batch
data_file = 'steam_game_data.csv'  # CSV the scraped app details are appended to
app_list = pd.read_csv('data/gameids.csv')  # appid/name list to iterate through

# Fields written to the data file: used as the CSV header, and any other key in
# an API record is ignored when rows are written
columns = ['type', 'name', 'steam_appid', 'required_age', 'is_free', 'controller_support',
    'dlc', 'detailed_description', 'about_the_game', 'short_description', 'fullgame',
    'supported_languages', 'header_image', 'website', 'pc_requirements', 'mac_requirements',
    'linux_requirements', 'legal_notice', 'drm_notice', 'ext_user_account_notice',
    'developers', 'publishers', 'demos', 'price_overview', 'packages', 'package_groups',
    'platforms', 'metacritic', 'reviews', 'categories', 'genres', 'screenshots',
    'movies', 'recommendations', 'achievements', 'release_date', 'support_info',
    'background', 'content_descriptors']

In [15]:
# Running the scraping function for demonstration, writing to file every 20 apps
get_app_data(app_list, batchsize=20)

  0%|          | 0/5 [00:00<?, ?it/s]

Current index: 0


 20%|██        | 1/5 [00:31<02:05, 31.45s/it]

Writing rows 0-19 


 40%|████      | 2/5 [01:04<01:35, 31.85s/it]

Writing rows 20-39 


 60%|██████    | 3/5 [01:35<01:03, 31.77s/it]

Writing rows 40-59 


 80%|████████  | 4/5 [02:08<00:32, 32.12s/it]

Writing rows 60-79 


100%|██████████| 5/5 [02:40<00:00, 32.04s/it]

Writing rows 80-99 





<p style="font-size:21px"><b>  Review Scraping</b></p> <br> Due to the vast number of games in the Steam Store and the time constraints of this project, I decided to focus only on DLCs, which were also a personal point of interest.
<br><br>
I extracted the appids with type DLC and then looped through them with the reviews API. The number of reviews per game varies widely, and some games have 500k+, so I decided to extract only the first 100 English-language reviews per DLC, ordered by helpfulness according to users.


In [None]:
#Review API takes the appid midway through the URL so I adjusted the get requests function
def get_rev_request(url, appid, timeout=30):
    """Send a GET request to the reviews API, retrying on transient failures.

    Timeouts and connection errors are retried after a short pause. A falsy
    response (requests treats HTTP status >= 400 as falsy) is retried after
    five minutes, since it usually means too many requests were sent.

    Parameters
    ----------
    url : string containing a '{}' placeholder that is filled with the appid
    appid : id of the app whose reviews are requested
    timeout : seconds to wait for the server before treating the request as
        timed out (without a timeout the Timeout handler can never trigger)

    Returns
    -------
    json_formatted data (dict like), or None if the request failed with a
    non-retryable requests error
    """
    # Loop instead of recursing so a long outage cannot exhaust the call stack
    while True:
        try:
            response = requests.get(url.format(appid), timeout=timeout)

        except requests.Timeout as errt:
            print('Timeout Error:', errt)
            # too many requests, pause and try again
            print('Waiting 5 seconds')
            time.sleep(5)
            continue

        except requests.ConnectionError as errc:
            print('Connection Error:', errc)
            # connection issue, pause and try again
            print('Waiting 25 seconds')
            time.sleep(25)
            continue

        except requests.RequestException:
            # print the appid here so I know which ones failed; retrying will
            # not help and there is no response object to inspect
            print('General Error:', appid)
            return None

        if response:
            return response.json()

        # an error status usually means too many requests, pause and try again
        print('No response, waiting 5 mins')
        time.sleep(300)
        print('Retrying')

In [None]:
#each request retreives a maximum of 100 reviews each time so write each file to page

def get_review_data(appid_list, data_file, index_file, pause=1):

    """Scrape one page of reviews per app and append the results to a csv

    Progress is resumed from the integer stored in the index file (0 when the
    file does not exist). After each app its rows are appended to the data
    file and the index file is overwritten with the position of the next app.
    Both files live in the module-level ``download_path`` folder.

     Parameters
     ---------
     appid_list : DataFrame of apps with a 'steam_appid' column
     data_file : filename of saved data
     index_file : filename for progress tracking
     pause :   sleep time (seconds) between requests
    """

    url = 'https://store.steampowered.com/appreviews/{}?json=1&language=english&num_per_page=100'
    index_path = os.path.join(download_path, index_file)
    data_path = os.path.join(download_path, data_file)

    if os.path.isfile(index_path):
        with open(index_path) as f:
            index = int(f.read())
    else:
        index = 0

    print('Current index: {}'.format(index))

    # iterate over the apps that were passed in, one row (by position) at a time
    for position in tqdm(range(index, len(appid_list))):

        appid = appid_list.iloc[position]['steam_appid']

        # retrieve review data for the app and send to DF
        payload = get_rev_request(url, appid)

        if payload and payload.get('success') == 1:
            if payload.get('reviews'):  # this checks if there are reviews
                reviews = payload['reviews']
            else:
                reviews = {'review': ['No reviews']}
        else:
            reviews = {'review': ['Failed to scrape']}

        frame = pd.DataFrame.from_dict(reviews)

        frame.insert(0, 'dlc', 1)
        frame.insert(0, 'app_id', appid)

        time.sleep(pause)

        # write df to csv, only writing headers when the file is still empty
        # NOTE(review): placeholder frames only have app_id/dlc/review columns,
        # so their rows will not line up with a header written from a full
        # review frame - confirm the downstream reader copes with this.
        # newline='' lets the csv writer control line endings itself
        with open(data_path, 'a', encoding='utf-8', newline='') as f:
            frame.to_csv(f, header=f.tell() == 0)

        # writing last index to file
        with open(index_path, 'w') as f:
            f.write(str(position + 1))

Read in Game DF to get a list of appids, use this to retrieve review data

In [None]:
# Load the scraped store data from csv
df = pd.read_csv('data/steam_game_data.csv')

In [None]:
# Remove fully duplicated rows, keeping the first occurrence of each
df.drop_duplicates(keep='first', inplace=True)
# Keep only the rows whose type is 'dlc'
dlc_df = df.loc[df['type'].eq('dlc')]
# Narrow down to the two columns needed for review scraping
dlc_l = dlc_df.loc[:, ['type', 'steam_appid']]

In [None]:
review_index = 'steam_dlc_index.txt'
review_file = 'steam_dlc_review.csv'
# Call function to get dlc review data
get_review_data(appid_list=dlc_l, data_file=review_file, index_file=review_index, pause=3)