In [1]:
import os.path
from pathlib import Path
import time
import requests
import json
import urllib.parse
import pandas as pd

In [2]:
def initialize_cursor_data():
    """Create the csv file that tracks cursors returned by Steam's getreviews API.

    Cursors are URL encoded strings used to query the next batch of data. The
    file is seeded with a single row (batch 0, cursor '*') so that the first
    query has a cursor to start from. If the file already exists it is left
    untouched and a message is printed instead.

    https://partner.steamgames.com/doc/store/getreviews contains more information
    regarding the API

    Args:
        None

    Returns:
        None
    """

    file = Path('data/cursor_data.csv')

    if file.exists():
        print("initialization skipped - cursor data already exists")
        return

    # to_csv does not create missing directories, so make sure 'data/' exists
    # before writing (e.g. on a fresh checkout).
    file.parent.mkdir(parents=True, exist_ok=True)

    initial_cursor_data = {"batch": [0], "cursor": ['*']}
    pd.DataFrame(data=initial_cursor_data).to_csv(file, index=False)
        
def initialize_raw_data():
    """Create an empty (header only) csv file for the raw review data.

    The header consists of the names listed in column_names below. If the file
    already exists it is left untouched and a message is printed instead.

    Args:
        None

    Returns:
        None
    """

    file = Path('data/raw_data.csv')

    if file.exists():
        print("initialization skipped - raw data already exists")
        return

    column_names = ['recommendationid', 'author', 'language', 'review',
                    'timestamp_created', 'timestamp_updated', 'voted_up',
                    'votes_funny', 'weighted_vote_score', 'comment_count',
                    'steam_purchase', 'received_for_free',
                    'written_during_early_access']

    # to_csv does not create missing directories, so make sure 'data/' exists
    # before writing (e.g. on a fresh checkout).
    file.parent.mkdir(parents=True, exist_ok=True)

    pd.DataFrame(columns=column_names).to_csv(file, index=False)
        
def initialize_review_data():
    """Create an empty (header only) csv file for the cleaned review data.

    The cleaned data holds a user's steam id, their review, the (Unix) timestamp
    for when it was created and whether or not the user recommended the game.
    If the file already exists it is left untouched and a message is printed
    instead.

    Args:
        None

    Returns:
        None
    """

    file = Path('data/review_data.csv')

    if file.exists():
        print("initialization skipped - review data already exists")
        return

    column_names = ['steam_id', 'review', 'timestamp_created', 'voted_up']

    # to_csv does not create missing directories, so make sure 'data/' exists
    # before writing (e.g. on a fresh checkout).
    file.parent.mkdir(parents=True, exist_ok=True)

    # The keyword is `columns`; `column=` raises TypeError in the DataFrame
    # constructor.
    pd.DataFrame(columns=column_names).to_csv(file, index=False)

def get_next_cursor():
    """Look up the cursor to use for the next query.

    Reads cursor_data.csv and picks the value in the last row of its
    'cursor' column.

    Args:
        None

    Returns:
        next_cursor (string): URL Encoded string used to obtain the next batch of data
    """

    stored_cursors = pd.read_csv('data/cursor_data.csv')['cursor']
    return stored_cursors.iloc[-1]

def get_next_url(next_cursor):
    """Get the url containing the next batch of data

    Args:
        next_cursor (string): URL Encoded string used to obtain the next batch of data

    Returns:
        next_url (string): the url containing the next batch of data
    """

    # Built from adjacent string literals rather than a backslash continuation
    # inside one literal: the continuation kept the following line's leading
    # spaces, which ended up in the middle of the query string.
    # NOTE(review): the language value is sent with literal double quotes and a
    # capital E; confirm against the getreviews documentation that the API
    # accepts this spelling.
    base_url = ('https://store.steampowered.com/appreviews/322330'
                '?json=1'
                '&day_range=9223372036854775807'
                '&language="English"'
                '&num_per_page=100'
                '&cursor=')
    return base_url + next_cursor
    
def get_data(num_batches):
    """Get the raw review data and cursors

    Starting from the most recently stored cursor, queries up to num_batches
    urls in sequence, waiting two seconds before every request. Fetching stops
    early when a response contains no reviews.

    Args:
        num_batches (int): the maximum number of data batches to be queried; the
                           url asks for 100 reviews per batch

    Returns:
        review_data (dataframe): the raw review data
        cursor_data (list): URL encoded cursors returned with each fetched batch;
                            the last one can be used to query the next batch

    Raises:
        requests.HTTPError: if a request returns an error status code
    """

    column_names = ['recommendationid', 'author', 'language', 'review',
                    'timestamp_created', 'timestamp_updated', 'voted_up',
                    'votes_funny', 'weighted_vote_score', 'comment_count',
                    'steam_purchase', 'received_for_free',
                    'written_during_early_access']
    cursor_data = []
    batch_frames = []
    next_cursor = get_next_cursor()

    for _ in range(num_batches):
        next_url = get_next_url(next_cursor)

        time.sleep(2)
        # A timeout keeps a stalled connection from hanging the notebook, and
        # raise_for_status surfaces HTTP errors instead of a confusing JSON
        # parsing failure on an error page.
        response = requests.get(next_url, timeout=30)
        response.raise_for_status()
        json_data = json.loads(response.text)

        reviews = json_data['reviews']
        if not reviews:
            # Nothing came back for this cursor: stop rather than keep polling
            # and recording cursors that have no data behind them.
            break

        batch_frames.append(pd.DataFrame(data=reviews))

        next_cursor = urllib.parse.quote(json_data['cursor'])
        cursor_data.append(next_cursor)

    review_data = pd.DataFrame(columns=column_names)
    if batch_frames:
        # Concatenate once at the end: DataFrame.append was removed in pandas 2.0
        # and growing a frame inside the loop copies it on every iteration.
        fetched = pd.concat(batch_frames, ignore_index=True, sort=True)
        # Keep every expected column even if the API omitted it, in sorted order.
        all_columns = sorted(set(column_names) | set(fetched.columns))
        review_data = fetched.reindex(columns=all_columns)

    return review_data, cursor_data

def update_cursor_data(new_cursors):
    """Create a backup of the cursor data and update it

    The current contents of cursor_data.csv are first copied to
    backup_data/cursor_data_backup.csv. The new cursors are then appended, with
    batch numbers continuing from the number of rows already stored.

    Args:
        new_cursors (array): contains cursors corresponding to the queried data along
                             with a cursor that can be used to query the next batch of data

    Returns:
        None
    """

    cursor_file = Path('data/cursor_data.csv')
    backup_file = Path('data/backup_data/cursor_data_backup.csv')

    cursor_data = pd.read_csv(cursor_file)

    # to_csv does not create missing directories, so make sure the backup
    # directory exists before writing to it.
    backup_file.parent.mkdir(parents=True, exist_ok=True)
    cursor_data.to_csv(backup_file, index=False)

    initial_length = cursor_data.shape[0]
    final_length = initial_length + len(new_cursors)

    new_cursor_data = pd.DataFrame({'batch': range(initial_length, final_length),
                                    'cursor': new_cursors})

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    updated_cursor_data = pd.concat([cursor_data, new_cursor_data], ignore_index=True)
    updated_cursor_data.to_csv(cursor_file, index=False)
    
def update_raw_data(new_data):
    """Create a backup of the raw data and update it

    The current contents of raw_data.csv are first copied to
    backup_data/raw_data_backup.csv. The new rows are then appended and the
    combined data is written back to raw_data.csv. When the stored and new
    columns differ, the result holds the union of both in sorted order.

    Args:
        new_data (dataframe): contains new raw data to be added

    Returns:
        None
    """

    raw_file = Path('data/raw_data.csv')
    backup_file = Path('data/backup_data/raw_data_backup.csv')

    raw_data = pd.read_csv(raw_file)

    # to_csv does not create missing directories, so make sure the backup
    # directory exists before writing to it.
    backup_file.parent.mkdir(parents=True, exist_ok=True)
    raw_data.to_csv(backup_file, index=False)

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    updated_raw_data = pd.concat([raw_data, new_data], ignore_index=True, sort=True)
    updated_raw_data.to_csv(raw_file, index=False)

def clean_author_data(data):
    """Reduce the 'author' column to the author's steam id and rename it to 'steam_id'.

    Every entry of the 'author' column is replaced by the result of calling
    .get('steamid') on it. The dataframe is modified in place.

    Args:
        data (dataframe): contains an 'author' column

    Returns:
        data (dataframe): the same dataframe object that was passed in
    """

    def extract_steam_id(author):
        return author.get('steamid')

    data['author'] = data['author'].apply(extract_steam_id)
    data.rename(columns={'author': 'steam_id'}, inplace=True)
    return data

def clean_reviews(data):
    """Convert the 'review' column to strings and replace newlines with spaces.

    The dataframe is modified in place.

    Args:
        data (dataframe): contains a 'review' column

    Returns:
        data (dataframe): the same dataframe object that was passed in
    """

    # Vectorised string method instead of a per-row lambda; regex=False keeps
    # the replacement a plain literal substitution.
    data['review'] = data['review'].astype(str).str.replace('\n', ' ', regex=False)
    return data
    
def clean_data(raw_data):
    """Produce cleaned user reviews from the raw data.

    Keeps only the author, review, timestamp_created and voted_up columns,
    re-maps author to steam_id and removes newlines from reviews. The work is
    done on a copy, so raw_data itself is not modified.

    Args:
        raw_data (dataframe): contains the raw data to be cleansed

    Returns:
        cleansed_data (dataframe): the cleaned copy of the selected columns
    """

    kept_columns = ['author', 'review', 'timestamp_created', 'voted_up']
    working_copy = raw_data[kept_columns].copy()

    return clean_reviews(clean_author_data(working_copy))

def update_review_data(new_data):
    """Create a backup of the cleansed data and update it

    The current contents of review_data.csv are first copied to
    backup_data/review_data_backup.csv. The new rows are then appended and the
    combined data is written back to review_data.csv.

    Args:
        new_data (dataframe): contains the cleansed data to be added

    Returns:
        None
    """

    review_file = Path('data/review_data.csv')
    backup_file = Path('data/backup_data/review_data_backup.csv')

    data = pd.read_csv(review_file)

    # to_csv does not create missing directories, so make sure the backup
    # directory exists before writing to it.
    backup_file.parent.mkdir(parents=True, exist_ok=True)
    data.to_csv(backup_file, index=False)

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    updated_data = pd.concat([data, new_data], ignore_index=True)
    updated_data.to_csv(review_file, index=False)

In [3]:
# make sure every csv file used below exists; an initializer prints a message
# and leaves its file untouched when the file is already present
for _initialize in (initialize_cursor_data, initialize_raw_data, initialize_review_data):
    _initialize()

In [4]:
# fetch new data, persist it, then clean it and persist the cleaned version
# the argument of get_data() is the number of data batches to fetch, where each
# batch contains 100 reviews
new_data = get_data(100)
new_raw_data, new_cursors = new_data

# store the cursors and the raw reviews (each update backs up the old file first)
update_cursor_data(new_cursors)
update_raw_data(new_raw_data)

# derive the cleaned reviews from the freshly fetched raw data and store them
new_review_data = clean_data(new_raw_data)
update_review_data(new_review_data)