In [1]:
import os.path
from pathlib import Path
import time
import requests
import json
import urllib.parse
import pandas as pd

In [2]:
def initialize_cursor_data():
    file = Path('data/cursor_data.csv')
    
    if (file.exists()):
        print("initialization skipped - cursor data already exists")
    else:
        initial_cursor_data = {"batch": 0, "cursor": ['*']}

        cursor_data = pd.DataFrame(data = initial_cursor_data)
        cursor_data.to_csv('data/cursor_data.csv', index = False)
        
def initialize_raw_data():
    file = Path('data/raw_data.csv')
    
    if (file.exists()):
        print("initialization skipped - raw data already exists")
    else:
        column_names = ['author', 'comment_count', 'language', 'received_for_free', 
                        'recommendationid', 'review', 'steam_purchase', 'timestamp_created', 
                        'timestamp_updated', 'voted_up', 'votes_funny', 'votes_up', 
                        'weighted_vote_score', 'written_during_early_access']
        
        raw_data = pd.DataFrame(columns = column_names)
        raw_data.to_csv('data/raw_data.csv', index = False)
        
def initialize_review_data():
    file = Path('data/review_data.csv')
    
    if (file.exists()):
        print("initialization skipped - review data already exists")
    else:
        column_names = ['steam_id', 'review', 'voted_up', 'timestamp_created']
        
        data = pd.DataFrame(columns = column_names)
        data.to_csv('data/review_data.csv', index = False)

def get_next_cursor():
    cursor_data = pd.read_csv('data/cursor_data.csv')
    next_cursor = cursor_data['cursor'].iloc[-1]

    return next_cursor

def get_next_url(next_cursor):
    base_url = 'https://store.steampowered.com/appreviews/322330?json=1&day_range=9223372036854775807\
                &language="English"&num_per_page=100&cursor='
    next_url = base_url + next_cursor
    
    return next_url
    
def get_data(num_batches): 
    cursor_data = []
    column_names = ['author', 'comment_count', 'developer_response', 'language', 
                    'received_for_free', 'recommendationid', 'review', 'steam_purchase', 
                    'timestamp_created', 'timestamp_updated', 'voted_up', 'votes_funny', 
                    'votes_up', 'weighted_vote_score', 'written_during_early_access']
    review_data = pd.DataFrame(columns = column_names)
    
    next_cursor = get_next_cursor()
    
    for batch in range(num_batches):
        next_url = get_next_url(next_cursor)

        time.sleep(2)
        r = requests.get(next_url)
        json_data = json.loads(r.text) 

        new_review_data = pd.DataFrame(data = json_data['reviews'])
        review_data = review_data.append(new_review_data, ignore_index = True)

        next_cursor = urllib.parse.quote(json_data['cursor'])
        cursor_data.append(next_cursor)
        
    review_data.drop(['developer_response'], axis = 1)
    
    return review_data, cursor_data

def update_cursor_data(new_cursors):
    cursor_data = pd.read_csv('data/cursor_data.csv')
    cursor_data.to_csv('data/cursor_data_backup.csv', index = False)
    
    initial_length = cursor_data.shape[0]
    final_length = initial_length + len(new_cursors)
    
    new_cursor_data = pd.DataFrame({'batch': range(initial_length, final_length),
                                    'cursor': new_cursors})
    
    updated_cursor_data = cursor_data.append(new_cursor_data, ignore_index = True)
    updated_cursor_data.to_csv('data/cursor_data.csv', index = False)
    
def update_raw_data(new_data):
    raw_data = pd.read_csv('data/raw_data.csv')
    raw_data.to_csv('data/raw_data_backup.csv', index = False)
    
    updated_raw_data = raw_data.append(new_data, ignore_index = True)
    updated_raw_data.to_csv('data/raw_data.csv', index = False)

def clean_author_data(data):
    data['author'] = data['author'].apply(lambda x: x.get('steamid'))
    data.rename(columns = {'author': 'steam_id'}, inplace = True)
    
    return data

def clean_reviews(data):
    data['review'] = data['review'].astype(str)
    data['review'] = data['review'].apply(lambda x: x.replace('\n', ' '))
    
    return data
    
def clean_data(data):
    trimmed_data = data[['author', 'review', 'timestamp_created', 'voted_up']].copy()
    
    intermediate_data = clean_author_data(trimmed_data)
    cleansed_data = clean_reviews(intermediate_data)
    
    return cleansed_data

def update_review_data(new_data):
    data = pd.read_csv('data/review_data.csv')
    data.to_csv('data/review_data_backup.csv', index = False)
    
    updated_data = data.append(new_data, ignore_index = True)
    updated_data.to_csv('data/review_data.csv', index = False)
    
def display_reviews(data):
    for row in range(data.shape[0]):
        print(data.loc[row, 'review'], '\n')
    
# update data (write-back)

In [3]:
initialize_cursor_data()
initialize_raw_data()
initialize_review_data()

initialization skipped - cursor data already exists
initialization skipped - raw data already exists
initialization skipped - review data already exists


In [None]:
new_data = get_data(50)
new_raw_data = new_data[0]
new_cursors = new_data[1]

update_cursor_data(new_cursors)
update_raw_data(new_raw_data)

new_review_data = clean_data(new_raw_data)
update_review_data(new_review_data)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,
