In [1]:
import os.path
from pathlib import Path
import time
import requests
import json
import urllib.parse
import pandas as pd

In [2]:
def initialize_cursor_data():
    """Create ``data/cursor_data.csv`` seeded with the starting cursor ``'*'``.

    The file gets a single row (``batch`` 0, ``cursor`` ``'*'``). If the file
    already exists it is left untouched and a notice is printed instead.
    The ``data`` directory is created when it is missing.
    """
    file = Path('data/cursor_data.csv')

    if file.exists():
        print("initialization skipped - cursor data already exists")
        return

    # DataFrame.to_csv does not create missing directories, so make sure the
    # target folder exists before writing on a fresh checkout.
    file.parent.mkdir(parents=True, exist_ok=True)

    initial_cursor_data = {"batch": [0], "cursor": ['*']}
    pd.DataFrame(data=initial_cursor_data).to_csv(file, index=False)
        
def initialize_raw_data():
    """Create an empty, header-only ``data/raw_data.csv``.

    The header lists the review fields stored by this notebook. If the file
    already exists it is left untouched and a notice is printed instead.
    The ``data`` directory is created when it is missing.
    """
    file = Path('data/raw_data.csv')

    if file.exists():
        print("initialization skipped - raw data already exists")
        return

    # DataFrame.to_csv does not create missing directories, so make sure the
    # target folder exists before writing on a fresh checkout.
    file.parent.mkdir(parents=True, exist_ok=True)

    column_names = ['recommendationid', 'author', 'language', 'review', 'timestamp_created',
                    'timestamp_updated', 'voted_up', 'votes_up', 'votes_funny',
                    'weighted_vote_score', 'comment_count', 'steam_purchase',
                    'received_for_free', 'written_during_early_access']

    pd.DataFrame(columns=column_names).to_csv(file, index=False)
    

def get_next_cursor():
    """Return the cursor stored in the last row of ``data/cursor_data.csv``."""
    saved_cursors = pd.read_csv('data/cursor_data.csv')['cursor']

    # The newest cursor is always the final entry of the column.
    return saved_cursors.iat[-1]

def get_next_url(next_cursor):
    """Build the review-page request URL for the given cursor.

    Parameters
    ----------
    next_cursor : str
        Cursor value appended verbatim as the ``cursor`` query parameter.

    Returns
    -------
    str
        The full request URL.
    """
    # Adjacent string literals are joined by the parser. A backslash
    # continuation inside a single literal would instead embed the next
    # line's leading indentation (spaces) in the URL.
    base_url = ('https://store.steampowered.com/appreviews/322330'
                '?json=1&day_range=9223372036854775807'
                '&num_per_page=100&cursor=')

    return base_url + next_cursor
    
def get_data(num_batches):
    """Download up to ``num_batches`` consecutive pages of reviews.

    Paging starts from the cursor returned by ``get_next_cursor()`` and follows
    the ``cursor`` field of each response.

    Parameters
    ----------
    num_batches : int
        Maximum number of pages to request.

    Returns
    -------
    tuple
        ``(review_data, cursor_data)``: a DataFrame holding every downloaded
        review, and the list of URL-quoted cursors returned by the pages that
        contained reviews (one per page, in request order).

    Raises
    ------
    requests.HTTPError
        If a response carries an HTTP error status.
    """
    column_names = ['recommendationid', 'author', 'language', 'review', 'timestamp_created',
                    'timestamp_updated', 'voted_up', 'votes_up', 'votes_funny',
                    'weighted_vote_score', 'comment_count', 'steam_purchase',
                    'received_for_free', 'written_during_early_access']
    review_frames = []
    cursor_data = []

    next_cursor = get_next_cursor()

    for _ in range(num_batches):
        next_url = get_next_url(next_cursor)

        # Pause before every request so consecutive calls are spaced out.
        time.sleep(2)
        response = requests.get(next_url, timeout=30)
        # Fail loudly on an HTTP error instead of trying to parse an error page.
        response.raise_for_status()
        json_data = response.json()

        reviews = json_data.get('reviews') or []
        if not reviews:
            # An empty page yields nothing to store; stop here rather than
            # recording its cursor and requesting further empty pages.
            break

        review_frames.append(pd.DataFrame(data=reviews))

        next_cursor = urllib.parse.quote(json_data['cursor'])
        cursor_data.append(next_cursor)

    if review_frames:
        # Concatenate once at the end: DataFrame.append was removed in
        # pandas 2.0, and growing a frame inside the loop copies it each time.
        review_data = pd.concat(review_frames, ignore_index=True)
        # Keep the documented columns first, followed by any extra fields
        # present in the responses.
        extra_columns = [name for name in review_data.columns if name not in column_names]
        review_data = review_data.reindex(columns=column_names + extra_columns)
    else:
        review_data = pd.DataFrame(columns=column_names)

    return review_data, cursor_data

def update_cursor_data(new_cursors):
    """Append newly fetched cursors to ``data/cursor_data.csv``.

    The current file contents are first copied to
    ``data/cursor_data_backup.csv``. New rows receive consecutive ``batch``
    numbers continuing from the number of rows already stored.

    Parameters
    ----------
    new_cursors : list of str
        Cursors to append, in the order they were obtained.
    """
    cursor_data = pd.read_csv('data/cursor_data.csv')
    # Keep a copy of the previous state before the file is overwritten.
    cursor_data.to_csv('data/cursor_data_backup.csv', index=False)

    initial_length = cursor_data.shape[0]
    final_length = initial_length + len(new_cursors)

    new_cursor_data = pd.DataFrame({'batch': range(initial_length, final_length),
                                    'cursor': new_cursors})

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    updated_cursor_data = pd.concat([cursor_data, new_cursor_data], ignore_index=True)
    updated_cursor_data.to_csv('data/cursor_data.csv', index=False)
    
def update_raw_data(new_raw_data):
    """Append newly fetched reviews to ``data/raw_data.csv``.

    The current file contents are first copied to
    ``data/raw_data_backup.csv``; the file is then rewritten with the old rows
    followed by the new ones.

    Parameters
    ----------
    new_raw_data : pandas.DataFrame
        Rows to add after the rows already stored.
    """
    raw_data = pd.read_csv('data/raw_data.csv')
    # Keep a copy of the previous state before the file is overwritten.
    raw_data.to_csv('data/raw_data_backup.csv', index=False)

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    updated_raw_data = pd.concat([raw_data, new_raw_data], ignore_index=True)
    updated_raw_data.to_csv('data/raw_data.csv', index=False)

def clean_review_data(data):
    """Reduce raw review rows to ``steam_id``, ``review`` and ``voted_up``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``author``, ``review`` and ``voted_up``.
        ``author`` entries may be dicts (as parsed from the JSON response) or
        the string form of such a dict (as obtained after a CSV round trip).

    Returns
    -------
    pandas.DataFrame
        A new frame; ``data`` itself is not modified. ``steam_id`` holds the
        author's ``'steamid'`` value (``None`` when it cannot be extracted) and
        newlines inside ``review`` are replaced by single spaces. Non-string
        reviews (e.g. missing values) are passed through unchanged.
    """
    # Local import: only this function needs to parse stringified dicts.
    import ast

    def _steam_id(author):
        """Extract 'steamid' from a dict or from its string representation."""
        if isinstance(author, str):
            try:
                author = ast.literal_eval(author)
            except (ValueError, SyntaxError):
                return None
        return author.get('steamid') if hasattr(author, 'get') else None

    def _flatten(text):
        """Replace newlines with spaces; leave non-string values as they are."""
        return text.replace('\n', ' ') if isinstance(text, str) else text

    review_data = data[['author', 'review', 'voted_up']].copy()

    review_data['author'] = review_data['author'].apply(_steam_id)
    review_data = review_data.rename(columns={'author': 'steam_id'})

    review_data['review'] = review_data['review'].apply(_flatten)

    return review_data
    
def display_reviews(data):
    """Print every entry of the ``review`` column, each followed by a blank line.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``review`` column.
    """
    # Iterate over the column values rather than looking rows up by the labels
    # 0..n-1, so filtered or sliced frames (whose index is no longer a default
    # range) are printed correctly too.
    for review in data['review']:
        print(review, '\n')
    
# update data (write-back)

In [3]:
# Seed both CSV stores; each initializer skips its file when it already exists.
for initialize in (initialize_cursor_data, initialize_raw_data):
    initialize()

In [16]:
# Fetch the next 10 batches, continuing from the last saved cursor.
new_data = get_data(10)

new_raw_data, new_cursors = new_data

# Persist the reviews BEFORE advancing the stored cursor. If the review write
# fails, the cursor file is unchanged and the same batches are fetched again on
# the next run; the opposite order would silently skip them.
update_raw_data(new_raw_data)
update_cursor_data(new_cursors)

In [24]:
# Summarize what has been collected so far: row count and vote split.
review_data = pd.read_csv('data/raw_data.csv')
row_count = len(review_data)
vote_counts = review_data['voted_up'].value_counts()

print("number of rows: {0} \n".format(row_count))
print("voted_up {0} \n".format(vote_counts))
# display_reviews(review_data)

number of rows: 5000 

voted_up True     4952
False      48
Name: voted_up, dtype: int64 



In [6]:
# Load the saved cursor history and show it as the cell's output.
cursor_data = pd.read_csv(filepath_or_buffer='data/cursor_data.csv')
cursor_data

Unnamed: 0,batch,cursor
0,0,*
1,1,AoIIQJ9rvHLPjDU%3D
2,2,AoIIQJ9c%2B3innVM%3D
3,3,AoIIQJ9c%2B3rV/UE%3D
4,4,AoIIQJ9c%2B3KPtTc%3D
5,5,AoIIQJ88kHbxmDM%3D
6,6,AoIIQJ80N3SrsTo%3D
7,7,AoIIQJ5qdHXvmzQ%3D
8,8,AoIIP0QPVnz0xroB
9,9,AoIIPywUpXuIpXQ%3D


In [None]:


"""TO DO
    clean data:
        map 'author' to 'steam_id'
        
        lowercase
        stemming/lemmatization
        stopword removal
        normalization
        noise removal
"""

In [None]:
# reviews_df.loc[reviews_df['voted_up'] == 0, 'review']
# reviews_df.loc[13, 'review']