In [5]:
import os.path
from pathlib import Path
import time
import requests
import json
import urllib.parse
import pandas as pd

In [13]:
def initialize_cursor_data():
    """Create ``data/cursor_data.csv`` with the starting cursor if it is missing.

    The seed file holds a single row: batch ``0`` with cursor ``'*'``.
    When the file already exists it is left untouched and a notice is printed.
    """
    file = Path('data/cursor_data.csv')

    if file.exists():
        print("initialization skipped - cursor data already exists")
        return

    # to_csv does not create missing directories, so make sure ``data/``
    # exists before the very first write.
    file.parent.mkdir(parents=True, exist_ok=True)

    cursor_data = pd.DataFrame({"batch": [0], "cursor": ['*']})
    cursor_data.to_csv(file, index=False)
        
def initialize_raw_data():
    """Create an empty, header-only ``data/raw_data.csv`` if it is missing.

    The header lists the review fields this notebook stores. When the file
    already exists it is left untouched and a notice is printed.
    """
    file = Path('data/raw_data.csv')

    if file.exists():
        print("initialization skipped - raw data already exists")
        return

    column_names = ['recommendationid', 'author', 'language', 'review', 'timestamp_created',
                    'timestamp_updated', 'voted_up', 'votes_up', 'votes_funny',
                    'weighted_vote_score', 'comment_count', 'steam_purchase',
                    'received_for_free', 'written_during_early_access']

    # to_csv does not create missing directories, so make sure ``data/``
    # exists before the very first write.
    file.parent.mkdir(parents=True, exist_ok=True)

    raw_data = pd.DataFrame(columns=column_names)
    raw_data.to_csv(file, index=False)
    

def get_next_cursor():
    """Return the most recently stored cursor from ``data/cursor_data.csv``.

    The value in the last row of the ``cursor`` column is returned as-is.
    """
    stored_cursors = pd.read_csv('data/cursor_data.csv')['cursor']
    return stored_cursors.iloc[-1]

def get_next_url(next_cursor, app_id=322330):
    """Build the Steam ``appreviews`` URL for the batch at ``next_cursor``.

    Args:
        next_cursor: Cursor string appended verbatim as the ``cursor`` query
            value. It must already be URL-safe; no quoting is applied here.
        app_id: Steam application id placed in the URL path. Defaults to
            322330, the id this notebook was originally hard-coded to.

    Returns:
        The full request URL as a string.
    """
    base_url = (f'https://store.steampowered.com/appreviews/{app_id}'
                '?json=1&day_range=9223372036854775807&cursor=')

    return base_url + next_cursor
    
def get_data(num_batches):
    """Fetch up to ``num_batches`` consecutive review batches.

    Fetching resumes from the cursor returned by ``get_next_cursor()``. Each
    response's cursor is URL-quoted and used for the following request.

    Args:
        num_batches: Maximum number of requests to issue.

    Returns:
        tuple: ``(review_data, cursor_data)``. ``review_data`` is a DataFrame
        with one row per fetched review (the known review columns first, any
        additional response fields after them). ``cursor_data`` is the list of
        URL-quoted cursors, one per batch that actually contained reviews.

    Raises:
        requests.HTTPError: If a response carries an error status code.
    """
    column_names = ['recommendationid', 'author', 'language', 'review', 'timestamp_created',
                    'timestamp_updated', 'voted_up', 'votes_up', 'votes_funny',
                    'weighted_vote_score', 'comment_count', 'steam_purchase',
                    'received_for_free', 'written_during_early_access']
    cursor_data = []
    batch_frames = []

    next_cursor = get_next_cursor()

    for _ in range(num_batches):
        next_url = get_next_url(next_cursor)

        # Fixed pause before every request (presumably to stay polite with
        # the API - the exact rate limit is not documented here).
        time.sleep(2)
        r = requests.get(next_url, timeout=30)
        r.raise_for_status()
        json_data = r.json()

        reviews = json_data['reviews']
        if not reviews:
            # An empty batch means there is nothing further to fetch from this
            # cursor; recording it would only store a cursor with no reviews.
            break

        batch_frames.append(pd.DataFrame(data=reviews))

        next_cursor = urllib.parse.quote(json_data['cursor'])
        cursor_data.append(next_cursor)

    # Concatenate once at the end: DataFrame.append no longer exists in
    # pandas 2.x, and growing a frame inside the loop is quadratic anyway.
    if batch_frames:
        review_data = pd.concat(batch_frames, ignore_index=True)
    else:
        review_data = pd.DataFrame(columns=column_names)

    # Keep the known columns first (and present even if a response omitted
    # them), followed by any extra fields the API returned.
    extra_columns = [name for name in review_data.columns if name not in column_names]
    review_data = review_data.reindex(columns=column_names + extra_columns)

    return review_data, cursor_data

def update_cursor_data(new_cursors):
    """Append ``new_cursors`` to ``data/cursor_data.csv``.

    The current file content is first copied to
    ``data/cursor_data_backup.csv``. New rows receive consecutive batch
    numbers continuing from the number of rows already stored.

    Args:
        new_cursors: Sequence of cursor strings, in fetch order.
    """
    cursor_data = pd.read_csv('data/cursor_data.csv')
    cursor_data.to_csv('data/cursor_data_backup.csv', index=False)

    if len(new_cursors) == 0:
        # Nothing to add; the stored file is already up to date.
        return

    initial_length = cursor_data.shape[0]
    final_length = initial_length + len(new_cursors)

    new_cursor_data = pd.DataFrame({'batch': range(initial_length, final_length),
                                    'cursor': list(new_cursors)})

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    # Empty frames are skipped so they cannot influence the result dtypes.
    frames = [frame for frame in (cursor_data, new_cursor_data) if not frame.empty]
    updated_cursor_data = pd.concat(frames, ignore_index=True)
    updated_cursor_data.to_csv('data/cursor_data.csv', index=False)
    
def update_raw_data(new_raw_data):
    """Append ``new_raw_data`` to ``data/raw_data.csv``.

    The current file content is first copied to ``data/raw_data_backup.csv``.
    Columns already in the file keep their order; columns that only exist in
    ``new_raw_data`` are added after them.

    Args:
        new_raw_data: DataFrame of newly fetched reviews.
    """
    raw_data = pd.read_csv('data/raw_data.csv')
    raw_data.to_csv('data/raw_data_backup.csv', index=False)

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    # Empty frames (e.g. the header-only file right after initialization) are
    # skipped so they cannot influence the result dtypes.
    frames = [frame for frame in (raw_data, new_raw_data) if not frame.empty]
    if frames:
        updated_raw_data = pd.concat(frames, ignore_index=True)
    else:
        updated_raw_data = raw_data

    # Restore the stored column layout in case an empty frame was skipped.
    new_columns = [name for name in new_raw_data.columns if name not in raw_data.columns]
    updated_raw_data = updated_raw_data.reindex(columns=list(raw_data.columns) + new_columns)

    updated_raw_data.to_csv('data/raw_data.csv', index=False)

def clean_review_data(data):
    """Reduce raw review data to ``steam_id``, ``review`` and ``voted_up``.

    Args:
        data: DataFrame with at least ``author``, ``review`` and ``voted_up``
            columns. ``author`` entries may be dicts (as parsed from the JSON
            response) or the string form of such a dict (what comes back
            after the frame has been written to and re-read from CSV).

    Returns:
        A new DataFrame; ``data`` itself is not modified. Newlines inside
        review text are replaced by single spaces, and missing reviews stay
        missing.
    """
    import ast

    def _steam_id(author):
        # A CSV round-trip turns the author dict into its string repr;
        # parse it back so both fresh and stored data are accepted.
        if isinstance(author, str):
            author = ast.literal_eval(author)
        return author.get('steamid')

    review_data = data[['author', 'review', 'voted_up']].copy()

    review_data['author'] = review_data['author'].apply(_steam_id)
    review_data = review_data.rename(columns={'author': 'steam_id'})

    # The vectorised .str accessor leaves missing reviews untouched, where a
    # per-element ``x.replace`` would fail on them.
    review_data['review'] = review_data['review'].str.replace('\n', ' ', regex=False)

    return review_data
    
def display_reviews(data):
    """Print every entry of ``data['review']``, each followed by a blank line.

    Args:
        data: DataFrame with a ``review`` column. Rows are printed in frame
            order regardless of how the frame is indexed.
    """
    # Iterate over the values rather than ``data.loc[0..n-1]``, which fails
    # once the frame no longer has a default 0..n-1 index (e.g. after a filter).
    for review in data['review']:
        print(review, '\n')
    
# update data (write-back)

In [7]:
# Make sure both on-disk stores exist; each initializer skips its file when
# it is already present.
for ensure_store in (initialize_cursor_data, initialize_raw_data):
    ensure_store()

initialization skipped - cursor data already exists
initialization skipped - raw data already exists


In [10]:
NUM_BATCHES = 5  # number of review batches to request in this run

new_data = get_data(NUM_BATCHES)
new_raw_data, new_cursors = new_data

# Persist the reviews BEFORE advancing the stored cursor. If the review write
# fails, the cursor file still points at the unsaved batch and a re-run
# fetches it again; the opposite order would skip those reviews for good.
update_raw_data(new_raw_data)
update_cursor_data(new_cursors)

In [20]:
# Load the stored reviews here so this cell also works on a fresh kernel
# instead of relying on a variable defined by a later cell.
review_data = pd.read_csv('data/raw_data.csv')

# True for every row whose review text already appeared in an earlier row.
is_duplicate_review = review_data.duplicated(subset='review', keep='first')

# Print the row positions of duplicates among the first 200 rows. Slicing
# (rather than indexing 0..199) keeps this safe when fewer rows are stored.
# The stray ``review_data['re']`` lookup that used to end this cell was
# dropped: no such column exists, so it could only raise a KeyError.
for position, is_duplicate in enumerate(is_duplicate_review.iloc[:200]):
    if is_duplicate:
        print(position)

52
127
152
154
192


In [12]:
REVIEW_PREVIEW_ROWS = 20  # cap on printed reviews; raise it to inspect more

review_data = pd.read_csv('data/raw_data.csv')

# Print only a preview: dumping every stored review floods the cell output
# and bloats the saved notebook.
display_reviews(review_data.head(REVIEW_PREVIEW_ROWS))

Survived a Deerclop raid
Killed by a random frog.........





10frograin/10 

Don't Starve is back is more fun than ever!
 
Here are the positives:

-Now you get to die with your friends!

-You can get murdered by your 'Friends'!

GREAT GAME      DORITO+MTN DEW/9,000

P.S. You can also be haunted by your friend and die of insanity! Well, you can be with your friend when you die... so it's ok! 

Favorite survival game. I almost won it once... but then I starved to death. 

Easily the most enjoyable game I have ever played. Each time you start a new world you make it slightly further than you did the last time only to die to a new aspect that you never knew before. Takes several times to get a mechanic, but once you do the you can finally start learning the other aspects of the game such as crafting and fighting. Playing with friends makes the game 1000x more enjoyable. This game is a deal breaker I see potential in losing friends over this game :p 11/10 would recommend 

Great for a gr

In [None]:


"""TO DO
    clean data:
        drop: 'recommendationid', 'language', 'timestamp_created', 'timestamp_updated',
              'votes_up', 'votes_funny', 'weighted_vote_score', 'comment_count', 
              'steam_purchase', 'received_for_free', 'written_during_early_access'
        
        map: 'author' to 'steamid' only
        
        get rid of newline characters(?)
        
    dataframe to save batch numbers and their cursors
"""

# Scratch request: fetch the first review batch (cursor '*') to inspect the
# raw response. A timeout keeps the cell from hanging indefinitely, and
# raise_for_status surfaces HTTP errors instead of parsing an error page.
r = requests.get('https://store.steampowered.com/appreviews/322330?json=1&cursor=*',
                 timeout=30)
r.raise_for_status()

json_data = r.json()

# The commented-out drafts that used to follow (building a DataFrame from
# json_data['reviews'], cleaning it, and requesting the next cursor) are
# superseded by get_data() and clean_review_data() defined in this notebook.

In [None]:
# reviews_df.loc[reviews_df['voted_up'] == 0, 'review']
# reviews_df.loc[13, 'review']