In [1]:
import pandas as pd
import time

from datetime import datetime, timedelta
from google_play_scraper import Sort, reviews

In [2]:
# Plan:
# get all reviews, newest first, from today back to the first review dated the day before yesterday
# drop unused columns
# rename columns
# filter rows - remove reviews from today and reviews from the day before yesterday

# insert into the database

In [3]:
def get_extraction_is_complete(data):
    """Report whether a batch of reviews reaches back before yesterday.

    Parameters
    ----------
    data : pandas.DataFrame
        A batch of reviews. Non-empty batches must have an ``'at'`` column
        of timestamps.

    Returns
    -------
    bool
        ``True`` when at least one ``'at'`` value is earlier than midnight
        at the start of yesterday (computed from ``datetime.today()``), or
        when the batch is empty; ``False`` otherwise.
    """
    # An empty batch has no rows (and, when built from an empty result list,
    # no 'at' column either), so there is nothing further to page through.
    if data.empty:
        return True

    start_of_today = pd.Timestamp(datetime.today()).floor('d')
    start_of_yesterday = start_of_today - pd.Timedelta(days=1)
    # Vectorized comparison instead of a per-row lambda.
    return bool((data['at'] < start_of_yesterday).any())


def get_continue_extracting_reviews(data):
    """Return the logical negation of ``get_extraction_is_complete(data)``.

    A truthy result means the batch does not yet satisfy the completion
    check, so the caller should keep requesting further batches.
    """
    return not get_extraction_is_complete(data)


def extract_next_batch(continuation_token, app_id):
    """Fetch the batch of reviews that follows ``continuation_token``.

    Parameters
    ----------
    continuation_token
        Token returned by a previous ``reviews`` call.
    app_id : str
        Application id passed through to ``reviews``.

    Returns
    -------
    tuple
        ``(batch, token)`` where ``batch`` is a DataFrame built from the
        returned results and ``token`` is the continuation token returned
        by this call.
    """
    page, next_token = reviews(app_id, continuation_token=continuation_token)
    return pd.DataFrame(page), next_token


def extract_review_data(app_id='com.tgc.sky.android'):
    """Download reviews newest-first until a batch reaches back before yesterday.

    The first request asks ``reviews`` for 200 items sorted by
    ``Sort.NEWEST``; later batches are fetched with the continuation token
    via ``extract_next_batch``. Paging stops as soon as
    ``get_continue_extracting_reviews`` is false for the latest batch, or
    when a batch comes back empty.

    Parameters
    ----------
    app_id : str
        Application id to fetch reviews for.

    Returns
    -------
    pandas.DataFrame
        All fetched batches concatenated in fetch order with a fresh
        0..n-1 index.
    """
    results, continuation_token = reviews(
        app_id,
        sort=Sort.NEWEST,
        count=200
    )
    latest_batch = pd.DataFrame(results)

    # Collect batches and concatenate once: DataFrame.append was removed in
    # pandas 2.0 and re-copied the whole frame on every iteration.
    batches = [latest_batch]

    # An empty batch has no 'at' column, so it must end the loop before the
    # completion check is attempted on it.
    while not latest_batch.empty and get_continue_extracting_reviews(latest_batch):
        time.sleep(1)  # pause between consecutive requests
        latest_batch, continuation_token = extract_next_batch(continuation_token, app_id=app_id)
        if not latest_batch.empty:
            batches.append(latest_batch)

    return pd.concat(batches, ignore_index=True)

In [4]:
def drop_columns(data):
    """Return a copy of ``data`` without the reply and user-image columns.

    Removes ``'replyContent'``, ``'userImage'`` and ``'repliedAt'``; the
    input frame is not modified. Raises ``KeyError`` if any of them is
    missing.
    """
    unused = ['replyContent', 'userImage', 'repliedAt']
    return data.drop(unused, axis=1)


def rename_columns(data):
    """Return a copy of ``data`` with review columns renamed to snake_case.

    Columns are matched by name rather than by position, so the result does
    not depend on the order of the incoming columns. Columns that are not in
    the mapping are left unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Review data using the original column names
        (``reviewId``, ``userName``, ``content``, ``score``,
        ``thumbsUpCount``, ``reviewCreatedVersion``, ``at``).

    Returns
    -------
    pandas.DataFrame
        A new frame with the renamed columns.
    """
    new_columns = {
        'reviewId': 'review_id',
        'userName': 'user_name',
        'content': 'content',
        'score': 'rating',
        'thumbsUpCount': 'thumbs_up_count',
        'reviewCreatedVersion': 'created_for_version',
        'at': 'created_on',
    }
    renamed_data = data.rename(columns=new_columns)
    return renamed_data


def transform_review_data(data):
    """Apply the transform steps to raw review data.

    Runs ``drop_columns`` first and then ``rename_columns`` on its result,
    returning the final frame.
    """
    trimmed = drop_columns(data)
    return rename_columns(trimmed)
 

In [5]:
# Run the extraction with the default app id ('com.tgc.sky.android').
# This calls google_play_scraper's `reviews` and sleeps between batches.
review_data = extract_review_data()

In [6]:
# Preview the first three rows of the raw extracted reviews.
review_data.head(3)

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt
0,gp:AOqpTOEnkV1nptUStBfLu4JEDx7qsasAMNwEfVl4pNh...,S N,https://play-lh.googleusercontent.com/a/AATXAJ...,Explore beautiful realms in 3d. Fly. Swim. Roa...,5,0,,2021-05-06 22:01:01,,
1,gp:AOqpTOG4ZsqQeIvTd05hnHBLITyTfngb6HEPzFZYJES...,WolfGurl 101,https://play-lh.googleusercontent.com/a-/AOh14...,I love this game!,5,0,0.13.3 (165938),2021-05-06 21:46:58,,
2,gp:AOqpTOGuG6KdhHIs4sjF-zB7RI4YZ0rRwZIqHt6KgOG...,Frankie,https://play-lh.googleusercontent.com/a-/AOh14...,I've recently started playing the game and tha...,5,0,0.13.3 (165938),2021-05-06 21:43:44,,


In [7]:
# Drop the unused columns and rename the rest; `review_data` itself is left unchanged.
transformed_data = transform_review_data(review_data)

In [8]:
# Preview the first three rows after the column drop and rename.
transformed_data.head(3)

Unnamed: 0,review_id,user_name,content,rating,thumbs_up_count,created_for_version,created_on
0,gp:AOqpTOEnkV1nptUStBfLu4JEDx7qsasAMNwEfVl4pNh...,S N,Explore beautiful realms in 3d. Fly. Swim. Roa...,5,0,,2021-05-06 22:01:01
1,gp:AOqpTOG4ZsqQeIvTd05hnHBLITyTfngb6HEPzFZYJES...,WolfGurl 101,I love this game!,5,0,0.13.3 (165938),2021-05-06 21:46:58
2,gp:AOqpTOGuG6KdhHIs4sjF-zB7RI4YZ0rRwZIqHt6KgOG...,Frankie,I've recently started playing the game and tha...,5,0,0.13.3 (165938),2021-05-06 21:43:44
