# Scraping the Google Play Store

## Part 1: Creating a data set containing basic information for twenty of the most popular personal finance apps

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import re

import matplotlib.pyplot as plt

from google_play_scraper import Sort, reviews, app

%matplotlib inline

In [2]:
# Choosing the apps we want data for
fin_apps = [
    'com.acorns.android',
    'com.mint',
    'com.personalcapital.pcapandroid',
    'com.quicken.acme',
    'com.youneedabudget.evergreen.app',
    'com.onedebit.chime',
    'digit.android',
    'com.meetalbert',
    'com.mobilecreditcards',
    'com.everydollar.android',
    'com.cleevio.spendee',
    'com.Splitwise.SplitwiseMobile',
    'com.pocketguard.android.app',
    'com.honeydue.honeydue',
    'com.zetahelp.zeta',
    'com.dayspringtech.envelopes',
    'co.dollarbird',
    'com.finicity.mvelopes',
    'com.turtlecorp.wally',
    'com.qapital',
]

# Scraping app info
apps_info = []

for ap in tqdm(fin_apps):
    info = app(ap, lang = 'en', country = 'us')
    del info['comments']
    apps_info.append(info)

100%|██████████| 20/20 [00:12<00:00,  1.57it/s]


In [3]:
# Abbreviating the App titles and then verifying the change
for i in range(len(apps_info)):
    shortened_title = re.split(' -|:| —| –', apps_info[i]['title'])[0]
    apps_info[i].update({'title': shortened_title})
    
    print(apps_info[i]['title'])

Acorns
Mint
Personal Capital
Simplifi
YNAB
Chime
Digit
Albert
NerdWallet
EveryDollar
Spendee
Splitwise
PocketGuard
Honeydue
Zeta Money Manager
Goodbudget
Dollarbird
Mvelopes Budget App
Wally
Qapital


In [4]:
# Converting the data in a Pandas data frame
apps_info = pd.DataFrame(apps_info)

apps_info.head()

Unnamed: 0,title,description,descriptionHTML,summary,summaryHTML,installs,minInstalls,score,ratings,reviews,...,adSupported,containsAds,released,updated,version,recentChanges,recentChangesHTML,editorsChoice,appId,url
0,Acorns,"Get started in minutes, & give your money a ch...","Get started in minutes, &amp; give your money ...",Acorns is how you save & invest for your futur...,Acorns is how you save &amp; invest for your f...,"5,000,000+",5000000,4.335278,154146,60971,...,,False,"Oct 7, 2014",1626460907,3.21.1,We made some small performance enhancements so...,We made some small performance enhancements so...,False,com.acorns.android,https://play.google.com/store/apps/details?id=...
1,Mint,<p>Experience a fresh way to manage money. Rea...,<p>Experience a fresh way to manage money. Rea...,"Mint - spend smarter, create budgets, manage b...","Mint - spend smarter, create budgets, manage b...","10,000,000+",10000000,4.523261,191904,68487,...,,False,"May 3, 2010",1626416409,8.12.0,- Bug fixes and improvements.,- Bug fixes and improvements.,True,com.mint,https://play.google.com/store/apps/details?id=...
2,Personal Capital,"<b>Monitor all your investments, retirement, a...","<b>Monitor all your investments, retirement, a...","Investing, Portfolio Management, Spending Trac...","Investing, Portfolio Management, Spending Trac...","500,000+",500000,4.391105,15234,4222,...,,False,"Nov 25, 2012",1626963094,10.3.2,Performance improvements.,Performance improvements.,False,com.personalcapital.pcapandroid,https://play.google.com/store/apps/details?id=...
3,Simplifi,Effortlessly manage your finances and track yo...,Effortlessly manage your finances and track yo...,"Reach savings goals, easily track budgets & ef...","Reach savings goals, easily track budgets &amp...","100,000+",100000,3.277228,742,257,...,,False,"Oct 1, 2019",1625867099,3.5.1,Bug fixes and improvements.,Bug fixes and improvements.,False,com.quicken.acme,https://play.google.com/store/apps/details?id=...
4,YNAB,Gain total control of your money with YNAB. Br...,Gain total control of your money with YNAB. Br...,"Reach goals, easily track every expense, and g...","Reach goals, easily track every expense, and g...","1,000,000+",1000000,4.357607,7690,4205,...,,False,"Oct 20, 2015",1626370523,7.0.6,• Fixed an issue where editing a target type i...,• Fixed an issue where editing a target type i...,False,com.youneedabudget.evergreen.app,https://play.google.com/store/apps/details?id=...


In [5]:
# Checking a list of the columns to find which ones can be removed
apps_info.columns

Index(['title', 'description', 'descriptionHTML', 'summary', 'summaryHTML',
       'installs', 'minInstalls', 'score', 'ratings', 'reviews', 'histogram',
       'price', 'free', 'currency', 'sale', 'saleTime', 'originalPrice',
       'saleText', 'offersIAP', 'inAppProductPrice', 'size', 'androidVersion',
       'androidVersionText', 'developer', 'developerId', 'developerEmail',
       'developerWebsite', 'developerAddress', 'privacyPolicy',
       'developerInternalID', 'genre', 'genreId', 'icon', 'headerImage',
       'screenshots', 'video', 'videoImage', 'contentRating',
       'contentRatingDescription', 'adSupported', 'containsAds', 'released',
       'updated', 'version', 'recentChanges', 'recentChangesHTML',
       'editorsChoice', 'appId', 'url'],
      dtype='object')

In [6]:
# Removing non-essential columns
apps_info.drop(['descriptionHTML', 
                'summary', 
                'summaryHTML', 
                'price',
                'free',
                'currency', 
                'sale', 
                'saleTime', 
                'originalPrice',                
                'saleText', 
                'offersIAP', 
                'inAppProductPrice', 
                'size',
                'androidVersion',
                'androidVersionText',
                'developerId',
                'developerEmail',
                'developerWebsite',
                'developerAddress',
                'privacyPolicy',
                'developerInternalID',
                'genreId',
                'icon',
                'headerImage',
                'screenshots',
                'video',
                'videoImage',
                'contentRatingDescription',
                'adSupported', 
                'containsAds', 
                'updated', 
                'version', 
                'recentChanges', 
                'recentChangesHTML', 
                'appId', 
                'url'], axis = 1, inplace = True)

In [7]:
# Writing the data frame to an excel file
apps_info.to_excel(r'GooglePlayFinancialApps.xlsx', index = False)

## Part 2: Scraping app reviews for the 20 apps in the original data set

In [8]:
# Sampling through the most helpful reviews with roughly the same number of reviews for each score
app_reviews = []

for ap in tqdm(fin_apps):
  for score in list(range(1, 6)):
    for sort_order in [Sort.MOST_RELEVANT, Sort.NEWEST]:
      rvs, _ = reviews(
        ap,
        lang='en',
        country='us',
        sort=sort_order,
        count= 200 if score == 3 else 100,
        filter_score_with=score
      )
      for r in rvs:
        r['sortOrder'] = 'most_relevant' if sort_order == Sort.MOST_RELEVANT else 'newest'
        r['appId'] = ap
      app_reviews.extend(rvs)
    
len(app_reviews)

100%|██████████| 20/20 [02:22<00:00,  7.12s/it]


19598

In [9]:
# Converting the data into a Pandas data frame
app_reviews = pd.DataFrame(app_reviews)

app_reviews.head()

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,sortOrder,appId
0,gp:AOqpTOG0Ub3UswTDZ42CIHXJGxX7Um5gSaNDUsa7ab6...,Jojo J,https://play-lh.googleusercontent.com/a-/AOh14...,"Beware of automatic charges to ""lite"" plan. I ...",1,197,3.21.1,2021-07-19 20:25:48,,NaT,most_relevant,com.acorns.android
1,gp:AOqpTOEpM15tecmZKMA2RIGrihHl44HgAQxsrYyjmVG...,Stephanie Lynn,https://play-lh.googleusercontent.com/a-/AOh14...,"I have to be honest, my experience with Acorns...",1,640,3.20.0,2021-07-12 13:29:16,We are very sorry to hear about your experienc...,2021-07-13 06:59:28,most_relevant,com.acorns.android
2,gp:AOqpTOF6L5Q9Uw5CO9Qjon3JTdFMef--dH2E7bQ8ZdU...,Lily Kutzuba,https://play-lh.googleusercontent.com/a-/AOh14...,F*** this platform. I got three friends to sig...,1,75,3.21.1,2021-07-19 19:15:02,"Hi Lily, we are sorry to hear that you feel th...",2018-09-21 07:12:44,most_relevant,com.acorns.android
3,gp:AOqpTOH6pjxUW07o_S2WhW_t1np9RaRzrT43apoq8YP...,Justins712,https://play-lh.googleusercontent.com/a/AATXAJ...,Do not open an account with acorns! It will ta...,1,196,3.20.0,2021-07-14 15:18:53,We're very sorry to hear about your experience...,2021-07-12 08:20:37,most_relevant,com.acorns.android
4,gp:AOqpTOE0mj4_C6rT9qKD1CS2hjc_S72z46gKUEc7Okf...,Day Love,https://play-lh.googleusercontent.com/a-/AOh14...,Had a terrible experience with their fraud dep...,1,132,3.20.0,2021-07-05 23:40:06,We really appreciate the feedback and apologiz...,2021-07-06 07:28:08,most_relevant,com.acorns.android


In [10]:
# Writing the complete data set to an excel file 
app_reviews.to_excel(r'GooglePlayAppReviews.xlsx')

In [11]:
# Creating separate data frames for one star and five star reviews
one_star = app_reviews[app_reviews['score'] == 1]
five_star = app_reviews[app_reviews['score'] == 5]

In [12]:
# Resetting the index for the new data frames
one_star.reset_index(drop = True, inplace = True)
five_star.reset_index(drop = True, inplace = True)

In [13]:
# Creating csv files containing only the text from the review itself for one star reviews
one_star_min = one_star['content']
one_star_min.to_csv(r'OneStarReviewOnly.csv')

In [14]:
# Creating csv files containing only the text from the review itself for five star reviews
five_star_min = five_star['content']
five_star_min.to_csv(r'FiveStarReviewOnly.csv')

In [15]:
# Function to remove non alpha numeric characters when iterating through our csv files
def strip_word(word):
    res = ''
    for char in word:
        if char.isalnum():
            res += char
    return res

In [16]:
# Counting the number of occurences for each five letter or longer word in our file
five_word_count_dict = dict()

with open('FiveStarReviewOnly.csv', 'r') as file:
    pieces = []
    for line in file:
        pieces += line.rstrip().split()
        
    words = [strip_word(word).lower() for word in pieces]
    
    for word in words:
        if word and len(word) >= 5:
            if word in five_word_count_dict:
                five_word_count_dict[word] += 1
            else:
                five_word_count_dict[word] = 1

In [21]:
# Sorting our results from most common to least common word and returning only the fifteen most common words
counter = 0

for word in sorted(five_word_count_dict, key = five_word_count_dict.get, reverse=True):
    if counter < 15:
        print(word, five_word_count_dict[word])
        counter += 1
    else:
        break

money 752
great 537
budget 465
budgeting 350
track 332
account 331
using 318
really 307
accounts 262
about 242
spending 240
would 236
finances 225
credit 213
expenses 210


In [23]:
# Repeating the process for the one star reviews
one_word_count_dict = dict()

with open('OneStarReviewOnly.csv', 'r') as file:
    pieces = []
    for line in file:
        pieces += line.rstrip().split()
        
    words = [strip_word(word).lower() for word in pieces]
    
    for word in words:
        if word and len(word) >= 5:
            if word in one_word_count_dict:
                one_word_count_dict[word] += 1
            else:
                one_word_count_dict[word] = 1

counter = 0                

for word in sorted(one_word_count_dict, key = one_word_count_dict.get, reverse=True):
    if counter < 15:
        print(word, one_word_count_dict[word])
        counter += 1
    else:
        break

account 1453
money 775
update 438
accounts 408
still 405
service 388
support 387
after 380
would 335
there 333
doesnt 316
because 315
transactions 306
customer 304
tried 303
