In [11]:
import pandas as pd
from google_play_scraper import Sort, reviews

In [13]:
# Module-level accumulator: reviews_scraper() extends this list with every
# batch of review dicts it fetches. Re-running this cell resets it to empty.
app_reviews = []

def reviews_scraper(app: str, score: int, n_loops: int):
    """
    Scrape Google Play Store reviews for one app and one star rating, using
    the `google-play-scraper` package.
    GitHub: https://github.com/JoMingyu/google-play-scraper

    ### Arguments
    - `app`: the Google Play app id (package name) of the app we want to
      scrape, e.g. `'com.miHoYo.GenshinImpact'` (not a URL)
    - `score`: number of stars rated by users; only reviews with this score
      are requested
    - `n_loops`: the maximum number of pages to request per sort order, each
      page holding up to 200 reviews

    ### Returns
    Nothing. It extends the module-level `app_reviews` list with the scraped
    review dicts (each tagged with `sort_order` and `app_id`) and prints the
    size of every batch (for progress tracking). Paging for a sort order
    stops early once a request returns no reviews.
    """
    # Imported locally under an alias: a later cell in this notebook rebinds
    # the global name `reviews` to a DataFrame, which would otherwise make
    # this function fail with "'DataFrame' object is not callable" when re-run.
    from google_play_scraper import reviews as fetch_reviews

    # Collect both review types - 'most relevant' and 'newest'
    sort_orders = [(Sort.MOST_RELEVANT, 'most_relevant'), (Sort.NEWEST, 'newest')]

    for sort_order, sort_label in sort_orders:
        # Every sort order starts from its own first page.
        continuation_token = None

        for _ in range(n_loops):
            rvs, continuation_token = fetch_reviews(
                app,
                lang='en',
                country='my',
                sort=sort_order,
                count=200, # 200 is the maximum number of reviews per page supported by Google Play
                filter_score_with=score,
                # Resume crawling from where the previous request left off
                continuation_token=continuation_token,
            )

            # Tag each review so its origin survives the merge into one list.
            for r in rvs:
                r['sort_order'] = sort_label
                r['app_id'] = app

            app_reviews.extend(rvs)

            print('No. of reviews collected: ' + str(len(rvs)))

            # An empty batch means there is nothing further to page through;
            # stop instead of issuing more requests for this sort order.
            if not rvs:
                break

In [14]:
## Collect reviews rated 1 star: up to 5 pages of up to 200 reviews for each
## sort order (most relevant, then newest)
reviews_scraper(app='com.miHoYo.GenshinImpact', score=1, n_loops=5)

No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200


In [15]:
## Collect reviews rated 2 stars: up to 5 pages of up to 200 reviews for each
## sort order (most relevant, then newest)
reviews_scraper(app='com.miHoYo.GenshinImpact', score=2, n_loops=5)

No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200


In [16]:
## Collect reviews rated 3 stars: up to 5 pages of up to 200 reviews for each
## sort order (most relevant, then newest)
reviews_scraper(app='com.miHoYo.GenshinImpact', score=3, n_loops=5)

No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200


In [17]:
## Collect reviews rated 4 stars: up to 5 pages of up to 200 reviews for each
## sort order (most relevant, then newest)
reviews_scraper(app='com.miHoYo.GenshinImpact', score=4, n_loops=5)

No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200


In [18]:
## Collect reviews rated 5 stars: up to 5 pages of up to 200 reviews for each
## sort order (most relevant, then newest)
reviews_scraper(app='com.miHoYo.GenshinImpact', score=5, n_loops=5)

No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200


In [30]:
# Inspect a single scraped record (index 2000) to see which fields each
# review dict carries, including the `sort_order` and `app_id` tags added
# by reviews_scraper().
# NOTE(review): the displayed record contains a user name and avatar URL -
# consider clearing this output before sharing the notebook.
app_reviews[2000]

{'reviewId': '6310ac74-b6a1-46af-9973-dfdb1e297915',
 'userName': 'Joan Teo (Sora)',
 'userImage': 'https://play-lh.googleusercontent.com/a-/ALV-UjXwQT3M0thCmsrZgJcy5DSqwUjwcAL5ZdmfruPe8c_G0H2fjtT46A',
 'content': "overall the graphics, the lore, the characters' design, the overworld, everything is so beautiful to explore. been playing this game for 4 years but nowadays, i cant help but keep noticing patterns of hoyoverse's contstant colorism and cultural appropriations. this happened in the 4 major hit games such as this, hi3rd, h:sr and even the new game, zzz. this is genuinely concerning as a person of colour who is a minority that also studies world history and culture.",
 'score': 2,
 'thumbsUpCount': 0,
 'reviewCreatedVersion': '3.2.0_10872902_11078128',
 'at': datetime.datetime(2024, 7, 15, 1, 58, 31),
 'replyContent': None,
 'repliedAt': None,
 'appVersion': '3.2.0_10872902_11078128',
 'sort_order': 'most_relevant',
 'app_id': 'com.miHoYo.GenshinImpact'}

In [33]:
# Build a DataFrame from the accumulated list of review dicts
# (one row per scraped review, duplicates still included).
df_temp = pd.DataFrame(app_reviews)

In [37]:
# Check for duplicate reviews by reviewId: True counts rows whose reviewId
# already appeared in an earlier row.
# Presumably these arise because the same review can be returned under both
# the 'most relevant' and 'newest' sort orders - TODO confirm.
df_temp.duplicated(subset=['reviewId']).value_counts()

False    8832
True     1168
Name: count, dtype: int64

In [39]:
# De-duplicate on reviewId, keeping the first occurrence of each review.
# drop_duplicates() returns a new frame, so df_temp is left untouched and
# this cell can be re-run safely; ignore_index=True renumbers rows 0..n-1.
df_final = df_temp.drop_duplicates(subset=['reviewId'], keep='first', ignore_index=True)

In [40]:
# Final check: after de-duplication no reviewId should be flagged as a
# duplicate, so only a `False` count is expected.
df_final.duplicated(subset=['reviewId']).value_counts()

False    8832
Name: count, dtype: int64

In [42]:
# Save the de-duplicated reviews to a csv file (index=False: the row index
# is not written as a column)
df_final.to_csv('../data/genshin_impact_reviews.csv', index=False)

In [43]:
# Read the saved Genshin Impact reviews csv file back in
# Datetime parsing for 'at' and 'repliedAt' columns
# NOTE(review): this rebinds the name `reviews`, which was imported from
# google_play_scraper at the top of the notebook - from here on `reviews`
# is a DataFrame, not the scraping function. Consider a distinct name.
reviews = pd.read_csv('../data/genshin_impact_reviews.csv', parse_dates=['at','repliedAt'])

### 2.0 Simple Exploratory Data Analysis

In [44]:
# Column dtypes and non-null counts. Some null values in `appVersion`,
# `reviewCreatedVersion` and the developer-reply columns
# (`replyContent`, `repliedAt`)
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8832 entries, 0 to 8831
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   reviewId              8832 non-null   object        
 1   userName              8832 non-null   object        
 2   userImage             8832 non-null   object        
 3   content               8832 non-null   object        
 4   score                 8832 non-null   int64         
 5   thumbsUpCount         8832 non-null   int64         
 6   reviewCreatedVersion  6421 non-null   object        
 7   at                    8832 non-null   datetime64[ns]
 8   replyContent          1140 non-null   object        
 9   repliedAt             1140 non-null   datetime64[ns]
 10  appVersion            6421 non-null   object        
 11  sort_order            8832 non-null   object        
 12  app_id                8832 non-null   object        
dtypes: datetime64[ns](2), int64(2), object(9)

In [45]:
# Preview the first 5 rows of the loaded reviews
reviews.head(5)

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion,sort_order,app_id
0,12b832b8-a76a-4292-a916-cbda39384153,BAI Li,https://play-lh.googleusercontent.com/a-/ALV-U...,I've been playing this game for about two year...,1,277,5.0.0_26041933_26161852,2024-09-29 18:04:07,We sincerely apologize for your unsatisfactory...,2024-09-30 16:43:34,5.0.0_26041933_26161852,most_relevant,com.miHoYo.GenshinImpact
1,106b7231-3a41-454e-812b-6307445ed11a,ami amin,https://play-lh.googleusercontent.com/a-/ALV-U...,I was really looking forward to playing this g...,1,18,5.0.0_26041933_26161852,2024-09-29 09:55:17,,NaT,5.0.0_26041933_26161852,most_relevant,com.miHoYo.GenshinImpact
2,7f339f1c-a3ac-46ab-95b1-7a2991476824,Tahlia Tucker,https://play-lh.googleusercontent.com/a-/ALV-U...,The rating I could just give is a 1 since I ca...,1,17,5.0.0_26041933_26161852,2024-09-19 06:09:10,We sincerely apologize for your unsatisfactory...,2024-09-25 19:04:25,5.0.0_26041933_26161852,most_relevant,com.miHoYo.GenshinImpact
3,8f3659ca-da32-4350-ac17-dc78e0670766,liza,https://play-lh.googleusercontent.com/a/ACg8oc...,"The games good, yes but for some reason it's l...",1,186,5.1.0_27118081_27297621,2024-10-12 18:34:11,We sincerely apologize for your unsatisfactory...,2024-10-21 18:32:10,5.1.0_27118081_27297621,most_relevant,com.miHoYo.GenshinImpact
4,8b2bc51b-97b5-4b41-affe-8957d8772924,Mustapha Seven,https://play-lh.googleusercontent.com/a-/ALV-U...,"The game, the story and graphics are Very good...",1,235,5.1.0_27118081_27297621,2024-10-22 06:25:56,,NaT,5.1.0_27118081_27297621,most_relevant,com.miHoYo.GenshinImpact


In [46]:
# Summary statistics for the numeric (`score`, `thumbsUpCount`) and
# datetime (`at`, `repliedAt`) columns
reviews.describe()

Unnamed: 0,score,thumbsUpCount,at,repliedAt
count,8832.0,8832.0,8832,1140
mean,3.029212,20.254416,2024-02-17 12:45:34.339560704,2023-12-23 18:24:07.917543680
min,1.0,0.0,2020-09-28 07:11:26,2021-03-04 09:34:25
25%,2.0,0.0,2024-07-02 02:29:09.249999872,2023-01-26 21:42:28.249999872
50%,3.0,1.0,2024-08-29 16:50:57.500000,2024-09-09 08:53:52.500000
75%,4.0,3.0,2024-10-09 21:38:02.249999872,2024-10-22 11:11:30.249999872
max,5.0,4511.0,2024-10-28 06:15:33,2024-10-24 14:21:35
std,1.439469,136.425683,,


In [47]:
# Check how many reviews remain for each star rating after de-duplication,
# ordered by score (1 to 5)
reviews['score'].value_counts().sort_index()

score
1    1825
2    1644
3    1716
4    1742
5    1905
Name: count, dtype: int64