In [1]:
import pandas as pd
from google_play_scraper import Sort, reviews

### 1.0 Data Collection

We aim to collect user reviews of the [Genshin Impact app](https://play.google.com/store/apps/details?id=com.miHoYo.GenshinImpact&hl=en&gl=my) from Google Play, each consisting of a star rating and a written review. Ratings are on a 5-point scale, with 1 being the lowest score and 5 the highest score one can give. Since the goal of our project is to predict whether a review has a positive or negative sentiment based on its text, we scrape real user reviews from Google Play.

Here we employ the [Google-Play-Scraper](https://github.com/JoMingyu/google-play-scraper), which provides an API to crawl through Google Play.

The reviews were collected in batches, according to their scores (1-5). This was done in an attempt to achieve a balanced dataset with roughly the same number of reviews for each score.

Also, in order to gather both reviews that contain more text and reviews that were written recently, we set up the Google Play scraper to collect from both sort orders, 'Most relevant' and 'Newest' (which **may result in duplicate reviews**, but we'll handle them later).

In [2]:
# Module-level accumulator shared by every `reviews_scraper` call. Re-running a
# scraping cell appends to it again, so reset it before re-scraping from scratch.
app_reviews = []

def reviews_scraper(app: str, score: int, n_loops: int, batch_size: int = 200):
    """
    Scrape Google Play reviews with a given star rating into `app_reviews`.

    Uses google-play-scraper, whose GitHub can be found here:
    https://github.com/JoMingyu/google-play-scraper

    ### Arguments
    - `app`: the Google Play package id of the app (e.g. `'com.miHoYo.GenshinImpact'`),
      not its URL
    - `score`: number of stars rated by users; only reviews with this score are requested
    - `n_loops`: the maximum number of batches to request for each sort order
    - `batch_size`: the number of reviews requested per batch (default 200)

    ### Returns
    Nothing. It extends the `app_reviews` list with the scraped reviews, each
    tagged with `sort_order` and `app_id`, and prints the size of every batch
    (for progress tracking). Paging through a sort order stops early as soon as
    a batch comes back empty.
    """
    # Collect both review listings - 'most relevant' and 'newest'
    sort_orders = [(Sort.MOST_RELEVANT, 'most_relevant'), (Sort.NEWEST, 'newest')]

    for sort_order, sort_label in sort_orders:
        # Each sort order is paged independently, starting from its first page
        continuation_token = None

        for _ in range(n_loops):
            rvs, continuation_token = reviews(
                app,
                lang='en',
                country='my',
                sort=sort_order,
                count=batch_size,
                filter_score_with=score,
                continuation_token=continuation_token,  # To continue crawling from where it last left off
            )

            print(f'No. of reviews collected: {len(rvs)}')

            if not rvs:
                # Nothing more to fetch for this score and sort order, so
                # further requests would only return empty batches
                break

            for r in rvs:
                r['sort_order'] = sort_label
                r['app_id'] = app

            app_reviews.extend(rvs)

In [3]:
# Scrape the 1-star reviews: 5 batches for each of the two sort orders
reviews_scraper(
    app='com.miHoYo.GenshinImpact',
    score=1,
    n_loops=5,
)

No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200


In [4]:
# Scrape the 2-star reviews: 5 batches for each of the two sort orders
reviews_scraper(
    app='com.miHoYo.GenshinImpact',
    score=2,
    n_loops=5,
)

No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200


In [5]:
# Scrape the 3-star reviews: 5 batches for each of the two sort orders
reviews_scraper(
    app='com.miHoYo.GenshinImpact',
    score=3,
    n_loops=5,
)

No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200


In [6]:
# Scrape the 4-star reviews: 5 batches for each of the two sort orders
reviews_scraper(
    app='com.miHoYo.GenshinImpact',
    score=4,
    n_loops=5,
)

No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200


In [7]:
# Scrape the 5-star reviews: 5 batches for each of the two sort orders
reviews_scraper(
    app='com.miHoYo.GenshinImpact',
    score=5,
    n_loops=5,
)

No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200
No. of reviews collected: 200


In [8]:
# Inspect a single scraped record to see which fields a review carries.
# In the run shown here, index 2000 is the first record after the 2,000 one-star reviews.
sample_idx = 2000
app_reviews[sample_idx]

{'reviewId': '6310ac74-b6a1-46af-9973-dfdb1e297915',
 'userName': 'Joan Teo (Sora)',
 'userImage': 'https://play-lh.googleusercontent.com/a-/ALV-UjXwQT3M0thCmsrZgJcy5DSqwUjwcAL5ZdmfruPe8c_G0H2fjtT46A',
 'content': "overall the graphics, the lore, the characters' design, the overworld, everything is so beautiful to explore. been playing this game for 4 years but nowadays, i cant help but keep noticing patterns of hoyoverse's contstant colorism and cultural appropriations. this happened in the 4 major hit games such as this, hi3rd, h:sr and even the new game, zzz. this is genuinely concerning as a person of colour who is a minority that also studies world history and culture.",
 'score': 2,
 'thumbsUpCount': 0,
 'reviewCreatedVersion': '3.2.0_10872902_11078128',
 'at': datetime.datetime(2024, 7, 15, 1, 58, 31),
 'replyContent': None,
 'repliedAt': None,
 'appVersion': '3.2.0_10872902_11078128',
 'sort_order': 'most_relevant',
 'app_id': 'com.miHoYo.GenshinImpact'}

In [9]:
# One row per scraped review; the keys of each review dict become the columns
df_temp = pd.DataFrame(data=app_reviews)

In [10]:
# Scraping both the 'Most relevant' and 'Newest' listings can return the same review twice,
# so count how many rows repeat an earlier reviewId (True = duplicate)
is_repeat = df_temp.duplicated(subset=['reviewId'])
is_repeat.value_counts()

False    8849
True     1151
Name: count, dtype: int64

In [11]:
# Keep the first occurrence of every reviewId and renumber the rows from 0.
# drop_duplicates returns a new frame, so df_temp is left untouched and this
# cell can be re-run safely without an explicit copy or in-place mutation.
df_final = df_temp.drop_duplicates(subset=['reviewId'], keep='first', ignore_index=True)

In [12]:
# Final check: after de-duplication no reviewId should repeat (only False is expected)
remaining_repeats = df_final.duplicated(subset=['reviewId'])
remaining_repeats.value_counts()

False    8849
Name: count, dtype: int64

In [None]:
# Persist the de-duplicated reviews as a CSV file without the row index
# (the data was scraped on the 3rd of November 2024, hence the file name)
output_csv = '../data/genshin_impact_reviews_v03112024.csv'
df_final.to_csv(output_csv, index=False)

In [None]:
# Read the saved Genshin Impact reviews back in from the CSV file,
# parsing the 'at' and 'repliedAt' columns as datetimes.
# NOTE(review): this rebinds `reviews`, shadowing the `reviews` function imported from
# google_play_scraper; the scraping cells above fail if re-run in the same kernel after this cell.
reviews = pd.read_csv(
    '../data/genshin_impact_reviews_v03112024.csv',
    parse_dates=['at', 'repliedAt'],
)

### 2.0 Simple Exploratory Data Analysis

In [15]:
# Column dtypes and non-null counts: the app version, review-created version and
# developer reply columns (replyContent, repliedAt) contain null values
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8849 entries, 0 to 8848
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   reviewId              8849 non-null   object        
 1   userName              8849 non-null   object        
 2   userImage             8849 non-null   object        
 3   content               8849 non-null   object        
 4   score                 8849 non-null   int64         
 5   thumbsUpCount         8849 non-null   int64         
 6   reviewCreatedVersion  6450 non-null   object        
 7   at                    8849 non-null   datetime64[ns]
 8   replyContent          1086 non-null   object        
 9   repliedAt             1086 non-null   datetime64[ns]
 10  appVersion            6450 non-null   object        
 11  sort_order            8849 non-null   object        
 12  app_id                8849 non-null   object        
dtypes: datetime64[ns](2), int64(2), object(9)

In [16]:
# Preview the first five reviews
reviews.head(n=5)

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion,sort_order,app_id
0,9ebbaa68-df60-45ce-bd37-634ed84c4246,Konuchi A,https://play-lh.googleusercontent.com/a-/ALV-U...,Controller doesn't work. Game is unplayable. T...,1,15,5.0.0_26041933_26161852,2024-11-01 18:13:52,We sincerely apologize for your unsatisfactory...,2024-10-22 14:15:18,5.0.0_26041933_26161852,most_relevant,com.miHoYo.GenshinImpact
1,12b832b8-a76a-4292-a916-cbda39384153,BAI Li,https://play-lh.googleusercontent.com/a-/ALV-U...,I've been playing this game for about two year...,1,539,5.0.0_26041933_26161852,2024-09-29 18:04:07,We sincerely apologize for your unsatisfactory...,2024-09-30 16:43:34,5.0.0_26041933_26161852,most_relevant,com.miHoYo.GenshinImpact
2,7f339f1c-a3ac-46ab-95b1-7a2991476824,Tahlia Tucker,https://play-lh.googleusercontent.com/a-/ALV-U...,The rating I could just give is a 1 since I ca...,1,23,5.0.0_26041933_26161852,2024-09-19 06:09:10,We sincerely apologize for your unsatisfactory...,2024-09-25 19:04:25,5.0.0_26041933_26161852,most_relevant,com.miHoYo.GenshinImpact
3,8f3659ca-da32-4350-ac17-dc78e0670766,liza,https://play-lh.googleusercontent.com/a/ACg8oc...,"The games good, yes but for some reason it's l...",1,207,5.1.0_27118081_27297621,2024-10-12 18:34:11,We sincerely apologize for your unsatisfactory...,2024-10-21 18:32:10,5.1.0_27118081_27297621,most_relevant,com.miHoYo.GenshinImpact
4,8546656c-def9-45ae-9069-fdfeab5febba,Jian Carlo Cornelia,https://play-lh.googleusercontent.com/a-/ALV-U...,"This game is actually really good, the problem...",1,106,5.1.0_27118081_27297621,2024-10-13 09:38:50,We sincerely apologize for your unsatisfactory...,2024-10-22 11:11:42,5.1.0_27118081_27297621,most_relevant,com.miHoYo.GenshinImpact


In [17]:
# Summary statistics for the numeric and datetime columns
# (score, thumbsUpCount, at, repliedAt)
reviews.describe()

Unnamed: 0,score,thumbsUpCount,at,repliedAt
count,8849.0,8849.0,8849,1086
mean,3.029156,20.060685,2024-02-22 18:08:09.260820480,2023-12-09 07:15:38.496316928
min,1.0,0.0,2020-09-28 07:11:26,2021-03-04 09:34:25
25%,2.0,0.0,2024-07-06 01:14:17,2023-01-04 16:02:26.249999872
50%,3.0,1.0,2024-08-31 16:08:14,2024-07-26 15:20:55.500000
75%,4.0,3.0,2024-10-11 14:22:43,2024-10-21 18:32:23.750000128
max,5.0,4519.0,2024-11-02 10:57:59,2024-11-01 18:56:29
std,1.439893,135.978144,,


In [18]:
# Number of reviews retained for each star rating, listed from 1 to 5 stars
score_counts = reviews['score'].value_counts()
score_counts.sort_index()

score
1    1832
2    1642
3    1720
4    1746
5    1909
Name: count, dtype: int64