The notebook run with Python 3.11.10

In [12]:
# pip install google-play-scraper

### We import the `google_play_scraper` library to scrape data from the Google Play Store

### Getting the app id of the Application from the Play Store we want to fetch the reviews of.
- For our project, we chose the app Webex to scrape the data from.
- The app's link in the Play Store is `https://play.google.com/store/apps/details?id=com.cisco.webex.meetings`, and we're going to use its id `com.cisco.webex.meetings`.

In [13]:
# Play Store package id of the app to scrape; it is passed to `reviews(...)`
# and also used as the prefix of the output CSV file name.
app_id = 'com.cisco.webex.meetings'

In [14]:
# Report how many GPUs TensorFlow can see.
# NOTE(review): none of the scraping cells shown below use TensorFlow — confirm
# whether this check is needed in this notebook.
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


### All the modules and libraries needed for scraping and processing the data.

In [15]:
from google_play_scraper import Sort
from google_play_scraper.constants.element import ElementSpecs
from google_play_scraper.constants.regex import Regex
from google_play_scraper.constants.request import Formats
from google_play_scraper.utils.request import post
import pandas as pd
from tqdm import tqdm
import json
from time import sleep
from typing import List, Optional, Tuple

### Setting the maximum number of reviews to fetch in each request to 199.

In [16]:
# Upper bound on the number of reviews requested in a single POST; `reviews`
# clamps each fetch to this value and `reviews_all` uses it as its page size.
MAX_COUNT_EACH_FETCH = 199

### Defining a class `_ContinuationToken` to manage the token used for paginating through reviews
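* The token object bundles the pagination token with the request settings (language, country, sort order, count, and the score/device filters) so that a later call can resume with the same settings.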
* GitHub source: https://github.com/JoMingyu/google-play-scraper/blob/master/google_play_scraper/features/reviews.py

In [17]:
class _ContinuationToken:
    __slots__ = (
        "token",
        "lang",
        "country",
        "sort",
        "count",
        "filter_score_with",
        "filter_device_with",
    )

    def __init__(
        self, token, lang, country, sort, count, filter_score_with, filter_device_with
    ):
        self.token = token
        self.lang = lang
        self.country = country
        self.sort = sort
        self.count = count
        self.filter_score_with = filter_score_with
        self.filter_device_with = filter_device_with

### Defining a function `_fetch_review_items` that sends a POST request to fetch review items from the Google Play Store and parses the response
GitHub source: https://github.com/JoMingyu/google-play-scraper/blob/master/google_play_scraper/features/reviews.py

In [18]:
def _fetch_review_items(
    url: str,
    app_id: str,
    sort: int,
    count: int,
    filter_score_with: Optional[int],
    filter_device_with: Optional[int],
    pagination_token: Optional[str],
):
    """POST one review-page request and split the response into items and token.

    Args:
        url: Endpoint to post to (built by the caller).
        app_id: Play Store package id whose reviews are requested.
        sort: Numeric sort order.
        count: Number of reviews to request in this call.
        filter_score_with: Score filter; ``None`` is sent as the string "null".
        filter_device_with: Device filter; ``None`` is sent as the string "null".
        pagination_token: Token of the page to fetch, or ``None``.

    Returns:
        A 2-tuple ``(review_items, next_token)`` read from the decoded payload:
        its first element and the last element of its second-to-last element.

    Raises:
        IndexError, TypeError: if the response does not have the expected
            layout (for example no regex match, or a missing payload).
    """
    dom = post(
        url,
        Formats.Reviews.build_body(
            app_id,
            sort,
            count,
            "null" if filter_score_with is None else filter_score_with,
            "null" if filter_device_with is None else filter_device_with,
            pagination_token,
        ),
        {"content-type": "application/x-www-form-urlencoded"},
    )
    match = json.loads(Regex.REVIEWS.findall(dom)[0])

    # The payload is itself a JSON string; decode it once and read both the
    # review items and the next-page token from the same decoded object.
    payload = json.loads(match[0][2])

    return payload[0], payload[-2][-1]

### Defining a function `reviews` that fetches reviews for a given app ID, language, country, and other parameters. It handles pagination using the continuation token.
GitHub source: https://github.com/JoMingyu/google-play-scraper/blob/master/google_play_scraper/features/reviews.py

In [19]:
def reviews(
    app_id: str,
    lang: str = "en",
    country: str = "us",
    sort: Sort = Sort.MOST_RELEVANT,
    count: int = 100,
    filter_score_with: int = None,
    filter_device_with: int = None,
    continuation_token: _ContinuationToken = None,
    max_retries: int = 5,
) -> Tuple[List[dict], _ContinuationToken]:
    """Fetch up to ``count`` reviews for ``app_id``, optionally resuming.

    When ``continuation_token`` is given, ``lang``, ``country``, ``sort``,
    ``count`` and both filters are taken from the token and the corresponding
    arguments are ignored. A token whose ``token`` attribute is ``None``
    returns ``([], continuation_token)`` without any request.

    Args:
        app_id: Play Store package id.
        lang: Language code used to build the request URL.
        country: Country code used to build the request URL.
        sort: Sort order; its ``.value`` is sent in the request.
        count: Number of reviews wanted from this call.
        filter_score_with: Optional score filter.
        filter_device_with: Optional device filter.
        continuation_token: Token returned by a previous call, or ``None`` to
            start from the first page.
        max_retries: How many consecutive failed fetches of the same page
            (``TypeError``/``IndexError`` from ``_fetch_review_items``) are
            retried before paging is ended.

    Returns:
        ``(result, token)`` where ``result`` is a list of dicts with one entry
        per ``ElementSpecs.Review`` key, and ``token`` is a
        ``_ContinuationToken`` for the next call. Its ``token`` attribute is
        ``None`` when a list came back in the token position or when the
        retries were exhausted.
    """
    sort = sort.value

    if continuation_token is not None:
        token = continuation_token.token

        if token is None:
            return (
                [],
                continuation_token,
            )

        lang = continuation_token.lang
        country = continuation_token.country
        sort = continuation_token.sort
        count = continuation_token.count
        filter_score_with = continuation_token.filter_score_with
        filter_device_with = continuation_token.filter_device_with
    else:
        token = None

    url = Formats.Reviews.build(lang=lang, country=country)

    _fetch_count = count

    result = []

    # Consecutive failures for the page currently being fetched.
    failed_attempts = 0

    # `> 0` rather than `!= 0`: if more items than requested ever come back,
    # the remaining count goes negative and the loop must still stop.
    while _fetch_count > 0:
        if _fetch_count > MAX_COUNT_EACH_FETCH:
            _fetch_count = MAX_COUNT_EACH_FETCH

        try:
            review_items, token = _fetch_review_items(
                url,
                app_id,
                sort,
                _fetch_count,
                filter_score_with,
                filter_device_with,
                token,
            )
        except (TypeError, IndexError):
            # The failed call did not rebind `token`, so `continue` retries the
            # same page. This also works on a first call, where there is no
            # continuation token object to read from.
            failed_attempts += 1
            if failed_attempts > max_retries:
                # Give up: end paging but keep whatever was collected so far.
                token = None
                break
            continue

        failed_attempts = 0

        for review in review_items:
            result.append(
                {
                    k: spec.extract_content(review)
                    for k, spec in ElementSpecs.Review.items()
                }
            )

        _fetch_count = count - len(result)

        # A list in the token position is treated as "no further page".
        if isinstance(token, list):
            token = None
            break

    return (
        result,
        _ContinuationToken(
            token, lang, country, sort, count, filter_score_with, filter_device_with
        ),
    )

### Defining a function `reviews_all` that fetches all reviews for a given app ID by repeatedly calling the reviews function and handling pagination.
GitHub source: https://github.com/JoMingyu/google-play-scraper/blob/master/google_play_scraper/features/reviews.py

In [20]:
def reviews_all(app_id: str, sleep_milliseconds: int = 0, **kwargs) -> list:
    """Collect every review for ``app_id`` by paging through `reviews`.

    Args:
        app_id: Play Store package id.
        sleep_milliseconds: Pause between consecutive page requests, in
            milliseconds; ``0`` disables the pause.
        **kwargs: Forwarded to `reviews`. ``count`` and ``continuation_token``
            are discarded because paging is controlled here.

    Returns:
        A list with the review dicts of all pages, in fetch order.
    """
    # Paging is driven internally, so caller-supplied paging arguments are dropped.
    for reserved in ("count", "continuation_token"):
        kwargs.pop(reserved, None)

    collected = []
    next_token = None

    while True:
        page, next_token = reviews(
            app_id,
            count=MAX_COUNT_EACH_FETCH,
            continuation_token=next_token,
            **kwargs
        )
        collected.extend(page)

        # A token without a value means there is nothing left to fetch.
        if next_token.token is None:
            return collected

        if sleep_milliseconds:
            sleep(sleep_milliseconds / 1000)

### The target number of reviews to fetch

In [21]:
# Target number of reviews; the scraping loop stops once at least this many
# have been collected, so the final total can exceed it by part of a page.
reviews_count = 30000

* ### Initializing an empty list `result` and a `continuation_token`.
* ### Then we use a `tqdm` progress bar to fetch reviews in a loop until the target number of reviews is reached. The results are appended to the `result` list.
GitHub source: https://github.com/tqdm/tqdm

In [22]:
result = []
continuation_token = None


with tqdm(total=reviews_count, position=0, leave=True) as pbar:
    # Fetch page after page until the target is reached. The last page is
    # added whole, so len(result) may end up above reviews_count.
    while len(result) < reviews_count:
        # After the first call `reviews` reads lang, country, sort, count and
        # the filters from the continuation token, so the keyword arguments
        # below only take effect on the first request.
        new_result, continuation_token = reviews(
            app_id,
            continuation_token=continuation_token,
            lang='en',  # The language of review
            country='us',  # Country for which we want to scrape
            sort=Sort.MOST_RELEVANT,
            filter_score_with=None,
            count=MAX_COUNT_EACH_FETCH,  # one full page per call
        )
        # An empty page means there is nothing more to fetch.
        if not new_result:
            break
        result.extend(new_result)
        pbar.update(len(new_result))

  0%|          | 0/30000 [00:00<?, ?it/s]

30049it [02:31, 198.66it/s]                           


### We convert the `result` list into a pandas DataFrame for easier manipulation and analysis

In [23]:
# One row per scraped review dict; the dict keys become the columns.
df = pd.DataFrame(result)

### Displaying the columns of the DataFrame.

In [24]:
# Show which fields the scraper returned for each review.
df.columns

Index(['reviewId', 'userName', 'userImage', 'content', 'score',
       'thumbsUpCount', 'reviewCreatedVersion', 'at', 'replyContent',
       'repliedAt', 'appVersion'],
      dtype='object')

### We select specific columns from the DataFrame to keep more relevant information.

In [25]:
# Keep the review fields only; this drops 'userImage', 'replyContent' and 'repliedAt'.
# NOTE(review): 'userName' is kept and ends up in the saved CSV — confirm that is intended.
df = df[['reviewId', 'userName', 'content', 'score', 'thumbsUpCount', 'reviewCreatedVersion', 'at', 'appVersion']]

In [26]:
# Preview the first rows of the reduced frame.
df.head()

Unnamed: 0,reviewId,userName,content,score,thumbsUpCount,reviewCreatedVersion,at,appVersion
0,56598e3f-86f2-44ff-a561-4f6d58eb7173,Wendy Rowe,This is my second time trying it on my Android...,1,6,44.9.1,2024-10-16 18:12:56,44.9.1
1,06fd6a83-ea69-4a75-af45-93f62e4adc27,Robert Jim Fulner,My company recently update the version of the ...,3,12,44.7.1,2024-08-18 15:18:44,44.7.1
2,7c9e80a3-8d54-4c4e-be76-1a045bb4f73c,Pthom Thompson,Display icons on the screen that do not have l...,3,8,44.7.0,2024-07-19 16:40:28,44.7.0
3,bb2c1106-fd82-4650-91b3-1c17616fe61c,A Google user,"Rough, to say the least. Still, my cell was ul...",3,19,39.4.0,2019-06-14 20:40:45,39.4.0
4,de6c508b-7600-4804-81e4-9d647713272f,Holly Hill,This app worked very well for me. Clear video ...,5,122,44.4.0,2024-05-06 19:10:51,44.4.0


### Saving the DataFrame to a CSV file named after the app ID.

In [27]:
# Write the frame to '<app_id>_reviews.csv' (relative path) without the index column.
df.to_csv(f'{app_id}_reviews.csv', index=False)