# Scraping ChatGPT App Reviews Using Google Play Scraper

## Install Library

In [1]:
!pip install -q google-play-scraper

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.2/50.2 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h

## Import Library

In [2]:
# --- Standard library ---
import datetime as dt  # Date and time manipulation
import re  # Regular expressions
import string  # String constants such as punctuation
from collections import Counter

# --- Third-party: data handling and visualization ---
import numpy as np  # Numerical computing
import pandas as pd  # Data manipulation and analysis
import matplotlib.pyplot as plt  # Plotting
import seaborn as sns  # Statistical data visualization
from wordcloud import WordCloud  # Word-cloud visualization of text

# --- Third-party: scraping and text processing ---
# google_play_scraper gives access to app information and reviews on the Google Play Store.
from google_play_scraper import app, reviews, Sort, reviews_all
import emoji
import nltk
from nltk import pos_tag
from nltk.corpus import opinion_lexicon
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# from Sastrawi.Stemmer.StemmerFactory import StemmerFactory  # Stemming (affix removal) for Indonesian
# from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory  # Stop-word removal for Indonesian

# --- Configuration ---
pd.options.mode.chained_assignment = None  # Disable the pandas chained-assignment warning
seed = 0
np.random.seed(seed)  # Fix NumPy's global seed for reproducibility

# --- NLTK resources ---
# Each resource is requested exactly once (the original cell downloaded 'punkt' twice).
NLTK_RESOURCES = (
    'averaged_perceptron_tagger_eng',  # English-specific POS tagger
    'punkt',                           # Tokenization
    'punkt_tab',
    'stopwords',                       # Stop words
    'wordnet',                         # Lemmatization
    'averaged_perceptron_tagger',      # General POS tagger
    'opinion_lexicon',                 # Opinion lexicon corpus
)
for _resource in NLTK_RESOURCES:
    nltk.download(_resource)

# omw-1.4 is downloaded only when nltk.data.find() cannot locate it locally.
try:
    nltk.data.find('corpora/omw-1.4')
except LookupError:
    nltk.download('omw-1.4')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     /usr/share/nltk

## Scraper

In [3]:
# Fetch every review that reviews_all() returns for the given Google Play app ID.
CHATGPT_APP_ID = 'com.openai.chatgpt'  # Google Play package ID of ChatGPT

chatGPT_reviews = reviews_all(
    CHATGPT_APP_ID,
    sort=Sort.MOST_RELEVANT,
    lang='en',               # review language
    country='us',            # store country
    filter_score_with=None,  # no score filter: keep all ratings
)

In [4]:
# Build a DataFrame from the scraped review records.
raw_df = pd.DataFrame(data=chatGPT_reviews)

# Persist the untouched scrape to the Kaggle working directory (without the index column).
raw_df.to_csv(path_or_buf='/kaggle/working/chatGPT_reviews.csv', index=False)

# Show column names, non-null counts and dtypes of the raw data.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99000 entries, 0 to 98999
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   reviewId              99000 non-null  object        
 1   userName              99000 non-null  object        
 2   userImage             99000 non-null  object        
 3   content               99000 non-null  object        
 4   score                 99000 non-null  int64         
 5   thumbsUpCount         99000 non-null  int64         
 6   reviewCreatedVersion  95209 non-null  object        
 7   at                    99000 non-null  datetime64[ns]
 8   replyContent          190 non-null    object        
 9   repliedAt             190 non-null    datetime64[ns]
 10  appVersion            95209 non-null  object        
dtypes: datetime64[ns](2), int64(2), object(7)
memory usage: 8.3+ MB


## Preprocessing

### Cleaning

In [5]:
# Build clean_df from raw_df by removing fully duplicated rows and then dropping
# rows whose 'content' (the review text) is missing.
# The two steps are chained on purpose: the original cell re-derived clean_df from
# raw_df on the second line, which silently discarded the drop_duplicates() result.
# raw_df = pd.read_csv('/kaggle/working/tiktok_reviews_en.csv')
clean_df = (
    raw_df
    .drop_duplicates()
    .dropna(subset=['content'])
)
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99000 entries, 0 to 98999
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   reviewId              99000 non-null  object        
 1   userName              99000 non-null  object        
 2   userImage             99000 non-null  object        
 3   content               99000 non-null  object        
 4   score                 99000 non-null  int64         
 5   thumbsUpCount         99000 non-null  int64         
 6   reviewCreatedVersion  95209 non-null  object        
 7   at                    99000 non-null  datetime64[ns]
 8   replyContent          190 non-null    object        
 9   repliedAt             190 non-null    datetime64[ns]
 10  appVersion            95209 non-null  object        
dtypes: datetime64[ns](2), int64(2), object(7)
memory usage: 8.3+ MB


In [6]:
# Optional down-sampling step, currently disabled:
# clean_df = clean_df.sample(n=20000)

# Write the cleaned reviews to CSV in the current working directory (index column omitted).
clean_df.to_csv(path_or_buf='chatGPT_clean_reviews.csv', index=False)