# **Install Library**

In [None]:
pip install tqdm

In [None]:
pip install google-play-scraper

In [None]:
pip install simager

In [None]:
pip install transformer

# **Import Library**

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from google_play_scraper import Sort, reviews, app

from huggingface_hub import notebook_login
from transformers import pipeline
from simager.preprocess import TextPreprocess

import dateutil.parser

# **Scraping Dataset**

### **Scraping Tokopedia**

In [None]:
# Request up to 150,000 Tokopedia reviews from the Indonesian store listing,
# newest first, with no star-rating filter.
result_tokopedia, continuation_token = reviews(
    'com.tokopedia.tkpd',
    lang='id',
    country='id',
    sort=Sort.NEWEST,
    count=150000,
    filter_score_with=None,
)

# Tabulate the reviews, order them by review time ('at') and tag every row
# with the marketplace it came from.
df_tokopedia = (
    pd.DataFrame(result_tokopedia)
    .sort_values(by='at')
    .assign(Apps='tokopedia')
)

In [None]:
# Save the scraped Tokopedia reviews under the file name that the labeling
# step reads back (df_tokopedia.csv). The previous "_150" suffix did not match
# that read, and the other marketplaces already use the plain df_<app>.csv form.
df_tokopedia.to_csv('/content/drive/MyDrive/Dataset/df_tokopedia.csv', index=False)

### **Scraping Lazada**

In [None]:
# Request up to 600,000 Lazada reviews from the Indonesian store listing,
# newest first, with no star-rating filter.
result_lazada, continuation_token = reviews(
    'com.lazada.android',
    lang='id',  # defaults to 'en'
    country='id',  # defaults to 'us'
    sort=Sort.NEWEST,  # defaults to Sort.NEWEST
    count=600000,  # defaults to 100
    filter_score_with=None,
)

# Tabulate the reviews and order them by review time ('at').
df_lazada = pd.DataFrame(result_lazada)
df_lazada = df_lazada.sort_values(['at'])
# The marketplace label lives in 'Apps' (capital A), the same column name the
# Tokopedia and Blibli frames use; a lowercase 'apps' column would end up as a
# separate, mostly-empty column once the frames are combined.
df_lazada['Apps'] = 'lazada'

In [None]:
# Write the scraped Lazada reviews to the dataset folder, without the row index.
df_lazada.to_csv(
    path_or_buf='/content/drive/MyDrive/Dataset/df_lazada.csv',
    index=False,
)

### **Scraping Blibli**

In [None]:
# Request up to 100,000 Blibli reviews from the Indonesian store listing,
# newest first, with no star-rating filter.
result_blibli, continuation_token = reviews(
    'blibli.mobile.commerce',
    lang='id',
    country='id',
    sort=Sort.NEWEST,
    count=100000,
    filter_score_with=None,
)

# Tabulate the reviews, order them by review time ('at') and tag every row
# with the marketplace it came from.
df_blibli = (
    pd.DataFrame(result_blibli)
    .sort_values(by='at')
    .assign(Apps='blibli')
)

In [None]:
# Write the scraped Blibli reviews to the dataset folder, without the row index.
df_blibli.to_csv(
    path_or_buf='/content/drive/MyDrive/Dataset/df_blibli.csv',
    index=False,
)

### **Scraping TikTok**

In [None]:
# Request up to 100,000 reviews of the TikTok Shop seller app from the
# Indonesian store listing, newest first, with no star-rating filter.
result_tiktok, continuation_token = reviews(
    'com.tiktokshop.seller',
    lang='id',  # defaults to 'en'
    country='id',  # defaults to 'us'
    sort=Sort.NEWEST,  # defaults to Sort.NEWEST
    count=100000,  # defaults to 100
    filter_score_with=None,
)

# Tabulate the reviews and order them by review time ('at').
df_tiktok = pd.DataFrame(result_tiktok)
df_tiktok = df_tiktok.sort_values(['at'])
# Tag the TikTok frame itself. The earlier version of this cell re-labelled
# df_blibli here, which left the TikTok rows without an 'Apps' value.
df_tiktok['Apps'] = 'tiktok'

In [None]:
# Write the scraped TikTok reviews to the dataset folder, without the row index.
df_tiktok.to_csv(
    path_or_buf='/content/drive/MyDrive/Dataset/df_tiktok.csv',
    index=False,
)

# **Data Labeling**

### **Load Text Pre-processing**

In [None]:
from simager.preprocess import TextPreprocess

# Names of the simager cleaning steps, in the order they are handed to
# TextPreprocess.
methods = [
    "rm_hastag", "rm_mention", "rm_nonascii", "rm_emoticons",
    "rm_html", "rm_url",
    "sparate_str_numb", "pad_punct", "rm_punct",
    "rm_repeat_char", "rm_repeat_word",
    "rm_numb", "rm_whitespace", "normalize",
]

# Cleaner object applied to each review's 'content' in the labeling cells.
cleaner = TextPreprocess(methods=methods)

### **Load Pre-trained Model**

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# The tokenizer and the sequence-classification model are loaded from the
# same model identifier.
sentiment_checkpoint = "sahri/indonesiasentiment"
tokenizer = AutoTokenizer.from_pretrained(sentiment_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(sentiment_checkpoint)

In [None]:
# Sentiment-analysis pipeline built from the model and tokenizer loaded above.
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

def predict_sentimen(text):
    """Return the label of the first prediction the ``nlp`` pipeline gives for ``text``."""
    return nlp(text)[0]["label"]

### **Labeling Tokopedia**

In [None]:
# Load the Tokopedia reviews from the dataset folder.
df_tokopedia = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Dataset/df_tokopedia.csv',
)

In [None]:
# Drop the listed columns; every other column is kept as loaded.
df_tokopedia = df_tokopedia.drop(
    columns=[
        'reviewId',
        'userImage',
        'thumbsUpCount',
        'reviewCreatedVersion',
        'replyContent',
    ],
)

In [None]:
# Cast the review time column to datetime64[ns].
df_tokopedia["at"] = df_tokopedia["at"].astype('datetime64[ns]')

# Parse the reply time only where the cell holds a string; any other value
# (such as a missing reply) becomes None.
df_tokopedia["repliedAt"] = df_tokopedia["repliedAt"].apply(
    lambda raw: None if not isinstance(raw, str) else dateutil.parser.parse(raw)
)

#### **Data Pre-processing**

In [None]:
# Run every review text through the cleaner and store the result back in 'content'.
df_tokopedia = df_tokopedia.assign(
    content=df_tokopedia['content'].apply(cleaner),
)

In [None]:
# Reviews whose text is empty after cleaning are marked as NaN and then
# dropped. The column is reassigned rather than calling
# replace(..., inplace=True) on the selected column: that chained in-place
# form warns on pandas 2.2 and does not write back to the DataFrame under
# Copy-on-Write.
df_tokopedia['content'] = df_tokopedia['content'].replace('', np.nan)
df_tokopedia = df_tokopedia.dropna(subset=['content'])

#### **Predict Sentimen**

In [None]:
# Label every cleaned review with predict_sentimen and keep the result in 'sentimen'.
df_tokopedia = df_tokopedia.assign(
    sentimen=df_tokopedia['content'].apply(predict_sentimen),
)

In [None]:
# Number of Tokopedia reviews per predicted label.
df_tokopedia.loc[:, 'sentimen'].value_counts()

In [None]:
# Write the labelled Tokopedia reviews to the dataset folder, without the row index.
df_tokopedia.to_csv(
    path_or_buf='/content/drive/MyDrive/Dataset/df_tokopedia_Sentimen.csv',
    index=False,
)

### **Labeling Lazada**

In [None]:
# Load the Lazada reviews from the dataset folder.
df_lazada = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Dataset/df_lazada.csv',
)

In [None]:
# Drop the listed columns; every other column is kept as loaded.
df_lazada = df_lazada.drop(
    columns=[
        'reviewId',
        'userImage',
        'thumbsUpCount',
        'reviewCreatedVersion',
        'replyContent',
    ],
)

In [None]:
# Cast the review time column to datetime64[ns].
df_lazada["at"] = df_lazada["at"].astype('datetime64[ns]')

# Parse the reply time only where the cell holds a string; any other value
# (such as a missing reply) becomes None.
df_lazada["repliedAt"] = df_lazada["repliedAt"].apply(
    lambda raw: None if not isinstance(raw, str) else dateutil.parser.parse(raw)
)

#### **Data Pre-processing**

In [None]:
# Run every review text through the cleaner and store the result back in 'content'.
df_lazada = df_lazada.assign(
    content=df_lazada['content'].apply(cleaner),
)

In [None]:
# Reviews whose text is empty after cleaning are marked as NaN and then
# dropped. The column is reassigned rather than calling
# replace(..., inplace=True) on the selected column: that chained in-place
# form warns on pandas 2.2 and does not write back to the DataFrame under
# Copy-on-Write.
df_lazada['content'] = df_lazada['content'].replace('', np.nan)
df_lazada = df_lazada.dropna(subset=['content'])

#### **Predict Sentimen**

In [None]:
# Label every cleaned review with predict_sentimen and keep the result in 'sentimen'.
df_lazada = df_lazada.assign(
    sentimen=df_lazada['content'].apply(predict_sentimen),
)

In [None]:
# Number of Lazada reviews per predicted label.
df_lazada.loc[:, 'sentimen'].value_counts()

In [None]:
# Write the labelled Lazada reviews to the dataset folder, without the row index.
df_lazada.to_csv(
    path_or_buf='/content/drive/MyDrive/Dataset/df_lazada_Sentimen.csv',
    index=False,
)

### **Labeling Blibli**

In [None]:
# Load the Blibli reviews from the dataset folder.
df_blibli = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Dataset/df_blibli.csv',
)

In [None]:
# Drop the listed columns; every other column is kept as loaded.
df_blibli = df_blibli.drop(
    columns=[
        'reviewId',
        'userImage',
        'thumbsUpCount',
        'reviewCreatedVersion',
        'replyContent',
    ],
)

In [None]:
# Cast the review time column to datetime64[ns].
df_blibli["at"] = df_blibli["at"].astype('datetime64[ns]')

# Parse the reply time only where the cell holds a string; any other value
# (such as a missing reply) becomes None.
df_blibli["repliedAt"] = df_blibli["repliedAt"].apply(
    lambda raw: None if not isinstance(raw, str) else dateutil.parser.parse(raw)
)

#### **Data Pre-processing**

In [None]:
# Run every review text through the cleaner and store the result back in 'content'.
df_blibli = df_blibli.assign(
    content=df_blibli['content'].apply(cleaner),
)

In [None]:
# Reviews whose text is empty after cleaning are marked as NaN and then
# dropped. The column is reassigned rather than calling
# replace(..., inplace=True) on the selected column: that chained in-place
# form warns on pandas 2.2 and does not write back to the DataFrame under
# Copy-on-Write.
df_blibli['content'] = df_blibli['content'].replace('', np.nan)
df_blibli = df_blibli.dropna(subset=['content'])

#### **Predict Sentimen**

In [None]:
# Label every cleaned review with predict_sentimen and keep the result in 'sentimen'.
df_blibli = df_blibli.assign(
    sentimen=df_blibli['content'].apply(predict_sentimen),
)

In [None]:
# Number of Blibli reviews per predicted label.
df_blibli.loc[:, 'sentimen'].value_counts()

In [None]:
# Write the labelled Blibli reviews to the dataset folder, without the row index.
df_blibli.to_csv(
    path_or_buf='/content/drive/MyDrive/Dataset/df_blibli_Sentimen.csv',
    index=False,
)

### **Labeling Tik-tok**

In [None]:
# Load the TikTok reviews from the dataset folder.
df_tiktok = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Dataset/df_tiktok.csv',
)

In [None]:
# Drop the listed columns; every other column is kept as loaded.
df_tiktok = df_tiktok.drop(
    columns=[
        'reviewId',
        'userImage',
        'thumbsUpCount',
        'reviewCreatedVersion',
        'replyContent',
    ],
)

In [None]:
# Cast the review time column to datetime64[ns].
df_tiktok["at"] = df_tiktok["at"].astype('datetime64[ns]')

# Parse the reply time only where the cell holds a string; any other value
# (such as a missing reply) becomes None.
df_tiktok["repliedAt"] = df_tiktok["repliedAt"].apply(
    lambda raw: None if not isinstance(raw, str) else dateutil.parser.parse(raw)
)

#### **Data Pre-processing**

In [None]:
# Run every review text through the cleaner and store the result back in 'content'.
df_tiktok = df_tiktok.assign(
    content=df_tiktok['content'].apply(cleaner),
)

In [None]:
# Reviews whose text is empty after cleaning are marked as NaN and then
# dropped. The column is reassigned rather than calling
# replace(..., inplace=True) on the selected column: that chained in-place
# form warns on pandas 2.2 and does not write back to the DataFrame under
# Copy-on-Write.
df_tiktok['content'] = df_tiktok['content'].replace('', np.nan)
df_tiktok = df_tiktok.dropna(subset=['content'])

#### **Predict Sentimen**

In [None]:
# Label every cleaned review with predict_sentimen and keep the result in 'sentimen'.
df_tiktok = df_tiktok.assign(
    sentimen=df_tiktok['content'].apply(predict_sentimen),
)

In [None]:
# Number of TikTok reviews per predicted label.
df_tiktok.loc[:, 'sentimen'].value_counts()

In [None]:
# Write the labelled TikTok reviews to the dataset folder, without the row index.
df_tiktok.to_csv(
    path_or_buf='/content/drive/MyDrive/Dataset/df_tiktok_Sentimen.csv',
    index=False,
)

# **Combine Datasets**

In [None]:
# Reload the four labelled review files, in the same order as the names on
# the left-hand side.
df_tokopedia, df_lazada, df_blibli, df_tiktok = (
    pd.read_csv(labelled_csv)
    for labelled_csv in (
        '/content/drive/MyDrive/Dataset/df_tokopedia_Sentimen.csv',
        '/content/drive/MyDrive/Dataset/df_lazada_Sentimen.csv',
        '/content/drive/MyDrive/Dataset/df_blibli_Sentimen.csv',
        '/content/drive/MyDrive/Dataset/df_tiktok_Sentimen.csv',
    )
)

In [None]:
# Stack the four labelled frames (Tokopedia, Blibli, Lazada, TikTok, in that
# order) into one frame with a fresh 0..n-1 index. DataFrame.append was
# removed in pandas 2.0; pd.concat with ignore_index=True yields the same
# rows and the same reset index.
df_final = pd.concat(
    [df_tokopedia, df_blibli, df_lazada, df_tiktok],
    ignore_index=True,
)

In [None]:
# Print a summary of the combined frame via DataFrame.info().
df_final.info()

In [None]:
# Cast the review time column to datetime64[ns].
df_final["at"] = df_final["at"].astype('datetime64[ns]')

# Parse the reply time only where the cell holds a string; any other value
# (such as a missing reply) becomes None.
df_final["repliedAt"] = df_final["repliedAt"].apply(
    lambda raw: None if not isinstance(raw, str) else dateutil.parser.parse(raw)
)

In [None]:
def tenure1(start_date, end_date):
    """Return the time from ``start_date`` to ``end_date`` in hours.

    Returns ``None`` when ``end_date`` is missing (``pd.isna`` is true).
    """
    if pd.isna(end_date):
        return None
    elapsed = end_date - start_date
    return elapsed / np.timedelta64(1, 'h')

In [None]:
def tenure2(start_date, end_date):
    """Return the time from ``start_date`` to ``end_date`` in minutes.

    Returns ``None`` when ``end_date`` is missing (``pd.isna`` is true).
    """
    if pd.isna(end_date):
        return None
    elapsed = end_date - start_date
    return elapsed / np.timedelta64(1, 'm')

In [None]:
def Replied(rp):
    """Return ``'Replied'`` when ``rp`` holds a value, ``'Not Replied'`` when it is missing."""
    return 'Not Replied' if pd.isna(rp) else 'Replied'

In [None]:
# Time between each review ('at') and its reply ('repliedAt'), computed row
# by row: once in hours (tenure1) and once in minutes (tenure2).
df_final['Reply Hours'] = df_final.apply(
    lambda review: tenure1(review['at'], review['repliedAt']),
    axis='columns',
)
df_final['Reply Minute'] = df_final.apply(
    lambda review: tenure2(review['at'], review['repliedAt']),
    axis='columns',
)

In [None]:
# Keep one decimal place in both reply-time columns.
df_final['Reply Hours'] = df_final['Reply Hours'].round(decimals=1)
df_final['Reply Minute'] = df_final['Reply Minute'].round(decimals=1)

In [None]:
# Flag each row as 'Replied' / 'Not Replied' depending on whether it has a
# 'Reply Hours' value.
df_final['Replied'] = df_final.apply(
    lambda review: Replied(review['Reply Hours']),
    axis='columns',
)

In [None]:
# Discard rows whose reply time is negative (reply earlier than the review).
# Rows without a reply stay, because NaN < 0 evaluates to False.
negative_reply_rows = df_final.index[df_final['Reply Minute'] < 0]
df_final = df_final.drop(index=negative_reply_rows)

In [None]:
# Write the combined, labelled dataset to the dataset folder, without the row index.
df_final.to_csv(
    path_or_buf='/content/drive/MyDrive/Dataset/df_sentimen_final.csv',
    index=False,
)