# Tokopedia Product Sentiment

## Data Loading

In [3]:
import re

import pandas as pd

# Read the line-delimited JSON file (lines=True: one JSON record per line) into a DataFrame.
df = pd.read_json(path_or_buf='data/reviews.jsonl', lines=True)
# Preview the first rows.
df.head()

Unnamed: 0,product_id,product_url,shop_id,shop_name,shop_url,review_id,star,review,source,variant_name,is_anonymous
0,,https://www.tokopedia.com/merriesofficial/flas...,,,https://www.tokopedia.com/merriesofficial,,5,"Barang yg dibeli sesuai , pengiriman cepat dan...",html,,
1,,https://www.tokopedia.com/merriesofficial/flas...,,,https://www.tokopedia.com/merriesofficial,,5,Barang Original!!! Packing sangat aman... Grea...,html,,
2,,https://www.tokopedia.com/merriesofficial/flas...,,,https://www.tokopedia.com/merriesofficial,,5,Pengiriman sangat cepat!!! pesan hari ini bes...,html,,
3,8536883143.0,https://www.tokopedia.com/winodofficialshop/wi...,11061624.0,winod,https://www.tokopedia.com/winodofficialshop,914575864.0,5,ini rekomen banget.. yg beli ga bakal nyesel.....,api,Black - 37,False
4,8536883143.0,https://www.tokopedia.com/winodofficialshop/wi...,11061624.0,winod,https://www.tokopedia.com/winodofficialshop,1243605012.0,5,"so cute, ukurannya pas, sendalnya ringan dan n...",api,Ivory - 38,True


In [4]:
# Count how many reviews carry each star rating.
df.star.value_counts()

star
5    3626
4     215
3      88
1      71
2      34
Name: count, dtype: int64

Kita hanya tertarik pada kolom `review` dan `star`. Maka buang kolom lain.

In [5]:
# Keep only the review text and its star rating; all other columns are discarded.
df = df[['review', 'star']]
df.head()

Unnamed: 0,review,star
0,"Barang yg dibeli sesuai , pengiriman cepat dan...",5
1,Barang Original!!! Packing sangat aman... Grea...,5
2,Pengiriman sangat cepat!!! pesan hari ini bes...,5
3,ini rekomen banget.. yg beli ga bakal nyesel.....,5
4,"so cute, ukurannya pas, sendalnya ringan dan n...",5


## Assessing Data

In [9]:
# Number of missing values per column.
df.isna().sum()

review    123
star        0
dtype: int64

In [18]:
# Drop rows with missing values. Reassign instead of using `inplace=True`:
# `df` was derived from a column selection, and mutating it in place can raise
# SettingWithCopyWarning. Reassignment also keeps the cell safe to re-run.
df = df.dropna()
df.head()

Unnamed: 0,review,star
0,"Barang yg dibeli sesuai , pengiriman cepat dan...",5
1,Barang Original!!! Packing sangat aman... Grea...,5
2,Pengiriman sangat cepat!!! pesan hari ini bes...,5
3,ini rekomen banget.. yg beli ga bakal nyesel.....,5
4,"so cute, ukurannya pas, sendalnya ringan dan n...",5


In [37]:
# Show every row whose review text occurs more than once
# (keep=False flags all occurrences, not only the later ones).
df[df.duplicated(subset='review', keep=False)]

Unnamed: 0,review,star
37,Good quality,5
38,Bagus sesuai ekspektasi,5
40,good,5
49,mantap,5
50,mantap,5
...,...,...
4016,Mantap gan barangnya 👍🏻💯\nSukses terus buatlap...,5
4017,Mantap gan barangnya 👍🏻💯\nSukses terus buatlap...,5
4023,Kualitasnya bagus dan sesuai deskripsi,5
4024,Kualitasnya bagus dan sesuai deskripsi,5


In [39]:
# Remove fully duplicated rows (all columns equal), keeping the first
# occurrence. Reassign rather than mutate in place so the cell does not modify
# a derived frame behind pandas' back and can be re-run safely.
df = df.drop_duplicates()

In [41]:
# Sanity check: number of fully duplicated rows still present.
df.duplicated().sum()

np.int64(0)

## Text Preprocessing

In [57]:
# Work on a copy so the text preprocessing below does not modify `df`.
clean_df = df.copy()

### Case Folding

In [58]:
# Lower-case every review with the vectorised string accessor instead of a
# Python-level lambda; bracket indexing makes the column assignment explicit.
clean_df['review'] = clean_df['review'].str.lower()
clean_df.head()

Unnamed: 0,review,star
0,"barang yg dibeli sesuai , pengiriman cepat dan...",5
1,barang original!!! packing sangat aman... grea...,5
2,pengiriman sangat cepat!!! pesan hari ini bes...,5
3,ini rekomen banget.. yg beli ga bakal nyesel.....,5
4,"so cute, ukurannya pas, sendalnya ringan dan n...",5


### Remove Special Characters

#### Numbers

In [59]:
# Inspect reviews that contain at least one digit.
clean_df[clean_df.review.str.contains(r'\d')]

Unnamed: 0,review,star
26,"kuliatas ok, pengiriman cepat respon penjual 100%",5
60,"si kecil cabe rawit, first impression pas di s...",5
67,"alhamdulillah paketnya sampe dgn selamat, coc...",5
69,order ke 2x nya karena sebagus itu hydra boost...,5
73,"udah order beerrr kali2. segar, bisa dipakai k...",5
...,...,...
3961,"sdh sampai, barang berfungsi dengan baik, hany...",5
3967,tolong sering-sering kasih diskon yaaaaaa kao....,5
3976,"meh meh meh, toko paling jarang diskon\nsekali...",5
4007,barang saya belum sampai sampai saat ini \nsdh...,1


In [63]:
import re


def preprocess_numbers(x):
    """Normalise numeric expressions in a review string.

    The steps run in this order:
      1. Price expressions (``100rb``, ``1.5jt``) become the token ``[PRICE]``.
      2. Simple arithmetic (``1+1``) is removed.
      3. A number glued to a following word is split (``2hari`` -> ``2 hari``).
      4. Reduplication shorthand is expanded (``masing2`` -> ``masing-masing``).
      5. Remaining standalone numbers are removed unless a time/quantity unit
         follows them (``5 tahun`` is kept).

    Args:
        x: Review text (the patterns are written for lower-case input).

    Returns:
        The processed text with leading and trailing whitespace stripped.
        Gaps left by removed numbers are not collapsed here.
    """
    # Pattern 1: Price expressions - convert to [PRICE] token
    price_pattern = r"\b\d+(?:[,.]\d+)*(?:rb|ribu|k|jt|juta|m|million)\b"
    x = re.sub(price_pattern, "[PRICE]", x)

    # Pattern 2: Simple math expressions - remove these
    math_pattern = r"\b\d+[\+\-\*\/]\d+\b"
    x = re.sub(math_pattern, "", x)

    # Pattern 3: Split number prefix from words (e.g., 2hari -> 2 hari).
    # The word part must be letters only: `\w` also matches digits, so
    # `(\d+)(\w+)` split multi-digit numbers ("15" -> "1 5") and the leading
    # digits were then lost in pattern 5.
    word_prefix_pattern = r"(\d+)([^\W\d_]+)"
    x = re.sub(word_prefix_pattern, r"\1 \2", x)

    # Pattern 4: Expand reduplication shorthand (e.g., masing2 -> masing-masing).
    # Letters only, so a plain number ending in 2 ("12") is never expanded.
    repeated_word_pattern = r"([^\W\d_]+)2\b"
    x = re.sub(repeated_word_pattern, r"\1-\1", x)

    # Pattern 5: Remove standalone numbers that aren't part of important expressions (time and/or quantity)
    # This should run last to avoid interfering with other patterns
    standalone_num_pattern = r"\b\d+\b(?!\s+(?:tahun|thn|bulan|bln|hari|hr|minggu|mgg|jam|jm|menit|mnt|detik|dtk|pcs|kantong|kantng|bungkus|box|paket|biji|buah))"
    x = re.sub(standalone_num_pattern, "", x)

    return x.strip()


# Apply the number normalisation to every review and store the result.
clean_df.review = clean_df.review.map(preprocess_numbers)


In [64]:
# Re-check which reviews still contain digits after number preprocessing.
clean_df[clean_df.review.str.contains(r'\d')]

Unnamed: 0,review,star
387,"minyak wangi paling favorit laahhh, harganya m...",5
1677,"👍👍👍👍👍👍👍👍👍👍👍\n1 hari uda nyampek dan aman,,karn...",5
2742,semoga awet 5 tahun,5
3222,buku nya tebel banget semoga bisa menambah ilm...,5


#### Punctuation

In [66]:
import string


def remove_punctuation(x):
    """Return ``x`` with every character listed in ``string.punctuation`` deleted.

    All other characters (letters, digits, whitespace, emoji, ...) are kept in
    their original order.
    """
    # Mapping each punctuation mark to None in a translation table deletes it.
    deletion_table = str.maketrans('', '', string.punctuation)
    return x.translate(deletion_table)

# Preview only: the result is displayed but not stored in `clean_df`.
clean_df.review.map(remove_punctuation)

0       barang yg dibeli sesuai  pengiriman cepat dan ...
1           barang original packing sangat aman great job
2       pengiriman sangat cepat pesan hari ini  besok ...
3       ini rekomen banget yg beli ga bakal nyesel ga ...
4       so cute ukurannya pas sendalnya ringan dan nya...
                              ...                        
4028                   mantuuulllllmulussssssemoga awettt
4029                       kulitas bagus pengiriman cepat
4030                                             the best
4032    bbbaaarrraaaaaannnnnggggggggggg bbbbaaaaaggggu...
4033    terimakasih brg datang tepat waktu dan trmksh ...
Name: review, Length: 3579, dtype: object

In [67]:
# Store the punctuation-free text back into the review column.
clean_df.review = clean_df.review.map(remove_punctuation)

#### Whitespaces

In [70]:
def remove_whitespaces(x):
    """Collapse every whitespace run in ``x`` to one space and trim both ends."""
    # Splitting on whitespace runs leaves empty pieces only at the edges;
    # dropping them and re-joining yields the trimmed, single-spaced text.
    pieces = re.split(r"\s+", x)
    return " ".join(piece for piece in pieces if piece)

# Preview only: the result is displayed but not stored in `clean_df`.
clean_df.review.map(remove_whitespaces)

0       barang yg dibeli sesuai pengiriman cepat dan h...
1           barang original packing sangat aman great job
2       pengiriman sangat cepat pesan hari ini besok s...
3       ini rekomen banget yg beli ga bakal nyesel ga ...
4       so cute ukurannya pas sendalnya ringan dan nya...
                              ...                        
4028                   mantuuulllllmulussssssemoga awettt
4029                       kulitas bagus pengiriman cepat
4030                                             the best
4032    bbbaaarrraaaaaannnnnggggggggggg bbbbaaaaaggggu...
4033    terimakasih brg datang tepat waktu dan trmksh ...
Name: review, Length: 3579, dtype: object

In [71]:
# Store the whitespace-normalised text back into the review column.
clean_df.review = clean_df.review.map(remove_whitespaces)

#### Emoji

Emoji are not removed outright because they can be helpful for sentiment analysis. For example, the heart emoji (❤️) signals a positive sentiment.

The only preprocessing needed for emoji is separation: any emoji that sticks to a word, is typed repeatedly, or is combined with other emoji is split off with spaces. This helps the tokenization step later.

In [74]:
import re
import emoji

def preprocess_emojis(x):
    """Separate emoji from neighbouring words and from each other with spaces.

    A single space is inserted:
      * before an emoji that directly follows an alphanumeric character or
        another emoji, and
      * after an emoji that is directly followed by an alphanumeric character.

    Emoji that are already delimited are left untouched, so the function is
    idempotent. Each match reported by ``emoji.emoji_list`` is treated as one
    unit.

    Args:
        x: Text that may contain emoji.

    Returns:
        The text with separating spaces added around emoji where needed.
    """
    pieces = []
    cursor = 0  # index in `x` just past the previously emitted emoji
    for match in emoji.emoji_list(x):
        start, end = match['match_start'], match['match_end']

        # Plain text between the previous emoji and this one.
        pieces.append(x[cursor:start])

        # Building a new string (instead of inserting into a list in place)
        # keeps `start`/`end` valid; the old in-place insert shifted the emoji
        # and made the "followed by a word" check look at the emoji itself.
        follows_emoji = start == cursor
        if start > 0 and (x[start - 1].isalnum() or follows_emoji):
            pieces.append(' ')

        pieces.append(x[start:end])

        if end < len(x) and x[end].isalnum():
            pieces.append(' ')

        cursor = end

    pieces.append(x[cursor:])
    return ''.join(pieces)


# Store the emoji-separated text. Previously the result was only displayed,
# so the emoji step never took effect in the later pipeline stages.
clean_df.review = clean_df.review.map(preprocess_emojis)
clean_df.review

0       barang yg dibeli sesuai pengiriman cepat dan h...
1           barang original packing sangat aman great job
2       pengiriman sangat cepat pesan hari ini besok s...
3       ini rekomen banget yg beli ga bakal nyesel ga ...
4       so cute ukurannya pas sendalnya ringan dan nya...
                              ...                        
4028                   mantuuulllllmulussssssemoga awettt
4029                       kulitas bagus pengiriman cepat
4030                                             the best
4032    bbbaaarrraaaaaannnnnggggggggggg bbbbaaaaaggggu...
4033    terimakasih brg datang tepat waktu dan trmksh ...
Name: review, Length: 3579, dtype: object

### Stopword Removal

In [79]:
import nltk
# Download the NLTK 'punkt_tab' tokenizer resource
# (presumably required by `word_tokenize` used below — confirm for your NLTK version).
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\vsefa\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [77]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from nltk.tokenize import word_tokenize

# Get the stop word collection shipped with Sastrawi.
factory = StopWordRemoverFactory()
stopwords = factory.get_stop_words()

In [80]:
def remove_stopwords(x):
    """Tokenise ``x`` and drop every token that appears in ``stopwords``.

    The lookup is done on the lower-cased token; tokens that survive keep
    their original form and are joined back together with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(x):
        if token.lower() in stopwords:
            continue  # stop word: skip it
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Preview only: the result is displayed but not stored in `clean_df`.
clean_df.review.map(remove_stopwords)

0       barang yg dibeli sesuai pengiriman cepat harga...
1           barang original packing sangat aman great job
2       pengiriman sangat cepat pesan hari besok great...
3       rekomen banget yg beli ga bakal nyesel ga nyan...
4       so cute ukurannya pas sendalnya ringan nyaman ...
                              ...                        
4028                   mantuuulllllmulussssssemoga awettt
4029                       kulitas bagus pengiriman cepat
4030                                             the best
4032    bbbaaarrraaaaaannnnnggggggggggg bbbbaaaaaggggu...
4033    terimakasih brg datang tepat waktu trmksh atas...
Name: review, Length: 3579, dtype: object

In [82]:
# Store the stop-word-free text back into the review column.
clean_df.review = clean_df.review.map(remove_stopwords)

### Stemming

In [81]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Build the Sastrawi stemmer.
# NOTE: this rebinds `factory`, which earlier held the StopWordRemoverFactory instance.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

Stemming with Sastrawi is quite slow, so the stemmer should be applied to the rows in parallel. We use `swifter` for this job.

In [84]:
import swifter

In [91]:
# Allow dask on string data and force parallel execution on the
# DataFrame-level swifter accessor.
# NOTE(review): the stemming cell calls `clean_df.review.swifter`, a Series
# accessor — confirm these DataFrame-level settings actually apply there.
clean_df.swifter.allow_dask_on_strings(enable=True)
clean_df.swifter.force_parallel(enable=True)


<swifter.swifter.DataFrameAccessor at 0x1d00ee1e7b0>

In [92]:
# Stem every review in parallel. The swifter options are chained on the Series
# accessor that performs the apply: options set on `clean_df.swifter` belong to
# the DataFrame accessor and are not picked up by `clean_df.review.swifter`
# (the recorded run fell back to "Pandas Apply").
clean_df.review = (
    clean_df.review.swifter
    .allow_dask_on_strings(enable=True)
    .force_parallel(enable=True)
    .apply(stemmer.stem)
)

Pandas Apply:   0%|          | 0/3579 [00:00<?, ?it/s]

In [93]:
# Display the stemmed reviews.
clean_df.review

0       barang yg beli sesuai kirim cepat harga masuk ...
1           barang original packing sangat aman great job
2           kirim sangat cepat pesan hari besok great jpb
3       rekomen banget yg beli ga bakal nyesel ga nyan...
4             so cute ukur pas sendal ringan nyaman pakai
                              ...                        
4028                   mantuuulllllmulussssssemoga awettt
4029                            kulitas bagus kirim cepat
4030                                             the best
4032    bbbaaarrraaaaaannnnnggggggggggg bbbbaaaaaggggu...
4033    terimakasih brg datang tepat waktu trmksh atas...
Name: review, Length: 3579, dtype: object

## Feature Extraction