# Tokopedia Product Sentiment

In [2]:
from tqdm.notebook import tqdm

tqdm.pandas()

## Data Loading

In [49]:
import pandas as pd

df = pd.read_json(path_or_buf='data/reviews.jsonl', lines=True)
df.head()

Unnamed: 0,product_id,product_url,shop_id,shop_name,shop_url,review_id,star,review,source,variant_name,is_anonymous
0,,https://www.tokopedia.com/merriesofficial/flas...,,,https://www.tokopedia.com/merriesofficial,,5,"Barang yg dibeli sesuai , pengiriman cepat dan...",html,,
1,,https://www.tokopedia.com/merriesofficial/flas...,,,https://www.tokopedia.com/merriesofficial,,5,Barang Original!!! Packing sangat aman... Grea...,html,,
2,,https://www.tokopedia.com/merriesofficial/flas...,,,https://www.tokopedia.com/merriesofficial,,5,Pengiriman sangat cepat!!! pesan hari ini bes...,html,,
3,8536883143.0,https://www.tokopedia.com/winodofficialshop/wi...,11061624.0,winod,https://www.tokopedia.com/winodofficialshop,914575864.0,5,ini rekomen banget.. yg beli ga bakal nyesel.....,api,Black - 37,False
4,8536883143.0,https://www.tokopedia.com/winodofficialshop/wi...,11061624.0,winod,https://www.tokopedia.com/winodofficialshop,1243605012.0,5,"so cute, ukurannya pas, sendalnya ringan dan n...",api,Ivory - 38,True


In [50]:
df.star.value_counts()

star
5    3626
4     215
3      88
1      71
2      34
Name: count, dtype: int64

## Assessing Data

In [52]:
df[['review', 'star']].isna().sum()

review    123
star        0
dtype: int64

In [54]:
# Discard rows that have no review text. The result is rebound to `df`
# instead of mutating the frame in place.
df = df.dropna(subset='review')
df[['review', 'star']].head()

Unnamed: 0,review,star
0,"Barang yg dibeli sesuai , pengiriman cepat dan...",5
1,Barang Original!!! Packing sangat aman... Grea...,5
2,Pengiriman sangat cepat!!! pesan hari ini bes...,5
3,ini rekomen banget.. yg beli ga bakal nyesel.....,5
4,"so cute, ukurannya pas, sendalnya ringan dan n...",5


In [56]:
df[df.duplicated(subset='review', keep=False)]['review']

37                                           Good quality
38                                Bagus sesuai ekspektasi
40                                                   good
49                                                 mantap
50                                                 mantap
                              ...                        
4016    Mantap gan barangnya 👍🏻💯\nSukses terus buatlap...
4017    Mantap gan barangnya 👍🏻💯\nSukses terus buatlap...
4023               Kualitasnya bagus dan sesuai deskripsi
4024               Kualitasnya bagus dan sesuai deskripsi
4031                                       sesuai pesanan
Name: review, Length: 524, dtype: object

In [57]:
# Keep only the first row for each distinct review text.
df = df.drop_duplicates(subset='review')

In [58]:
df.duplicated().sum()

np.int64(0)

## Text Preprocessing

Kita hanya tertarik pada kolom `review` dan `star`. Maka buang kolom lain.

In [63]:
clean_df = df[['review', 'star']].copy()

### Case Folding

In [65]:
# Case folding: lower-case every review string.
clean_df.review = clean_df.review.map(str.lower)
clean_df.head()

Unnamed: 0,review,star
0,"barang yg dibeli sesuai , pengiriman cepat dan...",5
1,barang original!!! packing sangat aman... grea...,5
2,pengiriman sangat cepat!!! pesan hari ini bes...,5
3,ini rekomen banget.. yg beli ga bakal nyesel.....,5
4,"so cute, ukurannya pas, sendalnya ringan dan n...",5


### Remove Special Characters

#### Numbers

In [67]:
clean_df[clean_df.review.str.contains(r'\d')]

Unnamed: 0,review,star
26,"kuliatas ok, pengiriman cepat respon penjual 100%",5
60,"si kecil cabe rawit, first impression pas di s...",5
67,"alhamdulillah paketnya sampe dgn selamat, coc...",5
69,order ke 2x nya karena sebagus itu hydra boost...,5
73,"udah order beerrr kali2. segar, bisa dipakai k...",5
...,...,...
3961,"sdh sampai, barang berfungsi dengan baik, hany...",5
3967,tolong sering-sering kasih diskon yaaaaaa kao....,5
3976,"meh meh meh, toko paling jarang diskon\nsekali...",5
4007,barang saya belum sampai sampai saat ini \nsdh...,1


In [None]:
import re


def preprocess_numbers(x):
    """Normalize numeric expressions in a review string.

    The steps run in this order:

    1. Price expressions (``50rb``, ``1.5jt``, ``100k``) become ``[PRICE]``.
    2. Simple arithmetic-looking expressions (``1+1``, ``2-3``) are removed.
    3. A number glued to a following word is split (``2hari`` -> ``2 hari``).
    4. The reduplication shorthand is expanded (``masing2`` -> ``masing-masing``).
    5. Remaining standalone numbers are removed unless a time or quantity
       unit follows them (``3 pcs``, ``2 hari``).

    Args:
        x: Review text (str). The patterns are lower-case, so the text is
            expected to be case-folded already.

    Returns:
        The transformed text with leading/trailing whitespace stripped.
        Inner gaps left by removed numbers are not collapsed here.
    """
    # Pattern 1: Price expressions - convert to [PRICE] token
    price_pattern = r"\b\d+(?:[,.]\d+)*(?:rb|ribu|k|jt|juta|m|million)\b"
    x = re.sub(price_pattern, "[PRICE]", x)

    # Pattern 2: Simple math expressions - remove these
    math_pattern = r"\b\d+[\+\-\*\/]\d+\b"
    x = re.sub(math_pattern, "", x)

    # Pattern 3: Split number prefix from words (e.g., 2hari -> 2 hari).
    # The word part must START with a letter: `\w` also matches digits, so a
    # plain `(\d+)(\w+)` would tear "12" apart into "1 2" and corrupt every
    # multi-digit number before Pattern 5 gets to see it.
    word_prefix_pattern = r"(\d+)([^\W\d_]\w*)"
    x = re.sub(word_prefix_pattern, r"\1 \2", x)

    # Pattern 4: Expand reduplication shorthand (e.g., masing2 -> masing-masing).
    # Only an all-letter word followed by a single "2" qualifies, so numbers
    # that merely end in 2 (e.g. "2022") are left alone.
    repeated_word_pattern = r"\b([^\W\d_]+)2\b"
    x = re.sub(repeated_word_pattern, r"\1-\1", x)

    # Pattern 5: Remove standalone numbers that aren't part of important expressions (time and/or quantity)
    # This should run last to avoid interfering with other patterns
    standalone_num_pattern = r"\b\d+\b(?!\s+(?:tahun|thn|bulan|bln|hari|hr|minggu|mgg|jam|jm|menit|mnt|detik|dtk|pcs|kantong|kantng|bungkus|box|paket|biji|buah))"
    x = re.sub(standalone_num_pattern, "", x)

    return x.strip()


clean_df.review = clean_df.review.map(preprocess_numbers)


In [69]:
clean_df[clean_df.review.str.contains(r'\d')]

Unnamed: 0,review,star
214,dikasih free 3 pcs masyaallah baik banget. pen...,5
228,sangat mengecewakan dan sangat tidak professio...,1
242,"sdh diterima br periksa lusin, cek semuanya p...",5
299,"kurang 2 kantong, mohon di cek kembali pas pac...",4
387,"minyak wangi paling favorit laahhh, harganya m...",5
...,...,...
3518,"kualitas suara & bluetooth bagus, tapi sayang ...",2
3525,"tws lenovo thinkplus selalu jadi pilihan, oran...",5
3589,"bintang ini utk pengiriman ya, utk kualitas b...",4
3662,ternyata dikirim dari ace artha gading yg jara...,2


#### Punctuation

In [71]:
import string


def remove_punctuation(x):
    """Return ``x`` with every character listed in ``string.punctuation`` deleted.

    Only ASCII punctuation is affected; all other characters are kept
    unchanged and in their original order.
    """
    # Translation table that maps each punctuation character to "delete".
    deletion_table = str.maketrans('', '', string.punctuation)
    return x.translate(deletion_table)

clean_df.review.map(remove_punctuation)

0       barang yg dibeli sesuai  pengiriman cepat dan ...
1           barang original packing sangat aman great job
2       pengiriman sangat cepat pesan hari ini  besok ...
3       ini rekomen banget yg beli ga bakal nyesel ga ...
4       so cute ukurannya pas sendalnya ringan dan nya...
                              ...                        
4028                   mantuuulllllmulussssssemoga awettt
4029                       kulitas bagus pengiriman cepat
4030                                             the best
4032    bbbaaarrraaaaaannnnnggggggggggg bbbbaaaaaggggu...
4033    terimakasih brg datang tepat waktu dan trmksh ...
Name: review, Length: 3565, dtype: object

In [72]:
clean_df.review = clean_df.review.map(remove_punctuation)

#### Whitespaces

In [74]:
def remove_whitespaces(x):
    """Collapse each run of whitespace in ``x`` to one space and trim both ends."""
    single_spaced = re.sub(r"\s+", " ", x)
    return single_spaced.strip()

clean_df.review.map(remove_whitespaces)

0       barang yg dibeli sesuai pengiriman cepat dan h...
1           barang original packing sangat aman great job
2       pengiriman sangat cepat pesan hari ini besok s...
3       ini rekomen banget yg beli ga bakal nyesel ga ...
4       so cute ukurannya pas sendalnya ringan dan nya...
                              ...                        
4028                   mantuuulllllmulussssssemoga awettt
4029                       kulitas bagus pengiriman cepat
4030                                             the best
4032    bbbaaarrraaaaaannnnnggggggggggg bbbbaaaaaggggu...
4033    terimakasih brg datang tepat waktu dan trmksh ...
Name: review, Length: 3565, dtype: object

In [75]:
clean_df.review = clean_df.review.map(remove_whitespaces)

#### Emoji

Emojis are not removed outright because they can carry sentiment. For example, a heart emoji (❤️) signals a positive sentiment.

The only preprocessing emojis need is to be separated from any word they are attached to. This helps the tokenization step later on.

In [80]:
import re
import emoji

def preprocess_emojis(x):
    """Separate emojis from alphanumeric characters they are attached to.

    A single space is inserted before an emoji whose preceding character is
    alphanumeric, and after an emoji whose following character is
    alphanumeric. Emojis themselves are never removed or altered.

    Args:
        x: Review text (str).

    Returns:
        The text with the padding spaces inserted.
    """
    # (start, end) offsets of every emoji, as reported by emoji.emoji_list.
    emoji_positions = [
        (match['match_start'], match['match_end'])
        for match in emoji.emoji_list(x)
    ]

    char_list = list(x)
    # Walk right-to-left so an insertion never shifts a span still to be visited.
    for start, end in sorted(emoji_positions, reverse=True):
        # Handle the trailing side FIRST. Inserting at `start` beforehand
        # would shift the emoji right by one, so `char_list[end]` would be
        # the emoji itself and the trailing space would never be added.
        if end < len(x) and char_list[end].isalnum():
            char_list.insert(end, ' ')

        # Leading side: everything left of `end` is still at its original index.
        if start > 0 and char_list[start - 1].isalnum():
            char_list.insert(start, ' ')

    return ''.join(char_list)


clean_df.review.map(preprocess_emojis)

0       barang yg dibeli sesuai pengiriman cepat dan h...
1           barang original packing sangat aman great job
2       pengiriman sangat cepat pesan hari ini besok s...
3       ini rekomen banget yg beli ga bakal nyesel ga ...
4       so cute ukurannya pas sendalnya ringan dan nya...
                              ...                        
4028                   mantuuulllllmulussssssemoga awettt
4029                       kulitas bagus pengiriman cepat
4030                                             the best
4032    bbbaaarrraaaaaannnnnggggggggggg bbbbaaaaaggggu...
4033    terimakasih brg datang tepat waktu dan trmksh ...
Name: review, Length: 3565, dtype: object

In [81]:
clean_df.review = clean_df.review.map(preprocess_emojis)

#### Repeated Letters

In [82]:
def normalize_repeated_letters(word):
    """Shrink any character repeated three or more times in a row to one occurrence.

    Runs of exactly two identical characters are left untouched.
    """
    long_run = re.compile(r'(.)\1{2,}')
    return long_run.sub(lambda run: run.group(1), word)

def normalize_review(text):
    """Apply ``normalize_repeated_letters`` to each whitespace-separated token.

    The processed tokens are re-joined with single spaces.
    """
    tokens = text.split()
    return ' '.join(map(normalize_repeated_letters, tokens))

clean_df.review = clean_df.review.map(normalize_review)

In [83]:
clean_df.review

0       barang yg dibeli sesuai pengiriman cepat dan h...
1           barang original packing sangat aman great job
2       pengiriman sangat cepat pesan hari ini besok s...
3       ini rekomen banget yg beli ga bakal nyesel ga ...
4       so cute ukurannya pas sendalnya ringan dan nya...
                              ...                        
4028                                mantulmulusemoga awet
4029                       kulitas bagus pengiriman cepat
4030                                             the best
4032                                    barang bagus puas
4033    terimakasih brg datang tepat waktu dan trmksh ...
Name: review, Length: 3565, dtype: object

### Stopword Removal

In [84]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [85]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from nltk.tokenize import word_tokenize

factory = StopWordRemoverFactory()
stopwords = factory.get_stop_words()

In [86]:
def remove_stopwords(x):
    """Tokenize ``x`` with ``word_tokenize`` and drop tokens found in ``stopwords``.

    Each token is lower-cased for the membership check only; surviving
    tokens keep their original form and are joined with single spaces.
    """
    surviving_tokens = filter(
        lambda token: token.lower() not in stopwords,
        word_tokenize(x),
    )
    return ' '.join(surviving_tokens)

clean_df.review.map(remove_stopwords)

0       barang yg dibeli sesuai pengiriman cepat harga...
1           barang original packing sangat aman great job
2       pengiriman sangat cepat pesan hari besok great...
3       rekomen banget yg beli ga bakal nyesel ga nyan...
4       so cute ukurannya pas sendalnya ringan nyaman ...
                              ...                        
4028                                mantulmulusemoga awet
4029                       kulitas bagus pengiriman cepat
4030                                             the best
4032                                    barang bagus puas
4033    terimakasih brg datang tepat waktu trmksh atas...
Name: review, Length: 3565, dtype: object

In [87]:
clean_df.review = clean_df.review.map(remove_stopwords)

### Stemming

In [89]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [90]:
# Stem every review; progress_map behaves like map but shows a tqdm progress bar.
clean_df.review = clean_df.review.progress_map(stemmer.stem)

  0%|          | 0/3565 [00:00<?, ?it/s]

In [91]:
clean_df.review

0       barang yg beli sesuai kirim cepat harga masuk ...
1           barang original packing sangat aman great job
2           kirim sangat cepat pesan hari besok great jpb
3       rekomen banget yg beli ga bakal nyesel ga nyan...
4             so cute ukur pas sendal ringan nyaman pakai
                              ...                        
4028                                mantulmulusemoga awet
4029                            kulitas bagus kirim cepat
4030                                             the best
4032                                    barang bagus puas
4033    terimakasih brg datang tepat waktu trmksh atas...
Name: review, Length: 3565, dtype: object

In [93]:
# Take the rows of `df` that are still present in `clean_df`, swap in the
# cleaned review text, and write them as a tab-separated file without the index.
(
    df.loc[clean_df.index]
    .assign(review=clean_df.review)
    .to_csv('data/clean_reviews.csv', index=False, sep='\t')
)

## Feature Extraction

In [2]:
# load the cleaned data if necessary
import pandas as pd

clean_df = pd.read_csv('data/clean_reviews.csv', sep='\t', usecols=['review', 'star'])
clean_df.head()

Unnamed: 0,star,review
0,5,barang yg beli sesuai kirim cepat harga masuk ...
1,5,barang original packing sangat aman great job
2,5,kirim sangat cepat pesan hari besok great jpb
3,5,rekomen banget yg beli ga bakal nyesel ga nyan...
4,5,so cute ukur pas sendal ringan nyaman pakai


In [3]:
clean_df.star.value_counts()

star
5    3219
4     196
3      75
1      50
2      25
Name: count, dtype: int64

In [4]:
from collections import Counter

# For each star rating (ascending), count whitespace-separated tokens across
# all non-missing reviews with that rating and keep the 10 most frequent.
star_levels = sorted(clean_df.star.unique())
top_words_per_star = {
    level: Counter(
        ' '.join(clean_df[clean_df.star == level].review.dropna()).split()
    ).most_common(10)
    for level in star_levels
}

top_words_per_star

{np.int64(1): [('barang', 21),
  ('kirim', 13),
  ('yg', 12),
  ('beli', 9),
  ('sangat', 8),
  ('produk', 8),
  ('gak', 8),
  ('seller', 8),
  ('sesuai', 7),
  ('baru', 7)],
 np.int64(2): [('kirim', 8),
  ('yg', 8),
  ('pesan', 5),
  ('baru', 5),
  ('pcs', 5),
  ('nya', 4),
  ('barang', 4),
  ('sampe', 4),
  ('kali', 4),
  ('gak', 4)],
 np.int64(3): [('kirim', 19),
  ('barang', 18),
  ('nya', 13),
  ('gak', 9),
  ('bagus', 9),
  ('yg', 9),
  ('lama', 9),
  ('kecil', 9),
  ('kurang', 8),
  ('sesuai', 8)],
 np.int64(4): [('barang', 51),
  ('kirim', 42),
  ('sesuai', 35),
  ('bagus', 35),
  ('nya', 30),
  ('yg', 26),
  ('lama', 24),
  ('cepat', 22),
  ('kurang', 20),
  ('harga', 20)],
 np.int64(5): [('barang', 749),
  ('bagus', 702),
  ('cepat', 664),
  ('sesuai', 623),
  ('kirim', 622),
  ('aman', 387),
  ('nya', 342),
  ('terima', 321),
  ('baik', 318),
  ('banget', 310)]}

In [5]:
import fasttext

# Load the fastText model stored in the local file 'cc.id.300.bin'.
# To download the model, run the two commented lines below from a terminal
# (Python console) rather than from this notebook: the download progress bar
# is handled badly by the notebook output and causes lag.
# import fasttext.util
# fasttext.util.download_model('id', if_exists='ignore')
ft = fasttext.load_model('cc.id.300.bin')

: 

In [None]:
ft.get_word_vector('yg')