In [43]:
from collections import Counter
import re
import pandas as pd
import json


In [44]:
df = pd.read_csv('data/tokopedia_scrape_all.csv')
df.head()

Unnamed: 0,product_id,product_name,url,description,price,sold_count,rating,total_stock,category
0,2618801377,Piano Elektronik Payment Mr. D 2,https://www.tokopedia.com/miracle-piano-maestr...,pengiriman seluruh Indonesia Jakarta Merupak...,20300000,1,,9,Speaker
1,1418259858,mesin cuci lg 15kg f2515stgw(GARANSI RESMI),https://www.tokopedia.com/memory-elektronik-ja...,Tanyakan ketersediaan stock terlebih dahulu\nF...,11100000,1,5.0,5,Mesin Cuci
2,12881922184,Smart TV XIAOMI L32M8-A2ID 32 A Pro series,https://www.tokopedia.com/blessingcombali/smar...,WAJIB Membaca CATATAN TOKO\nWAJIB Video Unboxi...,1729800,215,5.0,38,Monitor Tabung
3,11108003873,AQUA Elektronik AQR-D225(MDS) 1 pintu 180L,https://www.tokopedia.com/rossielektroni/aqua-...,AQUA Elektronik AQR-D225 1Door 180 L\n\nAQR-D2...,1933000,11,5.0,11,Kulkas
4,100413508806,GARANSI 1 TAHUN! BARDI ZigBee Smart Gateway,https://www.tokopedia.com/technobitzzz/garansi...,BARDI Zigbee Gateway adalah jembatan antara pe...,469000,2,,398,Remote IO


In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1143 entries, 0 to 1142
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   product_id    1143 non-null   int64  
 1   product_name  1143 non-null   object 
 2   url           1143 non-null   object 
 3   description   1118 non-null   object 
 4   price         1143 non-null   int64  
 5   sold_count    1143 non-null   int64  
 6   rating        949 non-null    float64
 7   total_stock   1143 non-null   int64  
 8   category      1143 non-null   object 
dtypes: float64(1), int64(4), object(4)
memory usage: 80.5+ KB


## Most Common Words Data Scrape 1 (Product's Name, Description, adn Other Specification)

In [46]:
# --- load stopwords dari file JSON ---
with open("data/stopwords-id.json", "r", encoding="utf-8") as f:
    stopwords_id = set(json.load(f))  # pastikan bentuknya list/array di json

# --- kolom yang akan dianalisis ---
cols_to_check = ["product_id", "product_name", "description", 
                 "price", "sold_count", "rating", "total_stock", "category"]

def get_most_common_words(series, n=10):
    text = " ".join(series.dropna().astype(str))
    words = re.findall(r'\w+', text.lower())
    
    filtered = []
    for w in words:
        # skip angka murni
        if w.isdigit():
            continue
        # skip stopwords
        if w in stopwords_id:
            continue
        filtered.append(w)
    
    return Counter(filtered).most_common(n)

# --- loop tiap kolom ---
for col in cols_to_check:
    print(f"\n=== Most Common Words in '{col}' ===")
    common_words = get_most_common_words(df[col], n=10)
    for word, count in common_words:
        print(f"{word}: {count}")



=== Most Common Words in 'product_id' ===

=== Most Common Words in 'product_name' ===
wanita: 201
pria: 174
tas: 128
sekolah: 123
anak: 121
bayi: 115
celana: 112
paket: 100
premium: 90
olahraga: 89

=== Most Common Words in 'description' ===
cm: 1344
produk: 1331
x: 1006
bahan: 925
ukuran: 830
warna: 613
barang: 612
kulit: 560
size: 553
cocok: 513

=== Most Common Words in 'price' ===

=== Most Common Words in 'sold_count' ===

=== Most Common Words in 'rating' ===

=== Most Common Words in 'total_stock' ===

=== Most Common Words in 'category' ===
pria: 122
wanita: 116
android: 72
os: 72
celana: 60
tas: 54
bayi: 47
ransel: 45
alat: 43
kaos: 39


In [47]:
# --- kumpulkan semua kata ---
all_words = []
for col in cols_to_check:
    common_words = get_most_common_words(df[col], n=None)  # ambil semua kata
    all_words.extend(common_words)

# --- gabungkan count dari semua kolom ---
total_counter = Counter()
for word, count in all_words:
    total_counter[word] += count

# --- ambil top 300 ---
top_words = total_counter.most_common(500)

# --- jadikan DataFrame ---
df_top_words1 = pd.DataFrame(top_words, columns=['word', 'count'])
df_top_words1.head

<bound method NDFrame.head of              word  count
0              cm   1347
1          produk   1336
2               x   1009
3           bahan    963
4          ukuran    842
..            ...    ...
495       standar     52
496  membersihkan     52
497            wi     52
498            si     52
499     frekuensi     52

[500 rows x 2 columns]>

## Most Common Words Data Scrape 2 (Product's Reviews)

In [48]:
import os
os.listdir()

['data',
 'dictionary.ipynb',
 'README.md',
 'requirements.txt',
 'scraper_tokopedia.ipynb',
 'scraper_tokped.py',
 'userdata_shopee']

In [49]:
# --- load stopwords dari file json ---
with open("data/stopwords-id.json", "r", encoding="utf-8") as f:
    stopwords_id = set(json.load(f))

# --- load reviews ---
with open("data/reviews.json", "r", encoding="utf-8") as f:
    data = json.load(f)

texts = [d["message"] for d in data if "message" in d]

all_words = []
for text in texts:
    words = re.findall(r"\w+", text.lower())  # pecah kata
    for w in words:
        # skip angka murni
        if w.isdigit():
            continue
        # skip stopwords
        if w in stopwords_id:
            continue
        # skip token terlalu pendek (opsional)
        if len(w) == 1:
            continue
        all_words.append(w)

# hitung frekuensi
counter = Counter(all_words)
top_100 = counter.most_common(500)

# tampilkan
for word, freq in top_100:
    print(word, ":", freq)

barang : 108
sesuai : 105
nya : 101
bagus : 93
harga : 90
suara : 53
bass : 51
kualitas : 46
mantap : 44
cepat : 43
suaranya : 41
yg : 34
ya : 34
pengiriman : 33
lumayan : 31
produk : 31
ok : 27
banget : 25
terima : 24
barangnya : 22
gak : 22
seller : 21
berfungsi : 20
ga : 19
headset : 18
pesanan : 18
kasih : 18
packing : 18
aja : 17
deskripsi : 17
beli : 16
sebelah : 16
oke : 16
jernih : 15
bassnya : 15
respon : 15
thanks : 15
tp : 15
sih : 14
gan : 14
good : 14
murah : 13
kurir : 12
harganya : 12
dah : 12
amp : 11
udah : 11
pengirimannya : 11
puas : 11
dikirim : 11
makasih : 10
sukses : 10
terimakasih : 10
rapi : 10
aman : 10
awet : 10
pas : 10
diterima : 9
warna : 9
semoga : 9
nyampe : 8
nggak : 8
produknya : 8
proses : 8
merah : 8
hitam : 8
kabel : 8
pesan : 8
rapih : 8
segitu : 8
bgt : 8
top : 8
desainnya : 7
modern : 7
cempreng : 7
nyaman : 7
dipakai : 7
terjangkau : 7
gk : 7
toko : 7
kirim : 7
pokoknya : 7
tdk : 7
tokopedia : 7
mendem : 7
dipake : 7
cepet : 7
liger : 7
flash : 

In [50]:
# top_100 is a list of tuples (word, freq)
df_top_words2 = pd.DataFrame(top_100, columns=['word', 'count'])
df_top_words2.head(40)

Unnamed: 0,word,count
0,barang,108
1,sesuai,105
2,nya,101
3,bagus,93
4,harga,90
5,suara,53
6,bass,51
7,kualitas,46
8,mantap,44
9,cepat,43


## Most Common Words from both

In [51]:
# Gabungkan kedua DataFrame top words
df_top_words_combined = pd.concat([df_top_words1, df_top_words2])
df_top_words_combined = df_top_words_combined.groupby('word', as_index=False)['count'].sum()
df_top_words_combined = df_top_words_combined.sort_values(by='count', ascending=False).reset_index(drop=True)

df_top_words_combined.head(30)

Unnamed: 0,word,count
0,produk,1367
1,cm,1347
2,x,1009
3,bahan,966
4,ukuran,842
5,barang,722
6,warna,657
7,kulit,589
8,anak,571
9,size,569


## Detect Acronyms

In [58]:
import re
import pandas as pd

# === whitelist akronim/unit yang valid (boleh kamu tambah terus) ===
SEED = {
    # --- umum/teknis ---
    "sku","cod","oem","ori","sni","bpom","gsm",
    # power/elektronik
    "w","v","a","hz","mah","watt","volt","ampere",
    "usb","usbc","typec","hdmi","vga","wifi","bt","ble","nfc",
    "ram","rom","ssd","hdd","cpu","gpu",
    "lcd","led","oled","amoled","ips","hdr","fps","rgb","srgb","ppi","dpi","mp",
    "ip67","ip68","ipx4","ipx5","ipx6","ipx7","ipx8","pd","qc","dc","ac",
    "kb","mb","gb","tb",
    # ukuran/berat/volume
    "cm","mm","m","km","kg","g","gr","l","ml","oz","lb","lbs","psi",
    # fashion (size code)
    "xs","s","m","l","xl","xxl","xxxl","xxxs",
    # kecantikan/kesehatan/makanan
    "spf","uv","uva","uvb","ph","uht","bpa","ppm","mg","mcg","iu","kcal","cal",
    # hp/jaringan
    "5g","4g","lte","nfc","ir","nits",
    # kertas/alat sekolah
    "a3","a4","a5","b5","f4","folio","gsm"
}

RULES = [
    # huruf saja 2–5 -> harus ada di SEED (biar kata umum gak ikut)
    ("alpha_short_seeded", re.compile(r"^[a-z]{2,5}$")),

    # huruf+angka 2–8 (ip67, 1080p, 5g, 34b bra, a4, b5, 256gb, uk9)
    ("alnum_mixed", re.compile(r"^(?=.*[a-z])(?=.*\d)[a-z0-9]{2,8}$")),

    # angka + unit huruf (10cm, 15ml, 230v, 65w, 500gb)
    ("num_unit", re.compile(r"^\d{1,4}[a-z]{1,4}$")),

    # resolusi / label p (720p, 1080p, 4k -> dianggap alnum_mixed tapi ini khusus biar jelas)
    ("resolution_p", re.compile(r"^\d{3,4}p$|^[2348]k$")),

    # dimensi 2D/3D (20x30, 200x200x50)
    ("dimension", re.compile(r"^\d{1,4}x\d{1,4}(x\d{1,4})?$")),

    # ukuran fashion (override 1-huruf S/M/L) -> valid
    ("size_code", re.compile(r"^(xs|s|m|l|xl|xxl|xxxl|xxxs)$")),

    # rating proteksi air (ipx7, ipx8) -> kadang tidak di SEED semua
    ("ipx_rating", re.compile(r"^ipx[0-8]$")),
]

# (opsional) stopwords domain: kata umum e-commerce yang ingin dibuang
DOMAIN_STOP = {
    "bahan","warna","ukuran","size","anak","pria","wanita","bayi","toko",
    "alat","jam","aman","kaos","harga","wajah","berat","dada","lebar",
    "video","isi","baju","tas","paket","cocok","mudah","barang","produk"
}

def detect_candidates(df_words: pd.DataFrame, min_count: int = 20) -> pd.DataFrame:
    rows = []
    for _, row in df_words.iterrows():
        w = str(row["word"]).lower()
        c = int(row["count"])

        if c < min_count:
            continue
        if w in DOMAIN_STOP:
            continue
        # hanya token alnum (sudah cocok dengan cara kamu bikin df_top_words)
        if not re.fullmatch(r"[a-z0-9]+", w):
            continue

        matched = None
        for name, rx in RULES:
            if not rx.match(w):
                continue
            if name == "alpha_short_seeded" and w not in SEED:
                # huruf pendek wajib ada di SEED (hindari 'bahan', 'warna', dst)
                continue
            matched = name
            break

        # kalau belum match apa-apa & token 1 huruf (bukan S/M/L), skip
        if not matched:
            continue

        rows.append({"word": w, "count": c, "rule": matched})

    return (
        pd.DataFrame(rows)
        .sort_values(["count","word"], ascending=[False, True])
        .reset_index(drop=True)
    )

# pakai:
df_acronyms = detect_candidates(df_top_words_combined, min_count=20)
df_acronyms.head(40)


Unnamed: 0,word,count,rule
0,cm,1347,alpha_short_seeded
1,l,275,size_code
2,m,242,size_code
3,usb,221,alpha_short_seeded
4,kg,214,alpha_short_seeded
5,xl,205,alpha_short_seeded
6,ram,199,alpha_short_seeded
7,1x,182,alnum_mixed
8,s,176,size_code
9,bpom,149,alpha_short_seeded
