In [42]:
from collections import Counter
import re, pandas as pd, json, os, pathlib
from collections import defaultdict

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stopwords_id = set(stopwords.words('indonesian'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aufii\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = pd.read_csv('data/tokopedia_500.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3972 entries, 0 to 3971
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   product_id    3972 non-null   int64  
 1   product_name  3972 non-null   object 
 2   url           3972 non-null   object 
 3   description   3868 non-null   object 
 4   price         3972 non-null   int64  
 5   sold_count    3969 non-null   float64
 6   rating        3380 non-null   float64
 7   total_stock   3969 non-null   float64
 8   category      3972 non-null   object 
 9   keyword       3972 non-null   object 
dtypes: float64(3), int64(2), object(5)
memory usage: 310.4+ KB


## Most Common Words Data Scrape 1 (Product's Name, Description, adn Other Specification)

In [4]:
# --- kolom yang akan dianalisis ---
cols_to_check = ["product_name", "description", 
                 "price", "sold_count", "rating", "total_stock", "category"]

def get_most_common_words(series, n=10):
    text = " ".join(series.dropna().astype(str))
    words = re.findall(r'\w+', text.lower())
    
    filtered = []
    for w in words:
        # skip angka murni
        if w.isdigit():
            continue
        # skip stopwords
        if w in stopwords_id:
            continue
        filtered.append(w)
    
    return Counter(filtered).most_common(n)

# --- loop tiap kolom ---
for col in cols_to_check:
    print(f"\n=== Most Common Words in '{col}' ===")
    common_words = get_most_common_words(df[col], n=10)
    for word, count in common_words:
        print(f"{word}: {count}")


=== Most Common Words in 'product_name' ===
wanita: 591
pria: 502
bayi: 438
anak: 399
sekolah: 376
set: 371
tas: 346
paket: 331
celana: 325
alat: 323

=== Most Common Words in 'description' ===
produk: 4491
cm: 4035
bahan: 3003
x: 2910
ukuran: 2555
warna: 1974
kulit: 1964
barang: 1768
cocok: 1735
size: 1659

=== Most Common Words in 'price' ===

=== Most Common Words in 'sold_count' ===

=== Most Common Words in 'rating' ===

=== Most Common Words in 'total_stock' ===

=== Most Common Words in 'category' ===
pria: 430
wanita: 391
android: 262
os: 262
celana: 213
bayi: 185
alat: 176
kaos: 166
tas: 166
ransel: 148


In [5]:
# --- kumpulkan semua kata ---
all_words = []
for col in cols_to_check:
    common_words = get_most_common_words(df[col], n=None)  # ambil semua kata
    all_words.extend(common_words)

# --- gabungkan count dari semua kolom ---
total_counter = Counter()
for word, count in all_words:
    total_counter[word] += count

# --- ambil top 1000 ---
top_words = total_counter.most_common(1000)

# --- jadikan DataFrame ---
df_top_words1 = pd.DataFrame(top_words, columns=['word', 'count'])
df_top_words1.head

<bound method NDFrame.head of               word  count
0           produk   4518
1               cm   4049
2            bahan   3135
3                x   2949
4           ukuran   2591
5            kulit   2070
6            warna   2040
7            cocok   1778
8           barang   1776
9            paket   1756
10            bayi   1725
11            size   1702
12          celana   1556
13            anak   1517
14      pengiriman   1448
15         lingkar   1435
16          wanita   1377
17           mudah   1367
18         garansi   1334
19            pria   1258
20          nyaman   1242
21             tas   1171
22            alat   1148
23        membantu   1139
24          sesuai   1097
25            aman   1087
26             jam   1061
27            baju   1051
28             air   1045
29           wajah   1039
30           video   1011
31        memiliki   1003
32           berat   1000
33            toko    989
34             set    986
35             isi    969
36      

cek beberapa kata

In [6]:
keywords = ["sd", "lp", "w", "it", "pp"]

# Gabungkan semua teks dari kolom-kolom yang dianalisis
all_texts = []
for col in cols_to_check:
    all_texts.extend(df[col].dropna().astype(str).tolist())

# Untuk setiap keyword, cari kalimat/fragmen yang mengandungnya
for kw in keywords:
    print(f"\n=== Contoh kemunculan '{kw}' ===")
    found = 0
    for t in all_texts:
        t_lower = t.lower()
        if kw in t_lower.split():
            print("-", t)
            found += 1
        elif f" {kw}" in t_lower or f"{kw} " in t_lower:
            print("-", t)
            found += 1
        if found >= 5:
            break
    if found == 0:
        print("Tidak ditemukan.")


=== Contoh kemunculan 'sd' ===
- Komputer All in One Baru, Intel Core i7 Gen ke-6, RAM 16GB + SSD 512GB Gratis Keyboard+mouse
- Set PC Komputer Intel Core i5-12400 Gen-12 RAM 8GB SSD || Frameless
- ASUS PC Desktop P500MV Intel Core i3 1315 RAM 8 GB SSD 512 GB Windows 11 + LED Monitor 21.5 Inch
- Set PC Slim Intel Core i5-14400 Gen-14 RAM 8GB SSD || Frameless
- Komputer Rakitan i5 3470 | Ram 8GB HDD 1TB SSD 120GB| Kantor Sekolah

=== Contoh kemunculan 'lp' ===
- COD Original JBL Phantom Wireless Bluetooth Headset LP35, Ringan dan Nyaman, Cocok untuk Ponsel Android dan iphone, Panggilan HD, Musik In-Ear Berkualitas Tinggi, Headphone Gaming Tidur,TWS
- UMUM
Tahun Rilis 2024 
Jaringan 2G, 3G, 4G, 5G 
SIM Card Single SIM 
eSIM Ya

BODY
Dimensi 160.9 x 77.8 x 7.8 mm 
Berat 199 gram
Ketahanan IP68 
Fitur Lainnya
- Tahan debu seluruh sisi dan tahan air hingga kedalaman 6 m selama 30 menit
- Material: kaca buatan Corning (bagian depan dan belakang), aluminium (frame)

LAYAR UTAMA
Jenis Super R

## Most Common Words Data Scrape 2 (Product's Reviews)

In [7]:
import os
os.listdir()

['data',
 'dictionary.ipynb',
 'README.md',
 'requirements.txt',
 'scraper_tokopedia.ipynb',
 'scraper_tokped.py']

In [8]:
# --- load reviews dari CSV ---
df_reviews = pd.read_csv("data/reviews_only.csv", encoding="utf-8")
texts = df_reviews["review"].dropna().astype(str).tolist()

In [9]:
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1676 entries, 0 to 1675
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  1676 non-null   object
dtypes: object(1)
memory usage: 13.2+ KB


In [10]:
all_words = []
for text in texts:
    words = re.findall(r"\w+", text.lower())
    for w in words:
        if w.isdigit():
            continue
        if w in stopwords_id:
            continue
        if len(w) == 1:
            continue
        all_words.append(w)

counter = Counter(all_words)
top_100 = counter.most_common(500)

for word, freq in top_100:
    print(word, ":", freq)


bagus : 573
nya : 466
sesuai : 419
barang : 380
cepat : 294
pengiriman : 241
banget : 232
yg : 224
produk : 207
aman : 199
semoga : 197
bahan : 194
seller : 172
kualitas : 161
beli : 136
pesanan : 132
pas : 130
harga : 117
enak : 111
ya : 109
cocok : 108
packing : 104
ga : 100
awet : 98
terimakasih : 97
bgt : 97
terima : 94
rapi : 93
bahannya : 93
warna : 92
alhamdulillah : 91
kasih : 88
mantap : 88
paket : 87
toko : 86
gak : 81
suka : 80
amanah : 74
respon : 72
dgn : 71
barangnya : 70
ramah : 70
oke : 68
order : 68
ukuran : 68
berfungsi : 67
ok : 65
udah : 63
jg : 61
anak : 61
makasih : 60
lumayan : 60
kurir : 58
tp : 56
tebal : 55
ori : 54
deskripsi : 54
sdh : 52
diterima : 52
gambar : 50
coba : 50
puas : 50
sukses : 48
rapih : 48
adem : 46
gk : 45
aja : 44
kali : 44
thanks : 44
lembut : 44
fast : 43
sampe : 42
tas : 42
nyaman : 41
murah : 40
pokoknya : 40
cepet : 40
ny : 39
next : 39
pake : 38
hp : 38
tipis : 38
langsung : 37
asli : 37
keren : 37
udh : 37
sih : 36
original : 35
amp 

In [11]:
# top_100 is a list of tuples (word, freq)
df_top_words2 = pd.DataFrame(top_100, columns=['word', 'count'])
df_top_words2.head(40)

Unnamed: 0,word,count
0,bagus,573
1,nya,466
2,sesuai,419
3,barang,380
4,cepat,294
5,pengiriman,241
6,banget,232
7,yg,224
8,produk,207
9,aman,199


## Most Common Words from both

In [12]:
# Gabungkan kedua DataFrame top words
df_top_words_combined = pd.concat([df_top_words1, df_top_words2])
df_top_words_combined = df_top_words_combined.groupby('word', as_index=False)['count'].sum()
df_top_words_combined = df_top_words_combined.sort_values(by='count', ascending=False).reset_index(drop=True)

df_top_words_combined.head(30)

Unnamed: 0,word,count
0,produk,4725
1,cm,4060
2,bahan,3329
3,x,2949
4,ukuran,2659
5,barang,2156
6,warna,2132
7,kulit,2104
8,cocok,1886
9,paket,1843


## **Detect Acronyms**

In [13]:
import re
import pandas as pd

# === whitelist akronim/unit yang valid (boleh kamu tambah terus) ===
SEED = {
    # --- umum / teknis ---
    "sku", "cod", "oem", "ori", "sni", "bpom", "oem", "hires", "gsm", "bpa",
    "bb", "dll", "pcs", "jpg",

    # power / elektronik
    "hz", "khz", "mhz", "ghz",
    "mah", "ac", "pd", "qc",

    # data
    "kb", "mb", "gb", "tb",

    # konektivitas & antarmuka
    "usbc", "typec", "hdmi", "vga", "bt", "ble", "nfc",
    "ir", "lte", "volte", "sim", "esim",

    # komponen / penyimpanan
    "ssd", "hdd",

    # layar / gambar
    "lcd", "led", "oled", "amoled", "ips",
    "hdr", "fps", "rgb", "srgb", "ppi", "dpi", "mp", "hd",

    # ukuran / berat / volume
    "cm", "mm", "m", "km", "kg", "g", "gr", "l", "ml", "oz", "lb", "lbs", "psi",

    # fashion (size code + ukuran lokal)
    "xs", "s", "m", "l", "xl", "xxl", "xxxl", "xxxs",
    "ld", "lp", "pb",

    # kecantikan / kesehatan / makanan
    "spf", "pa", "uv", "uva", "uvb", "ph",
    "uht", "ppm", "mg", "mcg", "iu", "kcal", "cal",

    # hp / jaringan
    "nfc",

    # audio
    "tws", "anc", "enc", "aac", "sbc", "ldac",

    # makanan / minuman / label
    "mfg", "exp",
}


RULES = [
    # huruf saja 2–5 -> harus ada di SEED (biar kata umum gak ikut)
    ("alpha_short_seeded", re.compile(r"^[a-z]{2,5}$")),

    # huruf+angka 2–8 (ip67, 1080p, 5g, 34b bra, a4, b5, 256gb, uk9)
    ("alnum_mixed", re.compile(r"^(?=.*[a-z])(?=.*\d)[a-z0-9]{2,8}$")),

    # angka + unit huruf (10cm, 15ml, 230v, 65w, 500gb)
    ("num_unit", re.compile(r"^\d{1,4}[a-z]{1,4}$")),

    # resolusi / label p (720p, 1080p, 4k -> dianggap alnum_mixed tapi ini khusus biar jelas)
    ("resolution_p", re.compile(r"^\d{3,4}p$|^[2348]k$")),

    # dimensi 2D/3D (20x30, 200x200x50)
    ("dimension", re.compile(r"^\d{1,4}x\d{1,4}(x\d{1,4})?$")),

    # ukuran fashion (override 1-huruf S/M/L) -> valid
    ("size_code", re.compile(r"^(xs|s|m|l|xl|xxl|xxxl|xxxs)$")),

    # rating proteksi air (ipx7, ipx8) -> kadang tidak di SEED semua
    ("ipx_rating", re.compile(r"^ipx[0-8]$")),
]

# (opsional) stopwords domain: kata umum e-commerce yang ingin dibuang
DOMAIN_STOP = {
    "bahan","warna","ukuran","size","anak","pria","wanita","bayi","toko",
    "alat","jam","aman","kaos","harga","wajah","berat","dada","lebar",
    "video","isi","baju","tas","paket","cocok","mudah","barang","produk"
}

def detect_candidates(df_words: pd.DataFrame, min_count: int = 20) -> pd.DataFrame:
    rows = []
    for _, row in df_words.iterrows():
        w = str(row["word"]).lower()
        c = int(row["count"])

        if c < min_count:
            continue
        if w in DOMAIN_STOP:
            continue
        # hanya token alnum (sudah cocok dengan cara kamu bikin df_top_words)
        if not re.fullmatch(r"[a-z0-9]+", w):
            continue

        matched = None
        for name, rx in RULES:
            if not rx.match(w):
                continue
            if name == "alpha_short_seeded" and w not in SEED:
                # huruf pendek wajib ada di SEED (hindari 'bahan', 'warna', dst)
                continue
            matched = name
            break

        # kalau belum match apa-apa & token 1 huruf (bukan S/M/L), skip
        if not matched:
            continue

        rows.append({"word": w, "count": c, "rule": matched})

    return (
        pd.DataFrame(rows)
        .sort_values(["count","word"], ascending=[False, True])
        .reset_index(drop=True)
    )

# pakai:
df_acronyms = detect_candidates(df_top_words_combined, min_count=20)
df_acronyms.head(40)


Unnamed: 0,word,count,rule
0,cm,4060,alpha_short_seeded
1,l,913,size_code
2,m,825,size_code
3,kg,761,alpha_short_seeded
4,pcs,734,alpha_short_seeded
5,xl,678,alpha_short_seeded
6,bb,591,alpha_short_seeded
7,ssd,515,alpha_short_seeded
8,8gb,514,alnum_mixed
9,bpom,503,alpha_short_seeded


In [14]:
df_acronyms['word'].unique()

array(['cm', 'l', 'm', 'kg', 'pcs', 'xl', 'bb', 'ssd', '8gb', 'bpom',
       'ld', 's', '1x', 'ml', 'xxl', 'dll', 'sim', 'i5', '256gb', 'led',
       'cod', 'gb', '128gb', '16gb', 'mm', 'mp', 'hd', '4gb', 'vga',
       '5cm', 'i7', 'lp', 'ghz', '5g', '512gb', 'mah', 'gr', 'ori', 'i3',
       'hdd', '1tb', 'ddr3', '2x', '4g', 'pb', 'lcd', '5mm', 'mg', '1kg',
       'ddr4', '30fps', '64gb', '70cm', 'ips', 'hdmi', 'nfc', 'rgb',
       '100cm', 'ac', '50cm', 'uv', '3pcs', '500gb', '1080p', '60cm',
       '120hz', '2cm', 'x1'], dtype=object)

### Making the dictionary (acronyms)

In [15]:
# buat skeleton dict dari hasil deteksi
acronym_dict = {w: "" for w in df_acronyms["word"].tolist()}

list(acronym_dict.items())[:20]

[('cm', ''),
 ('l', ''),
 ('m', ''),
 ('kg', ''),
 ('pcs', ''),
 ('xl', ''),
 ('bb', ''),
 ('ssd', ''),
 ('8gb', ''),
 ('bpom', ''),
 ('ld', ''),
 ('s', ''),
 ('1x', ''),
 ('ml', ''),
 ('xxl', ''),
 ('dll', ''),
 ('sim', ''),
 ('i5', ''),
 ('256gb', ''),
 ('led', '')]

tambahin seeds yang blom ada disitu

In [16]:
# Tambahkan seed yang belum ada di acronym_dict, isi dengan string kosong
for s in SEED:
    if s not in acronym_dict:
        acronym_dict[s] = ""

list(acronym_dict.items())

[('cm', ''),
 ('l', ''),
 ('m', ''),
 ('kg', ''),
 ('pcs', ''),
 ('xl', ''),
 ('bb', ''),
 ('ssd', ''),
 ('8gb', ''),
 ('bpom', ''),
 ('ld', ''),
 ('s', ''),
 ('1x', ''),
 ('ml', ''),
 ('xxl', ''),
 ('dll', ''),
 ('sim', ''),
 ('i5', ''),
 ('256gb', ''),
 ('led', ''),
 ('cod', ''),
 ('gb', ''),
 ('128gb', ''),
 ('16gb', ''),
 ('mm', ''),
 ('mp', ''),
 ('hd', ''),
 ('4gb', ''),
 ('vga', ''),
 ('5cm', ''),
 ('i7', ''),
 ('lp', ''),
 ('ghz', ''),
 ('5g', ''),
 ('512gb', ''),
 ('mah', ''),
 ('gr', ''),
 ('ori', ''),
 ('i3', ''),
 ('hdd', ''),
 ('1tb', ''),
 ('ddr3', ''),
 ('2x', ''),
 ('4g', ''),
 ('pb', ''),
 ('lcd', ''),
 ('5mm', ''),
 ('mg', ''),
 ('1kg', ''),
 ('ddr4', ''),
 ('30fps', ''),
 ('64gb', ''),
 ('70cm', ''),
 ('ips', ''),
 ('hdmi', ''),
 ('nfc', ''),
 ('rgb', ''),
 ('100cm', ''),
 ('ac', ''),
 ('50cm', ''),
 ('uv', ''),
 ('3pcs', ''),
 ('500gb', ''),
 ('1080p', ''),
 ('60cm', ''),
 ('120hz', ''),
 ('2cm', ''),
 ('x1', ''),
 ('ppi', ''),
 ('km', ''),
 ('kcal', ''),
 ('sbc', '

tampilin semua dict nya

In [17]:
# Tampilkan seluruh isi acronym_dict
for k, v in acronym_dict.items():
    print(f"{k}: {v}")

cm: 
l: 
m: 
kg: 
pcs: 
xl: 
bb: 
ssd: 
8gb: 
bpom: 
ld: 
s: 
1x: 
ml: 
xxl: 
dll: 
sim: 
i5: 
256gb: 
led: 
cod: 
gb: 
128gb: 
16gb: 
mm: 
mp: 
hd: 
4gb: 
vga: 
5cm: 
i7: 
lp: 
ghz: 
5g: 
512gb: 
mah: 
gr: 
ori: 
i3: 
hdd: 
1tb: 
ddr3: 
2x: 
4g: 
pb: 
lcd: 
5mm: 
mg: 
1kg: 
ddr4: 
30fps: 
64gb: 
70cm: 
ips: 
hdmi: 
nfc: 
rgb: 
100cm: 
ac: 
50cm: 
uv: 
3pcs: 
500gb: 
1080p: 
60cm: 
120hz: 
2cm: 
x1: 
ppi: 
km: 
kcal: 
sbc: 
mhz: 
aac: 
oled: 
sni: 
lbs: 
dpi: 
sku: 
enc: 
tb: 
srgb: 
hdr: 
gsm: 
lb: 
amoled: 
anc: 
iu: 
bpa: 
xxxl: 
g: 
esim: 
mcg: 
uht: 
cal: 
ir: 
xs: 
tws: 
pa: 
ble: 
oem: 
typec: 
hires: 
fps: 
spf: 
khz: 
ph: 
jpg: 
ppm: 
psi: 
uva: 
lte: 
mb: 
oz: 
volte: 
kb: 
bt: 
qc: 
hz: 
usbc: 
xxxs: 
uvb: 
exp: 
mfg: 
pd: 
ldac: 


In [18]:
# Gabungkan kandidat + seed
all_words = set(df_acronyms["word"].tolist()) | set(SEED)
df_all = pd.DataFrame({"word": list(all_words), "rule": [None]*len(all_words)})

In [19]:
import re

# --- seri/brand yang tidak ingin direplace (biarkan apa adanya) ---
EXCLUDE_SERIES = {"i3", "i5", "i7", "i9"}

# --- Units (singkat -> Bahasa Indonesia, tanpa tanda kurung) ---
UNITS = {
    # panjang/ukuran
    "cm":"centimeter", "mm":"milimeter", "m":"meter", "km":"kilometer",
    # massa/berat
    "kg":"kilogram", "g":"gram", "gr":"gram", "lb":"pound", "lbs":"pounds",
    # volume
    "l":"liter", "ml":"mililiter", "oz":"ons",
    # daya/kelistrikan & frekuensi
    "w":"watt",
    "hz":"hertz", "khz":"kilohertz", "mhz":"megahertz", "ghz":"gigahertz",
    # baterai & arus
    "mah":"miliampere jam", "dc":"arus searah", "ac":"arus bolak-balik",
    # data
    "kb":"kilobyte", "mb":"megabyte", "gb":"gigabyte", "tb":"terabyte",
    # tampilan/warna/ketajaman
    "ppi":"piksel per inci", "dpi":"titik per inci", 
    # tekanan
    "psi":"pon per inci persegi",
    # video/fps
    "fps":"frame per detik",
}

# --- Kode ukuran (fashion) ---
SIZES = {
    "xxxs":"Triple Ekstra Kecil",
    "xxl":"Double Ekstra Besar",
    "xxxl":"Triple Ekstra Besar",
    "xs":"Ekstra Kecil",
    "xl":"Ekstra Besar",
    "s":"Kecil",
    "m":"Sedang",
    "l":"Besar",
    # ukuran lokal populer
    "ld":"Lingkar Dada",
    "lp":"Lingkar Pinggang",
    "pb":"Panjang Baju",
}

# --- Akronim umum ---
ACRONYMS = {
    # konektivitas & antarmuka
    "usbc":"Universal Serial Bus Type-C",
    "typec":"Type-C",
    "bt":"Bluetooth",
    "ble":"Bluetooth Low Energy",
    "nfc":"Near Field Communication",
    "volte":"Voice over LTE",
    "sim":"Kartu SIM",
    "esim":"eSIM",

    # komponen/penyimpanan
    "ssd":"Solid State Drive",
    "hdd":"Hard Disk Drive",

    # layar/gambar
    "lcd":"Liquid Crystal Display",
    "led":"Light Emitting Diode",
    "oled":"Organic Light Emitting Diode",
    "amoled":"Active Matrix Organic Light Emitting Diode",
    "hdr":"High Dynamic Range",
    "rgb":"Red Green Blue",
    "srgb":"Standard Red Green Blue",
    "hd":"High Definition",

    # audio
    "tws":"True Wireless Stereo",
    "anc":"Active Noise Cancelling",
    "enc":"Environmental Noise Cancelling",
    "hires":"Hi-Res Audio",

    # codec audio
    "aac":"Advanced Audio Coding (codec audio)",
    "sbc":"Subband Codec (codec audio)",
    "ldac":"Low Latency Audio Codec (codec audio)",

    # pengisian
    "pd":"Power Delivery",
    "qc":"Quick Charge",

    # label/regulator/logistik
    "bpom":"Badan Pengawas Obat dan Makanan",
    "sni":"Standar Nasional Indonesia",
    "cod":"Bayar di tempat",
    "ori":"Asli",
    "oem":"Original Equipment Manufacturer",
    "sku":"Kode Stok",
    "gsm":"Gram per meter persegi (kertas)",

    # kecantikan/kesehatan/makanan

    "uv":"Sinar Ultraviolet",
    "uva":"Sinar Ultraviolet A",
    "uvb":"Sinar Ultraviolet B",
    "ppm":"Bagian per sejuta",
    "mg":"miligram",
    "mcg":"mikrogram",
    "iu":"International Unit",
    "kcal":"kilokalori",
    "cal":"kalori",
    "netto":"Berat bersih",
    "mfg":"Tanggal produksi",
    "exp":"Tanggal kedaluwarsa",

    # lain
    "pcs":"pieces",
    "bb":"berat badan",
    "dll":"dan lain-lain",
}

# --- Pola angka + unit (mendukung desimal & huruf) ---
NUM_UNIT_RE   = re.compile(r"^(\d+(?:[.,]\d+)?)([a-z]+)$", re.I)   
RES_P_RE      = re.compile(r"^(\d{3,4})p$", re.I)                  # 720p, 1080p
RES_K_RE      = re.compile(r"^(\d+(?:[.,]\d+)?)k$", re.I)          # 2k, 4k, 5.5k
MULT_TRAIL_RE = re.compile(r"^(\d+)x$", re.I)                      # 2x, 3x
MULT_LEAD_RE  = re.compile(r"^x(\d+)$", re.I)                      # x1, x2

In [20]:
def expand_token(word: str, rule: str | None) -> str:
    """
    Mengembalikan ekspansi BI untuk token 'word'.
    Jika tidak ada ekspansi relevan, return "" (artinya: biarkan kata asli).
    """
    w = (word or "").strip()
    if not w:
        return ""
    wl = w.lower()

    # 1) size code (prioritas jika rule= size_code)
    if rule == "size_code" and wl in SIZES:
        return SIZES[wl]

    # 2) resolusi khusus: 720p/1080p/2k/4k
    m = RES_P_RE.match(wl)
    if m:
        lines = m.group(1)
        return f"resolusi {lines}p"
    m = RES_K_RE.match(wl)
    if m:
        kval = m.group(1).replace(",", ".")
        return f"resolusi {kval}K"

    # 3) pengali X di belakang saja (2x)
    m = MULT_TRAIL_RE.match(wl)
    if m:
        return f"{m.group(1)} kali"

    # 4) angka + unit (128gb, 5cm, 1tb, 30fps, 500nits, 1.5l, 120hz)
    m = NUM_UNIT_RE.match(wl)
    if m:
        num, unit = m.groups()
        num = num.replace(",", ".")
        if unit in UNITS:
            return f"{num} {UNITS[unit]}"
        # unit tak dikenal -> jangan paksa
        return ""

    # 5) unit berdiri sendiri
    if wl in UNITS:
        return UNITS[wl]

    # 6) size code umum walau rule bukan size_code
    if wl in SIZES:
        return SIZES[wl]

    # 7) akronim umum
    if wl in ACRONYMS:
        return ACRONYMS[wl]

    # 8) default
    return ""


In [21]:
def build_acronym_dict(
    df_acronyms: pd.DataFrame,
    keep_unmapped: bool = False,
    exclude_series: set[str] = EXCLUDE_SERIES,
) -> dict[str, str]:
    """
    Membangun dictionary dari df_acronyms (kolom: 'word','rule').
    - keep_unmapped=False  -> hanya simpan entri yang punya ekspansi (value != "")
    - keep_unmapped=True   -> entri tanpa ekspansi tetap dimasukkan dengan value = token aslinya
    - token yang ada di exclude_series TIDAK di-replace (value = token aslinya jika keep_unmapped=True, atau di-skip jika False)
    """
    mapping: dict[str, str] = {}
    for _, r in df_acronyms[["word", "rule"]].drop_duplicates().iterrows():
        w = str(r["word"]).lower()
        rule = r["rule"] if "rule" in r and pd.notna(r["rule"]) else None

        if w in exclude_series:
            if keep_unmapped:
                mapping[w] = w  # biarkan apa adanya
            # jika tidak keep_unmapped, lewati
            continue

        expanded = expand_token(w, rule)

        if expanded:
            mapping[w] = expanded
        else:
            if keep_unmapped:
                mapping[w] = w  # tetap simpan, tapi tidak diubah

    return mapping

In [22]:
list(acronym_dict.items())

[('cm', ''),
 ('l', ''),
 ('m', ''),
 ('kg', ''),
 ('pcs', ''),
 ('xl', ''),
 ('bb', ''),
 ('ssd', ''),
 ('8gb', ''),
 ('bpom', ''),
 ('ld', ''),
 ('s', ''),
 ('1x', ''),
 ('ml', ''),
 ('xxl', ''),
 ('dll', ''),
 ('sim', ''),
 ('i5', ''),
 ('256gb', ''),
 ('led', ''),
 ('cod', ''),
 ('gb', ''),
 ('128gb', ''),
 ('16gb', ''),
 ('mm', ''),
 ('mp', ''),
 ('hd', ''),
 ('4gb', ''),
 ('vga', ''),
 ('5cm', ''),
 ('i7', ''),
 ('lp', ''),
 ('ghz', ''),
 ('5g', ''),
 ('512gb', ''),
 ('mah', ''),
 ('gr', ''),
 ('ori', ''),
 ('i3', ''),
 ('hdd', ''),
 ('1tb', ''),
 ('ddr3', ''),
 ('2x', ''),
 ('4g', ''),
 ('pb', ''),
 ('lcd', ''),
 ('5mm', ''),
 ('mg', ''),
 ('1kg', ''),
 ('ddr4', ''),
 ('30fps', ''),
 ('64gb', ''),
 ('70cm', ''),
 ('ips', ''),
 ('hdmi', ''),
 ('nfc', ''),
 ('rgb', ''),
 ('100cm', ''),
 ('ac', ''),
 ('50cm', ''),
 ('uv', ''),
 ('3pcs', ''),
 ('500gb', ''),
 ('1080p', ''),
 ('60cm', ''),
 ('120hz', ''),
 ('2cm', ''),
 ('x1', ''),
 ('ppi', ''),
 ('km', ''),
 ('kcal', ''),
 ('sbc', '

In [23]:
# mapping hanya untuk entri yang punya ekspansi (aman untuk model):
acronym_dict = build_acronym_dict(df_all, keep_unmapped=False)

# Cek sebagian:
print({k: acronym_dict[k] for k in list(acronym_dict)[:20]})

{'ghz': 'gigahertz', 'km': 'kilometer', 'sbc': 'Subband Codec (codec audio)', '5mm': '5 milimeter', 'lbs': 'pounds', 'dpi': 'titik per inci', 'enc': 'Environmental Noise Cancelling', 's': 'Kecil', 'hdr': 'High Dynamic Range', 'gsm': 'Gram per meter persegi (kertas)', 'lb': 'pound', 'ac': 'arus bolak-balik', 'l': 'liter', 'amoled': 'Active Matrix Organic Light Emitting Diode', '2cm': '2 centimeter', '1080p': 'resolusi 1080p', '4g': '4 gram', 'cod': 'Bayar di tempat', 'bb': 'berat badan', 'bpom': 'Badan Pengawas Obat dan Makanan'}


In [24]:
# Simpan ke JSON:
with open("data/acronym_dict.json", "w", encoding="utf-8") as f:
    json.dump(acronym_dict, f, ensure_ascii=False, indent=2)

## **Detect Rating Patterns**

In [54]:
# gabungan keyword rating yang mungkin
pattern_all = re.compile(r"(bintang|rating|rate|⭐|★|🌟|\d+/\d+)", re.IGNORECASE)

DATA_SOURCES = [
    {"name": "reviews_only", "path": "data/reviews_only.csv", "text_cols": ["review"]},
    {"name": "tokopedia_500", "path": "data/tokopedia_500.csv",
     "text_cols": ["product_name", "description", "category", "keyword"]},
]

def load_texts(df: pd.DataFrame, cols: list[str]) -> pd.Series:
    cols = [c for c in cols if c in df.columns]
    if not cols:
        return pd.Series(dtype=str)
    return df[cols].astype(str).fillna("").agg(" ".join, axis=1)

def get_snippets(text, regex, window=4):
    snippets = []
    for m in regex.finditer(text):
        # potong teks di sekitar match
        words = text.split()
        # cari index kata yang mengandung match
        for idx, w in enumerate(words):
            if m.group(0).lower() in w.lower():
                start = max(0, idx - window)
                end = min(len(words), idx + window + 1)
                snippet = " ".join(words[start:end])
                snippets.append(snippet)
                break
    return snippets

for src in DATA_SOURCES:
    path, name, cols = src["path"], src["name"], src["text_cols"]
    if not os.path.exists(path):
        print(f"⚠️ file tidak ditemukan: {path}")
        continue

    df = pd.read_csv(path)
    texts = load_texts(df, cols)

    print(f"\n=== Source: {name} ===")
    count = 0
    for t in texts:
        snippets = get_snippets(t, pattern_all, window=4)
        if snippets:
            for s in snippets[:2]:  # tampilkan max 2 snippet per baris biar ringkas
                print("→", s)
                count += 1
        if count >= 40:  # batas contoh total biar ga kepanjangan
            break
    print(f"(ditampilkan {count} snippet contoh)")



=== Source: reviews_only ===
→ ProductReview(feedback_id=1269906046, variant_name='hitam', message='', rating=5.0, review_age='6 bulan lalu', user_full_name='E***o',
→ ProductReview(feedback_id=1649919524, variant_name='Agate Green', message='', rating=5.0, review_age='5 hari lalu', user_full_name='dony',
→ terbaru', response_created_text='5 hari lalu', images=['https://images.tokopedia.net/img/cache/600/aphluv/1997/1/1/6660d6c5ab0a418ea4e72e60c8c5bf9b~'], videos=[], likes=0)
→ harga kaki lima kualitas bintang 5... mudah2 awet ...
→ aftersell nya keren poll! bintang 1000++ beli disini ada
→ series tapi bahan yang berateful lebih tipis daripada yanga
→ produk: Sangat memuaskan 👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍 🌟🌟🌟🌟🌟
→ produk: Sangat memuaskan 👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍 🌟🌟🌟🌟🌟
→ abu - 4XL-5XL=38(90-102kg)', message='', rating=5.0, review_age='3 minggu lalu', user_full_name='S***w',
→ Sengaja ngasih bintang 5 supaya bisa di
→ Padhaal bintang nya bgus tapi bahan
→ bahan nya nyaman banget bestt⭐⭐⭐⭐⭐
→

In [69]:

PATTERNS = {
    # --- ANGKA /5 -----------------------------------------------------------
    "frac_out_of_5": r"\b([1-5](?:[.,]5)?)\s*/\s*5\b",
    # contoh: "4.5/5 mantap", "5/5 recommended", "4,5/5 untuk kualitas"

    # --- “BINTANG” + ANGKA --------------------------------------------------
    "bintang_number": r"\bbintang\s*([1-5](?:[.,]5)?)\b",
    # contoh: "bintang 5, puas!", "bintang 4,5 cukup oke"

    "number_bintang": r"\b([1-5])\s*bintang\b",
    # contoh: "5 bintang buat seller ini", "2 bintang karena packing kurang"

    "word_number_bintang": r"\b(satu|dua|tiga|empat|lima)\s*bintang\b",
    # contoh: "tiga bintang, sesuai harga", "lima bintang pokoknya"

    # variasi “bintangnya 5”
    "bintangnya_number": r"\bbintang(?:nya|ny| nya)\s*([1-5])\b",
    # contoh: "bintangnya 5", "bintangnya 4 karena telat kirim"

    # user memberi/“kasih” bintang
    "kasih_bintang": r"\b(kasih|beri|dapat|ngasih)\s*([1-5])\s*bintang\b",
    # contoh: "kasih 5 bintang", "dapat 3 bintang dari saya"
    
    # khusus "bintang 1000++" / angka besar → langsung map ke 5
    "bintang_many": r"\bbintang\s*\d{3,}(?:\++)?(?=\s|$|[^\w])",
    # contoh: "bintang 1000++", "bintang 2000+"

    # --- SETENGAH BINTANG ---------------------------------------------------
    "fraction_half": r"\b([1-5])(?:\s?½|(?:\s?1\/2)|[.,]5)\s*/\s*5\b",
    # contoh: "4½/5 layak beli", "4 1/2 / 5", "4,5/5"

    "bintang_half_after": r"\bbintang\s*([1-5])(?:\s?½|(?:\s?1\/2)|[.,]5)\b",
    # contoh: "bintang 4½", "bintang 3,5 untuk kualitas"

    "bintang_half_before": r"\b([1-5])(?:\s?½|(?:\s?1\/2)|[.,]5)\s*bintang\b",
    # contoh: "4,5 bintang", "3½ bintang cukup"

    # --- SKALA LAIN (opsional: remap ke 1–5) --------------------------------
    "out_of_ten": r"\b([0-9]{1,2}(?:[.,]5)?)\s*/\s*10\b",
    # contoh: "9/10 worth it", "8,5/10 untuk fitur"

    # --- “RATING/RATE/NILAI/SKOR” ------------------------------------------
    "rating_colon": r"\b(rating|rate|nilai|skor|score|grade)\s*[:\-]?\s*([1-5](?:[.,]5)?)\b",
    # contoh: "rating: 5", "nilai 4,5", "skor-4"

    # --- “STARS” DALAM BAHASA INGGRIS/INDO ---------------------------------
    "en_stars_word": r"\b([1-5])\s*stars?\b",
    # contoh: "4 stars overall", "5 star quality"

    "id_stars_word": r"\b(satu|dua|tiga|empat|lima)\s*star\b",
    # contoh: "tiga star lumayan", "lima star terbaik"

    # --- EMOJI / SIMBOL BINTANG --------------------------------------------
    "stars_repeated": r"(?:⭐(?:\uFE0F)?|★|🌟|✩|✮|✯){2,10}",
    # contoh: "★★★★★", "⭐⭐⭐⭐⭐", "🌟🌟🌟", "⭐️⭐️⭐️⭐️⭐️"  (⭐️ = '⭐' + FE0F)

    "star_multiplication": r"(?:⭐(?:\uFE0F)?|★)\s*[xX×]\s*([1-5])",
    # contoh: "⭐ x5", "★×4 bagus", "⭐x3 lumayan"

    # --- VARIASI TANDA ------------------------------------------------------
    "paren_or_bracket_5_5": r"[\(\[]\s*([1-5](?:[.,]5)?)\s*/\s*5\s*[\)\]]",
    # contoh: "(5/5) recommended", "[4,5/5] puas"

    # --- GAYA “OF/DR” -------------------------------------------------------
    "number_of_number": r"\b([1-5])\s*(?:of|dr)\s*5\b",
    # contoh: "4 of 5 for design", "3 dr 5 lah"
    
}

In [70]:
import re
s = "aftersell nya keren poll! bintang 1000++ beli disini ada"
rx = re.compile(r"\bbintang\s*\d{3,}(?:\+{1,2})?(?=\s|$|[^\w])", re.IGNORECASE)
print(bool(rx.search(s)))  # True kalau fix-nya bekerja


True


In [71]:
FLAGS = re.IGNORECASE | re.MULTILINE
COMPILED = {k: re.compile(v, FLAGS) for k, v in PATTERNS.items()}

In [72]:
# --- 2) sumber data & kolom teks yang dipakai ---
DATA_SOURCES = [
    {
        "name": "reviews_only",
        "path": "data/reviews_only.csv",
        "text_cols": ["review"],  # hanya kolom ini yang ada
    },
    {
        "name": "tokopedia_500",
        "path": "data/tokopedia_500.csv",
        # pilih kolom teks yang relevan (hindari kolom numerik)
        "text_cols": ["product_name", "description", "category", "keyword"],
    },
]

In [73]:
# --- util: gabung beberapa kolom teks jadi satu string per baris ---
def load_texts(df: pd.DataFrame, cols: list[str]) -> pd.Series:
    cols = [c for c in cols if c in df.columns]
    if not cols:
        return pd.Series(dtype=str)
    s = df[cols].astype(str).fillna("").agg(" ".join, axis=1)
    return s[s.str.strip() != ""]

def probe_series(series: pd.Series, compiled: dict, n_examples: int = 5):
    counts = defaultdict(int)
    examples = defaultdict(list)
    for t in series.astype(str).fillna(""):
        for name, rx in compiled.items():
            ms = list(rx.finditer(t))
            if not ms:
                continue
            counts[name] += len(ms)
            if len(examples[name]) < n_examples:
                m = ms[0]
                start, end = max(0, m.start()-25), min(len(t), m.end()+25)
                examples[name].append(t[start:end])
    rows = []
    for name in compiled.keys():
        rows.append({
            "pattern": name,
            "count": int(counts.get(name, 0)),
            "examples": examples.get(name, []),
        })
    return pd.DataFrame(rows).sort_values("count", ascending=False).reset_index(drop=True)


In [74]:
# --- 3) run: per-source & total ---
per_source = []
tot_counts = defaultdict(int)
tot_examples = defaultdict(list)

for src in DATA_SOURCES:
    path, name = src["path"], src["name"]
    if not os.path.exists(path):
        print(f"⚠️ file tidak ditemukan: {path}")
        continue
    df = pd.read_csv(path)
    texts = load_texts(df, src["text_cols"])
    if texts.empty:
        print(f"⚠️ {name}: tidak ada teks yang bisa dipakai.")
        continue

    df_probe = probe_series(texts, COMPILED, n_examples=5)
    df_probe.insert(0, "source", name)
    per_source.append(df_probe)

    for _, row in df_probe.iterrows():
        p = row["pattern"]
        tot_counts[p] += int(row["count"])
        for ex in row["examples"]:
            if len(tot_examples[p]) < 5:
                tot_examples[p].append(ex)
                
# simpan per-source
if per_source:
    df_all = pd.concat(per_source, ignore_index=True)
    df_all.to_csv("rating_probe_per_source.csv", index=False)
    print("✅ disimpan: rating_probe_per_source.csv")
else:
    print("⚠️ tidak ada sumber yang valid.")

✅ disimpan: rating_probe_per_source.csv


In [75]:
# simpan total
summary = []
for name in PATTERNS.keys():
    summary.append({
        "pattern": name,
        "total_count": tot_counts.get(name, 0),
        "examples": tot_examples.get(name, []),
    })
df_total = pd.DataFrame(summary).sort_values("total_count", ascending=False)
# df_total.to_csv("rating_probe_total.csv", index=False)
# df_total.to_json("rating_probe_total.json", orient="records", force_ascii=False, indent=2)
# print("✅ disimpan: rating_probe_total.(csv|json)")

print("\nTop 20 total pattern:")
print(df_total[["pattern","total_count"]].head(20))


Top 20 total pattern:
                 pattern  total_count
1         bintang_number           98
14        stars_repeated           23
0          frac_out_of_5           18
2         number_bintang           16
11          rating_colon            8
10            out_of_ten            7
12         en_stars_word            1
3    word_number_bintang            1
6           bintang_many            1
16  paren_or_bracket_5_5            0
15   star_multiplication            0
13         id_stars_word            0
9    bintang_half_before            0
8     bintang_half_after            0
7          fraction_half            0
5          kasih_bintang            0
4      bintangnya_number            0
17      number_of_number            0


### Making the dictionary

In [76]:
EXAMPLES = {
    "frac_out_of_5": "4.5/5 mantap",
    "bintang_number": "bintang 4,5 cukup oke",
    "number_bintang": "5 bintang buat seller ini",
    "word_number_bintang": "tiga bintang, sesuai harga",
    "bintangnya_number": "bintangnya 4 karena telat kirim",
    "kasih_bintang": "kasih 5 bintang",
    "bintang_many": "bintang 1000++",
    "fraction_half": "4½/5 layak beli",
    "bintang_half_after": "bintang 3,5 untuk kualitas",
    "bintang_half_before": "4,5 bintang",
    "out_of_ten": "9/10 worth it",
    "rating_colon": "rating: 5",
    "en_stars_word": "4 stars overall",
    "id_stars_word": "tiga star lumayan",
    "stars_repeated": "⭐⭐⭐⭐⭐",
    "star_multiplication": "⭐ x5",
    "paren_or_bracket_5_5": "(5/5) recommended",
    "number_of_number": "4 of 5 for design"
}


**simple rating_patterns dictiinary**

In [77]:
# ---------- 2) patterns.json (cuma regex + example) ----------
def build_patterns_json(patterns, examples):
    return [
        {"id": k, "pattern": v, "example": examples.get(k, "")}
        for k,v in patterns.items()
    ]

In [79]:
from pathlib import Path

patterns_json = build_patterns_json(PATTERNS, EXAMPLES)
Path("rating_patterns.json").write_text(json.dumps(patterns_json, ensure_ascii=False, indent=2), encoding="utf-8")

2404

**advanced rating_rules dictionary**

In [80]:
# 2) GENERATOR
WORD2NUM = {"satu":1,"dua":2,"tiga":3,"empat":4,"lima":5}
PRIORITY_ORDER = [
    "frac_out_of_5", "paren_or_bracket_5_5",
    "bintang_many", "bintang_number", "number_bintang",
    "word_number_bintang", "bintangnya_number", "kasih_bintang",
    "bintang_half_after", "bintang_half_before", "fraction_half",
    "rating_colon",
    "en_stars_word", "id_stars_word",
    "stars_repeated", "star_multiplication",
    "number_of_number",
]
PRIORITY_BASE = 100

In [81]:
def build_patterns_json():
    """Versi ringkas: id, pattern, example."""
    items = []
    for k, v in PATTERNS.items():
        items.append({"id": k, "pattern": v, "example": EXAMPLES.get(k, "")})
    return items

def infer_rule(key: str, pattern: str) -> dict:
    """Bangun 1 rule lengkap untuk rating_rules.json dari PATTERNS."""
    rule = {"id": key, "pattern": pattern, "example": EXAMPLES.get(key, "")}

    # Tipe khusus
    if key == "stars_repeated":
        rule.update({
            "type": "count_emoji",
            "emojis": ["⭐","★","🌟","✩","✮","✯"],
            "min_count": 2, "max_count": 5
        })
    elif key == "bintang_many":
        rule.update({"type": "assign", "value": 5.0})
    elif key in ("word_number_bintang", "id_stars_word"):
        rule.update({
            "type": "assign",
            "mapping": WORD2NUM,
            "value_from_group": 1
        })
    elif key in ("bintang_half_after", "bintang_half_before"):
        rule.update({
            "type": "extract",
            "value_group": 1,
            "add": 0.5,
            "postprocess": {"replace": {",":"."}},
            "clamp": [1.0, 5.0]
        })
    elif key == "star_multiplication":
        rule.update({
            "type": "extract",
            "value_group": 1,
            "clamp": [1.0, 5.0]
        })
    else:
        # default: extract angka dari group pertama kalau ada,
        # lalu normalisasi dan clamp
        if re.search(r"\((?!\?:).*?\)", pattern):
            rule.update({
                "type": "extract",
                "value_group": 1,
                "postprocess": {"replace": {",":"."}},
                "clamp": [1.0, 5.0]
            })
        else:
            # fallback aman: treat as assign 5 (jarang kena di set ini)
            rule.update({"type": "assign", "value": 5.0})

    # priority
    if key in PRIORITY_ORDER:
        priority = PRIORITY_BASE - PRIORITY_ORDER.index(key)*2
    else:
        priority = 50
    rule["priority"] = priority

    return rule

In [82]:
def build_rating_rules_json(include_out_of_ten: bool = False):
    rules = []
    for k, v in PATTERNS.items():
        if not include_out_of_ten and k == "out_of_ten":
            continue  # exclude by default
        rules.append(infer_rule(k, v))
    rules = sorted(rules, key=lambda r: -r["priority"])
    return {
        "schema_version": "1.0",
        "defaults": {
            "flags": ["IGNORECASE", "MULTILINE"],
            "min_rating": 1.0,
            "max_rating": 5.0
        },
        "rules": rules,
        "blacklist": [
            {
                "id": "star_bullet_single",
                "pattern": r"^[\s]*[★⭐]",
                "reason": "Bullet list (satu bintang di awal baris), bukan rating."
            }
        ]
    }


In [83]:
def save_all(out_dir: str = "."):
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)

    rating_rules_json = build_rating_rules_json(include_out_of_ten=False)
    (out / "rating_rules.json").write_text(
        json.dumps(rating_rules_json, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    print("✅ created:", out / "rating_rules.json")

In [84]:
if __name__ == "__main__":
    save_all(".")

✅ created: rating_rules.json
