In [2]:
from tokopaedi import search, get_product
import pandas as pd
import time, random, uuid, os, json
from typing import Any, Dict, List, Optional

In [4]:
KEYWORDS = [
    "elektronik", "handphone", "komputer", "fashion pria", "fashion wanita",
    "ibu dan bayi", "kecantikan", "kesehatan", "rumah tangga", "olahraga",
    "hobi", "makanan", "minuman", "sekolah",
]

MAX_PER_KEYWORD = 500
SLEEP_SEARCH = (3.0, 5.0)
SLEEP_DETAIL = (1.2, 2.0)
RETRY_PER_ITEM = 2

CSV_PATH = "tokopedia_500.csv"
PARQUET_PATH = "tokopedia_500.parquet"  # ditulis kalau pyarrow ada
STATE_PATH = "tokopedia_state.json"     # menyimpan keyword yang sudah selesai

In [5]:
# ---------- Util State & IO ----------

def load_state() -> Dict[str, Any]:
    if os.path.exists(STATE_PATH):
        try:
            with open(STATE_PATH, "r", encoding="utf-8") as f:
                return json.load(f)
        except Exception:
            pass
    return {"completed_keywords": []}

def save_state(state: Dict[str, Any]):
    tmp = STATE_PATH + ".tmp"
    with open(tmp, "w", encoding="utf-8") as f:
        json.dump(state, f, ensure_ascii=False, indent=2)
    os.replace(tmp, STATE_PATH)

def load_existing_rows() -> pd.DataFrame:
    """Load CSV/Parquet kalau ada (prioritas CSV)."""
    if os.path.exists(CSV_PATH):
        try:
            return pd.read_csv(CSV_PATH)
        except Exception:
            pass
    if os.path.exists(PARQUET_PATH):
        try:
            return pd.read_parquet(PARQUET_PATH)
        except Exception:
            pass
    return pd.DataFrame(columns=[
        "product_id","product_name","url","description","price",
        "sold_count","rating","total_stock","category","keyword"
    ])

def write_checkpoint(df: pd.DataFrame):
    # tulis CSV
    tmp = CSV_PATH + ".tmp"
    df.to_csv(tmp, index=False, encoding="utf-8-sig")
    os.replace(tmp, CSV_PATH)

    # tulis Parquet kalau tersedia
    try:
        import pyarrow  # noqa: F401
        tmp_pq = PARQUET_PATH + ".tmp"
        df.to_parquet(tmp_pq, index=False)
        os.replace(tmp_pq, PARQUET_PATH)
    except Exception:
        # abaikan kalau pyarrow tidak ada
        pass

# ---------- Helpers scraping ----------

def extract_description(detail) -> str:
    data = None
    if hasattr(detail, "json"):
        try:
            data = detail.json()
        except Exception:
            data = None
    if data is None and isinstance(detail, dict):
        data = detail

    if isinstance(data, dict):
        pdp = data.get("product_detail") or {}
        if isinstance(pdp, dict) and pdp.get("description"):
            return str(pdp["description"]).strip()
        d = data.get("description") or data.get("long_description") or ""
        return str(d).strip()

    desc = getattr(detail, "description", "") or ""
    if not desc:
        pdp = getattr(detail, "product_detail", None)
        if isinstance(pdp, dict) and pdp.get("description"):
            desc = pdp["description"]
    return str(desc).strip()

def to_dict(detail) -> Dict[str, Any]:
    if hasattr(detail, "json"):
        try:
            return detail.json()
        except Exception:
            pass
    if isinstance(detail, dict):
        return detail
    # fallback objek attribute ke dict minimal
    return {
        "sold_count": getattr(detail, "sold_count", None),
        "total_stock": getattr(detail, "total_stock", None),
        "category": getattr(detail, "category", None),
    }

def get_detail_safe(pid):
    last_err = None
    for attempt in range(RETRY_PER_ITEM + 1):
        try:
            return get_product(product_id=pid, debug=False)
        except Exception as e:
            last_err = e
            if attempt < RETRY_PER_ITEM:
                time.sleep((1.2 * (2 ** attempt)) + random.uniform(0, 0.5))
            else:
                raise last_err

def safe_pid(pid, name, url):
    if pid:
        return pid
    base = url or name or str(uuid.uuid4())
    return f"noid::{abs(hash(base))}"



In [6]:
# ---------- Main ----------

def main():
    # Load state + data existing (resume)
    state = load_state()
    completed = set(state.get("completed_keywords", []))

    df_existing = load_existing_rows()
    if "product_id" not in df_existing.columns:
        df_existing["product_id"] = []
    # set untuk skip id yang sudah ada
    seen_ids = set(df_existing["product_id"].astype(str).tolist())

    # working buffer (biar tidak bolak-balik read/write besar)
    rows_buffer: List[Dict[str, Any]] = []

    for kw in KEYWORDS:
        if kw in completed:
            print(f"✓ Skip '{kw}' (sudah selesai sebelumnya)")
            continue

        print(f"\n=== Keyword: {kw} (max {MAX_PER_KEYWORD}) ===")
        # SEARCH: guard NoneType (memperbaiki error Non-iterable)
        try:
            results = search(kw, max_result=MAX_PER_KEYWORD, debug=False)
        except Exception as e:
            print(f"search gagal untuk '{kw}': {e}")
            time.sleep(random.uniform(*SLEEP_SEARCH))
            continue

        # pastikan iterable -> list, handle None
        try:
            results = list(results or [])
        except TypeError:
            results = []

        if not results:
            print(f"search untuk '{kw}' tidak mengembalikan hasil, skip.")
            time.sleep(random.uniform(*SLEEP_SEARCH))
            # tandai keyword “selesai” meski kosong, agar tidak diulang-ulang
            completed.add(kw)
            state["completed_keywords"] = sorted(list(completed))
            save_state(state)
            continue

        n = len(results)
        print(f"Ditemukan {n} produk untuk '{kw}'")
        time.sleep(random.uniform(*SLEEP_SEARCH))

        for i, prod in enumerate(results, 1):
            pid   = getattr(prod, "product_id", None)
            name  = getattr(prod, "product_name", "") or ""
            url   = getattr(prod, "url", None)
            price = getattr(prod, "price", None)
            rating= getattr(prod, "rating", None)
            category = getattr(prod, "category", None)

            # stabilkan id + skip jika sudah ada
            pid = str(safe_pid(pid, name, url))
            if pid in seen_ids:
                if i % 50 == 0:
                    print(f"[{kw} {i}/{n}] (skip dup) {name[:60]}")
                continue

            desc = None
            sold_count = None
            total_stock = None

            try:
                det = get_detail_safe(pid)
                desc = extract_description(det)
                data = to_dict(det)
                sold_count  = data.get("sold_count")
                total_stock = data.get("total_stock")
                category    = data.get("category") or category
            except Exception as e:
                print(f"[{kw} {i}/{n}] detail gagal ({e}) — simpan meta search saja.")

            row = {
                "product_id": pid,
                "product_name": name,
                "url": url,
                "description": desc,
                "price": price,
                "sold_count": sold_count,
                "rating": rating,
                "total_stock": total_stock,
                "category": category,
                "keyword": kw,
            }
            rows_buffer.append(row)
            seen_ids.add(pid)

            # progress log ringan
            print(f"[{kw} {i}/{n}] {name[:60]}")
            time.sleep(random.uniform(*SLEEP_DETAIL))

            # checkpoint per 100 item baru (hemat I/O)
            if len(rows_buffer) >= 100:
                df_add = pd.DataFrame(rows_buffer)
                rows_buffer.clear()
                df_existing = pd.concat([df_existing, df_add], ignore_index=True)
                # dedup + write
                df_existing = df_existing.drop_duplicates(subset=["product_id"]).reset_index(drop=True)
                write_checkpoint(df_existing)
                print(f"↳ checkpoint ({len(df_existing)} rows)")

        # selesai satu keyword → tulis checkpoint & update state
        if rows_buffer:
            df_add = pd.DataFrame(rows_buffer)
            rows_buffer.clear()
            df_existing = pd.concat([df_existing, df_add], ignore_index=True)
            df_existing = df_existing.drop_duplicates(subset=["product_id"]).reset_index(drop=True)
            write_checkpoint(df_existing)

        completed.add(kw)
        state["completed_keywords"] = sorted(list(completed))
        save_state(state)
        print(f"↳ selesai '{kw}' — total rows: {len(df_existing)}")

    # final save (idempotent)
    df_existing = df_existing.drop_duplicates(subset=["product_id"]).reset_index(drop=True)
    write_checkpoint(df_existing)
    print(f"\n✅ Done. Saved {len(df_existing)} rows → {CSV_PATH} (+parquet jika tersedia)")

In [7]:

if __name__ == "__main__":
    main()



=== Keyword: elektronik (max 500) ===
Ditemukan 258 produk untuk 'elektronik'
[elektronik 1/258] ENOVE - Device Stylish 3 Warna | Bisa Dijeda | Compact & Ele
[elektronik 2/258] LIGE sub-merek FOXBOX Jam Tangan Pria Fashion Baru Jam Tanga
[elektronik 3/258] HEPU HP-985 Power Bank 30.000 Mah  1 to 6
[elektronik 4/258] PESANAN KHUSUS CV. SYAILENDRA ELEKTRONIK
[elektronik 5/258] Timbangan elektronik presisi tinggi timbangan elektronik por
[elektronik 6/258] [EXCLUSIVE DADAHAPHONE] MIXIO MP-08 5000mAh Powerbank Mini 2
[elektronik 7/258] LED TV POLYTRON 55" PLD 55UG9959 55 INCH USB MOVIE ANDROID N
[elektronik 8/258] IFM ELECTRONIC PN7094 75 BAR
[elektronik 9/258] SS 12 IN 1 Electric Stationery Set / Set Alat Tulis Elektron
[elektronik 10/258] Komputer All in One Baru, Intel Core i7 Gen ke-6, RAM 16GB +
[elektronik 11/258] 【Ready+Bisa COD】Advance Speaker Bluetooth Karaoke TWS 8inch+
[elektronik 12/258] 【Promo Live】HSA Electric Saver GEN 3 | Alat Penghemat Daya L
[elektronik 13/258] VIVAN VS1

  df_existing = pd.concat([df_existing, df_add], ignore_index=True)


↳ checkpoint (100 rows)
[elektronik 101/258] SHARP KSN-18 MG-SL Rice Cooker - 1.8 Liter
[elektronik 102/258] UPHOME 20000mAh Kipas Angin Lipat Portable Mini USB Charger 
[elektronik 103/258] 【Harga Spesial】MIC-206 & MIC-101 PRO Advance UHF (Nirkabel) 
[elektronik 104/258] RST Ebook Anak Buku Pintar Elektronik Anak 3 Bahasa dan 4 Ba
[elektronik 105/258] TEBUS MURAH CUCI GUDANG TOKOPEDIA Sharp Mesin Cuci 2 tabung 
[elektronik 106/258] Cuci gudang Sharp Kulkas SJ-316MG-GB/GR Kulkas 2 Pintu Shine
[elektronik 107/258] Goto [COD] Kenko Air Fryer Low Watt 3.5 L Mesin Penggorengan
[elektronik 108/258] SPEAKER FLECO F410M FREE MIC
[elektronik 109/258] 【COD】NEW Advance Votre V-120 Oven Listrik Pemanas atas bawah
[elektronik 110/258] PPlus Pancil Elektrik Multifungsi Anti Lengket Listrik Panci
[elektronik 111/258] 【COD】Jovitech Speaker Bluetooth Jam LED Smart Multi-function
[elektronik 112/258] [Exclusive Rizky.chikay] Mesin Cuci Mini Portable 8L Sterili
[elektronik 113/258] 【Hello Nuki】Ripple Ri