## Shopee Multi-product Scraper (DOM/UI Method)

In [1]:
"""
Project: Shopee Shop Products Scraper (DOM/UI Method)
Description: 
    Scrapes products from Shopee Shop URLs.
    - Cleans Price, Sold, Rating.
    - Extracts Shop/Product IDs.
    - Adds timestamp.
"""

import nest_asyncio
nest_asyncio.apply()

import asyncio
from playwright.async_api import async_playwright
import pandas as pd
import datetime
import re

# Shop landing-page URLs to scrape; main() processes them in list order.
# Commented-out entries are skipped.
SHOP_URLS = [
    "https://shopee.co.th/mizumi_officialshop",
    "https://shopee.co.th/moleculogy_by_diamond_grains",
    # "https://shopee.co.th/drpongshop"
]

# --- Helper Functions ---
def clean_price(s):
    """Parse a displayed price into a float.

    Commas (thousands separators) are dropped, then the first integer or
    decimal number found in the text is converted. Falsy input, or text
    that contains no digits, yields 0.0.
    """
    if not s:
        return 0.0

    text = str(s).replace(",", "")
    found = re.search(r"\d+(\.\d+)?", text)
    if found is None:
        return 0.0
    return float(found.group())

# Multipliers for the magnitude suffixes that may follow a sold count
# (Latin abbreviations and Thai number words).
_SOLD_UNIT_MULTIPLIERS = {
    "k": 1000,
    "พัน": 1000,
    "หมื่น": 10000,
    "แสน": 100000,
    "m": 1000000,
    "ล้าน": 1000000,
}

# A number (optionally decimal) followed by an optional magnitude suffix.
# The Latin suffixes must not be the start of a longer word ("12 more").
_SOLD_PATTERN = re.compile(
    r"(\d+(?:\.\d*)?|\.\d+)\s*((?:k|m)(?![a-z])|พัน|หมื่น|แสน|ล้าน)?",
    re.IGNORECASE,
)


def clean_sold(s):
    """Parse a displayed "sold" counter into an integer.

    Commas are removed, then the first number in the text is read together
    with an optional magnitude suffix directly after it (``k``/``พัน``,
    ``หมื่น``, ``แสน``, ``m``/``ล้าน``).

    Args:
        s: Raw text (or any value convertible with ``str``).

    Returns:
        The sold count as an int; 0 for falsy input or when the text
        contains no number.
    """
    if not s:
        return 0

    match = _SOLD_PATTERN.search(str(s).replace(",", ""))
    if match is None:
        return 0

    number, unit = match.groups()
    mult = _SOLD_UNIT_MULTIPLIERS[unit.lower()] if unit else 1
    # Round away binary float noise (e.g. 1000.9999999999999) before
    # truncating, so "1.001k" is 1001 while plain "1.5" still truncates to 1.
    return int(round(float(number) * mult, 6))
    
def clean_rating(s):
    """Extract a rating value from text as a float.

    The input is stringified and searched for the first integer or decimal
    number; 0.0 is returned when no digit is present.
    """
    found = re.search(r"\d+\.?\d*", str(s))
    if not found:
        return 0.0
    return float(found.group())

async def get_current_page_number(page):
    """Return the number shown on the highlighted pagination button.

    Args:
        page: Playwright page object; only ``page.locator`` is used.

    Returns:
        The label of the first ``button.shopee-button-solid--primary``
        element as an int, or None when no such button exists or its label
        is not a plain integer.
    """
    active_btn = page.locator("button.shopee-button-solid--primary")
    if await active_btn.count() == 0:
        return None

    # Locator actions are strict: inner_text() on a locator that resolves to
    # several elements raises, so pin the first match explicitly.
    txt = await active_btn.first.inner_text()
    try:
        return int(txt.strip())
    except ValueError:
        # Label is not numeric; treat the page number as unknown.
        return None

def store_from_url(url):
    """Derive a shop slug from a Shopee shop URL.

    The ``https://shopee.co.th/`` prefix is removed, anything from the first
    ``#`` and then from the first ``?`` onward is discarded, and surrounding
    slashes are trimmed.
    """
    remainder = url.replace("https://shopee.co.th/", "")
    without_fragment = remainder.partition("#")[0]
    without_query = without_fragment.partition("?")[0]
    return without_query.strip("/")

# Shopee product links embed "i.<shop_id>.<product_id>".
_ITEM_ID_PATTERN = re.compile(r"i\.(\d+)\.(\d+)")


async def _first_text(scope, selector):
    """Return the inner text of the first match of *selector* in *scope*, or ""."""
    target = scope.locator(selector)
    if await target.count() == 0:
        return ""
    # .first avoids a strict-mode error when several elements match.
    return await target.first.inner_text()


async def _first_attribute(scope, selector, name):
    """Return attribute *name* of the first match of *selector*, or None."""
    target = scope.locator(selector)
    if await target.count() == 0:
        return None
    return await target.first.get_attribute(name)


async def extract_items(page, shop_name):
    """Collect one record per product card currently present on *page*.

    Args:
        page: Playwright page object showing a shop's product grid.
        shop_name: Value copied into every record's ``shop_name`` field.

    Returns:
        A list of dicts with the keys ``shop_id``, ``product_id``,
        ``shop_name``, ``product_name``, ``price``, ``sold``, ``rating``,
        ``image`` and ``url``. Fields whose element is absent fall back to
        an empty string / zero (``image`` falls back to None) instead of
        raising, so one incomplete card does not abort the whole page.
    """
    items = []
    cards = page.locator("div.shop-search-result-view__item")
    count = await cards.count()

    for i in range(count):
        card = cards.nth(i)

        product_name = await _first_text(card, "div.line-clamp-2")
        price = clean_price(await _first_text(card, "span.font-medium.truncate"))
        sold = clean_sold(await _first_text(card, "div.text-shopee-black87"))
        rating = clean_rating(await _first_text(card, "span.inline-block"))

        # Checked for presence first so a card without <img> does not block
        # on the locator timeout.
        img = await _first_attribute(card, "img", "src")

        # get_attribute returns None when the anchor has no href; normalise
        # to "" so the string operations below are always safe.
        url = (await _first_attribute(card, "a", "href")) or ""
        if url and not url.startswith("https"):
            url = "https://shopee.co.th" + url

        # Shop ID and product ID are parsed from the product URL.
        shop_id = ""
        product_id = ""
        match = _ITEM_ID_PATTERN.search(url)
        if match:
            shop_id, product_id = match.groups()

        items.append({
            "shop_id": shop_id,
            "product_id": product_id,
            "shop_name": shop_name,
            "product_name": product_name.strip(),
            "price": price,
            "sold": sold,
            "rating": rating,
            "image": img,
            "url": url,
        })
    return items

# --- Main Scraper ---
async def main():
    """Scrape every shop in SHOP_URLS and export the products to Excel.

    Attaches over CDP to a browser listening on ``http://127.0.0.1:9222``,
    opens one tab per shop, walks the shop's pagination collecting product
    cards via ``extract_items``, then writes all records to a timestamped
    ``shopee_products_<YYYYmmdd_HHMMSS>.xlsx`` file.

    Returns:
        The pandas DataFrame of collected products, or None when the
        browser connection fails or nothing was collected.
    """
    async with async_playwright() as p:
        print("Attaching to running browser...")
        try:
            browser = await p.chromium.connect_over_cdp("http://127.0.0.1:9222")
            context = browser.contexts[0]
        except Exception as e:
            print(f"Connection failed: {e}")
            return None

        master_items = []

        for idx, shop_url in enumerate(SHOP_URLS):
            print(f"\n[{idx + 1}/{len(SHOP_URLS)}] Processing URL: {shop_url}")
            page = await context.new_page()
            shop_total_items = 0

            try:
                await page.goto(shop_url, wait_until="domcontentloaded")

                # Shop name: prefer the on-page heading, fall back to the URL slug.
                try:
                    h1_selector = "h1.section-seller-overview-horizontal__portrait-name"
                    await page.wait_for_selector(h1_selector, state="visible", timeout=5000)
                    shop_name = (await page.locator(h1_selector).first.inner_text()).strip()
                except Exception:
                    # `except Exception` (not bare) so task cancellation and
                    # KeyboardInterrupt still propagate.
                    print("   ! Warning: H1 not found within 5s, fallback to URL name.")
                    shop_name = store_from_url(shop_url)

                print(f"   > Shop Name: {shop_name}")

                page_no = 1
                while True:
                    # Scroll down in steps so lazily loaded images get their src.
                    for _ in range(8):
                        await page.mouse.wheel(0, 2000)
                        await asyncio.sleep(0.5)

                    items = await extract_items(page, shop_name)
                    count = len(items)

                    if count == 0:
                        print(f"   ! Page {page_no}: No items found. Stop.")
                        break

                    master_items.extend(items)
                    shop_total_items += count

                    print(f"   > Page {page_no}: Found {count} items | Total: {shop_total_items}")

                    # Pagination check.
                    current_page = await get_current_page_number(page)
                    next_btn = page.locator("button.shopee-icon-button--right")

                    if await next_btn.count() == 0:
                        print(f"   ! Page {page_no}: No next button. Stop.")
                        break

                    if await next_btn.get_attribute("disabled") is not None:
                        print(f"   ! Page {page_no}: Next button disabled (Last Page). Stop.")
                        break

                    await next_btn.click()
                    await asyncio.sleep(2)

                    # An unchanged page number after the click means there is
                    # no further page (this also triggers when the number is
                    # unreadable both before and after, i.e. None == None).
                    new_page = await get_current_page_number(page)
                    if new_page == current_page:
                        print(f"   ! Page {page_no}: Page number didn't change. Stop.")
                        break

                    page_no += 1

            except Exception as e:
                print(f"   ! Error: {e}")

            finally:
                print(f"   >>> Collected {shop_total_items} items for this shop.")
                print(f"   [Closed Tab for Shop {idx + 1}]")
                await page.close()

        # Final summary
        if not master_items:
            print("\nNo items collected.")
            return None

        df = pd.DataFrame(master_items)

        df['collected_at'] = pd.Timestamp.now().floor('s')

        cols = [
            "shop_id",
            "product_id",
            "shop_name",
            "product_name",
            "price",
            "sold",
            "rating",
            "image",
            "url",
            "collected_at",
        ]

        df = df[[c for c in cols if c in df.columns]]

        # Excel export
        now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"shopee_products_{now}.xlsx"
        df.to_excel(filename, index=False)

        print(f"\n{'='*50}")
        print(f"All done! Grand Total: {len(df)} items collected.")
        print(f"{'='*50}")

        # `display` only exists in IPython/Jupyter; fall back to print elsewhere.
        try:
            display(df.head())
        except Exception:
            print(df.head())

        # Hand the data back so `df = asyncio.run(main())` yields the frame.
        return df

# Run the scraper and bind main()'s return value to `df`.
# Alternative for a notebook cell with top-level await: `await main()`.
df = asyncio.run(main())

Attaching to running browser...

[1/2] Processing URL: https://shopee.co.th/mizumi_officialshop
   > Shop Name: mizumi_officialshop
   > Page 1: Found 30 items | Total: 30
   > Page 2: Found 30 items | Total: 60
   > Page 3: Found 30 items | Total: 90
   > Page 4: Found 30 items | Total: 120
   > Page 5: Found 27 items | Total: 147
   ! Page 5: Page number didn't change. Stop.
   >>> Collected 147 items for this shop.
   [Closed Tab for Shop 1]

[2/2] Processing URL: https://shopee.co.th/moleculogy_by_diamond_grains
   > Shop Name: Moleculogy by Diamond Grains
   > Page 1: Found 30 items | Total: 30
   > Page 2: Found 1 items | Total: 31
   ! Page 2: Page number didn't change. Stop.
   >>> Collected 31 items for this shop.
   [Closed Tab for Shop 2]

All done! Grand Total: 178 items collected.


Unnamed: 0,shop_id,product_id,shop_name,product_name,price,sold,rating,image,url,collected_at
0,70802054,23976658475,mizumi_officialshop,[แพ็คคู่] MizuMi UV Bright Body Serum Fragranc...,389.0,50000,4.9,https://down-th.img.susercontent.com/file/th-1...,https://shopee.co.th/-แพ็คคู่-MizuMi-UV-Bright...,2025-12-15 09:53:33
1,70802054,18593775620,mizumi_officialshop,[แพ็คคู่ SET DUO] MizuMi Bright Body Serum เซต...,389.0,20000,4.9,https://down-th.img.susercontent.com/file/sg-1...,https://shopee.co.th/-แพ็คคู่-SET-DUO-MizuMi-B...,2025-12-15 09:53:33
2,70802054,44000340668,mizumi_officialshop,MizuMi pH Balancing Facial Cleanser 100 ml. เจ...,195.0,10000,4.9,https://down-th.img.susercontent.com/file/sg-1...,https://shopee.co.th/MizuMi-pH-Balancing-Facia...,2025-12-15 09:53:33
3,70802054,1464312736,mizumi_officialshop,[เลือกแพ็คด้านใน] MizuMi Smooth Cleansing Wate...,489.0,100000,4.9,https://down-th.img.susercontent.com/file/sg-1...,https://shopee.co.th/-เลือกแพ็คด้านใน-MizuMi-S...,2025-12-15 09:53:33
4,70802054,1208665973,mizumi_officialshop,MizuMi Extra Mild Facial Cleanser 100ml เจลล้า...,114.0,20000,4.9,https://down-th.img.susercontent.com/file/sg-1...,https://shopee.co.th/MizuMi-Extra-Mild-Facial-...,2025-12-15 09:53:33
