## Shopee Reviews Scraper (DOM/UI Method)

In [2]:
import nest_asyncio
# Patch asyncio so that the asyncio.run() call at the bottom of this cell can be
# used even when an event loop is already running (presumably the Jupyter
# kernel's loop — this file looks like an exported notebook cell).
nest_asyncio.apply() 

import asyncio
import pandas as pd
import re
from playwright.async_api import async_playwright

# Shopee product pages to scrape, visited one at a time by main().
# main() reads the shop and product IDs from the `i.<shop_id>.<product_id>`
# segment of the URL of the loaded page.
PRODUCTS_URLS = [
    "https://shopee.co.th/Samsung-Galaxy-S25-Ultra-(5G)-12-256GB-%E0%B8%AB%E0%B8%99%E0%B9%89%E0%B8%B2%E0%B8%88%E0%B8%AD-6.9-%E0%B8%99%E0%B8%B4%E0%B9%89%E0%B8%A7-%E0%B8%81%E0%B8%A5%E0%B9%89%E0%B8%AD%E0%B8%87200MP-%E0%B9%81%E0%B8%9A%E0%B8%95%E0%B9%80%E0%B8%95%E0%B8%AD%E0%B8%A3%E0%B8%B5%E0%B9%88-5-000mAh-%E0%B8%9B%E0%B8%A3%E0%B8%B0%E0%B8%81%E0%B8%B1%E0%B8%99%E0%B8%A8%E0%B8%B9%E0%B8%99%E0%B8%A2%E0%B9%8C%E0%B9%84%E0%B8%97%E0%B8%A2-%E0%B8%9E%E0%B8%A3%E0%B9%89%E0%B8%AD%E0%B8%A1%E0%B8%AA%E0%B9%88%E0%B8%87-By-WPN-Mall-i.1249488955.27775912242?extraParams=%7B%22display_model_id%22%3A198207568255%7D", 
    "https://shopee.co.th/adidas-Lifestyle-Gazelle-Indoor-Shoes-Unisex-Pink-IE7002-i.217077552.29856820699?extraParams=%7B%22display_model_id%22%3A233059988107%2C%22model_selection_logic%22%3A3%7D&xptdk=73309b93-28b3-42a3-9d59-07188f31cd2a",
]

async def _first_text(locator, default):
    """Return the stripped inner text of the first element matching *locator*.

    Returns *default* when the locator matches nothing.
    """
    if await locator.count() == 0:
        return default
    return (await locator.first.inner_text()).strip()


async def _read_page_field(page, selector, label):
    """Best-effort read of a single text field from the product page.

    Returns "" when the element is missing or reading it fails; a failure is
    reported on stdout instead of being silently swallowed.
    """
    try:
        return await _first_text(page.locator(selector), "")
    except Exception as exc:  # best effort: a missing label must not abort the product
        print(f"   ! Could not read {label}: {exc}")
        return ""


async def _parse_review_card(card):
    """Extract ``(user, posted, rating, comment)`` from one review card locator."""
    header = card.locator("div.d72He7")
    user = await _first_text(header.locator(".InK5kS"), None)
    # Raw header line; the post-processing step splits it on "|" into a
    # timestamp part and an option part.
    posted = await _first_text(header.locator(".XYk98l"), None)
    # The rating is the number of solid star icons found in the header.
    rating = await header.locator(".BnXxkm svg.icon-rating-solid").count()
    # `.first` (via _first_text) keeps this consistent with the other fields and
    # avoids a strict-mode error if several elements match inside one card.
    comment = (await _first_text(card.locator("div.meQyXP"), "")).replace("\n", " ").strip()
    return user, posted, rating, comment


def _build_dataframe(reviews):
    """Turn the collected review dicts into the final, column-ordered DataFrame.

    An empty input yields an empty DataFrame that still carries the expected
    columns, so downstream column access does not raise ``KeyError``.
    """
    cols = [
        "shop_id",
        "product_id",
        "shop_name",
        "product_name",
        "user",
        "rating",
        # "time" (the raw header string) is intentionally replaced by date + option.
        "date",
        "option",
        "comment",
        "collected_at",
    ]
    if not reviews:
        return pd.DataFrame(columns=cols)

    df = pd.DataFrame(reviews)
    df["collected_at"] = pd.Timestamp.now().floor("s")

    # Split the raw header string into the timestamp (before "|") and the
    # purchased option (after "|", keeping only the text following the first ":").
    split_data = df["time"].str.split("|", n=1, expand=True)
    df["date"] = pd.to_datetime(
        split_data[0].str.strip(), format="%Y-%m-%d %H:%M", errors="coerce"
    )
    if 1 in split_data.columns:
        df["option"] = split_data[1].str.split(":", n=1).str[1].str.strip()
    else:
        df["option"] = None

    return df[cols]


async def main(product_urls=None, cdp_url="http://127.0.0.1:9222") -> pd.DataFrame:
    """Scrape product reviews through an already-running Chromium browser.

    Attaches to the browser over CDP, opens each product URL in a new tab,
    pages through the review list via the DOM and collects one row per review.

    Args:
        product_urls: Iterable of product page URLs. Defaults to the
            module-level ``PRODUCTS_URLS``.
        cdp_url: CDP endpoint of the running browser.

    Returns:
        A DataFrame with one row per collected review. It is empty (but has
        the expected columns) when the connection fails or nothing is found.
    """
    urls = PRODUCTS_URLS if product_urls is None else list(product_urls)

    async with async_playwright() as p:
        print("Attaching to running browser...\n")
        try:
            browser = await p.chromium.connect_over_cdp(cdp_url)
            context = browser.contexts[0]
        except Exception as e:
            print(f"Connection failed: {e}")
            return _build_dataframe([])

        master_reviews = []

        for idx, url in enumerate(urls):
            print(f"\n[{idx + 1}/{len(urls)}] Processing URL: {url}")
            page = await context.new_page()
            product_total_comments = 0

            try:
                await page.goto(url, wait_until="networkidle")

                # --- Product Info ---
                match = re.search(r"i\.(\d+)\.(\d+)", page.url)
                shop_id = match.group(1) if match else ""
                product_id = match.group(2) if match else ""

                # NOTE(review): these class names look auto-generated and may
                # change when the site is redeployed — verify if fields come back empty.
                shop_name = await _read_page_field(page, "div.fV3TIn", "shop name")
                product_name = await _read_page_field(page, "h1.vR6K3w", "product name")

                print(f"   > Shop ID: {shop_id}")
                print(f"   > Product ID: {product_id}")
                print(f"   > Shop Name: {shop_name}")
                print(f"   > Product Name: {product_name}")

                # --- Scraping Loop ---
                # Click the rating filter whose label contains the given text, if present.
                comment_filter = page.locator("div.product-rating-overview__filter", has_text="ความคิดเห็น")
                if await comment_filter.count() > 0:
                    await comment_filter.first.click()
                    await page.wait_for_timeout(1000)
                else:
                    print("   ! No comment filter detected.")

                page_idx = 1
                previous_signature = None

                while True:
                    root = page.locator("div.shopee-product-comment-list")
                    try:
                        await root.wait_for(state="visible", timeout=8000)
                    except Exception:
                        # No visible review list within the timeout: nothing (more) to read.
                        break

                    cards = root.locator("> div")
                    count = await cards.count()
                    if count == 0:
                        break

                    page_reviews = []
                    page_signature = set()

                    for i in range(count):
                        user, posted, rating, comment = await _parse_review_card(cards.nth(i))
                        page_reviews.append({
                            "shop_id": shop_id,
                            "shop_name": shop_name,
                            "product_id": product_id,
                            "product_name": product_name,
                            "user": user,
                            "rating": rating,
                            "time": posted,
                            "comment": comment,
                            "page": page_idx,
                            "source_url": url,
                        })
                        page_signature.add((user, posted, comment))

                    # If this page is identical to the previous one, clicking
                    # "next" did not advance the list: stop without counting it again.
                    if page_signature == previous_signature:
                        print(f"   ! Page {page_idx}: Data duplicate. Stop.")
                        break

                    master_reviews.extend(page_reviews)
                    product_total_comments += len(page_reviews)
                    print(f"   > Page {page_idx}: Found {count} comments | Total: {product_total_comments}")
                    previous_signature = page_signature

                    next_btn = page.locator("button.shopee-icon-button.shopee-icon-button--right")
                    if await next_btn.count() == 0 or await next_btn.get_attribute("disabled") is not None:
                        break

                    await next_btn.click()
                    await page.wait_for_timeout(2500)
                    page_idx += 1

            except Exception as e:
                # Keep whatever was collected so far and move on to the next product.
                print(f"Error processing URL {url}: {e}")

            finally:
                print(f"   >>> Collected {product_total_comments} comments for this product.")
                print(f"   [Closed Tab for Product {idx + 1}]")
                await page.close()

        # --- Post-Processing ---
        df = _build_dataframe(master_reviews)

        print(f"\n{'='*50}")
        print(f"All done! Grand Total: {len(df)} comments collected.")
        print(f"{'='*50}")

        # Show a preview before returning (this code used to sit after the
        # `return` and was never executed). `display` only exists in
        # IPython/Jupyter; fall back to print elsewhere.
        try:
            display(df.head())
        except NameError:
            print(df.head())

        return df

# Run the scraper to completion and keep the resulting DataFrame in `df`.
# Alternative noted by the original author: `await main()`.
df = asyncio.run(main())

Attaching to running browser...


[1/2] Processing URL: https://shopee.co.th/Samsung-Galaxy-S25-Ultra-(5G)-12-256GB-%E0%B8%AB%E0%B8%99%E0%B9%89%E0%B8%B2%E0%B8%88%E0%B8%AD-6.9-%E0%B8%99%E0%B8%B4%E0%B9%89%E0%B8%A7-%E0%B8%81%E0%B8%A5%E0%B9%89%E0%B8%AD%E0%B8%87200MP-%E0%B9%81%E0%B8%9A%E0%B8%95%E0%B9%80%E0%B8%95%E0%B8%AD%E0%B8%A3%E0%B8%B5%E0%B9%88-5-000mAh-%E0%B8%9B%E0%B8%A3%E0%B8%B0%E0%B8%81%E0%B8%B1%E0%B8%99%E0%B8%A8%E0%B8%B9%E0%B8%99%E0%B8%A2%E0%B9%8C%E0%B9%84%E0%B8%97%E0%B8%A2-%E0%B8%9E%E0%B8%A3%E0%B9%89%E0%B8%AD%E0%B8%A1%E0%B8%AA%E0%B9%88%E0%B8%87-By-WPN-Mall-i.1249488955.27775912242?extraParams=%7B%22display_model_id%22%3A198207568255%7D
   > Shop ID: 1249488955
   > Product ID: 27775912242
   > Shop Name: wpnmobile mall
   > Product Name: Samsung Galaxy S25 Ultra (5G) 12/256GB หน้าจอ 6.9 นิ้ว กล้อง200MP แบตเตอรี่ 5,000mAh ประกันศูนย์ไทย พร้อมส่ง By WPN Mall
   > Page 1: Found 6 comments | Total: 6
   > Page 2: Found 4 comments | Total: 10
   ! Page 3: Data duplicate. Stop.
   >>> Col