# Shopee API Scraper

In [1]:
from selenium import webdriver
from selenium.webdriver.edge.options import Options
import time
import re
import json
import numpy as np
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup
import os


# Set up Edge WebDriver
def setup_webdriver():
    edge_options = Options()
    driver = webdriver.Edge(options=edge_options)

    # Enable Network interception and set custom headers
    try:
        driver.execute_cdp_cmd('Network.enable', {})
        driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {
            'headers': {
                'af-ac-enc-dat': 'null'
            }
        })
    except Exception as e:
        print(f"CDP command error: {e}")

    return driver


# Function to wait for Shopee login
def login_shopee(driver):
    # Open a new tab to the Shopee login page
    driver.execute_script("window.open('https://shopee.co.th/buyer/login', '_blank');")
    
    # Switch to the login tab
    driver.switch_to.window(driver.window_handles[1])
    
    # Wait for the user to manually log in
    print("Please log in manually...")
    while True:
        current_url = driver.current_url
        # Check if login is successful
        if "verify/captcha?" in current_url or "home" in current_url:  # Updated: change matching text in current_url from 'verify/traffic/error' to 'verify/captcha?'
            print("Login successful and verified.")
            break
        time.sleep(2)
    
    # Switch back to the product page tab
    driver.switch_to.window(driver.window_handles[0])


# Function to refresh the Shopee product page
def refresh_page(driver):
    driver.refresh()
    print("Refreshed the product page.")


# Function to sanitize file names by removing invalid characters and limiting the length
def sanitize_filename(filename, max_length=100):
    sanitized = re.sub(r'[\\/*?:"<>|]', "", filename)
    return sanitized[:max_length]  # Limit filename length to the specified max_length


# Function to check if the file exists and append a number if necessary
def check_and_create_filename(sanitized_product_name):
    base_filename = sanitized_product_name
    counter = 1
    output_filename = f"{base_filename}.xlsx"

    # Check if file exists and modify name if needed
    while os.path.isfile(output_filename):
        output_filename = f"{base_filename}({counter}).xlsx"
        counter += 1

    return output_filename


def scrape_ratings(driver, url):
    # Define offset range and step
    offset_start = 0
    offset_end = 3000
    offset_step = 50

    # Use regex to extract shopid and itemid from the URL
    r = re.search(r"i\.(\d+)\.(\d+)", url)
    shopid, itemid = r[1], r[2]

    base_url = f"https://shopee.co.th/api/v2/item/get_ratings?filter=0&flag=1&itemid={itemid}&limit=50&offset={{offset}}&shopid={shopid}&type=0"

    # Create a dictionary to store the scraped data
    d = {
        "username": [],
        "rating": [],
        "comment": [],
        "date": [],
        "product_selected": [],
        "name": [],
        "options": [],
    }

    total_comments = 0
    consecutive_empty_offsets = 0  # Counter for consecutive offsets with no data
    max_empty_offsets = 3         # Maximum number of consecutive empty offsets allowed

    # Scrape data from each offset page
    for offset in range(offset_start, offset_end + 1, offset_step):
        url = base_url.format(offset=offset)
        # time.sleep(5)
        driver.get(url)

        # Fetch HTML and convert it to JSON
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, "html.parser")

        try:
            div_tag = soup.find("div").text
            data = json.loads(div_tag)
        except Exception as e:
            print(f"Error parsing page source for offset {offset}: {e}")
            consecutive_empty_offsets += 1  # Increment empty counter for failed parse
            if consecutive_empty_offsets >= max_empty_offsets:
                # print("No data for consecutive offsets. Skipping to next URL.")
                break
            continue

        if "data" not in data or data["data"] is None or "ratings" not in data["data"] or not isinstance(data["data"]["ratings"], list):
            # print(f"No data found for offset {offset}.")
            consecutive_empty_offsets += 1  # Increment empty counter for no data
            if consecutive_empty_offsets >= max_empty_offsets:
                # print("No data for consecutive offsets. Skipping to next URL.")
                break
            continue

        # Reset counter if data is found
        consecutive_empty_offsets = 0

        comments_in_page = len(data["data"]["ratings"])
        total_comments += comments_in_page
        print(f"Offset {offset}: {comments_in_page} comments")

        # Store scraped data in the dictionary
        for rating in data["data"]["ratings"]:
            product_items = rating.get("product_items", [])
            d["username"].append(rating.get("author_username", np.nan))
            d["rating"].append(rating.get("rating_star", np.nan))
            d["comment"].append(rating.get("comment", ""))

            # Convert timestamp to date
            date = datetime.fromtimestamp(rating["ctime"]).strftime("%Y-%m-%d %H:%M:%S")
            d["date"].append(date)

            if product_items:
                d["product_selected"].append(product_items)
                d["name"].append(product_items[0].get("name", np.nan))
                d["options"].append(product_items[0].get("options", np.nan))
            else:
                d["product_selected"].append(None)
                d["name"].append(None)
                d["options"].append(None)

    # Convert dictionary to DataFrame
    if total_comments > 0:
        df = pd.DataFrame(d)
        print(f"Fetched all {total_comments} comments")
        return df
    else:
        print(f"No comments found for URL: {url}")
        return pd.DataFrame()

####  scraping comments from many products, create correponding df1,df2,df3

In [3]:
driver = setup_webdriver()
login_shopee(driver)

url_list = [
    "https://shopee.co.th/MizuMi-UV-Water-Serum-SPF50-PA-8g-No.1-Best-Selling-Sunscreen-%E0%B9%80%E0%B8%8B%E0%B8%A3%E0%B8%B1%E0%B9%88%E0%B8%A1%E0%B8%81%E0%B8%B1%E0%B8%99%E0%B9%81%E0%B8%94%E0%B8%94-%E0%B8%9A%E0%B8%B2%E0%B8%87%E0%B9%80%E0%B8%9A%E0%B8%B2-%E0%B8%8B%E0%B8%B6%E0%B8%A1%E0%B9%84%E0%B8%A7-%E0%B9%84%E0%B8%A1%E0%B9%88%E0%B8%AD%E0%B8%B8%E0%B8%94%E0%B8%95%E0%B8%B1%E0%B8%99-%E0%B8%9B%E0%B8%81%E0%B8%9B%E0%B9%89%E0%B8%AD%E0%B8%87%E0%B8%AA%E0%B8%B9%E0%B8%87%E0%B8%AA%E0%B8%B8%E0%B8%94-i.70802054.1208714831?sp_atk=80074ea6-49b8-4581-8d32-650693a36aae"
]

# Scrape data from all URLs in the list
for url in url_list:
    print(f"Processing URL: {url}")
    df = scrape_ratings(driver, url)

    # ตรวจสอบว่า DataFrame มีข้อมูลหรือไม่
    if df.empty:
        print(f"No comments found for URL: {url}. Skipping to next URL.")
        print("-" * 50)
        continue

    # ลบคอมเมนต์ที่ว่าง
    df["comment"] = df["comment"].replace("", np.nan)
    df.dropna(subset=["comment"], inplace=True)

    # ลบข้อมูลที่ซ้ำกัน
    df.drop_duplicates(subset=["username", "comment"], inplace=True)

    # ดึงชื่อผลิตภัณฑ์จากแถวแรก
    product_name = df["name"].iloc[0] if not df.empty else "Unknown Product"

    # แปลงชื่อไฟล์ให้ปลอดภัยและไม่ยาวเกินไป
    sanitized_product_name = sanitize_filename(product_name)

    # สร้างชื่อไฟล์ที่ไม่ซ้ำกัน
    output_filename = check_and_create_filename(sanitized_product_name)

    # บันทึก DataFrame ลงไฟล์ Excel
    df.to_excel(output_filename, index=False)

    # แสดงข้อความยืนยัน
    print(f"Saved: {output_filename}")
    print("-" * 50)

# Quit the driver
# driver.quit()

Please log in manually...
Login successful and verified.
Processing URL: https://shopee.co.th/MizuMi-UV-Water-Serum-SPF50-PA-8g-No.1-Best-Selling-Sunscreen-%E0%B9%80%E0%B8%8B%E0%B8%A3%E0%B8%B1%E0%B9%88%E0%B8%A1%E0%B8%81%E0%B8%B1%E0%B8%99%E0%B9%81%E0%B8%94%E0%B8%94-%E0%B8%9A%E0%B8%B2%E0%B8%87%E0%B9%80%E0%B8%9A%E0%B8%B2-%E0%B8%8B%E0%B8%B6%E0%B8%A1%E0%B9%84%E0%B8%A7-%E0%B9%84%E0%B8%A1%E0%B9%88%E0%B8%AD%E0%B8%B8%E0%B8%94%E0%B8%95%E0%B8%B1%E0%B8%99-%E0%B8%9B%E0%B8%81%E0%B8%9B%E0%B9%89%E0%B8%AD%E0%B8%87%E0%B8%AA%E0%B8%B9%E0%B8%87%E0%B8%AA%E0%B8%B8%E0%B8%94-i.70802054.1208714831?sp_atk=80074ea6-49b8-4581-8d32-650693a36aae
Error parsing page source for offset 0: Expecting value: line 1 column 1 (char 0)
Error parsing page source for offset 50: Expecting value: line 1 column 1 (char 0)
Error parsing page source for offset 100: Expecting value: line 1 column 1 (char 0)
No comments found for URL: https://shopee.co.th/api/v2/item/get_ratings?filter=0&flag=1&itemid=1208714831&limit=50&offset=100&sh