# Shopee API Scraper

Update: 2024-11-11

In [9]:
from selenium import webdriver
from selenium.webdriver.edge.options import Options
import time
import re
import json
import numpy as np
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup
import os

# Set up Edge WebDriver
def setup_webdriver():
    """Start an Edge WebDriver session and try to register an extra HTTP header.

    Two CDP commands are issued: ``Network.enable`` and
    ``Network.setExtraHTTPHeaders`` with the header ``af-ac-enc-dat: null``.
    If either command raises, the error is printed and the driver is still
    returned.

    Returns:
        The Edge WebDriver instance.
    """
    browser = webdriver.Edge(options=Options())

    # Payload for Network.setExtraHTTPHeaders
    extra_headers = {'headers': {'af-ac-enc-dat': 'null'}}

    try:
        browser.execute_cdp_cmd('Network.enable', {})
        browser.execute_cdp_cmd('Network.setExtraHTTPHeaders', extra_headers)
    except Exception as e:
        # Best effort: a failed CDP call must not prevent the session from being used.
        print(f"CDP command error: {e}")

    return browser

# Function to wait for Shopee login
def login_shopee(driver):
    """Open the Shopee login page in a second tab and wait for a manual login.

    The function polls ``driver.current_url`` every 2 seconds and returns once
    the URL contains ``"verify/captcha?"`` or ``"home"``. There is no timeout:
    it blocks until one of those markers shows up. Focus is then moved back to
    the first tab.

    Args:
        driver: WebDriver whose first window handle is the tab to return to.
    """
    def _login_finished(address):
        # 'verify/captcha?' replaced the older 'verify/traffic/error' marker.
        return "verify/captcha?" in address or "home" in address

    # The login page goes into a new tab, which becomes window handle 1.
    driver.execute_script("window.open('https://shopee.co.th/buyer/login', '_blank');")
    driver.switch_to.window(driver.window_handles[1])

    print("Please log in manually...")
    while not _login_finished(driver.current_url):
        time.sleep(2)
    print("Login successful and verified.")

    # Hand control back to the original (product page) tab.
    driver.switch_to.window(driver.window_handles[0])

# Function to refresh the Shopee product page
def refresh_page(driver):
    """Reload the page currently loaded in ``driver`` and print a confirmation."""
    confirmation = "Refreshed the product page."
    driver.refresh()
    print(confirmation)

# Function to sanitize file names by removing invalid characters and limiting the length
def sanitize_filename(filename, max_length=100):
    """Turn arbitrary text into a string that is safe to use as a file name.

    The characters ``\\ / * ? : " < > |`` and ASCII control characters
    (newlines, tabs, ...) are removed, the result is cut to ``max_length``
    characters and surrounding whitespace is stripped.

    Args:
        filename: Text to clean. ``None`` and NaN are treated as empty;
            other non-string values are converted with ``str()``.
        max_length: Maximum number of characters kept.

    Returns:
        The cleaned name, or ``"untitled"`` when nothing usable is left.
    """
    if filename is None or filename != filename:  # NaN is the only value != itself
        filename = ""
    elif not isinstance(filename, str):
        filename = str(filename)

    # Control characters are as invalid in file names as the punctuation above.
    sanitized = re.sub(r'[\\/*?:"<>|\x00-\x1f]', "", filename)
    sanitized = sanitized[:max_length].strip()

    # Never return an empty name: it would produce a file called just ".xlsx".
    return sanitized or "untitled"

# Function to check if the file exists and append a number if necessary
def check_and_create_filename(sanitized_product_name):
    """Return an ``.xlsx`` file name that does not collide with an existing file.

    ``<name>.xlsx`` is tried first, then ``<name>(1).xlsx``, ``<name>(2).xlsx``
    and so on until ``os.path.isfile`` reports that the candidate is free.

    Args:
        sanitized_product_name: Base name without extension.

    Returns:
        The first free candidate file name.
    """
    candidate = f"{sanitized_product_name}.xlsx"
    attempt = 0

    # Keep bumping the numeric suffix while the candidate is already taken.
    while os.path.isfile(candidate):
        attempt += 1
        candidate = f"{sanitized_product_name}({attempt}).xlsx"

    return candidate

# Function to scrape ratings from Shopee
def scrape_ratings(driver, url, max_offset=3000, delay=5):
    """Collect the ratings of one Shopee product into a DataFrame.

    The shop id and item id are taken from the ``i.<shopid>.<itemid>`` part of
    ``url``. The ``get_ratings`` endpoint is then loaded through ``driver`` in
    pages of 50, starting at offset 0, until a page holds fewer than 50 ratings
    or ``max_offset`` is reached. Pages that cannot be parsed, or that carry no
    ratings list, are skipped.

    Args:
        driver: WebDriver-like object providing ``get(url)`` and ``page_source``.
        url: Product page URL containing ``i.<shopid>.<itemid>``.
        max_offset: Largest offset requested (inclusive).
        delay: Seconds to sleep before every request.

    Returns:
        A DataFrame with the columns ``username``, ``rating``, ``comment``,
        ``date``, ``product_selected``, ``name`` and ``options``; one row per
        rating.

    Raises:
        ValueError: If ``url`` does not contain ``i.<shopid>.<itemid>``.
    """
    page_size = 50

    # Use regex to extract shopid and itemid from the URL
    match = re.search(r"i\.(\d+)\.(\d+)", url)
    if match is None:
        raise ValueError(f"No 'i.<shopid>.<itemid>' segment found in URL: {url}")
    shopid, itemid = match[1], match[2]

    base_url = (
        "https://shopee.co.th/api/v2/item/get_ratings?filter=0&flag=1"
        f"&itemid={itemid}&limit={page_size}&offset={{offset}}&shopid={shopid}&type=0"
    )

    # One list per output column; every rating appends to all of them at once.
    d = {
        "username": [],
        "rating": [],
        "comment": [],
        "date": [],
        "product_selected": [],
        "name": [],
        "options": [],
    }

    total_comments = 0

    for offset in range(0, max_offset + 1, page_size):
        page_url = base_url.format(offset=offset)
        time.sleep(delay)  # throttle between requests
        driver.get(page_url)

        # The JSON response is read back from the text of the first <div> of the page.
        soup = BeautifulSoup(driver.page_source, "html.parser")
        try:
            data = json.loads(soup.find("div").text)
        except (AttributeError, TypeError, ValueError) as e:
            # No <div> on the page, or its text is not valid JSON.
            print(f"Error parsing page source for offset {offset}: {e}")
            continue

        payload = data.get("data") if isinstance(data, dict) else None
        if payload is None:
            print(f"No data found for URL: {page_url}")
            continue

        ratings = payload.get("ratings") if isinstance(payload, dict) else None
        if not isinstance(ratings, list):
            continue

        comments_in_page = len(ratings)
        total_comments += comments_in_page
        print(f"Offset {offset}: {comments_in_page} comments")

        for rating in ratings:
            # Work out every field first so a bad record cannot leave the
            # column lists with different lengths.
            try:
                # NOTE: fromtimestamp() yields naive local time.
                date = datetime.fromtimestamp(rating.get("ctime")).strftime("%Y-%m-%d %H:%M:%S")
            except (TypeError, ValueError, OverflowError, OSError):
                date = None  # missing or unusable timestamp

            product_items = rating.get("product_items") or []
            if product_items:
                selected = product_items
                name = product_items[0].get("name", np.nan)
                options = product_items[0].get("options", np.nan)
            else:
                selected = name = options = None

            d["username"].append(rating.get("author_username", np.nan))
            d["rating"].append(rating.get("rating_star", np.nan))
            d["comment"].append(rating.get("comment", ""))
            d["date"].append(date)
            d["product_selected"].append(selected)
            d["name"].append(name)
            d["options"].append(options)

        if comments_in_page < page_size:
            # A short page is the last one; stop instead of requesting (and
            # sleeping before) every remaining offset up to max_offset.
            break

    # Convert dictionary to DataFrame
    df = pd.DataFrame(d)
    print(f"Fetched all {total_comments} comments")
    return df

#### Scraping comments from many products, creating a corresponding DataFrame (df1, df2, df3, ...) for each and saving it to its own Excel file

In [10]:
# List of URLs to scrape
url_list = [
    "https://shopee.co.th/LACTACYD-ALL-DAY-CARE-250ML-%E0%B9%81%E0%B8%A5%E0%B8%84%E0%B8%95%E0%B8%B2%E0%B8%8B%E0%B8%B4%E0%B8%94-%E0%B8%97%E0%B8%B3%E0%B8%84%E0%B8%A7%E0%B8%B2%E0%B8%A1%E0%B8%AA%E0%B8%B0%E0%B8%AD%E0%B8%B2%E0%B8%94%E0%B8%88%E0%B8%B8%E0%B8%94%E0%B8%8B%E0%B9%88%E0%B8%AD%E0%B8%99%E0%B9%80%E0%B8%A3%E0%B9%89%E0%B8%99-%E0%B8%AD%E0%B8%AD%E0%B8%A5-%E0%B9%80%E0%B8%94%E0%B8%A2%E0%B9%8C-%E0%B9%81%E0%B8%84%E0%B8%A3%E0%B9%8C-%E0%B8%94%E0%B8%B9%E0%B9%81%E0%B8%A5%E0%B8%AD%E0%B8%A2%E0%B9%88%E0%B8%B2%E0%B8%87%E0%B8%AD%E0%B9%88%E0%B8%AD%E0%B8%99%E0%B9%82%E0%B8%A2%E0%B8%99-250-%E0%B8%A1%E0%B8%A5.-i.313368031.4854778729?sp_atk=23955430-5bef-46bb-aa48-cd52e63bb26c",
    "https://shopee.co.th/Lactacyd-%E0%B9%81%E0%B8%A5%E0%B8%84%E0%B8%95%E0%B8%B2%E0%B8%8B%E0%B8%B4%E0%B8%94-%E0%B8%AD%E0%B8%AD%E0%B8%A5-%E0%B9%80%E0%B8%94%E0%B8%A2%E0%B9%8C-%E0%B9%81%E0%B8%84%E0%B8%A3%E0%B9%8C-250-%E0%B8%A1%E0%B8%A5.-i.31092295.5205403008?sp_atk=ee9ecfaa-edce-403c-ada1-5cfab4cd2e8f&xptdk=ee9ecfaa-edce-403c-ada1-5cfab4cd2e8f",
    "https://shopee.co.th/%E0%B9%81%E0%B8%A5%E0%B8%84%E0%B8%95%E0%B8%B2%E0%B8%8B%E0%B8%B4%E0%B8%94-%E0%B8%9C%E0%B8%A5%E0%B8%B4%E0%B8%95%E0%B8%A0%E0%B8%B1%E0%B8%93%E0%B8%91%E0%B9%8C%E0%B8%97%E0%B8%B3%E0%B8%84%E0%B8%A7%E0%B8%B2%E0%B8%A1%E0%B8%AA%E0%B8%B0%E0%B8%AD%E0%B8%B2%E0%B8%94%E0%B8%88%E0%B8%B8%E0%B8%94%E0%B8%8B%E0%B9%88%E0%B8%AD%E0%B8%99%E0%B9%80%E0%B8%A3%E0%B9%89%E0%B8%99-%E0%B9%80%E0%B8%99%E0%B8%8A%E0%B8%AD%E0%B8%A3%E0%B8%B1%E0%B8%A5%E0%B9%81%E0%B8%84%E0%B8%A3%E0%B9%8C-250-%E0%B8%A1%E0%B8%B4%E0%B8%A5%E0%B8%A5%E0%B8%B4%E0%B8%A5%E0%B8%B4%E0%B8%95%E0%B8%A3-%E0%B9%81%E0%B8%9E%E0%B9%87%E0%B8%84%E0%B8%84%E0%B8%B9%E0%B9%88-i.26978286.6915995785?sp_atk=fc1e79fe-c82e-4752-9099-abb58e999540&xptdk=fc1e79fe-c82e-4752-9099-abb58e999540",
    "https://shopee.co.th/Lactacyd-Care-Feminine-Wash-Sweet-Flora-150-ML.-%E0%B9%81%E0%B8%A5%E0%B8%84%E0%B8%95%E0%B8%B2%E0%B8%8B%E0%B8%B4%E0%B8%94%E0%B8%9C%E0%B8%A5%E0%B8%B4%E0%B8%95%E0%B8%A0%E0%B8%B1%E0%B8%93%E0%B8%91%E0%B9%8C%E0%B8%97%E0%B8%B3%E0%B8%84%E0%B8%A7%E0%B8%B2%E0%B8%A1%E0%B8%AA%E0%B8%B0%E0%B8%AD%E0%B8%B2%E0%B8%94%E0%B8%88%E0%B8%B8%E0%B8%94%E0%B8%8B%E0%B9%88%E0%B8%AD%E0%B8%99%E0%B9%80%E0%B8%A3%E0%B9%89%E0%B8%99%E0%B8%AA%E0%B8%B9%E0%B8%95%E0%B8%A3%E0%B8%AD%E0%B8%AD%E0%B8%A5%E0%B9%80%E0%B8%94%E0%B8%A2%E0%B9%8C%E0%B9%81%E0%B8%84%E0%B8%A3%E0%B9%8C-150-%E0%B8%A1%E0%B8%A5.-i.325262295.8099406529?sp_atk=61222b9f-a460-4daa-88bc-5e6717605b52&xptdk=61222b9f-a460-4daa-88bc-5e6717605b52",
]

# Set up WebDriver
driver = setup_webdriver()

try:
    # Log in to Shopee
    login_shopee(driver)

    # Scrape data from all URLs in the list
    for url in url_list:
        df = scrape_ratings(driver, url)

        # Remove blank comments
        df["comment"] = df["comment"].replace("", np.nan)
        df = df.dropna(subset=["comment"])

        # Drop duplicates
        df = df.drop_duplicates(subset=["username", "comment"])

        if df.empty:
            # Nothing left to write; also avoids indexing into an empty frame.
            print(f"No comments to save for URL: {url}")
            continue

        # Take the first available product name by position. A label lookup
        # (df["name"][0]) fails once row 0 has been dropped as blank/duplicate.
        product_names = df["name"].dropna()
        product_name = product_names.iloc[0] if not product_names.empty else "shopee_ratings"

        # Sanitize the product name for the filename and limit its length
        sanitized_product_name = sanitize_filename(str(product_name))

        # Use the check_and_create_filename function to generate a unique file name
        output_filename = check_and_create_filename(sanitized_product_name)

        # Save the DataFrame to an excel file
        df.to_excel(output_filename, index=False)

        # Print confirmation
        print(f"Saved: {output_filename}")
finally:
    # Quit the driver even when login, scraping or saving fails, so the
    # browser process is not left running.
    driver.quit()

Please log in manually...
Login successful and verified.
Offset 0: 50 comments
Offset 50: 50 comments
Offset 100: 50 comments
Offset 150: 50 comments
Offset 200: 27 comments
Fetched all 227 comments
Saved: LACTACYD Feminine Wash NATURAL CARE 250 ML  แลคตาซิด เนเชอรัล แคร์ ดูแลอย่างอ่อนโยน  250 มล..xlsx
Offset 0: 50 comments
Offset 50: 50 comments
Offset 100: 50 comments
Offset 150: 50 comments
Offset 200: 50 comments
Offset 250: 50 comments
Offset 300: 50 comments
Offset 350: 50 comments
Offset 400: 50 comments
Offset 450: 50 comments
Offset 500: 50 comments
Offset 550: 50 comments
Offset 600: 23 comments
Fetched all 623 comments
Saved: Lactacyd แลคตาซิด ออล เดย์ แคร์ 250 มล..xlsx
Offset 0: 50 comments
Offset 50: 50 comments
Offset 100: 50 comments
Offset 150: 50 comments
Offset 200: 50 comments
Offset 250: 50 comments
Offset 300: 50 comments
Offset 350: 45 comments
Fetched all 395 comments
Saved: แลคตาซิด ผลิตภัณฑ์ทำความสะอาดจุดซ่อนเร้น เนชอรัลแคร์ 250 มิลลิลิตร แพ็คคู่.xlsx
Offset 0: