In [10]:
import json
import re
from datetime import datetime

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.edge.options import Options

def scrape_ratings(url):
    # Define offset range and step
    offset_start = 0
    offset_end = 4000
    offset_step = 50

    r = re.search(r"i\.(\d+)\.(\d+)", url)
    shopid, itemid = r[1], r[2]
    base_url = f"https://shopee.co.id/api/v2/item/get_ratings?filter=0&flag=1&itemid={itemid}&limit=50&offset={{offset}}&shopid={shopid}&type=0"

    # Initialize dictionary to store data
    d = {
        "username": [],
        "rating": [],
        "comment": [],
        "date": [],
        "product_selected": [],
        "name": [],
        "options": [],
    }

    for offset in range(offset_start, offset_end + 1, offset_step):
        url = base_url.format(offset=offset)
        driver.get(url)
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, "html.parser")

        div_tag = soup.find("pre").text
        data = json.loads(div_tag)

        if "data" not in data or data["data"] is None:
            print(f"No data found for URL: {url}")
            continue

        if "ratings" not in data["data"] or not isinstance(data["data"]["ratings"], list):
            continue

        for rating in data["data"]["ratings"]:
            d["username"].append(rating.get("author_username", np.nan))
            d["rating"].append(rating.get("rating_star", np.nan))
            d["comment"].append(rating.get("comment", np.nan))

            # Convert timestamp to date
            date = datetime.fromtimestamp(rating["ctime"]).strftime("%Y-%m-%d %H:%M:%S")
            d["date"].append(date)

            # Handle product details
            product_items = rating.get("product_items", [])
            if product_items:
                d["product_selected"].append(product_items)
                d["name"].append(product_items[0].get("name", None))
                d["options"].append(product_items[0].get("options", None))
            else:
                d["product_selected"].append(None)
                d["name"].append(None)
                d["options"].append(None)

    # Validate and pad data
    max_length = max(len(value) for value in d.values())
    for key, value in d.items():
        while len(value) < max_length:
            value.append(None)

    # Create DataFrame
    df = pd.DataFrame(d)

    return df

In [11]:
# import requests

# headers = {
#     "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0",
#     "From": "",
#     "af-ac-enc-dat": "",
#     "x-api-source": "pc",
# }

# params = {
#     "filters": "9",
#     "keyword": "funko",
#     "locations": "Nacional",
#     "noCorrection": "true",
#     "page": "0",
#     "sortBy": "relevancy",
# }

# with requests.Session() as session:
#     response = session.get("https://shopee.com.br/search", params=params, headers=headers)


####  scraping comments from same product but has many size options

In [12]:
# edge_options = Options()
# edge_options.use_chromium = True  # This line is important
# edge_options.add_argument("--headless")
# edge_options.add_argument("--disable-gpu")
# edge_options.add_argument(
#     "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.54"
# )
# edge_options.add_argument("window-size=1920x1080")
# # Path to MicrosoftWebDriver.exe
# # edge_options.binary_location = r"C:\Program Files (x86)\Microsoft\Edge Beta\Application\msedge.exe"

# driver = webdriver.Edge(options=edge_options)

# # List of URLs to scrape
# url_list = [
#     "https://shopee.co.id/-New-Shade-sd-27-OMG-Oh-My-Glam-Matte-Kiss-Lip-Cream-%E2%94%82Lipstik-Lipstick-Lipcream-OMG-Love-Edition-OMG-i.44016624.3585850709?sp_atk=f3cfc768-326c-4f0f-ad04-d47b31912904&xptdk=f3cfc768-326c-4f0f-ad04-d47b31912904",
#     "https://shopee.co.id/HANASUI-MATTEDORABLE-LIPCREAM-BOBA-MATCHA-hanasui-i.44016624.4043989119?sp_atk=8b09879f-119c-4053-b371-c339a009299a&xptdk=8b09879f-119c-4053-b371-c339a009299a",
# ]

# # Create an empty DataFrame to store the combined results
# result_df = pd.DataFrame()

# # Iterate over each URL in url_list
# for i, url in enumerate(url_list, start=1):
#     # Scrape data from the URL and create a DataFrame

#     # Concatenate the DataFrame with the result_df
#     result_df = pd.concat([result_df, df], ignore_index=True)

#     # delete blank comments
#     result_df["comment"] = result_df["comment"].replace("", np.nan)
#     result_df.dropna(subset=["comment"], inplace=True)

#     # Drop duplicates
#     result_df.drop_duplicates(subset=["username", "comment"], inplace=True)

#     product_name = result_df["name"][0]

#     output_filename = f"{product_name}.xlsx"

#     # Save the DataFrame to a excel file
#     result_df.to_excel(output_filename, index=False)

# driver.quit()

####  scraping comments from many products, create correponding df1,df2,df3

In [13]:
edge_options = Options()
edge_options.use_chromium = True  # This line is important
# edge_options.add_argument("--headless")
edge_options.add_argument("--disable-gpu")
edge_options.add_argument(
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.54"
)
edge_options.add_argument("window-size=1920x1080")
# Path to MicrosoftWebDriver.exe
# edge_options.binary_location = r"C:\Program Files (x86)\Microsoft\Edge Beta\Application\msedge.exe"

driver = webdriver.Edge(options=edge_options)

# List of URLs to scrape
url_list = [ "https://shopee.vn/N%C6%B0%E1%BB%9Bc-R%E1%BB%ADa-Tay-Lifebuoy-B%E1%BA%A3o-V%E1%BB%87-Kh%E1%BB%8Fi-99.9-Vi-Khu%E1%BA%A9n-G%C3%A2y-B%E1%BB%87nh-H%E1%BB%97-Tr%E1%BB%A3-C%C3%B9ng-%C4%90%E1%BB%81-Kh%C3%A1ng-Da-T%E1%BB%B1-Nhi%C3%AAn-450G-Phi%C3%AAn-b%E1%BA%A3n-T%E1%BA%BFt-499g-i.111138057.2855276381?sp_atk=f2d80890-c401-4b0b-876f-cc56cec5e5c7&xptdk=f2d80890-c401-4b0b-876f-cc56cec5e5c7"]
def sanitize_filename(filename):
    # Remove any character that is not alphanumeric, space, or allowed punctuation
    return re.sub(r'[<>:"/\\|?*+]', "", filename)

# Iterate over each URL in url_list
for i, url in enumerate(url_list, start=1):
    # Scrape data from the URL and create a DataFrame
    df = scrape_ratings(url)

    # Delete blank comments
    df["comment"] = df["comment"].replace("", np.nan)
    df.dropna(subset=["comment"], inplace=True)

    # Drop duplicates
    df.drop_duplicates(subset=["username", "comment"], inplace=True)

    # Get the product name from the first row
    # product_name = df["name"][0]

    # Sanitize the product name for the filename
    sanitized_product_name = sanitize_filename("product5")

    output_filename = f"{sanitized_product_name}.xlsx"

    # Save the DataFrame to an excel file
    df.to_excel(output_filename, index=False)

driver.quit()

### Tiktok Video

In [18]:
from datetime import datetime

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.edge.options import Options

edge_options = Options()
edge_options.add_argument(
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.54"
)
edge_options.add_argument("window-size=1920x1080")
edge_options.add_argument("--disable-blink-features=AutomationControlled")
edge_options.add_experimental_option("excludeSwitches", ["enable-automation"])

In [68]:
def scrape_data(driver):
    comments = []
    comment = driver.find_elements(By.CLASS_NAME, "e1g2efjf6")
    for c in comment:
        comments.append(c.text)

    print(f"Scraped {len(comments)} rows.")
    return comments

In [69]:
driver = webdriver.Edge(options=edge_options)
driver.get(
    "https://www.tiktok.com/@khaby.lame/video/7255327059302419738"
)  # Load the webpage with the video

time.sleep(10)

# Define scroll parameters
scroll_interval = 10  # Scroll and wait time in seconds
target_rows = 10  # Scrolling n times

# Counter for the number of rows scraped
scraped_rows = 0

comments = []
# Loop until target number of rows is scraped
while scraped_rows < target_rows:
    # Scraping data

    current = scrape_data(driver)
    comments.extend(current)
    scraped_rows += 1

    # Scroll down the page
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Wait for the specified interval
    time.sleep(scroll_interval)

# Don't forget to close the WebDriver when you're done
driver.quit()

Scraped 20 rows.
Scraped 38 rows.
Scraped 57 rows.
Scraped 74 rows.
Scraped 94 rows.
Scraped 112 rows.
Scraped 131 rows.
Scraped 150 rows.
Scraped 170 rows.
Scraped 189 rows.


In [71]:
list(set(comments))

['Wooow 🥺🥰',
 'Je comprendrais jamais comment une telle ascension a pu avoir lieu 🤣',
 'Ce gars a réussi sa vie',
 'Wallah je suis grv contant pour lui il est passé de la hess à connue',
 'tell this guy to come back on marvel 😢',
 'It’s not everyday you meet Tony Stark 👀',
 'bro is chiling with RDJ 🥶',
 'How does bro not have more likes then Charlie💀😭',
 'bro meeting more stars then the sky',
 'Tony Stark não deixou de viajar no multiverso',
 'Tony star. Iron man ❤️',
 'la dedizione di avercela fatta davvero',
 'ก่อนหมื่น',
 'Avant les 10k alors que je suis même pas abonné😏',
 'does anybody know where Robert Downey Jr got those boots',
 '1k stark😳',
 'How is bro meeting so many people',
 "It's Iron-man?😳",
 'Bro just met iron man',
 'Mano mentiraaaaa😳 agora sim zerou a vida',
 'Czekam aż Khaby Lame zostanie aktorem Marvela',
 'Brooo🔥',
 'on nie powiedzial ani jedneto slowa a jest taki popularny',
 'CHE COSA MI STAI DICENDO ORA',
 'vay bee adamım demir adam',
 'The man, the myth, the le

### Shopee Product List

In [93]:
from selenium import webdriver
from selenium.webdriver.edge.options import Options
import time
import re
import json
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup
import os


# Set up Edge WebDriver
def setup_webdriver():
    edge_options = Options()
    driver = webdriver.Edge(options=edge_options)
    try:
        # Enable network interception and set custom headers
        driver.execute_cdp_cmd('Network.enable', {})
        driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {
            'headers': {
                'af-ac-enc-dat': 'null'
            }
        })
    except Exception as e:
        print(f"CDP command error: {e}")
    return driver


# Wait for Shopee login
def login_shopee(driver):
    # Open a new tab for Shopee login
    driver.execute_script("window.open('https://shopee.co.th/buyer/login', '_blank');")
    driver.switch_to.window(driver.window_handles[1])  # Switch to the login tab
    print("Please log in manually...")
    while True:
        current_url = driver.current_url
        # Check for successful login
        if "verify/captcha?" in current_url or "home" in current_url:
            print("Login successful and verified.")
            break
        time.sleep(2)  # Pause to wait for user to complete login
    driver.switch_to.window(driver.window_handles[0])  # Switch back to the main tab


In [83]:
edge_options = Options()
edge_options.use_chromium = True  # This line is important
# edge_options.add_argument("--headless")
edge_options.add_argument("--disable-gpu")
edge_options.add_argument(
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.54"
)
edge_options.add_argument("window-size=1920x1080")

# Main script
driver = setup_webdriver()  # Initialize the WebDriver
driver.get("https://shopee.co.th/api/v4/search/search_items?by=sales&limit=60&match_id=11045208&newest=0&order=desc&page_type=search&scenario=PAGE_CATEGORY&version=2&view_session_id=4cea0d3a-ed3c-4206-bbd2-d764678916c2")
login_shopee(driver)  # Log in to Shopee manually

Please log in manually...
Login successful and verified.


In [98]:
import requests

url = 'https://shopee.tw/api/v2/search_items/?by=pop&limit=30&match_id=1819984&newest=0&order=desc&page_type=shop&shop_categoryids=9271157&version=2'

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:73.0) Gecko/20100101 Firefox/73.0',
    'X-Requested-With': 'XMLHttpRequest',
    'Referer': 'https://shopee.tw/shop/1819984/search?shopCollection=9271157',
}    

r = requests.get(url, headers=headers)

data = r.json()

#print(data['items'][0].keys())

for item in data['items']:
    print('name:', item['name'])
    print('prince:', item['price'])
    print('sold:', item['historical_sold'])
    print('---')

#print(data['items'][0]) # for test only 

KeyError: 'items'

In [99]:
data

{'error': 'error_not_found'}