# **Web Scraping Product Data from Banggood.com**

In [11]:
# Importing Libraries for Extraction

import pandas as pd
import numpy as np
from io import StringIO
from bs4 import BeautifulSoup
import requests
import time
from datetime import datetime
import random

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options


In [18]:
# Selenium Setup

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")  # Run without opening browser
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920,1080")

# Auto-download correct Chrome driver
driver = webdriver.Chrome(options=options)


In [19]:
# -------------------------------------------------------------
# Categories to Scrape (You can change these)
# -------------------------------------------------------------
categories = {
    "Toys Hobbies and Robot": "https://www.banggood.com/Wholesale-Toys-Hobbies-and-Robot-c-133.html?bid=210701&from=nav",
    "Men & Women's Clothing": "https://www.banggood.com/Wholesale-Men-and-Womens-Clothing-ca-18941.html?bid=210711&from=nav",
    "Lights & Lighting": "https://www.banggood.com/Wholesale-Lights-and-Lighting-ca-14001.html?bid=210710&from=nav",
    "Home Appliances & Health": "https://www.banggood.com/Wholesale-Home-Appliance-and-Health-ca-18949.html?bid=210708&from=nav",
    "Electronics": "https://www.banggood.com/Wholesale-Electronics-ca-2001.html?bid=210705&from=nav"
}

In [39]:
"""
Banggood Full Scraper (All 5 Main Categories)
Auto-recovers if Chrome crashes
Saves -> ExtractedData.csv
"""

import time
import csv
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException, WebDriverException

# ---------------------------
# Config
# ---------------------------
HEADLESS = True
MAX_PAGES_PER_SUBCAT = 10
OUTPUT_CSV = "ExtractedData.csv"
SLEEP_SHORT = 1.5
SLEEP_LONG = 3.0

# ---------------------------
# ALL 5 MAIN CATEGORIES
# ---------------------------
categories = {
    "Toys Hobbies and Robot": "https://www.banggood.com/Wholesale-Toys-Hobbies-and-Robot-c-133.html",

    "Men & Women's Clothing": "https://www.banggood.com/Wholesale-Men-and-Womens-Clothing-ca-18941.html?bid=210711&from=nav",

    "Lights & Lighting": "https://www.banggood.com/Wholesale-Lights-and-Lighting-ca-14001.html?bid=210710&from=nav",

    "Home Appliances & Health": "https://www.banggood.com/Wholesale-Home-Appliance-and-Health-ca-18949.html?bid=210708&from=nav",

    "Electronics": "https://www.banggood.com/Wholesale-Electronics-ca-2001.html?bid=210705&from=nav"
}

# ---------------------------
# Start WebDriver (auto-restart)
# ---------------------------
def start_driver():
    options = Options()
    if HEADLESS:
        options.add_argument("--headless=new")

    options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    )
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--disable-infobars")

    driver = webdriver.Chrome(options=options)
    driver.set_page_load_timeout(60)  # FIXED TIMEOUT
    return driver

driver = start_driver()

# ---------------------------
# Safe GET (auto recover)
# ---------------------------
def safe_get(url, retries=3):
    global driver
    for attempt in range(retries):
        try:
            driver.get(url)
            return True
        except Exception:
            print(f"   [WARN] Chrome crashed. Restarting... (Attempt {attempt+1})")

            try:
                driver.quit()
            except:
                pass

            driver = start_driver()
            time.sleep(2)

    print("   [FATAL] Failed to load:", url)
    return False

# ---------------------------
# Safe find functions
# ---------------------------
def safe_find_element(parent, css):
    try:
        return parent.find_element("css selector", css)
    except:
        return None

def safe_find_elements(parent, css):
    try:
        return parent.find_elements("css selector", css)
    except:
        return []

def normalize_url(base, href):
    if not href:
        return ""
    return urljoin(base, href)

# ---------------------------
# Selectors (robust)
# ---------------------------
SUBCAT_SELECTORS = [
    ".clothes-main dd a",
    "div.nav a",
    ".category-list a",
    ".nav-and-banner .nav a",
    ".nav a.exclick",
]

PRODUCT_SELECTORS = [
    ".goodlist_item",
    "li[data-product-id]",
    ".product-item",
    ".goods-item",
    ".card-item",
]

NAME_SELECTORS = [
    ".title",
    "a.title",
    ".goods-title a",
    ".name",
    ".product-title a",
    "a.img"
]

PRICE_SELECTORS = [
    ".price",
    ".price-new",
    "span.oriprice",
    "span.price.notranslate",
    ".p-price"
]

RATING_SELECTORS = [
    ".star i",
    ".rating i",
    ".rate-stars i"
]

REVIEWS_SELECTORS = [
    ".review",
    ".review-num",
    ".reviews",
    ".comment-count"
]

# ---------------------------
# CSV Setup
# ---------------------------
csv_file = open(OUTPUT_CSV, "w", newline="", encoding="utf-8")
writer = csv.writer(csv_file)
writer.writerow(["Main Category", "Sub Category", "Product Name", "Price", "Rating", "Reviews", "Product URL"])

# ---------------------------
# Extract Subcategories
# ---------------------------
def get_subcategory_links(category_url):

    safe_get(category_url)
    time.sleep(SLEEP_LONG)

    found = {}
    base = driver.current_url

    for sel in SUBCAT_SELECTORS:
        els = safe_find_elements(driver, sel)
        for el in els:
            href = el.get_attribute("href")
            text = el.text.strip()
            if href and text:
                href = normalize_url(base, href)
                found[href] = text

    if not found:
        anchors = driver.find_elements("tag name", "a")
        for a in anchors:
            href = a.get_attribute("href") or ""
            txt = a.text.strip()
            if "/Wholesale-" in href and txt:
                found[href] = txt

    return [(name, href) for href, name in found.items()]

# ---------------------------
# Extract products from subcategory
# ---------------------------
def extract_products_from_subcat(main_cat, sub_name, sub_url):

    page = 1
    empty_pages = 0

    while page <= MAX_PAGES_PER_SUBCAT:

        page_url = f"{sub_url}?page={page}"
        safe_get(page_url)
        time.sleep(SLEEP_SHORT)

        products = []
        for psel in PRODUCT_SELECTORS:
            p = safe_find_elements(driver, psel)
            if p:
                products = p
                break

        if not products:
            empty_pages += 1
            if empty_pages >= 2:
                break
            page += 1
            continue

        empty_pages = 0

        for p in products:

            # NAME
            name = ""
            for nsel in NAME_SELECTORS:
                el = safe_find_element(p, nsel)
                if el:
                    name = el.text.strip() or el.get_attribute("title") or ""
                    if not name:
                        img = safe_find_element(el, "img")
                        if img:
                            name = img.get_attribute("alt") or ""
                    if name:
                        break

            # PRICE
            price = ""
            for psel in PRICE_SELECTORS:
                el = safe_find_element(p, psel)
                if el:
                    price = el.text.strip() or el.get_attribute("oriprice") or ""
                    if price:
                        break

            # RATING
            rating = ""
            for rsel in RATING_SELECTORS:
                el = safe_find_element(p, rsel)
                if el:
                    st = el.get_attribute("style") or ""
                    if "width" in st:
                        try:
                            pct = st.split("width:")[1].split("%")[0].strip()
                            rating = pct
                        except:
                            rating = el.text.strip()
                    else:
                        rating = el.text.strip()
                    break

            # REVIEWS
            reviews = ""
            for rsel in REVIEWS_SELECTORS:
                el = safe_find_element(p, rsel)
                if el:
                    reviews = el.text.strip()
                    break

            # PRODUCT URL
            product_url = ""
            for a_sel in ["a.title", "a.img", "a", ".img a"]:
                a = safe_find_element(p, a_sel)
                if a:
                    product_url = normalize_url(driver.current_url, a.get_attribute("href"))
                    break

            if not product_url:
                pid = p.get_attribute("data-product-id") or p.get_attribute("data-pid")
                if pid:
                    product_url = f"https://www.banggood.com/p-{pid}.html"

            writer.writerow([main_cat, sub_name, name, price, rating, reviews, product_url])

        page += 1


# ---------------------------
# MAIN SCRAPE LOOP
# ---------------------------
for main_cat, main_url in categories.items():

    print(f"\n[MAIN] Processing: {main_cat}")
    subcats = get_subcategory_links(main_url)

    if not subcats:
        subcats = [(main_cat, main_url)]

    for sub_name, sub_url in subcats:
        print(f"  -> Subcategory: {sub_name}")
        try:
            extract_products_from_subcat(main_cat, sub_name, sub_url)
        except Exception as e:
            print("  [ERROR] Subcategory failed:", e)
            continue

# ---------------------------
# Finish
# ---------------------------
csv_file.close()
driver.quit()
print("\nDone ✔ All 5 categories extracted.")



[MAIN] Processing: Toys Hobbies and Robot
  -> Subcategory: RC Helicopter
  -> Subcategory: RC Quadcopters
  -> Subcategory: RC Airplane
  -> Subcategory: FPV Racing Drone
  -> Subcategory: RC Car
   [WARN] Chrome crashed. Restarting... (Attempt 1)
  -> Subcategory: RC Boat
  -> Subcategory: Multi Rotor Parts
  -> Subcategory: FPV System
  -> Subcategory: Radios & Receiver
  -> Subcategory: Battery & Charger
  -> Subcategory: Model & Building Toys
   [WARN] Chrome crashed. Restarting... (Attempt 1)
   [WARN] Chrome crashed. Restarting... (Attempt 2)
   [WARN] Chrome crashed. Restarting... (Attempt 3)
   [FATAL] Failed to load: https://www.banggood.com/Wholesale-Model-and-Building-Toys-ca-7109.html?bid=81131&from=nav?page=4
   [WARN] Chrome crashed. Restarting... (Attempt 1)
   [WARN] Chrome crashed. Restarting... (Attempt 2)
   [WARN] Chrome crashed. Restarting... (Attempt 3)
   [FATAL] Failed to load: https://www.banggood.com/Wholesale-Model-and-Building-Toys-ca-7109.html?bid=81131&f

Error sending stats to Plausible: error sending request for url (https://plausible.io/api/event)


  -> Subcategory: Digital Cameras
  -> Subcategory: Video Doorbell

Done ✔ All 5 categories extracted.
