# 2. Crawl products by link


In [6]:
import os
import logging
import random
import time
import csv
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from selenium import webdriver
from urllib.parse import urljoin
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.core import driver_cache
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup

# Constants
# Listing-page URL template; "{}" is replaced with the page number.
BASE_URL = "https://www.jomashop.com/watches.html?p={}"
# Pages per link-extraction batch (in this file it is referenced only by the
# commented-out Step 1 inside main()).
BATCH_SIZE = 5
# Size of the thread pool used for parallel scraping.
MAX_WORKERS = 8
# CSV that receives one row per scraped product.
OUTPUT_FILE = "../data/products.csv"
# CSV holding the collected product links (column "Product_URL").
LINKS_FILE = "../data/links.csv"
# Default number of attempts for the page/product retry wrappers.
MAX_RETRIES = 3
# Seconds to sleep after a failed page/product attempt.
RETRY_DELAY = 2
# Number of attempts to install/locate the ChromeDriver binary.
DRIVER_SETUP_RETRIES = 3
# Seconds to sleep after a failed ChromeDriver setup attempt.
DRIVER_SETUP_DELAY = 5
# Selenium implicit wait; also reused as the explicit wait timeout for product items.
IMPLICIT_WAIT = 5.0  # seconds
# NOTE(review): not referenced in the visible code; scroll_down_slowly has its
# own pause_time default of 2.0 -- confirm whether this constant is still needed.
SCROLL_PAUSE = 2.0  # seconds between scrolls
# HTTP headers sent with the requests-based scraper (desktop Chrome User-Agent).
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/114.0.0.0 Safari/537.36"
    )
}

# Setup logging
# INFO-level records are emitted through a StreamHandler with a
# "timestamp - level - message" layout; `logger` is the module-level logger
# used by the functions below.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[logging.StreamHandler()]
)
logger = logging.getLogger(__name__)

# -----------------------------------
# Driver setup with retry
# -----------------------------------
def setup_driver_service():
    """Download or fetch the ChromeDriver binary with retry logic.

    Calls ``ChromeDriverManager().install()`` up to ``DRIVER_SETUP_RETRIES``
    times, waiting ``DRIVER_SETUP_DELAY`` seconds between attempts.

    Returns:
        A Selenium ``Service`` built from the installed driver path.

    Raises:
        RuntimeError: if every attempt fails. The last underlying error is
            attached as the exception's cause.
    """
    last_error = None
    for attempt in range(1, DRIVER_SETUP_RETRIES + 1):
        try:
            driver_path = ChromeDriverManager().install()
            return Service(driver_path)
        except Exception as e:
            last_error = e
            logger.warning(f"Driver setup attempt {attempt} failed: {e}")
            # Only wait when another attempt will follow; sleeping after the
            # final failure just delays the error.
            if attempt < DRIVER_SETUP_RETRIES:
                time.sleep(DRIVER_SETUP_DELAY)
    logger.error(f"All {DRIVER_SETUP_RETRIES} chrome driver setups failed")
    raise RuntimeError("Unable to install ChromeDriver") from last_error

# -----------------------------------
# Driver initialization
# -----------------------------------
def init_driver(headless: bool = True) -> webdriver.Chrome:
    """Create a Chrome WebDriver configured for scraping.

    Args:
        headless: when true, Chrome is started with ``--headless``.

    Returns:
        A Chrome driver whose implicit wait is set to ``IMPLICIT_WAIT``.
    """
    chrome_args = ["--no-sandbox", "--disable-dev-shm-usage"]
    if headless:
        # Keep the headless flag first, ahead of the sandbox/shm flags.
        chrome_args.insert(0, "--headless")

    options = webdriver.ChromeOptions()
    for chrome_arg in chrome_args:
        options.add_argument(chrome_arg)

    browser = webdriver.Chrome(service=setup_driver_service(), options=options)
    browser.implicitly_wait(IMPLICIT_WAIT)
    return browser

# -----------------------------------
# Page scrolling utility
# -----------------------------------

def scroll_down_slowly(
    driver: "webdriver.Chrome",
    pause_time: float = 2.0,
    max_scrolls: int = 30,
    scroll_step: int = 4000,
) -> None:
    """Scroll the page down in fixed steps until the bottom or a scroll limit.

    After each step the function sleeps ``pause_time`` seconds, then compares
    the bottom edge of the viewport with the document height and stops once
    the bottom has been reached.

    Args:
        driver: object exposing ``execute_script`` (a Selenium WebDriver).
        pause_time: seconds to wait after every scroll step.
        max_scrolls: maximum number of scroll steps to perform.
        scroll_step: pixels to scroll per step (previously hard-coded to 4000).
    """
    print("Scrolling down the page...")

    for scroll_count in range(max_scrolls):
        # Scroll down by `scroll_step` pixels
        driver.execute_script(f"window.scrollBy(0, {int(scroll_step)});")
        time.sleep(pause_time)

        # Read the page height and the current bottom-of-viewport position
        scroll_height = driver.execute_script("return document.body.scrollHeight")
        scroll_position = driver.execute_script("return window.scrollY + window.innerHeight")

        print(f"Scroll #{scroll_count + 1} | Scroll position: {scroll_position:.0f} | Total height: {scroll_height:.0f}")

        # Stop as soon as the bottom of the page is reached
        if scroll_position >= scroll_height:
            print("Reached bottom of the page.")
            return

    # Only reached when the loop ran out of steps without hitting the bottom
    print("Reached max scroll limit.")

# -----------------------------------
# Link extraction per page with retry
# -----------------------------------

def find_product_link(prod_element, max_strategy_retries=2):
    """
    Try several strategies in turn to extract an href from prod_element:
      1. data-scroll-target attribute
      2. CSS selector (.productItemBlock a)
      3. class name productImg-link
      4. class name productName-link
      5. XPath
      6. JS querySelector
      7. find_elements + filter over every <a>

    Each strategy is attempted up to max_strategy_retries times before moving
    on to the next one. Returns the first non-empty href (made absolute when
    it starts with "/"), or None when every strategy fails.
    """
    def _href_by(by, selector):
        # Build an extractor that reads the href of the first matching child.
        return lambda el: el.find_element(by, selector).get_attribute("href")

    def _scroll_target(el):
        # 1. data-scroll-target on .productItemBlock
        return el.find_element(By.CLASS_NAME, "productItemBlock").get_attribute("data-scroll-target")

    def _js_query(el):
        # 6. JS querySelector, executed through the element's parent driver
        return el.parent.execute_script(
            "return arguments[0].querySelector('.productItemBlock a').href;", el)

    def _any_anchor(el):
        # 7. fallback: first <a> under the element that has a non-empty href
        return next(
            (a.get_attribute("href") for a in el.find_elements(By.TAG_NAME, "a")
             if a.get_attribute("href")), None
        )

    extractors = [
        _scroll_target,
        # 2. CSS selector
        _href_by(By.CSS_SELECTOR, ".productItemBlock a"),
        # 3. class name productImg-link
        _href_by(By.CLASS_NAME, "productImg-link"),
        # 4. class name productName-link
        _href_by(By.CLASS_NAME, "productName-link"),
        # 5. XPath
        _href_by(By.XPATH, ".//div[contains(@class,'productItemBlock')]//a"),
        _js_query,
        _any_anchor,
    ]

    last_exception = None
    for strat_idx, extract in enumerate(extractors, start=1):
        for attempt in range(1, max_strategy_retries + 1):
            try:
                link = extract(prod_element)
                if not link:
                    raise NoSuchElementException("Empty href")
                # A relative URL (e.g. taken from data-scroll-target) is
                # resolved against the site's base URL.
                if link.startswith("/"):
                    link = urljoin(BASE_URL, link)
                print(f"Strategy #{strat_idx} succeeded on attempt {attempt}: {link}")
                return link
            except Exception as err:
                last_exception = err
                print(f"  Strategy #{strat_idx} attempt {attempt} failed: {err}")
                time.sleep(0.3)
        print(f"→ Strategy #{strat_idx} exhausted, chuyển sang chiến lược tiếp theo.")
    print(f"Tất cả chiến lược đều thất bại cho phần tử: {last_exception}")
    return None


def extract_product_links(page: int, max_retries: int = MAX_RETRIES) -> list:
    """Collect product links from one listing page, retrying on failure.

    Each attempt starts a fresh driver, loads the page, scrolls to trigger
    lazy loading, waits for ``productItem`` elements and resolves a link for
    each of them via ``find_product_link``.

    Args:
        page: listing page number substituted into ``BASE_URL``.
        max_retries: maximum number of attempts.

    Returns:
        The list of links found on the first successful attempt, or an empty
        list when every attempt fails.
    """
    logger.info(f"Extracting links from page {page}")
    url = BASE_URL.format(page)
    print(f"Fetching page {page}: {url}")

    for attempt in range(1, max_retries + 1):
        # Reset per attempt so the cleanup below never touches an unbound name
        # or a driver that a previous attempt already quit.
        driver = None
        try:
            driver = init_driver()
            driver.get(url)

            print("Scrolling to load products...")
            scroll_down_slowly(driver)

            WebDriverWait(driver, IMPLICIT_WAIT).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, "productItem"))
            )
            products = driver.find_elements(By.CLASS_NAME, "productItem")
            print(f"  Found {len(products)} product items on page {page}")

            links = []
            for idx, prod in enumerate(products, start=1):
                href = find_product_link(prod)
                if href:
                    links.append(href)
                else:
                    logger.error(f"Page {page}, product #{idx}: không lấy được link.")

            logger.info(f"Page {page}: extracted {len(links)} links on attempt {attempt}")
            return links

        except Exception as e:
            logger.warning(f"Attempt {attempt} failed for page {page}: {e}")
        finally:
            # init_driver() itself may have failed, leaving nothing to close.
            if driver is not None:
                driver.quit()

        # Only reached after a failure. Back off with the browser already
        # closed, and skip the wait when no attempt remains.
        if attempt < max_retries:
            time.sleep(RETRY_DELAY)

    logger.error(f"All {max_retries} attempts failed for page {page}")
    return []

# -----------------------------------
# Detailed product scraping with retry
# -----------------------------------
def scrape_product_with_retry(url: str, max_retries: int = MAX_RETRIES) -> dict:
    """Scrape one product page with Selenium, retrying on failure.

    Every attempt uses a fresh driver. The "show-text" button is clicked when
    it becomes clickable (best effort), then the price and every
    ``more-detail-content`` label/value pair are collected.

    Args:
        url: product page URL.
        max_retries: maximum number of attempts.

    Returns:
        A dict with "URL", "Price" and one entry per spec label (spaces
        replaced by underscores), or ``{"URL": url, "Error": ...}`` when all
        attempts fail.
    """
    for attempt in range(1, max_retries + 1):
        try:
            browser = init_driver()
        except Exception as init_err:
            logger.error(f"{url}: driver init failed: {init_err}")
            time.sleep(RETRY_DELAY)
            continue

        try:
            browser.get(url)

            # Best effort: expand the collapsed details if the toggle exists.
            try:
                toggle = WebDriverWait(browser, 5).until(
                    EC.element_to_be_clickable((By.CLASS_NAME, "show-text"))
                )
                browser.execute_script("arguments[0].click();", toggle)
            except Exception:
                pass

            detail_locator = (By.CLASS_NAME, "more-detail-content")
            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located(detail_locator)
            )

            price = browser.find_element(By.CLASS_NAME, "now-price").text.strip()
            spec_rows = browser.find_elements(By.CLASS_NAME, "more-detail-content")

            details = {"URL": url, "Price": price}
            for row in spec_rows:
                # Rows lacking either a label or a value are skipped.
                try:
                    key = row.find_element(By.CLASS_NAME, "more-label").text.strip().replace(' ', '_')
                    details[key] = row.find_element(By.CLASS_NAME, "more-value").text.strip()
                except Exception:
                    continue

            logger.info(f"Scraped details for {url} on attempt {attempt}")
            return details
        except Exception as scrape_err:
            logger.error(f"Attempt {attempt} failed for {url}: {scrape_err}")
            time.sleep(RETRY_DELAY)
        finally:
            browser.quit()

    logger.error(f"All {max_retries} attempts failed for {url}")
    return {"URL": url, "Error": "Failed after retries"}

def scrape_product_bs(url: str) -> dict:
    """
    Fetch a product page with requests and parse price and specs with
    BeautifulSoup.

    Returns a dict holding "URL", "Price" (None when no ``.now-price`` node is
    found) and one entry per ``.more-detail-content`` block that has both a
    label and a value. Raises ``requests`` exceptions on network errors or
    non-2xx responses.
    """
    response = requests.get(url, headers=HEADERS, timeout=10)
    response.raise_for_status()
    document = BeautifulSoup(response.text, "lxml")

    # 1. The price is read straight from the fetched HTML
    price_node = document.select_one(".now-price")
    if price_node:
        price = price_node.get_text(strip=True)
    else:
        price = None
    record = {"URL": url, "Price": price}

    # 2. Detailed specifications (the .more-detail-content blocks)
    for block in document.select(".more-detail-content"):
        label_node = block.select_one(".more-label")
        value_node = block.select_one(".more-value")
        if not (label_node and value_node):
            continue
        field = label_node.get_text(strip=True).replace(" ", "_")
        record[field] = value_node.get_text(strip=True)

    return record


def scrape_product_bs_with_retry(url: str, max_retries: int = MAX_RETRIES) -> dict:
    """
    Retry wrapper around scrape_product_bs, mirroring the structure of the
    Selenium version.

    Only ``requests.RequestException`` triggers a retry; any other exception
    propagates to the caller.

    Args:
        url: product page URL.
        max_retries: maximum number of attempts.

    Returns:
        The scraped record, or ``{"URL": url, "Error": "Failed after retries"}``
        when every attempt fails with a request error.
    """
    for attempt in range(1, max_retries + 1):
        try:
            record = scrape_product_bs(url)
            logger.info(f"Scraped with BS for {url} on attempt {attempt}")
            return record
        except requests.RequestException as e:
            logger.warning(f"Attempt {attempt} failed for {url}: {e}")
            # Back off only when another attempt follows; waiting after the
            # final failure just holds a worker thread idle.
            if attempt < max_retries:
                time.sleep(RETRY_DELAY)
    logger.error(f"All {max_retries} attempts failed for {url}")
    return {"URL": url, "Error": "Failed after retries"}


# -----------------------------------
# CSV persistence
# -----------------------------------
def save_to_csv(record: dict, filename: str = OUTPUT_FILE) -> None:
    """Append one record to the CSV, widening the columns when needed.

    Records may carry different keys, so the existing file is read back,
    concatenated with the new row (union of columns) and rewritten.

    Args:
        record: mapping of column name to value for a single row.
        filename: path of the CSV file to create or extend.

    NOTE: the whole file is re-read and rewritten on every call, so the total
    cost grows quadratically with the number of rows.
    """
    df_new = pd.DataFrame([record])

    # Make sure the target directory exists before the first write.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    df_old = None
    if os.path.exists(filename):
        try:
            # Read every cell back verbatim as text. With the defaults,
            # pandas would turn strings such as "None"/"N/A" into NaN and
            # re-infer numeric dtypes, silently altering rows already saved
            # each time the file is rewritten.
            df_old = pd.read_csv(filename, dtype=str, keep_default_na=False)
        except pd.errors.EmptyDataError:
            # An existing but empty file is treated like a missing one.
            df_old = None

    if df_old is not None:
        df = pd.concat([df_old, df_new], ignore_index=True)
    else:
        df = df_new
    df.to_csv(filename, index=False)

# -----------------------------------
# Main processing: link extraction & detail scraping with logs and progress bars
# -----------------------------------
def main(start_index: int = 2517):
    """Scrape product details for the links stored in ``LINKS_FILE``.

    Links are read from the "Product_URL" column, the first ``start_index``
    entries are skipped (to resume an interrupted run), and the remaining URLs
    are scraped in a thread pool. Each result is written with ``save_to_csv``
    from the calling thread as soon as it completes.

    Args:
        start_index: number of leading links to skip. The default keeps the
            previously hard-coded resume offset.
    """
    # # Step 1: Extract links with retry and progress bars
    # NOTE: if this step is re-enabled, random.sample(range(1, 500), 912)
    # raises ValueError because the sample is larger than the population.
    # pages = random.sample(range(1, 500), 912)
    # all_links = []
    # batches = [pages[i:i + BATCH_SIZE] for i in range(0, len(pages), BATCH_SIZE)]
    # for batch_num, batch in enumerate(batches, start=1):
    #     logger.info(f"Starting link batch {batch_num}/{len(batches)}: {batch}")
    #     with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    #         futures = {executor.submit(extract_product_links, p): p for p in batch}
    #         for future in tqdm(as_completed(futures), total=len(futures), desc=f"Links batch {batch_num}"):
    #             page = futures[future]
    #             links = future.result()
    #             if links and len(links) > 0:
    #                 logger.info(f"Page {page}: found {len(links)} links")
    #                 all_links.extend(links)
    #             else:
    #                 logger.warning(f"No links on page {page} after retries")
    #     logger.info(f"Completed link batch {batch_num}/{len(batches)}")

    # pd.DataFrame({"Product_URL": all_links}).to_csv(LINKS_FILE, index=False)
    # logger.info(f"Total links collected: {len(all_links)}")
    # Get all links from CSV
    df_links = pd.read_csv(LINKS_FILE)
    all_links = df_links["Product_URL"].tolist()[start_index:]
    # Step 2: Scrape product details in parallel with retry
    logger.info(f"Starting detail scraping for {len(all_links)} products with {MAX_WORKERS} threads")
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Map each future back to its URL so a failure can be attributed.
        futures = {executor.submit(scrape_product_bs_with_retry, url): url for url in all_links}
        for future in tqdm(as_completed(futures), total=len(futures), desc="Scraping products"):
            url = futures[future]
            try:
                record = future.result()
            except Exception as e:
                # The retry wrapper only handles request errors. Anything
                # else is recorded here so one bad page cannot abort the run.
                logger.exception(f"Unhandled error while scraping {url}: {e}")
                record = {"URL": url, "Error": f"Unhandled exception: {e}"}
            save_to_csv(record)
    logger.info("Scraping complete.")

In [2]:
%pip install bs4

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting beautifulsoup4 (from bs4)
  Downloading beautifulsoup4-4.13.4-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4->bs4)
  Downloading soupsieve-2.7-py3-none-any.whl.metadata (4.6 kB)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Downloading beautifulsoup4-4.13.4-py3-none-any.whl (187 kB)
Downloading soupsieve-2.7-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4, bs4

   ------------- -------------------------- 1/3 [beautifulsoup4]
   ---------------------------------------- 3/3 [bs4]

Successfully installed beautifulsoup4-4.13.4 bs4-0.0.2 soupsieve-2.7
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Start the scraping run only when this code is executed directly
# (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

2025-05-24 10:07:47,098 - INFO - Starting detail scraping for 28407 products with 8 threads
Scraping products:   0%|          | 0/28407 [00:00<?, ?it/s]
