# Function Definition

In [1]:
def run_ctgr():
    """Scrape the top-level categories of the yes24 root category page into a CSV.

    The ids of the ``<li>`` elements inside the ``category001001003`` section
    are collected once. For each id the root page is reloaded, the category
    link is clicked, and the category name, the resulting URL and the product
    count parsed from the listing title are recorded. Rows are written to
    ``ctgr_<yymmdd_HH>.csv`` in the working directory and the outcome of each
    attempt is appended to ``scraper_log.txt``.

    The whole scrape is attempted up to three times with a 60-second pause
    between attempts. Errors are logged rather than raised.

    Returns:
        None
    """
    import time
    import random
    import re
    import numpy as np
    import pandas as pd
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.chrome.service import Service
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from datetime import datetime

    LOG_PATH = "scraper_log.txt"
    RETRY_DELAY_SECONDS = 60

    def write_log(message):
        """Append one timestamped line to the scraper log file."""
        stamp = datetime.now().strftime('%Y-%m-%d %H:%M')
        with open(LOG_PATH, "a", encoding="utf-8") as log:
            log.write(f"[{stamp}] {message}\n")

    def run_with_retry(max_retries=3):
        """Run the scrape, retrying the whole attempt up to ``max_retries`` times."""
        for attempt in range(max_retries):
            is_last_attempt = attempt == max_retries - 1
            retry_pending = False
            try:
                now = datetime.now()
                timestamp = now.strftime("%y%m%d_%H")
                filename = f"ctgr_{timestamp}.csv"

                # System-specific locations of the driver and browser binaries
                CHROMEDRIVER_PATH = "/usr/lib/chromium-browser/chromedriver"
                CHROMIUM_PATH     = "/usr/bin/chromium-browser"

                # Headless Chromium configuration
                options = webdriver.ChromeOptions()
                options.binary_location = CHROMIUM_PATH
                options.add_argument("--headless=new")
                options.add_argument("--disable-gpu")
                options.add_argument("--no-sandbox")
                options.add_argument("--window-size=1920,1080")
                # Additional flags added for stability
                options.add_argument("--disable-dev-shm-usage")
                options.add_argument("--disable-extensions")
                options.add_argument("--no-first-run")
                options.add_argument("--disable-default-apps")
                options.add_argument("--disable-background-timer-throttling")

                service = Service(CHROMEDRIVER_PATH)
                driver = webdriver.Chrome(service=service, options=options)
                wait = WebDriverWait(driver, 30)

                MIN_DELAY = 1.0
                MAX_DELAY = 2.0
                def random_sleep():
                    """Pause for a random duration between page interactions."""
                    time.sleep(random.uniform(MIN_DELAY, MAX_DELAY))

                results = []

                try:
                    # The section id is derived from the last path segment of the URL
                    root_url      = "https://www.yes24.com/Product/Category/Display/001001003"
                    category_code = root_url.rstrip("/").split("/")[-1]
                    category_id   = f"category{category_code}"

                    driver.get(root_url)
                    random_sleep()
                    wait.until(EC.presence_of_element_located((By.ID, category_id)))
                    section = driver.find_element(By.ID, category_id)
                    top_lis = section.find_elements(By.CSS_SELECTOR, "ul > li")
                    # Elements without an id cannot be addressed by the "#<id>"
                    # selectors below, so they are dropped up front.
                    parent_ids = [
                        pid
                        for pid in (li.get_attribute("id") for li in top_lis)
                        if pid
                    ]

                    for pid in parent_ids:
                        # Reload the root page so every click starts from the same DOM
                        driver.get(root_url)
                        random_sleep()
                        try:
                            wait.until(EC.presence_of_element_located((By.ID, pid)))
                            elem = driver.find_element(By.CSS_SELECTOR, f"#{pid} > span > a.lnk_cate > em")
                            category_name = elem.text.strip()

                            driver.execute_script("arguments[0].scrollIntoView();", elem)
                            random_sleep()
                            elem.click()
                            random_sleep()

                            current_url = driver.current_url
                            try:
                                title_text = driver.find_element(By.CSS_SELECTOR, "#categoryProductContentsWrap > div.cateGoodsSecTop > div.cateGoodsSecTit").text
                                m = re.search(r'\(([\d,]+)\)', title_text)
                                count = int(m.group(1).replace(",", "")) if m else np.nan
                            except Exception:
                                # Best effort: a missing or unparsable count becomes NaN.
                                # Catching Exception (not a bare except) lets
                                # KeyboardInterrupt/SystemExit propagate.
                                count = np.nan

                            results.append({
                                'category': category_name,
                                'url': current_url,
                                'product_count': count
                            })

                        except Exception as e:
                            print(f"[SKIP] {pid}: {e}")
                            continue

                    if not results:
                        # Do not report an empty file as a successful save
                        raise RuntimeError("no categories were scraped")

                    df = pd.DataFrame(results)
                    df.to_csv(filename, index=False, encoding="utf-8-sig")

                    write_log(f"‚úÖ Save completed: {filename}")

                    # Function exits on success
                    return

                except Exception as e:
                    if is_last_attempt:
                        write_log(f"‚ùå Error occurred (final attempt): {e}")
                    else:
                        write_log(f"‚ö†Ô∏è Error occurred (attempt {attempt + 1}/{max_retries}): {e}")
                        retry_pending = True

                finally:
                    driver.quit()

            except Exception as setup_error:
                # Driver initialization failed, etc.
                write_log(f"‚ùå Setup error (attempt {attempt + 1}/{max_retries}): {setup_error}")
                retry_pending = not is_last_attempt

            # Sleep only after the browser has been shut down, so a failed
            # session does not stay alive during the back-off.
            if retry_pending:
                time.sleep(RETRY_DELAY_SECONDS)

    run_with_retry()

In [2]:
def run_subctgr():
    """Scrape categories and their direct subcategories from the yes24 root page into a CSV.

    The ids of the ``<li>`` elements inside the ``category001001003`` section
    are collected once. For each id the root page is reloaded and the category
    link is clicked. If no child links appear under the category, one row with
    a NaN subcategory is stored. Otherwise each child link is clicked in turn
    and one row per child is stored with the category name, subcategory name,
    URL and product count. Rows are written to ``subctgr_<yymmdd_HH>.csv`` and
    the outcome of each attempt is appended to ``scraper_log.txt``.

    The whole scrape is attempted up to three times with a 60-second pause
    between attempts. Errors are logged rather than raised.

    Returns:
        None
    """
    import time
    import random
    import re
    import numpy as np
    import pandas as pd
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.chrome.service import Service
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.common.exceptions import NoSuchElementException, TimeoutException
    from datetime import datetime

    LOG_PATH = "scraper_log.txt"
    RETRY_DELAY_SECONDS = 60

    def write_log(message):
        """Append one timestamped line to the scraper log file."""
        stamp = datetime.now().strftime('%Y-%m-%d %H:%M')
        with open(LOG_PATH, "a", encoding="utf-8") as log:
            log.write(f"[{stamp}] {message}\n")

    def run_with_retry(max_retries=3):
        """Run the scrape, retrying the whole attempt up to ``max_retries`` times."""
        for attempt in range(max_retries):
            is_last_attempt = attempt == max_retries - 1
            retry_pending = False
            try:
                now = datetime.now()
                timestamp = now.strftime("%y%m%d_%H")
                filename = f"subctgr_{timestamp}.csv"

                # System-specific locations of the driver and browser binaries
                CHROMEDRIVER_PATH = "/usr/lib/chromium-browser/chromedriver"
                CHROMIUM_PATH     = "/usr/bin/chromium-browser"

                # Headless Chromium configuration
                options = webdriver.ChromeOptions()
                options.binary_location = CHROMIUM_PATH
                options.add_argument("--headless=new")
                options.add_argument("--disable-gpu")
                options.add_argument("--no-sandbox")
                options.add_argument("--window-size=1920,1080")
                # Additional flags added for stability
                options.add_argument("--disable-dev-shm-usage")
                options.add_argument("--disable-extensions")
                options.add_argument("--no-first-run")
                options.add_argument("--disable-default-apps")
                options.add_argument("--disable-background-timer-throttling")

                # Launch the Chrome driver
                service = Service(CHROMEDRIVER_PATH)
                driver  = webdriver.Chrome(service=service, options=options)
                wait    = WebDriverWait(driver, 30)

                MIN_DELAY = 1.0
                MAX_DELAY = 3.0
                def random_sleep():
                    """Pause for a random duration between page interactions."""
                    time.sleep(random.uniform(MIN_DELAY, MAX_DELAY))

                results = []

                try:
                    # The section id is derived from the last path segment of the URL
                    root_url      = "https://www.yes24.com/Product/Category/Display/001001003"
                    category_code = root_url.rstrip("/").split("/")[-1]
                    category_id   = f"category{category_code}"

                    driver.get(root_url)
                    random_sleep()
                    wait.until(EC.presence_of_element_located((By.ID, category_id)))
                    section = driver.find_element(By.ID, category_id)
                    top_lis = section.find_elements(By.CSS_SELECTOR, "ul > li")
                    # Elements without an id cannot be addressed by the "#<id>"
                    # selectors below, so they are dropped up front.
                    parent_ids = [
                        pid
                        for pid in (li.get_attribute("id") for li in top_lis)
                        if pid
                    ]

                    def read_product_count():
                        """Parse the "(1,234)" count from the listing title; NaN if unavailable."""
                        try:
                            txt = driver.find_element(By.CSS_SELECTOR, "div.cateGoodsSecTit").text
                        except (NoSuchElementException, TimeoutException):
                            return np.nan
                        m = re.search(r'\(([\d,]+)\)', txt)
                        if not m:
                            return np.nan
                        digits = m.group(1).replace(',', '')
                        # "[\d,]+" also matches commas alone; int("") would raise
                        return int(digits) if digits else np.nan

                    def process_category(parent_id):
                        """Record one category and, if present, each of its direct subcategories."""
                        parent_sel = f"#{parent_id} > span > a.lnk_cate > em"

                        driver.get(root_url)
                        random_sleep()
                        wait.until(EC.presence_of_element_located((By.ID, parent_id)))

                        # Prefer the <u class="cate"> label when present, else the link text
                        u_elems = driver.find_elements(By.CSS_SELECTOR, f"#{parent_id} > span > u.cate")
                        if u_elems:
                            parent_name = u_elems[0].text.strip()
                        else:
                            parent_em = driver.find_element(By.CSS_SELECTOR, parent_sel)
                            parent_name = parent_em.text.strip()

                        parent_em = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, parent_sel)))
                        driver.execute_script("arguments[0].scrollIntoView();", parent_em)
                        random_sleep()
                        parent_em.click()
                        random_sleep()

                        current_url = driver.current_url
                        count = read_product_count()

                        # Handle subcategories
                        child_sel = f"#{parent_id} > ul > li > span > a.lnk_cate > em"
                        try:
                            child_ems = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, child_sel)))
                        except TimeoutException:
                            # No children appeared: store the category itself
                            results.append({
                                'category':      parent_name,
                                'subcategory':   np.nan,
                                'url':           current_url,
                                'product_count': count
                            })
                            return

                        for i in range(len(child_ems)):
                            # Earlier references go stale after navigation. Wait for
                            # the children to be present again before indexing,
                            # instead of querying a page that may still be rendering.
                            child_ems = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, child_sel)))
                            child_em = child_ems[i]
                            child_text = child_em.text.strip()
                            child_name = child_text if child_text else np.nan

                            driver.execute_script("arguments[0].scrollIntoView();", child_em)
                            random_sleep()

                            try:
                                child_em.click()
                            except Exception:
                                # Click failed: keep the name, leave URL and count empty
                                results.append({
                                    'category':      parent_name,
                                    'subcategory':   child_name,
                                    'url':           np.nan,
                                    'product_count': np.nan
                                })
                                continue

                            # The clicked element going stale signals the page has changed
                            wait.until(EC.staleness_of(child_em))
                            random_sleep()

                            child_count = read_product_count()

                            results.append({
                                'category':      parent_name,
                                'subcategory':   child_name,
                                'url':           driver.current_url,
                                'product_count': child_count
                            })

                            driver.back()
                            random_sleep()
                            wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, parent_sel)))

                    for pid in parent_ids:
                        process_category(pid)

                    if not results:
                        # Do not report an empty file as a successful save
                        raise RuntimeError("no categories were scraped")

                    df = pd.DataFrame(results)
                    df.to_csv(filename, index=False, encoding="utf-8-sig")

                    write_log(f"‚úÖ Save completed: {filename}")

                    # Function exits on success
                    return

                except Exception as e:
                    if is_last_attempt:
                        write_log(f"‚ùå Error occurred (final attempt): {e}")
                    else:
                        write_log(f"‚ö†Ô∏è Error occurred (attempt {attempt + 1}/{max_retries}): {e}")
                        retry_pending = True

                finally:
                    driver.quit()

            except Exception as setup_error:
                # Driver initialization failed, etc.
                write_log(f"‚ùå Setup error (attempt {attempt + 1}/{max_retries}): {setup_error}")
                retry_pending = not is_last_attempt

            # Sleep only after the browser has been shut down, so a failed
            # session does not stay alive during the back-off.
            if retry_pending:
                time.sleep(RETRY_DELAY_SECONDS)

    run_with_retry()

# Crawling

In [3]:
# List the jobs currently registered with schedule's default scheduler
import schedule

pending_jobs = schedule.jobs
print(pending_jobs)

[]


In [4]:
import schedule
import threading
from datetime import datetime
from time import sleep  # no longer used by this cell; kept in case other cells rely on it

SCRAPER_JOB_TAG = "yes24-scraper"

# Make the cell safe to re-run: ask a loop started by an earlier run to stop and
# drop the jobs that run registered, so jobs and threads do not pile up.
_previous_stop_event = globals().get("scheduler_stop_event")
if _previous_stop_event is not None:
    _previous_stop_event.set()
schedule.clear(SCRAPER_JOB_TAG)

# Register scheduled tasks (twice a day, sub-category scrape 15 minutes later)
schedule.every().day.at("05:00").do(run_ctgr).tag(SCRAPER_JOB_TAG)
schedule.every().day.at("05:15").do(run_subctgr).tag(SCRAPER_JOB_TAG)
schedule.every().day.at("17:00").do(run_ctgr).tag(SCRAPER_JOB_TAG)
schedule.every().day.at("17:15").do(run_subctgr).tag(SCRAPER_JOB_TAG)

# Call scheduler_stop_event.set() to end the loop started below
scheduler_stop_event = threading.Event()

def run_schedule_loop(stop_event=scheduler_stop_event):
    """Run pending jobs every 30 seconds until ``stop_event`` is set.

    Args:
        stop_event: ``threading.Event`` that ends the loop once set. It
            defaults to the event created by this cell run, so a loop from an
            earlier run keeps watching its own event.
    """
    print(f"üïí Scheduler has started ({datetime.now().strftime('%Y-%m-%d %H:%M:%S')})")
    while not stop_event.is_set():
        try:
            schedule.run_pending()
        except Exception as exc:
            # An exception escaping a job must not end the scheduler loop
            print(f"[scheduler] job raised {type(exc).__name__}: {exc}")
        # Same 30-second cadence as before, but wakes immediately when stopped
        stop_event.wait(30)

# Start the scheduler in a background daemon thread so it cannot block
# interpreter/kernel shutdown
scheduler_thread = threading.Thread(target=run_schedule_loop, daemon=True)
scheduler_thread.start()

üïí Scheduler has started (2025-08-02 20:04:26)


In [5]:
# Show the jobs registered with schedule's default scheduler after setup
import schedule

registered_jobs = schedule.jobs
print(registered_jobs)

[Every 1 day at 05:00:00 do run_ctgr() (last run: [never], next run: 2025-08-03 05:00:00), Every 1 day at 05:15:00 do run_subctgr() (last run: [never], next run: 2025-08-03 05:15:00), Every 1 day at 17:00:00 do run_ctgr() (last run: [never], next run: 2025-08-03 17:00:00), Every 1 day at 17:15:00 do run_subctgr() (last run: [never], next run: 2025-08-03 17:15:00)]
