In [1]:
pip install undetected-chromedriver selenium tqdm tenacity

Defaulting to user installation because normal site-packages is not writeable
Collecting undetected-chromedriver
  Using cached undetected_chromedriver-3.5.5-py3-none-any.whl
Collecting selenium
  Using cached selenium-4.38.0-py3-none-any.whl.metadata (7.5 kB)
Collecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting tenacity
  Using cached tenacity-9.1.2-py3-none-any.whl.metadata (1.2 kB)
Collecting websockets (from undetected-chromedriver)
  Using cached websockets-15.0.1-cp313-cp313-win_amd64.whl.metadata (7.0 kB)
Collecting trio<1.0,>=0.31.0 (from selenium)
  Using cached trio-0.32.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket<1.0,>=0.12.2 (from selenium)
  Using cached trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting sortedcontainers (from trio<1.0,>=0.31.0->selenium)
  Using cached sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio<1.0,>=0.31.0->selenium)
  Using cached outco


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: C:\Users\yooni\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [None]:
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementClickInterceptedException
import time
import csv
import os
from multiprocessing import Pool, Manager, Lock
from functools import partial
import logging
from tqdm import tqdm
from tenacity import retry, stop_after_attempt, wait_exponential

# --- 로깅 설정 ---
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(processName)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('crawler.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# --- 1. 설정 ---
SEARCH_KEYWORD = "dior"
PERFUME_CSV_FILE = f'parfumo_perfumes_{SEARCH_KEYWORD}.csv'
REVIEW_CSV_FILE = f'parfumo_reviews_{SEARCH_KEYWORD}.csv'
NUM_WORKERS = 4  # 병렬 처리 워커 수 (CPU 코어 수에 맞게 조정)
RATE_LIMIT_DELAY = 0.3  # 요청 간 최소 대기 시간 (초)

# --- 2. CSV 파일 헤더 ---
PERFUME_FIELDNAMES = ['product_name', 'brand_name', 'target_gender', 'release_year',
                      'top_notes', 'heart_notes', 'base_notes']
REVIEW_FIELDNAMES = ['product_name', 'review_title', 'review_content']

# --- 3. 선택자 (Selectors) ---
SEARCH_BAR_SELECTOR = (By.ID, 's_top')
SEARCH_SUBMIT_SELECTOR = (By.CSS_SELECTOR, 'button.btn-s-ext')
PRODUCT_LINK_SELECTOR = (By.CSS_SELECTOR, 'div.name > a')
NEXT_PAGE_BUTTON_SELECTOR = (By.CSS_SELECTOR, 'a.paging_links[rel="next"]')

# 제품 상세 페이지
PRODUCT_NAME_SELECTOR = (By.CSS_SELECTOR, 'h1.p_name_h1')
BRAND_NAME_SELECTOR = (By.CSS_SELECTOR, 'h1 span[itemprop="brand"] span[itemprop="name"]')
TARGET_GENDER_SELECTOR = (By.CSS_SELECTOR, 'div.p_gender_big i')
RELEASE_YEAR_SELECTOR = (By.CSS_SELECTOR, 'h1 span.label_a')

# 노트 셀렉터
TOP_NOTES_SELECTOR = (By.CSS_SELECTOR, 'span[data-nt="t"] span.nowrap')
HEART_NOTES_SELECTOR = (By.CSS_SELECTOR, 'span[data-nt="m"] span.nowrap')
BASE_NOTES_SELECTOR = (By.CSS_SELECTOR, 'span[data-nt="b"] span.nowrap')

# 리뷰 섹션
REVIEW_TITLE_SELECTOR = (By.CSS_SELECTOR, 'div.text-lg.bold span[itemprop="name"]')
REVIEW_CONTENT_SELECTOR = (By.CSS_SELECTOR, 'div.leading-7')
READ_MORE_BUTTON_SELECTOR = (By.XPATH, ".//div[contains(text(), 'Read more')]")
MORE_REVIEWS_BUTTON_SELECTOR = (By.CSS_SELECTOR, 'span.action_more_reviews')
REVIEW_CONTAINER_SELECTOR = (By.CSS_SELECTOR, 'article.review')


# --- 4. 헬퍼 함수 ---

def setup_driver(headless=False):
    """Undetected-Chromedriver를 설정하고 실행합니다."""
    options = uc.ChromeOptions()
    if headless:
        options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--disable-gpu')
    options.add_argument('--disable-extensions')
    options.add_argument('--disable-images')  # 이미지 로딩 비활성화로 속도 향상
    driver = uc.Chrome(options=options)
    driver.implicitly_wait(3)  # 7초에서 3초로 단축
    return driver


def setup_csv_files():
    """CSV 파일이 없으면 헤더와 함께 생성합니다."""
    if not os.path.exists(PERFUME_CSV_FILE):
        with open(PERFUME_CSV_FILE, 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=PERFUME_FIELDNAMES)
            writer.writeheader()

    if not os.path.exists(REVIEW_CSV_FILE):
        with open(REVIEW_CSV_FILE, 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=REVIEW_FIELDNAMES)
            writer.writeheader()


def click_with_js(driver, element):
    """JavaScript로 클릭합니다."""
    try:
        driver.execute_script("arguments[0].click();", element)
    except Exception as e:
        logger.debug(f"JS 클릭 실패: {e}")


def safe_find_text(driver_or_element, *selector, wait_time=2):
    """요소를 찾아 텍스트를 반환하되, 없으면 빈 문자열을 반환합니다."""
    try:
        element = WebDriverWait(driver_or_element, wait_time).until(
            EC.presence_of_element_located(selector)
        )
        return element.text
    except (NoSuchElementException, TimeoutException):
        return ""


def get_notes(driver, *selector):
    """노트 요소를 모두 찾아 텍스트 리스트로 반환하고 쉼표로 연결합니다."""
    try:
        elements = driver.find_elements(*selector)
        notes = [elem.text for elem in elements if elem.text]
        return ", ".join(notes)
    except NoSuchElementException:
        return ""


def write_batch_to_csv(filename, fieldnames, data_batch, lock):
    """배치 데이터를 한 번에 CSV에 씁니다."""
    if not data_batch:
        return

    with lock:
        with open(filename, 'a', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writerows(data_batch)


# --- 5. 핵심 스크래핑 함수 (재시도 로직 포함) ---

@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
def scrape_product_details(driver):
    """제품 상세 페이지에서 향수 정보를 스크랩합니다."""
    wait = WebDriverWait(driver, 5)

    product_name_element = wait.until(EC.presence_of_element_located(PRODUCT_NAME_SELECTOR))
    product_name = driver.execute_script("return arguments[0].firstChild.textContent.trim()", product_name_element)

    brand_name = safe_find_text(driver, *BRAND_NAME_SELECTOR)
    release_year = safe_find_text(driver, *RELEASE_YEAR_SELECTOR)

    target_gender = "N/A"
    try:
        icon_class = driver.find_element(*TARGET_GENDER_SELECTOR).get_attribute('class')
        if 'fa-mars' in icon_class:
            target_gender = 'M'
        elif 'fa-venus' in icon_class:
            target_gender = 'F'
        elif 'fa-venus-mars' in icon_class:
            target_gender = 'N'
    except NoSuchElementException:
        pass

    top_notes = get_notes(driver, *TOP_NOTES_SELECTOR)
    heart_notes = get_notes(driver, *HEART_NOTES_SELECTOR)
    base_notes = get_notes(driver, *BASE_NOTES_SELECTOR)

    product_data = {
        'product_name': product_name,
        'brand_name': brand_name,
        'target_gender': target_gender,
        'release_year': release_year,
        'top_notes': top_notes,
        'heart_notes': heart_notes,
        'base_notes': base_notes
    }

    return product_name, product_data


def scrape_reviews(driver, product_name):
    """제품 페이지의 모든 리뷰를 스크랩합니다. (배치 처리)"""
    wait = WebDriverWait(driver, 5)
    processed_review_texts = set()
    reviews_batch = []  # 메모리에 배치로 저장

    logger.info(f"[{product_name}] 리뷰 수집 시작...")

    # 리뷰 섹션으로 스크롤
    try:
        reviews_section = wait.until(
            EC.presence_of_element_located((By.ID, "reviews_holder"))
        )
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", reviews_section)
        time.sleep(0.5)  # 1초에서 0.5초로 단축
    except Exception as e:
        logger.warning(f"스크롤 중 오류 발생: {e}")
        return reviews_batch

    while True:
        try:
            # 리뷰 요소 찾기
            review_elements = wait.until(
                EC.presence_of_all_elements_located(REVIEW_CONTAINER_SELECTOR)
            )

            for review in review_elements:
                try:
                    # Read more 버튼 처리
                    read_more_button = review.find_element(*READ_MORE_BUTTON_SELECTOR)
                    if read_more_button.is_displayed():
                        click_with_js(driver, read_more_button)
                        time.sleep(0.2)  # 0.5초에서 0.2초로 단축
                except NoSuchElementException:
                    pass

                title = safe_find_text(review, *REVIEW_TITLE_SELECTOR, wait_time=1)
                content = safe_find_text(review, *REVIEW_CONTENT_SELECTOR, wait_time=1)

                if content and content not in processed_review_texts:
                    processed_review_texts.add(content)

                    review_data = {
                        'product_name': product_name,
                        'review_title': title,
                        'review_content': content
                    }
                    reviews_batch.append(review_data)

            # More reviews 버튼 클릭
            more_reviews_button = wait.until(
                EC.element_to_be_clickable(MORE_REVIEWS_BUTTON_SELECTOR)
            )
            click_with_js(driver, more_reviews_button)
            time.sleep(RATE_LIMIT_DELAY)

        except (TimeoutException, NoSuchElementException):
            logger.info(f"✅ [{product_name}] 리뷰 {len(reviews_batch)}개 수집 완료.")
            break
        except Exception as e:
            logger.error(f"❌ [{product_name}] 리뷰 수집 중 오류: {e}")
            break

    return reviews_batch


# --- 6. 워커 함수 (병렬 처리용) ---

def process_product_batch(product_urls, file_lock, worker_id):
    """워커가 제품 배치를 처리합니다."""
    driver = None
    try:
        driver = setup_driver(headless=False)

        for i, url in enumerate(product_urls):
            try:
                logger.info(f"[Worker-{worker_id}] ({i + 1}/{len(product_urls)}) 제품 처리 중: {url}")
                driver.get(url)

                # 제품 정보 스크랩
                product_name, product_data = scrape_product_details(driver)
                write_batch_to_csv(PERFUME_CSV_FILE, PERFUME_FIELDNAMES, [product_data], file_lock)
                logger.info(f"✅ [Worker-{worker_id}] 제품 정보 저장: {product_name}")

                # 리뷰 스크랩 (배치 처리)
                reviews_batch = scrape_reviews(driver, product_name)
                if reviews_batch:
                    write_batch_to_csv(REVIEW_CSV_FILE, REVIEW_FIELDNAMES, reviews_batch, file_lock)

                time.sleep(RATE_LIMIT_DELAY)

            except Exception as e:
                logger.error(f"❌ [Worker-{worker_id}] 제품 처리 중 오류: {url} - {e}")
                continue

    except Exception as e:
        logger.error(f"❌ [Worker-{worker_id}] 워커 오류: {e}")
    finally:
        if driver:
            driver.quit()


# --- 7. 제품 URL 수집 함수 ---

def collect_all_product_urls():
    """모든 검색 결과 페이지에서 제품 URL을 수집합니다."""
    driver = setup_driver()
    wait = WebDriverWait(driver, 10)
    all_product_urls = []

    try:
        # 메인 페이지로 이동
        driver.get("https://www.parfumo.com/")

        # Privacy 팝업 처리
        try:
            iframe_element = wait.until(
                EC.presence_of_element_located((By.ID, "sp_message_iframe_902160"))
            )
            driver.switch_to.frame(iframe_element)

            settings_button = wait.until(
                EC.element_to_be_clickable((By.XPATH, "//button[@title='Settings or reject']"))
            )
            settings_button.click()

            save_exit_button = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "button.sp_choice_type_SAVE_AND_EXIT"))
            )
            save_exit_button.click()
            logger.info("✅ Privacy 팝업 처리 완료")

        except (TimeoutException, NoSuchElementException):
            logger.info("Privacy 팝업이 없거나 이미 처리됨")
        finally:
            driver.switch_to.default_content()

        # 검색 실행
        search_bar = wait.until(EC.element_to_be_clickable(SEARCH_BAR_SELECTOR))
        search_bar.send_keys(SEARCH_KEYWORD)

        search_button = wait.until(EC.element_to_be_clickable(SEARCH_SUBMIT_SELECTOR))
        click_with_js(driver, search_button)

        logger.info(f"'{SEARCH_KEYWORD}' 검색 완료. URL 수집 시작...")

        # 페이지네이션으로 모든 URL 수집
        page_num = 1
        while True:
            logger.info(f"페이지 {page_num} URL 수집 중...")

            try:
                wait.until(EC.presence_of_element_located(PRODUCT_LINK_SELECTOR))
                product_link_elements = driver.find_elements(*PRODUCT_LINK_SELECTOR)
                page_urls = [elem.get_attribute('href') for elem in product_link_elements
                             if elem.get_attribute('href')]

                all_product_urls.extend(page_urls)
                logger.info(f"페이지 {page_num}에서 {len(page_urls)}개 URL 수집 (총: {len(all_product_urls)}개)")

            except TimeoutException:
                logger.warning(f"페이지 {page_num}에서 제품 링크를 찾을 수 없음")

            # 다음 페이지 확인
            try:
                next_button = wait.until(
                    EC.presence_of_element_located(NEXT_PAGE_BUTTON_SELECTOR)
                )
                next_page_url = next_button.get_attribute('href')
                driver.get(next_page_url)
                time.sleep(1)
                page_num += 1
            except (TimeoutException, NoSuchElementException):
                logger.info(f"✅ 마지막 페이지 도달. 총 {len(all_product_urls)}개 URL 수집 완료")
                break

    except Exception as e:
        logger.error(f"❌ URL 수집 중 오류: {e}")
    finally:
        driver.quit()

    return all_product_urls


# --- 8. 메인 실행 ---

def main():
    """메인 실행 함수"""
    logger.info("=" * 60)
    logger.info("향수 크롤러 시작 (최적화 버전)")
    logger.info("=" * 60)

    setup_csv_files()

    # 1단계: 모든 제품 URL 수집
    logger.info("\n[1단계] 제품 URL 수집 중...")
    product_urls = collect_all_product_urls()

    if not product_urls:
        logger.error("수집된 제품 URL이 없습니다. 종료합니다.")
        return

    logger.info(f"\n총 {len(product_urls)}개 제품 발견")

    # 2단계: URL을 배치로 나누어 병렬 처리
    logger.info(f"\n[2단계] {NUM_WORKERS}개 워커로 병렬 처리 시작...")

    # URL을 워커 수만큼 분할
    batch_size = len(product_urls) // NUM_WORKERS + 1
    url_batches = [product_urls[i:i + batch_size] for i in range(0, len(product_urls), batch_size)]

    # 멀티프로세싱 매니저로 파일 락 생성
    manager = Manager()
    file_lock = manager.Lock()

    # 병렬 처리 실행
    with Pool(NUM_WORKERS) as pool:
        worker_func = partial(process_product_batch, file_lock=file_lock)
        pool.starmap(worker_func, [(batch, i) for i, batch in enumerate(url_batches)])

    logger.info("\n" + "=" * 60)
    logger.info("✅ 모든 크롤링 완료!")
    logger.info("=" * 60)


if __name__ == "__main__":
    main()

--- Logging error ---
Traceback (most recent call last):
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.13_3.13.2544.0_x64__qbz5n2kfra8p0\Lib\logging\__init__.py", line 1154, in emit
    stream.write(msg + self.terminator)
    ~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.13_3.13.2544.0_x64__qbz5n2kfra8p0\Lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
           ~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeEncodeError: 'charmap' codec can't encode characters in position 47-48: character maps to <undefined>
Call stack:
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\yooni\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\LocalCache\local-packages\Python313\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.