In [11]:
# ─────────────────────────────────────────
# 필요 패키지 설치 (최초 1회만)
# pip install selenium webdriver-manager pandas openpyxl
# ─────────────────────────────────────────

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time


def go_to_next_page(driver, wait):
    try:
        curr = int(driver.find_element(By.CSS_SELECTOR, "div.pageing > strong").text.strip())
    except:
        return False
    nxt = curr + 1
    try:
        wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, f'div.pageing > a[data-page-no="{nxt}"]')
        )).click()
    except TimeoutException:
        try:
            wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "div.pageing > a.next"))).click()
            wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, f'div.pageing > a[data-page-no="{nxt}"]')
            )).click()
        except TimeoutException:
            return False
    try:
        wait.until(lambda d: d.find_element(By.CSS_SELECTOR, "div.pageing > strong").text.strip() == str(nxt))
        return True
    except TimeoutException:
        return False


def collect_reviews(driver, wait):
    reviews = []
    try:
        wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#reviewInfo > a"))).click()
        time.sleep(1)
    except TimeoutException:
        return reviews

    while True:
        wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#gdasList > li")))
        for el in driver.find_elements(By.CSS_SELECTOR, "#gdasList > li > div.review_cont > div.txt_inner"):
            txt = el.text.strip()
            if txt:
                reviews.append(txt)
        if not go_to_next_page(driver, wait):
            break
    return reviews


def crawl_reviews(driver, wait):
    """
    현재 탭에서 상품 링크 수집→새 탭 열기→브랜드/상품명/리뷰 수집 후 리스트 반환
    반환 형식: [[brand, product, review], ...]
    """
    main_handle = driver.current_window_handle
    links = [a.get_attribute("href")
             for a in driver.find_elements(By.CSS_SELECTOR, "ul.cate_prd_list > li .prd_info a")]

    rows = []
    for href in links:
        driver.execute_script("window.open(arguments[0], '_blank');", href)
        driver.switch_to.window(driver.window_handles[-1])
        time.sleep(1)

        try:
            brand = driver.find_element(By.CSS_SELECTOR, "#moveBrandShop").text.strip()
        except:
            brand = ""
        try:
            product = driver.find_element(By.CSS_SELECTOR, "p.prd_name").text.strip()
        except:
            product = ""

        reviews = collect_reviews(driver, wait)
        for rev in reviews:
            rows.append([brand, product, rev])

        driver.close()
        # 안전하게 원래 탭으로 복귀
        handles = driver.window_handles
        if main_handle in handles:
            driver.switch_to.window(main_handle)
        elif handles:
            driver.switch_to.window(handles[0])
        else:
            break

    return rows

def main():
    options = webdriver.ChromeOptions()
    options.add_experimental_option("detach", True)
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)
    wait = WebDriverWait(driver, 10)

    driver.get("https://www.oliveyoung.co.kr/store/main/main.do")
    wait.until(EC.element_to_be_clickable((By.ID, "btnGnbOpen"))).click()
    time.sleep(1)

    all_rows = []
    categories = [
        ("스킨/토너",      "//a[contains(text(), '스킨/토너')]"),
        ("에센스/세럼/앰플", "a[data-attr*='에센스/세럼/앰플']"),
        ("크림",           "//ul[@class='loc_history']//a[@class='cate_y' and normalize-space(text())='크림']"),
        ("아이크림",       "//ul[@class='cate_list_box']//a[normalize-space(text())='아이크림']")
    ]

    for cat_name, selector in categories:
        # 카테고리 클릭
        if selector.startswith("//"):
            wait.until(EC.element_to_be_clickable((By.XPATH, selector))).click()
        else:
            wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, selector))).click()
        time.sleep(1)

        # 리뷰 크롤링
        rows = crawl_reviews(driver, wait)
        # 카테고리명 컬럼 추가
        for r in rows:
            all_rows.append([cat_name] + r)

    driver.quit()

    # 하나의 DataFrame으로 저장
    df = pd.DataFrame(all_rows, columns=["카테고리", "브랜드명", "상품명", "리뷰내용"])
    df.to_csv("all_reviews.csv", index=False, encoding="utf-8-sig")
    df.to_excel("all_reviews.xlsx", index=False)
    print(f"✅ 전체 리뷰 {len(df)}개 저장 → all_reviews.csv, all_reviews.xlsx")


if __name__ == "__main__":
    main()



TimeoutException: Message: 


In [15]:
# ─────────────────────────────────────────
# 필요 패키지 설치 (최초 1회만)
# pip install selenium webdriver-manager pandas openpyxl
# ─────────────────────────────────────────

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

BASE_URL = "https://www.oliveyoung.co.kr/store/main/main.do"

# 다음 페이지로 이동 (페이징 처리)
def go_to_next_page(driver, wait):
    try:
        curr = int(driver.find_element(By.CSS_SELECTOR, "div.pageing > strong").text.strip())
    except:
        return False
    nxt = curr + 1
    try:
        wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, f'div.pageing > a[data-page-no="{nxt}"]')
        )).click()
    except TimeoutException:
        # '다음 10페이지' 버튼 클릭 후 재시도
        try:
            wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "div.pageing > a.next"))).click()
            wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, f'div.pageing > a[data-page-no="{nxt}"]')
            )).click()
        except TimeoutException:
            return False
    try:
        wait.until(lambda d: d.find_element(By.CSS_SELECTOR, "div.pageing > strong").text.strip() == str(nxt))
        return True
    except TimeoutException:
        return False

# 한 상품 페이지에서 리뷰 모두 수집
def collect_reviews(driver, wait):
    reviews = []
    try:
        wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#reviewInfo > a"))).click()
        time.sleep(1)
    except TimeoutException:
        return reviews

    while True:
        wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#gdasList > li")))
        for el in driver.find_elements(By.CSS_SELECTOR, "#gdasList > li > div.review_cont > div.txt_inner"):
            txt = el.text.strip()
            if txt:
                reviews.append(txt)
        if not go_to_next_page(driver, wait):
            break
    return reviews

# 현재 카테고리 페이지에서 모든 상품 링크 열고 리뷰 수집
def crawl_reviews(driver, wait):
    main_handle = driver.current_window_handle
    links = [a.get_attribute("href") for a in driver.find_elements(By.CSS_SELECTOR, "ul.cate_prd_list > li .prd_info a")]
    rows = []

    for href in links:
        # 새 탭 열기
        driver.execute_script("window.open(arguments[0], '_blank');", href)
        driver.switch_to.window(driver.window_handles[-1])
        time.sleep(1)

        # 브랜드명/상품명
        try:
            brand = driver.find_element(By.CSS_SELECTOR, "#moveBrandShop").text.strip()
        except:
            brand = ""
        try:
            product = driver.find_element(By.CSS_SELECTOR, "p.prd_name").text.strip()
        except:
            product = ""

        # 리뷰 수집
        for rev in collect_reviews(driver, wait):
            rows.append([brand, product, rev])

        # 탭 닫고 원래 탭으로 복귀
        driver.close()
        handles = driver.window_handles
        if main_handle in handles:
            driver.switch_to.window(main_handle)
        else:
            driver.switch_to.window(handles[0])

    return rows

# GNB 메뉴 열고 data-attr 셀렉터로 카테고리 클릭
def click_category(driver, wait, css_selector):
    driver.get(BASE_URL)
    wait.until(EC.element_to_be_clickable((By.ID, "btnGnbOpen"))).click()
    time.sleep(1)
    try:
        el = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, css_selector)))
        driver.execute_script("arguments[0].scrollIntoView(true);", el)
        el.click()
        time.sleep(2)
        return True
    except TimeoutException:
        return False

def main():
    options = webdriver.ChromeOptions()
    options.add_experimental_option("detach", True)
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    wait = WebDriverWait(driver, 15)

    # 카테고리명 → data-attr 기반 CSS 셀렉터 매핑
    category_selectors = {
        "스킨/토너":      "a[data-attr*='카테고리상세^카테고리리스트^스킨/토너']",
        "에센스/세럼/앰플": "a[data-attr*='카테고리상세^카테고리리스트^에센스/세럼/앰플']",
        "크림":           "a[data-attr*='카테고리상세^카테고리리스트^크림']",
        "아이크림":       "a[data-attr*='카테고리상세^카테고리리스트^아이크림']"
    }

    for cat_name, selector in category_selectors.items():
        print(f"\n▶▶ [{cat_name}] 크롤링 시작")
        if not click_category(driver, wait, selector):
            print(f"⚠️ [{cat_name}] 클릭 실패, 건너뜁니다.")
            continue

        rows = crawl_reviews(driver, wait)
        if not rows:
            print(f"⚠️ [{cat_name}] 수집된 리뷰가 없습니다.")
            continue

        # DataFrame 생성 & 파일명 안전 처리
        df = pd.DataFrame(rows, columns=["브랜드명", "상품명", "리뷰내용"])
        safe_name = cat_name.replace("/", "_")
        csv_name = f"{safe_name}_reviews.csv"
        xlsx_name = f"{safe_name}_reviews.xlsx"

        df.to_csv(csv_name, index=False, encoding="utf-8-sig")
        df.to_excel(xlsx_name, index=False)
        print(f"✅ [{cat_name}] 리뷰 {len(df)}개 저장 → {csv_name}, {xlsx_name}")

    driver.quit()
    print("\n🎉 모든 카테고리 크롤링 완료!")

if __name__ == "__main__":
    main()



▶▶ [스킨/토너] 크롤링 시작
⚠️ [스킨/토너] 클릭 실패, 건너뜁니다.

▶▶ [에센스/세럼/앰플] 크롤링 시작
⚠️ [에센스/세럼/앰플] 클릭 실패, 건너뜁니다.

▶▶ [크림] 크롤링 시작
⚠️ [크림] 클릭 실패, 건너뜁니다.

▶▶ [아이크림] 크롤링 시작
⚠️ [아이크림] 클릭 실패, 건너뜁니다.

🎉 모든 카테고리 크롤링 완료!


In [None]:
# ─────────────────────────────────────────
# 필요 패키지 설치 (최초 1회만)
# pip install selenium webdriver-manager pandas openpyxl
# ─────────────────────────────────────────

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

BASE_URL = "https://www.oliveyoung.co.kr/store/main/main.do"

def go_to_next_page(driver, wait):
    try:
        curr = int(driver.find_element(By.CSS_SELECTOR, "div.pageing > strong").text.strip())
    except:
        return False
    nxt = curr + 1
    try:
        wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, f"div.pageing > a[data-page-no='{nxt}']"))).click()
    except TimeoutException:
        try:
            wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "div.pageing > a.next"))).click()
            wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, f"div.pageing > a[data-page-no='{nxt}']"))).click()
        except TimeoutException:
            return False
    try:
        wait.until(lambda d: d.find_element(By.CSS_SELECTOR, "div.pageing > strong").text.strip() == str(nxt))
        return True
    except TimeoutException:
        return False

def collect_reviews(driver, wait):
    reviews = []
    try:
        wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#reviewInfo > a"))).click()
        time.sleep(1)
    except TimeoutException:
        return reviews

    while True:
        wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#gdasList > li")))
        for el in driver.find_elements(By.CSS_SELECTOR, "#gdasList > li > div.review_cont > div.txt_inner"):
            txt = el.text.strip()
            if txt:
                reviews.append(txt)
        if not go_to_next_page(driver, wait):
            break
    return reviews

def crawl_reviews(driver, wait):
    main_handle = driver.current_window_handle
    links = [a.get_attribute("href")
             for a in driver.find_elements(By.CSS_SELECTOR, "ul.cate_prd_list > li .prd_info a")]
    rows = []

    for href in links:
        driver.execute_script("window.open(arguments[0], '_blank')", href)
        driver.switch_to.window(driver.window_handles[-1])
        time.sleep(1)

        try:
            brand = driver.find_element(By.CSS_SELECTOR, "#moveBrandShop").text.strip()
        except:
            brand = ""
        try:
            product = driver.find_element(By.CSS_SELECTOR, "p.prd_name").text.strip()
        except:
            product = ""

        for rev in collect_reviews(driver, wait):
            rows.append([brand, product, rev])

        driver.close()
        handles = driver.window_handles
        if main_handle in handles:
            driver.switch_to.window(main_handle)
        else:
            driver.switch_to.window(handles[0])

    return rows

def click_category(driver, wait, xpath):
    wait.until(EC.element_to_be_clickable((By.ID, "btnGnbOpen"))).click()
    time.sleep(1)
    try:
        el = wait.until(EC.element_to_be_clickable((By.XPATH, xpath)))
        driver.execute_script("arguments[0].scrollIntoView(true);", el)
        el.click()
        time.sleep(2)
        return True
    except TimeoutException:
        return False

def main():
    options = webdriver.ChromeOptions()
    options.add_experimental_option("detach", True)
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    wait = WebDriverWait(driver, 15)

    # 카테고리 XPaths
    XPATHS = {
        "스킨/토너":      "//a[text()='스킨/토너']",
        "에센스/세럼/앰플": "//a[text()='에센스/세럼/앰플']",
        "크림":           "//a[text()='크림']",
        "아이크림":       "//a[text()='아이크림']",
    }

    for cat in ["스킨/토너", "에센스/세럼/앰플", "크림", "아이크림"]:
        print(f"\n▶▶ [{cat}] 크롤링 시작")
        driver.get(BASE_URL)

        # GNB 열고 클릭
        if cat != "아이크림":
            if not click_category(driver, wait, XPATHS[cat]):
                print(f"⚠️ [{cat}] 클릭 실패, 건너뜁니다.")
                continue
        else:
            # 아이크림은 먼저 크림 클릭 → 아이크림 클릭
            if not click_category(driver, wait, XPATHS["크림"]):
                print("⚠️ [크림] 클릭 실패, 아이크림으로 넘어갈 수 없습니다.")
                continue
            # 크림 페이지 내에서 아이크림 클릭
            try:
                el = wait.until(EC.element_to_be_clickable((By.XPATH, XPATHS["아이크림"])))
                driver.execute_script("arguments[0].scrollIntoView(true);", el)
                el.click()
                time.sleep(2)
            except TimeoutException:
                print("⚠️ [아이크림] 클릭 실패")
                continue

        rows = crawl_reviews(driver, wait)
        if not rows:
            print(f"⚠️ [{cat}] 리뷰가 없습니다.")
            continue

        df = pd.DataFrame(rows, columns=["브랜드명", "상품명", "리뷰내용"])
        safe = cat.replace("/", "_")
        csv_name = f"{safe}_reviews.csv"
        xlsx_name = f"{safe}_reviews.xlsx"
        df.to_csv(csv_name, index=False, encoding="utf-8-sig")
        df.to_excel(xlsx_name, index=False)
        print(f"✅ [{cat}] 리뷰 {len(df)}개 저장 → {csv_name}, {xlsx_name}")

    driver.quit()
    print("\n🎉 전체 크롤링 완료")

if __name__ == "__main__":
    main()



▶▶ [스킨/토너] 크롤링 시작
