In [6]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.expected_conditions import *
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from datetime import datetime
from urllib.parse import urlparse

from selenium.common.exceptions import StaleElementReferenceException, TimeoutException

import time
import json
import os
import csv
import re


In [7]:
# Category listing page to scrape. Exactly one assignment should be active;
# the commented-out lines are alternative categories on the same site.
# url = 'https://www.1976.com.tw/cat/27?t=all'  # women's perfume
# url = 'https://www.1976.com.tw/cat/40?t=all'  # men's perfume
url = 'https://www.1976.com.tw/cat/56?t=all'  # unisex perfume

In [8]:
def open_driver(url):
    '''
    Start a Chrome session and load ``url`` in it.

    Chrome is launched in incognito mode with driver logging switched off
    and the autofill / password-manager preferences disabled. The driver
    binary is obtained through webdriver-manager first; if that fails for
    any reason, a local ``chromedriver.exe`` is tried instead.

    Args:
        url: Address to open once the browser is running.

    Returns:
        The ``webdriver.Chrome`` instance, with the page loaded, a 5-second
        implicit wait configured and the window maximised.

    Raises:
        RuntimeError: If Chrome could not be started from either driver
            source. The message carries both underlying errors and the
            exception is chained to the fallback failure.
    '''
    options = webdriver.ChromeOptions()
    options.add_experimental_option('excludeSwitches', ['enable-logging'])
    options.add_argument("--incognito")
    prefs = {
        "profile.default_content_setting_values.autofill": 2,
        "profile.password_manager_enabled": False,
        "credentials_enable_service": False,
        "autofill.profile_enabled": False,
        "autofill.address_enabled": False,
        "autofill.credit_card_enabled": False,
    }
    options.add_experimental_option("prefs", prefs)

    try:
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=options)
    except Exception as e1:
        # Broad on purpose: download, permission and version errors all
        # lead to the same recovery, which is to try the local binary.
        fallback_path = r"chromedriver.exe"
        try:
            service = Service(fallback_path)
            driver = webdriver.Chrome(service=service, options=options)
        except Exception as e2:
            raise RuntimeError(
                f"Failed to initialize ChromeDriver:\n1) webdriver-manager error: {e1}\n2) Local driver error: {e2}"
            ) from e2

    try:
        driver.get(url)
        driver.implicitly_wait(5)
        driver.maximize_window()
    except Exception:
        # The browser process is already running; shut it down rather than
        # leaving an orphaned Chrome behind when setup fails.
        driver.quit()
        raise
    return driver

In [None]:
# Pause lengths passed to time.sleep() by the scraping functions below.
SCROLL_PAUSE = 0.5  # wait after scrolling the product listing
DETAIL_LOAD_PAUSE = 0.5  # wait after opening a detail page
BETWEEN_ITEMS_PAUSE = 0.3  # gap between consecutive products
BATCH_PAUSE = 1  # extra rest after each batch of products

def scrape_perfume_data(driver, csv_filename="perfume_data.csv"):
    '''
    Scrape every product reachable from the current listing page.

    Detail links are gathered first, then each one is visited in turn.
    Every successfully scraped product is written to ``csv_filename`` as
    its own row and flushed immediately, so rows already written survive
    a later failure.

    Args:
        driver: Selenium driver positioned on the listing page.
        csv_filename: Destination CSV path; overwritten on each call.

    Returns:
        List of the per-product dicts that were scraped successfully,
        in visiting order.
    '''
    columns = [
        "name",
        "description",
        "scent_notes",
        "top_notes",
        "middle_notes",
        "base_notes",
        "detail_url",
    ]
    wait = WebDriverWait(driver, 15)
    detail_links = collect_detail_links(driver, wait)
    scraped = []

    with open(csv_filename, 'w', newline='', encoding='utf-8') as out_file:
        writer = csv.DictWriter(out_file, fieldnames=columns)
        writer.writeheader()

        print(f"共收集到 {len(detail_links)} 个商品链接需要采集")

        position = 0
        for detail_url in detail_links:
            position += 1
            print(f"正在处理第 {position} 个商品: {detail_url}")

            record = visit_detail_page(driver, wait, detail_url)
            if record:
                scraped.append(record)
                # Restrict the row to the known columns, blank when absent.
                row = {}
                for column in columns:
                    row[column] = record.get(column, "")
                writer.writerow(row)
                out_file.flush()

            time.sleep(BETWEEN_ITEMS_PAUSE)
            # Take a longer breather after every tenth product.
            if position % 10 == 0:
                time.sleep(BATCH_PAUSE)

    return scraped

def collect_detail_links(driver, wait, max_scroll_rounds=12):
    '''
    Collect the detail-page URLs of all products on the listing page.

    Each round waits for product anchors to be present, records any new
    ``href`` values, then scrolls to the bottom of the document so that
    lazily loaded products get a chance to appear. Scrolling stops when
    the document height no longer grows, when the wait times out, when no
    anchors are found, or after ``max_scroll_rounds`` rounds.

    Args:
        driver: Selenium driver positioned on the listing page.
        wait: ``WebDriverWait`` used to wait for the product anchors.
        max_scroll_rounds: Upper bound on harvest-and-scroll rounds.

    Returns:
        List of unique URLs in first-seen order.
    '''
    selector = "div.item.prod-item a.inlineblock"
    collected = []
    seen = set()

    def harvest():
        # Record unseen hrefs; report whether any anchors were present.
        anchors = driver.find_elements(By.CSS_SELECTOR, selector)
        for anchor in anchors:
            try:
                href = anchor.get_attribute("href")
            except StaleElementReferenceException:
                # The node was re-rendered while we were iterating; it
                # will be picked up again on a later harvest if it exists.
                continue
            if href and href not in seen:
                seen.add(href)
                collected.append(href)
        return bool(anchors)

    last_height = 0
    for _ in range(max_scroll_rounds):
        try:
            wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector)))
        except TimeoutException:
            print("等待商品列表加载超时，提前结束")
            break

        if not harvest():
            break

        # Jump to the very bottom: an unchanged height is only a reliable
        # "nothing more to load" signal once the end of the page was reached.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(SCROLL_PAUSE)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    else:
        # Round budget exhausted while the page was still growing: pick up
        # whatever the final scroll loaded.
        harvest()

    return collected

def visit_detail_page(driver, wait, detail_url):
    '''
    Open a product detail page in a new tab and extract its data.

    The tab is closed afterwards and focus returns to the window that was
    active when the function was called. The tab is only closed if it was
    actually opened and switched to, so a failed ``window.open`` can never
    close the listing window.

    Args:
        driver: Selenium driver currently focused on the listing window.
        wait: ``WebDriverWait`` forwarded to ``extract_perfume_details``.
        detail_url: URL of the product detail page.

    Returns:
        The extracted info dict with ``detail_url`` added, or ``None`` if
        opening the page or extracting the data raised an exception.
    '''
    origin_handle = driver.current_window_handle
    known_handles = set(driver.window_handles)
    on_detail_tab = False

    try:
        driver.execute_script("window.open(arguments[0], '_blank');", detail_url)
        new_handles = [h for h in driver.window_handles if h not in known_handles]
        if not new_handles:
            # E.g. the pop-up was blocked; nothing to switch to or close.
            raise RuntimeError("未能打开新的详情页标签")
        driver.switch_to.window(new_handles[-1])
        on_detail_tab = True

        time.sleep(DETAIL_LOAD_PAUSE)
        perfume_info = extract_perfume_details(driver, wait)
        perfume_info["detail_url"] = detail_url
        print(f"    -> 成功提取: {perfume_info['name']}")
        return perfume_info
    except Exception as exc:
        # Best effort per product: report and let the caller move on.
        print(f"    -> 抓取 {detail_url} 时出错: {exc}")
        return None
    finally:
        if on_detail_tab:
            driver.close()
        driver.switch_to.window(origin_handle)
        time.sleep(BETWEEN_ITEMS_PAUSE / 2)

def parse_fragrance_section(lines):
    '''
    Parse the fragrance family and top / middle / base notes from text lines.

    A line is split at its first colon (full-width or ASCII). When a colon
    is present only the label in front of it is searched for a known field
    name, so a field name that merely appears inside a value (for example
    "前味：柑橘香調") cannot send the line to the wrong field. Lines without a
    colon are matched as a whole and stored as-is. Matching ignores
    whitespace and letter case.

    Args:
        lines: Iterable of text lines from the notes section.

    Returns:
        Dict with keys ``scent_notes``, ``top_notes``, ``middle_notes`` and
        ``base_notes``. Fields that were not found, or whose value was
        empty, keep the placeholder "未提供". If several lines match the
        same field, the last non-empty one wins.
    '''
    parsed = {
        "scent_notes": "未提供",
        "top_notes": "未提供",
        "middle_notes": "未提供",
        "base_notes": "未提供",
    }

    # Aliases are stored whitespace-free and lower-case because the label
    # is normalised the same way before comparison.
    alias_map = {
        "scent_notes": ("香調", "香调"),
        "top_notes": ("前味", "前調", "前调", "topnotes"),
        "middle_notes": ("中味", "中調", "中调", "middlenotes", "heartnotes"),
        "base_notes": ("後味", "后味", "後調", "后調", "后调", "後调", "basenotes"),
    }

    for line in lines:
        pieces = re.split(r"[：:]", line, maxsplit=1)
        if len(pieces) == 2:
            label, value = pieces[0], pieces[1].strip()
        else:
            # No separator: fall back to whole-line matching and keep the
            # full line as the value.
            label = value = line

        needle = re.sub(r"\s+", "", label).casefold()
        for key, aliases in alias_map.items():
            if any(alias in needle for alias in aliases):
                # A bare "label:" line must not wipe out the placeholder.
                if value:
                    parsed[key] = value
                break

    return parsed

def extract_perfume_details(driver, wait):
    '''
    Extract name, description and fragrance notes from the current page.

    The three sections are located with absolute XPaths. Extraction is
    best effort: a section that cannot be read keeps its placeholder value
    and a short diagnostic line is printed instead of failing the product.

    Args:
        driver: Selenium driver focused on a product detail page.
        wait: ``WebDriverWait`` used to wait for the name element.

    Returns:
        Dict with keys ``name``, ``description``, ``scent_notes``,
        ``top_notes``, ``middle_notes`` and ``base_notes``.
    '''
    info = {
        "name": "未知名称",
        "description": "未提供",
        "scent_notes": "未提供",
        "top_notes": "未提供",
        "middle_notes": "未提供",
        "base_notes": "未提供",
    }

    NAME_XPATH = "/html/body/div[1]/div[2]/div[2]/div[2]/div[1]"
    DESC_XPATH = "/html/body/div[1]/div[2]/div[2]/div[2]/div[3]"
    NOTES_XPATH = "/html/body/div[1]/div[2]/div[2]/div[2]/div[4]"

    try:
        name_el = wait.until(EC.presence_of_element_located((By.XPATH, NAME_XPATH)))
        name_text = name_el.text.strip()
        # Keep the placeholder when the element exists but is empty,
        # mirroring how the description is handled below.
        if name_text:
            info["name"] = name_text
    except TimeoutException:
        print("    名称加载超时")

    try:
        desc_el = driver.find_element(By.XPATH, DESC_XPATH)
        desc_text = desc_el.text.strip()
        if desc_text:
            info["description"] = desc_text
    except Exception as exc:
        # Deliberately broad (best effort), but no longer silent.
        print(f"    描述提取失败: {type(exc).__name__}")

    try:
        notes_el = driver.find_element(By.XPATH, NOTES_XPATH)
        notes_lines = [line.strip() for line in notes_el.text.splitlines() if line.strip()]
        info.update(parse_fragrance_section(notes_lines))
    except Exception as exc:
        # Deliberately broad (best effort), but no longer silent.
        print(f"    香调信息提取失败: {type(exc).__name__}")

    return info

def save_to_csv(data, filename="perfume_data.csv"):
    '''
    Write a list of row dicts to a UTF-8 CSV file.

    The header is the union of the keys of all rows, in first-seen order,
    so rows with differing key sets are written with blanks for missing
    fields rather than raising ``ValueError``.

    Args:
        data: Sequence of dicts, one per row. When empty or ``None``,
            nothing is written and a notice is printed.
        filename: Destination path; overwritten if it already exists.
    '''
    if not data:
        print("没有数据可保存")
        return

    # dict.fromkeys de-duplicates while preserving insertion order.
    fieldnames = list(dict.fromkeys(key for row in data for key in row))

    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

    print(f"数据已保存至 {filename}")

def main():
    '''
    Entry point: open the browser, run the scrape, preview the results.

    Up to three scraped products are printed as a preview. Any exception
    raised during the run is reported on stdout, and the browser is shut
    down whether or not the run succeeded.
    '''
    driver = open_driver(url)

    try:
        print("开始抓取香水数据...")
        results = scrape_perfume_data(driver)
        print(f"总共抓取了 {len(results)} 条香水数据")

        # Show only the first three products as a preview.
        preview = results[:3]
        for number, record in enumerate(preview, start=1):
            print(f"\n香水 {number}:")
            for field in record:
                print(f"  {field}: {record[field]}")
    except Exception as err:
        print(f"执行过程中出现错误: {str(err)}")
    finally:
        driver.quit()

In [None]:
# Run the scraper
if __name__ == "__main__":
    main()

开始抓取香水数据...
共收集到 666 个商品链接需要采集
正在处理第 1 个商品: https://www.1976.com.tw/prod/23308
    -> 成功提取: Guerlain Rosa Verde 嬌蘭花草水語 夏沁玫瑰淡香水
正在处理第 2 个商品: https://www.1976.com.tw/prod/23307
    -> 成功提取: Monotheme Verbena 吟遊詩人 檸檬馬鞭草淡香水
正在处理第 3 个商品: https://www.1976.com.tw/prod/23305
    -> 成功提取: Monotheme Honeysuckle 植然忍冬中性淡香水
正在处理第 4 个商品: https://www.1976.com.tw/prod/23304
    -> 成功提取: Serge Lutens La Fille de Berlin 蘆丹氏 柏林少女淡香精迷你瓶
正在处理第 5 个商品: https://www.1976.com.tw/prod/23303
    -> 成功提取: Maison Margiela Untitled 無題淡香精
正在处理第 6 个商品: https://www.1976.com.tw/prod/23300
    -> 成功提取: Jo Malone 黑白配輕巧香氛組
正在处理第 7 个商品: https://www.1976.com.tw/prod/23296
    -> 成功提取: Tom Ford Oud Minerale 海洋烏木淡香精
正在处理第 8 个商品: https://www.1976.com.tw/prod/23294
    -> 成功提取: Penhaligon's 潘海利根 香氛圖書館獸首肖像禮盒
正在处理第 9 个商品: https://www.1976.com.tw/prod/23293
    -> 成功提取: Diptyque Fleur de Peau 肌膚之華中性淡香水
正在处理第 10 个商品: https://www.1976.com.tw/prod/23292
