# Scent4You

# Python Packages

* pandas
* selenium
* webdriver_manager
* BeautifulSoup (`beautifulsoup4`, imported as `bs4`)

# Data Collection

Our dataset was collected from [Perfume 1976](https://www.1976.com.tw/index) and [Wikiparfum](https://www.wikiparfum.com/zh/).

[Perfume 1976](https://www.1976.com.tw/index) is a website dedicated to selling perfumes. We collected information on 2,507 perfume products currently being sold on this site, including their descriptions, top/middle/base notes, and fragrance types.

[Wikiparfum](https://www.wikiparfum.com/zh/) bills itself as “the world's first perfume encyclopedia,” functioning as a Wikipedia for the fragrance industry. The website compiles detailed data on perfumes and fragrance materials. We obtained their classification data for fragrance materials from this website.

## Collecting Data on Perfume 1976

In [None]:
# Guard cell: raises unconditionally so that executing this combined document
# stops here and points the reader at the standalone notebook instead.
raise RuntimeError(
    "It is not recommended to run code in this file. "
    "Please visit https://github.com/wilson-lyc/Scent4You "
    "to access the standalone source code. "
    "This section corresponds to the file: 1976_data_collect.ipynb."
)

In [None]:
# Necessary packages
import time
import json
import os
import csv
import re

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.expected_conditions import *
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException

In [None]:
# Target URL: the category listing page handed to open_driver().
# Only one assignment is active at a time; uncomment another line (and comment
# out the current one) to scrape a different category.
url = 'https://www.1976.com.tw/cat/27?t=all' # female
# url = 'https://www.1976.com.tw/cat/40?t=all' # male
# url = 'https://www.1976.com.tw/cat/56?t=all' # neutral

In [None]:
# Global variables
# Throttling delays, all in seconds (each value is passed to time.sleep()).
SCROLL_PAUSE = 0.5         # Wait time after scrolling the list
DETAIL_LOAD_PAUSE = 0.5    # Wait time after opening the detail page
BETWEEN_ITEMS_PAUSE = 0.3  # Interval between switching items
BATCH_PAUSE = 1            # Extra pause after each batch of items (every 10th item)

In [None]:
def open_driver(url):
    '''
    Start a Chrome session and load ``url``.

    The driver binary is resolved with webdriver-manager first; if that fails
    for any reason, a ``chromedriver.exe`` reachable from the working
    directory is tried instead.

    Args:
        url: Address of the page to open once the browser is running.

    Returns:
        The running ``webdriver.Chrome`` instance, with a 5-second implicit
        wait configured and the window maximized.

    Raises:
        RuntimeError: If neither driver source can start Chrome. The message
            carries both underlying errors.
        Exception: Any error raised while loading the page or configuring the
            window is re-raised after the browser has been shut down.
    '''
    options = webdriver.ChromeOptions()
    options.add_experimental_option('excludeSwitches', ['enable-logging'])
    options.add_argument("--incognito")
    # NOTE(review): these preferences look intended to switch off autofill and
    # password-manager prompts -- confirm against the Chrome version in use.
    prefs = {
        "profile.default_content_setting_values.autofill": 2,
        "profile.password_manager_enabled": False,
        "credentials_enable_service": False,
        "autofill.profile_enabled": False,
        "autofill.address_enabled": False,
        "autofill.credit_card_enabled": False,
    }
    options.add_experimental_option("prefs", prefs)

    driver = None

    try:
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=options)
    except Exception as e1:
        fallback_path = r"chromedriver.exe"
        try:
            service = Service(fallback_path)
            driver = webdriver.Chrome(service=service, options=options)
        except Exception as e2:
            # Chain the last failure so the full traceback stays available.
            raise RuntimeError(
                f"Failed to initialize ChromeDriver:\n1) webdriver-manager error: {e1}\n2) Local driver error: {e2}"
            ) from e2

    try:
        driver.get(url)
        driver.implicitly_wait(5)
        driver.maximize_window()
    except Exception:
        # The caller never receives the driver on this path, so nobody else
        # could close the browser: shut it down here before propagating.
        driver.quit()
        raise
    return driver

In [None]:
def collect_detail_links(driver, wait, max_scroll_rounds=12):
    """
    Collect all detail links from the product list, handling lazy loading.

    Each round waits for product anchors to be present, records every new
    ``href``, scrolls down by two viewport heights, and stops early once the
    document height no longer changes.

    Args:
        driver: Selenium driver currently showing the product list page.
        wait: ``WebDriverWait`` used to wait for the product anchors.
        max_scroll_rounds: Upper bound on the number of wait/collect/scroll
            rounds.

    Returns:
        List of unique detail-page URLs in first-seen order.
    """
    selector = "div.item.prod-item a.inlineblock"
    collected = []
    seen = set()

    def harvest():
        """Record unseen hrefs; return False when no anchor is on the page."""
        anchors = driver.find_elements(By.CSS_SELECTOR, selector)
        for anchor in anchors:
            try:
                href = anchor.get_attribute("href")
            except StaleElementReferenceException:
                # The element was re-rendered while iterating; skip it.
                continue
            if href and href not in seen:
                seen.add(href)
                collected.append(href)
        return bool(anchors)

    last_height = 0
    growth_pending = False
    for _ in range(max_scroll_rounds):
        # This round either harvests the page again or gives up on it.
        growth_pending = False
        try:
            wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector)))
        except TimeoutException:
            print("等待商品列表加载超时，提前结束")
            break

        if not harvest():
            break

        driver.execute_script("window.scrollBy(0, window.innerHeight * 2);")
        time.sleep(SCROLL_PAUSE)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
        growth_pending = True

    if growth_pending:
        # The round budget ran out right after a scroll that grew the page;
        # pick up whatever that last scroll loaded instead of dropping it.
        harvest()

    return collected

In [None]:
def parse_fragrance_section(lines):
    """
    Parse scent notes and top/middle/base notes information.

    Every line is classified by the label alias that occurs earliest in it.
    The text before the first colon (full-width or ASCII) is searched first,
    so an alias that merely appears inside the value (for example a top-note
    list mentioning a fragrance family) cannot hijack the line. If that label
    text contains no alias, the whole line is searched instead. Matching
    ignores whitespace and letter case.

    Args:
        lines: Iterable of text lines taken from the notes section.

    Returns:
        Dict with the keys ``fragrance``, ``top_notes``, ``middle_notes`` and
        ``base_notes``. A matched line stores the text after its first colon
        (or the whole line when it has no colon); keys that never match keep
        the placeholder "未提供". When several lines match the same key, the
        last one wins.
    """
    parsed = {
        "fragrance": "未提供",
        "top_notes": "未提供",
        "middle_notes": "未提供",
        "base_notes": "未提供",
    }

    alias_map = {
        "fragrance": ["香調", "香调"],
        "top_notes": ["前味", "前調", "前调", "TopNotes"],
        "middle_notes": ["中味", "中調", "中调", "MiddleNotes", "HeartNotes"],
        "base_notes": ["後味", "后味", "後調", "后調", "后调", "BaseNotes"],
    }
    folded_aliases = {
        key: [alias.casefold() for alias in aliases]
        for key, aliases in alias_map.items()
    }

    def earliest_key(text):
        """Return the key whose alias starts first in ``text`` (or None)."""
        normalized = re.sub(r"\s+", "", text).casefold()
        best_key = None
        best_pos = None
        for key, aliases in folded_aliases.items():
            hits = [pos for pos in (normalized.find(alias) for alias in aliases) if pos >= 0]
            if hits and (best_pos is None or min(hits) < best_pos):
                best_key, best_pos = key, min(hits)
        return best_key

    for line in lines:
        # Split on whichever colon style comes first in the line.
        parts = re.split(r"[：:]", line, maxsplit=1)
        if len(parts) == 2:
            label, value = parts[0], parts[1].strip()
        else:
            label, value = line, line

        key = earliest_key(label)
        if key is None and label is not line:
            # Unlabelled prefix: fall back to scanning the entire line.
            key = earliest_key(line)
        if key is not None:
            parsed[key] = value

    return parsed

In [None]:
def extract_perfume_details(driver, wait):
    """
    Extract detailed perfume information from the detail page.

    Extraction is best-effort: each of the three page sections (name,
    description, notes) is read independently, and a section that cannot be
    read leaves its placeholder value in place and prints a short notice.

    Args:
        driver: Selenium driver whose current window shows a detail page.
        wait: ``WebDriverWait`` used to wait for the name element.

    Returns:
        Dict with the keys ``name``, ``description``, ``fragrance``,
        ``top_notes``, ``middle_notes`` and ``base_notes``.
    """
    info = {
        "name": "未知名称",
        "description": "未提供",
        "fragrance": "未提供",
        "top_notes": "未提供",
        "middle_notes": "未提供",
        "base_notes": "未提供",
    }

    # Absolute paths into the page layout; they break if the site's markup changes.
    NAME_XPATH = "/html/body/div[1]/div[2]/div[2]/div[2]/div[1]"
    DESC_XPATH = "/html/body/div[1]/div[2]/div[2]/div[2]/div[3]"
    NOTES_XPATH = "/html/body/div[1]/div[2]/div[2]/div[2]/div[4]"

    try:
        name_el = wait.until(EC.presence_of_element_located((By.XPATH, NAME_XPATH)))
    except TimeoutException:
        print("Name loading timeout")
    else:
        name_text = name_el.text.strip()
        # Keep the placeholder when the element exists but is empty, the same
        # way an empty description is handled below.
        if name_text:
            info["name"] = name_text

    try:
        desc_text = driver.find_element(By.XPATH, DESC_XPATH).text.strip()
    except Exception as exc:
        # Still best-effort, but no longer silent.
        print(f"Description not extracted ({type(exc).__name__})")
    else:
        if desc_text:
            info["description"] = desc_text

    try:
        notes_el = driver.find_element(By.XPATH, NOTES_XPATH)
        notes_lines = [line.strip() for line in notes_el.text.splitlines() if line.strip()]
    except Exception as exc:
        print(f"Notes not extracted ({type(exc).__name__})")
    else:
        info.update(parse_fragrance_section(notes_lines))

    return info

In [None]:
def visit_detail_page(driver, wait, detail_url):
    """
    Open the detail page in a new tab, extract its data, and close the tab.

    Args:
        driver: Selenium driver; its current window is treated as the list
            page and is focused again before returning.
        wait: ``WebDriverWait`` forwarded to ``extract_perfume_details``.
        detail_url: Address of the product detail page.

    Returns:
        The extracted info dict with an added ``detail_url`` entry, or
        ``None`` if anything failed while opening or reading the page.
    """
    # Remember the list window so cleanup can never close it by mistake.
    list_handle = driver.current_window_handle
    try:
        driver.execute_script("window.open(arguments[0], '_blank');", detail_url)
        driver.switch_to.window(driver.window_handles[-1])
        time.sleep(DETAIL_LOAD_PAUSE)
        perfume_info = extract_perfume_details(driver, wait)
        perfume_info["detail_url"] = detail_url
        print(f"    -> 成功提取: {perfume_info['name']}")
        return perfume_info
    except Exception as exc:
        print(f"    -> 抓取 {detail_url} 时出错: {exc}")
        return None
    finally:
        # Close only the tabs opened for this visit. If the new tab never
        # opened, the current window is still the list page and must survive.
        for handle in driver.window_handles:
            if handle != list_handle:
                driver.switch_to.window(handle)
                driver.close()
        driver.switch_to.window(list_handle)
        time.sleep(BETWEEN_ITEMS_PAUSE / 2)

In [None]:
def scrape_perfume_data(driver, csv_filename="perfume_data.csv"):
    """
    Scrape every product reachable from the list page and stream it to CSV.

    The detail links are gathered first; each one is then visited in turn.
    A successfully extracted record is appended to the result list and
    written (and flushed) to ``csv_filename`` immediately, so the rows
    written so far are on disk even if a later item fails.

    Args:
        driver: Selenium driver currently showing the product list page.
        csv_filename: Output path; the file is created or overwritten.

    Returns:
        List of the extracted record dicts, in visiting order.
    """
    fieldnames = [
        "name",
        "description",
        "fragrance",
        "top_notes",
        "middle_notes",
        "base_notes",
        "detail_url",
    ]
    wait = WebDriverWait(driver, 15)
    links = collect_detail_links(driver, wait)
    records = []

    with open(csv_filename, 'w', newline='', encoding='utf-8') as out_file:
        writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        writer.writeheader()

        print(f"共收集到 {len(links)} 个商品链接需要采集")

        position = 0
        for link in links:
            position += 1
            print(f"正在处理第 {position} 个商品: {link}")

            record = visit_detail_page(driver, wait, link)
            if record:
                records.append(record)
                row = {}
                for column in fieldnames:
                    row[column] = record.get(column, "")
                writer.writerow(row)
                # Flush after every row so progress is persisted right away.
                out_file.flush()

            time.sleep(BETWEEN_ITEMS_PAUSE)
            # Take a longer breather after every tenth item.
            if position % 10 == 0:
                time.sleep(BATCH_PAUSE)

    return records

In [None]:
def save_to_csv(data, filename="perfume_data.csv"):
    """
    Write a list of record dicts to a UTF-8 CSV file.

    The header is the union of the keys of all records, in first-seen order,
    so records with differing key sets are accepted; a field missing from a
    record is written as an empty string.

    Args:
        data: Sequence of dicts, one per row. When empty (or ``None``), a
            notice is printed and no file is written.
        filename: Output path; the file is created or overwritten.

    Returns:
        None.
    """
    if not data:
        print("No data to save")
        return

    # dict.fromkeys de-duplicates while keeping the first-seen column order.
    fieldnames = list(dict.fromkeys(key for row in data for key in row))

    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, restval="")
        writer.writeheader()
        writer.writerows(data)

    print(f"Data saved to {filename}")

In [None]:
# Entry point for the Perfume 1976 scrape: open the browser, scrape the
# selected category, preview a few records, and always close the browser.
driver = open_driver(url)
try:
    print("Start scraping perfume data...")
    perfumes_data = scrape_perfume_data(driver)
    print(f"Total {len(perfumes_data)} perfume records scraped")

    # Preview at most the first three records, one field per line
    for number, record in enumerate(perfumes_data[:3], start=1):
        print(f"\nPerfume {number}:")
        for field in record:
            print(f"  {field}: {record[field]}")
except Exception as err:
    print(f"Error occurred during execution: {err}")
finally:
    driver.quit()

Running the above code once for each target URL (female, male, and neutral) yields three CSV files, one per category of perfume. Each file includes the fields name, description, fragrance, top_notes, middle_notes, base_notes, and detail_url. The three files are located in the data folder as:

- `1976_raw_female.csv`
- `1976_raw_male.csv`
- `1976_raw_unisex.csv`

## Collecting Data on Wikiparfum

Due to anti-scraping measures on Wikiparfum, we were unable to fully automate data collection from the site. Instead, we manually saved the HTML code of the required web pages and used the BeautifulSoup package to parse the local HTML files and extract the data we needed.

It is not recommended to run code in this file. Please visit https://github.com/wilson-lyc/Scent4You to access the standalone source code.

This section corresponds to the file: `note_class_collect.ipynb`.

In [None]:
# Necessary packages
import os
import pandas as pd
from bs4 import BeautifulSoup

In [None]:
# Global variables
# Directory (relative to the working directory) that is scanned for the
# saved .html files to parse.
HTML_PATH = os.path.join("html", "wikiparfum")

In [None]:
# Extract notes and their classes
# Each saved page is one class: the file name (without extension) is used as
# the class label, and every note name found in the page is paired with it.
note_span_class = 'break-all sm:break-normal text-16 mb-1 text-black block overflow-hidden text-ellipsis'
records = []

# Sorted so that the row order of the result does not depend on the
# arbitrary order in which the file system lists the directory.
for filename in sorted(os.listdir(HTML_PATH)):
    if not filename.endswith('.html'):
        continue

    file_path = os.path.join(HTML_PATH, filename)
    with open(file_path, 'r', encoding='utf-8') as f:
        html = f.read()

    class_name = os.path.splitext(filename)[0]
    print(f"===== {class_name} =====")

    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all("div", class_='items-start')
    print(f"Total notes: {len(items)}")

    notes_list = []
    for item in items:
        span = item.find("span", class_=note_span_class)
        if span is None:
            # A matching container without a name span carries no note.
            continue
        notes_list.append(span.get_text(strip=True))
    print(notes_list)

    records.extend((note, class_name) for note in notes_list)

# Build the frame once; this also defines `df` when no HTML file was found.
df = pd.DataFrame(records, columns=['note', 'class'])

In [None]:
# Save
# Create the output folder first so the write does not fail on a fresh checkout.
os.makedirs("data", exist_ok=True)
df.to_csv("data/note_class.csv", index=False)

Running this part of the code will generate `note_class.csv` in the `data` folder, recording the fragrance materials and their classification.