# Scent4You

# Python Packages

* pandas
* selenium
* webdriver_manager
* BeautifulSoup (imported as `bs4`)
* opencc
* dashscope

# Data Collection

Our dataset was collected from [Perfume 1976](https://www.1976.com.tw/index) and [Wikiparfum](https://www.wikiparfum.com/zh/).

[Perfume 1976](https://www.1976.com.tw/index) is a website dedicated to selling perfumes. We collected information on 2,507 perfume products currently being sold on this site, including their descriptions, top/middle/base notes, and fragrance types.

[Wikiparfum](https://www.wikiparfum.com/zh/) bills itself as “the world's first perfume encyclopedia,” functioning as a Wikipedia for the fragrance industry. The website compiles detailed data on perfumes and fragrance materials. We obtained their classification data for fragrance materials from this website.

## Collecting Data on Perfume 1976

In [None]:
# Guard cell: raises unconditionally so that executing this combined document
# stops here, before any of the scraping cells below can run.
raise RuntimeError(
    "It is not recommended to run code in this file. "
    "Please visit https://github.com/wilson-lyc/Scent4You "
    "to access the standalone source code. "
    "This section corresponds to the file: 1976_data_collect.ipynb."
)

In [None]:
# Necessary packages
import time
import json
import os
import csv
import re

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.expected_conditions import *
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException

In [None]:
# Target URL: one category listing is scraped per run. To scrape another
# category, comment out the active assignment and uncomment one of the others.
url = 'https://www.1976.com.tw/cat/27?t=all' # female
# url = 'https://www.1976.com.tw/cat/40?t=all' # male
# url = 'https://www.1976.com.tw/cat/56?t=all' # neutral

In [None]:
# Global variables
# Throttling delays; every value is passed to time.sleep(), i.e. it is in seconds.
SCROLL_PAUSE = 0.5         # Wait time after scrolling the list
DETAIL_LOAD_PAUSE = 0.5    # Wait time after opening the detail page
BETWEEN_ITEMS_PAUSE = 0.3  # Interval between switching items
BATCH_PAUSE = 1            # Extra pause after each batch of items (every 10th item)

In [None]:
def open_driver(url):
    """
    Start an incognito Chrome session, load ``url`` and return the driver.

    The driver binary is resolved through webdriver-manager first; if that
    fails for any reason, a ``chromedriver.exe`` in the working directory is
    tried as a fallback.

    Args:
        url: Address of the page to open once the browser is running.

    Returns:
        The ``webdriver.Chrome`` instance with the page loaded, a 5 second
        implicit wait configured and the window maximized.

    Raises:
        RuntimeError: If neither the managed nor the local driver can start.
            The message carries both underlying errors.
    """
    options = webdriver.ChromeOptions()
    options.add_experimental_option('excludeSwitches', ['enable-logging'])
    options.add_argument("--incognito")
    # Disable autofill and password-manager features so their prompts do not
    # get in the way of the automated session.
    prefs = {
        "profile.default_content_setting_values.autofill": 2,
        "profile.password_manager_enabled": False,
        "credentials_enable_service": False,
        "autofill.profile_enabled": False,
        "autofill.address_enabled": False,
        "autofill.credit_card_enabled": False,
    }
    options.add_experimental_option("prefs", prefs)

    try:
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=options)
    except Exception as e1:
        fallback_path = r"chromedriver.exe"
        try:
            service = Service(fallback_path)
            driver = webdriver.Chrome(service=service, options=options)
        except Exception as e2:
            # Chain explicitly so the traceback names the local-driver failure
            # as the direct cause.
            raise RuntimeError(f"Failed to initialize ChromeDriver:\n1) webdriver-manager error: {e1}\n2) Local driver error: {e2}") from e2

    try:
        driver.get(url)
        driver.implicitly_wait(5)
        driver.maximize_window()
    except Exception:
        # The caller never receives the driver in this case, so it has to be
        # shut down here or the browser process would be leaked.
        driver.quit()
        raise
    return driver

In [None]:
def _harvest_detail_links(driver, seen, collected):
    """
    Append every not-yet-seen product href currently in the DOM to ``collected``.

    Returns the number of product anchors found (0 means the list is empty).
    """
    anchors = driver.find_elements(By.CSS_SELECTOR, "div.item.prod-item a.inlineblock")
    for anchor in anchors:
        try:
            href = anchor.get_attribute("href")
        except StaleElementReferenceException:
            # The element was re-rendered while iterating; skip it for now.
            continue
        if href and href not in seen:
            seen.add(href)
            collected.append(href)
    return len(anchors)


def collect_detail_links(driver, wait, max_scroll_rounds=12):
    """
    Collect all detail links from the product list, handling lazy loading.

    Each round waits for product anchors, records their hrefs, then scrolls
    down two viewport heights and pauses for ``SCROLL_PAUSE`` seconds.

    Args:
        driver: Selenium driver already showing the product list page.
        wait: ``WebDriverWait`` used to wait for the product anchors.
        max_scroll_rounds: Upper bound on the number of scroll rounds.

    Returns:
        List of unique detail-page URLs in the order they were first seen.
    """
    collected = []
    seen = set()
    last_height = 0

    for _ in range(max_scroll_rounds):
        try:
            wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.item.prod-item a.inlineblock")))
        except TimeoutException:
            print("等待商品列表加载超时，提前结束")
            break

        if not _harvest_detail_links(driver, seen, collected):
            break

        driver.execute_script("window.scrollBy(0, window.innerHeight * 2);")
        time.sleep(SCROLL_PAUSE)
        new_height = driver.execute_script("return document.body.scrollHeight")
        at_bottom = driver.execute_script(
            "return window.innerHeight + window.pageYOffset >= document.body.scrollHeight - 2;"
        )
        # An unchanged height only proves the list is complete once the
        # viewport has reached the bottom; before that, nothing may have
        # triggered loading of further items yet.
        if new_height == last_height and at_bottom:
            break
        last_height = new_height
    else:
        # The round budget ran out right after a scroll: pick up whatever that
        # last scroll loaded, otherwise those items would be silently dropped.
        _harvest_detail_links(driver, seen, collected)

    return collected

In [None]:
def parse_fragrance_section(lines):
    """
    Parse the fragrance type and top/middle/base notes from text lines.

    A line is expected to look like ``<label>：<value>`` (full-width or ASCII
    colon). The label is matched against a list of known aliases; only the
    label part is inspected, so a value that happens to contain another
    label's keyword (e.g. ``前味：柑橘香調``) is not misfiled. Lines without a
    colon are matched as a whole and stored unchanged.

    Args:
        lines: Iterable of already-stripped text lines.

    Returns:
        Dict with the keys ``fragrance``, ``top_notes``, ``middle_notes`` and
        ``base_notes``. Fields that never matched keep the placeholder
        ``"未提供"``; if a field matches several lines, the last one wins.
    """
    parsed = {
        "fragrance": "未提供",
        "top_notes": "未提供",
        "middle_notes": "未提供",
        "base_notes": "未提供",
    }

    alias_map = {
        "fragrance": ["香調", "香调"],
        "top_notes": ["前味", "前調", "前调", "TopNotes"],
        "middle_notes": ["中味", "中調", "中调", "MiddleNotes", "HeartNotes"],
        "base_notes": ["後味", "后味", "後調", "后調", "后调", "BaseNotes"],
    }

    for line in lines:
        # Split at the first colon of either kind: left is the label, right
        # is the value.
        pieces = re.split(r"[：:]", line, maxsplit=1)
        if len(pieces) == 2:
            label, value = pieces[0], pieces[1].strip()
        else:
            label, value = line, line

        # Whitespace is dropped and case folded so "Top Notes" / "top notes"
        # both match the alias "TopNotes".
        normalized = re.sub(r"\s+", "", label).casefold()
        for key, aliases in alias_map.items():
            if any(alias.casefold() in normalized for alias in aliases):
                parsed[key] = value
                break

    return parsed

In [None]:
def extract_perfume_details(driver, wait):
    """
    Read name, description and fragrance notes from the open detail page.

    The name is waited for explicitly; description and notes are looked up
    afterwards on a best-effort basis and keep their placeholder value when
    the lookup fails.

    Args:
        driver: Selenium driver whose current window shows a detail page.
        wait: ``WebDriverWait`` used while waiting for the name element.

    Returns:
        Dict with the keys ``name``, ``description``, ``fragrance``,
        ``top_notes``, ``middle_notes`` and ``base_notes``.
    """
    # Absolute XPaths of the three page sections that are read.
    name_xpath = "/html/body/div[1]/div[2]/div[2]/div[2]/div[1]"
    desc_xpath = "/html/body/div[1]/div[2]/div[2]/div[2]/div[3]"
    notes_xpath = "/html/body/div[1]/div[2]/div[2]/div[2]/div[4]"

    # Start from placeholders so every key exists even if nothing is found.
    details = {"name": "未知名称"}
    for field in ("description", "fragrance", "top_notes", "middle_notes", "base_notes"):
        details[field] = "未提供"

    try:
        name_locator = (By.XPATH, name_xpath)
        details["name"] = wait.until(EC.presence_of_element_located(name_locator)).text.strip()
    except TimeoutException:
        print("Name loading timeout")

    try:
        description = driver.find_element(By.XPATH, desc_xpath).text.strip()
        if description:
            details["description"] = description
    except Exception:
        # Best effort: the placeholder stays in place.
        pass

    try:
        raw_notes = driver.find_element(By.XPATH, notes_xpath).text
        stripped_lines = (raw_line.strip() for raw_line in raw_notes.splitlines())
        non_empty_lines = [text for text in stripped_lines if text]
        details.update(parse_fragrance_section(non_empty_lines))
    except Exception:
        # Best effort: the placeholders stay in place.
        pass

    return details

In [None]:
def visit_detail_page(driver, wait, detail_url):
    """
    Open the detail page in a new tab, extract its data and close the tab.

    Args:
        driver: Selenium driver; the window that is active on entry is
            treated as the listing window and is re-activated before
            returning.
        wait: ``WebDriverWait`` forwarded to ``extract_perfume_details``.
        detail_url: URL of the product detail page.

    Returns:
        The extracted info dict with an added ``detail_url`` key, or ``None``
        if anything went wrong while opening or reading the page.
    """
    origin_handle = driver.current_window_handle
    handles_before = set(driver.window_handles)
    detail_handle = None
    try:
        driver.execute_script("window.open(arguments[0], '_blank');", detail_url)
        new_handles = [h for h in driver.window_handles if h not in handles_before]
        if not new_handles:
            raise RuntimeError("the detail tab was not opened")
        detail_handle = new_handles[-1]
        driver.switch_to.window(detail_handle)
        time.sleep(DETAIL_LOAD_PAUSE)
        perfume_info = extract_perfume_details(driver, wait)
        perfume_info["detail_url"] = detail_url
        print(f"    -> 成功提取: {perfume_info['name']}")
        return perfume_info
    except Exception as exc:
        print(f"    -> 抓取 {detail_url} 时出错: {exc}")
        return None
    finally:
        # Close only a tab this call actually opened. Closing "the current
        # window" unconditionally would close the listing window (and with it
        # the whole session) whenever the new tab failed to open.
        if detail_handle is not None and detail_handle in driver.window_handles:
            driver.switch_to.window(detail_handle)
            driver.close()
        driver.switch_to.window(origin_handle)
        time.sleep(BETWEEN_ITEMS_PAUSE / 2)

In [None]:
def scrape_perfume_data(driver, csv_filename="perfume_data.csv"):
    """
    Scrape every product reachable from the current list page.

    All detail links are collected first. Each page is then visited in turn
    and its record is written to ``csv_filename`` immediately, so partial
    results survive an interruption.

    Args:
        driver: Selenium driver already showing the product list page.
        csv_filename: Path of the CSV file to (over)write.

    Returns:
        List of the successfully extracted info dicts, in visiting order.
    """
    columns = [
        "name",
        "description",
        "fragrance",
        "top_notes",
        "middle_notes",
        "base_notes",
        "detail_url",
    ]
    wait = WebDriverWait(driver, 15)
    links = collect_detail_links(driver, wait)
    records = []

    with open(csv_filename, 'w', newline='', encoding='utf-8') as out_file:
        writer = csv.DictWriter(out_file, fieldnames=columns)
        writer.writeheader()

        print(f"共收集到 {len(links)} 个商品链接需要采集")

        position = 0
        for detail_url in links:
            position += 1
            print(f"正在处理第 {position} 个商品: {detail_url}")
            record = visit_detail_page(driver, wait, detail_url)
            if record:
                records.append(record)
                row = {}
                for column in columns:
                    row[column] = record.get(column, "")
                writer.writerow(row)
                # Push the row to disk right away.
                out_file.flush()
            time.sleep(BETWEEN_ITEMS_PAUSE)
            # Longer pause after every tenth item.
            if position % 10 == 0:
                time.sleep(BATCH_PAUSE)

    return records

In [None]:
def save_to_csv(data, filename="perfume_data.csv"):
    """
    Write a list of dicts to a UTF-8 CSV file.

    The header is taken from the keys of the first record. Nothing is written
    when ``data`` is empty.

    Args:
        data: List of dicts sharing the same keys.
        filename: Path of the CSV file to (over)write.
    """
    if data:
        header = data[0].keys()
        with open(filename, 'w', newline='', encoding='utf-8') as out_file:
            writer = csv.DictWriter(out_file, fieldnames=header)
            writer.writeheader()
            for record in data:
                writer.writerow(record)
        print(f"Data saved to {filename}")
    else:
        print("No data to save")

In [None]:
# Entry point of the scraper: open the browser on the selected category page,
# scrape it, preview the first records and always shut the browser down.
driver = open_driver(url)
try:
    print("Start scraping perfume data...")
    # Uses the default output file name of scrape_perfume_data (perfume_data.csv).
    perfumes_data = scrape_perfume_data(driver)
    print(f"Total {len(perfumes_data)} perfume records scraped")

    # Print part of the results
    for i, perfume in enumerate(perfumes_data[:3]):  # Only print the first 3
        print(f"\nPerfume {i+1}:")
        for key, value in perfume.items():
            print(f"  {key}: {value}")

except Exception as e:
    # Top-level boundary: report the failure instead of raising, so that the
    # cleanup below is the last thing that happens.
    print(f"Error occurred during execution: {str(e)}")

finally:
    # Runs on success and on failure, so the browser is never left open.
    driver.quit()

The code above scrapes one category per run and writes `perfume_data.csv` by default, so it must be run once for each target URL (female, male, and neutral) and the output saved under a separate name each time. This yields three CSV files, one per category, each containing name, description, fragrance, top_notes, middle_notes, base_notes, and detail_url. The three files are located in the data folder as:

- `1976_raw_female.csv`
- `1976_raw_male.csv`
- `1976_raw_neutral.csv`

## Collecting Data on Wikiparfum

Due to anti-scraping measures on Wikiparfum, we were unable to fully automate data collection from the site. Instead, we manually saved the HTML code of the required web pages and used the BeautifulSoup package to parse the local HTML files and extract the data we needed.

It is not recommended to run code in this file. Please visit https://github.com/wilson-lyc/Scent4You to access the standalone source code.

This section corresponds to the file: `note_class_collect.ipynb`.

In [None]:
# Necessary packages
import os
import pandas as pd
from bs4 import BeautifulSoup

In [None]:
# Global variables
# Directory (relative to the working directory) holding the manually saved
# Wikiparfum pages; every *.html file in it is parsed below.
HTML_PATH = os.path.join("html", "wikiparfum")

In [None]:
# Extract notes and their classes
# Each saved page is one class: the file name (without ".html") is used as the
# class label and every note found on the page is assigned to it.
frames = []  # one DataFrame per parsed page, concatenated once after the loop

# Sorted so the row order (and therefore which class wins when duplicates are
# dropped later) does not depend on the file system's listing order.
for filename in sorted(os.listdir(HTML_PATH)):
    if not filename.endswith('.html'):
        continue

    file_path = os.path.join(HTML_PATH, filename)
    with open(file_path, 'r', encoding='utf-8') as f:
        html = f.read()

    class_name = os.path.splitext(filename)[0]
    print(f"===== {class_name} =====")

    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all("div", class_='items-start')
    print(f"Total notes: {len(items)}")

    notes_list = []
    for item in items:
        span = item.find("span", class_='break-all sm:break-normal text-16 mb-1 text-black block overflow-hidden text-ellipsis')
        if span is None:
            # Not every matching container has to hold a note label; skip
            # those instead of failing on None.get_text().
            continue
        notes_list.append(span.get_text(strip=True))
    print(notes_list)

    temp_df = pd.DataFrame({
        'note': notes_list,
        'class': [class_name] * len(notes_list)
    })
    frames.append(temp_df)

# `df` is created here; it is not defined before this cell.
if frames:
    df = pd.concat(frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=['note', 'class'])

In [None]:
# Save the note/class table; index=False keeps the row index out of the CSV.
df.to_csv("data/note_class.csv", index=False)

Running this part of the code will generate `note_class.csv` in the `data` folder, recording the fragrance materials and their classification.

# Data Cleaning

Before conducting data visualization and model training, it is essential to preprocess the data collected from online sources to ensure that it can be used in subsequent analyses. The data requiring cleaning includes **perfume data** (from [Perfume 1976](https://www.1976.com.tw/index)) and **note classification data** (from [Wikiparfum](https://www.wikiparfum.com/zh/)).

## Cleaning Perfume Data

Perfume Data consists of [1976_raw_female.csv](data/1976_raw_female.csv), [1976_raw_male.csv](data/1976_raw_male.csv), and [1976_raw_neutral.csv](data/1976_raw_neutral.csv). In this section, we:

* Add a gender column

* Merge the three datasets

* Translate Traditional Chinese into Simplified Chinese

* Remove line breaks (`\r`, `\n`) from the description column

* Delete records with empty top/middle/base notes or fragrance

* Remove extra spaces in the notes and fragrance

* Choose the first fragrance as classification

* Count the occurrences of each note and each fragrance

* Unify the fragrance labels

It is not recommended to run code in this file. Please visit https://github.com/wilson-lyc/Scent4You to access the standalone source code.

This section corresponds to the file: [1976_data_clean.ipynb](1976_data_clean.ipynb).

In [None]:
# Necessary packages
import pandas as pd
import opencc
import csv

In [None]:
# Load uncleaned data
# One raw CSV per gender category, kept in separate frames for now.
df_male = pd.read_csv("data/1976_raw_male.csv")
df_female = pd.read_csv("data/1976_raw_female.csv")
df_neutral = pd.read_csv("data/1976_raw_neutral.csv")

In [None]:
# Add a gender column
# Every row of a frame receives the same constant label, taken from the file
# the frame was loaded from.
df_male['gender'] = 'male'
df_female['gender'] = 'female'
df_neutral['gender'] = 'neutral'

In [None]:
# Data merging
# Stack the three frames vertically; ignore_index=True gives the result a
# fresh 0..n-1 row index.
df = pd.concat([df_male, df_female, df_neutral], ignore_index=True)

In [None]:
# Translate Traditional Chinese into Simplified Chinese
# 't2s.json' is the OpenCC configuration name passed to the converter.
converter = opencc.OpenCC('t2s.json')
for col in df.columns:
    # URLs are left untouched.
    if col == 'detail_url':
        continue
    # Only object-dtype columns are converted.
    if df[col].dtype == 'object':
        # Non-string cells (e.g. missing values) are passed through unchanged.
        df[col] = df[col].apply(lambda x: converter.convert(x) if isinstance(x, str) else x)

In [None]:
# Remove line breaks from the description column
# The regex matches \r\n, \n and \r; each match is replaced by an empty string,
# so multi-line descriptions are joined into a single line.
if 'description' in df.columns:
    df['description'] = df['description'].replace({r'\r\n|\n|\r': ''}, regex=True)

In [None]:
# Delete records with empty notes or fragrance
required_cols = ['fragrance', 'top_notes', 'middle_notes', 'base_notes']

# The scraper stores the placeholder '未提供' when a field is missing on the
# page. That is an ordinary string, which dropna() would keep, so it is turned
# into a real missing value first. (The placeholder is identical in
# Traditional and Simplified Chinese, so the earlier conversion leaves it
# unchanged.)
df[required_cols] = df[required_cols].mask(df[required_cols] == '未提供')

# Report how many values are missing per column, then drop the affected rows.
print(df[required_cols].isna().sum())
df = df.dropna(subset=required_cols)

In [None]:
# Remove extra spaces in the notes and fragrance
# Only the ASCII space character is removed; it is removed everywhere in the
# string, not just at the ends. Non-string cells are left as they are.
for col in ['fragrance', 'top_notes', 'middle_notes', 'base_notes']:
    if col in df.columns:
        df[col] = df[col].apply(lambda x: x.replace(' ', '') if isinstance(x, str) else x)

In [None]:
# Choose the first fragrance as classification
# Multiple fragrances are separated by the enumeration comma '、'; only the
# part before the first separator is kept.
df['fragrance'] = df['fragrance'].apply(lambda x: x.split('、')[0] if isinstance(x, str) else x)

In [None]:
# Count the number of notes
# A note is counted once per column cell it appears in, across the top,
# middle and base note columns combined.
note_columns = ['top_notes', 'middle_notes', 'base_notes']
note_counts = {}

for col in note_columns:
    if col in df.columns:
        for s in df[col]:
            if isinstance(s, str):
                # Cells hold '、'-separated notes; empty fragments are ignored.
                for item in [n.strip() for n in s.split('、') if n.strip()]:
                    note_counts[item] = note_counts.get(item, 0) + 1

# Save, most frequent note first
with open('data/note_count.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['note', 'count'])
    for item, count in sorted(note_counts.items(), key=lambda x: x[1], reverse=True):
        writer.writerow([item, count])

In [None]:
# Count the number of fragrance
# NOTE(review): unlike the note count, there is no isinstance() guard here, so
# this relies on every remaining fragrance value being a string.
fragrance_counts = {}
for s in df['fragrance']:
    for item in [i.strip() for i in s.split('、') if i.strip()]:
        fragrance_counts[item] = fragrance_counts.get(item, 0) + 1

# Save, most frequent fragrance first
with open('data/fragrance_count.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['fragrance', 'count'])
    for item, count in sorted(fragrance_counts.items(), key=lambda x: x[1], reverse=True):
        writer.writerow([item, count])

In [None]:
# Unified fragrance label
# Keep the scraped label under 'original_fragrance' and attach the columns of
# the hand-made mapping table by joining on that label.
df = df.rename(columns={'fragrance': 'original_fragrance'})
map_fragrance = pd.read_csv("data/fragrance_map.csv") # Manually unify the expression of fragrance labels
# Left join: rows whose label is absent from the mapping are kept, with missing
# values in the mapped columns.
df = df.merge(map_fragrance, how='left', left_on='original_fragrance', right_on='original_fragrance')

In [None]:
# Save the final cleaned data (without the row index)
df.to_csv("data/1976_clean.csv",index=False)

Running this part of the code will generate 3 `csv` files in the `data` folder:  
* [note_count.csv](data/note_count.csv): records the frequency of different fragrance notes  
* [fragrance_count.csv](data/fragrance_count.csv): records the number of perfumes for each fragrance type  
* [1976_clean.csv](data/1976_clean.csv): cleaned perfume dataset  


## Cleaning Note Classification Data

Note classification data consists of the [note_count.csv](data/note_count.csv) and [note_class.csv](data/note_class.csv) files. They come from different websites: [note_count.csv](data/note_count.csv) is sourced from [Perfume 1976](https://www.1976.com.tw/index), while [note_class.csv](data/note_class.csv) is sourced from [Wikiparfum](https://www.wikiparfum.com/zh/). We use the [Wikiparfum](https://www.wikiparfum.com/zh/) classification of notes to categorize the perfume notes from [Perfume 1976](https://www.1976.com.tw/index). The challenge lies in the fact that the two websites often describe the same note differently, so we use the Qwen3 model to help resolve this issue.


It is not recommended to run code in this file. Please visit https://github.com/wilson-lyc/Scent4You to access the standalone source code.

This section corresponds to the files [note_class_map_initial.ipynb](note_class_map_initial.ipynb), [note_class_map_ai.ipynb](note_class_map_ai.ipynb), and [note_class_map_merge.ipynb](note_class_map_merge.ipynb).

### Simple Matching

First, we do not perform any processing on the two files and directly carry out classification matching, starting with assigning classification labels to the notes that are consistently described in both files.

In [None]:
# Necessary packages
import pandas as pd

In [None]:
# Load note classes
note_class = pd.read_csv("data/note_class.csv")
# A note may be listed under several classes; keep a single row per note so
# the later merge cannot multiply rows.
note_class = note_class.drop_duplicates(subset=['note'])
# Show the set of class labels that are available.
print(note_class['class'].unique())

In [None]:
# Load notes
# This is the note frequency table written by the perfume cleaning step, which
# saves it as 'data/note_count.csv'.
df = pd.read_csv("data/note_count.csv")

In [None]:
# Assign classes to notes
# Exact-name left join: every note is kept, and notes without an identically
# named entry in note_class end up with a missing class.
df = df.merge(note_class, how='left', left_on='note', right_on='note')

In [None]:
# Save the full merge result (matched and unmatched notes alike)
df.to_csv("data/note_class_initial.csv", index=False)

In [None]:
# Record notes without classes
# Rows whose class is missing after the merge are written to a separate file.
not_matched = df[df['class'].isna()]
not_matched.to_csv("data/note_class_miss.csv", index=False)

### AI Classification

Classes have been assigned to a subset of notes, and those successfully matched are stored in [note_class_initial.csv](data/note_class_initial.csv). Notes that could not be matched have been recorded in [note_class_miss.csv](data/note_class_miss.csv). Subsequently, the Qwen3 model is employed to classify the unmatched notes.


Before running this section of code, please read the API_KEY configuration guide at https://help.aliyun.com/zh/model-studio/first-api-call-to-qwen


In [None]:
# Necessary packages
import os
import dashscope
import pandas as pd

In [None]:
# Load API key from environment variable
# The key is never hard-coded; it must be provided as DASHSCOPE_API_KEY.
API_KEY=os.getenv('DASHSCOPE_API_KEY')

# Fail early with a pointer to the setup guide if the key is missing or empty.
if not API_KEY:
    raise RuntimeError("DASHSCOPE_API_KEY environment variable not set. Please set it according to the documentation at https://help.aliyun.com/zh/model-studio/first-api-call-to-qwen")

In [None]:
# Define a function to request Qwen3 for classification
def get_classification(note):
    """
    Ask the Qwen3 model for the class of a single note.

    The system prompt lists the allowed classes and instructs the model to
    answer with the class name only, or with '未知' when it cannot classify.

    Args:
        note: Name of the note to classify.

    Returns:
        The model's answer with surrounding whitespace removed.

    Raises:
        RuntimeError: If the API reports a non-200 status. The message carries
            the status, error code and error message of the response.
    """
    messages = [
        {'role': 'system', 'content': '你是一个香材分类大师，你需要将香材分类为：东方调、木质香调、果香调、柑橘调、柑苔香调、概念性、海洋调、烟草香调、皮革香调、美食香调、花香调、辛香调、醛香调、青香调、馥奇调、麝香调。不要任何解释，直接输出分类结果，无法分类输出未知。'},
        {'role': 'user', 'content': f'{note}'}
    ]
    response = dashscope.Generation.call(
        api_key=API_KEY,
        model="qwen3-235b-a22b-instruct-2507",
        messages=messages,
        result_format='message'
    )
    # A failed call carries no usable output; report the API's own error
    # details instead of failing later on a missing attribute.
    if response.status_code != 200:
        raise RuntimeError(
            f"Qwen3 classification request failed for note {note!r}: "
            f"status={response.status_code}, code={response.code}, "
            f"message={response.message}"
        )
    # Strip so that stray whitespace does not defeat exact comparisons with
    # class names or with '未知'.
    return response.output.choices[0].message['content'].strip()

In [None]:
# Fill missing classes and save
df = pd.read_csv('data/note_class_miss.csv')
# get_classification is called once per row, i.e. one API request per note.
df['class'] = df['note'].apply(get_classification)
# Blank out cells that are exactly '未知', the answer the prompt requests for
# notes the model cannot classify.
df['class'] = df['class'].replace('未知', '')
df.to_csv('data/note_class_miss_filled.csv', index=False)

### Merge Data

Thus far, Qwen3 has been employed to assign classes to notes that previously could not be matched. The processed data is stored in [note_class_miss_filled.csv](data/note_class_miss_filled.csv). Next, we merge [note_class_initial.csv](data/note_class_initial.csv) and [note_class_miss_filled.csv](data/note_class_miss_filled.csv).

In [None]:
# Necessary packages
import pandas as pd

In [None]:
# Load data
# df_miss: the previously unmatched notes with the classes proposed by Qwen3.
df_miss = pd.read_csv("data/note_class_miss_filled.csv")
# df_initial: all notes after the exact-name matching step. This is the CSV
# saved by that step, not the notebook that produced it.
df_initial = pd.read_csv("data/note_class_initial.csv")

In [None]:
# Merge
# One row per note, so it can serve as a lookup table keyed by note name.
note_class_unique = df_miss.drop_duplicates(subset='note', keep='first')
# Rows of the initial table that still lack a class (missing or empty string).
mask = df_initial['class'].isna() | (df_initial['class'] == '')
# Fill only those rows; notes absent from the lookup stay missing.
df_initial.loc[mask, 'class'] = df_initial.loc[mask, 'note'].map(note_class_unique.set_index('note')['class'])
# Number of notes that are still unclassified after the merge.
print(df_initial['class'].isna().sum())

In [None]:
# Save the merged table; the remaining gaps are filled in manually afterwards
df_initial.to_csv("data/note_class_map_raw.csv", index=False)

### Manual Classification

The merged data is stored in [note_class_map_raw.csv](data/note_class_map_raw.csv). At this stage, 111 notes remain without assigned classes, and their categorization must be completed through manual classification.


In [None]:
# Guard cell: raises unconditionally to stop execution at the point where
# manual classification is required.
raise RuntimeError("Please manually classify the unmatched notes.")

The manually classified data is stored in [note_class_map.csv](data/note_class_map.csv).