In [11]:
import os
import re
from bs4 import BeautifulSoup
import json
import pandas as pd
import glob

# Global Variables 全局变量

In [12]:
# Root folder of the saved pages; each entry inside it is treated as one brand
# folder whose files are read as HTML. Relative to the working directory.
HTML_PATH = 'html'
# Folder that receives the per-brand .jsonl files and the merged perfumes.csv.
DATA_PATH = 'data'

# Data Preprocessing 数据预处理

In [13]:
def get_perfumes_from_html(html):
    '''
    Extract perfume information from HTML content.

    Every ``<div class="item">`` element is read as one perfume entry:
    the ID comes from the first link's ``href``, the brand / Chinese name /
    French name / year from the ``<h2>`` title, the score from the
    ``<div class="score">`` text and the notes from the ``<div class="info">``
    text.

    Args:
        html: HTML markup to parse (passed straight to BeautifulSoup).

    Returns:
        A list with one dict per item whose title could be parsed. Each dict
        holds "ID" (str or None), "brand", "name_cn", "name_fr", "year" and
        "score" (all str; "score" is '' when no score is found), plus either
        "note" (list of str, when the info text uses the single "气味：" label)
        or "top_note" / "middle_note" / "base_note" (lists of str, possibly
        empty). Items whose title does not match the expected format are
        reported with print() and skipped.
    '''
    title_pattern = r"^(.*?)\s+(.*?)\s+(.*?),\s*(\d+)$"
    # Section label in the info text -> key used in the result dict.
    label_to_key = {"前调：": "top_note", "中调：": "middle_note", "后调：": "base_note"}
    label_splitter = r"(前调：|中调：|后调：)"

    res = []
    soup = BeautifulSoup(html, "html.parser")
    for item in soup.find_all("div", class_="item"):
        # Extract ID; a missing link or href leaves the ID as None instead of raising.
        anchor = item.find("a")
        link = (anchor.get("href") or "") if anchor is not None else ""
        id_match = re.search(r"/(\d+)-", link)
        ID = id_match.group(1) if id_match else None

        # Extract brand, Chinese name, French name, year.
        # The title is stripped first: leading whitespace would otherwise be
        # consumed by the first lazy group and shift every field by one, and
        # trailing spaces would stop the pattern from matching at all.
        heading = item.find("h2")
        title = heading.get_text().strip() if heading is not None else ""
        title_match = re.match(title_pattern, title)
        if not title_match:
            print(f"Format error: {title}")
            continue
        brand = title_match.group(1)
        name_cn = title_match.group(2)
        name_fr = title_match.group(3).strip()
        year = title_match.group(4)

        # Extract score
        score_div = item.find("div", class_="score")
        score_text = score_div.get_text() if score_div is not None else ""
        score_match = re.search(r'([\d\.]+)\s*分', score_text)
        score = score_match.group(1) if score_match else ''

        # Extract notes
        info = item.find("div", class_="info")
        text = info.get_text() if info is not None else ""

        # Keys are inserted in the same order as before (notes before score)
        # so the serialized records keep their layout.
        perfume = {
            "ID": ID,
            "brand": brand,
            "name_cn": name_cn,
            "name_fr": name_fr,
            "year": year,
        }
        if "气味：" in text:
            # No distinction between top, middle, base notes.
            # str.split() with no argument collapses whitespace runs and never
            # yields empty strings.
            perfume["note"] = text.replace("气味：", " ").split()
        else:
            # Distinguish top, middle, base notes by their label rather than by
            # position, so a missing section yields an empty list instead of an
            # IndexError or a mislabelled section.
            sections = {key: [] for key in label_to_key.values()}
            # With a capturing group re.split returns
            # [preamble, label, body, label, body, ...]; the preamble is ignored.
            parts = re.split(label_splitter, text)
            for label, body in zip(parts[1::2], parts[2::2]):
                sections[label_to_key[label]].extend(body.split())
            perfume.update(sections)
        perfume["score"] = score
        res.append(perfume)

    return res

In [14]:
# Parse every saved page of each brand folder and write one JSONL file per brand.
# The output folder may not exist yet on a fresh checkout, so create it up front.
os.makedirs(DATA_PATH, exist_ok=True)

# os.listdir() returns entries in arbitrary order; sorting keeps the log and the
# record order reproducible between runs.
for brand in sorted(os.listdir(HTML_PATH)):
    brand_path = os.path.join(HTML_PATH, brand)
    if not os.path.isdir(brand_path):
        # A stray file next to the brand folders would make os.listdir() below raise.
        continue
    html_count = 0
    perfumes = []
    for filename in sorted(os.listdir(brand_path)):
        file_path = os.path.join(brand_path, filename)
        if not os.path.isfile(file_path):
            # Sub-folders cannot be opened as pages.
            continue
        with open(file_path, 'r', encoding='utf-8') as f:
            html = f.read()
        html_count += 1
        perfumes.extend(get_perfumes_from_html(html))
    print(f"Brand: {brand}  File count: {html_count}  Perfume count: {len(perfumes)}")
    # Save: one JSON object per line, keeping non-ASCII characters readable.
    jsonl_path = os.path.join(DATA_PATH, f'{brand}.jsonl')
    with open(jsonl_path, 'w', encoding='utf-8') as f:
        for perfume in perfumes:
            f.write(json.dumps(perfume, ensure_ascii=False) + '\n')

Brand: Chanel  File count: 15  Perfume count: 150
Brand: Hermes  File count: 14  Perfume count: 136
Brand: YSL  File count: 28  Perfume count: 273


In [15]:
# Merge all JSONL files into a single CSV
def merge_jsonl_to_csv(data_folder, output_csv):
    '''
    Merge every ``*.jsonl`` file in ``data_folder`` into one CSV file.

    Each non-blank line is parsed as a JSON object and becomes one CSV row.
    All rows share the same columns; scalar fields missing from a record are
    written as empty strings and note lists are joined with single spaces
    (a missing or null list becomes an empty string).

    Args:
        data_folder: Folder that is searched (non-recursively) for ``*.jsonl``.
        output_csv: Path of the CSV file to write; it is overwritten if present.

    Returns:
        None. The CSV is written with a UTF-8 BOM ("utf-8-sig") and no index column.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    '''
    scalar_fields = ['ID', 'brand', 'name_cn', 'name_fr', 'year', 'score']
    note_fields = ['top_note', 'middle_note', 'base_note', 'note']

    all_data = []
    # glob() gives no ordering guarantee; sort so the row order is reproducible.
    for jsonl_file in sorted(glob.glob(os.path.join(data_folder, '*.jsonl'))):
        with open(jsonl_file, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    # Blank lines (e.g. a trailing newline) are not records.
                    continue
                item = json.loads(line)
                # Give every row the same fields, filling missing ones with ''.
                row = {field: item.get(field, '') for field in scalar_fields}
                for field in note_fields:
                    row[field] = ' '.join(item.get(field) or [])
                all_data.append(row)
    df = pd.DataFrame(all_data, columns=scalar_fields + note_fields)
    df.to_csv(output_csv, index=False, encoding='utf-8-sig')

# Combine the .jsonl files found in DATA_PATH into a single perfumes.csv
# written to that same folder.
output_path = os.path.join(DATA_PATH, 'perfumes.csv')
merge_jsonl_to_csv(DATA_PATH, output_path)