In [None]:
# Necessary packages
import pandas as pd
import opencc
import csv

In [None]:
# Read the three raw 1976 exports (male / female / neutral) into separate frames
raw_paths = (
    "data/1976_raw_male.csv",
    "data/1976_raw_female.csv",
    "data/1976_raw_neutral.csv",
)
df_male, df_female, df_neutral = map(pd.read_csv, raw_paths)

In [None]:
# Tag every row with the gender of the file it was loaded from
for frame, label in ((df_male, 'male'), (df_female, 'female'), (df_neutral, 'neutral')):
    frame['gender'] = label

In [None]:
# Stack the three gender-tagged frames into one table with a fresh 0..n-1 index
gender_frames = [df_male, df_female, df_neutral]
df = pd.concat(gender_frames, ignore_index=True)

In [None]:
# Translate Traditional Chinese into Simplified Chinese
converter = opencc.OpenCC('t2s.json')


def _to_simplified(value):
    """Run one cell through the OpenCC converter; non-strings (e.g. NaN) pass through unchanged."""
    return converter.convert(value) if isinstance(value, str) else value


for col in df.columns:
    # detail_url is deliberately left unconverted (presumably it holds URLs, not prose).
    if col == 'detail_url':
        continue
    # Text columns are `object` on older pandas but use the dedicated string
    # dtype on newer versions, where a bare `dtype == 'object'` comparison is
    # False and every text column would be skipped silently. Accept both.
    if pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col]):
        df[col] = df[col].apply(_to_simplified)

In [None]:
# Strip carriage returns / line feeds from the description text (only if the column exists)
if 'description' in df.columns:
    line_break_pattern = r'\r\n|\n|\r'
    df['description'] = df['description'].replace(
        to_replace=line_break_pattern, value='', regex=True
    )

In [None]:
# Report how many values are missing per column, then drop any row lacking
# a fragrance or any of the three note levels
required_cols = ['fragrance', 'top_notes', 'middle_notes', 'base_notes']
missing_per_col = df[required_cols].isna().sum()
print(missing_per_col)
df = df.dropna(subset=required_cols)

In [None]:
# Strip every ASCII space from the fragrance and note columns
def _drop_spaces(value):
    """Return `value` without ' ' characters; non-strings are returned untouched."""
    if isinstance(value, str):
        return value.replace(' ', '')
    return value


for col in ['fragrance', 'top_notes', 'middle_notes', 'base_notes']:
    if col not in df.columns:
        continue
    df[col] = df[col].apply(_drop_spaces)

In [None]:
# Keep only the first '、'-separated fragrance as the classification label
def _first_label(value):
    """Return the text before the first '、'; non-strings are returned untouched."""
    if isinstance(value, str):
        return value.partition('、')[0]
    return value


df['fragrance'] = df['fragrance'].apply(_first_label)

In [None]:
# Tally how often each note appears across the top / middle / base note columns
note_columns = ['top_notes', 'middle_notes', 'base_notes']
note_counts = {}

for col in note_columns:
    if col not in df.columns:
        continue
    for cell in df[col]:
        if not isinstance(cell, str):
            continue
        # Notes inside one cell are separated by '、'; blank fragments are ignored
        for piece in cell.split('、'):
            note = piece.strip()
            if note:
                note_counts[note] = note_counts.get(note, 0) + 1

# Write the tally to CSV, most frequent note first (ties keep first-seen order)
ranked_notes = sorted(note_counts.items(), key=lambda pair: pair[1], reverse=True)
with open('data/note_count.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['note', 'count'])
    writer.writerows(ranked_notes)

In [None]:
# Tally how often each fragrance label occurs
fragrance_counts = {}
for cell in df['fragrance']:
    # Split on '、' and ignore blank fragments
    for piece in cell.split('、'):
        label = piece.strip()
        if label:
            fragrance_counts[label] = fragrance_counts.get(label, 0) + 1

# Write the tally to CSV, most frequent label first (ties keep first-seen order)
ranked_fragrances = sorted(fragrance_counts.items(), key=lambda pair: pair[1], reverse=True)
with open('data/fragrance_count.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['fragrance', 'count'])
    writer.writerows(ranked_fragrances)

In [None]:
# Unified fragrance label
# The raw label is kept under `original_fragrance`, which is also the join key
# into the manually curated mapping file.
df = df.rename(columns={'fragrance': 'original_fragrance'})
map_fragrance = pd.read_csv("data/fragrance_map.csv") # Manually unify the expression of fragrance labels

# A left merge leaves NaN for any label the hand-made map does not cover;
# report those labels here instead of letting them slip through unnoticed.
unmapped = df.loc[
    ~df['original_fragrance'].isin(map_fragrance['original_fragrance']),
    'original_fragrance',
]
if not unmapped.empty:
    print(
        f"{len(unmapped)} rows have a fragrance label missing from data/fragrance_map.csv:",
        unmapped.unique().tolist(),
    )

# validate='many_to_one' makes pandas raise MergeError if the map lists the same
# original label more than once, which would otherwise silently duplicate rows.
df = df.merge(map_fragrance, how='left', on='original_fragrance', validate='many_to_one')

In [None]:
# Persist the cleaned table without writing the index column
clean_output_path = "data/1976_clean.csv"
df.to_csv(clean_output_path, index=False)