In [1]:
import pandas as pd
import numpy as np

In [None]:
# Input tables: the note-embedding table (a 'note' column followed by the
# vector components) and the perfume table (name plus top/middle/base notes).
NOTE_EMBEDDING_CSV = 'data/note_embedding_v1.csv'
PERFUME_CSV = 'data/1976_clean.csv'

df_note = pd.read_csv(NOTE_EMBEDDING_CSV)
df_perfume = pd.read_csv(PERFUME_CSV)

In [3]:
# Build the note -> vector lookup. Every column after the first is treated as
# one embedding component and cast to float32; the 'note' column supplies the
# keys (a repeated note keeps its last row).
note_matrix = df_note.iloc[:, 1:].to_numpy(dtype=np.float32)
note2vec = dict(zip(df_note['note'], note_matrix))
print(f"Loaded {len(note2vec)} note embeddings.")

Loaded 2285 note embeddings.


In [4]:
def get_notes_embedding(name, notes_list, embedding_table=None, dim=None):
    """Average the embedding vectors of the notes in one perfume layer.

    Parameters
    ----------
    name : str
        Perfume name; only used to identify the perfume in the warning.
    notes_list : iterable of str
        Note names to look up. Names absent from the table are skipped.
    embedding_table : mapping of str -> np.ndarray, optional
        Lookup table to use. Defaults to the notebook-level ``note2vec``.
    dim : int, optional
        Length of the zero vector returned when no note matches. Defaults to
        the length of the vectors stored in the table, falling back to 768
        only when the table is empty.

    Returns
    -------
    np.ndarray
        Element-wise mean of the matched vectors, or a float32 zero vector
        (after printing a warning) when none of the notes is in the table.
    """
    table = note2vec if embedding_table is None else embedding_table
    matched = [table[note] for note in notes_list if note in table]
    if matched:
        return np.mean(matched, axis=0)

    print("Warning: No embeddings found for notes:", name)
    if dim is None:
        # Size the fallback like the real vectors so it can be blended with
        # them; a fixed 768 would break for tables of any other dimension.
        dim = len(next(iter(table.values()))) if len(table) else 768
    return np.zeros(dim, dtype=np.float32)

In [5]:
# Blend weights applied to the top / middle / base layer vectors.
TOP_WEIGHT, MIDDLE_WEIGHT, BASE_WEIGHT = 0.3, 0.4, 0.3


def _split_notes(cell):
    """Split a '、'-delimited notes cell into a list of clean note names.

    A cell that is not a string (e.g. the NaN pandas uses for an empty CSV
    field) yields an empty list instead of raising, and surrounding
    whitespace is stripped so padded names are looked up by their bare name.
    """
    if not isinstance(cell, str):
        return []
    return [note.strip() for note in cell.split('、') if note.strip()]


# One blended vector per perfume, in df_perfume row order.
perfume_embeddings = []
for _, row in df_perfume.iterrows():
    name = row['name']

    top_notes_vec = get_notes_embedding(name, _split_notes(row['top_notes']))
    middle_notes_vec = get_notes_embedding(name, _split_notes(row['middle_notes']))
    base_notes_vec = get_notes_embedding(name, _split_notes(row['base_notes']))

    # A layer with no known notes contributes the zero vector returned by
    # get_notes_embedding, so the other layers still shape the result.
    perfume_vec = (
        top_notes_vec * TOP_WEIGHT
        + middle_notes_vec * MIDDLE_WEIGHT
        + base_notes_vec * BASE_WEIGHT
    )
    perfume_embeddings.append(perfume_vec)

# Stack the per-perfume vectors into a single 2-D array.
perfume_embeddings = np.array(perfume_embeddings)



In [6]:
# Tabulate the vectors (one row per perfume) and put the perfume name in the
# leading column.
df_perfume_embeddings = pd.DataFrame(perfume_embeddings)
df_perfume_embeddings.insert(0, 'name', df_perfume['name'])

# Keep only rows with at least one non-zero component, i.e. drop perfumes
# whose whole vector is zero, then write the table without the index.
has_nonzero_component = (df_perfume_embeddings.iloc[:, 1:] != 0).any(axis=1)
df_perfume_embeddings = df_perfume_embeddings[has_nonzero_component]
df_perfume_embeddings.to_csv('data/perfume_embedding_wa.csv', index=False)