In [8]:
import pandas as pd
import numpy as np

# Read fra_cleaned.csv: fields are separated by ';', bytes are decoded as
# Latin-1, and rows the parser cannot handle are dropped instead of raising.
df = pd.read_csv(
    "fra_cleaned.csv",
    sep=";",
    encoding="latin-1",
    on_bad_lines="skip",
)

def split_notes(col):
    """Convert one notes cell into a set of normalized note names.

    Every piece is stripped of surrounding whitespace and lower-cased;
    pieces that are empty after stripping are discarded.

    Args:
        col: The cell value. Usually a comma-separated string. A missing
            value (None, NaN, pd.NA) produces an empty set. A value that is
            already a set/frozenset/list/tuple of notes is re-normalized
            element by element instead of being stringified, so applying the
            function to an already-parsed column is harmless.

    Returns:
        set: The distinct normalized note names found in the cell.
    """
    if isinstance(col, (set, frozenset, list, tuple)):
        # Already split (e.g. the cell was re-run): normalize the members.
        pieces = col
    elif pd.isna(col):
        # str(NaN) is "nan", which would otherwise be recorded as a real note.
        return set()
    else:
        pieces = str(col).split(",")

    normalized = (str(piece).strip().lower() for piece in pieces)
    return {note for note in normalized if note}

# Parse each note column into per-row sets (stored back on df) and collect
# every note seen in any of the three columns.
unique_notes = set()
for layer_name in ("Top", "Middle", "Base"):
    parsed_layer = df[layer_name].apply(split_notes)
    df[layer_name] = parsed_layer
    for notes_in_row in parsed_layer:
        unique_notes |= notes_in_row

# Alphabetical order gives each note a stable position in the vocabulary;
# note_to_idx maps a note name to that position.
note_vocab = list(unique_notes)
note_vocab.sort()
note_to_idx = dict(zip(note_vocab, range(len(note_vocab))))

print(f"Total unique notes: {len(note_vocab)}")


Total unique notes: 1671


In [10]:
# Weighting by layer importance: a note contributes the weight of the layer
# it appears in.
layer_weights = {"Top": 1.0, "Middle": 1.5, "Base": 2.0}

# Matrix of shape (number of perfumes) x (number of notes), initially all zero.
embeddings = np.zeros((len(df), len(note_vocab)), dtype=np.float32)

# Fill in weights. When a note occurs in several layers of the same perfume,
# the largest layer weight is kept.
for layer, weight in layer_weights.items():
    # enumerate() yields the positional row number. DataFrame.iterrows() would
    # yield index *labels*, which only coincide with positions while df has a
    # default RangeIndex; using them to index `embeddings` would misalign rows
    # (or raise IndexError) as soon as df is filtered or re-indexed.
    for row_pos, notes in enumerate(df[layer]):
        for note in notes:
            note_col = note_to_idx.get(note)
            if note_col is None:
                # Note is not part of the vocabulary: nothing to record.
                continue
            embeddings[row_pos, note_col] = max(
                embeddings[row_pos, note_col], weight
            )


In [11]:
# Scale every row of `embeddings` to unit L2 length.
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
# A row with norm 0 gets divisor 1 instead, so it stays all-zero and no
# division by zero occurs.
np.putmask(norms, norms == 0, 1)
embeddings = np.divide(embeddings, norms)

In [13]:
# Write the results to disk:
#   perfume_vectors.npy   - the `embeddings` matrix
#   perfume_metadata.json - the "Perfume" and "url" columns, one record per df row
#   note_vocab.txt        - one note per line, in `note_vocab` order (line k is
#                           the note whose index in note_to_idx is k)
np.save("perfume_vectors.npy", embeddings)
df[["Perfume", "url"]].to_json("perfume_metadata.json", orient="records")
# Explicit UTF-8: without it open() uses the platform's default encoding, so
# non-ASCII note names could be written differently (or fail to encode) from
# one machine to another.
with open("note_vocab.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(note_vocab))