In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

import torch
import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification

In [2]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Note Embedding

In [3]:
# Note embedding model (NEM): bert-base-chinese with a 21-way classification head.
# from_pretrained warns that the classifier head is newly initialised; the
# load_state_dict call below then replaces the weights with those stored in
# models/NEM_v1.pth (it reports "All keys matched successfully").
note_labels_num = 21
note_tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
nem = BertForSequenceClassification.from_pretrained(
    "bert-base-chinese",
    num_labels=note_labels_num,
)
nem = nem.to(device)
# map_location keeps the checkpoint loadable regardless of the device it was saved from.
state_dict = torch.load("models/NEM_v1.pth", map_location=device)
nem.load_state_dict(state_dict)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [4]:
# Note dictionary: the first column is read as the note name (see note2vec),
# every remaining column is treated as a component of that note's vector.
df_ne = pd.read_csv("data/note_embedding_v1.csv")
ne_list = df_ne.iloc[:, 1:].to_numpy().astype(float)

In [5]:
def note2vec(note):
    """Map a fragrance-note text to its closest entry in the note dictionary.

    The text is tokenized (truncated / padded to 64 tokens) and run through the
    BERT encoder of ``nem``; the hidden state at sequence position 0 is used as
    the query embedding. The query is then compared by cosine similarity with
    every row of ``ne_list``.

    Args:
        note: Text of a single note.

    Returns:
        A ``(vector, name, similarity)`` tuple for the best-matching dictionary
        row: its vector from ``ne_list``, its name from the first column of
        ``df_ne``, and the cosine similarity to the query. Note that the returned
        vector is the dictionary entry, not the freshly computed embedding.
    """
    # Turn the note text into an embedding.
    nem.eval()
    with torch.no_grad():
        tokens = note_tokenizer(
            note,
            truncation=True,
            padding='max_length',
            max_length=64,
            return_tensors='pt',
        )
        bert_out = nem.bert(
            input_ids=tokens['input_ids'].to(device),
            attention_mask=tokens['attention_mask'].to(device),
        )
        first_token_state = bert_out.last_hidden_state[:, 0, :]
        query = first_token_state.squeeze().cpu().numpy()

    # Look up the most similar known note.
    query = query.reshape(1, -1)
    scores = cosine_similarity(query, ne_list)[0]
    best = scores.argmax()
    return ne_list[best], df_ne.iloc[best, 0], scores[best]

In [6]:
def get_note_list_vec(note_list):
    """Resolve every note in ``note_list`` to its dictionary vector.

    Each note is passed to ``note2vec``; the matched name and similarity are
    printed so the caller can see which dictionary entry was chosen.

    Args:
        note_list: Iterable of note texts.

    Returns:
        A list with one matched vector per input note, in input order.
    """
    def _resolve(note):
        # Look the note up, report the match, and keep only the vector.
        vec, name, similarity = note2vec(note)
        print(f"Note: {note} => {name} (similarity: {similarity:.4f})")
        return vec

    return [_resolve(note) for note in note_list]

# Perfume Vector

In [7]:
# Model
class PerfumeEmbedding(nn.Module):
    """Embed a perfume from its top, middle and base note vectors.

    Each tier's notes are encoded, attention-pooled into a single vector and
    passed through a tier-specific head. The three tier vectors then attend to
    one another through multi-head self-attention; the vectors before and after
    that interaction are concatenated and projected to a ``z_dim`` embedding,
    which also feeds a linear classifier.

    Args:
        note_dim: Size of each input note vector.
        hidden: Width of the per-tier representation (must be divisible by 4,
            the number of attention heads).
        z_dim: Size of the perfume embedding.
        num_classes: Number of classifier outputs.
        dropout: Dropout probability shared by all dropout layers.
    """

    @staticmethod
    def _note_encoder(note_dim, hidden, dropout):
        # Per-note encoder: Linear -> ReLU -> LayerNorm -> Dropout.
        return nn.Sequential(
            nn.Linear(note_dim, hidden),
            nn.ReLU(),
            nn.LayerNorm(hidden),
            nn.Dropout(dropout),
        )

    @staticmethod
    def _attention_scorer(hidden, dropout):
        # Produces one unnormalised attention score per note.
        return nn.Sequential(
            nn.Linear(hidden, 64),
            nn.Tanh(),
            nn.Dropout(dropout),
            nn.Linear(64, 1),
        )

    @staticmethod
    def _tier_head(hidden, dropout):
        # Transforms the pooled tier vector.
        return nn.Sequential(
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
        )

    def __init__(self, note_dim=768, hidden=256, z_dim=128, num_classes=8, dropout=0.3):
        super().__init__()
        # Attribute names and the layer order inside each Sequential determine the
        # state_dict keys, so they must stay as they are for saved checkpoints to load.
        self.phi_top = self._note_encoder(note_dim, hidden, dropout)
        self.phi_mid = self._note_encoder(note_dim, hidden, dropout)
        self.phi_base = self._note_encoder(note_dim, hidden, dropout)

        self.attn_top = self._attention_scorer(hidden, dropout)
        self.attn_mid = self._attention_scorer(hidden, dropout)
        self.attn_base = self._attention_scorer(hidden, dropout)

        self.rho_top = self._tier_head(hidden, dropout)
        self.rho_mid = self._tier_head(hidden, dropout)
        self.rho_base = self._tier_head(hidden, dropout)

        # Lets the three tier vectors exchange information.
        self.cross_attn = nn.MultiheadAttention(
            embed_dim=hidden,
            num_heads=4,
            batch_first=True,
            dropout=dropout,
        )

        # Input is 6 * hidden: three tier vectors plus their three attended versions.
        self.rho = nn.Sequential(
            nn.Linear(hidden * 6, z_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(z_dim, z_dim),
            nn.ReLU(),
        )
        self.classifier = nn.Linear(z_dim, num_classes)

    def aggregate(self, phi, attn_net, rho, notes):
        """Attention-pool one tier of notes into a single vector.

        Args:
            phi: Module that encodes each note vector.
            attn_net: Module that scores each encoded note (last dim of size 1).
            rho: Module applied to the pooled vector.
            notes: Tensor of note vectors, laid out as (batch, n_notes, note_dim).

        Returns:
            ``rho`` applied to the softmax-weighted sum of the encoded notes.
        """
        encoded = phi(notes)
        note_scores = attn_net(encoded).squeeze(-1)
        # Normalise across the notes of each sample, then restore the trailing
        # axis so the weights broadcast over the hidden dimension.
        note_weights = torch.softmax(note_scores, dim=1).unsqueeze(-1)
        pooled = torch.sum(encoded * note_weights, dim=1)
        return rho(pooled)

    def forward(self, top_notes, mid_notes, base_notes):
        """Compute class logits and the perfume embedding.

        Args:
            top_notes: Top-note vectors, (batch, n_top, note_dim).
            mid_notes: Middle-note vectors, (batch, n_mid, note_dim).
            base_notes: Base-note vectors, (batch, n_base, note_dim).

        Returns:
            ``(logits, z)`` where ``z`` is the perfume embedding and ``logits``
            is the classifier output computed from ``z``.
        """
        tiers = (
            (self.phi_top, self.attn_top, self.rho_top, top_notes),
            (self.phi_mid, self.attn_mid, self.rho_mid, mid_notes),
            (self.phi_base, self.attn_base, self.rho_base, base_notes),
        )
        # Evaluated in top -> mid -> base order.
        tier_vecs = [self.aggregate(*tier) for tier in tiers]

        # Treat the three tier vectors as a length-3 sequence and self-attend.
        tier_seq = torch.stack(tier_vecs, dim=1)
        attended, _ = self.cross_attn(tier_seq, tier_seq, tier_seq)

        combined = torch.cat([*tier_vecs, *attended.unbind(dim=1)], dim=-1)

        z = self.rho(combined)
        return self.classifier(z), z
    
# Perfume embedding model (PEM) with the weights stored in models/PEM_v2.pth.
pem = PerfumeEmbedding().to(device)
# map_location remaps the saved tensors onto `device`; without it a checkpoint
# saved from a CUDA device cannot be deserialised on a CPU-only machine, even
# though `device` already falls back to the CPU.
state_dict = torch.load("models/PEM_v2.pth", map_location=device)
pem.load_state_dict(state_dict)

<All keys matched successfully>

In [8]:
# Get perfume vector
def perfume2vec(top_notes, mid_notes, base_notes):
    """Embed a perfume described by its top, middle and base notes.

    Every note is resolved to a dictionary vector with ``get_note_list_vec``
    (which prints the match it picked), the three tiers are fed to ``pem`` as a
    batch of one, and the embedding ``z`` returned by ``pem`` is converted to
    NumPy.

    Args:
        top_notes: Iterable of top-note texts; must not be empty.
        mid_notes: Iterable of middle-note texts; must not be empty.
        base_notes: Iterable of base-note texts; must not be empty.

    Returns:
        The perfume embedding as a NumPy array with singleton dimensions removed.

    Raises:
        ValueError: If any of the three tiers contains no notes.
    """
    tiers = {
        "top_notes": list(top_notes),
        "mid_notes": list(mid_notes),
        "base_notes": list(base_notes),
    }
    # Fail fast: an empty tier would otherwise become a (1, 0) tensor and only
    # blow up inside the tier's first Linear layer with a shape-mismatch error,
    # after the other tiers have already been run through BERT.
    empty_tiers = [label for label, notes in tiers.items() if not notes]
    if empty_tiers:
        raise ValueError(
            "each note tier needs at least one note; empty: " + ", ".join(empty_tiers)
        )

    def _as_batch(vectors):
        # Stack (n_notes, dim) vectors and add a leading batch axis of size 1.
        stacked = torch.tensor(np.array(vectors), dtype=torch.float32)
        return stacked.to(device).unsqueeze(0)

    pem.eval()
    with torch.no_grad():
        top_vecs = get_note_list_vec(tiers["top_notes"])
        mid_vecs = get_note_list_vec(tiers["mid_notes"])
        base_vecs = get_note_list_vec(tiers["base_notes"])

        # The classifier logits are not needed here, only the embedding.
        _, z = pem(_as_batch(top_vecs), _as_batch(mid_vecs), _as_batch(base_vecs))
    return z.cpu().numpy().squeeze()

# Recommend Perfumes

In [9]:
# Perfume embedding table: every column after the first is treated as a
# component of the perfume's vector.
df_pe = pd.read_csv("data/perfume_embedding_v2.csv")
pe_list = df_pe.iloc[:, 1:].to_numpy().astype(float)

In [10]:
def get_recommended_perfumes(top_notes, mid_notes, base_notes, top_k=5):
    """Recommend the known perfumes closest to a described note profile.

    The notes are embedded with ``perfume2vec`` and compared by cosine
    similarity against every row of ``pe_list``.

    Args:
        top_notes: Top-note texts of the desired perfume.
        mid_notes: Middle-note texts of the desired perfume.
        base_notes: Base-note texts of the desired perfume.
        top_k: Number of recommendations to return.

    Returns:
        A list of ``[name, similarity]`` pairs for the ``top_k`` most similar
        rows of ``df_pe``, ordered from most to least similar.
    """
    query = perfume2vec(top_notes, mid_notes, base_notes)
    scores = cosine_similarity(query.reshape(1, -1), pe_list)[0]
    # Attach the scores to a copy of the table and keep the best-scoring rows.
    scored = df_pe.assign(similarity=scores)
    best_rows = scored.nlargest(top_k, 'similarity')
    return best_rows[['name', 'similarity']].values.tolist()

In [11]:
# Example query: notes the user likes in each tier, and how many results to show.
like_top_notes = ["橙子"]
like_mid_notes = ["咖啡"]
like_base_notes = ["巧克力"]
top_k = 10

rc_perfumes = get_recommended_perfumes(
    like_top_notes,
    like_mid_notes,
    like_base_notes,
    top_k=top_k,
)
display(rc_perfumes)

Note: 橙子 => 橙子 (similarity: 1.0000)
Note: 咖啡 => 咖啡 (similarity: 1.0000)
Note: 巧克力 => 巧克力 (similarity: 1.0000)


[['Mugler Angel Fantasm 黑天使幻想女性淡香精', 0.9306738356113713],
 ['Aquolina Pink Sugar Creamy Sunshine 奶油阳光女性淡香水', 0.9237883703457287],
 ['Ariana Grande Sweet Like Candy 女性淡香精', 0.9224516461840185],
 ['Juicy Couture Viva La Juicy Sucre 蛋糕甜心女性淡香精', 0.9219706607464045],
 ['Juicy Couture Viva La Juicy Sucre 蛋糕甜心女性淡香精 TESTER', 0.9219706607464045],
 ['Juicy Couture Viva La Juicy Sucre 蛋糕甜心女性淡香精行动香氛', 0.9219706607464045],
 ['Burberry Goddess 缪斯女神淡香精', 0.9205307121399737],
 ['Burberry Goddess 缪斯女神淡香精迷你瓶', 0.9096613683790785],
 ['Aquolina Pink Sugar Red Velvet 红丝绒女性淡香水', 0.9047305913803726],
 ['Fcuk Friction Her 爱火女性淡香精', 0.9036712784692651]]