In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

import torch
from transformers import BertTokenizer, BertForSequenceClassification

In [2]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Note Embedding

In [3]:
# Model: tokenizer plus a BERT sequence classifier whose weights are then
# overwritten with the state dict stored in the local checkpoint file.
note_labels_num = 21
note_tokenizer = BertTokenizer.from_pretrained("./bert-base-chinese")
note_embedding = BertForSequenceClassification.from_pretrained(
    "./bert-base-chinese",
    num_labels=note_labels_num,
)
note_embedding = note_embedding.to(device)
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source (consider weights_only=True if the installed torch supports it).
state_dict = torch.load(
    "models/NoteEmbedding_1211_200624/epoch_10.pth",
    map_location=device,
)
# Kept as the last expression so the cell displays the key-matching report.
note_embedding.load_state_dict(state_dict)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [4]:
# Dictionary: table of known notes. Every column after the first is treated as
# part of the note's embedding vector and converted to a float matrix.
df_note_embedding = pd.read_csv("data/note_embedding.csv")
_note_vector_columns = df_note_embedding.iloc[:, 1:]
note_embeddings = _note_vector_columns.to_numpy().astype(float)

In [5]:
# Get note vector
def get_note_vector(note):
    """Encode a note with the BERT encoder and return its first-token embedding.

    The text is tokenized (truncated / padded to 64 tokens), passed through
    ``note_embedding.bert`` in eval mode without gradient tracking, and the
    last-layer hidden state at position 0 is returned.

    Args:
        note: Text accepted by ``note_tokenizer`` (callers in this notebook
            pass a single note name).

    Returns:
        numpy.ndarray: The position-0 hidden state with size-1 dimensions
        squeezed away, moved to the CPU.
    """
    note_embedding.eval()
    tokens = note_tokenizer(
        note,
        truncation=True,
        padding='max_length',
        max_length=64,
        return_tensors='pt',
    )
    with torch.no_grad():
        bert_out = note_embedding.bert(
            input_ids=tokens['input_ids'].to(device),
            attention_mask=tokens['attention_mask'].to(device),
        )
        # Position 0 of the sequence is the [CLS] slot of the BERT input.
        first_token_state = bert_out.last_hidden_state[:, 0, :]
        return first_token_state.squeeze().cpu().numpy()

In [6]:
# Get recommended note
def get_recommend_note(note, top_k=5):
    """Return the ``top_k`` stored notes most similar to ``note``.

    The note is embedded with ``get_note_vector`` and compared by cosine
    similarity against every row of ``note_embeddings``.

    Args:
        note: Note text to look up.
        top_k: Number of best matches to return.

    Returns:
        list[list]: One list per match, holding the row of
        ``df_note_embedding`` followed by its similarity score, ordered from
        most to least similar.
    """
    query = get_note_vector(note).reshape(1, -1)
    scores = cosine_similarity(query, note_embeddings)[0]
    # The score is appended as the last column, so it ends every returned row.
    ranked = df_note_embedding.assign(similarity=scores)
    best = ranked.nlargest(top_k, 'similarity')
    return best.values.tolist()

# Perfume Vector

In [16]:
# Get note_list vector
def get_note_list_vector(note_list, verbose=True):
    """Average the stored embeddings of the known notes closest to each input.

    Every entry of ``note_list`` is replaced by its single best match from
    ``get_recommend_note`` so that a note missing from the embedding table
    still resolves to a stored vector. The matched vectors are then averaged
    element-wise.

    Args:
        note_list: Iterable of note names.
        verbose: When True (the default), print the match chosen for each note
            together with its similarity.

    Returns:
        numpy.ndarray: Element-wise mean of the matched note vectors.

    Raises:
        ValueError: If ``note_list`` yields no notes. Averaging nothing would
            otherwise produce NaN and fail later in the similarity search.
    """
    vectors = []
    for note in note_list:
        # Make sure the note was trained: use the nearest stored note instead.
        rc_note = get_recommend_note(note, top_k=1)[0]
        # Row layout: [name, *embedding values, similarity].
        rc_note_name = rc_note[0]
        rc_note_vector = rc_note[1:-1]
        rc_note_similarity = rc_note[-1]
        vectors.append(rc_note_vector)
        if verbose:
            print(f"Note: {note} => Recommended Note: {rc_note_name}, Similarity: {rc_note_similarity:.4f}")
    if not vectors:
        raise ValueError("note_list must contain at least one note")
    mean_vector = np.mean(vectors, axis=0)
    return mean_vector

In [8]:
# Get perfume vector
def get_perfume_vector(top_notes, mid_notes, base_notes, weights=(0.3, 0.4, 0.3)):
    """Combine the top / middle / base note vectors into one perfume vector.

    Each non-empty layer is reduced to a single vector with
    ``get_note_list_vector`` and the layer vectors are summed using the
    matching entry of ``weights``. Empty (or ``None``) layers are skipped
    rather than contributing NaN; the result is therefore not re-scaled when
    a layer is missing, which does not affect a cosine-similarity comparison.

    Args:
        top_notes: Note names of the top layer.
        mid_notes: Note names of the middle layer.
        base_notes: Note names of the base layer.
        weights: Three weights applied to the top, middle and base vectors,
            in that order. Defaults to ``(0.3, 0.4, 0.3)``.

    Returns:
        numpy.ndarray: Weighted sum of the layer vectors that were present.

    Raises:
        ValueError: If ``weights`` does not hold exactly three values, or if
            all three layers are empty.
    """
    if len(weights) != 3:
        raise ValueError("weights must contain exactly three values (top, mid, base)")

    perfume_vec = None
    for notes, weight in zip((top_notes, mid_notes, base_notes), weights):
        if notes is None or len(notes) == 0:
            continue
        layer_term = get_note_list_vector(notes) * weight
        # Accumulate in top -> mid -> base order, same as the plain weighted sum.
        perfume_vec = layer_term if perfume_vec is None else perfume_vec + layer_term

    if perfume_vec is None:
        raise ValueError("at least one of top_notes, mid_notes, base_notes must be non-empty")
    return perfume_vec

# Recommend Perfumes

In [9]:
# Load perfume vector list: every column after the first is treated as part of
# the perfume's embedding and converted to a float matrix.
df_pe = pd.read_csv("data/perfume_embedding_wa.csv")
_perfume_vector_columns = df_pe.iloc[:, 1:]
pe_list = _perfume_vector_columns.to_numpy().astype(float)

In [10]:
def get_recommended_perfumes(top_notes, mid_notes, base_notes, top_k=5):
    """Recommend the ``top_k`` perfumes closest to the given note preferences.

    The liked notes are turned into one perfume vector with
    ``get_perfume_vector`` and compared by cosine similarity against every
    row of ``pe_list``.

    Args:
        top_notes: Liked top notes.
        mid_notes: Liked middle notes.
        base_notes: Liked base notes.
        top_k: Number of perfumes to return.

    Returns:
        list[list]: ``[name, similarity]`` pairs ordered from most to least
        similar.
    """
    perfume_vec = get_perfume_vector(top_notes, mid_notes, base_notes).reshape(1, -1)
    similarities = cosine_similarity(perfume_vec, pe_list)[0]
    df_top = df_pe.assign(similarity=similarities).nlargest(top_k, 'similarity')
    # Select the columns explicitly: an empty column list would return one
    # empty list per row instead of the perfume name and its score.
    recommendation = df_top[['name', 'similarity']].values.tolist()
    return recommendation

In [11]:
# Load perfume vector list (this cell repeats the earlier read of the same CSV).
# Columns after the first are cast to float and exposed as a 2-D array.
df_pe = pd.read_csv("data/perfume_embedding_wa.csv")
pe_list = df_pe.iloc[:, 1:].astype(float).values

In [12]:
def get_recommended_perfumes(top_notes, mid_notes, base_notes, top_k=5):
    """Return the ``top_k`` perfumes whose vectors best match the liked notes.

    Note: this definition rebinds the name defined in an earlier cell.

    Args:
        top_notes: Liked top notes.
        mid_notes: Liked middle notes.
        base_notes: Liked base notes.
        top_k: Number of perfumes to return.

    Returns:
        list[list]: ``[name, similarity]`` pairs, highest similarity first.
    """
    query = get_perfume_vector(top_notes, mid_notes, base_notes).reshape(1, -1)
    scores = cosine_similarity(query, pe_list)[0]
    # Attach the scores to a copy of the table, then keep the best rows.
    ranked = df_pe.assign(similarity=scores)
    best = ranked.nlargest(top_k, 'similarity')
    return best[['name', 'similarity']].values.tolist()

In [27]:
# Example query: the notes the user likes, grouped by top / middle / base layer.
like_top_notes = [
    "菊花",
    "橘子",
    "紫罗兰叶",
]
like_mid_notes = [
    "木头",
    "薰衣草",
    "竹子",
]
like_base_notes = [
    "青苔",
    "麝香",
]

rc_perfumes = get_recommended_perfumes(
    top_notes=like_top_notes,
    mid_notes=like_mid_notes,
    base_notes=like_base_notes,
)
print(rc_perfumes)

Note: 菊花 => Recommended Note: 玫瑰花, Similarity: 0.9726
Note: 橘子 => Recommended Note: 橘子, Similarity: 1.0000
Note: 紫罗兰叶 => Recommended Note: 紫罗兰叶, Similarity: 1.0000
Note: 木头 => Recommended Note: 木头, Similarity: 1.0000
Note: 薰衣草 => Recommended Note: 薰衣草, Similarity: 1.0000
Note: 竹子 => Recommended Note: 竹子, Similarity: 1.0000
Note: 青苔 => Recommended Note: 青苔, Similarity: 1.0000
Note: 麝香 => Recommended Note: 麝香, Similarity: 1.0000
[['Jovan Black Musk for Man 坏男人黑麝香男性古龙水', 0.9686658497342033], ['Creed Green Irish Tweed 爱尔兰之心淡香精', 0.9287443767036873], ['Creed Green Irish Tweed 爱尔兰之心淡香精 TESTER', 0.9287443767036873], ['Armaf Legesi 男性淡香精', 0.9062378716646099], ['Calvin Klein CK DEFY 无畏之心男性淡香水', 0.8867157172913289]]
