In [1]:
import pandas as pd
import numpy as np

import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Run on the GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
# Build the 8-label classifier from the base checkpoint. Its classification
# head starts out randomly initialised (hence the library warning) and is
# replaced by the fine-tuned weights loaded below.
model = BertForSequenceClassification.from_pretrained("bert-base-chinese", num_labels=8).to(device)
# weights_only=True restricts unpickling to tensors and plain containers, so a
# tampered checkpoint file cannot execute arbitrary code while being loaded.
# A state dict of tensors is all that load_state_dict needs.
state_dict = torch.load(
    "models/BERT_1211_132515/bert_epoch_10.pth",
    map_location=device,
    weights_only=True,
)
# Left as the last expression so the key-matching report is displayed.
model.load_state_dict(state_dict)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [3]:
# Read the precomputed embedding table. The first column is skipped
# (presumably the perfume name -- confirm against the CSV) and every remaining
# column is converted into one float matrix with one row per perfume.
df = pd.read_csv("data/1976_embeddings_bert.csv")
embeddings = df.iloc[:, 1:].to_numpy(dtype=float, copy=True)

In [4]:
def get_embedding(text, max_length=128):
    """Return the [CLS] embedding(s) of ``text`` from the model's BERT encoder.

    Parameters
    ----------
    text : str or sequence of str
        A single description, or a batch of descriptions.
    max_length : int, default 128
        Number of tokens every input is truncated / padded to.

    Returns
    -------
    numpy.ndarray
        Shape ``(hidden_size,)`` when ``text`` is a single string, and shape
        ``(len(text), hidden_size)`` when ``text`` is a sequence. This holds
        for a one-element sequence too, so the rank of the result depends
        only on the kind of input and never on the batch length.
    """
    # Remember whether the caller passed one string, so that only that case
    # drops the batch axis (a bare squeeze() would also collapse a
    # one-element batch).
    is_single = isinstance(text, str)

    model.eval()  # inference mode for the module (e.g. dropout off)
    with torch.no_grad():
        encoding = tokenizer(
            text,
            truncation=True,
            # Fixed-length padding is kept as-is; presumably it mirrors how
            # the stored embeddings were produced -- TODO confirm.
            padding='max_length',
            max_length=max_length,
            return_tensors='pt',
        )
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)
        # Call the encoder sub-module directly rather than the full
        # classification model, so hidden states are returned.
        outputs = model.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of every sequence is taken as the sentence vector.
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        if is_single:
            cls_embedding = cls_embedding[0]
        cls_embedding = cls_embedding.cpu().numpy()
    return cls_embedding

In [22]:
# Notes the user likes, grouped by fragrance layer (top / middle / base).
like_top_notes = ["橙", "葡萄柚"]
like_middle_notes = ["玫瑰", "天竺葵", "广藿香"]
like_base_notes = ["香根", "安息香", "甜椒", "西洋杉"]

# Join each layer's notes first, then assemble the single query sentence.
top_text = '、'.join(like_top_notes)
middle_text = '、'.join(like_middle_notes)
base_text = '、'.join(like_base_notes)
like_notes = f"前调：{top_text}；中调：{middle_text}；后调：{base_text}"

# Embed the query and give it a leading batch axis, as cosine_similarity
# expects 2-D inputs.
like_ids = get_embedding(like_notes)
user_embedding = np.reshape(like_ids, (1, -1))

# Only one query row was passed, so row 0 holds the score for every perfume.
similarities = cosine_similarity(user_embedding, embeddings)[0]

# Attach the scores to the table and rank from most to least similar.
df['similarity'] = similarities
df_sorted = df.sort_values('similarity', ascending=False)

# Show the ten best matches.
df_sorted.head(10)[['name', 'similarity']]

Unnamed: 0,name,similarity
465,Paco Rabanne XS Excess 超越男性淡香水,0.809833
192,Versace EROS 艾诺斯爱神男性淡香精版本 TESTER,0.807249
191,Versace EROS 艾诺斯爱神男性淡香精版本,0.807249
1535,Estee Lauder White Linen 雅诗兰黛白色亚麻女性淡香精,0.807225
205,Versace Eros Flame 凡赛斯爱神火焰男性淡香精 TESTER,0.802252
204,Versace Eros Flame 凡赛斯爱神火焰男性淡香精,0.802252
541,Tom Ford Grey Vetiver 灰色香根草男性淡香精(清新岩兰草),0.795109
148,Burberry Brit for Him 风格男性淡香水,0.792716
5,Kenzo Jungle 斑马男性淡香水(2025),0.792707
662,Cacharel pour L'homme 卡夏尔经典男性淡香水,0.792513
