In [None]:
import os
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

import torch
from transformers import BertTokenizer, BertForSequenceClassification

In [None]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Tokenizer and architecture both come from the "bert-base-chinese" checkpoint;
# the classification head is sized for 21 labels.
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-chinese",
    num_labels=21,
)
model = model.to(device)

# Overwrite the pretrained weights with the saved checkpoint.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source (consider weights_only=True if the installed torch supports it).
state_dict = torch.load(
    "models/NoteEmbedding_1211_200624/epoch_10.pth",
    map_location=device,
)
model.load_state_dict(state_dict)

In [None]:
# Table of stored note embeddings, one row per note.
df = pd.read_csv("data/note_embedding.csv")

# Everything after the first column is taken as the embedding values and
# converted to a float matrix (presumably the first column is the note text —
# confirm against the CSV).
embedding_columns = df.iloc[:, 1:]
embeddings = embedding_columns.values.astype(float)

In [None]:
def get_embedding(note):
    """Encode ``note`` with the BERT encoder and return its first-token vector.

    The text is tokenized to a fixed length of 64 tokens (longer input is
    truncated, shorter input is padded) and passed through ``model.bert`` —
    the encoder only, not the classification head — with gradient tracking
    disabled. The hidden state at sequence position 0 (the ``[CLS]`` slot) is
    returned.

    Args:
        note: Text to embed; handed to the tokenizer unchanged.

    Returns:
        A NumPy array on the CPU. ``squeeze()`` drops every size-1 dimension,
        so a single string comes back as a flat 1-D vector rather than a
        one-row matrix.
    """
    # Put the model in evaluation mode before running inference.
    model.eval()

    batch = tokenizer(
        note,
        truncation=True,
        padding='max_length',
        max_length=64,
        return_tensors='pt',
    )

    with torch.no_grad():
        encoder_output = model.bert(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )
        # Keep only the hidden state at position 0 of every sequence.
        first_token_state = encoder_output.last_hidden_state[:, 0, :]

    return first_token_state.squeeze().cpu().numpy()

In [None]:
# Query note whose nearest neighbours we want to list.
like_note = "玫瑰"
like_embedding = get_embedding(like_note)

# cosine_similarity only accepts 2-D inputs, but get_embedding returns a flat
# vector for a single note; passing it as-is raises
# "ValueError: Expected 2D array, got 1D array instead".
# Present the query as a one-row matrix, then take that row of the result.
similarities = cosine_similarity(like_embedding.reshape(1, -1), embeddings)[0]

# Attach the scores to the notes table and show the ten closest matches.
df['similarity'] = similarities
df_sorted = df.sort_values(by='similarity', ascending=False)
df_sorted[['note', 'similarity']].head(10)