In [1]:
pip install torch

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install transformers

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import torch  
from transformers import BertTokenizer, BertModel  
from torch.utils.data import DataLoader  

# Dataset configuration. The CSV is assumed to carry Chinese column headers:
# one column holding the keywords and one holding the book title.
title_column = '书名'      # header of the book-title column
keyword_column = '关键词'  # header of the keyword column
file_path = '中文图书数据集.csv'  # a plain path string, relative to the working directory

# Load the whole file into a DataFrame; the file is assumed to be GBK-encoded.
df = pd.read_csv(filepath_or_buffer=file_path, encoding='GBK')

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Pick the compute device: CUDA when PyTorch reports a usable GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [5]:
# Load the Chinese BERT checkpoint; the tokenizer and the encoder are fetched under the same name.
checkpoint = 'bert-base-chinese'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertModel.from_pretrained(checkpoint)

# Move the model onto the selected device. Left as the cell's final expression,
# so the notebook also displays the module structure.
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(21128, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [6]:
# Embed every keyword entry with BERT, producing one vector per DataFrame row
# in the same order as df, so that a position in this list maps back to a row.
keyword_embeddings = []
batch_size = 32  # rows encoded per forward pass; lower this if memory is tight

# pandas reads empty cells as NaN (and may parse numeric-looking cells as numbers),
# which the tokenizer rejects. Coerce everything to text and encode missing cells
# as empty strings so every row still gets an embedding and alignment is kept.
keyword_texts = ["" if pd.isna(value) else str(value) for value in df[keyword_column]]

for start in range(0, len(keyword_texts), batch_size):
    batch_texts = keyword_texts[start:start + batch_size]

    # padding=True pads the batch to its longest entry; the attention mask returned
    # by the tokenizer is passed to the model together with the token ids.
    inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}  # same device as the model

    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)
        last_hidden_states = outputs.last_hidden_state

        # The hidden state at position 0 (the [CLS] token) represents each entry.
        # Move to the CPU before converting to NumPy.
        cls_vectors = last_hidden_states[:, 0, :].cpu().numpy()

    # Extending a list with a 2-D array appends one 1-D vector per row.
    keyword_embeddings.extend(cls_vectors)

In [7]:
# Turn the list of per-row vectors into a single NumPy array, which is the form
# handed to cosine_similarity later in the notebook.
import numpy as np
keyword_embeddings = np.asarray(keyword_embeddings)

In [14]:
# Embed the user-supplied query keyword with the same tokenizer and model.
input_keyword = "中医"  # replace with the keyword you want to search for

# Tokenize, then move every returned tensor onto the model's device.
input_inputs = tokenizer(
    input_keyword,
    return_tensors="pt",
    padding=True,
    truncation=True,
)
input_inputs = {name: tensor.to(device) for name, tensor in input_inputs.items()}

with torch.no_grad():  # inference only
    input_outputs = model(**input_inputs)
    # Hidden state of the first sequence at position 0 (the [CLS] token),
    # brought back to the CPU as a NumPy vector.
    cls_state = input_outputs.last_hidden_state[0, 0, :]
    input_embedding = cls_state.cpu().numpy()
  

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between the query vector and every keyword vector.
# The query is reshaped to a single row, so the result has one row of scores.
similarities = cosine_similarity(input_embedding.reshape(1, -1), keyword_embeddings)

# Position of the best score; with a single query row the flat argmax is the column position.
max_similarity_idx = int(np.argmax(similarities))

# The argmax is a position in the embedding matrix, not an index label, so look the
# title up positionally; label-based .loc is only right when df has the default 0..n-1 index.
most_similar_title = df[title_column].iloc[max_similarity_idx]
print(f"与输入关键词最相似的书名是: {most_similar_title}")

与输入关键词最相似的书名是: 甲骨文化与中医学
