In [2]:
import csv
import jieba 
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import bm25_code as bm25

# Stop words to drop from tokenized comments, read one per line from
# stopwords.txt. The file is opened in a `with` block so the handle is closed
# deterministically, and the words are kept in a set so the per-token
# membership test is O(1) instead of a linear scan of a list.
with open('stopwords.txt', 'r', encoding='utf-8') as stop_file:
    stop_words = {line.strip() for line in stop_file}

# Maps a book title to the list of its tokenized comments (one token list per
# comment).
book_comments = {}

def load_data(file_name):
    """Read a tab-separated comment file and group tokenized comments by book.

    The file must have a header row containing at least the columns ``book``
    and ``body``. Each comment body is segmented with ``jieba.lcut`` and
    tokens found in the module-level ``stop_words`` are dropped.

    Args:
        file_name: Path of the UTF-8 encoded, tab-delimited file to read.

    Returns:
        The module-level ``book_comments`` dict, mapping each book title to a
        list of token lists (one per comment). The dict is updated in place,
        so repeated calls accumulate into the same mapping.

    Raises:
        KeyError: If the header lacks a ``book`` or ``body`` column.
    """
    # Build the lookup set once so every token test is O(1), whatever
    # collection type the module-level stop_words happens to be.
    stop_set = set(stop_words)

    # newline='' lets the csv module do its own line-ending handling, as the
    # csv documentation requires for files passed to a reader.
    with open(file_name, 'r', encoding='utf-8', newline='') as file:
        reader = csv.DictReader(file, delimiter='\t')
        for row in reader:
            book = row['book']
            if not book:
                # Rows without a book title cannot be attributed; skip them.
                continue

            # DictReader fills fields missing from a short row with None;
            # treat that like an empty comment instead of crashing in jieba.
            comment = row['body'] or ''

            words = [word for word in jieba.lcut(comment) if word not in stop_set]
            book_comments.setdefault(book, []).append(words)
    return book_comments

# Script entry point: build one BM25 vector per book from all of its comments,
# then print the ten books whose vectors are most cosine-similar to the book
# the user names.
if __name__ == '__main__':

    book_comments = load_data("doubanfix_comments.txt")

    # Flatten every book's comments into a single token list ("document");
    # book_names[i] is the title of book_data[i].
    book_names = []
    book_data = []
    for book, comments in book_comments.items():
        book_names.append(book)
        merged_doc = [word for comment in comments for word in comment]
        book_data.append(merged_doc)

    # Compute the BM25 matrix (one row per book).
    bm25_matrix = bm25.bm25(book_data)

    # Compute the pairwise cosine similarity matrix between books.
    similarity_matrix = cosine_similarity(bm25_matrix)

    # Strip stray whitespace so a trailing space does not cause a false miss.
    book_name = input("请输入书名：").strip()
    if book_name not in book_names:
        # An unknown title would otherwise surface as an uncaught ValueError
        # from list.index().
        print(f"未找到书籍《{book_name}》")
    else:
        idx = book_names.index(book_name)
        scores = similarity_matrix[idx]
        # Exclude the queried book by index rather than by assuming it is
        # ranked first: with ties or an all-zero vector it may not be, and a
        # blind [1:11] slice would then recommend the book to itself.
        similar_indices = [i for i in np.argsort(-scores) if i != idx][:10]
        print(f"为您推荐《{book_name}》的相似书籍：")
        for i in similar_indices:
            print(f"《{book_names[i]}》 相似度：{scores[i]:.4f}")
            print()
    

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\海洋\AppData\Local\Temp\jieba.cache
Loading model cost 0.799 seconds.
Prefix dict has been built successfully.


为您推荐《红楼梦》的相似书籍：
《那些回不去的年少时光》 相似度：0.0719

《1984》 相似度：0.0703

《Harry Potter and the Deathly Hallows》 相似度：0.0702

《哈利·波特与魔法石》 相似度：0.0660

《活着》 相似度：0.0651

《送你一颗子弹》 相似度：0.0644

《茶花女》 相似度：0.0638

《步步惊心》 相似度：0.0637

《拆掉思维里的墙》 相似度：0.0630

《1995-2005夏至未至》 相似度：0.0613

