#### 1. 实现基于豆瓣top250图书评论的简单推荐系统（TF-IDF及BM25两种算法实现）


In [34]:
import csv
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer  #feature_extraction 特征提取
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [35]:
# Load the review file
def load_data(filename):
    """Read a tab-separated review file and group review tokens by book.

    The file must start with a header row that names at least the columns
    ``book`` and ``body``. Each review body is segmented with ``jieba.lcut``
    and the tokens are appended to the token list of the review's book.

    Args:
        filename: Path to the UTF-8 encoded, tab-delimited review file.

    Returns:
        A dict mapping each book title to one flat list holding the tokens
        of all of its reviews, in file order.

    Raises:
        KeyError: If the header row has no ``book`` or ``body`` column.
    """
    # {book title: [tokens of review 1, tokens of review 2, ...]}
    book_comments = {}

    # The encoding is given explicitly so the result does not depend on the
    # platform default; newline='' lets the csv module handle line endings.
    with open(filename, 'r', encoding='utf-8', newline='') as f:
        # DictReader takes the column names from the header row.
        reader = csv.DictReader(f, delimiter='\t')
        for item in reader:
            book = item['book']
            # Skip rows without a title before paying for segmentation.
            if not book:
                continue

            # DictReader fills the missing fields of a short row with None.
            comment = item['body'] or ''
            book_comments.setdefault(book, []).extend(jieba.lcut(comment))
    return book_comments

In [36]:
# Build a TF-IDF feature matrix over the books, then rank every book by the
# cosine similarity between its row and the row of the requested book.

def tf_idf(
    bookname,
    data_path="/Users/peiqi/code/AiPremiumClass/李思佳/week05/doubanbook_fixed.txt",
    stopwords_path="/Users/peiqi/code/AiPremiumClass/李思佳/week05/stopwords.txt",
    top_n=10,
):
    """Print the books whose reviews are most similar to those of ``bookname``.

    All review tokens of a book are joined into one space-separated document,
    the documents are vectorized with ``TfidfVectorizer`` and the other books
    are ranked by cosine similarity to the requested one.

    Args:
        bookname: Title of the book to find recommendations for.
        data_path: Review file passed to ``load_data``.
        stopwords_path: UTF-8 text file with one stop word per line.
        top_n: Maximum number of recommendations to print.

    Returns:
        None. One line per recommendation is written to stdout.

    Raises:
        ValueError: If ``bookname`` does not occur in the review file.
    """
    # Load the stop-word list; the context manager closes the file handle.
    with open(stopwords_path, 'r', encoding="utf-8") as f:
        stop_words = [line.strip() for line in f]

    # Load the review tokens grouped by book.
    book_comments = load_data(data_path)
    book_names = list(book_comments)

    # Fail fast, before the expensive vectorization, on an unknown title.
    if bookname not in book_comments:
        raise ValueError(f"book {bookname!r} not found in {data_path}")
    book_idx = book_names.index(bookname)

    # Build the TF-IDF matrix: one row per book, one column per term.
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    tfidf_matrix = vectorizer.fit_transform(
        [' '.join(words) for words in book_comments.values()]
    )

    # Only the similarities of the requested book are needed, so compare its
    # single row against all rows instead of building the full N x N matrix.
    scores = cosine_similarity(
        tfidf_matrix[book_idx:book_idx + 1], tfidf_matrix
    ).ravel()

    # Drop the query book by index: it is not guaranteed to sort first
    # (ties, or an all-zero vector whose self-similarity is 0).
    ranked = [idx for idx in np.argsort(-scores, kind="stable") if idx != book_idx]
    for idx in ranked[:top_n]:
        print(f" 《{book_names[idx]}》\t 相似度：{scores[idx]:.4f}")


In [37]:
def bm25(comments, k=1.5, b=0.75):
    """Compute a document-by-term matrix of BM25 weights.

    Every row describes one document and every column one vocabulary term,
    so column ``j`` refers to the same term in all rows. That alignment is
    what makes the rows comparable, e.g. with cosine similarity.

    Args:
        comments: Sequence of tokenized documents (each a sequence of tokens).
        k: Term-frequency saturation parameter of BM25.
        b: Strength of the document-length normalization (0 disables it).

    Returns:
        A float ``numpy.ndarray`` of shape ``(len(comments), vocabulary_size)``.
        Columns are ordered by first appearance of the term. The vocabulary
        size is 0 when ``comments`` is empty or contains no tokens.
    """
    n_docs = len(comments)

    # Per-document term counts, plus a column index for every distinct term.
    word_index = {}
    doc_counts = []
    for doc in comments:
        counts = {}
        for word in doc:
            counts[word] = counts.get(word, 0) + 1
        for word in counts:
            word_index.setdefault(word, len(word_index))
        doc_counts.append(counts)

    # No tokens at all: nothing to weight, and the average document length
    # would be zero (or undefined for an empty corpus).
    if not word_index:
        return np.zeros((n_docs, 0))

    # Raw term-frequency matrix; it is overwritten with BM25 weights below.
    weights = np.zeros((n_docs, len(word_index)))
    for row, counts in enumerate(doc_counts):
        if counts:
            cols = [word_index[word] for word in counts]
            weights[row, cols] = list(counts.values())

    # Number of documents that contain each term.
    doc_freq = np.count_nonzero(weights, axis=0)

    # log((N - df + 0.5) / (df + 0.5)). The numerator is at least 0.5 because
    # df <= N, so the log is always defined; the value is negative for terms
    # that occur in more than half of the documents.
    idf = np.log((n_docs - doc_freq + 0.5) / (doc_freq + 0.5))

    # Length normalization term of the BM25 denominator, one value per document.
    doc_lengths = np.array([len(doc) for doc in comments], dtype=float)
    avg_doc_len = doc_lengths.sum() / n_docs
    length_norm = k * (1 - b + b * doc_lengths / avg_doc_len)

    # Row by row to keep temporaries small. Each row is a view into
    # ``weights``, so the slice assignment updates the matrix in place.
    for tf_row, norm in zip(weights, length_norm):
        denominator = tf_row + norm
        tf_row[:] = np.divide(
            idf * tf_row * (k + 1),
            denominator,
            out=np.zeros_like(tf_row),
            where=denominator != 0,  # 0/0 (empty document with b == 1) stays 0
        )

    return weights

In [38]:
def bm25_reccommend(
    book,
    data_path="/Users/peiqi/code/AiPremiumClass/李思佳/week05/doubanbook_fixed.txt",
    top_n=10,
):
    """Print the books whose reviews are most similar to those of ``book``.

    The review tokens of every book are turned into a row of the matrix
    returned by ``bm25`` and the other books are ranked by cosine similarity
    to the row of the requested book.

    Args:
        book: Title of the book to find recommendations for.
        data_path: Review file passed to ``load_data``.
        top_n: Maximum number of recommendations to print.

    Returns:
        None. One line per recommendation is written to stdout.

    Raises:
        ValueError: If ``book`` does not occur in the review file.
    """
    book_comments = load_data(data_path)

    # Titles and token lists in matching order. No loop variable may be
    # called ``book``: that would overwrite the requested title and the
    # lookup below would always resolve to the last book in the file.
    book_names = list(book_comments)
    book_comms = list(book_comments.values())

    # Fail fast, before the expensive matrix construction, on an unknown title.
    if book not in book_comments:
        raise ValueError(f"book {book!r} not found in {data_path}")
    book_idx = book_names.index(book)

    # Build the BM25 matrix, one row per book.
    bm25_matrix = bm25(book_comms)

    # Only the similarities of the requested book are needed, so compare its
    # single row against all rows instead of building the full N x N matrix.
    target_similarity = cosine_similarity(
        bm25_matrix[book_idx:book_idx + 1], bm25_matrix
    ).ravel()

    # Drop the query book by index: it is not guaranteed to sort first
    # (ties, or an all-zero vector whose self-similarity is 0).
    ranked = [
        idx
        for idx in np.argsort(-target_similarity, kind="stable")
        if idx != book_idx
    ]
    for idx in ranked[:top_n]:
        print(f" 《{book_names[idx]}》\t 相似度：{target_similarity[idx]:.4f}")

In [None]:
# Demo: run both recommenders for the same book so their output can be compared.
query_title = "天才在左 疯子在右"

print("基于tf-idf模型推荐：")
tf_idf(query_title)

print("基于bm25模型推荐：")
bm25_reccommend(query_title)