#### 1. 实现基于豆瓣top250图书评论的简单推荐系统（TF-IDF及BM25两种算法实现）


In [1]:
import csv
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer  #feature_extraction 特征提取
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [4]:
# 加载文件
def load_data(filename, encoding='utf-8'):
    """Load tab-separated book reviews and group their tokens by book title.

    The file must start with a header row that names at least the ``book``
    and ``body`` columns. Every review body is segmented with ``jieba.lcut``
    and the tokens are appended to the token list of the review's book.
    The list of collected titles is printed before returning.

    Args:
        filename: Path of the tab-delimited review file.
        encoding: Text encoding used to read the file.

    Returns:
        A dict mapping each book title to one flat list holding the tokens
        of all of its reviews, in file order.
    """
    book_comments = {}  # {title: [token, token, ...]}

    # newline='' is what the csv module asks for so that it can handle line
    # endings itself; the explicit encoding avoids depending on the locale.
    with open(filename, 'r', encoding=encoding, newline='') as f:
        reader = csv.DictReader(f, delimiter='\t')  # first row is the header
        for item in reader:
            book = item['book']
            # Skip rows without a title before paying for tokenisation.
            if not book:
                continue

            # DictReader fills missing trailing columns with None.
            comment = item.get('body') or ''
            book_comments.setdefault(book, []).extend(jieba.lcut(comment))

    # list(...) builds the list of titles; list[...] would only create a
    # generic-alias object and print its repr.
    book_list = list(book_comments)
    print(book_list)
    return book_comments

In [6]:
#构建TF-IDF特征矩阵
#分析矩阵的每一行的非0数值和所有行做余弦相似度的计算

def tf_idf(bookname, top_n=10,
           stopwords_path="李思佳/week05/stopwords.txt",
           data_path="李思佳/week05/doubanbook_fixed.txt"):
    """Print the books whose reviews are most similar to those of *bookname*.

    Each book's review tokens are joined into one document, the documents
    are turned into TF-IDF vectors, and the other books are ranked by the
    cosine similarity of their vector to the vector of *bookname*.

    Args:
        bookname: Title of the book to find neighbours for; must match a
            title in the data file exactly.
        top_n: Maximum number of recommendations to print.
        stopwords_path: UTF-8 text file with one stop word per line.
        data_path: Tab-delimited review file passed to ``load_data``.

    Raises:
        ValueError: If *bookname* does not occur in the data file.
    """
    # Read the stop words inside a context manager so the handle is closed.
    with open(stopwords_path, 'r', encoding="utf-8") as f:
        stop_words = [line.strip() for line in f]

    # Load the tokenised reviews, keyed by book title.
    book_comments = load_data(data_path)
    print(len(book_comments))

    # Fail early, with a readable message, before building any matrix.
    if bookname not in book_comments:
        raise ValueError(f"未找到图书：{bookname}")

    # Dict order is stable, so names and documents stay aligned by index.
    book_names = list(book_comments)
    documents = [' '.join(tokens) for tokens in book_comments.values()]
    book_idx = book_names.index(bookname)

    # One TF-IDF row per book.
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    tfidf_matrix = vectorizer.fit_transform(documents)

    # Only the queried book's row of the similarity matrix is needed.
    scores = cosine_similarity(tfidf_matrix[book_idx], tfidf_matrix)[0]

    # Drop the queried book by index rather than assuming it sorts first:
    # ties or an all-zero vector can place another book ahead of it.
    ranked = [idx for idx in np.argsort(-scores) if idx != book_idx]
    for idx in ranked[:max(top_n, 0)]:
        print(f" 《{book_names[idx]}》\t 相似度：{scores[idx]:.4f}")


In [None]:
# Ask for the title to look up. The title is matched exactly, so strip stray
# leading/trailing whitespace from what the user typed.
book_name = input("请输入图书名称：").strip()
tf_idf(book_name)
