In [None]:
### 1. 实现基于豆瓣top250图书评论的简单推荐系统（TF-IDF及BM25两种算法实现）

In [1]:
# 导包
import csv
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm
import os
from rank_bm25 import BM25Okapi

In [2]:
# 数据集处理
def get_new_book(oldFileName, newFileName):
    if os.path.exists(newFileName):
        return
    new_f = open(newFileName, 'w')
    lines = [line for line in open(oldFileName, 'r')]
    for i, line in enumerate(tqdm(lines)):
        line = line.strip()
        # 标题行
        if i == 0:
            new_f.write(line + '\n')
            pre_line = ''
            continue
        
        # 记录首行 or 同一本书换行情况
        if not pre_line:
            pre_line = line
            continue
        
        curr_lines = line.split('\t')
        pre_lines = pre_line.split('\t')
        # 同一本书 and 未换行
        if curr_lines[0] == pre_lines[0]:
            new_f.write(pre_line + '\n')
            pre_line = line
        # 不同书 or 同一本书换行了
        else:
            # 不同书
            if len(curr_lines) == 6:
                new_f.write(pre_line + '\n')
                pre_line = line
            # 同一本书换行了
            else:
                pre_line += line
                
    new_f.close() 

In [3]:
# 分词获取
def get_book_comments(fileName):
    book_comments = {}
    with open(fileName, 'r') as f:
        lines = csv.DictReader(f, delimiter='\t')
        for line in lines:
            book_name = line.get('book', '')
            comments = line.get('body', '')
            
            if not book_name or not comments : 
                continue
            
            comments_words = jieba.lcut(comments)
            book_comments.setdefault(book_name, []).extend(comments_words)
    return book_comments
            

In [4]:
# TF-IDF算法-计算推荐书籍
def tfidf_recommendation(stopWords, book_comments, in_book_name):
    book_name_list = list(book_comments.keys())
    # 构建IF-IDF矩阵（每个元素都是词典每个词汇在当前书籍中的TF-IDF值）
    tfidfVectorizer = TfidfVectorizer(stop_words=stopWords)
    tfidfVectorizer.fit
    tfidf_matrix = tfidfVectorizer.fit_transform([' '.join(book_comments[book_name]) for book_name in book_name_list])

    # 计算TF-IDF余弦相似度（每个元素都是当前书籍和其他书籍向量余弦值）
    similaritys = cosine_similarity(tfidf_matrix)
    book_idx = book_name_list.index(in_book_name)
    similarity_row = similaritys[book_idx]
    
    top_10_similar_books = np.argsort(similarity_row)[::-1][1:11]
    
    top_10_similar_books = [book_name_list[i] for i in top_10_similar_books]
    return top_10_similar_books

In [5]:
# BM25-计算推荐书籍
def bm25_recommendation(stopWords, book_comments, in_book_name):
    book_name_list = list(book_comments.keys())
    all_comments = [' '.join(book_comments[book_name]) for book_name in book_name_list]
    bm25 = BM25Okapi([word for word in all_comments if word not in stopWords])
    
    # 获取输入书籍的评论并分词
    in_book_comment = ' '.join(book_comments[in_book_name])
    in_book_tokens = [word for word in jieba.lcut(in_book_comment) if word not in stopWords]
    
    # 计算输入书籍与其他书籍的BM25得分
    scores = bm25.get_scores(in_book_tokens)
    
    top_10_similar_books = np.argsort(scores)[::-1][1:11]
    
    top_10_similar_books = [book_name_list[i] for i in top_10_similar_books]
    return top_10_similar_books

In [6]:
# 找相近书籍
if __name__ == '__main__':
    oldFileName = "doubanbook_top250_comments.txt"
    newFileName = "doubanbook_top250_comments_new.txt"
    # 书籍内容格式化
    get_new_book(oldFileName, newFileName)
    
    # 加载分词数据集
    book_comments = get_book_comments(newFileName)
    book_name_list = list(book_comments.keys())
    
    # 停用词处理
    stopWords = [word for word in open('stopwords.txt', 'r')]
    # 查找任意一本书籍相似的排名前10的书籍
    in_book_name = '明朝那些事儿（贰）'
   
   # 使用TF - IDF算法进行推荐
    tfidf_result = tfidf_recommendation(stopWords, book_comments, in_book_name)
    print("TF - IDF推荐结果(图书，按相关性从高到低):", tfidf_result)
    # 使用BM25算法进行推荐
    bm25_result = bm25_recommendation(stopWords, book_comments, in_book_name)
    print("BM25推荐结果(图书，按相关性从高到低):", tfidf_result)


Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\uchonor\AppData\Local\Temp\jieba.cache
Loading model cost 1.164 seconds.
Prefix dict has been built successfully.


TF - IDF推荐结果(图书，按相关性从高到低): ['明朝那些事儿（壹）', '明朝那些事儿（1-9）', '万历十五年', '明朝那些事儿（叁）', '明朝那些事儿（柒）：大结局', '明朝那些事儿（陆）', '明朝那些事儿（肆）', '人类简史', '明朝那些事儿（伍）', '穆斯林的葬礼']
BM25推荐结果(图书，按相关性从高到低): ['明朝那些事儿（壹）', '明朝那些事儿（1-9）', '万历十五年', '明朝那些事儿（叁）', '明朝那些事儿（柒）：大结局', '明朝那些事儿（陆）', '明朝那些事儿（肆）', '人类简史', '明朝那些事儿（伍）', '穆斯林的葬礼']
