#### 处理数据

In [1]:
import csv
import jieba

def load_data(file_pat):
    book_comments = {}
    with open(file_pat, 'r') as f:
        reader = csv.DictReader(f, delimiter='\t')
        for item in reader:
            book = item['book']
            comment = item['body']
            comments_words = jieba.lcut(comment)

            if book == '': continue

            book_comments[book] = book_comments.get(book, [])
            book_comments[book].extend(comments_words)

    return book_comments
        


In [2]:
def load_stop_words(file_path):
    stop_words = []  
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            stop_words.append(line.strip())  
    return stop_words  

#### TF-IDF算法

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def tfidf_recommend(book_comments, stop_words):

    books = list(book_comments.keys())
    comments = [' '.join(book_comments[book]) for book in books]

    vectorizer = TfidfVectorizer(stop_words=stop_words)
    tfidf_matrix = vectorizer.fit_transform(comments)

    cosine_similarities = cosine_similarity(tfidf_matrix)

    results = {}
    for idx, book in enumerate(books):
        similar_indices = cosine_similarities[idx].argsort()[:-6:-1]    # 返回前5个最相似的书籍
        similar_books = [(books[i]) for i in similar_indices if i != idx]   
        results[book] = similar_books

    return results

#### BM25算法

In [4]:
from rank_bm25 import BM25Okapi as BM25


def BM25_recommend(book_comments, stop_words):

    books = list(book_comments.keys())
    comments = [book_comments[book] for book in books]

    bm25 = BM25(corpus=comments)

    results = {}
    for idx, book in enumerate(books):
        scores = bm25.get_scores(comments[idx])
        similar_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:5]
        similar_books = [(books[i]) for i in similar_indices if i != idx]
        results[book] = similar_books

    return results

#### 推荐比较

In [5]:
if __name__ == '__main__':
        
        book_comments = load_data('data/comments_fixed.txt')
        stop_words = load_stop_words('data/stopwords.txt')
    
        tfidf_results = tfidf_recommend(book_comments, stop_words)
        BM25_results = BM25_recommend(book_comments, stop_words)
    
        for book in tfidf_results:
            print('book:', book)
            print('tfidf:', tfidf_results[book])
            print('BM25:', BM25_results[book])
            print('====================')

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\sgp\AppData\Local\Temp\jieba.cache
Loading model cost 0.901 seconds.
Prefix dict has been built successfully.


book: 天才在左 疯子在右
tfidf: ['三体', '人类简史', '拆掉思维里的墙', '少有人走的路']
BM25: ['三体', '人类简史', '三体Ⅲ', '白夜行']
book: 1Q84 BOOK 1
tfidf: ['悲伤逆流成河', '1Q84 BOOK 2', '1Q84 BOOK 3', '白夜行']
BM25: ['白夜行', '1995-2005夏至未至', '茶花女', '局外人']
book: 悲伤逆流成河
tfidf: ['1Q84 BOOK 1', '左手倒影，右手年华。', '会有天使替我爱你', '幻城']
BM25: ['梦里花落知多少', '小时代1.0折纸时代', '1Q84 BOOK 1', '幻城']
book: 恶意
tfidf: ['放学后', '解忧杂货店', '嫌疑人X的献身', '白夜行']
BM25: ['嫌疑人X的献身', '白夜行', '解忧杂货店', '幻夜']
book: Harry Potter and the Deathly Hallows
tfidf: ['哈利·波特与魔法石', '哈利·波特与火焰杯', '哈利·波特与密室', '哈利·波特与凤凰社']
BM25: ['哈利·波特与魔法石', '哈利·波特与火焰杯', '哈利·波特与阿兹卡班的囚徒', '哈利·波特与凤凰社']
book: 长安乱
tfidf: ['像少年啦飞驰', '零下一度', '三重门', '1988：我想和这个世界谈谈']
BM25: ['一座城池', '1988：我想和这个世界谈谈', '他的国', '三重门']
book: 苏菲的世界
tfidf: ['不能承受的生命之轻', '遇见未知的自己', '人类简史', '如何阅读一本书']
BM25: ['挪威的森林', '遇见未知的自己', '人类简史', '三体']
book: 许三观卖血记
tfidf: ['活着', '兄弟（上）', '兄弟（下）', '平凡的世界（全三部）']
BM25: ['活着', '兄弟（下）', '兄弟（上）', '骆驼祥子']
book: 1995-2005夏至未至
tfidf: ['1Q84 BOOK 1', '白夜行', '茶花女', '局外人']
BM25: ['1Q84 BOOK 1', '茶花女', '白夜行', '