# 推荐系统：根据输入的关键词推荐5本最相关的书籍

实现基于豆瓣top250图书评论的简单推荐系统

## TF-IDF

整体思路：
1. 数据清洗并过滤掉停用词，减少计算量 cn_stopwords.txt
2. 计算TF：每本书相当于一个“文档”，计算每本书的词频；计算总词频
3. 计算IDF：lg(总书本量/（1+包含该词的书本量）)
4. 计算TF-IDF，排序取前5本值最高的书：TF*IDF

In [17]:
import jieba
import pandas as pd
from collections import Counter
import math
from collections import defaultdict

In [None]:
# 数据预处理：评论存在换行，造成数据没有对齐列的信息
def process_comment_file(file_path):
    # 读取文件并处理换行问题
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()[1:]
    
    # 预处理：合并被换行分割的评论
    processed = []
    last_record = []
    for line in lines:
        parts = line.strip().split('\t')
        # 如果满足条件就直接添加，如果不满足就添加到上一条
        if len(parts) == 6:  # 应该有6个字段
            if last_record:
                processed.append(last_record)
            last_record = parts
        else:
            if last_record:
                last_record[5] += ' ' + line.strip()
    
    # 添加最后一条满足条件的记录
    if last_record:
        processed.append(last_record)
    
    # 转换为DataFrame
    df = pd.DataFrame(
        processed,
        columns=["book", "id", "star", "time", "likenum", "body"]
    )
    return df


In [15]:
comment = process_comment_file(r'F:\NLP算法课程\正式课\0319\语言模型及词向量相关知识\doubanbook_top250_comments.txt')
comment.describe()

Unnamed: 0,book,id,star,time,likenum,body
count,96253,96253,96253,96253,96253,96253
unique,233,29381,6,4470,603,92139
top,嫌疑人X的献身,xyws,allstar50,2010-02-09,0,经典
freq,970,95,34099,132,78378,63


In [None]:
# 1. 预处理：合并同一本书的所有评论
book_contents = comment.groupby('book')['body'].apply(lambda x: ' '.join(x)).to_dict()

# 停用词表
with open(r'F:\NLP算法课程\正式课\0319\语言模型及词向量相关知识\cn_stopwords.txt', 'r', encoding='utf-8') as f:
    stopwords = set(line.strip() for line in f)

# 2. 计算每本书每个词的词频（TF）
tf_dict = {}
for book, content in book_contents.items():
    words = jieba.lcut(content) # 得到分完词的list
    # 添加停用词和中文过滤
    filtered = [w for w in words 
               if w not in stopwords 
               and len(w) > 1  # 新增长度过滤
               and '\u4e00' <= w <= '\u9fff']  # 仅保留中文
    word_count = Counter(filtered)
    total_words = sum(word_count.values())
    tf_dict[book] = {word: count/total_words for word, count in word_count.items()}

# 3. 计算IDF
df_count = defaultdict(int) #遇到新词自动设置为0
total_books = len(book_contents)

for book in tf_dict:
    for word in tf_dict[book]:
        df_count[word] += 1

idf_dict = {word: math.log(total_books / (1 + count)) for word, count in df_count.items()}

# 4. 计算TF-IDF并推荐书籍
def recommend_by_tfidf(keyword):
    book_scores = []
    for book in tf_dict:
        tf = tf_dict[book].get(keyword, 0)
        idf = idf_dict.get(keyword, 0)
        book_scores.append((book, tf * idf))
    
    return sorted(book_scores, key=lambda x: x[1], reverse=True)[:5]


Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\czx\AppData\Local\Temp\jieba.cache
Loading model cost 0.907 seconds.
Prefix dict has been built successfully.


In [30]:
#使用示例
key_word = input('请输入关键词')
recommendations = recommend_by_tfidf(key_word)
print(f"关键词：{key_word}，基于TF-IDF的推荐结果：")
for book, score in recommendations:
    print(f"{book}（TF-IDF值：{score:.4f}）")

关键词：推荐，基于TF-IDF的推荐结果：
1995-2005夏至未至（TF-IDF值：-0.0000）
1Q84 BOOK 1（TF-IDF值：-0.0000）
1Q84 BOOK 3（TF-IDF值：-0.0000）
一个陌生女人的来信（TF-IDF值：-0.0000）
一個人住第5年（TF-IDF值：-0.0000）


心得：dict真好用

## BM25

In [32]:
# 设置超参数
k = 1.5
b = 0.75
# 计算BM25得分并推荐书籍
doc_lengths = {book: sum(word_count.values()) for book, word_count in tf_dict.items()} #总词数量化文档长度
avg_dl = sum(doc_lengths.values()) / len(doc_lengths) #平均文档长度

# 计算IDF
df_count = defaultdict(int)
total_books = len(book_contents)
for book in tf_dict:
    for word in tf_dict[book]:
        df_count[word] += 1

# 新公式：log[(N - n_t + 0.5) / (n_t + 0.5)]
idf_dict = {word: math.log( (total_books - count + 0.5) / (count + 0.5) ) 
            for word, count in df_count.items()}

def recommend_by_BM25(keyword):
    book_scores = []
    idf = idf_dict.get(keyword, 0)
    for book in tf_dict:
        tf = tf_dict[book].get(keyword, 0)
        dl = doc_lengths[book]
        
        # BM25计算公式
        numerator = tf * (k + 1)
        denominator = tf + k * (1 - b + b * (dl / avg_dl))
        bm25_score = idf * (numerator / (denominator + 1e-8))  # 防止除零
        
        book_scores.append((book, bm25_score))
    
    return sorted(book_scores, key=lambda x: x[1], reverse=True)[:5]

In [31]:
bm25_recommendations = recommend_by_BM25(key_word)
bm25_recommendations

[('1995-2005夏至未至', -0.0),
 ('1Q84 BOOK 1', -0.0),
 ('1Q84 BOOK 3', -0.0),
 ('一个陌生女人的来信', -0.0),
 ('一個人住第5年', -0.0)]