In [2]:
# 1. 实现基于豆瓣top250图书评论的简单推荐系统（TF-IDF及BM25两种算法实现）
# 2. 使用自定义的文档文本，通过fasttext训练word2vec训练词向量模型，并计算词汇间的相关度。（选做：尝试tensorboard绘制词向量可视化图）
# 3. 使用课堂示例cooking.stackexchange.txt，使用fasttext训练文本分类模型。（选做：尝试使用Kaggle中的Fake News数据集训练文本分类模型）
# https://www.kaggle.com/datasets/saurabhshahane/fake-news-classification

In [1]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import math

In [None]:
# 读取文件到comments
def load_comments(file_path):
    comments =[]
    try:
        with open(file_path , 'r' , encoding='utf-8') as file:
            for line in file:
                comment = line.strip()
                if comment:
                        comments.append(comment)
    except FileNotFoundError:
        print(f"错误:文件{file_path}未找到。")
    return comments

In [5]:
com = load_comments("doubanbook_top250_comments.txt")

In [13]:
def bm25(comments , k=1.5, b=0.75):
    # 计算文档总数
    N = len(comments)
    # 初始化文档长度列表和词频字典
    doc_length = []
    word_doc_freq = {}
    doc_term_dict = [{} for _ in range(N)]

    for i , comment in enumerate(comments):
        # 记录文档长度
        doc_length.append(len(comment.split()))
        unique_words = set()

        for word in comment.split():
            # 统计词频
            doc_term_dict[i][word] = doc_term_dict[i].get(word , 0) + 1
            # 上边这句的逻辑：i为当前文章索引，如果word在字典里面，获取+1，不在的话，设置为1
            unique_words.add(word)
            # 上边这局，是对当前文档，进行数据集

        for word in unique_words:
            word_doc_freq[word] = word_doc_freq.get(word,0) + 1
        # 这个代码段是计算所有文档中，当前单词的频数
    
    # 计算每个单词的平均文档长度
    avg_doc_len= sum(doc_length) / N

    # 构建词汇表
    vocabulary = list(word_doc_freq.keys())
    word_index = {word:idx for idx, word in enumerate(vocabulary)}

    # 构建文档、词频矩阵
    doc_term_matrix = np.zeros((N,len(vocabulary)))
    for i in range(N):
        for word , freq in doc_term_dict[i].items():
            idx = word_index.get(word)
            if idx is not None:
                doc_term_matrix[i, idx] = freq

    # 计算idf值
    idf_numerator = N - np.array([word_doc_freq[word] for word in vocabulary]) + 0.5
    idf_denominator = np.array([word_doc_freq[word] for word in vocabulary]) + 0.5
    idf = np.log(idf_numerator / idf_denominator)
    idf[idf_numerator <= 0] = 0 # 避免出现nan值

    # 计算bm25值
    doc_length = np.array(doc_length)
    bm25_matrix = np.zeros((N, len(vocabulary)))
    for i in range(N):
        tf = doc_term_matrix[i]
        bm25 = idf * (tf * (k + 1)) / (tf + k * (1 - b + b * doc_length[i] / avg_doc_len))
        bm25_matrix[i] = bm25
    
    return bm25_matrix



In [11]:
def recommend_books(comments , query , method = 'tfidf'):
    if method == 'tfidf':
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(comments)
        query_vector = vectorizer.transform([query])
        scores = (tfidf_matrix * query_vector.T).toarray().flatten()
    elif method == 'bm25':
        bm25_matrix = bm25(comments)
        vectorizer = TfidfVectorizer()
        vectorizer.fit(comments)
        query_vector = vectorizer.transform([query]).toarray()[0]
        vocabulary = vectorizer.get_feature_names_out()
        word_index = {word: idx for idx, word in enumerate(vocabulary)}
        scores = []
        for i in range(len(comments)):
            score = 0
            for word in query.split():
                if word in word_index:
                    idx = word_index[word]
                    score += bm25_matrix[i][idx]
            scores.append(score)
    else:
        print("不支持的方法，请选择 'tfidf' 或 'bm25'。")
        return []

    ranked_indices = np.argsort(scores)[::-1]
    return ranked_indices

In [14]:
if __name__ == "__main__":
    file_path = "doubanbook_top250_comments.txt"
    comments = load_comments(file_path)
    query = "一本关于历史的好书"
    tfidf_recommendations = recommend_books(comments, query, method='tfidf')
    bm25_recommendations = recommend_books(comments, query, method='bm25')

    print("TF - IDF 推荐结果（前5个）:")
    for i in range(min(5, len(tfidf_recommendations))):
        print(comments[tfidf_recommendations[i]])

    print("\nBM25 推荐结果（前5个）:")
    for i in range(min(5, len(bm25_recommendations))):
        print(comments[bm25_recommendations[i]])

MemoryError: Unable to allocate 110. GiB for an array with shape (99257, 148673) and data type float64

In [16]:
# 使用自定义的文档文本，通过fasttext训练word2vec训练词向量模型，并计算词汇间的相关度。（选做：尝试tensorboard绘制词向量可视化图）
import fasttext
import fasttext.util

ModuleNotFoundError: No module named 'fasttext'

In [17]:
# 自定义文档文本
documents = [
    "我喜欢阅读书籍",
    "书籍是知识的源泉",
    "阅读能开阔视野",
    "知识改变命运"
]

# 将文档写入临时文件，fastText 训练需要从文件读取数据
with open('custom_text.txt', 'w', encoding='utf-8') as f:
    for doc in documents:
        f.write(doc + '\n')

# 训练 fastText 模型
model = fasttext.train_unsupervised('custom_text.txt', model='skipgram')

# 保存模型
model.save_model("custom_fasttext_model.bin")

# 加载模型
loaded_model = fasttext.load_model("custom_fasttext_model.bin")

# 计算词汇间的相关度
word1 = "阅读"
word2 = "书籍"

# 获取词向量
vec1 = loaded_model.get_word_vector(word1)
vec2 = loaded_model.get_word_vector(word2)

# 计算余弦相似度来衡量相关度
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

vec1 = np.array(vec1).reshape(1, -1)
vec2 = np.array(vec2).reshape(1, -1)
similarity = cosine_similarity(vec1, vec2)[0][0]

print(f"词汇 '{word1}' 和 '{word2}' 的相关度为: {similarity}")

NameError: name 'fasttext' is not defined