In [22]:
# recommendation system
import numpy as np
from tqdm import tqdm 
import csv
import jieba # 分词
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import math

In [13]:
# 清洗数据，有可能有断行
fixed_file = open("doubanbook_top250_comments_fixed.txt", "w", encoding = "utf-8")
lines = [line for line in open("doubanbook_top250_comments.txt", "r", encoding = "utf-8").readlines()]
print(len(lines))
for i, line in tqdm(enumerate(lines)):
    #提取书名和评论，保存标题列
    if i == 0:
        fixed_file.write(line)
        prev_line = ''
        continue
    # 判断当前的行书名是否等于上一行的书名, 如果不是要合并

    terms = line.split("\t")
    # book_name = terms[0]
    # comment = terms[-1]
    if line.split("\t")[0] == prev_line.split("\t")[0]:
        if len(prev_line.split("\t")) == 6:
            fixed_file.write(prev_line + '\n')
            prev_line = line.strip()
        else:
            prev_line = ""
    else:
        if len(terms) == 6:
            prev_line = line.strip()
        else:
            prev_line += line.strip()

fixed_file.close()

99665


99665it [00:00, 307280.04it/s]


In [14]:
# 停用词表
stop_words = [word.strip() for word in open("stopwords.txt", 'r', encoding = 'utf-8').readlines()]

In [15]:
def load_tsv_data(file_path):
    book_comments = {} # {book_name: "评论词1+评论词2+..."}
    with open(file_path, 'r') as f:
        reader = csv.DictReader(f, delimiter='\t')
        for item in reader:
            book_name = item['book']
            comment = item['body']
            comment_words = jieba.lcut(comment)
            if book_name == "":
                continue
            book_comments[book_name] = book_comments.get(book_name, list()) # 如果书名不存在,就是返回空列表
            book_comments[book_name].extend(comment_words)
    return book_comments


In [16]:
book_comments = load_tsv_data("doubanbook_top250_comments_fixed.txt")

# TF-IDF

In [17]:
book_list = list(book_comments.keys())
# print(len(book_list))
vectorizer = TfidfVectorizer(stop_words=stop_words)
# fit： 根据所有文档（书籍评论）计算词汇表和相应的 IDF 值。
# transform： 将每个文档转换为对应的 TF-IDF 向量。
# 组合说明： fit_transform 是 fit 和 transform 的结合，即一步完成模型参数的学习（计算 IDF）和数据转换，适合于一次性对训练数据进行处理。
tfidf_matrix = vectorizer.fit_transform([' '.join(book_comments[book_name]) for book_name in book_list]) # 计算tfidf
# print(tfidf_matrix.shape)
# 这会返回一个对称的相似度矩阵，表示每对文档之间的“内容相似程度”
similarities = cosine_similarity(tfidf_matrix)

In [18]:
book_id = book_list.index("天才在左 疯子在右")
recommend_book_ids = np.argsort(-similarities[book_id][:11])[1:] # 取前十名,去掉第一个自己本身, 加负号降序
for i in recommend_book_ids:
    print(f"<{book_list[i]}>\t similarity: {similarities[book_id][i]}")

<1995-2005夏至未至>	 similarity: 0.12349318350286932
<苏菲的世界>	 similarity: 0.12193950370527712
<盗墓笔记>	 similarity: 0.10750471319253747
<霍乱时期的爱情>	 similarity: 0.10006284467308349
<恶意>	 similarity: 0.09818539603453101
<长安乱>	 similarity: 0.07893026228621035
<许三观卖血记>	 similarity: 0.07839213736458249
<悲伤逆流成河>	 similarity: 0.07575044302166914
<Harry Potter and the Deathly Hallows>	 similarity: 0.07465265480840053
<1Q84 BOOK 1>	 similarity: 0.03454391961036781


# bm25

In [19]:
def bm25(comments, k1=5, b=0.75):
    """
    BM25算法计算词权重
    comments：分词后的文本列表（例如 [['我', '喜欢', '阅读'], ['这是', '一个', '示例']]）
    k1, b：BM25公式中的可调参数，通常取 k1=1.2~2.0，b=0.75左右
    """
    # 1. 计算每篇文档的长度
    doc_lengths = []
    for comment in comments:
        doc_lengths.append(len(comment))

    # 2. 统计词频：每篇文档中，各词出现的次数
    doc_term_dict_list = []
    for comment in comments:
        doc_term_dict = {}
        for word in comment:
            doc_term_dict[word] = doc_term_dict.get(word, 0) + 1
        doc_term_dict_list.append(doc_term_dict)

    # 3. 统计所有文档中出现过的单词集合
    unique_words = set()
    for doc_term_dict in doc_term_dict_list:
        unique_words |= set(doc_term_dict.keys())

    # 4. 构建词典，并为每个单词分配索引
    vocabulary = list(unique_words)
    word_idx = {}
    for idx, word in enumerate(vocabulary):
        word_idx[word] = idx

    # 5. 计算每个词在多少篇文档中出现（文档频次）
    doc_freq = {}
    for doc_term_dict in doc_term_dict_list:
        for word in doc_term_dict.keys():
            doc_freq[word] = doc_freq.get(word, 0) + 1

    # 6. 计算所有文档的平均长度
    avg_doc_len = sum(doc_lengths) / len(doc_lengths)

    # 7. 初始化 BM25 矩阵
    bm25_matrix = []
    for _ in range(len(doc_term_dict_list)):
        bm25_matrix.append([0] * len(vocabulary))

    # 8. 计算 BM25 值
    for i, doc_term_dict in enumerate(doc_term_dict_list):
        for word, freq in doc_term_dict.items():
            # IDF
            idf = math.log((len(comments) - doc_freq[word] + 0.5) / (doc_freq[word] + 0.5))
            # TF
            tf = (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * (doc_lengths[i] / avg_doc_len)))
            # BM25
            bm25_matrix[i][word_idx[word]] = idf * tf

    # 9. 补齐成相同长度（如果后续需要以矩阵形式处理）
    max_length = max(len(row) for row in bm25_matrix)
    final_bm25_matrix = []
    for row in bm25_matrix:
        final_bm25_matrix.append(row + [0] * (max_length - len(row)))
    final_bm25_matrix = np.array(final_bm25_matrix)

    return final_bm25_matrix

In [None]:
bm25_matrix = bm25([' '.join(book_comments[book_name]) for book_name in book_list])

similarities_bm = cosine_similarity(bm25_matrix)

In [25]:
book_id = book_list.index("天才在左 疯子在右")
recommend_book_ids = np.argsort(-similarities_bm[book_id][:11])[1:] # 取前十名,去掉第一个自己本身, 加负号降序
for i in recommend_book_ids:
    print(f"<{book_list[i]}>\t similarity: {similarities_bm[book_id][i]}")

<恶意>	 similarity: 0.9678231503468048
<苏菲的世界>	 similarity: 0.9653268008913644
<霍乱时期的爱情>	 similarity: 0.9630073687833041
<1Q84 BOOK 1>	 similarity: 0.9613670534381518
<许三观卖血记>	 similarity: 0.9606248546173871
<盗墓笔记>	 similarity: 0.9572816073170372
<长安乱>	 similarity: 0.9526024034737062
<1995-2005夏至未至>	 similarity: 0.9500788818977659
<Harry Potter and the Deathly Hallows>	 similarity: 0.949401513584823
<悲伤逆流成河>	 similarity: 0.8917140063251158


# fasttext


In [26]:
import fasttext
import jieba
import tensorboard

In [27]:
with open('HLM.txt', 'r', encoding='gb2312', errors='replace') as f:
    lines = f.read()
# 分词处理
with open('sparse.txt', 'w') as f:
    f.write(' '.join(jieba.cut(lines)))
# 仅支持空格分词的文件
model = fasttext.train_unsupervised('sparse.txt', model = 'skipgram')

In [28]:
# 获取词向量的类比
print(model.get_analogies('贾母','宝玉', '宝钗'))

[(0.7684757113456726, '李婶娘'), (0.7592969536781311, '薛姨妈'), (0.7405363321304321, '贾母素'), (0.7380984425544739, '李婶'), (0.7317023873329163, '贾母才'), (0.7302117943763733, '王二夫人'), (0.7285462617874146, '邢'), (0.7238101959228516, '王夫人'), (0.7182996869087219, '彼此'), (0.7164919972419739, '劝慰')]


# 文本分类

In [29]:
model_class = fasttext.train_supervised('cooking.stackexchange.txt', lr=0.01, epoch=10, dim=200)

In [30]:
print(model_class.predict('Why not put knives in the dishwasher ?'))

(('__label__baking',), array([0.03588072]))
