# 导入

In [None]:
import ast
import json
import re
import os
import numpy as np
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer

from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer

# 加载文件

In [None]:

import pandas as pd

# Every relative path used from here on is resolved inside the project folder.
os.chdir("Fin-Topicmodel")
csv_path = "dataset_22-24_report_title.csv"
# csv_path='report_titles_dataset/dataset_22-24_report_title.csv'
# "utf-8-sig" tolerates a byte-order mark at the start of the CSV.
df_loaded = pd.read_csv(csv_path, encoding="utf-8-sig")
# The documents to model are taken from the "metadata" column.
titles = df_loaded["metadata"].tolist()
# Force every date to its string form, whatever dtype pandas inferred.
date_strs = [str(raw_date) for raw_date in df_loaded["date_str"].tolist()]

In [None]:
# Hugging Face identifier of the Fin-Retriever checkpoint used for embeddings.
embedding_model_path = "valuesimplex-ai-lab/Fin-Retriever-base"
# Tokenizer and sentence encoder are both loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(embedding_model_path)
embedding_model = SentenceTransformer(embedding_model_path)
# Encode all titles once up front; the result is reused by later cells.
titles_embeddings = embedding_model.encode(titles)

# 创建模型

In [None]:
# 2. Dimensionality reduction (UMAP)
umap_model = UMAP(
    metric="cosine",
    n_neighbors=15,
    n_components=32,  # target dimensionality
    min_dist=0.0,
    # Fixed seed so repeated runs give the same result, see
    # https://maartengr.github.io/BERTopic/faq.html
    random_state=22,
)

# 3. Clustering (HDBSCAN)
# To get fewer outliers, lower min_cluster_size and/or min_samples, see
# https://hdbscan.readthedocs.io/en/latest/faq.html
hdbscan_model = HDBSCAN(
    metric="euclidean",
    min_cluster_size=2,
    min_samples=1,  # when unset it defaults to min_cluster_size
)

from merge_tokenizer import *

def load_stopwords(file_path):
    """Read a stop-word list from a text file holding one word per line.

    Args:
        file_path: Path to the stop-word file (UTF-8, with or without a BOM).

    Returns:
        list: The stop words in file order, each stripped of surrounding
        whitespace. Lines that are empty after stripping are skipped.
    """
    # "utf-8-sig" drops a leading byte-order mark; with plain "utf-8" the BOM
    # stays glued to the first word (str.strip() does not remove it), so that
    # word would never match a token.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        stripped_lines = (line.strip() for line in file)
        # Blank lines would otherwise end up as '' entries in the list.
        return [word for word in stripped_lines if word]
# Load the Chinese stop-word list.
stop_words = load_stopwords("stopwords_cn.txt")

# Bag-of-words step; the commented line is the chinese_tokenizer alternative.
# vectorizer = CountVectorizer(tokenizer=chinese_tokenizer,stop_words=stop_words)
vectorizer = CountVectorizer(stop_words=stop_words, tokenizer=merge_tokenizers)

# 6. Assemble the BERTopic model from the parts built above.
topic_model = BERTopic(
    hdbscan_model=hdbscan_model,
    umap_model=umap_model,
    vectorizer_model=vectorizer,
    embedding_model=embedding_model,
    # min_topic_size=2,
    verbose=True,
)

In [None]:
# Sanity check: display the shape of the precomputed title embeddings.
titles_embeddings.shape

In [None]:
# Print what chinese_tokenizer produces for two sample strings.
for sample in ("绿水青山就是金山银山", "金山办公"):
    print(chinese_tokenizer(sample))


In [None]:
# Print what merge_tokenizers produces for the same two sample strings.
for sample in ("绿水青山就是金山银山", "金山办公"):
    print(merge_tokenizers(sample))


# 训练模型

In [None]:
# Fit the topic model, passing in the embeddings that were computed earlier.
# topics, probs = topic_model.fit_transform(titles)
topics, probs = topic_model.fit_transform(
    titles,
    embeddings=np.array(titles_embeddings),
)
# Common prefix of every file written by the cells below.
# NOTE(review): the prefix says "large_embedder" while the checkpoint loaded in
# this notebook is "...-base" — confirm the name is intended.
outfilename = "berttopic_titles_report_new_finretriever_large_embedder_merge_tokenizer"

# Save the raw result: per-topic summary and per-document assignment.
topic_info = topic_model.get_topic_info()
topic_info.to_csv(f"{outfilename}_topic_info_raw.csv", index=False)
topic_docs = topic_model.get_document_info(titles)
topic_docs.to_csv(f"{outfilename}_topic_docs_raw.csv")

import copy

# Work on a deep copy so `topic_model` keeps the raw topics.
topic_model2 = copy.deepcopy(topic_model)
# Reduce outliers (documents labelled -1). The default threshold is 0; the
# larger it is, the more documents stay unassigned at -1.
# The official docs describe four strategies:
# https://maartengr.github.io/BERTopic/getting_started/outlier_reduction/outlier_reduction.html#exploration
new_topics = topic_model2.reduce_outliers(titles, topics, threshold=0.4)

# Update the copied model with the new assignment, then save the same two
# tables for it.
topic_model2.update_topics(titles, topics=new_topics, vectorizer_model=vectorizer)
topic_info_new = topic_model2.get_topic_info()
topic_info_new.to_csv(f"{outfilename}_topic_info_reduced_outliers.csv", index=False)
topic_docs = topic_model2.get_document_info(titles)
topic_docs.to_csv(f"{outfilename}_topic_docs_reduced_outliers.csv")


In [None]:
# # 查看文档的主题分布
# print(topic_model.get_topics())
# print("文档主题分布：")
# print(probs)

# 保存聚类质量评估结果

In [None]:
from evaluate_cluster_quality import *
# Score the raw clustering on the UMAP-reduced embeddings.
# `umap_model` is the same instance that was handed to BERTopic, so once the
# topic model has been fitted its `embedding_` attribute already holds the
# reduced training vectors. Reusing it avoids a second, expensive UMAP fit.
# Fall back to fitting only when the reducer has not been fitted yet.
reduced_embeddings = getattr(umap_model, "embedding_", None)
if reduced_embeddings is None:
    reduced_embeddings = umap_model.fit_transform(titles_embeddings)
average_silhouette, ch_score, db_score, td_value = evaluate_clustering(
    outfilename + '_topic_docs_raw.csv', reduced_embeddings
)
# To score the outlier-reduced assignment instead, use the other CSV:
# average_silhouette, ch_score,db_score ,td_value = evaluate_clustering(outfilename+'_topic_docs_reduced_outliers.csv', reduced_embeddings)

# 可视化

In [None]:
# Display the bar-chart visualisation of `topic_model` (the model before
# outlier reduction), with the figure title "bar".
topic_model.visualize_barchart(title="bar")

# 层次聚类

In [None]:
# Build the topic hierarchy from the outlier-reduced model and save both the
# hierarchy table (CSV) and its text rendering.
hierarchical_topics = topic_model2.hierarchical_topics(titles)
# Render the tree through the same model that produced the hierarchy, rather
# than mixing `topic_model` and `topic_model2`.
tree = topic_model2.get_topic_tree(hierarchical_topics)

hierarchical_topics_path = outfilename + "_hierarchical_topics.csv"  # hierarchy table
tree_path = outfilename + "_hierarchical_tree.txt"  # text rendering of the tree
hierarchical_topics.to_csv(hierarchical_topics_path)
# Save the tree as a UTF-8 text file.
with open(tree_path, 'w', encoding='utf-8') as tree_file:
    tree_file.write(tree)
