In [1]:
import numpy as np
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import MaximalMarginalRelevance
from hdbscan import HDBSCAN
from bertopic.vectorizers import ClassTfidfTransformer
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence
import pandas as pd
from pymilvus import connections, utility, Collection, FieldSchema, CollectionSchema, DataType
from tqdm import tqdm
from transformers import pipeline           # Hugging Face转换器
import networkx as nx                       # 网络图形可视化
import matplotlib.pyplot as plt
import json


实验说明：本实验使用与原文献相同的超参数。文本表示方面，embedding 时使用 title + abstract 的拼接文本；按当前代码，传入 BERTopic 的文档（`docs`）同样是该拼接文本，而不是仅 abstract。目的是验证 embedding 时使用 title + abstract 组合带来的效果。（注：原说明此处写作“abstract+mesh”，与本 notebook 实际拼接的列不一致，请确认。）

# TC = 0.5355, TD = 0.6667

In [2]:
# Path of the raw CSV file read by load_info(); it is a relative path, so it is
# resolved against the notebook's current working directory.
raw_csv_path ="../raw.csv"

# Read the CSV file and keep only the columns used by this notebook
def load_info(path=None):
    """Load the raw CSV and return the columns used for topic modelling.

    Parameters
    ----------
    path : str or path-like, optional
        CSV file to read. Defaults to the module-level ``raw_csv_path``.

    Returns
    -------
    pandas.DataFrame
        Columns ``abstract``, ``year``, ``pmid``, ``title``, ``mesh_terms``
        and ``text``, one row per CSV row.  ``year`` is ``pub_year`` converted
        with ``int``; values that cannot be converted become ``np.nan``.
        ``pmid``, ``title``, ``mesh_terms`` and ``text`` are filled with
        ``None`` when the CSV has no such column.

    Raises
    ------
    KeyError
        If the CSV has no ``abstract`` column.
    """
    def _to_year(value):
        # Only conversion failures are tolerated (NaN, None, non-numeric text,
        # infinities); anything else should surface instead of being hidden.
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return np.nan

    df = pd.read_csv(raw_csv_path if path is None else path)

    selected = {'abstract': df['abstract']}
    # A missing pub_year column yields an all-NaN year column.
    selected['year'] = df['pub_year'].map(_to_year) if 'pub_year' in df.columns else np.nan
    for name in ('pmid', 'title', 'mesh_terms', 'text'):
        # Optional columns: fall back to None when the CSV does not provide them.
        selected[name] = df[name] if name in df.columns else None

    return pd.DataFrame(
        selected,
        index=df.index,
        columns=['abstract', 'year', 'pmid', 'title', 'mesh_terms', 'text'])

# Load the columns selected by load_info()
data_full = load_info()

# Drop rows without an abstract, then renumber the index: without the reset the
# index keeps gaps, so row labels would not match positions in `docs` or in any
# positional per-document output derived from it.
data_full = data_full.dropna(subset=['abstract']).reset_index(drop=True)

# Abstract texts as a NumPy array
docs = data_full['abstract'].values

# Print basic information about the loaded data
print(f"数据总量: {len(data_full)}")
print(f"年份范围: {data_full['year'].min()} - {data_full['year'].max()}")

数据总量: 3000
年份范围: 2002 - 2025


In [3]:
# Build the document text: title followed by abstract (mesh_terms is not used here).
# A missing title is treated as an empty string; otherwise the concatenation
# would fail or yield NaN and a non-string document would end up in `docs`.
# strip() removes the leading space left behind when the title is empty.
data_full['new'] = (data_full['title'].fillna('') + " " + data_full['abstract']).str.strip()

# Alternative (disabled): also append mesh_terms
# data_full['combined_text'] = data_full['title'] + " " + data_full['abstract'] + " " + data_full['mesh_terms']

# Documents passed to the topic model
docs = data_full['new'].values

docs[:1]

array(['Anxiety and depression in healthcare workers 2 years after COVID-19 infection and scale validation. This study aims to assess the levels of anxiety and depression among healthcare workers two years post COVID-19 infection and to validate the reliability and validity of the PHQ-9 and GAD-7 scales in this population. This cross-sectional study was conducted in June 2024 using a simple random sampling approach to survey healthcare institution workers. A total of 1038 valid samples were collected, and anxiety and depression levels were assessed using the PHQ-9 and GAD-7 scales. Participants included healthcare workers such as doctors, nurses, administrative staff, and students. Data analysis included descriptive statistics, correlation analysis, univariate, and multivariate analyses to explore the effects of variables such as occupation and gender on anxiety and depression. Long COVID was reported in 50.8% of participants. Occupational categories significantly influenced anxiety an

In [4]:
# from transformers import AutoTokenizer

# # 初始化tokenizer (使用BERT基础模型，您也可以根据需要换成其他模型)
# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# # 计算docs中每条数据的token数量
# token_counts = [len(tokenizer.encode(doc)) for doc in docs]

# # 计算统计信息
# total_tokens = sum(token_counts)
# avg_tokens = total_tokens / len(docs)
# max_tokens = max(token_counts)
# min_tokens = min(token_counts)

# # 打印结果
# print(f"总token数: {total_tokens}")
# print(f"平均每条数据token数: {avg_tokens:.2f}")
# print(f"最大token数: {max_tokens}")
# print(f"最小token数: {min_tokens}")
# print(f"超过512 token的数据比例: {sum(1 for x in token_counts if x > 512) / len(token_counts):.2%}")
# print(f"超过1024 token的数据比例: {sum(1 for x in token_counts if x > 1024) / len(token_counts):.2%}")

# # 如果您想查看更详细的分布，取消下面的注释
# plt.figure(figsize=(10, 6))
# plt.hist(token_counts, bins=50)
# plt.title('Token数量分布')
# plt.xlabel('Token数量')
# plt.ylabel('文档数量')
# plt.show()

In [5]:
# Step 1 - sentence encoder used to embed the documents
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
# Alternative checkpoints (disabled):
#   'microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext'
#   'dmis-lab/biobert-v1.1'
#   'NeuML/pubmedbert-base-embeddings'
#   'pritamdeka/S-PubMedBert-MS-MARCO'
#   'marcmendez-aily/BertMeshTerms'

# Precomputed embeddings (disabled):
# embeddings = np.load('data/emb-covid-19-pubmedbert-base-embeddings.npy')
# print(type(embeddings), embeddings.shape)

# Step 2 - dimensionality reduction; random_state is fixed for this step
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=34,
)

# Step 3 - clustering of the reduced embeddings
hdbscan_model = HDBSCAN(
    min_cluster_size=30,
    min_samples=10,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# Step 4 - tokenizer for the topic representation (English stop words removed)
vectorizer_model = CountVectorizer(stop_words="english")

# Step 5 - class-based TF-IDF used to extract topic words
ctfidf_model = ClassTfidfTransformer()

# Step 6 - re-rank topic words with Maximal Marginal Relevance
representation_model = MaximalMarginalRelevance(diversity=0.2)

# Assemble the full pipeline from the components above
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
    calculate_probabilities=True,
    verbose=True,
)

In [6]:
# Fit the topic model on `docs` and keep the two values it returns
topics, probs = topic_model.fit_transform(docs)

# Alternative (disabled): fit with precomputed embeddings
# topic_model.fit_transform(docs, embeddings=embeddings)

# Per-topic summary table, displayed as the cell output
topic_info = topic_model.get_topic_info()
topic_info



Batches:   0%|          | 0/94 [00:00<?, ?it/s]

2025-05-28 21:23:24,897 - BERTopic - Transformed documents to Embeddings
2025-05-28 21:23:41,278 - BERTopic - Reduced dimensionality
2025-05-28 21:23:41,473 - BERTopic - Clustered reduced embeddings


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,668,-1_students_study_anxiety_student,"[students, study, anxiety, student, depression...",[The effect of emotional freedom technique on ...
1,0,311,0_burnout_residents_medical_training,"[burnout, residents, medical, training, exhaus...",[Prevalence of burnout and its correlates amon...
2,1,309,1_suicide_depression_students_suicidal,"[suicide, depression, students, suicidal, depr...",[Cross-sectional survey of depressive symptoms...
3,2,304,2_anxiety_depression_students_depressive,"[anxiety, depression, students, depressive, st...",[Prevalence and factors associated with depres...
4,3,226,3_covid_pandemic_health_students,"[covid, pandemic, health, students, anxiety, d...",[A longitudinal investigation of COVID-19 pand...
5,4,155,4_patients_pain_cancer_depression,"[patients, pain, cancer, depression, dementia,...",[Pre-consultation biopsychosocial data from pa...
6,5,146,5_alcohol_students_motives_consequences,"[alcohol, students, motives, consequences, smo...",[Depressive Symptoms and Drinking Outcomes: Th...
7,6,121,6_intervention_anxiety_depression_therapy,"[intervention, anxiety, depression, therapy, a...",[A Mobile Health Intervention for Mental Healt...
8,7,116,7_nursing_students_clinical_simulation,"[nursing, students, clinical, simulation, skil...",[Nursing students' experiences with high-fidel...
9,8,88,8_mindfulness_veterinary_students_meditation,"[mindfulness, veterinary, students, meditation...",[An online mindfulness-based intervention for ...


In [7]:
# Per-document table for `docs`; the evaluation cell reads its 'Topic' and
# 'Document' columns.
documents = topic_model.get_document_info(docs) 

In [8]:
# ====================================
# Module 5: model evaluation
# Computes topic coherence (TC) and topic diversity (TD)
# ====================================
# Merge all documents of each topic into one long document
documents_per_topic = documents.groupby(['Topic'], as_index = False).agg({'Document': ' '.join})
# Apply the topic model's own text preprocessing
cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)
# Tokenize with the same analyzer the vectorizer uses
analyzer = topic_model.vectorizer_model.build_analyzer()
tokens = [analyzer(doc) for doc in cleaned_docs]
# Evaluate every real topic and skip the outlier topic (-1) explicitly.
# Counting `len(set(topics)) - 1` assumed that -1 is always present; when no
# document is an outlier that dropped the last real topic from the metrics.
topic_ids = sorted(topic_id for topic_id in set(topics) if topic_id != -1)
# Top-10 words per topic (get_topic yields (word, score) pairs)
bertopic_topics = [
    [word for word, _ in topic_model.get_topic(topic_id)[:10]]
    for topic_id in topic_ids]
# Topic coherence (c_v)
TC = Coherence(texts = tokens, topk = 10, measure = 'c_v').score({'topics': bertopic_topics})
# Topic diversity
TD = TopicDiversity().score({'topics': bertopic_topics})
# Report both metrics with 4 decimal places
print(f'TC = {TC:.4f}, TD = {TD:.4f}')


TC = 0.5355, TD = 0.6667


In [9]:
# import dashscope
# from dashscope import Generation
# import pandas as pd
# from tqdm import tqdm

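# # Tongyi Qianwen (DashScope) API key: read it from the environment, never
# # hard-code it in the notebook. The key that used to be written here in
# # plaintext must be treated as leaked: revoke it and issue a new one.
# import os
# dashscope.api_key = os.environ['DASHSCOPE_API_KEY']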

# # 定义提示模板
# def build_prompt(keywords):
#     prompt = (
#         "你是一名专业的学术主题归纳专家。"
#         "请根据下方给定的一组英文关键词，生成一个简洁、准确、便于人类理解的自然语言主题标题。"
#         "要求：\n"
#         "1. 标题应为一句话，简明扼要，突出主题核心。\n"
#         "2. 不要逐词翻译或简单罗列关键词，要根据关键词的内在联系进行归纳总结。\n"
#         "3. 标题用英文输出。\n"
#         "4. 不要包含引号、编号或多余修饰。\n"
#         "5. 适合用于学术论文、报告或可视化展示。\n"
#         f"关键词：{', '.join(keywords)}\n"
#         "请生成主题标题："
#     )
#     return prompt

# # 调用 Qwen 接口生成主题名
# def generate_topic_title(keywords):
#     prompt = build_prompt(keywords)
#     try:
#         response = Generation.call(
#             model='qwen2.5-14b-instruct-1m',
#             prompt=prompt,
#             temperature=0.3,
#             top_p=0.95,
#             max_tokens=100
#         )
#         return response['output']['text'].strip()
#     except Exception as e:
#         print("Error:", e)
#         return "生成失败"

# # 假设你已经有一个名为 topic_info 的 DataFrame，其中包含 'Topic' 和 'Representation' 列
# # 例如：
# # topic_info = pd.DataFrame({
# #     'Topic': [0, 1],
# #     'Representation': ["['癌症', '治疗', '化疗']", "['糖尿病', '胰岛素', '血糖']"]
# # })

# # 处理 topic_info 表格
# topic_info_with_titles = topic_info.copy()
# topic_info_with_titles['Generated_Topic_Title'] = None

# # 只处理实际的主题编号（排除 -1 噪声类）
# valid_rows = topic_info_with_titles['Topic'] >= 0

# # 遍历生成标题
# for idx in tqdm(topic_info_with_titles[valid_rows].index):
#     keywords = topic_info_with_titles.at[idx, 'Representation']
#     # 兼容字符串和list
#     if isinstance(keywords, str):
#         try:
#             keywords_list = eval(keywords)
#         except Exception as e:
#             print(f"eval失败，原始内容：{keywords}，错误：{e}")
#             topic_info_with_titles.at[idx, 'Generated_Topic_Title'] = "格式错误"
#             continue
#     elif isinstance(keywords, list):
#         keywords_list = keywords
#     else:
#         print(f"未知类型，原始内容：{keywords}")
#         topic_info_with_titles.at[idx, 'Generated_Topic_Title'] = "格式错误"
#         continue

#     # 过滤空关键词
#     if not keywords_list or not isinstance(keywords_list, list):
#         topic_info_with_titles.at[idx, 'Generated_Topic_Title'] = "无关键词"
#         continue

#     # 调用API
#     try:
#         topic_info_with_titles.at[idx, 'Generated_Topic_Title'] = generate_topic_title(", ".join(keywords_list))
#     except Exception as e:
#         print(f"API调用失败，关键词：{keywords_list}，错误：{e}")
#         topic_info_with_titles.at[idx, 'Generated_Topic_Title'] = "生成失败"

# # 显示或保存结果
# # print(topic_info_with_titles.head())

# # # 保存为 CSV 文件
# # output_path = "topics_with_titles.csv"
# # topic_info_with_titles.to_csv(output_path, index=False, encoding='utf-8-sig')

# # print(f"结果已保存到：{output_path}")

# for i, title in enumerate(topic_info_with_titles.get('Generated_Topic_Title').to_list()):
#     print(f"{i}: {title}")


In [10]:
# for i, title in enumerate(topic_info_with_titles.get('Generated_Topic_Title').to_list()):
#     print(f"{i}: {title}")

In [11]:
# # ====================================
# # 模块6: 主题探索和分析
# # 对特定主题进行详细分析
# # ====================================
# # 选择一个感兴趣的主题进行分析
# MY_TOPIC = 5
# # 获取该主题的十个关键词
# topic_model.get_topic(MY_TOPIC) 
# # 获取分配给该主题的所有文档
# assigned_docs = documents[documents.Topic == MY_TOPIC] 
# # 打印该主题下所有文档的摘要
# for abstract in assigned_docs.Document:
#     print(abstract)
#     print('---')

# # 获取该主题的代表性文档
# representative_docs = topic_model.get_representative_docs(MY_TOPIC)
# # 获取代表性文档的引用信息
# data_full[data_full.abstract == representative_docs[1]].iloc[0].citation
    
# # 使用零样本分类器为主题分配标签
# classifier = pipeline("zero-shot-classification", model = "facebook/bart-large-mnli")
# # 将主题关键词组合成一个序列
# sequence_to_classify =  " ".join([word for word, _ in topic_model.get_topic(MY_TOPIC)])
# # 定义候选标签
# candidate_labels = ["Psychometrics of depression"]
# # 执行分类
# classifier(sequence_to_classify, candidate_labels)

In [12]:
# # ====================================
# # 模块7: 结果可视化
# # 使用多种方法可视化主题建模结果
# # ====================================
# # 1. 条形图 - 显示前六个主题的关键词
# barchart = topic_model.visualize_barchart(top_n_topics = 6, n_words = 5, width = 400, title = "")
# barchart.show()

# # 2. 主题间关系图 - 展示主题之间的距离和关系
# topic_model.visualize_topics().show()

# # 3. 热力图 - 显示主题之间的相似度矩阵
# topic_model.visualize_heatmap(n_clusters = 7)

# # 4. 层次聚类 - 展示主题的层次结构
# hierarchical_topics = topic_model.hierarchical_topics(docs)
# topic_model.visualize_hierarchy(hierarchical_topics = hierarchical_topics)

# # 5. 文档嵌入可视化 - 展示文档在空间中的分布
# embeddings = embedding_model.encode(docs, show_progress_bar = False)
# topic_model.visualize_documents(docs, embeddings = embeddings)   
# topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, embeddings = embeddings) 
 
# # 6. 时间序列分析 - 展示主题随时间变化的趋势
# time = data_full.year.to_list() 
# topics_over_time = topic_model.topics_over_time(docs, time)
# topic_model.visualize_topics_over_time(topics_over_time, topics = [0, 4, 5, 20])

# # 7. 主题相关性搜索 - 查找与特定关键词相关的主题
# # 函数：绘制关键词与主题的相似度网络图
# def draw_simil(keyword):
#     # 找出与关键词最相关的3个主题及其相似度
#     topicsF, similarity = topic_model.find_topics(keyword, top_n = 3)
#     # 构建网络图的节点
#     G = nx.DiGraph()
#     G.add_node(-1)
#     G.add_node(0)
#     G.add_node(1)
#     G.add_node(2)
#     node_labels = dict(enumerate(topicsF))
#     node_labels[-1] = keyword
#     scale = (len(keyword) + 12) * 0.032
#     x_offset = 0.25 * scale
#     y_offset = -.5
#     # 设置节点位置
#     pos = {
#         -1: (0, 0),
#         0: (-x_offset * scale, y_offset),
#         1: (0, y_offset),
#         2: (x_offset * scale, y_offset)
#     }
#     # 绘制节点
#     nx.draw(
#         G, pos = pos, labels = node_labels, with_labels = True, node_shape = "s", 
#         bbox = dict(facecolor = "lightgreen", edgecolor = 'black', boxstyle = 'round,pad=0.5'))
    
#     # 构建边的图形(用于改变边的位置)
#     H = nx.DiGraph()
#     for i, sim_val in enumerate(similarity):
#         H.add_edge(i + 3, i, label = round(sim_val, 2)) # 3->0, 4->1, 5->2
#     y_offset = -.48
#     # 设置边位置
#     pos = {
#         0: (-x_offset * scale, y_offset),
#         1: (0, y_offset),
#         2: (x_offset * scale, y_offset),
#         3: (-x_offset * scale, 0),
#         4: (0, 0),
#         5: (x_offset * scale, 0)
#     }
#     edge_labels = nx.get_edge_attributes(H, "label")
#     # 绘制边和标签
#     nx.draw_networkx(H, labels = {}, node_color = "white", arrows = True, pos = pos)
#     nx.draw_networkx_edge_labels(H, pos, edge_labels)
#     # 设置图形边距
#     plt.xlim(-1, 1)
#     plt.ylim(-1, 1)
#     ax = plt.gca()
#     ax.margins(0.20)
#     plt.axis("off")
#     plt.show()

# # 分析两个关键词与主题的相关性
# draw_simil('internet addiction')
# draw_simil('burnout measures')
