In [1]:
import pandas as pd
from pathlib import Path

csv_path = Path('step3_d3_g005_e005_a01/depth_3_gamma_0.05_eta_0.05_alpha_0.1_run_1/iteration_document_paths.csv')
df = pd.read_csv(csv_path)

# 校验必要列
required = {'iteration', 'leaf_node_id', 'document_id'}
missing = required - set(df.columns)
if missing:
    raise ValueError(f'缺少必要列: {missing}')

# 按 iteration 倒序取最后5轮
df['iteration'] = pd.to_numeric(df['iteration'], errors='coerce')
df = df.dropna(subset=['iteration']).sort_values('iteration', ascending=False)

# 获取最后5轮的迭代次数
last5_iterations = df['iteration'].unique()[:5]
print(f"最后5轮迭代次数: {last5_iterations}")

# 筛选最后5轮的数据
last5_data = df[df['iteration'].isin(last5_iterations)]
print(f"最后5轮总数据条数: {len(last5_data)}")

# 检查每个文档在最后5轮中的路径选择
convergent_docs = []
non_convergent_docs = []

for doc_id in last5_data['document_id'].unique():
    doc_data = last5_data[last5_data['document_id'] == doc_id]
    unique_paths = doc_data['leaf_node_id'].nunique()
    
    if unique_paths == 1:
        convergent_docs.append(doc_id)
        path = doc_data['leaf_node_id'].iloc[0]
        # print(f"文档 {doc_id}: 收敛到路径 {path}")
    else:
        non_convergent_docs.append(doc_id)
        # print(f"文档 {doc_id}: 未收敛，选择了 {unique_paths} 个不同路径: {doc_data['leaf_node_id'].unique()}")

print(f"\n收敛结果:")
print(f"收敛文档数量: {len(convergent_docs)}")
print(f"未收敛文档数量: {len(non_convergent_docs)}")
print(f"总文档数量: {len(last5_data['document_id'].unique())}")

if len(convergent_docs) > 0:
    print(f"收敛的文档ID: {convergent_docs}")
if len(non_convergent_docs) > 0:
    print(f"未收敛的文档ID: {non_convergent_docs}")

最后5轮迭代次数: [90 89 88 87 86]
最后5轮总数据条数: 4850

收敛结果:
收敛文档数量: 589
未收敛文档数量: 381
总文档数量: 970
收敛的文档ID: [969, 303, 330, 328, 327, 326, 323, 320, 319, 315, 313, 310, 308, 307, 306, 305, 333, 334, 349, 361, 360, 358, 354, 352, 351, 350, 348, 347, 346, 342, 341, 340, 339, 337, 304, 241, 301, 269, 267, 264, 262, 258, 256, 254, 252, 250, 248, 246, 245, 272, 288, 300, 295, 294, 292, 291, 290, 289, 287, 274, 286, 282, 281, 278, 276, 275, 362, 363, 364, 365, 445, 444, 443, 442, 441, 440, 439, 437, 436, 433, 432, 430, 429, 427, 453, 454, 455, 470, 479, 477, 471, 469, 456, 467, 465, 464, 463, 462, 461, 458, 457, 425, 424, 423, 390, 389, 388, 387, 385, 383, 382, 381, 380, 377, 376, 375, 374, 373, 372, 371, 370, 369, 367, 366, 394, 422, 409, 421, 420, 418, 417, 416, 414, 413, 412, 411, 395, 407, 406, 404, 403, 402, 401, 399, 397, 242, 240, 484, 87, 86, 84, 83, 82, 78, 77, 71, 70, 67, 66, 65, 64, 63, 62, 61, 88, 89, 90, 105, 116, 113, 112, 111, 107, 106, 104, 101, 98, 97, 95, 92, 60, 239, 57, 26, 24, 23, 21

In [2]:
import pandas as pd
from pathlib import Path
import ast

# 读取路径结构数据
structure_csv_path = Path('step3_d3_g005_e005_a01/depth_3_gamma_0.05_eta_0.05_alpha_0.1_run_1/iteration_path_structures.csv')
structure_df = pd.read_csv(structure_csv_path)

# 取最后一轮数据
structure_df['iteration'] = pd.to_numeric(structure_df['iteration'], errors='coerce')
last_iteration = structure_df['iteration'].max()
last_round_data = structure_df[structure_df['iteration'] == last_iteration].copy()

print(f"最后一轮迭代: {last_iteration}")
print(f"最后一轮路径结构数据量: {len(last_round_data)}")

# 解析documents_in_path列
def parse_document_list(doc_str):
    try:
        if pd.isna(doc_str):
            return []
        if isinstance(doc_str, str):
            return ast.literal_eval(doc_str)
        return doc_str
    except:
        return []

last_round_data['documents_in_path'] = last_round_data['documents_in_path'].apply(parse_document_list)

# 转换收敛和非收敛文档ID为集合
convergent_set = set(convergent_docs)
non_convergent_set = set(non_convergent_docs)

print(f"收敛文档数量: {len(convergent_set)}")
print(f"非收敛文档数量: {len(non_convergent_set)}")

# 分析每个节点的稳定和不稳定文档
def analyze_node_stability(row):
    docs_in_node = set(row['documents_in_path'])
    stable_docs = docs_in_node.intersection(convergent_set)
    unstable_docs = docs_in_node.intersection(non_convergent_set)
    
    return {
        'stable_count': len(stable_docs),
        'unstable_count': len(unstable_docs),
        'total_docs': len(docs_in_node),
        'stable_docs': list(stable_docs),
        'unstable_docs': list(unstable_docs)
    }

# 应用分析函数
analysis_results = last_round_data.apply(analyze_node_stability, axis=1, result_type='expand')
result_df = pd.concat([last_round_data[['layer_0_node_id', 'layer_1_node_id', 'layer_2_node_id', 'leaf_node_id']], 
                       analysis_results], axis=1)

# 汇聚到父节点 - Layer 0 (根节点)
print("\n=== Layer 0 (根节点) 统计 ===")
layer0_stats = result_df.groupby('layer_0_node_id').agg({
    'stable_count': 'sum',
    'unstable_count': 'sum',
    'total_docs': 'sum'
}).reset_index()
layer0_stats['stability_ratio'] = layer0_stats['stable_count'] / (layer0_stats['stable_count'] + layer0_stats['unstable_count'])
print(layer0_stats)

# 汇聚到父节点 - Layer 1
print("\n=== Layer 1 节点统计 ===")
layer1_stats = result_df.groupby(['layer_0_node_id', 'layer_1_node_id']).agg({
    'stable_count': 'sum',
    'unstable_count': 'sum',
    'total_docs': 'sum'
}).reset_index()
layer1_stats['stability_ratio'] = layer1_stats['stable_count'] / (layer1_stats['stable_count'] + layer1_stats['unstable_count'])
print(layer1_stats)

# 叶子节点统计
print("\n=== 叶子节点 (Layer 2) 统计 ===")
leaf_stats = result_df.groupby(['layer_0_node_id', 'layer_1_node_id', 'layer_2_node_id']).agg({
    'stable_count': 'sum',
    'unstable_count': 'sum',
    'total_docs': 'sum'
}).reset_index()
leaf_stats['stability_ratio'] = leaf_stats['stable_count'] / (leaf_stats['stable_count'] + leaf_stats['unstable_count'])
print(leaf_stats.head(10))

# 总体统计
total_stable = result_df['stable_count'].sum()
total_unstable = result_df['unstable_count'].sum()
print(f"\n=== 总体统计 ===")
print(f"总稳定文档数: {total_stable}")
print(f"总不稳定文档数: {total_unstable}")
print(f"总体稳定率: {total_stable/(total_stable+total_unstable):.2%}")

最后一轮迭代: 90
最后一轮路径结构数据量: 252
收敛文档数量: 589
非收敛文档数量: 381

=== Layer 0 (根节点) 统计 ===
   layer_0_node_id  stable_count  unstable_count  total_docs  stability_ratio
0                0           589             381         970         0.607216

=== Layer 1 节点统计 ===
    layer_0_node_id  layer_1_node_id  stable_count  unstable_count  \
0                 0                1           231              65   
1                 0                6             6               1   
2                 0               36             9               3   
3                 0               53            11               3   
4                 0               81            20              17   
..              ...              ...           ...             ...   
57                0            18543             2               3   
58                0            18586             0               5   
59                0            18709             5               0   
60                0            18926       

In [3]:
# import pandas as pd
# from pathlib import Path

# # --- Part 1: 识别持续存在的节点 (基于树结构) ---

# # 1. 使用已读取的路径结构数据 `structure_df`
# # 筛选出最后5轮的结构数据
# last5_structure_df = structure_df[structure_df['iteration'].isin(last5_iterations)]

# # 2. 找出每一轮中存在的所有节点ID
# node_sets_per_iteration = []
# for it in last5_iterations:
#     it_df = last5_structure_df[last5_structure_df['iteration'] == it]
#     # 从所有层级收集唯一的节点ID
#     l0_nodes = set(it_df['layer_0_node_id'].unique())
#     l1_nodes = set(it_df['layer_1_node_id'].unique())
#     l2_nodes = set(it_df['layer_2_node_id'].unique())
#     all_nodes_in_iter = l0_nodes.union(l1_nodes).union(l2_nodes)
#     node_sets_per_iteration.append(all_nodes_in_iter)

# # 3. 计算在所有5轮中都存在的节点的交集
# if node_sets_per_iteration:
#     persistent_node_ids = set.intersection(*node_sets_per_iteration)
# else:
#     persistent_node_ids = set()

# print(f"在最后{len(last5_iterations)}轮的树结构中持续存在的节点数量: {len(persistent_node_ids)}")


# # --- Part 2: 为持续存在的节点计算词汇稳定性 ---

# # 1. 读取词汇分布数据
# word_dist_path = Path('step3_d3_g005_e005_a01/depth_3_gamma_0.05_eta_0.05_alpha_0.1_run_1/iteration_node_word_distributions.csv')
# word_df = pd.read_csv(word_dist_path)

# # 2. 数据预处理
# word_df['iteration'] = pd.to_numeric(word_df['iteration'], errors='coerce')
# word_df['count'] = pd.to_numeric(word_df['count'], errors='coerce')
# word_df.dropna(subset=['iteration', 'count', 'node_id', 'word'], inplace=True)
# last5_words_df = word_df[word_df['iteration'].isin(last5_iterations)]

# # 3. 获取Top 10词汇并重塑数据
# top10_words_df = last5_words_df.sort_values('count', ascending=False).groupby(['iteration', 'node_id']).head(10)
# top_words_sets = top10_words_df.groupby(['iteration', 'node_id'])['word'].apply(set).reset_index()
# pivoted_sets = top_words_sets.pivot(index='node_id', columns='iteration', values='word')

# # 4. 仅筛选出持续存在的节点进行分析
# persistent_nodes_pivoted = pivoted_sets[pivoted_sets.index.isin(persistent_node_ids)].dropna()
# print(f"在持续存在的节点中，拥有完整Top-10词汇历史的节点数量: {len(persistent_nodes_pivoted)}")

# # 5. 定义函数并计算重叠率
# def calculate_overlap_rate(row):
#     intersection_set = set.intersection(*row)
#     return len(intersection_set) / 10.0

# node_stability = pd.DataFrame(index=persistent_nodes_pivoted.index)
# node_stability['word_overlap_rate'] = persistent_nodes_pivoted.apply(calculate_overlap_rate, axis=1)
# node_stability.reset_index(inplace=True)
# print("持续存在节点的Top-10词汇稳定性计算完成。")


# # --- Part 3: 合并所有指标并保存 ---

# # 1. 合并所有层级的文档统计
# layer0_combined = layer0_stats.copy(); layer0_combined['layer'] = 'Layer_0'
# layer1_combined = layer1_stats.copy(); layer1_combined['layer'] = 'Layer_1'
# layer2_combined = leaf_stats.copy(); layer2_combined['layer'] = 'Layer_2'
# all_layers_stats = pd.concat([layer0_combined, layer1_combined, layer2_combined], ignore_index=True)

# # 2. 添加统一的 node_id 列
# all_layers_stats['node_id'] = all_layers_stats['layer_2_node_id'].fillna(
#                                  all_layers_stats['layer_1_node_id']).fillna(
#                                  all_layers_stats['layer_0_node_id'])

# # 3. 将词汇重叠率合并到主表中
# all_layers_combined_stats = pd.merge(
#     all_layers_stats,
#     node_stability[['node_id', 'word_overlap_rate']],
#     on='node_id',
#     how='left'
# )

# # 4. *** 新增：添加 is_persistent 标志列 ***
# all_layers_combined_stats['is_persistent'] = all_layers_combined_stats['node_id'].isin(persistent_node_ids)

# # 5. 整理最终列顺序
# final_column_order = [
#     'layer', 'node_id', 'is_persistent', 'layer_0_node_id', 'layer_1_node_id', 'layer_2_node_id', 
#     'stable_count', 'unstable_count', 'total_docs', 'stability_ratio', 'word_overlap_rate'
# ]
# all_layers_combined_stats = all_layers_combined_stats[final_column_order]

# # 6. 保存所有结果
# output_dir = Path('convergence_analysis_results')
# output_dir.mkdir(exist_ok=True)

# output_file_path = output_dir / 'all_layers_stability_stats.csv'
# all_layers_combined_stats.to_csv(output_file_path, index=False, encoding='utf-8')
# print(f"\n所有层级的组合统计已保存到: {output_file_path}")

# summary_stats = pd.DataFrame({
#     'metric': ['总稳定文档数', '总不稳定文档数', '总文档数', '总体稳定率'],
#     'value': [total_stable, total_unstable, total_stable + total_unstable, 
#               f"{total_stable/(total_stable+total_unstable):.2%}"]
# })
# summary_stats.to_csv(output_dir / 'overall_summary.csv', index=False, encoding='utf-8')
# print(f"总体统计摘要已保存到: {output_dir / 'overall_summary.csv'}")

# convergence_docs_df = pd.DataFrame({
#     'document_id': convergent_docs + non_convergent_docs,
#     'convergence_status': ['convergent'] * len(convergent_docs) + ['non_convergent'] * len(non_convergent_docs)
# })
# convergence_docs_df.to_csv(output_dir / 'document_convergence_status.csv', index=False, encoding='utf-8')
# print(f"文档收敛状态已保存到: {output_dir / 'document_convergence_status.csv'}")

# print(f"\n分析完成，已生成3个核心CSV文件。")
# print(f"1. {output_dir / 'all_layers_stability_stats.csv'}")
# print(f"2. {output_dir / 'overall_summary.csv'}")
# print(f"3. {output_dir / 'document_convergence_status.csv'}")

# # 显示合并后表格的预览
# print("\n=== 最终合并表格预览 ===")
# print(all_layers_combined_stats.head())

In [4]:
import pandas as pd
from pathlib import Path

# --- Part 1: 识别持续存在的节点 (基于树结构) ---

# 1. 使用已读取的路径结构数据 `structure_df`
# 筛选出最后5轮的结构数据
last5_structure_df = structure_df[structure_df['iteration'].isin(last5_iterations)]

# 2. 找出每一轮中存在的所有节点ID
node_sets_per_iteration = []
for it in last5_iterations:
    it_df = last5_structure_df[last5_structure_df['iteration'] == it]
    # 从所有层级收集唯一的节点ID
    l0_nodes = set(it_df['layer_0_node_id'].unique())
    l1_nodes = set(it_df['layer_1_node_id'].unique())
    l2_nodes = set(it_df['layer_2_node_id'].unique())
    all_nodes_in_iter = l0_nodes.union(l1_nodes).union(l2_nodes)
    node_sets_per_iteration.append(all_nodes_in_iter)

# 3. 计算在所有5轮中都存在的节点的交集
if node_sets_per_iteration:
    persistent_node_ids = set.intersection(*node_sets_per_iteration)
else:
    persistent_node_ids = set()

print(f"在最后{len(last5_iterations)}轮的树结构中持续存在的节点数量: {len(persistent_node_ids)}")


# --- Part 2: 为持续存在的节点计算词汇稳定性 ---

# 1. 读取词汇分布数据
word_dist_path = Path('step3_d3_g005_e005_a01/depth_3_gamma_0.05_eta_0.05_alpha_0.1_run_1/iteration_node_word_distributions.csv')
word_df = pd.read_csv(word_dist_path)

# 2. 数据预处理
word_df['iteration'] = pd.to_numeric(word_df['iteration'], errors='coerce')
word_df['count'] = pd.to_numeric(word_df['count'], errors='coerce')
word_df.dropna(subset=['iteration', 'count', 'node_id', 'word'], inplace=True)
last5_words_df = word_df[word_df['iteration'].isin(last5_iterations)]

# 3. 获取Top 10词汇并重塑数据
top10_words_df = last5_words_df.sort_values('count', ascending=False).groupby(['iteration', 'node_id']).head(10)
top_words_sets = top10_words_df.groupby(['iteration', 'node_id'])['word'].apply(set).reset_index()
pivoted_sets = top_words_sets.pivot(index='node_id', columns='iteration', values='word')

# 4. 仅筛选出持续存在的节点进行分析
persistent_nodes_pivoted = pivoted_sets[pivoted_sets.index.isin(persistent_node_ids)].dropna()
print(f"在持续存在的节点中，拥有完整Top-10词汇历史的节点数量: {len(persistent_nodes_pivoted)}")

# 5. 定义函数并计算重叠率
def calculate_overlap_rate(row):
    intersection_set = set.intersection(*row)
    return len(intersection_set) / 10.0

node_stability = pd.DataFrame(index=persistent_nodes_pivoted.index)
node_stability['word_overlap_rate'] = persistent_nodes_pivoted.apply(calculate_overlap_rate, axis=1)
node_stability.reset_index(inplace=True)
print("持续存在节点的Top-10词汇稳定性计算完成。")


# --- Part 3: *** 新增 *** 提取稳定与不稳定词汇列表 ---

# 1. 获取最后一轮的Top-10词汇和计数值
last_iter_top10_df = top10_words_df[top10_words_df['iteration'] == last_iteration]
last_iter_top10_sets = last_iter_top10_df.groupby('node_id')['word'].apply(set).to_dict()
last_iter_word_counts = last_iter_top10_df.set_index(['node_id', 'word'])['count'].to_dict()

# 2. 计算每个持续存在节点的稳定词汇（交集）
stable_words_map = {}
for node_id, row in persistent_nodes_pivoted.iterrows():
    stable_words_map[node_id] = set.intersection(*row)

# 3. 构建包含词汇列表的DataFrame
word_details_list = []
all_final_nodes = set(last_iter_top10_sets.keys())

for node_id in all_final_nodes:
    final_words = last_iter_top10_sets.get(node_id, set())
    stable_words = stable_words_map.get(node_id, set())
    unstable_words = final_words - stable_words
    
    stable_list = sorted(
        [(word, last_iter_word_counts.get((node_id, word), 0)) for word in stable_words],
        key=lambda x: x[1], reverse=True
    )
    unstable_list = sorted(
        [(word, last_iter_word_counts.get((node_id, word), 0)) for word in unstable_words],
        key=lambda x: x[1], reverse=True
    )
    
    word_details_list.append({
        'node_id': node_id,
        'stable_top_words': stable_list,
        'unstable_top_words': unstable_list
    })

word_details_df = pd.DataFrame(word_details_list)
print("已提取最后一轮各节点的稳定与不稳定Top词汇。")


# --- Part 4: 合并所有指标并保存 ---

# 1. 合并所有层级的文档统计
layer0_combined = layer0_stats.copy(); layer0_combined['layer'] = 'Layer_0'
layer1_combined = layer1_stats.copy(); layer1_combined['layer'] = 'Layer_1'
layer2_combined = leaf_stats.copy(); layer2_combined['layer'] = 'Layer_2'
all_layers_stats = pd.concat([layer0_combined, layer1_combined, layer2_combined], ignore_index=True)

# 2. 添加统一的 node_id 列
all_layers_stats['node_id'] = all_layers_stats['layer_2_node_id'].fillna(
                                 all_layers_stats['layer_1_node_id']).fillna(
                                 all_layers_stats['layer_0_node_id'])

# 3. 将词汇重叠率和词汇列表合并到主表中
all_layers_combined_stats = pd.merge(all_layers_stats, node_stability, on='node_id', how='left')
all_layers_combined_stats = pd.merge(all_layers_combined_stats, word_details_df, on='node_id', how='left')


# 4. 添加 is_persistent 标志列
all_layers_combined_stats['is_persistent'] = all_layers_combined_stats['node_id'].isin(persistent_node_ids)

# 5. 整理最终列顺序
final_column_order = [
    'layer', 'node_id', 'is_persistent', 
    'layer_0_node_id', 'layer_1_node_id', 'layer_2_node_id', 
    'stable_count', 'unstable_count', 'total_docs', 'stability_ratio', 
    'word_overlap_rate', 'stable_top_words', 'unstable_top_words'
]
# 确保所有列都存在，即使在某些情况下某些列可能为空
for col in final_column_order:
    if col not in all_layers_combined_stats.columns:
        all_layers_combined_stats[col] = None
        
all_layers_combined_stats = all_layers_combined_stats[final_column_order]

# 6. 保存所有结果
output_dir = Path('convergence_analysis_results')
output_dir.mkdir(exist_ok=True)

output_file_path = output_dir / 'all_layers_stability_stats.csv'
all_layers_combined_stats.to_csv(output_file_path, index=False, encoding='utf-8')
print(f"\n所有层级的组合统计已保存到: {output_file_path}")

summary_stats = pd.DataFrame({
    'metric': ['总稳定文档数', '总不稳定文档数', '总文档数', '总体稳定率'],
    'value': [total_stable, total_unstable, total_stable + total_unstable, 
              f"{total_stable/(total_stable+total_unstable):.2%}"]
})
summary_stats.to_csv(output_dir / 'overall_summary.csv', index=False, encoding='utf-8')
print(f"总体统计摘要已保存到: {output_dir / 'overall_summary.csv'}")

convergence_docs_df = pd.DataFrame({
    'document_id': convergent_docs + non_convergent_docs,
    'convergence_status': ['convergent'] * len(convergent_docs) + ['non_convergent'] * len(non_convergent_docs)
})
convergence_docs_df.to_csv(output_dir / 'document_convergence_status.csv', index=False, encoding='utf-8')
print(f"文档收敛状态已保存到: {output_dir / 'document_convergence_status.csv'}")

print(f"\n分析完成，已生成3个核心CSV文件。")
print(f"1. {output_dir / 'all_layers_stability_stats.csv'}")
print(f"2. {output_dir / 'overall_summary.csv'}")
print(f"3. {output_dir / 'document_convergence_status.csv'}")

# 显示合并后表格的预览
print("\n=== 最终合并表格预览 ===")
pd.set_option('display.max_columns', None) # 显示所有列
pd.set_option('display.width', 1000) # 加宽显示
print(all_layers_combined_stats.head())

在最后5轮的树结构中持续存在的节点数量: 226
在持续存在的节点中，拥有完整Top-10词汇历史的节点数量: 226
持续存在节点的Top-10词汇稳定性计算完成。
已提取最后一轮各节点的稳定与不稳定Top词汇。

所有层级的组合统计已保存到: convergence_analysis_results/all_layers_stability_stats.csv
总体统计摘要已保存到: convergence_analysis_results/overall_summary.csv
文档收敛状态已保存到: convergence_analysis_results/document_convergence_status.csv

分析完成，已生成3个核心CSV文件。
1. convergence_analysis_results/all_layers_stability_stats.csv
2. convergence_analysis_results/overall_summary.csv
3. convergence_analysis_results/document_convergence_status.csv

=== 最终合并表格预览 ===
     layer  node_id  is_persistent  layer_0_node_id  layer_1_node_id  layer_2_node_id  stable_count  unstable_count  total_docs  stability_ratio  word_overlap_rate                                   stable_top_words                                 unstable_top_words
0  Layer_0      0.0           True                0              NaN              NaN           589             381         970         0.607216                0.8  [(model, 1370), (method, 1354), (u

In [5]:
# 首先，我们研究倒数5轮都存在的文档&节点
# 判断主题覆盖度>50%的节点，同时判断其文档的收敛性
# 根据top词语绘制节点语义

In [6]:
import pandas as pd
import numpy as np

# --- Part 1: 识别目标节点 ---
# 目标节点：在最后一轮存在，但不是在过去5轮都持续存在的节点。

# 1. 获取最后一轮的所有节点ID
last_round_structure_df = structure_df[structure_df['iteration'] == last_iteration]
l0_final = set(last_round_structure_df['layer_0_node_id'].unique())
l1_final = set(last_round_structure_df['layer_1_node_id'].unique())
l2_final = set(last_round_structure_df['layer_2_node_id'].unique())
final_round_node_ids = l0_final.union(l1_final).union(l2_final)

# 2. 从最后一轮节点中，排除掉那些持续存在的节点
# np.nan 会被识别为节点，需要排除
final_round_node_ids.discard(np.nan)
non_persistent_final_nodes = final_round_node_ids - persistent_node_ids

print(f"最后一轮共有 {len(final_round_node_ids)} 个节点。")
print(f"其中有 {len(non_persistent_final_nodes)} 个节点不是“持续存在”的，我们将对它们进行分析。")


# --- Part 2: 计算非持续存在节点的词汇连续性 ---

# 1. 预处理词汇数据，筛选出最后5轮
# 我们将使用之前已经加载和筛选过的 `last5_words_df`

# 2. 获取每个节点在每一轮的Top-10词汇（处理并列情况）
# 使用rank函数，如果第10和第11个词数量相同，则都包含进来
last5_words_df['rank'] = last5_words_df.groupby(['iteration', 'node_id'])['count'].rank(method='min', ascending=False)
top_words_df = last5_words_df[last5_words_df['rank'] <= 10].copy()

# 3. 构建一个方便查询的Top词汇字典: {(iteration, node_id): {word1, word2, ...}}
top_words_sets_by_iter_node = top_words_df.groupby(['iteration', 'node_id'])['word'].apply(set).to_dict()

# 4. 遍历每个目标节点，计算其词汇的连续出现次数
non_persistent_stability_results = {}

for node_id in non_persistent_final_nodes:
    # a. 获取该节点在最后一轮的Top词汇
    final_top_words = top_words_sets_by_iter_node.get((last_iteration, node_id), set())
    
    if not final_top_words:
        continue # 如果节点在最后一轮没有词汇，则跳过

    node_word_stability = []
    # b. 对每一个Top词汇，向前追溯
    for word in final_top_words:
        consecutive_count = 0
        # 从最后一轮开始，倒序遍历
        for it in last5_iterations: 
            # 获取当前迭代轮次的Top词汇集合
            current_iter_top_words = top_words_sets_by_iter_node.get((it, node_id), set())
            
            if word in current_iter_top_words:
                consecutive_count += 1
            else:
                # 一旦中断，就停止计数
                break
        
        node_word_stability.append((word, consecutive_count))
    
    # c. 按连续出现次数降序排列结果
    node_word_stability.sort(key=lambda x: x[1], reverse=True)
    non_persistent_stability_results[node_id] = node_word_stability

print(f"\n分析完成！已计算 {len(non_persistent_stability_results)} 个非持续存在节点的词汇连续性。")


# --- Part 3: 展示结果 ---
print("\n=== 非持续存在节点词汇连续性分析 (示例) ===")
# 打印前5个节点的结果作为示例
count = 0
for node_id, stability_list in non_persistent_stability_results.items():
    if count < 5:
        print(f"\n--- Node ID: {node_id} ---")
        print("词汇, 连续出现次数")
        for word, num in stability_list:
            print(f"{word}, {num}")
        count += 1
    else:
        break

# 您也可以将结果保存到文件中
non_persistent_df = pd.DataFrame.from_dict(non_persistent_stability_results, orient='index')
non_persistent_df.to_csv(output_dir / 'non_persistent_node_word_stability.csv')
print("\n完整结果已保存到 non_persistent_node_word_stability.csv")

最后一轮共有 315 个节点。
其中有 89 个节点不是“持续存在”的，我们将对它们进行分析。

分析完成！已计算 89 个非持续存在节点的词汇连续性。

=== 非持续存在节点词汇连续性分析 (示例) ===

--- Node ID: 19179 ---
词汇, 连续出现次数
integrator, 4
time, 4
five, 4
continuum, 4
balance, 4
law, 4
discrete, 4
integration, 4
momentum, 4
equation, 2
dynamical, 2
system, 2
irregular, 1
function, 1

--- Node ID: 19185 ---
词汇, 连续出现次数
microstructures, 4
initiation, 4
contribution, 4
interface, 4
across, 4
damage, 4
influence, 4
general, 4
cohesive, 4
zone, 4
electrical, 4
jump, 4
effective, 4
resistance, 4
development, 4
study, 4
bulk, 4
property, 2

--- Node ID: 19186 ---
词汇, 连续出现次数
one, 2
achieve, 1
cyclic, 1
compression, 1
modeling, 1
metric, 1
find, 1
manufacture, 1
require, 1
preserve, 1
ratio, 1
analyze, 1
storage, 1
development, 1
select, 1
additive, 1
wavelet, 1
result, 1
error, 1
address, 1
hyperreduction, 1
reconstruct, 1
reconstruction, 1
measurement, 1
data, 1

--- Node ID: 19204 ---
词汇, 连续出现次数
wave, 1
st, 1
mesh, 1
joint, 1
cell, 1
representation, 1
code, 1
propagation, 1
d

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  last5_words_df['rank'] = last5_words_df.groupby(['iteration', 'node_id'])['count'].rank(method='min', ascending=False)
