In [1]:
import pandas as pd
import numpy as np
import os
import glob
from scipy.special import gammaln

def calculate_renyi_entropy_vectorized(node_data, all_words, eta_prior=1.0, renyi_alpha=2.0):
    """
    向量化版本的Renyi熵计算
    
    Parameters:
    node_data: DataFrame, 包含word和count列的节点数据
    all_words: list, 全量词汇表
    eta_prior: float, Dirichlet先验平滑参数（从eta值获取）
    renyi_alpha: float, Renyi熵的阶数参数
    
    Returns:
    tuple: (entropy, nonzero_word_count) Renyi熵值和非零词汇数量
    """
    if len(all_words) == 0:
        return 0.0, 0
    
    # 创建词汇到索引的映射
    word_to_idx = {word: idx for idx, word in enumerate(all_words)}
    
    # 初始化计数向量
    counts = np.zeros(len(all_words))
    
    # 填充实际计数
    for _, row in node_data.iterrows():
        word = row['word']
        if pd.notna(word) and word in word_to_idx:
            counts[word_to_idx[word]] = row['count']
    
    # 统计非零词汇数量（平滑前）
    nonzero_word_count = np.sum(counts > 0)
    
    # 添加eta平滑
    smoothed_counts = counts + eta_prior
    
    # 计算概率分布
    probabilities = smoothed_counts / np.sum(smoothed_counts)
    
    # 计算Renyi熵（使用自然对数）
    if renyi_alpha == 1.0:
        # Shannon熵（由于alpha平滑，所有概率都>0，无需添加小常数）
        entropy = -np.sum(probabilities * np.log(probabilities))
    else:
        # 一般Renyi熵
        entropy = (1 / (1 - renyi_alpha)) * np.log(np.sum(probabilities ** renyi_alpha))
    
    return entropy, int(nonzero_word_count)

def process_all_iteration_files_by_alpha(base_path=".", renyi_alpha=2.0):
    """
    针对每个iteration_node_word_distributions.csv单独处理并保存结果
    修正版：使用固定的eta=0.05作为Dirichlet平滑参数（适配step3）
    """
    pattern = os.path.join(base_path, "**", "iteration_node_word_distributions.csv")
    files = glob.glob(pattern, recursive=True)
    
    # 去重，确保每个文件只处理一次
    files = list(set(files))
    files.sort()  # 排序以便有序处理
    
    print(f"总共找到 {len(files)} 个文件待处理")
    
    for idx, file_path in enumerate(files, 1):
        folder_path = os.path.dirname(file_path)
        folder_name = os.path.basename(folder_path)
        
        # step3中eta值固定为0.05（用于Dirichlet平滑）
        eta_prior = 0.05  # 修正：固定使用0.05作为平滑参数
        
        # 从文件夹名称提取alpha值（仅用于记录文件夹信息）
        alpha = 0.1  # 默认值
        if 'alpha_' in folder_name:
            try:
                alpha_part = folder_name.split('alpha_')[1].split('_')[0]
                alpha = float(alpha_part)
            except (IndexError, ValueError) as e:
                # 通过文件夹名称模式匹配
                if 'a001' in folder_name:
                    alpha = 0.01
                elif 'a005' in folder_name:
                    alpha = 0.05
                elif 'a02' in folder_name:
                    alpha = 0.2
                elif 'a05' in folder_name and 'a005' not in folder_name:
                    alpha = 0.5
                elif 'a1_' in folder_name or 'a1' in folder_name.split('_')[-1]:
                    alpha = 1.0
                elif 'a01' in folder_name:
                    alpha = 0.1
        else:
            # 通过文件夹名称模式匹配
            if 'a001' in folder_name:
                alpha = 0.01
            elif 'a005' in folder_name:
                alpha = 0.05
            elif 'a02' in folder_name:
                alpha = 0.2
            elif 'a05' in folder_name and 'a005' not in folder_name:
                alpha = 0.5
            elif 'a1_' in folder_name or 'a1' in folder_name.split('_')[-1]:
                alpha = 1.0
            elif 'a01' in folder_name:
                alpha = 0.1
        
        print(f"\n[{idx}/{len(files)}] 处理文件: {file_path}")
        print(f"文件夹: {folder_name}")
        print(f"提取的alpha值: {alpha} (仅用于记录)")
        print(f"使用的eta平滑值: {eta_prior}")
        
        try:
            df = pd.read_csv(file_path)
            
            # 清理列名，去除单引号、双引号和空格
            df.columns = [col.strip("'\" ") for col in df.columns]
            
            if 'node_id' not in df.columns:
                print(f"警告：{file_path} 缺少 node_id 列，跳过该文件")
                continue
                
            max_iteration = df['iteration'].max()
            last_iteration_data = df[df['iteration'] == max_iteration]
            all_words = list(last_iteration_data['word'].dropna().unique())
            
            print(f"最后一轮iteration: {max_iteration}, 词汇表大小: {len(all_words)}, 节点数: {last_iteration_data['node_id'].nunique()}")
            
            results = []
            for node_id in last_iteration_data['node_id'].unique():
                node_data = last_iteration_data[last_iteration_data['node_id'] == node_id]
                
                # 使用固定的eta_prior=0.05进行Dirichlet平滑
                entropy, nonzero_words = calculate_renyi_entropy_vectorized(
                    node_data, all_words, eta_prior, renyi_alpha  # 使用eta_prior
                )
                
                # 计算稀疏度（非零词汇占比）
                sparsity_ratio = nonzero_words / len(all_words) if len(all_words) > 0 else 0
                
                results.append({
                    'node_id': node_id,
                    'renyi_entropy_corrected': entropy,
                    'nonzero_word_count': nonzero_words,
                    'total_vocabulary_size': len(all_words),
                    'sparsity_ratio': sparsity_ratio,
                    'eta_used': eta_prior,  # 修正：记录实际使用的eta值
                    'alpha_folder': alpha,  # 修正：记录文件夹的alpha值
                    'renyi_alpha': renyi_alpha,
                    'iteration': max_iteration
                })
            
            # 保存新的corrected_renyi_entropy.csv文件
            results_df = pd.DataFrame(results)
            output_path = os.path.join(folder_path, 'corrected_renyi_entropy.csv')
            results_df.to_csv(output_path, index=False)
            print(f"✓ 保存修正的Renyi熵结果到: {output_path}")
            
            # 输出一些统计信息
            print(f"节点词汇稀疏性统计:")
            print(f"  - 平均非零词汇数: {results_df['nonzero_word_count'].mean():.1f}")
            print(f"  - 非零词汇数范围: {results_df['nonzero_word_count'].min()}-{results_df['nonzero_word_count'].max()}")
            print(f"  - 平均稀疏度: {results_df['sparsity_ratio'].mean():.3f}")
            print("=" * 50)
            
        except Exception as e:
            import traceback
            print(f"❌ 处理文件 {file_path} 时出错: {str(e)}")
            print("详细错误信息:")
            traceback.print_exc()

In [2]:
# 设置参数 - 适配step3
base_path = "/Volumes/My Passport/收敛结果/step3"  # step3路径
renyi_alpha = 2.0  # Renyi熵阶数参数

print("=" * 80)
print("Step3: 开始分析Alpha参数对模型的影响")
print("=" * 80)

# 1. 计算修正的Renyi熵（按alpha值自动调整先验）
print("开始计算修正的Renyi熵...")
process_all_iteration_files_by_alpha(base_path, renyi_alpha)

print("=" * 50)
print("✅ Step3 Renyi熵计算完成！")
print("=" * 50)

Step3: 开始分析Alpha参数对模型的影响
开始计算修正的Renyi熵...
总共找到 18 个文件待处理

[1/18] 处理文件: /Volumes/My Passport/收敛结果/step3/step3_d3_g005_e005_a001/depth_3_gamma_0.05_eta_0.05_alpha_0.01_run_1/iteration_node_word_distributions.csv
文件夹: depth_3_gamma_0.05_eta_0.05_alpha_0.01_run_1
提取的alpha值: 0.01 (仅用于记录)
使用的eta平滑值: 0.05
最后一轮iteration: 285, 词汇表大小: 1490, 节点数: 312
✓ 保存修正的Renyi熵结果到: /Volumes/My Passport/收敛结果/step3/step3_d3_g005_e005_a001/depth_3_gamma_0.05_eta_0.05_alpha_0.01_run_1/corrected_renyi_entropy.csv
节点词汇稀疏性统计:
  - 平均非零词汇数: 60.7
  - 非零词汇数范围: 0-829
  - 平均稀疏度: 0.041

[2/18] 处理文件: /Volumes/My Passport/收敛结果/step3/step3_d3_g005_e005_a001/depth_3_gamma_0.05_eta_0.05_alpha_0.01_run_2/iteration_node_word_distributions.csv
文件夹: depth_3_gamma_0.05_eta_0.05_alpha_0.01_run_2
提取的alpha值: 0.01 (仅用于记录)
使用的eta平滑值: 0.05
最后一轮iteration: 285, 词汇表大小: 1490, 节点数: 313
✓ 保存修正的Renyi熵结果到: /Volumes/My Passport/收敛结果/step3/step3_d3_g005_e005_a001/depth_3_gamma_0.05_eta_0.05_alpha_0.01_run_2/corrected_renyi_entropy.csv
节点词汇稀疏性统计:
  -

In [3]:
def calculate_node_document_counts(path_structures_df):
    """
    从叶子节点向上聚合，计算每个节点的文档数和层级关系
    
    Parameters:
    path_structures_df: DataFrame, iteration_path_structures.csv的数据（已经过滤为最后一轮）
    
    Returns:
    dict: {node_id: {'document_count': int, 'layer': int, 'parent_id': int, 'child_ids': list}} 映射
    """
    # 获取所有layer列 - 修正正则表达式
    layer_columns = [col for col in path_structures_df.columns if col.startswith('layer_') and col.endswith('_node_id')]
    layer_columns.sort()  # 确保按顺序排列
    max_layer_idx = len(layer_columns) - 1
    
    print(f"[DEBUG] 发现层级列: {layer_columns}")
    print(f"[DEBUG] 最大层级索引: {max_layer_idx}")
    
    # 初始化节点信息字典
    node_info = {}
    
    # 首先建立所有节点的层级和父子关系
    for _, row in path_structures_df.iterrows():
        path_nodes = []
        for layer_idx in range(max_layer_idx + 1):
            layer_col = f'layer_{layer_idx}_node_id'
            if layer_col in path_structures_df.columns and pd.notna(row[layer_col]):
                path_nodes.append(row[layer_col])
            else:
                break
        
        # 为路径中的每个节点建立层级和父子关系
        for i, node in enumerate(path_nodes):
            if node not in node_info:
                node_info[node] = {
                    'document_count': 0,
                    'layer': i,
                    'parent_id': None,
                    'child_ids': [],
                    'child_count': 0
                }
            else:
                # 更新层级信息（确保一致性）
                node_info[node]['layer'] = i
            
            # 设置父节点关系
            if i > 0:  # 不是根节点
                parent_node = path_nodes[i-1]
                node_info[node]['parent_id'] = parent_node
                
                # 在父节点的子节点列表中添加当前节点
                if parent_node not in node_info:
                    node_info[parent_node] = {
                        'document_count': 0,
                        'layer': i-1,
                        'parent_id': None,
                        'child_ids': [],
                        'child_count': 0
                    }
                
                if node not in node_info[parent_node]['child_ids']:
                    node_info[parent_node]['child_ids'].append(node)
    
    # 然后处理叶子节点的文档数 - 在层级关系建立后进行
    for _, row in path_structures_df.iterrows():
        leaf_node = row['leaf_node_id']
        if pd.notna(leaf_node) and leaf_node in node_info:
            node_info[leaf_node]['document_count'] += row['document_count']
    
    # 从倒数第二层开始向上聚合文档数
    for layer_idx in range(max_layer_idx - 1, -1, -1):  # 从倒数第二层到第0层
        layer_col = f'layer_{layer_idx}_node_id'
        
        if layer_col not in path_structures_df.columns:
            continue
            
        # 获取这一层的所有唯一节点
        layer_nodes = path_structures_df[layer_col].dropna().unique()
        
        for node in layer_nodes:
            if node in node_info and node_info[node]['document_count'] == 0:
                # 计算文档数：汇总所有子节点的文档数
                child_doc_count = 0
                for child_id in node_info[node]['child_ids']:
                    if child_id in node_info:
                        child_doc_count += node_info[child_id]['document_count']
                
                # 如果没有子节点文档数，则直接从路径结构中计算
                if child_doc_count == 0:
                    total_docs = path_structures_df[path_structures_df[layer_col] == node]['document_count'].sum()
                    node_info[node]['document_count'] = total_docs
                else:
                    node_info[node]['document_count'] = child_doc_count

    # 计算每个节点的子节点数量
    for node_id, info in node_info.items():
        info['child_count'] = len(info['child_ids'])
    
    return node_info

def add_document_counts_to_entropy_files(base_path="."):
    """
    将文档数和层级信息添加到corrected_renyi_entropy.csv文件中（适配step3）
    """
    pattern = os.path.join(base_path, "**", "iteration_path_structures.csv")
    files = glob.glob(pattern, recursive=True)
    
    for file_path in files:
        folder_path = os.path.dirname(file_path)
        folder_name = os.path.basename(folder_path)
        
        print(f"\n处理路径结构文件: {folder_name}")
        
        try:
            # 读取path_structures文件
            df = pd.read_csv(file_path)
            df.columns = [col.strip("'\" ") for col in df.columns]
            
            # 获取最后一轮数据
            max_iteration = df['iteration'].max()
            last_iteration_data = df[df['iteration'] == max_iteration]
            
            print(f"最后一轮iteration: {max_iteration}, 路径数: {len(last_iteration_data)}")
            
            # 计算每个节点的文档数和层级关系
            node_info = calculate_node_document_counts(last_iteration_data)
            
            print(f"计算得到 {len(node_info)} 个节点的信息")
            
            # 读取对应的corrected_renyi_entropy.csv
            entropy_file = os.path.join(folder_path, 'corrected_renyi_entropy.csv')
            if os.path.exists(entropy_file):
                entropy_df = pd.read_csv(entropy_file)
                
                # 添加新列 - 修正child_ids格式和child_count计算
                entropy_df['document_count'] = entropy_df['node_id'].map(lambda x: node_info.get(x, {}).get('document_count', 0))
                entropy_df['layer'] = entropy_df['node_id'].map(lambda x: node_info.get(x, {}).get('layer', -1))
                entropy_df['parent_id'] = entropy_df['node_id'].map(lambda x: node_info.get(x, {}).get('parent_id', None))
                
                # 修正child_ids格式：使用方括号而不是逗号
                entropy_df['child_ids'] = entropy_df['node_id'].map(
                    lambda x: '[' + ','.join(map(str, node_info.get(x, {}).get('child_ids', []))) + ']' 
                    if node_info.get(x, {}).get('child_ids') else ''
                )
                
                # 修正child_count：直接使用列表长度
                entropy_df['child_count'] = entropy_df['node_id'].map(lambda x: len(node_info.get(x, {}).get('child_ids', [])))

                # 保存更新后的文件
                entropy_df.to_csv(entropy_file, index=False)
                print(f"已更新 {entropy_file}，添加了document_count, layer, parent_id, child_ids, child_count列")
                
                # 显示一些统计信息
                print(f"节点层级统计:")
                print(f"  - 层级分布: {entropy_df['layer'].value_counts().sort_index().to_dict()}")
                print(f"  - 文档数范围: {entropy_df['document_count'].min()}-{entropy_df['document_count'].max()}")
                print(f"  - 根节点数: {entropy_df[entropy_df['parent_id'].isna()].shape[0]}")
                print(f"  - 叶子节点数: {entropy_df[entropy_df['child_ids'] == ''].shape[0]}")
                print(f"  - 子节点数分布: {entropy_df['child_count'].value_counts().sort_index().to_dict()}")
            else:
                print(f"警告：未找到对应的entropy文件 {entropy_file}")
                
        except Exception as e:
            import traceback
            print(f"处理文件 {file_path} 时出错: {str(e)}")
            print("详细错误信息:")
            traceback.print_exc()

In [4]:
# 主函数：添加文档数和层级信息到entropy文件（适配step3）
import os
import glob
import pandas as pd 

base_path = "/Volumes/My Passport/收敛结果/step3"  # 修改为step3路径

print("=" * 50)
print("Step3: 开始添加文档数和层级信息到entropy文件...")
print("=" * 50)
add_document_counts_to_entropy_files(base_path)
print("=" * 50)
print("Step3: 文档数和层级信息添加完成！")
print("=" * 50)

Step3: 开始添加文档数和层级信息到entropy文件...

处理路径结构文件: depth_3_gamma_0.05_eta_0.05_alpha_1_run_3
最后一轮iteration: 265, 路径数: 242
[DEBUG] 发现层级列: ['layer_0_node_id', 'layer_1_node_id', 'layer_2_node_id']
[DEBUG] 最大层级索引: 2
计算得到 307 个节点的信息
已更新 /Volumes/My Passport/收敛结果/step3/step3_d3_g005_e005_a1/depth_3_gamma_0.05_eta_0.05_alpha_1_run_3/corrected_renyi_entropy.csv，添加了document_count, layer, parent_id, child_ids, child_count列
节点层级统计:
  - 层级分布: {0: 1, 1: 64, 2: 242}
  - 文档数范围: 1-970
  - 根节点数: 1
  - 叶子节点数: 242
  - 子节点数分布: {0: 242, 1: 14, 2: 15, 3: 11, 4: 7, 5: 7, 6: 5, 7: 3, 9: 1, 42: 1, 64: 1}

处理路径结构文件: depth_3_gamma_0.05_eta_0.05_alpha_1_run_1
最后一轮iteration: 245, 路径数: 267
[DEBUG] 发现层级列: ['layer_0_node_id', 'layer_1_node_id', 'layer_2_node_id']
[DEBUG] 最大层级索引: 2
计算得到 329 个节点的信息
已更新 /Volumes/My Passport/收敛结果/step3/step3_d3_g005_e005_a1/depth_3_gamma_0.05_eta_0.05_alpha_1_run_1/corrected_renyi_entropy.csv，添加了document_count, layer, parent_id, child_ids, child_count列
节点层级统计:
  - 层级分布: {0: 1, 1: 61, 2: 267}
 

In [5]:
def jensen_shannon_distance(p, q):
    """
    计算两个概率分布之间的Jensen-Shannon距离
    
    Parameters:
    p, q: array-like, 概率分布（应该已经归一化）
    
    Returns:
    float: Jensen-Shannon距离
    """
    # 确保输入是numpy数组
    p = np.array(p)
    q = np.array(q)
    
    # 计算中点分布
    m = 0.5 * (p + q)
    
    # 计算KL散度，添加小常数避免log(0)
    eps = 1e-10
    kl_pm = np.sum(p * np.log((p + eps) / (m + eps)))
    kl_qm = np.sum(q * np.log((q + eps) / (m + eps)))
    
    # Jensen-Shannon散度
    js_divergence = 0.5 * kl_pm + 0.5 * kl_qm
    
    # Jensen-Shannon距离（散度的平方根）
    js_distance = np.sqrt(js_divergence)
    
    return js_distance

def calculate_jensen_shannon_distances_with_weighted_entropy_by_alpha(base_path="."):
    """
    修正版：计算每层节点之间的Jensen-Shannon距离和文档数加权平均Renyi熵
    使用固定的eta=0.05作为Dirichlet平滑参数，alpha值仅用于识别文件夹
    
    注意：Renyi熵计算使用自然对数(loge)，单位为nats
    """
    # 查找所有iteration_node_word_distributions.csv文件
    pattern = os.path.join(base_path, "**", "iteration_node_word_distributions.csv")
    files = glob.glob(pattern, recursive=True)
    
    print(f"找到 {len(files)} 个词分布文件待处理")
    
    # step3中eta值固定为0.05（用于Dirichlet平滑）
    eta = 0.05  # 修正：固定使用0.05作为平滑参数
    
    # 按alpha值分组显示文件分布
    files_by_alpha = {}
    for file_path in files:
        folder_name = os.path.basename(os.path.dirname(file_path))
        alpha = 0.1  # 默认值
        if 'alpha_' in folder_name:
            try:
                alpha_part = folder_name.split('alpha_')[1].split('_')[0]
                alpha = float(alpha_part)
            except:
                # 通过文件夹名称模式匹配
                if 'a001' in folder_name:
                    alpha = 0.01
                elif 'a005' in folder_name:
                    alpha = 0.05
                elif 'a02' in folder_name:
                    alpha = 0.2
                elif 'a05' in folder_name and 'a005' not in folder_name:
                    alpha = 0.5
                elif 'a1_' in folder_name or 'a1' in folder_name.split('_')[-1]:
                    alpha = 1.0
                elif 'a01' in folder_name:
                    alpha = 0.1
        else:
            # 通过文件夹名称模式匹配
            if 'a001' in folder_name:
                alpha = 0.01
            elif 'a005' in folder_name:
                alpha = 0.05
            elif 'a02' in folder_name:
                alpha = 0.2
            elif 'a05' in folder_name and 'a005' not in folder_name:
                alpha = 0.5
            elif 'a1_' in folder_name or 'a1' in folder_name.split('_')[-1]:
                alpha = 1.0
            elif 'a01' in folder_name:
                alpha = 0.1
        
        if alpha not in files_by_alpha:
            files_by_alpha[alpha] = []
        files_by_alpha[alpha].append(file_path)
    
    print("文件分布：")
    for alpha in sorted(files_by_alpha.keys()):
        print(f"  Alpha {alpha}: {len(files_by_alpha[alpha])} 个文件")
    print(f"使用固定的Eta值: {eta}")
    print()
    
    # 处理每个文件
    for idx, file_path in enumerate(files, 1):
        folder_path = os.path.dirname(file_path)
        folder_name = os.path.basename(folder_path)
        
        # 从文件夹名称提取alpha值和run信息（仅用于记录）
        alpha = 0.1  # 默认值
        run_id = "未知"
        
        if 'alpha_' in folder_name:
            try:
                alpha_part = folder_name.split('alpha_')[1].split('_')[0]
                alpha = float(alpha_part)
            except:
                # 通过文件夹名称模式匹配
                if 'a001' in folder_name:
                    alpha = 0.01
                elif 'a005' in folder_name:
                    alpha = 0.05
                elif 'a02' in folder_name:
                    alpha = 0.2
                elif 'a05' in folder_name and 'a005' not in folder_name:
                    alpha = 0.5
                elif 'a1_' in folder_name or 'a1' in folder_name.split('_')[-1]:
                    alpha = 1.0
                elif 'a01' in folder_name:
                    alpha = 0.1
        else:
            # 通过文件夹名称模式匹配
            if 'a001' in folder_name:
                alpha = 0.01
            elif 'a005' in folder_name:
                alpha = 0.05
            elif 'a02' in folder_name:
                alpha = 0.2
            elif 'a05' in folder_name and 'a005' not in folder_name:
                alpha = 0.5
            elif 'a1_' in folder_name or 'a1' in folder_name.split('_')[-1]:
                alpha = 1.0
            elif 'a01' in folder_name:
                alpha = 0.1
        
        if '_run_' in folder_name:
            try:
                run_id = folder_name.split('_run_')[1]
            except:
                pass
        
        print("=" * 80)
        print(f"[{idx}/{len(files)}] 处理 Alpha={alpha}, Run={run_id}")
        print(f"使用固定Eta={eta}进行Dirichlet平滑")
        print("=" * 80)
        
        try:
            # 读取词分布数据
            word_df = pd.read_csv(file_path)
            word_df.columns = [col.strip("'\" ") for col in word_df.columns]
            
            # 获取最后一轮数据
            max_iteration = word_df['iteration'].max()
            last_iteration_data = word_df[word_df['iteration'] == max_iteration]
            
            # 获取全量词汇表
            all_words = sorted(list(last_iteration_data['word'].dropna().unique()))
            
            # 读取entropy文件获取层级信息
            entropy_file = os.path.join(folder_path, 'corrected_renyi_entropy.csv')
            if not os.path.exists(entropy_file):
                print(f"⚠️  未找到entropy文件，跳过此文件")
                continue
                
            entropy_df = pd.read_csv(entropy_file)
            
            # 基本信息
            print(f"📊 基本信息:")
            print(f"   词汇表大小: {len(all_words)}")
            print(f"   最后iteration: {max_iteration}")
            
            # 按层级分组节点
            layers = entropy_df.groupby('layer')['node_id'].apply(list).to_dict()
            print(f"   层级分布: {[(layer, len(nodes)) for layer, nodes in layers.items()]}")
            
            # 为每个节点构建概率分布
            print(f"🔄 构建概率分布...")
            node_distributions = {}
            
            for node_id in entropy_df['node_id'].unique():
                # 获取该节点的词分布
                node_words = last_iteration_data[last_iteration_data['node_id'] == node_id]
                
                # 初始化计数向量
                counts = np.zeros(len(all_words))
                word_to_idx = {word: idx for idx, word in enumerate(all_words)}
                
                # 填充实际计数
                for _, row in node_words.iterrows():
                    word = row['word']
                    if pd.notna(word) and word in word_to_idx:
                        counts[word_to_idx[word]] = row['count']
                
                # 修正：使用固定的eta值进行Dirichlet平滑
                smoothed_counts = counts + eta  # 使用eta=0.05而不是alpha
                
                # 计算概率分布
                probabilities = smoothed_counts / np.sum(smoothed_counts)
                node_distributions[node_id] = probabilities
            
            print(f"   ✓ 完成 {len(node_distributions)} 个节点的概率分布")
            
            # 计算每层内节点的JS距离和加权平均熵
            all_js_distances = []
            layer_avg_distances = []
            
            print(f"📐 计算JS距离...")
            for layer, layer_nodes in layers.items():
                layer_js_distances = []
                n = len(layer_nodes)
                
                # 计算该层内所有节点对的JS距离
                for i, node1 in enumerate(layer_nodes):
                    for j, node2 in enumerate(layer_nodes):
                        if i < j:  # 只计算上三角矩阵，避免重复和自己与自己
                            if node1 in node_distributions and node2 in node_distributions:
                                p = node_distributions[node1]
                                q = node_distributions[node2]
                                
                                # 计算Jensen-Shannon距离
                                js_distance = jensen_shannon_distance(p, q)
                                
                                layer_js_distances.append({
                                    'layer': layer,
                                    'node1_id': node1,
                                    'node2_id': node2,
                                    'js_distance': js_distance,
                                    'node1_doc_count': entropy_df[entropy_df['node_id'] == node1]['document_count'].iloc[0] if len(entropy_df[entropy_df['node_id'] == node1]) > 0 else 0,
                                    'node2_doc_count': entropy_df[entropy_df['node_id'] == node2]['document_count'].iloc[0] if len(entropy_df[entropy_df['node_id'] == node2]) > 0 else 0
                                })
                
                all_js_distances.extend(layer_js_distances)
                
                # 计算该层的平均JS距离
                avg_js_distance = 0.0
                if layer_js_distances and n > 1:
                    total_js_distance = sum(d['js_distance'] for d in layer_js_distances)
                    max_pairs = n * (n - 1) // 2
                    avg_js_distance = total_js_distance / max_pairs
                
                # 计算该层的文档数加权平均Renyi熵
                layer_entropy_data = entropy_df[entropy_df['layer'] == layer]
                total_docs = layer_entropy_data['document_count'].sum()
                
                if total_docs > 0:
                    weighted_entropy = (layer_entropy_data['document_count'] * layer_entropy_data['renyi_entropy_corrected']).sum() / total_docs
                else:
                    weighted_entropy = 0.0
                
                layer_avg_distances.append({
                    'layer': layer,
                    'node_count': n,
                    'total_pairs': len(layer_js_distances),
                    'max_pairs': n * (n - 1) // 2 if n > 1 else 0,
                    'sum_js_distance': sum(d['js_distance'] for d in layer_js_distances),
                    'avg_js_distance': avg_js_distance,
                    'total_documents': total_docs,
                    'weighted_avg_renyi_entropy': weighted_entropy,
                    'eta_used': eta,  # 修正：记录实际使用的eta值
                    'alpha_folder': alpha  # 修正：记录文件夹的alpha值
                })
                
                # 简洁的层级统计输出
                print(f"   Layer {layer}: {n}节点, JS={avg_js_distance:.4f}, 熵={weighted_entropy:.4f}")
            
            # 保存结果文件
            if all_js_distances:
                js_df = pd.DataFrame(all_js_distances)
                output_path = os.path.join(folder_path, 'jensen_shannon_distances.csv')
                js_df.to_csv(output_path, index=False)
            
            if layer_avg_distances:
                avg_df = pd.DataFrame(layer_avg_distances)
                avg_output_path = os.path.join(folder_path, 'layer_average_js_distances.csv')
                avg_df.to_csv(avg_output_path, index=False)
            
            print(f"💾 结果已保存")
            
        except Exception as e:
            print(f"❌ 处理失败: {str(e)}")
    
    print("\n" + "=" * 80)
    print("✅ 全部文件处理完成！")
    print("=" * 80)

In [None]:
import numpy as np
import os
import glob
import pandas as pd 
# 主函数：计算Jensen-Shannon距离和加权平均Renyi熵
base_path = "/Volumes/My Passport/收敛结果/step3"  # 根目录

print("=" * 50)
print("Step3: 开始计算Jensen-Shannon距离和加权平均Renyi熵（按alpha值自动调整）...")  # 更新注释
print("=" * 50)
calculate_jensen_shannon_distances_with_weighted_entropy_by_alpha(base_path)
print("=" * 50)
print("Step3: Jensen-Shannon距离和加权平均Renyi熵计算完成！")  # 更新注释
print("=" * 50)

Step3: 开始计算Jensen-Shannon距离和加权平均Renyi熵（按alpha值自动调整）...
找到 18 个词分布文件待处理
文件分布：
  Alpha 0.01: 3 个文件
  Alpha 0.05: 3 个文件
  Alpha 0.1: 3 个文件
  Alpha 0.2: 3 个文件
  Alpha 0.5: 3 个文件
  Alpha 1.0: 3 个文件
使用固定的Eta值: 0.05

[1/18] 处理 Alpha=1.0, Run=3
使用固定Eta=0.05进行Dirichlet平滑
📊 基本信息:
   词汇表大小: 1490
   最后iteration: 265
   层级分布: [(0, 1), (1, 64), (2, 242)]
🔄 构建概率分布...
   ✓ 完成 307 个节点的概率分布
📐 计算JS距离...
   Layer 0: 1节点, JS=0.0000, 熵=4.9219
   Layer 1: 64节点, JS=0.6081, 熵=4.6239
   Layer 2: 242节点, JS=0.5566, 熵=4.5810
💾 结果已保存
[2/18] 处理 Alpha=1.0, Run=1
使用固定Eta=0.05进行Dirichlet平滑
📊 基本信息:
   词汇表大小: 1490
   最后iteration: 245
   层级分布: [(0, 1), (1, 61), (2, 267)]
🔄 构建概率分布...
   ✓ 完成 329 个节点的概率分布
📐 计算JS距离...
   Layer 0: 1节点, JS=0.0000, 熵=4.8951
   Layer 1: 61节点, JS=0.5939, 熵=4.4591
   Layer 2: 267节点, JS=0.5639, 熵=4.4854
💾 结果已保存
[3/18] 处理 Alpha=1.0, Run=2
使用固定Eta=0.05进行Dirichlet平滑
📊 基本信息:
   词汇表大小: 1490
   最后iteration: 245
   层级分布: [(0, 1), (1, 57), (2, 263)]
🔄 构建概率分布...
   ✓ 完成 321 个节点的概率分布
📐 计算JS距离...
   Layer 0: 

In [None]:
def aggregate_layer_statistics_by_alpha(base_path="."):
    """
    按alpha值汇总各层的JS距离和加权熵统计，在与run文件夹同级位置生成汇总表
    修正版：适配step3中的alpha参数而非eta参数
    """
    # 查找所有layer_average_js_distances.csv文件
    pattern = os.path.join(base_path, "**", "layer_average_js_distances.csv")
    files = glob.glob(pattern, recursive=True)
    
    # 存储所有数据和分组信息
    all_data = []
    alpha_groups = {}  # 用于存储每个alpha组合的父目录
    
    for file_path in files:
        folder_path = os.path.dirname(file_path)
        folder_name = os.path.basename(folder_path)
        parent_folder = os.path.dirname(folder_path)  # run文件夹的父目录
        
        # 从文件夹名称提取alpha值（适配step3）
        alpha = None
        if 'alpha_' in folder_name:
            try:
                alpha_part = folder_name.split('alpha_')[1].split('_')[0]
                alpha = float(alpha_part)
            except (IndexError, ValueError):
                # 通过文件夹名称模式匹配
                if 'a001' in folder_name:
                    alpha = 0.01
                elif 'a005' in folder_name:
                    alpha = 0.05
                elif 'a02' in folder_name:
                    alpha = 0.2
                elif 'a05' in folder_name and 'a005' not in folder_name:
                    alpha = 0.5
                elif 'a1_' in folder_name or 'a1' in folder_name.split('_')[-1]:
                    alpha = 1.0
                elif 'a01' in folder_name:
                    alpha = 0.1
        else:
            # 通过文件夹名称模式匹配
            if 'a001' in folder_name:
                alpha = 0.01
            elif 'a005' in folder_name:
                alpha = 0.05
            elif 'a02' in folder_name:
                alpha = 0.2
            elif 'a05' in folder_name and 'a005' not in folder_name:
                alpha = 0.5
            elif 'a1_' in folder_name or 'a1' in folder_name.split('_')[-1]:
                alpha = 1.0
            elif 'a01' in folder_name:
                alpha = 0.1
        
        if alpha is None:
            print(f"警告：无法从文件夹名称 {folder_name} 提取alpha值")
            continue
        
        # 提取run编号
        run_match = folder_name.split('_run_')
        if len(run_match) > 1:
            run_id = run_match[1]
        else:
            print(f"警告：无法从文件夹名称 {folder_name} 提取run编号")
            continue
        
        # 记录alpha组合的父目录
        if alpha not in alpha_groups:
            alpha_groups[alpha] = parent_folder
        
        try:
            df = pd.read_csv(file_path)
            
            for _, row in df.iterrows():
                all_data.append({
                    'alpha': alpha,  # 修正：使用alpha而不是eta
                    'run_id': run_id,
                    'layer': row['layer'],
                    'node_count': row['node_count'],
                    'avg_js_distance': row['avg_js_distance'],
                    'weighted_avg_renyi_entropy': row['weighted_avg_renyi_entropy'],
                    'total_documents': row['total_documents'],
                    'eta_used': row.get('eta_used', 0.05),  # 记录实际使用的eta值（固定0.05）
                    'alpha_folder': row.get('alpha_folder', alpha),  # 记录文件夹的alpha值
                    'parent_folder': parent_folder
                })
                
        except Exception as e:
            print(f"读取文件 {file_path} 时出错: {e}")
    
    # 转换为DataFrame
    summary_df = pd.DataFrame(all_data)
    
    if summary_df.empty:
        print("未找到有效数据")
        return
    
    print("=" * 70)
    print("各ALPHA值的层级汇总统计（Step3）")
    print("=" * 70)
    
    # 按alpha分组，生成汇总文件
    for alpha, group_data in summary_df.groupby('alpha'):
        parent_folder = group_data['parent_folder'].iloc[0]
        
        print(f"\n处理 Alpha={alpha}")
        print(f"输出目录: {parent_folder}")
        
        # 计算各层的汇总统计
        layer_summary = group_data.groupby('layer').agg({
            'avg_js_distance': ['mean', 'std', 'count'],
            'weighted_avg_renyi_entropy': ['mean', 'std', 'count'],
            'node_count': ['mean', 'std'],
            'total_documents': 'mean',
            'eta_used': 'first',  # 记录使用的eta值
            'run_id': lambda x: ', '.join(sorted(x.unique()))
        }).round(4)
        
        # 平铺列名
        layer_summary.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in layer_summary.columns]
        layer_summary = layer_summary.reset_index()
        
        # 重命名列，使其更清晰
        column_mapping = {
            'avg_js_distance_mean': 'avg_js_distance_mean',
            'avg_js_distance_std': 'avg_js_distance_std', 
            'avg_js_distance_count': 'run_count',
            'weighted_avg_renyi_entropy_mean': 'weighted_avg_renyi_entropy_mean',
            'weighted_avg_renyi_entropy_std': 'weighted_avg_renyi_entropy_std',
            'weighted_avg_renyi_entropy_count': 'entropy_run_count',
            'node_count_mean': 'avg_node_count',
            'node_count_std': 'node_count_std',
            'total_documents_mean': 'avg_total_documents',
            'eta_used_first': 'eta_used',
            'run_id_<lambda>': 'included_runs'
        }
        
        for old_name, new_name in column_mapping.items():
            if old_name in layer_summary.columns:
                layer_summary = layer_summary.rename(columns={old_name: new_name})
        
        # 添加alpha信息
        layer_summary.insert(0, 'alpha', alpha)
        
        # 保存汇总结果到与run文件夹同级的位置
        output_filename = f'alpha_{alpha}_layer_summary.csv'
        output_path = os.path.join(parent_folder, output_filename)
        layer_summary.to_csv(output_path, index=False)
        
        print(f"  保存汇总文件: {output_path}")
        print(f"  包含运行: {layer_summary['included_runs'].iloc[0] if 'included_runs' in layer_summary.columns else 'N/A'}")
        print(f"  层数: {len(layer_summary)}")
        print(f"  使用的Eta值: {layer_summary.get('eta_used', pd.Series([0.05])).iloc[0]}")
        
        # 显示简要统计
        for _, row in layer_summary.iterrows():
            layer_num = int(row['layer'])
            js_mean = row['avg_js_distance_mean']
            js_std = row['avg_js_distance_std'] if 'avg_js_distance_std' in row else 0
            entropy_mean = row['weighted_avg_renyi_entropy_mean']
            entropy_std = row['weighted_avg_renyi_entropy_std'] if 'weighted_avg_renyi_entropy_std' in row else 0
            node_count = row['avg_node_count']
            run_count = int(row['run_count']) if 'run_count' in row else 0
            
            print(f"    Layer {layer_num}: JS={js_mean:.4f}(±{js_std:.4f}), 熵={entropy_mean:.4f}(±{entropy_std:.4f}), 节点={node_count:.1f}, runs={run_count}")
    
    # 生成总体对比文件（保存在base_path下）
    print(f"\n" + "=" * 70)
    print("生成总体对比文件")
    print("=" * 70)
    
    overall_summary = summary_df.groupby(['alpha', 'layer']).agg({
        'avg_js_distance': ['mean', 'std'],
        'weighted_avg_renyi_entropy': ['mean', 'std'],
        'node_count': ['mean', 'std'],
        'run_id': 'count'
    }).round(4)
    
    # 平铺列名
    overall_summary.columns = ['_'.join(col).strip() for col in overall_summary.columns]
    overall_summary = overall_summary.reset_index()
    
    overall_output_path = os.path.join(base_path, 'alpha_layer_comparison.csv')
    overall_summary.to_csv(overall_output_path, index=False)
    print(f"总体对比文件保存到: {overall_output_path}")
    
    # 显示跨alpha对比
    for layer in sorted(summary_df['layer'].unique()):
        print(f"\nLayer {int(layer)} 跨Alpha对比:")
        print("Alpha值     JS距离(±std)      加权熵(±std)      节点数(±std)   运行数")
        print("-" * 75)
        
        layer_data = overall_summary[overall_summary['layer'] == layer]
        for _, row in layer_data.iterrows():
            alpha = row['alpha']
            js_mean = row['avg_js_distance_mean']
            js_std = row['avg_js_distance_std']
            entropy_mean = row['weighted_avg_renyi_entropy_mean']
            entropy_std = row['weighted_avg_renyi_entropy_std']
            node_mean = row['node_count_mean']
            node_std = row['node_count_std']
            run_count = int(row['run_id_count'])
            
            print(f"{alpha:7.3f}    {js_mean:6.4f}(±{js_std:5.4f})   {entropy_mean:6.4f}(±{entropy_std:5.4f})   {node_mean:6.1f}(±{node_std:4.1f})   {run_count:4d}")

In [None]:
# 执行汇总分析
base_path = "/Volumes/My Passport/收敛结果/step2"
print("=" * 70)
print("开始汇总各Eta值的层级统计...")
print("=" * 70)
aggregate_layer_statistics_by_alpha(base_path)
print("=" * 70)
print("汇总分析完成！")
print("=" * 70)

开始汇总各Eta值的层级统计...
各ETA值的层级汇总统计

处理 Eta=0.005
输出目录: /Volumes/My Passport/收敛结果/step2/step2_d3_g005_e0005_基于e01_收敛
  保存汇总文件: /Volumes/My Passport/收敛结果/step2/step2_d3_g005_e0005_基于e01_收敛/eta_0.005_layer_summary.csv
  包含运行: 1, 2, 3
  层数: 3
    Layer 0: JS=0.0000(±0.0000), 熵=5.4077(±0.0307), 节点=1.0, runs=3
    Layer 1: JS=0.7370(±0.0077), 熵=3.5804(±0.0238), 节点=85.0, runs=3
    Layer 2: JS=0.7486(±0.0018), 熵=3.0128(±0.0564), 节点=346.3, runs=3

处理 Eta=0.01
输出目录: /Volumes/My Passport/收敛结果/step2/step2_d3_g005_e001_基于e01_收敛
  保存汇总文件: /Volumes/My Passport/收敛结果/step2/step2_d3_g005_e001_基于e01_收敛/eta_0.01_layer_summary.csv
  包含运行: 1, 2, 3
  层数: 3
    Layer 0: JS=0.0000(±0.0000), 熵=5.3045(±0.0209), 节点=1.0, runs=3
    Layer 1: JS=0.6998(±0.0031), 熵=3.7481(±0.0531), 节点=79.0, runs=3
    Layer 2: JS=0.7070(±0.0031), 熵=3.3435(±0.0365), 节点=340.3, runs=3

处理 Eta=0.02
输出目录: /Volumes/My Passport/收敛结果/step2/step2_d3_g005_e002_基于e01_收敛
  保存汇总文件: /Volumes/My Passport/收敛结果/step2/step2_d3_g005_e002_基于e01_收敛/eta_0.02

In [11]:
# 汇总所有result_layers.csv，按layer分组求mean（包括nodes_in_layer）
import pandas as pd
import glob
import os

base_path = "/Volumes/My Passport/收敛结果/step2"
pattern = os.path.join(base_path, "**", "result_layers.csv")
files = glob.glob(pattern, recursive=True)

all_rows = []
for file in files:
    df = pd.read_csv(file)
    run_folder = os.path.dirname(file)
    folder_name = os.path.basename(run_folder)
    
    # 从文件夹名称提取eta值
    eta = None
    if 'eta_' in folder_name:
        try:
            eta_part = folder_name.split('eta_')[1].split('_')[0]
            eta = float(eta_part)
        except (IndexError, ValueError):
            eta = None
    
    # 按 layer 分组求均值
    grouped = df.groupby('layer').agg({
        'entropy_wavg': 'mean',
        'distinctiveness_wavg_jsd': 'mean',
        'nodes_in_layer': 'mean'
    }).reset_index()
    grouped['run_folder'] = run_folder
    grouped['eta'] = eta
    
    # 如果有其他参数信息（如 depth, gamma, alpha），可从原df取第一行补充
    for col in ['depth', 'gamma', 'alpha']:
        if col in df.columns:
            grouped[col] = df[col].iloc[0]
        else:
            # 从文件夹名称提取
            if col == 'depth' and 'depth_' in folder_name:
                try:
                    grouped[col] = int(folder_name.split('depth_')[1].split('_')[0])
                except:
                    grouped[col] = 3  # 默认depth=3
            elif col == 'gamma' and 'gamma_' in folder_name:
                try:
                    grouped[col] = float(folder_name.split('gamma_')[1].split('_')[0])
                except:
                    grouped[col] = 0.05  # 默认gamma=0.05
            elif col == 'alpha':
                grouped[col] = 0.1  # 默认alpha=0.1
            else:
                grouped[col] = None
    
    all_rows.append(grouped)

summary_df = pd.concat(all_rows, ignore_index=True)
summary_df.to_csv(os.path.join(base_path, "all_layers_summary.csv"), index=False)
print("已汇总所有run的层级均值到 all_layers_summary.csv")

已汇总所有run的层级均值到 all_layers_summary.csv


In [12]:
import pandas as pd

# filepath: /Volumes/My Passport/收敛结果/step2/all_layers_summary.csv
df = pd.read_csv("/Volumes/My Passport/收敛结果/step2/all_layers_summary.csv")

# 按 eta 和 layer 分组，求均值和标准差
summary = df.groupby(['eta', 'layer']).agg({
    'entropy_wavg': ['mean', 'std'],
    'distinctiveness_wavg_jsd': ['mean', 'std'],
    'nodes_in_layer': ['mean', 'std'],
    'depth': 'first',
    'gamma': 'first',
    'alpha': 'first'
}).reset_index()

# 展开多级列名
summary.columns = ['_'.join(col).strip('_') for col in summary.columns]

summary.to_csv("/Volumes/My Passport/收敛结果/step2/layer_eta_group_mean.csv", index=False)
print("已生成所有run按eta和层整体均值表 layer_eta_group_mean.csv")

已生成所有run按eta和层整体均值表 layer_eta_group_mean.csv


In [13]:
import pandas as pd
import glob
import os

base_path = "/Volumes/My Passport/收敛结果/step2"
pattern = os.path.join(base_path, "**", "result_layers.csv")
files = glob.glob(pattern, recursive=True)

all_rows = []
for file in files:
    df = pd.read_csv(file)
    df['run_folder'] = os.path.dirname(file)  # 标记来源
    
    # 从路径中提取参数
    folder = os.path.dirname(file)
    folder_name = os.path.basename(folder)
    
    for col in ['depth', 'gamma', 'eta', 'alpha']:
        if col not in df.columns:
            if f"{col}_" in folder_name:
                try:
                    value = float(folder_name.split(f"{col}_")[1].split("_")[0])
                except:
                    value = None
                df[col] = value
            else:
                # 设置默认值
                if col == 'depth':
                    df[col] = 3
                elif col == 'gamma':
                    df[col] = 0.05
                elif col == 'alpha':
                    df[col] = 0.1
                else:
                    df[col] = None
    
    all_rows.append(df)

summary_df = pd.concat(all_rows, ignore_index=True)
summary_df.to_csv(os.path.join(base_path, "all_result_layers_merged.csv"), index=False)
print("已汇总所有result_layers.csv到 all_result_layers_merged.csv")

已汇总所有result_layers.csv到 all_result_layers_merged.csv


In [14]:
import pandas as pd
import glob
import os

base_path = "/Volumes/My Passport/收敛结果/step2"
pattern = os.path.join(base_path, "**", "result_layers.csv")
files = glob.glob(pattern, recursive=True)

all_rows = []
for file in files:
    df = pd.read_csv(file)
    folder = os.path.dirname(file)
    folder_name = os.path.basename(folder)
    
    # 补充参数信息，从文件夹名称提取
    for col in ['depth', 'gamma', 'eta', 'alpha']:
        if col not in df.columns:
            if f"{col}_" in folder_name:
                try:
                    value = float(folder_name.split(f"{col}_")[1].split("_")[0])
                except:
                    value = None
                df[col] = value
            else:
                # 设置默认值
                if col == 'depth':
                    df[col] = 3
                elif col == 'gamma':
                    df[col] = 0.05
                elif col == 'alpha':
                    df[col] = 0.1
                else:
                    df[col] = None
    all_rows.append(df)

merged = pd.concat(all_rows, ignore_index=True)

# 按参数组和layer分组，计算均值和标准差
group_cols = ['depth', 'gamma', 'eta', 'alpha', 'layer']
summary = merged.groupby(group_cols).agg({
    'entropy_wavg': ['mean', 'std'],
    'distinctiveness_wavg_jsd': ['mean', 'std'],
    'nodes_in_layer': ['mean', 'std'],
}).reset_index()

# 展开多级列名
summary.columns = ['_'.join(col).strip('_') for col in summary.columns]

summary.to_csv(os.path.join(base_path, "all_params_layer_mean.csv"), index=False)
print("已生成每组参数每层的均值表 all_params_layer_mean.csv")

已生成每组参数每层的均值表 all_params_layer_mean.csv


In [15]:
# 简单检查文件数量和路径
import os
import glob

base_path = "/Volumes/My Passport/收敛结果/step2"
pattern = os.path.join(base_path, "**", "iteration_node_word_distributions.csv")
files = glob.glob(pattern, recursive=True)

print(f"总共找到 {len(files)} 个文件")
print("文件路径列表:")
for i, file_path in enumerate(sorted(files), 1):
    folder_name = os.path.basename(os.path.dirname(file_path))
    # 提取eta值
    eta = "未知"
    if 'eta_' in folder_name:
        try:
            eta_part = folder_name.split('eta_')[1].split('_')[0]
            eta = float(eta_part)
        except:
            pass
    print(f"{i:2d}. eta={eta} | {folder_name}")

# 检查是否有重复路径
if len(files) != len(set(files)):
    print("\n⚠️  发现重复文件路径！")
    from collections import Counter
    counter = Counter(files)
    for file_path, count in counter.items():
        if count > 1:
            print(f"重复 {count} 次: {file_path}")
else:
    print("\n✓ 没有重复文件路径")

总共找到 18 个文件
文件路径列表:
 1. eta=0.005 | depth_3_gamma_0.05_eta_0.005_run_1
 2. eta=0.005 | depth_3_gamma_0.05_eta_0.005_run_2
 3. eta=0.005 | depth_3_gamma_0.05_eta_0.005_run_3
 4. eta=0.01 | depth_3_gamma_0.05_eta_0.01_run_1
 5. eta=0.01 | depth_3_gamma_0.05_eta_0.01_run_2
 6. eta=0.01 | depth_3_gamma_0.05_eta_0.01_run_3
 7. eta=0.02 | depth_3_gamma_0.05_eta_0.02_run_1
 8. eta=0.02 | depth_3_gamma_0.05_eta_0.02_run_2
 9. eta=0.02 | depth_3_gamma_0.05_eta_0.02_run_3
10. eta=0.05 | depth_3_gamma_0.05_eta_0.05_run_1
11. eta=0.05 | depth_3_gamma_0.05_eta_0.05_run_2
12. eta=0.05 | depth_3_gamma_0.05_eta_0.05_run_3
13. eta=0.1 | depth_3_gamma_0.05_eta_0.1_run_1
14. eta=0.1 | depth_3_gamma_0.05_eta_0.1_run_2
15. eta=0.1 | depth_3_gamma_0.05_eta_0.1_run_3
16. eta=0.2 | depth_3_gamma_0.05_eta_0.2_run_1
17. eta=0.2 | depth_3_gamma_0.05_eta_0.2_run_2
18. eta=0.2 | depth_3_gamma_0.05_eta_0.2_run_3

✓ 没有重复文件路径


In [46]:
import os
import glob

def clean_incorrect_metric_files(base_path="."):
    """
    删除计算有误的扩展指标文件
    """
    files_to_delete = [
        "extended_metrics.csv",
        "extended_metrics_corrected.csv", 
        "*extended_summary.csv",
        "eta_extended_metrics_comparison.csv"
    ]
    
    deleted_count = 0
    
    print("🗑️  开始清理计算有误的扩展指标文件...")
    print("=" * 60)
    
    for pattern in files_to_delete:
        # 查找所有匹配的文件
        search_pattern = os.path.join(base_path, "**", pattern)
        files = glob.glob(search_pattern, recursive=True)
        
        for file_path in files:
            try:
                os.remove(file_path)
                print(f"✓ 已删除: {file_path}")
                deleted_count += 1
            except Exception as e:
                print(f"❌ 删除失败: {file_path} - {e}")
    
    # 也删除base_path下的汇总文件
    base_files = [
        "eta_extended_metrics_comparison.csv",
        "all_extended_metrics.csv"
    ]
    
    for filename in base_files:
        file_path = os.path.join(base_path, filename)
        if os.path.exists(file_path):
            try:
                os.remove(file_path)
                print(f"✓ 已删除汇总文件: {file_path}")
                deleted_count += 1
            except Exception as e:
                print(f"❌ 删除汇总文件失败: {file_path} - {e}")
    
    print("=" * 60)
    print(f"🎯 清理完成！共删除 {deleted_count} 个文件")
    print("=" * 60)
    
    # 显示清理后保留的文件
    print("📋 保留的正确文件:")
    preserved_patterns = [
        "corrected_renyi_entropy.csv",
        "jensen_shannon_distances.csv", 
        "layer_average_js_distances.csv",
        "*layer_summary.csv",
        "result_layers.csv"
    ]
    
    preserved_count = 0
    for pattern in preserved_patterns:
        search_pattern = os.path.join(base_path, "**", pattern)
        files = glob.glob(search_pattern, recursive=True)
        preserved_count += len(files)
        if files:
            print(f"  {pattern}: {len(files)} 个文件")
    
    print(f"✅ 共保留 {preserved_count} 个正确的指标文件")

# 执行清理
base_path = "/Volumes/My Passport/收敛结果/step2"
clean_incorrect_metric_files(base_path)

🗑️  开始清理计算有误的扩展指标文件...
✓ 已删除: /Volumes/My Passport/收敛结果/step2/d3_g005_收敛/depth_3_gamma_0.05_run_1/extended_metrics.csv
✓ 已删除: /Volumes/My Passport/收敛结果/step2/d3_g005_收敛/depth_3_gamma_0.05_run_2/extended_metrics.csv
✓ 已删除: /Volumes/My Passport/收敛结果/step2/d3_g005_收敛/depth_3_gamma_0.05_run_3/extended_metrics.csv
✓ 已删除: /Volumes/My Passport/收敛结果/step2/step2_d3_g005_e01_收敛/depth_3_gamma_0.05_eta_0.1_run_2/extended_metrics.csv
✓ 已删除: /Volumes/My Passport/收敛结果/step2/step2_d3_g005_e01_收敛/depth_3_gamma_0.05_eta_0.1_run_3/extended_metrics.csv
✓ 已删除: /Volumes/My Passport/收敛结果/step2/step2_d3_g005_e01_收敛/depth_3_gamma_0.05_eta_0.1_run_1/extended_metrics.csv
✓ 已删除: /Volumes/My Passport/收敛结果/step2/step2_d3_g005_e005_基于e01_第二次_收敛/depth_3_gamma_0.05_eta_0.05_run_3/extended_metrics.csv
✓ 已删除: /Volumes/My Passport/收敛结果/step2/step2_d3_g005_e005_基于e01_第二次_收敛/depth_3_gamma_0.05_eta_0.05_run_1/extended_metrics.csv
✓ 已删除: /Volumes/My Passport/收敛结果/step2/step2_d3_g005_e005_基于e01_第二次_收敛/depth_3_gamma_0.05_eta_0

In [55]:
import pandas as pd
import numpy as np
import os
import glob
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary

def calculate_standard_coherence_from_corpus_corrected(corpus, word_distributions_df, top_k=15):
    """
    修正版：计算完整的节点级和全局级一致性指标
    """
    
    print(f"📊 准备计算标准一致性指标...")
    print(f"   语料文档数: {len(corpus)}")
    print(f"   节点数: {word_distributions_df['node_id'].nunique()}")
    
    # 1. 准备texts和dictionary
    texts = list(corpus.values())
    dictionary = Dictionary(texts)
    
    print(f"   文档总数: {len(texts)}")
    print(f"   词典大小: {len(dictionary)}")
    
    # 2. 准备topics和节点映射
    topics = []
    node_topic_mapping = {}
    node_to_topic_idx = {}  # 新增：节点ID到主题索引的直接映射
    
    topic_idx = 0
    for node_id in word_distributions_df['node_id'].unique():
        node_data = word_distributions_df[word_distributions_df['node_id'] == node_id]
        top_words = node_data.nlargest(top_k, 'count')['word'].tolist()
        
        valid_words = []
        for word in top_words:
            if pd.notna(word) and word in dictionary.token2id:
                valid_words.append(word)
        
        if len(valid_words) >= 2:
            topics.append(valid_words)
            node_topic_mapping[node_id] = valid_words
            node_to_topic_idx[node_id] = topic_idx  # 直接映射
            topic_idx += 1
    
    print(f"   有效主题数: {len(topics)}")
    
    if len(topics) == 0:
        return {}, {}, {}
    
    # 3. 计算所有一致性指标（全局+每主题）
    coherence_measures = ['c_npmi', 'c_v', 'u_mass']
    global_coherence = {}
    per_topic_coherence = {}
    
    for measure in coherence_measures:
        try:
            print(f"   正在计算 {measure}...")
            
            cm = CoherenceModel(
                topics=topics,
                texts=texts,
                dictionary=dictionary,
                coherence=measure,
                processes=1
            )
            
            # 全局平均一致性
            global_score = cm.get_coherence()
            global_coherence[measure] = global_score
            
            # 每个主题的一致性
            per_topic_scores = cm.get_coherence_per_topic()
            per_topic_coherence[measure] = per_topic_scores
            
            print(f"   ✓ {measure}: 全局={global_score:.4f}, 范围=[{min(per_topic_scores):.4f}, {max(per_topic_scores):.4f}]")
            
        except Exception as e:
            print(f"   ❌ 计算 {measure} 时出错: {e}")
            global_coherence[measure] = 0.0
            per_topic_coherence[measure] = [0.0] * len(topics)
    
    return global_coherence, per_topic_coherence, node_to_topic_idx

def process_coherence_with_original_corpus_corrected(base_path=".", corpus=None, top_k=15):
    """
    修正版：完整计算节点级和全局级一致性指标
    """
    
    if corpus is None:
        print("❌ 必须提供原始语料corpus")
        return
    
    pattern = os.path.join(base_path, "**", "iteration_node_word_distributions.csv")
    files = glob.glob(pattern, recursive=True)
    
    print(f"🔍 找到 {len(files)} 个词分布文件待处理")
    
    for idx, file_path in enumerate(files, 1):
        folder_path = os.path.dirname(file_path)
        folder_name = os.path.basename(folder_path)
        
        # 参数提取
        eta = 0.1
        gamma = 0.05
        depth = 3
        alpha = 0.1
        
        for param_name in ['eta', 'gamma', 'depth', 'alpha']:
            if f'{param_name}_' in folder_name:
                try:
                    param_part = folder_name.split(f'{param_name}_')[1].split('_')[0]
                    if param_name == 'depth':
                        locals()[param_name] = int(param_part)
                    else:
                        locals()[param_name] = float(param_part)
                except:
                    pass
        
        print(f"\n{'='*80}")
        print(f"[{idx}/{len(files)}] 处理文件: {folder_name}")
        print(f"参数 - Eta: {eta}, Gamma: {gamma}, Depth: {depth}, Alpha: {alpha}")
        print(f"{'='*80}")
        
        try:
            # 读取数据
            df = pd.read_csv(file_path)
            df.columns = [col.strip("'\" ") for col in df.columns]
            
            max_iteration = df['iteration'].max()
            last_iteration_data = df[df['iteration'] == max_iteration]
            
            print(f"📈 最后iteration: {max_iteration}")
            print(f"📈 节点数: {last_iteration_data['node_id'].nunique()}")
            
            # 计算一致性（修正版）
            global_coherence, per_topic_coherence, node_to_topic_idx = calculate_standard_coherence_from_corpus_corrected(
                corpus, last_iteration_data, top_k=top_k
            )
            
            if not global_coherence:
                print("⚠️ 一致性计算失败，跳过此文件")
                continue
            
            # 准备保存数据
            results_data = []
            
            for node_id in last_iteration_data['node_id'].unique():
                node_words = last_iteration_data[last_iteration_data['node_id'] == node_id]
                top_words = node_words.nlargest(top_k, 'count')['word'].tolist()
                top_words = [word for word in top_words if pd.notna(word)]
                
                # 获取该节点的各项一致性指标（修正版）
                node_coherence_scores = {}
                
                if node_id in node_to_topic_idx:
                    # 直接通过索引获取各项指标
                    topic_idx = node_to_topic_idx[node_id]
                    
                    for measure in ['c_npmi', 'c_v', 'u_mass']:
                        if measure in per_topic_coherence:
                            measure_name = measure.replace('c_', '') if measure.startswith('c_') else measure
                            node_coherence_scores[f'node_{measure_name}'] = per_topic_coherence[measure][topic_idx]
                        else:
                            measure_name = measure.replace('c_', '') if measure.startswith('c_') else measure
                            node_coherence_scores[f'node_{measure_name}'] = 0.0
                else:
                    # 如果节点不在映射中，设为0
                    for measure in ['npmi', 'v', 'u_mass']:
                        node_coherence_scores[f'node_{measure}'] = 0.0
                
                results_data.append({
                    'node_id': node_id,
                    'eta': eta,
                    'gamma': gamma, 
                    'depth': depth,
                    'alpha': alpha,
                    'top_k': top_k,
                    'top_words': ', '.join(top_words[:10]),
                    'word_count': len(top_words),
                    
                    # 节点级一致性指标（修正）
                    'node_npmi': node_coherence_scores.get('node_npmi', 0.0),
                    'node_c_v': node_coherence_scores.get('node_v', 0.0),
                    'node_u_mass': node_coherence_scores.get('node_u_mass', 0.0),
                    
                    # 全局级一致性指标
                    'global_npmi': global_coherence.get('c_npmi', 0.0),
                    'global_c_v': global_coherence.get('c_v', 0.0),
                    'global_u_mass': global_coherence.get('u_mass', 0.0),
                    
                    'iteration': max_iteration
                })
            
            # 保存结果
            results_df = pd.DataFrame(results_data)
            output_path = os.path.join(folder_path, 'standard_coherence.csv')
            results_df.to_csv(output_path, index=False)
            
            print(f"💾 标准一致性结果已保存到: {output_path}")
            print(f"📊 结果摘要:")
            print(f"   - 全局NPMI: {global_coherence.get('c_npmi', 0.0):.4f}")
            print(f"   - 全局C_V: {global_coherence.get('c_v', 0.0):.4f}")
            print(f"   - 全局U_Mass: {global_coherence.get('u_mass', 0.0):.4f}")
            
            # 显示节点级指标范围
            if len(results_df) > 0:
                print(f"   - 节点NPMI范围: [{results_df['node_npmi'].min():.4f}, {results_df['node_npmi'].max():.4f}]")
                print(f"   - 节点C_V范围: [{results_df['node_c_v'].min():.4f}, {results_df['node_c_v'].max():.4f}]")
                print(f"   - 节点U_Mass范围: [{results_df['node_u_mass'].min():.4f}, {results_df['node_u_mass'].max():.4f}]")
            
        except Exception as e:
            import traceback
            print(f"❌ 处理文件 {file_path} 时出错: {str(e)}")
            traceback.print_exc()
    
    print(f"\n✅ 所有文件的标准一致性计算完成！")

In [56]:
""" 0. set-up part:  import necessary libraries and set up environment """

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from collections import Counter, defaultdict
import numpy as np
import math
import copy
import itertools
import matplotlib.pyplot as plt
import matplotlib as mpl

import joblib
from joblib import Parallel, delayed
from threading import Thread

import os
import pickle
import time

import operator
from functools import reduce
import json
import cProfile

# download nltk data once time
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('omw-1.4')
# nltk.download('punkt_tab')
# nltk.download('averaged_perceptron_tagger_eng')

#  chinese character support in matplotlib
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS' 'SimHei' 'DejaVu Sans']  
plt.rcParams['axes.unicode_minus'] = False  


""" 1.1 Data Preprocessing: load data, clean text, lemmatization, remove low-frequency words"""

# Map POS tags to WordNet format， Penn Treebank annotation: fine-grained (45 tags), WordNet annotation: coarse-grained (4 tags: a, v, n, r)
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return 'a'  # 形容词
    elif treebank_tag.startswith('V'):
        return 'v'  # 动词
    elif treebank_tag.startswith('N'):
        return 'n'  # 名词
    elif treebank_tag.startswith('R'):
        return 'r'  # 副词
    else:
        return 'n'  # 默认名词

# Text cleaning and lemmatization preprocessing function
def clean_and_lemmatize(text):
    if pd.isnull(text):
        return []
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)  # Remove non-alphabetic characters using regex
    tokens = word_tokenize(text)
    tokens = [w for w in tokens if w not in stop_words]
    pos_tags = pos_tag(tokens)
    lemmatized = [lemmatizer.lemmatize(w, get_wordnet_pos(pos)) for w, pos in pos_tags]
    return lemmatized  

#-----------------Load data----------------
data = pd.read_excel('/Volumes/My Passport/收敛结果/step2/papers_CM.xlsx', usecols=['PaperID', 'Abstract', 'Keywords', 'Year'])

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# clean and lemmatize the abstracts
data['Lemmatized_Tokens'] = data['Abstract'].apply(clean_and_lemmatize)

# count word frequencies
all_tokens = [word for tokens in data['Lemmatized_Tokens'] for word in tokens]
word_counts = Counter(all_tokens)

# set a minimum frequency threshold for valid words
min_freq = 10
valid_words = set([word for word, freq in word_counts.items() if freq >= min_freq])

# remove rare words based on frequency threshold
def remove_rare_words(tokens):
    return [word for word in tokens if word in valid_words]

data['Filtered_Tokens'] = data['Lemmatized_Tokens'].apply(remove_rare_words)

# join tokens back into cleaned abstracts
data['Cleaned_Abstract'] = data['Filtered_Tokens'].apply(lambda x: " ".join(x))

# create a cleaned DataFrame with relevant columns
cleaned_data = data[['PaperID', 'Year', 'Cleaned_Abstract']]
cleaned_data = cleaned_data[~(cleaned_data['PaperID'] == 57188)] # this paper has no abstract
cleaned_data = cleaned_data.reset_index(drop=True) 
cleaned_data.insert(0, 'Document_ID', range(len(cleaned_data))) 
abstract_list = cleaned_data['Cleaned_Abstract'].apply(lambda x: x.split()).tolist()

corpus = {doc_id: abstract_list for doc_id, abstract_list in enumerate(abstract_list)}
# cleaned_data.to_csv('./data/processed/cleaned_data.xlsx', index=False, encoding='utf-8-sig')

In [59]:
# 删除旧的不完整文件
def clean_old_coherence_files(base_path="."):
    """删除旧的不完整的标准一致性文件"""
    pattern = os.path.join(base_path, "**", "standard_coherence.csv")
    files = glob.glob(pattern, recursive=True)
    
    deleted_count = 0
    for file_path in files:
        try:
            os.remove(file_path)
            print(f"✓ 删除旧文件: {os.path.basename(os.path.dirname(file_path))}")
            deleted_count += 1
        except Exception as e:
            print(f"❌ 删除失败: {file_path} - {e}")
    
    print(f"🗑️ 共删除 {deleted_count} 个旧的一致性文件")

# 执行修正版计算
base_path = "/Volumes/My Passport/收敛结果/step2"
top_k = 5

print("=" * 80)
print("🗑️ 清理旧的不完整文件...")
print("=" * 80)
clean_old_coherence_files(base_path)

print("\n" + "=" * 80)
print("🔄 开始重新计算完整的一致性指标...")
print("=" * 80)

# 使用修正版函数
process_coherence_with_original_corpus_corrected(base_path, corpus, top_k)

print("=" * 80)
print("✅ 修正版一致性计算完成！")
print("=" * 80)

🗑️ 清理旧的不完整文件...
✓ 删除旧文件: depth_3_gamma_0.05_eta_0.1_run_2
✓ 删除旧文件: depth_3_gamma_0.05_eta_0.1_run_3
✓ 删除旧文件: depth_3_gamma_0.05_eta_0.1_run_1
🗑️ 共删除 3 个旧的一致性文件

🔄 开始重新计算完整的一致性指标...
🔍 找到 18 个词分布文件待处理

[1/18] 处理文件: depth_3_gamma_0.05_eta_0.1_run_2
参数 - Eta: 0.1, Gamma: 0.05, Depth: 3, Alpha: 0.1
📈 最后iteration: 175
📈 节点数: 231
📊 准备计算标准一致性指标...
   语料文档数: 970
   节点数: 231
   文档总数: 970
   词典大小: 1490
   有效主题数: 231
   正在计算 c_npmi...
   ✓ c_npmi: 全局=0.0460, 范围=[-0.4494, 0.7414]
   正在计算 c_v...
   ✓ c_v: 全局=0.5559, 范围=[0.1778, 0.9900]
   正在计算 u_mass...
   ✓ u_mass: 全局=-2.9908, 范围=[-14.9995, -0.3317]
💾 标准一致性结果已保存到: /Volumes/My Passport/收敛结果/step2/step2_d3_g005_e01_收敛/depth_3_gamma_0.05_eta_0.1_run_2/standard_coherence.csv
📊 结果摘要:
   - 全局NPMI: 0.0460
   - 全局C_V: 0.5559
   - 全局U_Mass: -2.9908
   - 节点NPMI范围: [-0.4494, 0.7414]
   - 节点C_V范围: [0.1778, 0.9900]
   - 节点U_Mass范围: [-14.9995, -0.3317]

[2/18] 处理文件: depth_3_gamma_0.05_eta_0.1_run_3
参数 - Eta: 0.1, Gamma: 0.05, Depth: 3, Alpha: 0.1
📈 最后iteration: 

In [60]:
import pandas as pd
import os
import glob

def add_layer_and_document_info_to_coherence(base_path="."):
    """
    将corrected_renyi_entropy.csv中的layer和document_count信息添加到standard_coherence.csv中
    
    Parameters:
    base_path: str, 结果文件的根目录
    """
    
    # 查找所有包含standard_coherence.csv的文件夹
    pattern = os.path.join(base_path, "**", "standard_coherence.csv")
    coherence_files = glob.glob(pattern, recursive=True)
    
    print(f"🔍 找到 {len(coherence_files)} 个标准一致性文件待处理")
    
    for idx, coherence_file_path in enumerate(coherence_files, 1):
        folder_path = os.path.dirname(coherence_file_path)
        folder_name = os.path.basename(folder_path)
        
        print(f"\n{'='*80}")
        print(f"[{idx}/{len(coherence_files)}] 处理文件夹: {folder_name}")
        print(f"{'='*80}")
        
        # 检查对应的corrected_renyi_entropy.csv是否存在
        entropy_file_path = os.path.join(folder_path, 'corrected_renyi_entropy.csv')
        
        if not os.path.exists(entropy_file_path):
            print(f"⚠️  未找到对应的entropy文件: {entropy_file_path}")
            continue
        
        try:
            # 读取两个文件
            print("📖 读取文件...")
            coherence_df = pd.read_csv(coherence_file_path)
            entropy_df = pd.read_csv(entropy_file_path)
            
            print(f"   一致性文件: {len(coherence_df)} 行")
            print(f"   熵文件: {len(entropy_df)} 行")
            
            # 检查是否已经有layer和document_count列
            existing_cols = coherence_df.columns.tolist()
            has_layer = 'layer' in existing_cols
            has_doc_count = 'document_count' in existing_cols
            
            print(f"   当前列: {existing_cols}")
            print(f"   已有layer列: {has_layer}")
            print(f"   已有document_count列: {has_doc_count}")
            
            # 创建node_id到layer和document_count的映射
            node_layer_map = entropy_df.set_index('node_id')['layer'].to_dict()
            node_doc_count_map = entropy_df.set_index('node_id')['document_count'].to_dict()
            
            print(f"   可映射的节点数: {len(node_layer_map)}")
            
            # 添加或更新layer列
            if not has_layer:
                coherence_df['layer'] = coherence_df['node_id'].map(node_layer_map)
                print("   ✓ 添加了layer列")
            else:
                coherence_df['layer'] = coherence_df['node_id'].map(node_layer_map)
                print("   ✓ 更新了layer列")
            
            # 添加或更新document_count列
            if not has_doc_count:
                coherence_df['document_count'] = coherence_df['node_id'].map(node_doc_count_map)
                print("   ✓ 添加了document_count列")
            else:
                coherence_df['document_count'] = coherence_df['node_id'].map(node_doc_count_map)
                print("   ✓ 更新了document_count列")
            
            # 检查映射结果
            layer_null_count = coherence_df['layer'].isnull().sum()
            doc_count_null_count = coherence_df['document_count'].isnull().sum()
            
            if layer_null_count > 0:
                print(f"   ⚠️  有 {layer_null_count} 个节点未找到layer信息")
            
            if doc_count_null_count > 0:
                print(f"   ⚠️  有 {doc_count_null_count} 个节点未找到document_count信息")
            
            # 显示层级分布统计
            layer_stats = coherence_df['layer'].value_counts().sort_index()
            print(f"   📊 层级分布: {layer_stats.to_dict()}")
            
            # 显示文档数统计
            doc_stats = coherence_df['document_count'].describe()
            print(f"   📊 文档数统计:")
            print(f"      最小值: {doc_stats['min']:.0f}")
            print(f"      最大值: {doc_stats['max']:.0f}")
            print(f"      平均值: {doc_stats['mean']:.1f}")
            
            # 保存更新后的文件
            coherence_df.to_csv(coherence_file_path, index=False)
            print(f"💾 已更新并保存: {coherence_file_path}")
            
            # 显示更新后的列结构
            updated_cols = coherence_df.columns.tolist()
            print(f"   更新后的列: {updated_cols}")
            
        except Exception as e:
            import traceback
            print(f"❌ 处理文件 {coherence_file_path} 时出错: {str(e)}")
            print("详细错误信息:")
            traceback.print_exc()
    
    print(f"\n✅ 所有标准一致性文件的layer和document_count信息更新完成！")

def verify_coherence_files_update(base_path="."):
    """
    验证standard_coherence.csv文件的更新情况
    """
    pattern = os.path.join(base_path, "**", "standard_coherence.csv")
    coherence_files = glob.glob(pattern, recursive=True)
    
    print("🔍 验证更新结果:")
    print("="*80)
    
    all_have_layer = True
    all_have_doc_count = True
    
    for file_path in coherence_files:
        folder_name = os.path.basename(os.path.dirname(file_path))
        
        try:
            df = pd.read_csv(file_path)
            has_layer = 'layer' in df.columns
            has_doc_count = 'document_count' in df.columns
            
            layer_null = df['layer'].isnull().sum() if has_layer else "无列"
            doc_null = df['document_count'].isnull().sum() if has_doc_count else "无列"
            
            status = "✅" if (has_layer and has_doc_count and layer_null == 0 and doc_null == 0) else "⚠️"
            
            print(f"{status} {folder_name}")
            print(f"   Layer列: {'有' if has_layer else '无'} (空值: {layer_null})")
            print(f"   DocCount列: {'有' if has_doc_count else '无'} (空值: {doc_null})")
            
            if not has_layer:
                all_have_layer = False
            if not has_doc_count:
                all_have_doc_count = False
                
        except Exception as e:
            print(f"❌ {folder_name}: 读取失败 - {e}")
    
    print("="*80)
    print(f"📋 汇总:")
    print(f"   总文件数: {len(coherence_files)}")
    print(f"   都有layer列: {'是' if all_have_layer else '否'}")
    print(f"   都有document_count列: {'是' if all_have_doc_count else '否'}")

In [61]:
# 执行更新
base_path = "/Volumes/My Passport/收敛结果/step2"

print("=" * 80)
print("开始为standard_coherence.csv添加layer和document_count信息...")
print("=" * 80)

# 添加layer和document_count信息
add_layer_and_document_info_to_coherence(base_path)

print("\n" + "=" * 80)
print("验证更新结果...")
print("=" * 80)

# 验证更新结果
verify_coherence_files_update(base_path)

print("\n" + "=" * 80)
print("✅ Layer和document_count信息添加完成！")
print("=" * 80)

开始为standard_coherence.csv添加layer和document_count信息...
🔍 找到 18 个标准一致性文件待处理

[1/18] 处理文件夹: depth_3_gamma_0.05_eta_0.1_run_2
📖 读取文件...
   一致性文件: 231 行
   熵文件: 231 行
   当前列: ['node_id', 'eta', 'gamma', 'depth', 'alpha', 'top_k', 'top_words', 'word_count', 'node_npmi', 'node_c_v', 'node_u_mass', 'global_npmi', 'global_c_v', 'global_u_mass', 'iteration']
   已有layer列: False
   已有document_count列: False
   可映射的节点数: 231
   ✓ 添加了layer列
   ✓ 添加了document_count列
   📊 层级分布: {0: 1, 1: 44, 2: 186}
   📊 文档数统计:
      最小值: 1
      最大值: 970
      平均值: 12.6
💾 已更新并保存: /Volumes/My Passport/收敛结果/step2/step2_d3_g005_e01_收敛/depth_3_gamma_0.05_eta_0.1_run_2/standard_coherence.csv
   更新后的列: ['node_id', 'eta', 'gamma', 'depth', 'alpha', 'top_k', 'top_words', 'word_count', 'node_npmi', 'node_c_v', 'node_u_mass', 'global_npmi', 'global_c_v', 'global_u_mass', 'iteration', 'layer', 'document_count']

[2/18] 处理文件夹: depth_3_gamma_0.05_eta_0.1_run_3
📖 读取文件...
   一致性文件: 215 行
   熵文件: 215 行
   当前列: ['node_id', 'eta', 'gamma'

In [66]:
def calculate_coherence_layered_analysis(base_path=".", corpus=None, top_k=15):
    """
    计算节点一致性指标并按层级进行加权汇总分析
    """
    
    if corpus is None:
        print("❌ 必须提供原始语料corpus")
        return
    
    pattern = os.path.join(base_path, "**", "iteration_node_word_distributions.csv")
    files = glob.glob(pattern, recursive=True)
    
    print(f"🔍 找到 {len(files)} 个词分布文件待处理 (top_k={top_k})")
    
    for idx, file_path in enumerate(files, 1):
        folder_path = os.path.dirname(file_path)
        folder_name = os.path.basename(folder_path)
        
        # 参数提取
        eta = 0.1
        gamma = 0.05
        depth = 3
        alpha = 0.1
        
        for param_name in ['eta', 'gamma', 'depth', 'alpha']:
            if f'{param_name}_' in folder_name:
                try:
                    param_part = folder_name.split(f'{param_name}_')[1].split('_')[0]
                    if param_name == 'depth':
                        locals()[param_name] = int(param_part)
                    else:
                        locals()[param_name] = float(param_part)
                except:
                    pass
        
        print(f"\n{'='*80}")
        print(f"[{idx}/{len(files)}] 处理文件: {folder_name} (k={top_k})")
        print(f"参数 - Eta: {eta}, Gamma: {gamma}, Depth: {depth}, Alpha: {alpha}")
        print(f"{'='*80}")
        
        try:
            # 读取数据
            df = pd.read_csv(file_path)
            df.columns = [col.strip("'\" ") for col in df.columns]
            
            max_iteration = df['iteration'].max()
            last_iteration_data = df[df['iteration'] == max_iteration]
            
            # 读取层级和文档数信息
            entropy_file = os.path.join(folder_path, 'corrected_renyi_entropy.csv')
            if not os.path.exists(entropy_file):
                print("⚠️ 未找到entropy文件，跳过此文件")
                continue
                
            entropy_df = pd.read_csv(entropy_file)
            
            print(f"📈 最后iteration: {max_iteration}")
            print(f"📈 节点数: {last_iteration_data['node_id'].nunique()}")
            
            # 计算节点级一致性（只保留节点级）
            texts = list(corpus.values())
            dictionary = Dictionary(texts)
            
            topics = []
            node_to_topic_idx = {}
            
            topic_idx = 0
            for node_id in last_iteration_data['node_id'].unique():
                node_data = last_iteration_data[last_iteration_data['node_id'] == node_id]
                top_words = node_data.nlargest(top_k, 'count')['word'].tolist()
                
                valid_words = []
                for word in top_words:
                    if pd.notna(word) and word in dictionary.token2id:
                        valid_words.append(word)
                
                if len(valid_words) >= 2:
                    topics.append(valid_words)
                    node_to_topic_idx[node_id] = topic_idx
                    topic_idx += 1
            
            if len(topics) == 0:
                print("⚠️ 没有有效主题，跳过此文件")
                continue
            
            # 计算各项一致性指标
            coherence_measures = ['c_npmi', 'c_v', 'u_mass']
            per_topic_coherence = {}
            
            for measure in coherence_measures:
                try:
                    print(f"   正在计算 {measure}...")
                    
                    cm = CoherenceModel(
                        topics=topics,
                        texts=texts,
                        dictionary=dictionary,
                        coherence=measure,
                        processes=1
                    )
                    
                    per_topic_scores = cm.get_coherence_per_topic()
                    per_topic_coherence[measure] = per_topic_scores
                    
                    print(f"   ✓ {measure}: 范围=[{min(per_topic_scores):.4f}, {max(per_topic_scores):.4f}]")
                    
                except Exception as e:
                    print(f"   ❌ 计算 {measure} 时出错: {e}")
                    per_topic_coherence[measure] = [0.0] * len(topics)
            
            # 合并节点级一致性和层级信息
            node_coherence_data = []
            
            for node_id in last_iteration_data['node_id'].unique():
                node_words = last_iteration_data[last_iteration_data['node_id'] == node_id]
                top_words = node_words.nlargest(top_k, 'count')['word'].tolist()
                top_words = [word for word in top_words if pd.notna(word)]
                
                # 获取层级和文档数信息
                node_entropy_info = entropy_df[entropy_df['node_id'] == node_id]
                if len(node_entropy_info) > 0:
                    layer = node_entropy_info['layer'].iloc[0]
                    document_count = node_entropy_info['document_count'].iloc[0]
                else:
                    layer = -1
                    document_count = 0
                
                # 获取节点一致性得分
                node_coherence_scores = {}
                if node_id in node_to_topic_idx:
                    topic_idx = node_to_topic_idx[node_id]
                    for measure in ['c_npmi', 'c_v', 'u_mass']:
                        if measure in per_topic_coherence:
                            measure_name = measure.replace('c_', '') if measure.startswith('c_') else measure
                            node_coherence_scores[f'node_{measure_name}'] = per_topic_coherence[measure][topic_idx]
                        else:
                            measure_name = measure.replace('c_', '') if measure.startswith('c_') else measure
                            node_coherence_scores[f'node_{measure_name}'] = 0.0
                else:
                    for measure in ['npmi', 'v', 'u_mass']:
                        node_coherence_scores[f'node_{measure}'] = 0.0
                
                node_coherence_data.append({
                    'node_id': node_id,
                    'eta': eta,
                    'gamma': gamma, 
                    'depth': depth,
                    'alpha': alpha,
                    'layer': layer,
                    'document_count': document_count,
                    'top_k': top_k,
                    'top_words': ', '.join(top_words[:10]),
                    'word_count': len(top_words),
                    
                    # 只保留节点级一致性指标
                    'node_npmi': node_coherence_scores.get('node_npmi', 0.0),
                    'node_c_v': node_coherence_scores.get('node_v', 0.0),
                    'node_u_mass': node_coherence_scores.get('node_u_mass', 0.0),
                    
                    'iteration': max_iteration
                })
            
            # 保存节点级一致性结果（加上k值）
            coherence_df = pd.DataFrame(node_coherence_data)
            node_output_path = os.path.join(folder_path, f'node_coherence_k{top_k}.csv')
            coherence_df.to_csv(node_output_path, index=False)
            
            # 计算层级加权平均一致性
            layer_coherence_summary = []
            
            for layer in coherence_df['layer'].unique():
                if layer == -1:  # 跳过无效层级
                    continue
                    
                layer_data = coherence_df[coherence_df['layer'] == layer]
                total_docs = layer_data['document_count'].sum()
                
                if total_docs > 0:
                    # 按文档数加权平均
                    weighted_npmi = (layer_data['document_count'] * layer_data['node_npmi']).sum() / total_docs
                    weighted_c_v = (layer_data['document_count'] * layer_data['node_c_v']).sum() / total_docs
                    weighted_u_mass = (layer_data['document_count'] * layer_data['node_u_mass']).sum() / total_docs
                    
                    # 简单平均（不加权）
                    simple_npmi = layer_data['node_npmi'].mean()
                    simple_c_v = layer_data['node_c_v'].mean()
                    simple_u_mass = layer_data['node_u_mass'].mean()
                    
                    layer_coherence_summary.append({
                        'layer': layer,
                        'node_count': len(layer_data),
                        'total_documents': total_docs,
                        'avg_documents_per_node': total_docs / len(layer_data),
                        
                        # 文档数加权平均一致性
                        'weighted_avg_npmi': weighted_npmi,
                        'weighted_avg_c_v': weighted_c_v,
                        'weighted_avg_u_mass': weighted_u_mass,
                        
                        # 简单平均一致性
                        'simple_avg_npmi': simple_npmi,
                        'simple_avg_c_v': simple_c_v,
                        'simple_avg_u_mass': simple_u_mass,
                        
                        # 标准差
                        'std_npmi': layer_data['node_npmi'].std(),
                        'std_c_v': layer_data['node_c_v'].std(),
                        'std_u_mass': layer_data['node_u_mass'].std(),
                        
                        'top_k': top_k,  # 添加k值记录
                        'eta': eta,
                        'gamma': gamma,
                        'depth': depth,
                        'alpha': alpha
                    })
            
            # 保存层级汇总结果（加上k值）
            if layer_coherence_summary:
                layer_summary_df = pd.DataFrame(layer_coherence_summary)
                layer_output_path = os.path.join(folder_path, f'layer_coherence_summary_k{top_k}.csv')
                layer_summary_df.to_csv(layer_output_path, index=False)
                
                print(f"💾 节点一致性结果已保存到: {node_output_path}")
                print(f"💾 层级汇总结果已保存到: {layer_output_path}")
                
                print(f"📊 层级一致性汇总 (k={top_k}):")
                for _, row in layer_summary_df.iterrows():
                    layer_num = int(row['layer'])
                    node_count = int(row['node_count'])
                    w_npmi = row['weighted_avg_npmi']
                    w_cv = row['weighted_avg_c_v']
                    w_umass = row['weighted_avg_u_mass']
                    print(f"   Layer {layer_num} ({node_count}节点): NPMI={w_npmi:.4f}, C_V={w_cv:.4f}, U_Mass={w_umass:.4f}")
            
        except Exception as e:
            import traceback
            print(f"❌ 处理文件 {file_path} 时出错: {str(e)}")
            traceback.print_exc()
    
    print(f"\n✅ 所有文件的一致性分层分析完成！(k={top_k})")

def aggregate_coherence_by_eta(base_path=".", top_k=15):
    """
    按eta值汇总各层的一致性统计（包含k值）
    """
    # 查找所有layer_coherence_summary_k{top_k}.csv文件
    pattern = os.path.join(base_path, "**", f"layer_coherence_summary_k{top_k}.csv")
    files = glob.glob(pattern, recursive=True)
    
    print(f"🔍 查找文件模式: layer_coherence_summary_k{top_k}.csv")
    print(f"🔍 找到 {len(files)} 个层级汇总文件")
    
    all_data = []
    eta_groups = {}
    
    for file_path in files:
        folder_path = os.path.dirname(file_path)
        folder_name = os.path.basename(folder_path)
        parent_folder = os.path.dirname(folder_path)
        
        # 提取eta值
        eta = None
        if 'eta_' in folder_name:
            try:
                eta_part = folder_name.split('eta_')[1].split('_')[0]
                eta = float(eta_part)
            except:
                continue
        else:
            continue
        
        # 提取run编号
        run_match = folder_name.split('_run_')
        if len(run_match) > 1:
            run_id = run_match[1]
        else:
            continue
        
        if eta not in eta_groups:
            eta_groups[eta] = parent_folder
        
        try:
            df = pd.read_csv(file_path)
            
            for _, row in df.iterrows():
                all_data.append({
                    'eta': eta,
                    'run_id': run_id,
                    'layer': row['layer'],
                    'node_count': row['node_count'],
                    'total_documents': row['total_documents'],
                    'weighted_avg_npmi': row['weighted_avg_npmi'],
                    'weighted_avg_c_v': row['weighted_avg_c_v'],
                    'weighted_avg_u_mass': row['weighted_avg_u_mass'],
                    'simple_avg_npmi': row['simple_avg_npmi'],
                    'simple_avg_c_v': row['simple_avg_c_v'],
                    'simple_avg_u_mass': row['simple_avg_u_mass'],
                    'top_k': top_k,  # 添加k值记录
                    'parent_folder': parent_folder
                })
                
        except Exception as e:
            print(f"读取文件 {file_path} 时出错: {e}")
    
    # 转换为DataFrame并按eta分组汇总
    summary_df = pd.DataFrame(all_data)
    
    if summary_df.empty:
        print("未找到有效数据")
        return
    
    print("=" * 70)
    print(f"各ETA值的一致性层级汇总统计 (k={top_k})")
    print("=" * 70)
    
    # 按eta分组生成汇总文件
    for eta, group_data in summary_df.groupby('eta'):
        parent_folder = group_data['parent_folder'].iloc[0]
        
        print(f"\n处理 Eta={eta} (k={top_k})")
        
        layer_summary = group_data.groupby('layer').agg({
            'weighted_avg_npmi': ['mean', 'std', 'count'],
            'weighted_avg_c_v': ['mean', 'std', 'count'],
            'weighted_avg_u_mass': ['mean', 'std', 'count'],
            'simple_avg_npmi': ['mean', 'std'],
            'simple_avg_c_v': ['mean', 'std'],
            'simple_avg_u_mass': ['mean', 'std'],
            'node_count': 'mean',
            'total_documents': 'mean',
            'run_id': lambda x: ', '.join(sorted(x.unique()))
        }).round(4)
        
        # 平铺列名
        layer_summary.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in layer_summary.columns]
        layer_summary = layer_summary.reset_index()
        layer_summary.insert(0, 'eta', eta)
        layer_summary.insert(1, 'top_k', top_k)  # 添加k值列
        
        # 保存汇总结果（文件名包含k值）
        output_filename = f'eta_{eta}_coherence_layer_summary_k{top_k}.csv'
        output_path = os.path.join(parent_folder, output_filename)
        layer_summary.to_csv(output_path, index=False)
        
        print(f"  保存汇总文件: {output_path}")
        print(f"  层数: {len(layer_summary)}")
        
        # 显示简要统计
        for _, row in layer_summary.iterrows():
            layer_num = int(row['layer'])
            w_npmi = row['weighted_avg_npmi_mean']
            w_cv = row['weighted_avg_c_v_mean']
            w_umass = row['weighted_avg_u_mass_mean']
            run_count = int(row['weighted_avg_npmi_count'])
            
            print(f"    Layer {layer_num}: W_NPMI={w_npmi:.4f}, W_C_V={w_cv:.4f}, W_U_Mass={w_umass:.4f}, runs={run_count}")
    
    # 生成总体对比文件（文件名包含k值）
    overall_summary = summary_df.groupby(['eta', 'layer']).agg({
        'weighted_avg_npmi': ['mean', 'std'],
        'weighted_avg_c_v': ['mean', 'std'],
        'weighted_avg_u_mass': ['mean', 'std'],
        'run_id': 'count'
    }).round(4)
    
    overall_summary.columns = ['_'.join(col).strip() for col in overall_summary.columns]
    overall_summary = overall_summary.reset_index()
    overall_summary.insert(2, 'top_k', top_k)  # 添加k值列
    
    overall_output_path = os.path.join(base_path, f'eta_coherence_layer_comparison_k{top_k}.csv')
    overall_summary.to_csv(overall_output_path, index=False)
    print(f"\n总体对比文件保存到: {overall_output_path}")

In [68]:
# 执行精简版一致性分层分析（文件名包含k值）
base_path = "/Volumes/My Passport/收敛结果/step2"
top_k = 10

print("=" * 80)
print(f"开始计算节点一致性指标并进行分层分析 (k={top_k})...")
print("=" * 80)

# 计算节点一致性和层级汇总
calculate_coherence_layered_analysis(base_path, corpus, top_k)

print("\n" + "=" * 80)
print(f"开始按eta值汇总层级一致性统计 (k={top_k})...")
print("=" * 80)

# 按eta汇总（传入top_k参数）
aggregate_coherence_by_eta(base_path, top_k)

print("=" * 80)
print(f"✅ 一致性分层分析完成！(k={top_k})")
print("=" * 80)

开始计算节点一致性指标并进行分层分析 (k=10)...
🔍 找到 18 个词分布文件待处理 (top_k=10)

[1/18] 处理文件: depth_3_gamma_0.05_eta_0.1_run_2 (k=10)
参数 - Eta: 0.1, Gamma: 0.05, Depth: 3, Alpha: 0.1
📈 最后iteration: 175
📈 节点数: 231
   正在计算 c_npmi...
   ✓ c_npmi: 范围=[-0.3876, 0.2668]
   正在计算 c_v...
   ✓ c_v: 范围=[0.1804, 0.9180]
   正在计算 u_mass...
   ✓ u_mass: 范围=[-9.9187, -0.7397]
💾 节点一致性结果已保存到: /Volumes/My Passport/收敛结果/step2/step2_d3_g005_e01_收敛/depth_3_gamma_0.05_eta_0.1_run_2/node_coherence_k10.csv
💾 层级汇总结果已保存到: /Volumes/My Passport/收敛结果/step2/step2_d3_g005_e01_收敛/depth_3_gamma_0.05_eta_0.1_run_2/layer_coherence_summary_k10.csv
📊 层级一致性汇总 (k=10):
   Layer 0 (1节点): NPMI=0.0271, C_V=0.4820, U_Mass=-0.7397
   Layer 1 (44节点): NPMI=-0.0408, C_V=0.4139, U_Mass=-2.4372
   Layer 2 (186节点): NPMI=0.0045, C_V=0.4639, U_Mass=-3.1948

[2/18] 处理文件: depth_3_gamma_0.05_eta_0.1_run_3 (k=10)
参数 - Eta: 0.1, Gamma: 0.05, Depth: 3, Alpha: 0.1
📈 最后iteration: 175
📈 节点数: 215
   正在计算 c_npmi...
   ✓ c_npmi: 范围=[-0.3944, 0.2994]
   正在计算 c_v...
   ✓ c_

In [80]:
def compute_perplexity_with_path_mapping_fixed(word_data, path_mapping_data, corpus, test_doc_ids, eta_smoothing=0.1):
    """
    修正版：基于iteration_path_document_mapping.csv计算困惑度
    """
    
    print("🔄 构建模型参数（基于路径映射）...")
    
    # 1. 构建词典
    all_words = sorted(list(word_data['word'].dropna().unique()))
    word_to_id = {word: idx for idx, word in enumerate(all_words)}
    vocab_size = len(all_words)
    
    print(f"   词典大小: {vocab_size}")
    
    # 2. 构建节点的词分布 φ (topic-word distribution)
    node_word_probs = {}
    for node_id in word_data['node_id'].unique():
        node_words = word_data[word_data['node_id'] == node_id]
        
        # 初始化计数向量
        word_counts = np.zeros(vocab_size)
        
        # 填充词计数
        for _, row in node_words.iterrows():
            word = row['word']
            if pd.notna(word) and word in word_to_id:
                word_counts[word_to_id[word]] = row['count']
        
        # 添加平滑并归一化
        smoothed_counts = word_counts + eta_smoothing
        word_probs = smoothed_counts / smoothed_counts.sum()
        node_word_probs[node_id] = word_probs
    
    print(f"   构建了 {len(node_word_probs)} 个节点的词分布")
    
    # 3. 构建文档路径映射
    print("🔄 构建文档路径映射...")
    
    doc_paths = {}
    path_lengths = []
    
    print(f"   路径映射数据列: {path_mapping_data.columns.tolist()}")
    
    # 找到文档ID列
    doc_id_column = 'document_id'
    if doc_id_column not in path_mapping_data.columns:
        print("❌ 未找到document_id列")
        return None
    
    # 修正：获取层级列（包含_node_id后缀）
    layer_columns = []
    for i in range(10):  # 假设最多10层
        layer_col = f'layer_{i}_node_id'  # 修正：添加_node_id后缀
        if layer_col in path_mapping_data.columns:
            layer_columns.append(layer_col)
        else:
            break
    
    print(f"   找到层级列: {layer_columns}")
    
    if not layer_columns:
        print("❌ 未找到任何层级列")
        return None
    
    # 统计文档ID范围
    test_doc_range = f"[{min(test_doc_ids)}, {max(test_doc_ids)}]"
    data_doc_range = f"[{path_mapping_data[doc_id_column].min()}, {path_mapping_data[doc_id_column].max()}]"
    print(f"   测试文档ID范围: {test_doc_range}")
    print(f"   数据文档ID范围: {data_doc_range}")
    
    # 提取每个文档的完整路径
    matched_docs = 0
    for _, row in path_mapping_data.iterrows():
        doc_id = row[doc_id_column]
        
        # 检查文档ID是否在测试集中
        if doc_id not in test_doc_ids:
            continue
        
        # 构建完整路径：从layer_0_node_id到最后一个非空层级
        path = []
        for layer_col in layer_columns:
            if layer_col in row and pd.notna(row[layer_col]):
                path.append(int(row[layer_col]))
            else:
                break  # 遇到空值就停止
        
        if path:
            doc_paths[doc_id] = path
            path_lengths.append(len(path))
            matched_docs += 1
            
            # 调试：显示前几个成功匹配的路径
            if matched_docs <= 5:
                print(f"   调试 - 文档{doc_id}的完整路径: {path} (长度: {len(path)})")
    
    match_rate = matched_docs / len(test_doc_ids) if test_doc_ids else 0
    avg_path_length = np.mean(path_lengths) if path_lengths else 0
    
    print(f"   匹配到路径的测试文档: {matched_docs}/{len(test_doc_ids)} ({match_rate:.1%})")
    print(f"   平均路径长度: {avg_path_length:.1f}")
    
    if matched_docs == 0:
        print("❌ 没有匹配到任何文档路径，无法计算困惑度")
        return None
    
    # 4. 计算困惑度
    print("🔄 计算困惑度...")
    
    total_log_likelihood = 0.0
    total_words = 0
    valid_docs = 0
    doc_perplexities = []
    
    for doc_id in test_doc_ids:
        if doc_id not in corpus or doc_id not in doc_paths:
            continue
            
        doc_words = corpus[doc_id]
        if not doc_words:
            continue
        
        doc_path = doc_paths[doc_id]
        if not doc_path:
            continue
            
        valid_docs += 1
        
        # 计算文档的对数似然
        doc_log_likelihood = 0.0
        doc_word_count = 0
        
        for word in doc_words:
            if word in word_to_id:
                word_id = word_to_id[word]
                
                # 计算词在文档路径下的概率 (路径平均策略)
                word_prob = 0.0
                valid_nodes = 0
                
                for node_id in doc_path:
                    if node_id in node_word_probs:
                        word_prob += node_word_probs[node_id][word_id]
                        valid_nodes += 1
                
                if valid_nodes > 0:
                    word_prob /= valid_nodes
                
                # 确保概率非零
                if word_prob <= 0:
                    word_prob = 1e-10
                
                doc_log_likelihood += math.log(word_prob)
                doc_word_count += 1
        
        if doc_word_count > 0:
            # 计算单文档困惑度
            doc_perplexity = math.exp(-doc_log_likelihood / doc_word_count)
            doc_perplexities.append(doc_perplexity)
            
            total_log_likelihood += doc_log_likelihood
            total_words += doc_word_count
    
    if total_words == 0 or valid_docs == 0:
        print("⚠️ 没有有效的测试数据")
        return None
    
    # 计算总体困惑度
    overall_perplexity = math.exp(-total_log_likelihood / total_words)
    avg_doc_perplexity = np.mean(doc_perplexities) if doc_perplexities else 0.0
    
    print(f"   ✓ 处理了 {valid_docs} 个有效测试文档")
    print(f"   ✓ 总计 {total_words} 个测试词")
    
    return {
        'perplexity': overall_perplexity,
        'avg_doc_perplexity': avg_doc_perplexity,
        'log_likelihood': total_log_likelihood,
        'total_words': total_words,
        'valid_docs': valid_docs,
        'matched_docs': matched_docs,
        'match_rate': match_rate,
        'avg_path_length': avg_path_length
    }

def calculate_hlda_perplexity_with_path_mapping(base_path=".", corpus=None, test_ratio=0.2, random_state=42):
    """
    基于iteration_path_document_mapping.csv的hLDA困惑度计算
    
    Parameters:
    base_path: str, 结果文件的根目录
    corpus: dict, 原始语料 {doc_id: [word_list]}
    test_ratio: float, 测试集比例
    random_state: int, 随机种子
    """
    
    if corpus is None:
        print("❌ 必须提供原始语料corpus")
        return
    
    # 划分训练集和测试集
    doc_ids = list(corpus.keys())
    train_ids, test_ids = train_test_split(doc_ids, test_size=test_ratio, random_state=random_state)
    
    print(f"📊 数据集划分:")
    print(f"   总文档数: {len(doc_ids)}")
    print(f"   训练集: {len(train_ids)} 文档")
    print(f"   测试集: {len(test_ids)} 文档")
    
    pattern = os.path.join(base_path, "**", "iteration_node_word_distributions.csv")
    files = glob.glob(pattern, recursive=True)
    
    print(f"🔍 找到 {len(files)} 个模型结果文件待处理")
    
    for idx, file_path in enumerate(files, 1):
        folder_path = os.path.dirname(file_path)
        folder_name = os.path.basename(folder_path)
        
        # 动态参数提取
        eta = 0.1
        gamma = 0.05
        depth = 3
        alpha = 0.1
        
        for param_name in ['eta', 'gamma', 'depth', 'alpha']:
            if f'{param_name}_' in folder_name:
                try:
                    param_part = folder_name.split(f'{param_name}_')[1].split('_')[0]
                    if param_name == 'depth':
                        locals()[param_name] = int(param_part)
                    else:
                        locals()[param_name] = float(param_part)
                except Exception as e:
                    print(f"   ⚠️ 提取参数 {param_name} 失败: {e}")
        
        print(f"\n{'='*80}")
        print(f"[{idx}/{len(files)}] 计算困惑度: {folder_name}")
        print(f"参数 - Eta: {eta}, Gamma: {gamma}, Depth: {depth}, Alpha: {alpha}")
        print(f"{'='*80}")
        
        try:
            # 读取词分布数据
            word_df = pd.read_csv(file_path)
            word_df.columns = [col.strip("'\" ") for col in word_df.columns]
            
            # 读取路径映射数据
            path_mapping_file = os.path.join(folder_path, 'iteration_path_document_mapping.csv')
            if not os.path.exists(path_mapping_file):
                print("⚠️ 未找到路径映射文件，跳过此文件")
                continue
                
            path_mapping_df = pd.read_csv(path_mapping_file)
            path_mapping_df.columns = [col.strip("'\" ") for col in path_mapping_df.columns]
            
            # 获取最后一轮数据
            max_iteration = word_df['iteration'].max()
            last_word_data = word_df[word_df['iteration'] == max_iteration]
            last_path_mapping_data = path_mapping_df[path_mapping_df['iteration'] == max_iteration]
            
            print(f"📈 最后iteration: {max_iteration}")
            print(f"📈 节点数: {last_word_data['node_id'].nunique()}")
            print(f"📈 路径映射数: {len(last_path_mapping_data)}")
            
            # 计算困惑度
            perplexity_results = compute_perplexity_with_path_mapping(
                last_word_data, 
                last_path_mapping_data, 
                corpus, 
                test_ids, 
                eta
            )
            
            if perplexity_results is not None:
                # 保存困惑度结果
                perplexity_data = [{
                    'eta': eta,
                    'gamma': gamma,
                    'depth': depth,
                    'alpha': alpha,
                    'iteration': max_iteration,
                    'test_docs_count': len(test_ids),
                    'valid_test_docs': perplexity_results['valid_docs'],
                    'matched_docs': perplexity_results['matched_docs'],
                    'total_test_words': perplexity_results['total_words'],
                    'log_likelihood': perplexity_results['log_likelihood'],
                    'perplexity': perplexity_results['perplexity'],
                    'avg_doc_perplexity': perplexity_results['avg_doc_perplexity'],
                    'doc_match_rate': perplexity_results['match_rate'],
                    'avg_path_length': perplexity_results['avg_path_length']
                }]
                
                perplexity_df = pd.DataFrame(perplexity_data)
                output_path = os.path.join(folder_path, 'perplexity_results_path_mapping.csv')
                perplexity_df.to_csv(output_path, index=False)
                
                print(f"💾 困惑度结果已保存到: {output_path}")
                print(f"📊 困惑度结果:")
                print(f"   - 总困惑度: {perplexity_results['perplexity']:.4f}")
                print(f"   - 平均文档困惑度: {perplexity_results['avg_doc_perplexity']:.4f}")
                print(f"   - 文档匹配率: {perplexity_results['match_rate']:.1%}")
                print(f"   - 平均路径长度: {perplexity_results['avg_path_length']:.1f}")
                print(f"   - 有效测试文档: {perplexity_results['valid_docs']}/{len(test_ids)}")
            
        except Exception as e:
            import traceback
            print(f"❌ 处理文件 {file_path} 时出错: {str(e)}")
            traceback.print_exc()
    
    print(f"\n✅ 所有文件的困惑度计算完成（基于路径映射）！")

def aggregate_path_mapping_perplexity_by_eta(base_path="."):
    """
    按eta值汇总基于路径映射的困惑度统计
    """
    pattern = os.path.join(base_path, "**", "perplexity_results_path_mapping.csv")
    files = glob.glob(pattern, recursive=True)
    
    print(f"🔍 找到 {len(files)} 个路径映射困惑度结果文件")
    
    all_data = []
    eta_groups = {}
    
    for file_path in files:
        folder_path = os.path.dirname(file_path)
        folder_name = os.path.basename(folder_path)
        parent_folder = os.path.dirname(folder_path)
        
        # 提取eta值
        eta = None
        if 'eta_' in folder_name:
            try:
                eta_part = folder_name.split('eta_')[1].split('_')[0]
                eta = float(eta_part)
            except:
                continue
        else:
            continue
        
        # 提取run编号
        run_match = folder_name.split('_run_')
        if len(run_match) > 1:
            run_id = run_match[1]
        else:
            continue
        
        if eta not in eta_groups:
            eta_groups[eta] = parent_folder
        
        try:
            df = pd.read_csv(file_path)
            
            for _, row in df.iterrows():
                all_data.append({
                    'eta': eta,
                    'run_id': run_id,
                    'gamma': row['gamma'],
                    'depth': row['depth'],
                    'alpha': row['alpha'],
                    'perplexity': row['perplexity'],
                    'avg_doc_perplexity': row['avg_doc_perplexity'],
                    'valid_test_docs': row['valid_test_docs'],
                    'total_test_words': row['total_test_words'],
                    'doc_match_rate': row['doc_match_rate'],
                    'avg_path_length': row['avg_path_length'],
                    'parent_folder': parent_folder
                })
                
        except Exception as e:
            print(f"读取文件 {file_path} 时出错: {e}")
    
    # 转换为DataFrame并按eta分组汇总
    summary_df = pd.DataFrame(all_data)
    
    if summary_df.empty:
        print("未找到有效数据")
        return
    
    print("=" * 70)
    print("各ETA值的路径映射困惑度汇总统计")
    print("=" * 70)
    
    # 按eta分组生成汇总文件
    for eta, group_data in summary_df.groupby('eta'):
        parent_folder = group_data['parent_folder'].iloc[0]
        
        print(f"\n处理 Eta={eta}")
        
        # 计算汇总统计
        eta_summary = group_data.agg({
            'perplexity': ['mean', 'std', 'min', 'max', 'count'],
            'avg_doc_perplexity': ['mean', 'std', 'min', 'max'],
            'valid_test_docs': ['mean', 'std'],
            'total_test_words': 'mean',
            'doc_match_rate': ['mean', 'std'],
            'avg_path_length': ['mean', 'std'],
            'run_id': lambda x: ', '.join(sorted(x.unique()))
        }).round(4)
        
        # 平铺列名
        eta_summary.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in eta_summary.columns]
        eta_summary = eta_summary.reset_index()
        eta_summary.insert(0, 'eta', eta)
        
        # 保存汇总结果
        output_filename = f'eta_{eta}_perplexity_path_mapping_summary.csv'
        output_path = os.path.join(parent_folder, output_filename)
        eta_summary.to_csv(output_path, index=False)
        
        print(f"  保存汇总文件: {output_path}")
        print(f"  运行数: {int(eta_summary['perplexity_count'].iloc[0])}")
        print(f"  平均困惑度: {eta_summary['perplexity_mean'].iloc[0]:.4f} (±{eta_summary['perplexity_std'].iloc[0]:.4f})")
        print(f"  困惑度范围: [{eta_summary['perplexity_min'].iloc[0]:.4f}, {eta_summary['perplexity_max'].iloc[0]:.4f}]")
        print(f"  平均匹配率: {eta_summary['doc_match_rate_mean'].iloc[0]:.1%}")
        print(f"  平均路径长度: {eta_summary['avg_path_length_mean'].iloc[0]:.1f}")
    
    # 生成总体对比文件
    overall_summary = summary_df.groupby('eta').agg({
        'perplexity': ['mean', 'std', 'min', 'max'],
        'avg_doc_perplexity': ['mean', 'std'],
        'doc_match_rate': ['mean', 'std'],
        'avg_path_length': ['mean', 'std'],
        'run_id': 'count'
    }).round(4)
    
    overall_summary.columns = ['_'.join(col).strip() for col in overall_summary.columns]
    overall_summary = overall_summary.reset_index()
    
    overall_output_path = os.path.join(base_path, 'eta_perplexity_path_mapping_comparison.csv')
    overall_summary.to_csv(overall_output_path, index=False)
    print(f"\n总体对比文件保存到: {overall_output_path}")
    
    # 显示跨eta对比
    print(f"\n跨Eta困惑度对比（基于路径映射）:")
    print("Eta值      平均困惑度(±std)     最小值    最大值    匹配率(±std)    平均路径长度    运行数")
    print("-" * 85)
    
    for _, row in overall_summary.iterrows():
        eta = row['eta']
        mean_perp = row['perplexity_mean']
        std_perp = row['perplexity_std']
        min_perp = row['perplexity_min']
        max_perp = row['perplexity_max']
        match_rate = row['doc_match_rate_mean']
        match_std = row['doc_match_rate_std']
        path_length = row['avg_path_length_mean']
        run_count = int(row['run_id_count'])
        
        print(f"{eta:6.3f}    {mean_perp:8.4f}(±{std_perp:6.4f})   {min_perp:7.4f}   {max_perp:7.4f}   {match_rate:.1%}(±{match_std:.1%})   {path_length:8.1f}        {run_count:4d}")

In [86]:
# 首先运行完整的困惑度计算（如果还没有运行）
from sklearn.model_selection import train_test_split
import math
import pandas as pd
import numpy as np
import os
import glob

def calculate_hlda_perplexity_with_path_mapping_complete(base_path=".", corpus=None, test_ratio=0.2, random_state=42):
    """
    完整版：基于iteration_path_document_mapping.csv的hLDA困惑度计算
    """
    
    if corpus is None:
        print("❌ 必须提供原始语料corpus")
        return
    
    # 划分训练集和测试集
    doc_ids = list(corpus.keys())
    train_ids, test_ids = train_test_split(doc_ids, test_size=test_ratio, random_state=random_state)
    
    print(f"📊 数据集划分:")
    print(f"   总文档数: {len(doc_ids)}")
    print(f"   训练集: {len(train_ids)} 文档")
    print(f"   测试集: {len(test_ids)} 文档")
    
    pattern = os.path.join(base_path, "**", "iteration_node_word_distributions.csv")
    files = glob.glob(pattern, recursive=True)
    
    print(f"🔍 找到 {len(files)} 个模型结果文件待处理")
    
    for idx, file_path in enumerate(files, 1):
        folder_path = os.path.dirname(file_path)
        folder_name = os.path.basename(folder_path)
        
        # 动态参数提取
        eta = 0.1
        gamma = 0.05
        depth = 3
        alpha = 0.1
        
        for param_name in ['eta', 'gamma', 'depth', 'alpha']:
            if f'{param_name}_' in folder_name:
                try:
                    param_part = folder_name.split(f'{param_name}_')[1].split('_')[0]
                    if param_name == 'depth':
                        locals()[param_name] = int(param_part)
                    else:
                        locals()[param_name] = float(param_part)
                except Exception as e:
                    print(f"   ⚠️ 提取参数 {param_name} 失败: {e}")
        
        print(f"\n{'='*80}")
        print(f"[{idx}/{len(files)}] 计算困惑度: {folder_name}")
        print(f"参数 - Eta: {eta}, Gamma: {gamma}, Depth: {depth}, Alpha: {alpha}")
        print(f"{'='*80}")
        
        try:
            # 读取词分布数据
            word_df = pd.read_csv(file_path)
            word_df.columns = [col.strip("'\" ") for col in word_df.columns]
            
            # 读取路径映射数据
            path_mapping_file = os.path.join(folder_path, 'iteration_path_document_mapping.csv')
            if not os.path.exists(path_mapping_file):
                print("⚠️ 未找到路径映射文件，跳过此文件")
                continue
                
            path_mapping_df = pd.read_csv(path_mapping_file)
            path_mapping_df.columns = [col.strip("'\" ") for col in path_mapping_df.columns]
            
            # 获取最后一轮数据
            max_iteration = word_df['iteration'].max()
            last_word_data = word_df[word_df['iteration'] == max_iteration]
            last_path_mapping_data = path_mapping_df[path_mapping_df['iteration'] == max_iteration]
            
            print(f"📈 最后iteration: {max_iteration}")
            print(f"📈 节点数: {last_word_data['node_id'].nunique()}")
            print(f"📈 路径映射数: {len(last_path_mapping_data)}")
            
            # 使用修正版计算困惑度
            perplexity_results = compute_perplexity_with_path_mapping_fixed(
                last_word_data, 
                last_path_mapping_data, 
                corpus, 
                test_ids, 
                eta
            )
            
            if perplexity_results is not None:
                # 保存困惑度结果
                perplexity_data = [{
                    'eta': eta,
                    'gamma': gamma,
                    'depth': depth,
                    'alpha': alpha,
                    'iteration': max_iteration,
                    'test_docs_count': len(test_ids),
                    'valid_test_docs': perplexity_results['valid_docs'],
                    'matched_docs': perplexity_results['matched_docs'],
                    'total_test_words': perplexity_results['total_words'],
                    'log_likelihood': perplexity_results['log_likelihood'],
                    'perplexity': perplexity_results['perplexity'],
                    'avg_doc_perplexity': perplexity_results['avg_doc_perplexity'],
                    'doc_match_rate': perplexity_results['match_rate'],
                    'avg_path_length': perplexity_results['avg_path_length']
                }]
                
                perplexity_df = pd.DataFrame(perplexity_data)
                output_path = os.path.join(folder_path, 'perplexity_results_final.csv')
                perplexity_df.to_csv(output_path, index=False)
                
                print(f"💾 困惑度结果已保存到: {output_path}")
                print(f"📊 困惑度结果:")
                print(f"   - 总困惑度: {perplexity_results['perplexity']:.4f}")
                print(f"   - 平均文档困惑度: {perplexity_results['avg_doc_perplexity']:.4f}")
                print(f"   - 文档匹配率: {perplexity_results['match_rate']:.1%}")
                print(f"   - 平均路径长度: {perplexity_results['avg_path_length']:.1f}")
                print(f"   - 有效测试文档: {perplexity_results['valid_docs']}/{len(test_ids)}")
            
        except Exception as e:
            import traceback
            print(f"❌ 处理文件 {file_path} 时出错: {str(e)}")
            traceback.print_exc()
    
    print(f"\n✅ 所有文件的困惑度计算完成！")

def aggregate_perplexity_by_eta_groups(base_path="."):
    """
    按eta值汇总多个run的平均困惑度等指标（修复版）
    """
    
    # 查找所有perplexity_results_final.csv文件
    pattern = os.path.join(base_path, "**", "perplexity_results_final.csv")
    files = glob.glob(pattern, recursive=True)
    
    # 如果没有final文件，寻找其他困惑度文件
    if len(files) == 0:
        patterns = [
            "perplexity_results_path_mapping.csv",
            "perplexity_results_test.csv",
            "perplexity_results.csv"
        ]
        for pattern_name in patterns:
            pattern = os.path.join(base_path, "**", pattern_name)
            files = glob.glob(pattern, recursive=True)
            if len(files) > 0:
                print(f"🔍 使用文件模式: {pattern_name}")
                break
    
    print(f"🔍 找到 {len(files)} 个困惑度结果文件")
    
    if len(files) == 0:
        print("❌ 未找到任何困惑度结果文件")
        return
    
    all_data = []
    eta_groups = {}
    
    for file_path in files:
        folder_path = os.path.dirname(file_path)
        folder_name = os.path.basename(folder_path)
        parent_folder = os.path.dirname(folder_path)
        
        # 提取eta值
        eta = None
        if 'eta_' in folder_name:
            try:
                eta_part = folder_name.split('eta_')[1].split('_')[0]
                eta = float(eta_part)
            except:
                print(f"警告：无法从文件夹名称 {folder_name} 提取eta值")
                continue
        else:
            print(f"警告：文件夹名称 {folder_name} 不包含eta信息")
            continue
        
        # 提取run编号
        run_match = folder_name.split('_run_')
        if len(run_match) > 1:
            run_id = run_match[1]
        else:
            print(f"警告：无法从文件夹名称 {folder_name} 提取run编号")
            run_id = "unknown"
        
        if eta not in eta_groups:
            eta_groups[eta] = parent_folder
        
        try:
            df = pd.read_csv(file_path)
            print(f"📖 读取文件: {folder_name} - {len(df)} 行数据")
            
            for _, row in df.iterrows():
                # 检查必需字段是否存在
                if 'perplexity' not in row:
                    print(f"警告：{file_path} 缺少 perplexity 列")
                    continue
                    
                all_data.append({
                    'eta': eta,
                    'run_id': run_id,
                    'gamma': row.get('gamma', 0.05),
                    'depth': row.get('depth', 3),
                    'alpha': row.get('alpha', 0.1),
                    'perplexity': row.get('perplexity', 0),
                    'avg_doc_perplexity': row.get('avg_doc_perplexity', row.get('perplexity', 0)),
                    'valid_test_docs': row.get('valid_test_docs', 0),
                    'total_test_words': row.get('total_test_words', 0),
                    'doc_match_rate': row.get('doc_match_rate', 0),
                    'avg_path_length': row.get('avg_path_length', 0),
                    'log_likelihood': row.get('log_likelihood', 0),
                    'parent_folder': parent_folder
                })
                
        except Exception as e:
            print(f"读取文件 {file_path} 时出错: {e}")
    
    # 转换为DataFrame
    summary_df = pd.DataFrame(all_data)
    
    if summary_df.empty:
        print("未找到有效数据")
        return
    
    print(f"📊 数据摘要:")
    print(f"   总数据行数: {len(summary_df)}")
    print(f"   唯一eta值: {sorted(summary_df['eta'].unique())}")
    print(f"   每个eta的数据量: {summary_df['eta'].value_counts().sort_index().to_dict()}")
    
    print("=" * 80)
    print("各ETA值的困惑度汇总统计")
    print("=" * 80)
    
    # 按eta分组生成汇总文件
    for eta, group_data in summary_df.groupby('eta'):
        parent_folder = group_data['parent_folder'].iloc[0]
        
        print(f"\n处理 Eta={eta}")
        print(f"输出目录: {parent_folder}")
        print(f"该组数据量: {len(group_data)}")
        
        # 检查group_data是否为空
        if len(group_data) == 0:
            print(f"警告：Eta={eta} 组没有数据，跳过")
            continue
        
        # 修复：更安全的聚合字典构建
        agg_dict = {}
        
        # 检查每列是否存在有效数据，并构建相应的聚合字典
        numeric_cols = ['perplexity', 'avg_doc_perplexity', 'valid_test_docs', 
                       'total_test_words', 'doc_match_rate', 'avg_path_length', 'log_likelihood']
        
        for col in numeric_cols:
            if col in group_data.columns:
                # 检查列是否有非空且非NaN的数据
                valid_data = group_data[col].dropna()
                if len(valid_data) > 0:
                    if col in ['perplexity', 'avg_doc_perplexity', 'doc_match_rate', 'avg_path_length', 'log_likelihood']:
                        agg_dict[col] = ['mean', 'std', 'min', 'max']
                    else:
                        agg_dict[col] = ['mean', 'std']
                else:
                    print(f"   警告：{col} 列没有有效数据")
        
        # 添加计数 - 使用不会为空的列
        if 'run_id' in group_data.columns:
            agg_dict['run_id'] = 'count'
        
        # 参数列 - 使用first（第一个值）
        for col in ['gamma', 'depth', 'alpha']:
            if col in group_data.columns:
                agg_dict[col] = 'first'
        
        # 检查聚合字典是否为空
        if not agg_dict:
            print(f"警告：Eta={eta} 组没有可聚合的列，跳过")
            continue
        
        try:
            # 执行聚合操作
            print(f"   聚合字典: {list(agg_dict.keys())}")
            eta_summary = group_data.agg(agg_dict).round(4)
            
            # 平铺列名
            eta_summary.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in eta_summary.columns]
            eta_summary = eta_summary.reset_index()
            eta_summary.insert(0, 'eta', eta)
            
            # 添加run_id列表
            run_ids = ', '.join(sorted(group_data['run_id'].unique()))
            eta_summary['run_ids'] = run_ids
            
            # 保存汇总结果到与run文件夹同级的位置
            output_filename = f'eta_{eta}_perplexity_summary.csv'
            output_path = os.path.join(parent_folder, output_filename)
            eta_summary.to_csv(output_path, index=False)
            
            print(f"  ✓ 保存汇总文件: {output_path}")
            
            # 显示统计信息
            if 'run_id_count' in eta_summary.columns:
                print(f"  运行数: {int(eta_summary['run_id_count'].iloc[0])}")
            
            if 'perplexity_mean' in eta_summary.columns:
                mean_perp = eta_summary['perplexity_mean'].iloc[0]
                std_perp = eta_summary.get('perplexity_std', pd.Series([0])).iloc[0]
                print(f"  平均困惑度: {mean_perp:.4f} (±{std_perp:.4f})")
            
            print(f"  包含运行: {run_ids}")
            
        except Exception as e:
            print(f"❌ 处理Eta={eta}时出错: {e}")
            import traceback
            traceback.print_exc()
    
    # 生成总体对比文件（保存在base_path下）
    print(f"\n" + "=" * 80)
    print("生成总体对比文件")
    print("=" * 80)
    
    try:
        # 构建总体聚合字典
        overall_agg_dict = {}
        
        # 检查每列是否有足够的数据进行聚合
        for col in ['perplexity', 'avg_doc_perplexity', 'doc_match_rate', 'avg_path_length', 
                   'valid_test_docs', 'total_test_words', 'log_likelihood']:
            if col in summary_df.columns:
                valid_data = summary_df[col].dropna()
                if len(valid_data) > 0:
                    overall_agg_dict[col] = ['mean', 'std']
                    if col in ['perplexity', 'avg_doc_perplexity']:
                        overall_agg_dict[col].extend(['min', 'max'])
        
        # 添加计数
        if 'run_id' in summary_df.columns:
            overall_agg_dict['run_id'] = 'count'
        
        if not overall_agg_dict:
            print("警告：没有可聚合的列用于总体对比")
            return None
        
        overall_summary = summary_df.groupby('eta').agg(overall_agg_dict).round(4)
        
        # 平铺列名
        overall_summary.columns = ['_'.join(col).strip() for col in overall_summary.columns]
        overall_summary = overall_summary.reset_index()
        
        overall_output_path = os.path.join(base_path, 'eta_perplexity_comparison.csv')
        overall_summary.to_csv(overall_output_path, index=False)
        print(f"✓ 总体对比文件保存到: {overall_output_path}")
        
        # 显示跨eta对比
        print(f"\n跨Eta困惑度对比:")
        print("Eta值      平均困惑度(±std)     运行数")
        print("-" * 50)
        
        for _, row in overall_summary.iterrows():
            eta = row['eta']
            run_count = int(row.get('run_id_count', 0))
            
            if 'perplexity_mean' in row:
                mean_perp = row['perplexity_mean']
                std_perp = row.get('perplexity_std', 0)
                print(f"{eta:6.3f}    {mean_perp:8.4f}(±{std_perp:6.4f})        {run_count:4d}")
            else:
                print(f"{eta:6.3f}    数据缺失                    {run_count:4d}")
        
        return overall_summary
        
    except Exception as e:
        print(f"❌ 生成总体对比时出错: {e}")
        import traceback
        traceback.print_exc()
        return None

def analyze_perplexity_trends(base_path="."):
    """
    分析困惑度趋势
    """
    # 读取总体对比文件
    comparison_file = os.path.join(base_path, 'eta_perplexity_comparison.csv')
    
    if os.path.exists(comparison_file):
        df = pd.read_csv(comparison_file)
        
        print(f"\n📈 困惑度趋势分析:")
        print("=" * 60)
        
        # 计算eta与困惑度的相关性
        eta_perp_corr = df['eta'].corr(df['perplexity_mean'])
        eta_match_corr = df['eta'].corr(df['doc_match_rate_mean'])
        eta_path_corr = df['eta'].corr(df['avg_path_length_mean'])
        
        print(f"Eta与平均困惑度的相关系数: {eta_perp_corr:.4f}")
        print(f"Eta与文档匹配率的相关系数: {eta_match_corr:.4f}")
        print(f"Eta与平均路径长度的相关系数: {eta_path_corr:.4f}")
        
        # 找出最佳eta值
        best_eta_idx = df['perplexity_mean'].idxmin()
        best_eta = df.loc[best_eta_idx, 'eta']
        best_perplexity = df.loc[best_eta_idx, 'perplexity_mean']
        
        print(f"\n🏆 最佳表现:")
        print(f"   最低平均困惑度: {best_perplexity:.4f} (Eta={best_eta})")
        print(f"   对应匹配率: {df.loc[best_eta_idx, 'doc_match_rate_mean']:.1%}")
        print(f"   运行次数: {int(df.loc[best_eta_idx, 'run_id_count'])}")
        
        # 稳定性分析
        print(f"\n📊 稳定性分析 (变异系数):")
        for _, row in df.iterrows():
            eta = row['eta']
            cv = row['perplexity_std'] / row['perplexity_mean'] if row['perplexity_mean'] > 0 else 0
            print(f"   Eta {eta}: CV={cv:.4f}")
    
    else:
        print("⚠️ 未找到总体对比文件，请先运行汇总函数")

# 执行完整的困惑度计算和汇总
base_path = "/Volumes/My Passport/收敛结果/step2"

print("=" * 80)
print("开始完整的困惑度计算...")
print("=" * 80)

# 1. 计算困惑度（如果还没有完成）
calculate_hlda_perplexity_with_path_mapping_complete(base_path, corpus, test_ratio=0.2)

print("\n" + "=" * 80)
print("开始按eta值汇总困惑度统计...")
print("=" * 80)

# 2. 按eta汇总
overall_summary = aggregate_perplexity_by_eta_groups(base_path)

print("\n" + "=" * 80)
print("开始困惑度趋势分析...")
print("=" * 80)

# 3. 趋势分析
analyze_perplexity_trends(base_path)

print("=" * 80)
print("✅ 困惑度计算和汇总分析完成！")
print("=" * 80)

开始完整的困惑度计算...
📊 数据集划分:
   总文档数: 970
   训练集: 776 文档
   测试集: 194 文档
🔍 找到 18 个模型结果文件待处理

[1/18] 计算困惑度: depth_3_gamma_0.05_eta_0.1_run_2
参数 - Eta: 0.1, Gamma: 0.05, Depth: 3, Alpha: 0.1
📈 最后iteration: 175
📈 节点数: 231
📈 路径映射数: 970
🔄 构建模型参数（基于路径映射）...
   词典大小: 1490
   构建了 231 个节点的词分布
🔄 构建文档路径映射...
   路径映射数据列: ['iteration', 'leaf_node_id', 'document_id', 'layer_0_node_id', 'layer_1_node_id', 'layer_2_node_id']
   找到层级列: ['layer_0_node_id', 'layer_1_node_id', 'layer_2_node_id']
   测试文档ID范围: [23, 968]
   数据文档ID范围: [0, 969]
   调试 - 文档107的完整路径: [0, 1, 2] (长度: 3)
   调试 - 文档420的完整路径: [0, 1, 2] (长度: 3)
   调试 - 文档656的完整路径: [0, 1, 2] (长度: 3)
   调试 - 文档851的完整路径: [0, 1, 2] (长度: 3)
   调试 - 文档31的完整路径: [0, 1, 3] (长度: 3)
   匹配到路径的测试文档: 194/194 (100.0%)
   平均路径长度: 3.0
🔄 计算困惑度...
   ✓ 处理了 194 个有效测试文档
   ✓ 总计 16508 个测试词
💾 困惑度结果已保存到: /Volumes/My Passport/收敛结果/step2/step2_d3_g005_e01_收敛/depth_3_gamma_0.05_eta_0.1_run_2/perplexity_results_final.csv
📊 困惑度结果:
   - 总困惑度: 402.1430
   - 平均文档困惑度: 418.0372
   - 文档匹配率: 10

Traceback (most recent call last):
  File "/var/folders/v5/6mdkg5713kxgwg5xs24g8rvr0000gn/T/ipykernel_97915/1564661990.py", line 283, in aggregate_perplexity_by_eta_groups
    eta_summary = group_data.agg(agg_dict).round(4)
  File "/Users/wenlinsuniverse/opt/anaconda3/envs/huggingface/lib/python3.8/site-packages/pandas/core/frame.py", line 9342, in aggregate
    result = op.agg()
  File "/Users/wenlinsuniverse/opt/anaconda3/envs/huggingface/lib/python3.8/site-packages/pandas/core/apply.py", line 776, in agg
    result = super().agg()
  File "/Users/wenlinsuniverse/opt/anaconda3/envs/huggingface/lib/python3.8/site-packages/pandas/core/apply.py", line 172, in agg
    return self.agg_dict_like()
  File "/Users/wenlinsuniverse/opt/anaconda3/envs/huggingface/lib/python3.8/site-packages/pandas/core/apply.py", line 504, in agg_dict_like
    results = {
  File "/Users/wenlinsuniverse/opt/anaconda3/envs/huggingface/lib/python3.8/site-packages/pandas/core/apply.py", line 505, in <dictcomp>
    k

In [91]:
import pandas as pd
import numpy as np
import os
import glob

def calculate_branching_and_gini_metrics(base_path="."):
    """
    计算每个模型的分枝数和基尼系数指标
    """
    pattern = os.path.join(base_path, "**", "corrected_renyi_entropy.csv")
    files = glob.glob(pattern, recursive=True)
    
    print(f"🔍 找到 {len(files)} 个entropy文件待处理")
    
    for idx, file_path in enumerate(files, 1):
        folder_path = os.path.dirname(file_path)
        folder_name = os.path.basename(folder_path)
        
        print(f"\n[{idx}/{len(files)}] 处理文件夹: {folder_name}")
        
        try:
            # 读取entropy文件
            entropy_df = pd.read_csv(file_path)
            
            # 检查必要的列是否存在
            required_cols = ['node_id', 'layer', 'document_count', 'child_count']
            missing_cols = [col for col in required_cols if col not in entropy_df.columns]
            
            if missing_cols:
                print(f"⚠️ 缺少必要列: {missing_cols}，跳过此文件")
                continue
            
            # 1. 计算层级分枝数和基尼系数指标
            layer_metrics = []
            
            for layer in entropy_df['layer'].unique():
                if layer == -1:  # 跳过无效层级
                    continue
                    
                layer_nodes = entropy_df[entropy_df['layer'] == layer]
                
                # 基本统计
                node_count = len(layer_nodes)
                total_documents = layer_nodes['document_count'].sum()
                
                # 分枝数统计
                child_counts = layer_nodes['child_count'].values
                total_branches = child_counts.sum()
                
                # 非叶子节点统计
                non_leaf_nodes = (child_counts > 0).sum()
                non_leaf_counts = child_counts[child_counts > 0]
                
                # 分枝因子统计
                if len(non_leaf_counts) > 0:
                    avg_branching_factor = non_leaf_counts.mean()
                    std_branching_factor = non_leaf_counts.std()
                    non_leaf_avg_branching = non_leaf_counts.mean()
                else:
                    avg_branching_factor = 0.0
                    std_branching_factor = 0.0
                    non_leaf_avg_branching = 0.0
                
                # 基尼系数计算
                def gini_coefficient(values):
                    """计算基尼系数"""
                    if len(values) == 0:
                        return 0.0
                    values = np.array(values)
                    values = values[values > 0]  # 只考虑正值
                    if len(values) <= 1:
                        return 0.0
                    
                    values = np.sort(values)
                    n = len(values)
                    cumsum = np.cumsum(values)
                    return (n + 1 - 2 * np.sum(cumsum) / cumsum[-1]) / n
                
                # 文档分布基尼系数
                doc_counts = layer_nodes['document_count'].values
                gini_doc_distribution = gini_coefficient(doc_counts)
                
                # 分枝分布基尼系数
                gini_branch_distribution = gini_coefficient(child_counts)
                
                layer_metrics.append({
                    'layer': layer,
                    'node_count': node_count,
                    'total_branches': total_branches,
                    'avg_branching_factor': avg_branching_factor,
                    'std_branching_factor': std_branching_factor,
                    'non_leaf_nodes': non_leaf_nodes,
                    'non_leaf_avg_branching': non_leaf_avg_branching,
                    'total_documents': total_documents,
                    'gini_doc_distribution': gini_doc_distribution,
                    'gini_branch_distribution': gini_branch_distribution
                })
            
            # 保存层级指标
            if layer_metrics:
                layer_df = pd.DataFrame(layer_metrics)
                layer_output_path = os.path.join(folder_path, 'layer_branching_gini_metrics.csv')
                layer_df.to_csv(layer_output_path, index=False)
                print(f"✓ 层级指标保存到: {layer_output_path}")
            
            # 2. 计算全局分枝数和基尼系数指标
            total_nodes = len(entropy_df)
            total_layers = len(entropy_df['layer'].unique()) - (1 if -1 in entropy_df['layer'].unique() else 0)
            
            all_child_counts = entropy_df['child_count'].values
            total_branches = all_child_counts.sum()
            
            # 全局分枝统计
            non_zero_branches = all_child_counts[all_child_counts > 0]
            if len(non_zero_branches) > 0:
                global_avg_branching = non_zero_branches.mean()
                global_std_branching = non_zero_branches.std()
                global_max_branching = non_zero_branches.max()
            else:
                global_avg_branching = 0.0
                global_std_branching = 0.0
                global_max_branching = 0
            
            global_total_documents = entropy_df['document_count'].sum()
            
            # 全局基尼系数
            def gini_coefficient(values):
                if len(values) == 0:
                    return 0.0
                values = np.array(values)
                values = values[values > 0]
                if len(values) <= 1:
                    return 0.0
                
                values = np.sort(values)
                n = len(values)
                cumsum = np.cumsum(values)
                return (n + 1 - 2 * np.sum(cumsum) / cumsum[-1]) / n
            
            global_gini_doc_distribution = gini_coefficient(entropy_df['document_count'].values)
            global_gini_branch_distribution = gini_coefficient(all_child_counts)
            
            global_metrics = [{
                'total_nodes': total_nodes,
                'total_layers': total_layers,
                'total_branches': total_branches,
                'global_avg_branching': global_avg_branching,
                'global_std_branching': global_std_branching,
                'global_max_branching': global_max_branching,
                'global_total_documents': global_total_documents,
                'global_gini_doc_distribution': global_gini_doc_distribution,
                'global_gini_branch_distribution': global_gini_branch_distribution
            }]
            
            # 保存全局指标
            global_df = pd.DataFrame(global_metrics)
            global_output_path = os.path.join(folder_path, 'global_branching_gini_metrics.csv')
            global_df.to_csv(global_output_path, index=False)
            print(f"✓ 全局指标保存到: {global_output_path}")
            
            # 显示简要统计
            print(f"📊 指标摘要:")
            print(f"   总节点数: {total_nodes}")
            print(f"   总层数: {total_layers}")
            print(f"   全局平均分枝: {global_avg_branching:.2f}")
            print(f"   全局文档基尼: {global_gini_doc_distribution:.4f}")
            print(f"   全局分枝基尼: {global_gini_branch_distribution:.4f}")
            
        except Exception as e:
            import traceback
            print(f"❌ 处理文件 {file_path} 时出错: {str(e)}")
            traceback.print_exc()

def aggregate_branching_gini_by_eta(base_path="."):
    """
    按eta值汇总分枝数和基尼系数统计
    """
    
    # 1. 汇总层级指标
    print("=" * 80)
    print("汇总层级分枝数和基尼系数指标...")
    print("=" * 80)
    
    pattern = os.path.join(base_path, "**", "layer_branching_gini_metrics.csv")
    files = glob.glob(pattern, recursive=True)
    
    print(f"🔍 找到 {len(files)} 个层级指标文件")
    
    all_layer_data = []
    eta_groups = {}
    
    for file_path in files:
        folder_path = os.path.dirname(file_path)
        folder_name = os.path.basename(folder_path)
        parent_folder = os.path.dirname(folder_path)
        
        # 提取eta值
        eta = None
        if 'eta_' in folder_name:
            try:
                eta_part = folder_name.split('eta_')[1].split('_')[0]
                eta = float(eta_part)
            except:
                continue
        else:
            continue
        
        # 提取run编号
        run_match = folder_name.split('_run_')
        if len(run_match) > 1:
            run_id = run_match[1]
        else:
            continue
        
        if eta not in eta_groups:
            eta_groups[eta] = parent_folder
        
        try:
            df = pd.read_csv(file_path)
            
            for _, row in df.iterrows():
                all_layer_data.append({
                    'eta': eta,
                    'run_id': run_id,
                    'layer': row['layer'],
                    'node_count': row['node_count'],
                    'total_branches': row['total_branches'],
                    'avg_branching_factor': row['avg_branching_factor'],
                    'std_branching_factor': row['std_branching_factor'],
                    'non_leaf_nodes': row['non_leaf_nodes'],
                    'non_leaf_avg_branching': row['non_leaf_avg_branching'],
                    'total_documents': row['total_documents'],
                    'gini_doc_distribution': row['gini_doc_distribution'],
                    'gini_branch_distribution': row['gini_branch_distribution'],
                    'parent_folder': parent_folder
                })
                
        except Exception as e:
            print(f"读取文件 {file_path} 时出错: {e}")
    
    # 转换为DataFrame并按eta分组汇总
    if all_layer_data:
        layer_summary_df = pd.DataFrame(all_layer_data)
        
        print("各ETA值的层级分枝数和基尼系数汇总统计")
        print("=" * 80)
        
        # 按eta分组生成层级汇总文件
        for eta, group_data in layer_summary_df.groupby('eta'):
            parent_folder = group_data['parent_folder'].iloc[0]
            
            print(f"\n处理 Eta={eta}")
            
            layer_summary = group_data.groupby('layer').agg({
                'node_count': ['mean', 'std'],
                'total_branches': ['mean', 'std'],
                'avg_branching_factor': ['mean', 'std'],
                'std_branching_factor': ['mean', 'std'],
                'non_leaf_nodes': ['mean', 'std'],
                'non_leaf_avg_branching': ['mean', 'std'],
                'total_documents': ['mean', 'std'],
                'gini_doc_distribution': ['mean', 'std'],
                'gini_branch_distribution': ['mean', 'std'],
                'run_id': 'count'
            }).round(4)
            
            # 平铺列名
            layer_summary.columns = ['_'.join(col).strip() for col in layer_summary.columns]
            layer_summary = layer_summary.reset_index()
            layer_summary.insert(0, 'eta', eta)
            
            # 保存汇总结果
            output_filename = f'eta_{eta}_layer_branching_gini_summary.csv'
            output_path = os.path.join(parent_folder, output_filename)
            layer_summary.to_csv(output_path, index=False)
            
            print(f"  保存层级汇总文件: {output_path}")
            print(f"  层数: {len(layer_summary)}")
            
            # 修复：查找正确的计数列名
            count_col = None
            for col in layer_summary.columns:
                if 'run_id' in col and ('count' in col or col.endswith('_count')):
                    count_col = col
                    break
            
            # 显示简要统计
            for _, row in layer_summary.iterrows():
                layer_num = int(row['layer'])
                avg_branch = row['avg_branching_factor_mean']
                doc_gini = row['gini_doc_distribution_mean']
                branch_gini = row['gini_branch_distribution_mean']
                run_count = int(row[count_col]) if count_col else 0
                
                print(f"    Layer {layer_num}: 分枝={avg_branch:.2f}, 文档基尼={doc_gini:.4f}, 分枝基尼={branch_gini:.4f}, runs={run_count}")
        
        # 生成总体层级对比文件
        overall_layer_summary = layer_summary_df.groupby(['eta', 'layer']).agg({
            'avg_branching_factor': ['mean', 'std'],
            'gini_doc_distribution': ['mean', 'std'],
            'gini_branch_distribution': ['mean', 'std'],
            'node_count': ['mean', 'std'],
            'run_id': 'count'
        }).round(4)
        
        overall_layer_summary.columns = ['_'.join(col).strip() for col in overall_layer_summary.columns]
        overall_layer_summary = overall_layer_summary.reset_index()
        
        overall_layer_output_path = os.path.join(base_path, 'eta_layer_branching_gini_comparison.csv')
        overall_layer_summary.to_csv(overall_layer_output_path, index=False)
        print(f"\n总体层级对比文件保存到: {overall_layer_output_path}")
    
    # 2. 汇总全局指标
    print("\n" + "=" * 80)
    print("汇总全局分枝数和基尼系数指标...")
    print("=" * 80)
    
    pattern = os.path.join(base_path, "**", "global_branching_gini_metrics.csv")
    files = glob.glob(pattern, recursive=True)
    
    print(f"🔍 找到 {len(files)} 个全局指标文件")
    
    all_global_data = []
    
    for file_path in files:
        folder_path = os.path.dirname(file_path)
        folder_name = os.path.basename(folder_path)
        parent_folder = os.path.dirname(folder_path)
        
        # 提取eta值
        eta = None
        if 'eta_' in folder_name:
            try:
                eta_part = folder_name.split('eta_')[1].split('_')[0]
                eta = float(eta_part)
            except:
                continue
        else:
            continue
        
        # 提取run编号
        run_match = folder_name.split('_run_')
        if len(run_match) > 1:
            run_id = run_match[1]
        else:
            continue
        
        try:
            df = pd.read_csv(file_path)
            
            for _, row in df.iterrows():
                all_global_data.append({
                    'eta': eta,
                    'run_id': run_id,
                    'total_nodes': row['total_nodes'],
                    'total_layers': row['total_layers'],
                    'total_branches': row['total_branches'],
                    'global_avg_branching': row['global_avg_branching'],
                    'global_std_branching': row['global_std_branching'],
                    'global_max_branching': row['global_max_branching'],
                    'global_total_documents': row['global_total_documents'],
                    'global_gini_doc_distribution': row['global_gini_doc_distribution'],
                    'global_gini_branch_distribution': row['global_gini_branch_distribution'],
                    'parent_folder': parent_folder
                })
                
        except Exception as e:
            print(f"读取文件 {file_path} 时出错: {e}")
    
    # 转换为DataFrame并按eta分组汇总
    if all_global_data:
        global_summary_df = pd.DataFrame(all_global_data)
        
        print("各ETA值的全局分枝数和基尼系数汇总统计")
        print("=" * 80)
        
        # 按eta分组生成全局汇总文件
        for eta, group_data in global_summary_df.groupby('eta'):
            parent_folder = group_data['parent_folder'].iloc[0]
            
            print(f"\n处理 Eta={eta} 全局指标")
            
            global_summary = group_data.agg({
                'total_nodes': ['mean', 'std'],
                'total_layers': ['mean', 'std'],
                'total_branches': ['mean', 'std'],
                'global_avg_branching': ['mean', 'std'],
                'global_std_branching': ['mean', 'std'],
                'global_max_branching': ['mean', 'std'],
                'global_total_documents': ['mean', 'std'],
                'global_gini_doc_distribution': ['mean', 'std'],
                'global_gini_branch_distribution': ['mean', 'std'],
                'run_id': 'count'
            }).round(4)
            
            # 平铺列名
            global_summary.columns = ['_'.join(col).strip() for col in global_summary.columns]
            global_summary = global_summary.reset_index()
            global_summary.insert(0, 'eta', eta)
            
            # 保存汇总结果
            output_filename = f'eta_{eta}_global_branching_gini_summary.csv'
            output_path = os.path.join(parent_folder, output_filename)
            global_summary.to_csv(output_path, index=False)
            
            print(f"  保存全局汇总文件: {output_path}")
            
            # 修复：查找正确的计数列名
            count_col = None
            for col in global_summary.columns:
                if 'run_id' in col and ('count' in col or col.endswith('_count')):
                    count_col = col
                    break
            
            # 安全访问列
            if len(global_summary) > 0:
                run_count = int(global_summary[count_col].iloc[0]) if count_col else 0
                
                # 安全访问其他列
                cols_to_access = ['global_avg_branching_mean', 'global_gini_doc_distribution_mean', 'global_gini_branch_distribution_mean']
                values = {}
                
                for col_name in cols_to_access:
                    if col_name in global_summary.columns:
                        values[col_name] = global_summary[col_name].iloc[0]
                    else:
                        values[col_name] = 0.0
                
                print(f"  运行数: {run_count}")
                print(f"  全局平均分枝: {values['global_avg_branching_mean']:.2f}")
                print(f"  全局文档基尼: {values['global_gini_doc_distribution_mean']:.4f}")
                print(f"  全局分枝基尼: {values['global_gini_branch_distribution_mean']:.4f}")
        
        # 生成总体全局对比文件
        overall_global_summary = global_summary_df.groupby('eta').agg({
            'total_nodes': ['mean', 'std'],
            'total_branches': ['mean', 'std'],
            'global_avg_branching': ['mean', 'std'],
            'global_gini_doc_distribution': ['mean', 'std'],
            'global_gini_branch_distribution': ['mean', 'std'],
            'run_id': 'count'
        }).round(4)
        
        overall_global_summary.columns = ['_'.join(col).strip() for col in overall_global_summary.columns]
        overall_global_summary = overall_global_summary.reset_index()
        
        overall_global_output_path = os.path.join(base_path, 'eta_global_branching_gini_comparison.csv')
        overall_global_summary.to_csv(overall_global_output_path, index=False)
        print(f"\n总体全局对比文件保存到: {overall_global_output_path}")
        
        # 修复：查找正确的计数列名用于显示
        count_col = None
        for col in overall_global_summary.columns:
            if 'run_id' in col and ('count' in col or col.endswith('_count')):
                count_col = col
                break
        
        # 显示跨eta对比
        print(f"\n跨Eta全局指标对比:")
        print("Eta值      平均分枝(±std)     文档基尼(±std)     分枝基尼(±std)     运行数")
        print("-" * 80)
        
        for _, row in overall_global_summary.iterrows():
            eta = row['eta']
            
            # 安全访问列
            cols_needed = ['global_avg_branching_mean', 'global_avg_branching_std', 
                          'global_gini_doc_distribution_mean', 'global_gini_doc_distribution_std',
                          'global_gini_branch_distribution_mean', 'global_gini_branch_distribution_std']
            
            values = {}
            for col_name in cols_needed:
                if col_name in row:
                    values[col_name] = row[col_name]
                else:
                    values[col_name] = 0.0
            
            run_count = int(row[count_col]) if count_col and count_col in row else 0
            
            print(f"{eta:6.3f}    {values['global_avg_branching_mean']:6.2f}(±{values['global_avg_branching_std']:4.2f})     "
                  f"{values['global_gini_doc_distribution_mean']:6.4f}(±{values['global_gini_doc_distribution_std']:5.4f})     "
                  f"{values['global_gini_branch_distribution_mean']:6.4f}(±{values['global_gini_branch_distribution_std']:5.4f})     {run_count:4d}")

def display_branching_gini_summary(base_path="."):
    """
    显示分枝数和基尼系数的汇总报告
    """
    print("=" * 100)
    print("分枝数和基尼系数分析汇总报告")
    print("=" * 100)
    
    # 读取总体对比文件
    layer_comparison_file = os.path.join(base_path, 'eta_layer_branching_gini_comparison.csv')
    global_comparison_file = os.path.join(base_path, 'eta_global_branching_gini_comparison.csv')
    
    if os.path.exists(layer_comparison_file):
        print("\n📊 层级分枝数和基尼系数分析:")
        print("-" * 60)
        
        df = pd.read_csv(layer_comparison_file)
        
        # 修复：查找正确的计数列名
        count_col = None
        for col in df.columns:
            if 'run_id' in col and ('count' in col or col.endswith('_count')):
                count_col = col
                break
        
        for layer in sorted(df['layer'].unique()):
            print(f"\nLayer {int(layer)} 跨Eta对比:")
            print("Eta值      平均分枝(±std)     文档基尼(±std)     分枝基尼(±std)     运行数")
            print("-" * 75)
            
            layer_data = df[df['layer'] == layer]
            for _, row in layer_data.iterrows():
                eta = row['eta']
                avg_branch = row['avg_branching_factor_mean']
                branch_std = row['avg_branching_factor_std']
                doc_gini = row['gini_doc_distribution_mean']
                doc_gini_std = row['gini_doc_distribution_std']
                branch_gini = row['gini_branch_distribution_mean']
                branch_gini_std = row['gini_branch_distribution_std']
                run_count = int(row[count_col]) if count_col else 0
                
                print(f"{eta:6.3f}    {avg_branch:6.2f}(±{branch_std:4.2f})     {doc_gini:6.4f}(±{doc_gini_std:5.4f})     {branch_gini:6.4f}(±{branch_gini_std:5.4f})     {run_count:4d}")
    
    if os.path.exists(global_comparison_file):
        print(f"\n📊 全局分枝数和基尼系数分析:")
        print("-" * 60)
        
        df = pd.read_csv(global_comparison_file)
        
        # 修复：查找正确的计数列名
        count_col = None
        for col in df.columns:
            if 'run_id' in col and ('count' in col or col.endswith('_count')):
                count_col = col
                break
        
        print("Eta值      平均分枝(±std)     文档基尼(±std)     分枝基尼(±std)     运行数")
        print("-" * 80)
        
        for _, row in df.iterrows():
            eta = row['eta']
            avg_branch = row['global_avg_branching_mean']
            branch_std = row['global_avg_branching_std']
            doc_gini = row['global_gini_doc_distribution_mean']
            doc_gini_std = row['global_gini_doc_distribution_std']
            branch_gini = row['global_gini_branch_distribution_mean']
            branch_gini_std = row['global_gini_branch_distribution_std']
            run_count = int(row[count_col]) if count_col else 0
            
            print(f"{eta:6.3f}    {avg_branch:6.2f}(±{branch_std:4.2f})     {doc_gini:6.4f}(±{doc_gini_std:5.4f})     {branch_gini:6.4f}(±{branch_gini_std:5.4f})     {run_count:4d}")
    
    print("\n" + "=" * 100)
    print("✅ 分枝数和基尼系数分析完成！")
    print("=" * 100)

# 执行分枝数和基尼系数分析
base_path = "/Volumes/My Passport/收敛结果/step2"

print("=" * 80)
print("开始计算分枝数和基尼系数指标...")
print("=" * 80)

# 1. 计算每个模型的分枝数和基尼系数
calculate_branching_and_gini_metrics(base_path)

print("\n" + "=" * 80)
print("开始按eta值汇总分枝数和基尼系数统计...")
print("=" * 80)

# 2. 按eta汇总
aggregate_branching_gini_by_eta(base_path)

print("\n" + "=" * 80)
print("显示分枝数和基尼系数汇总报告...")
print("=" * 80)

# 3. 显示汇总报告
display_branching_gini_summary(base_path)

开始计算分枝数和基尼系数指标...
🔍 找到 18 个entropy文件待处理

[1/18] 处理文件夹: depth_3_gamma_0.05_eta_0.1_run_2
✓ 层级指标保存到: /Volumes/My Passport/收敛结果/step2/step2_d3_g005_e01_收敛/depth_3_gamma_0.05_eta_0.1_run_2/layer_branching_gini_metrics.csv
✓ 全局指标保存到: /Volumes/My Passport/收敛结果/step2/step2_d3_g005_e01_收敛/depth_3_gamma_0.05_eta_0.1_run_2/global_branching_gini_metrics.csv
📊 指标摘要:
   总节点数: 231
   总层数: 3
   全局平均分枝: 5.11
   全局文档基尼: 0.7333
   全局分枝基尼: 0.4790

[2/18] 处理文件夹: depth_3_gamma_0.05_eta_0.1_run_3
✓ 层级指标保存到: /Volumes/My Passport/收敛结果/step2/step2_d3_g005_e01_收敛/depth_3_gamma_0.05_eta_0.1_run_3/layer_branching_gini_metrics.csv
✓ 全局指标保存到: /Volumes/My Passport/收敛结果/step2/step2_d3_g005_e01_收敛/depth_3_gamma_0.05_eta_0.1_run_3/global_branching_gini_metrics.csv
📊 指标摘要:
   总节点数: 215
   总层数: 3
   全局平均分枝: 5.10
   全局文档基尼: 0.7276
   全局分枝基尼: 0.5579

[3/18] 处理文件夹: depth_3_gamma_0.05_eta_0.1_run_1
✓ 层级指标保存到: /Volumes/My Passport/收敛结果/step2/step2_d3_g005_e01_收敛/depth_3_gamma_0.05_eta_0.1_run_1/layer_branching_gini_metrics.cs