In [None]:
import pandas as pd
import numpy as np
import os
import glob
from scipy.special import gammaln

def calculate_renyi_entropy_vectorized(node_data, all_words, alpha_prior=1.0, renyi_alpha=2.0):
    """
    Compute the Renyi entropy (in bits) of one node's smoothed word distribution.

    The node's counts are laid out over the full vocabulary, ``alpha_prior`` is
    added to every entry (additive smoothing), the vector is normalised to
    probabilities, and the entropy of order ``renyi_alpha`` is evaluated with
    base-2 logarithms.

    Parameters:
    node_data: DataFrame with ``word`` and ``count`` columns for a single node.
        Rows whose word is missing or not in ``all_words`` are ignored; when a
        word appears in several rows, the last row wins.
    all_words: list, full vocabulary; its order defines the vector positions
    alpha_prior: float, pseudo-count added to every vocabulary entry
    renyi_alpha: float, order of the Renyi entropy (1.0 selects Shannon entropy)

    Returns:
    tuple: (entropy, nonzero_word_count)
        entropy is a float; nonzero_word_count is the number of vocabulary
        entries with a positive count before smoothing. ``(0.0, 0)`` is
        returned for an empty vocabulary, and the entropy is 0.0 when the
        smoothed counts sum to zero (no distribution can be formed).
    """
    vocab_size = len(all_words)
    if vocab_size == 0:
        return 0.0, 0

    word_to_idx = {word: idx for idx, word in enumerate(all_words)}
    counts = np.zeros(vocab_size)

    if len(node_data) > 0:
        # Keep one row per word (the last one) so the fancy-index assignment
        # below never writes the same position twice.
        entries = (
            node_data[['word', 'count']]
            .dropna(subset=['word'])
            .drop_duplicates('word', keep='last')
        )
        positions = entries['word'].map(word_to_idx)
        known = positions.notna().to_numpy()  # words outside the vocabulary map to NaN
        if known.any():
            target = positions.to_numpy()[known].astype(int)
            counts[target] = entries['count'].to_numpy()[known]

    # Number of vocabulary entries actually observed (before smoothing)
    nonzero_word_count = int(np.sum(counts > 0))

    smoothed_counts = counts + alpha_prior
    total = np.sum(smoothed_counts)
    if total == 0:
        # alpha_prior == 0 and nothing observed: avoid 0/0 -> NaN
        return 0.0, nonzero_word_count

    probabilities = smoothed_counts / total

    if renyi_alpha == 1.0:
        # Shannon limit. Zero-probability entries contribute nothing
        # (0 * log 0 := 0); they only occur when alpha_prior == 0.
        support = probabilities[probabilities != 0]
        entropy = -np.sum(support * np.log2(support))
    else:
        entropy = (1 / (1 - renyi_alpha)) * np.log2(np.sum(probabilities ** renyi_alpha))

    return float(entropy), nonzero_word_count

def process_all_iteration_files(base_path=".", alpha_prior=1.0, renyi_alpha=2.0):
    """
    Compute per-node Renyi entropies for every word-distribution file under ``base_path``.

    Each ``iteration_node_word_distributions.csv`` found recursively is handled
    on its own: only the rows of its highest ``iteration`` are used, the
    vocabulary is the set of non-null words in those rows, and one result row
    per ``node_id`` is written to ``corrected_renyi_entropy.csv`` in the same
    folder as the input file.

    Parameters:
    base_path: str, directory searched recursively for input files
    alpha_prior: float, smoothing pseudo-count forwarded to the entropy routine
    renyi_alpha: float, Renyi order forwarded to the entropy routine

    Returns:
    None. Results are written to disk and progress is printed. A failure while
    processing one file is reported with its traceback and does not stop the
    remaining files.
    """
    required_columns = ('node_id', 'iteration', 'word', 'count')
    pattern = os.path.join(base_path, "**", "iteration_node_word_distributions.csv")
    # Sorted so the processing (and log) order is the same on every run
    files = sorted(glob.glob(pattern, recursive=True))

    for file_path in files:
        folder_path = os.path.dirname(file_path)
        print(f"\n处理文件: {file_path}")

        try:
            df = pd.read_csv(file_path)

            # Header cells may carry stray quotes/spaces; normalise them
            df.columns = [col.strip("'\" ") for col in df.columns]

            # Check every column this step reads, not just node_id
            missing = [col for col in required_columns if col not in df.columns]
            if missing:
                print(f"警告：{file_path} 缺少 {', '.join(missing)} 列，跳过该文件")
                continue

            max_iteration = df['iteration'].max()
            last_iteration_data = df[df['iteration'] == max_iteration]
            all_words = list(last_iteration_data['word'].dropna().unique())
            vocab_size = len(all_words)

            print(f"最后一轮iteration: {max_iteration}, 词汇表大小: {vocab_size}, 节点数: {last_iteration_data['node_id'].nunique()}")

            results = []
            for node_id in last_iteration_data['node_id'].unique():
                node_data = last_iteration_data[last_iteration_data['node_id'] == node_id]

                entropy, nonzero_words = calculate_renyi_entropy_vectorized(
                    node_data, all_words, alpha_prior, renyi_alpha
                )

                # Fraction of the vocabulary that has a non-zero count in this node
                sparsity_ratio = nonzero_words / vocab_size if vocab_size > 0 else 0

                results.append({
                    'node_id': node_id,
                    'renyi_entropy_corrected': entropy,
                    'nonzero_word_count': nonzero_words,
                    'total_vocabulary_size': vocab_size,
                    'sparsity_ratio': sparsity_ratio,
                    'alpha_prior': alpha_prior,
                    'renyi_alpha': renyi_alpha,
                    'iteration': max_iteration
                })

            if not results:
                # Writing an empty frame would leave a column-less CSV that the
                # later steps cannot parse, so skip the file instead.
                print(f"警告：{file_path} 最后一轮没有任何节点数据，跳过该文件")
                continue

            results_df = pd.DataFrame(results)
            output_path = os.path.join(folder_path, 'corrected_renyi_entropy.csv')
            results_df.to_csv(output_path, index=False)
            print(f"保存修正的Renyi熵结果到: {output_path}")

            # Short sparsity summary for the log
            print(f"节点词汇稀疏性统计:")
            print(f"  - 平均非零词汇数: {results_df['nonzero_word_count'].mean():.1f}")
            print(f"  - 非零词汇数范围: {results_df['nonzero_word_count'].min()}-{results_df['nonzero_word_count'].max()}")
            print(f"  - 平均稀疏度: {results_df['sparsity_ratio'].mean():.3f}")
            print("=" * 50)

        except Exception as e:
            # Best-effort batch: report the failure and move on to the next file
            import traceback
            print(f"处理文件 {file_path} 时出错: {str(e)}")
            print("详细错误信息:")
            traceback.print_exc()

In [2]:
# Run configuration for the batch Renyi-entropy step
base_path = "/Volumes/My Passport/收敛结果/2"  # root directory, searched recursively for input CSVs
alpha_prior = 0.1  # Dirichlet prior smoothing parameter
renyi_alpha = 2.0  # order of the Renyi entropy

print("=" * 50)
print("开始批量计算修正的Renyi熵...")
print("=" * 50)
process_all_iteration_files(base_path, alpha_prior, renyi_alpha)
print("=" * 50)
print("全部处理完成！")
print("=" * 50)

开始批量计算修正的Renyi熵...

处理文件: /Volumes/My Passport/收敛结果/2/d4_g0005_收敛/depth_4_gamma_0.005_run_1/iteration_node_word_distributions.csv
最后一轮iteration: 115, 词汇表大小: 1490, 节点数: 228
保存修正的Renyi熵结果到: /Volumes/My Passport/收敛结果/2/d4_g0005_收敛/depth_4_gamma_0.005_run_1/corrected_renyi_entropy.csv
节点词汇稀疏性统计:
  - 平均非零词汇数: 71.7
  - 非零词汇数范围: 0-842
  - 平均稀疏度: 0.048

处理文件: /Volumes/My Passport/收敛结果/2/d4_g0005_收敛/depth_4_gamma_0.005_run_2/iteration_node_word_distributions.csv
最后一轮iteration: 115, 词汇表大小: 1490, 节点数: 235
保存修正的Renyi熵结果到: /Volumes/My Passport/收敛结果/2/d4_g0005_收敛/depth_4_gamma_0.005_run_2/corrected_renyi_entropy.csv
节点词汇稀疏性统计:
  - 平均非零词汇数: 72.6
  - 非零词汇数范围: 0-888
  - 平均稀疏度: 0.049

处理文件: /Volumes/My Passport/收敛结果/2/d4_g0005_收敛/depth_4_gamma_0.005_run_3/iteration_node_word_distributions.csv
最后一轮iteration: 115, 词汇表大小: 1490, 节点数: 246
保存修正的Renyi熵结果到: /Volumes/My Passport/收敛结果/2/d4_g0005_收敛/depth_4_gamma_0.005_run_3/corrected_renyi_entropy.csv
节点词汇稀疏性统计:
  - 平均非零词汇数: 78.5
  - 非零词汇数范围: 0-760
  - 平均稀疏度: 0.0

In [3]:
def calculate_node_document_counts(path_structures_df):
    """
    Build per-node document counts and tree relations from root-to-leaf paths.

    Every row describes one path: ``layer_<i>_node_id`` columns hold the node at
    depth ``i``, ``leaf_node_id`` is the path's leaf and ``document_count`` the
    number of documents on that path. Leaf counts are summed per leaf; every
    other node receives the sum of ``document_count`` over the rows that pass
    through it.

    Parameters:
    path_structures_df: DataFrame, rows of iteration_path_structures.csv
        (the caller is expected to have filtered it to a single iteration)

    Returns:
    dict: {node_id: {'document_count': number, 'layer': int,
                     'parent_id': node id or None, 'child_ids': list,
                     'child_count': int}}
    """
    available_columns = set(path_structures_df.columns)
    layer_columns = [col for col in path_structures_df.columns
                     if col.startswith('layer_') and col.endswith('_node_id')]
    layer_columns.sort()
    max_layer_idx = len(layer_columns) - 1

    print(f"[DEBUG] 发现层级列: {layer_columns}")
    print(f"[DEBUG] 最大层级索引: {max_layer_idx}")

    node_info = {}

    def ensure_node(node, layer):
        """Return the record for ``node``, creating it at ``layer`` if new."""
        if node not in node_info:
            node_info[node] = {
                'document_count': 0,
                'layer': layer,
                'parent_id': None,
                'child_ids': [],
                'child_count': 0
            }
        return node_info[node]

    # Pass 1: leaves collect their own document counts. The provisional layer
    # (deepest one) is corrected in pass 2 if the leaf sits on a shorter path.
    for _, row in path_structures_df.iterrows():
        leaf_node = row['leaf_node_id']
        if pd.notna(leaf_node):
            ensure_node(leaf_node, max_layer_idx)['document_count'] += row['document_count']

    # Pass 2: walk each path from the root to fix layers and parent/child links.
    path_columns = [f'layer_{layer_idx}_node_id' for layer_idx in range(max_layer_idx + 1)]
    for _, row in path_structures_df.iterrows():
        path_nodes = []
        for layer_col in path_columns:
            # A path ends at the first missing column or missing value
            if layer_col not in available_columns or pd.isna(row[layer_col]):
                break
            path_nodes.append(row[layer_col])

        for depth, node in enumerate(path_nodes):
            record = ensure_node(node, depth)
            record['layer'] = depth

            if depth > 0:
                parent_node = path_nodes[depth - 1]
                record['parent_id'] = parent_node
                # The parent was registered in the previous step of this walk
                siblings = node_info[parent_node]['child_ids']
                if node not in siblings:
                    siblings.append(node)

    # Pass 3: from the second-deepest layer up to the root, give every node
    # that has no count yet the total of all paths running through it.
    for layer_idx in range(max_layer_idx - 1, -1, -1):
        layer_col = f'layer_{layer_idx}_node_id'
        if layer_col not in available_columns:
            continue

        # One grouped sum per layer instead of re-filtering the frame per node
        docs_per_node = path_structures_df.groupby(layer_col)['document_count'].sum()
        for node, total_docs in docs_per_node.items():
            record = node_info.get(node)
            if record is not None and record['document_count'] == 0:
                record['document_count'] = total_docs

    for info in node_info.values():
        info['child_count'] = len(info['child_ids'])

    return node_info

def add_document_counts_to_entropy_files(base_path="."):
    """
    Enrich each corrected_renyi_entropy.csv with document counts and tree structure.

    For every ``iteration_path_structures.csv`` found recursively under
    ``base_path``, the rows of the highest iteration are turned into per-node
    information, and the ``corrected_renyi_entropy.csv`` in the same folder is
    rewritten in place with the extra columns ``document_count``, ``layer``,
    ``parent_id``, ``child_ids`` and ``child_count``. Nodes unknown to the path
    file get 0 / -1 / None / '' / 0 respectively.
    """
    search_glob = os.path.join(base_path, "**", "iteration_path_structures.csv")

    for file_path in glob.glob(search_glob, recursive=True):
        folder_path = os.path.dirname(file_path)
        print(f"\n处理路径结构文件: {file_path}")

        try:
            # Load the path table and normalise its header cells
            structures = pd.read_csv(file_path)
            structures.columns = [name.strip("'\" ") for name in structures.columns]

            # Only the final iteration is analysed
            max_iteration = structures['iteration'].max()
            final_paths = structures[structures['iteration'] == max_iteration]
            print(f"最后一轮iteration: {max_iteration}, 路径数: {len(final_paths)}")

            node_info = calculate_node_document_counts(final_paths)
            print(f"计算得到 {len(node_info)} 个节点的信息")

            entropy_file = os.path.join(folder_path, 'corrected_renyi_entropy.csv')
            if not os.path.exists(entropy_file):
                print(f"警告：未找到对应的entropy文件 {entropy_file}")
                continue

            entropy_df = pd.read_csv(entropy_file)

            def attribute(node_id, key, default):
                # Value stored for a node, or the default when the node is unknown
                return node_info.get(node_id, {}).get(key, default)

            def format_children(node_id):
                # "[a,b,c]" for nodes with children, empty string otherwise
                children = attribute(node_id, 'child_ids', [])
                if not children:
                    return ''
                return '[' + ','.join(str(child) for child in children) + ']'

            node_ids = entropy_df['node_id']
            entropy_df['document_count'] = node_ids.map(lambda nid: attribute(nid, 'document_count', 0))
            entropy_df['layer'] = node_ids.map(lambda nid: attribute(nid, 'layer', -1))
            entropy_df['parent_id'] = node_ids.map(lambda nid: attribute(nid, 'parent_id', None))
            entropy_df['child_ids'] = node_ids.map(format_children)
            entropy_df['child_count'] = node_ids.map(lambda nid: len(attribute(nid, 'child_ids', [])))

            # Overwrite the entropy file with the enriched table
            entropy_df.to_csv(entropy_file, index=False)
            print(f"已更新 {entropy_file}，添加了document_count, layer, parent_id, child_ids, child_count列")

            # Summary statistics for the log
            layer_histogram = entropy_df['layer'].value_counts().sort_index().to_dict()
            doc_min = entropy_df['document_count'].min()
            doc_max = entropy_df['document_count'].max()
            root_total = entropy_df[entropy_df['parent_id'].isna()].shape[0]
            leaf_total = entropy_df[entropy_df['child_ids'] == ''].shape[0]
            child_histogram = entropy_df['child_count'].value_counts().sort_index().to_dict()

            print(f"节点层级统计:")
            print(f"  - 层级分布: {layer_histogram}")
            print(f"  - 文档数范围: {doc_min}-{doc_max}")
            print(f"  - 根节点数: {root_total}")
            print(f"  - 叶子节点数: {leaf_total}")
            print(f"  - 子节点数分布: {child_histogram}")

        except Exception as e:
            import traceback
            print(f"处理文件 {file_path} 时出错: {str(e)}")
            print("详细错误信息:")
            traceback.print_exc()

In [4]:
# Driver cell: add document counts and layer information to the entropy files
import os
import glob
import pandas as pd 

base_path = "/Volumes/My Passport/收敛结果/2"  # root directory, searched recursively

print("=" * 50)
print("开始添加文档数和层级信息到entropy文件...")
print("=" * 50)
add_document_counts_to_entropy_files(base_path)
print("=" * 50)
print("文档数和层级信息添加完成！")
print("=" * 50)

开始添加文档数和层级信息到entropy文件...

处理路径结构文件: /Volumes/My Passport/收敛结果/2/d4_g0005_收敛/depth_4_gamma_0.005_run_1/iteration_path_structures.csv
最后一轮iteration: 115, 路径数: 141
[DEBUG] 发现层级列: ['layer_0_node_id', 'layer_1_node_id', 'layer_2_node_id', 'layer_3_node_id']
[DEBUG] 最大层级索引: 3
计算得到 228 个节点的信息
已更新 /Volumes/My Passport/收敛结果/2/d4_g0005_收敛/depth_4_gamma_0.005_run_1/corrected_renyi_entropy.csv，添加了document_count, layer, parent_id, child_ids, child_count列
节点层级统计:
  - 层级分布: {0: 1, 1: 19, 2: 67, 3: 141}
  - 文档数范围: 1-970
  - 根节点数: 1
  - 叶子节点数: 141
  - 子节点数分布: {0: 141, 1: 37, 2: 28, 3: 8, 4: 9, 5: 1, 7: 1, 19: 2, 24: 1}

处理路径结构文件: /Volumes/My Passport/收敛结果/2/d4_g0005_收敛/depth_4_gamma_0.005_run_2/iteration_path_structures.csv
最后一轮iteration: 115, 路径数: 155
[DEBUG] 发现层级列: ['layer_0_node_id', 'layer_1_node_id', 'layer_2_node_id', 'layer_3_node_id']
[DEBUG] 最大层级索引: 3
计算得到 235 个节点的信息
已更新 /Volumes/My Passport/收敛结果/2/d4_g0005_收敛/depth_4_gamma_0.005_run_2/corrected_renyi_entropy.csv，添加了document_count, layer, par

In [5]:
def calculate_jensen_shannon_distances_with_weighted_entropy(base_path=".", eta=0.1):
    """
    Compute, per tree layer, pairwise Jensen-Shannon distances between node word
    distributions and the document-weighted mean Renyi entropy.

    For every ``iteration_node_word_distributions.csv`` under ``base_path`` the
    last iteration's rows are used. Node layers, document counts and entropies
    are read from the ``corrected_renyi_entropy.csv`` in the same folder (files
    without it are skipped with a warning). Two CSVs are written next to the
    input: ``jensen_shannon_distances.csv`` (one row per node pair within a
    layer) and ``layer_average_js_distances.csv`` (one row per layer).

    Parameters:
    base_path: str, root directory searched recursively
    eta: float, pseudo-count added to every vocabulary entry before normalising

    Returns:
    None. A failure in one file is printed with its traceback and the loop
    continues with the next file.
    """
    pattern = os.path.join(base_path, "**", "iteration_node_word_distributions.csv")
    files = glob.glob(pattern, recursive=True)

    for file_path in files:
        folder_path = os.path.dirname(file_path)
        print(f"\n处理文件: {file_path}")

        try:
            word_df = pd.read_csv(file_path)
            word_df.columns = [col.strip("'\" ") for col in word_df.columns]

            # Only the final iteration is analysed
            max_iteration = word_df['iteration'].max()
            last_iteration_data = word_df[word_df['iteration'] == max_iteration]

            # Sorted vocabulary so vector positions are stable between runs
            all_words = sorted(list(last_iteration_data['word'].dropna().unique()))
            print(f"全量词汇表大小: {len(all_words)}")

            entropy_file = os.path.join(folder_path, 'corrected_renyi_entropy.csv')
            if not os.path.exists(entropy_file):
                print(f"警告：未找到entropy文件 {entropy_file}")
                continue

            entropy_df = pd.read_csv(entropy_file)

            layers = entropy_df.groupby('layer')['node_id'].apply(list).to_dict()
            print(f"层级分布: {[(layer, len(nodes)) for layer, nodes in layers.items()]}")

            # Loop invariants: word positions, and the document count of each
            # node (first row per node_id), looked up once instead of per pair.
            word_to_idx = {word: idx for idx, word in enumerate(all_words)}
            doc_counts = (
                entropy_df.drop_duplicates('node_id')
                .set_index('node_id')['document_count']
                .to_dict()
            )

            # Smoothed probability vector over the full vocabulary, per node
            node_distributions = {}
            for node_id in entropy_df['node_id'].unique():
                node_words = last_iteration_data[last_iteration_data['node_id'] == node_id]

                counts = np.zeros(len(all_words))
                if len(node_words) > 0:
                    # One row per word (last one wins), then a single indexed write
                    entries = (
                        node_words[['word', 'count']]
                        .dropna(subset=['word'])
                        .drop_duplicates('word', keep='last')
                    )
                    positions = entries['word'].map(word_to_idx)
                    known = positions.notna().to_numpy()
                    if known.any():
                        target = positions.to_numpy()[known].astype(int)
                        counts[target] = entries['count'].to_numpy()[known]

                smoothed_counts = counts + eta
                node_distributions[node_id] = smoothed_counts / np.sum(smoothed_counts)

            all_js_distances = []
            layer_avg_distances = []

            for layer, layer_nodes in layers.items():
                print(f"\n计算Layer {layer}的JS距离和加权平均熵 ({len(layer_nodes)} 个节点)")

                layer_js_distances = []
                n = len(layer_nodes)
                max_pairs = n * (n - 1) // 2  # number of unordered pairs; 0 for n < 2

                # Each unordered pair once (upper triangle, no self-pairs)
                for i, node1 in enumerate(layer_nodes):
                    for node2 in layer_nodes[i + 1:]:
                        if node1 in node_distributions and node2 in node_distributions:
                            js_distance = jensen_shannon_distance(
                                node_distributions[node1], node_distributions[node2]
                            )
                            layer_js_distances.append({
                                'layer': layer,
                                'node1_id': node1,
                                'node2_id': node2,
                                'js_distance': js_distance,
                                'node1_doc_count': doc_counts.get(node1, 0),
                                'node2_doc_count': doc_counts.get(node2, 0)
                            })

                all_js_distances.extend(layer_js_distances)

                # Mean over all possible pairs of the layer
                sum_js_distance = sum(d['js_distance'] for d in layer_js_distances)
                avg_js_distance = 0.0
                if layer_js_distances and n > 1:
                    avg_js_distance = sum_js_distance / max_pairs

                # Document-weighted mean entropy: sum(docs * entropy) / sum(docs)
                layer_entropy_data = entropy_df[entropy_df['layer'] == layer]
                total_docs = layer_entropy_data['document_count'].sum()

                if total_docs > 0:
                    weighted_entropy = (layer_entropy_data['document_count'] * layer_entropy_data['renyi_entropy_corrected']).sum() / total_docs
                else:
                    weighted_entropy = 0.0

                layer_avg_distances.append({
                    'layer': layer,
                    'node_count': n,
                    'total_pairs': len(layer_js_distances),
                    'max_pairs': max_pairs,
                    'sum_js_distance': sum_js_distance,
                    'avg_js_distance': avg_js_distance,
                    'total_documents': total_docs,
                    'weighted_avg_renyi_entropy': weighted_entropy
                })

                print(f"  - 节点数: {n}")
                print(f"  - 计算的节点对数: {len(layer_js_distances)}")
                print(f"  - 理论最大节点对数: {max_pairs}")
                print(f"  - 平均JS距离: {avg_js_distance:.4f}")
                print(f"  - 总文档数: {total_docs}")
                print(f"  - 文档数加权平均Renyi熵: {weighted_entropy:.4f}")
                print("=" * 50)

            # Pairwise detail table
            if all_js_distances:
                js_df = pd.DataFrame(all_js_distances)
                output_path = os.path.join(folder_path, 'jensen_shannon_distances.csv')
                js_df.to_csv(output_path, index=False)
                print(f"\n保存详细JS距离结果到: {output_path}")

            # Per-layer summary table
            if layer_avg_distances:
                avg_df = pd.DataFrame(layer_avg_distances)
                avg_output_path = os.path.join(folder_path, 'layer_average_js_distances.csv')
                avg_df.to_csv(avg_output_path, index=False)
                print(f"保存每层平均JS距离和加权熵结果到: {avg_output_path}")

                print(f"\n总体统计:")
                print(f"  - 总层数: {len(layer_avg_distances)}")
                print(f"  - 各层统计:")
                for row in layer_avg_distances:
                    print(f"    Layer {row['layer']}: JS距离={row['avg_js_distance']:.4f}, 加权熵={row['weighted_avg_renyi_entropy']:.4f} (基于{row['node_count']}个节点, {row['total_documents']}个文档)")

        except Exception as e:
            import traceback
            print(f"处理文件 {file_path} 时出错: {str(e)}")
            print("详细错误信息:")
            traceback.print_exc()

def jensen_shannon_distance(p, q):
    """
    Jensen-Shannon distance between two discrete distributions.

    Both inputs are normalised to sum to 1, the Jensen-Shannon divergence is
    computed with natural logarithms against the mixture ``m = (p + q) / 2``,
    and its square root is returned.

    Parameters:
    p, q: array-like of equal length, non-negative weights (need not be normalised)

    Returns:
    float: Jensen-Shannon distance; 0.0 for identical distributions and
    sqrt(ln 2) for distributions with disjoint support.
    """
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)

    # Make sure both are proper probability vectors
    p = p / np.sum(p)
    q = q / np.sum(q)

    m = 0.5 * (p + q)

    def kl_divergence(x, y):
        # Terms with x == 0 contribute nothing; skipping them avoids log(0)
        mask = (x > 0) & (y > 0)
        if not np.any(mask):
            return 0.0
        return np.sum(x[mask] * np.log(x[mask] / y[mask]))

    js_divergence = 0.5 * kl_divergence(p, m) + 0.5 * kl_divergence(q, m)

    # Rounding can leave a tiny negative value for near-identical inputs,
    # which would turn the square root into NaN; clamp at zero.
    js_divergence = np.maximum(js_divergence, 0.0)

    return float(np.sqrt(js_divergence))

In [6]:
import numpy as np
import os
import glob
import pandas as pd 
# Driver cell: compute Jensen-Shannon distances and document-weighted mean Renyi entropy
base_path = "/Volumes/My Passport/收敛结果/2"  # root directory, searched recursively
eta = 0.1  # Dirichlet smoothing parameter

print("=" * 50)
print("开始计算Jensen-Shannon距离和加权平均Renyi熵...")
print("=" * 50)
calculate_jensen_shannon_distances_with_weighted_entropy(base_path, eta)
print("=" * 50)
print("Jensen-Shannon距离和加权平均Renyi熵计算完成！")
print("=" * 50)

开始计算Jensen-Shannon距离和加权平均Renyi熵...

处理文件: /Volumes/My Passport/收敛结果/2/d4_g0005_收敛/depth_4_gamma_0.005_run_1/iteration_node_word_distributions.csv
全量词汇表大小: 1490
层级分布: [(0, 1), (1, 19), (2, 67), (3, 141)]

计算Layer 0的JS距离和加权平均熵 (1 个节点)
  - 节点数: 1
  - 计算的节点对数: 0
  - 理论最大节点对数: 0
  - 平均JS距离: 0.0000
  - 总文档数: 970
  - 文档数加权平均Renyi熵: 7.0594

计算Layer 1的JS距离和加权平均熵 (19 个节点)
  - 节点数: 19
  - 计算的节点对数: 171
  - 理论最大节点对数: 171
  - 平均JS距离: 0.3923
  - 总文档数: 970
  - 文档数加权平均Renyi熵: 7.3358

计算Layer 2的JS距离和加权平均熵 (67 个节点)
  - 节点数: 67
  - 计算的节点对数: 2211
  - 理论最大节点对数: 2211
  - 平均JS距离: 0.4114
  - 总文档数: 970
  - 文档数加权平均Renyi熵: 7.5513

计算Layer 3的JS距离和加权平均熵 (141 个节点)
  - 节点数: 141
  - 计算的节点对数: 9870
  - 理论最大节点对数: 9870
  - 平均JS距离: 0.4564
  - 总文档数: 970
  - 文档数加权平均Renyi熵: 7.6013

保存详细JS距离结果到: /Volumes/My Passport/收敛结果/2/d4_g0005_收敛/depth_4_gamma_0.005_run_1/jensen_shannon_distances.csv
保存每层平均JS距离和加权熵结果到: /Volumes/My Passport/收敛结果/2/d4_g0005_收敛/depth_4_gamma_0.005_run_1/layer_average_js_distances.csv

总体统计:
  - 总层数: 4
  - 各层

In [7]:
def aggregate_layer_statistics_by_gamma(base_path="."):
    """
    Summarise per-layer JS distance and weighted entropy across runs, grouped by gamma.

    Every ``layer_average_js_distances.csv`` under ``base_path`` is read. The
    gamma value and run id are taken from the name of the folder holding the
    file (``..._gamma_<value>_run_<id>``); folders with an unrecognised gamma
    or without ``_run_`` are ignored. For each (gamma, experiment type) group a
    ``gamma_<g>_<type>_layer_summary.csv`` is written next to the run folders,
    and a cross-gamma table for single-chain runs is written to
    ``<base_path>/gamma_layer_comparison.csv``.

    Parameters:
    base_path: str, root directory searched recursively

    Returns:
    None. Summary tables are written to disk and printed.
    """
    pattern = os.path.join(base_path, "**", "layer_average_js_distances.csv")
    files = glob.glob(pattern, recursive=True)

    # Folder-name tokens checked in this order; the first match wins.
    # NOTE(review): matching is by substring, so e.g. "gamma_0.015" would be
    # read as 0.01 - extend this table if other gamma values are introduced.
    gamma_tokens = (
        ('gamma_0.001', 0.001),
        ('gamma_0.005', 0.005),
        ('gamma_0.01', 0.01),
        ('gamma_0.05', 0.05),
        ('gamma_0.1', 0.1),
    )
    value_columns = ('layer', 'node_count', 'avg_js_distance',
                     'weighted_avg_renyi_entropy', 'total_documents')

    all_data = []

    for file_path in files:
        folder_path = os.path.dirname(file_path)
        folder_name = os.path.basename(folder_path)
        parent_folder = os.path.dirname(folder_path)  # directory that holds the run folders

        matched = next(((token, value) for token, value in gamma_tokens if token in folder_name), None)
        if matched is None:
            continue
        token, gamma = matched

        # The two-chain variant is only distinguished for gamma = 0.001
        if token == 'gamma_0.001' and '2条链' in parent_folder:
            experiment_type = '2chains'
        else:
            experiment_type = 'single'

        # Run id is whatever follows "_run_" in the folder name
        name_parts = folder_name.split('_run_')
        if len(name_parts) < 2:
            continue
        run_id = name_parts[1]

        try:
            df = pd.read_csv(file_path)

            for _, row in df.iterrows():
                record = {
                    'gamma': gamma,
                    'experiment_type': experiment_type,
                    'run_id': run_id,
                }
                record.update({col: row[col] for col in value_columns})
                record['parent_folder'] = parent_folder
                all_data.append(record)

        except Exception as e:
            print(f"读取文件 {file_path} 时出错: {e}")

    summary_df = pd.DataFrame(all_data)

    if summary_df.empty:
        print("未找到有效数据")
        return

    print("=" * 70)
    print("各GAMMA值的层级汇总统计")
    print("=" * 70)

    # Flattened aggregate names that get a friendlier label
    column_mapping = {
        'avg_js_distance_count': 'run_count',
        'weighted_avg_renyi_entropy_count': 'entropy_run_count',
        'node_count_mean': 'avg_node_count',
        'total_documents_mean': 'avg_total_documents',
        'run_id_<lambda>': 'included_runs'
    }

    for (gamma, experiment_type), group_data in summary_df.groupby(['gamma', 'experiment_type']):
        parent_folders = group_data['parent_folder'].unique()
        parent_folder = parent_folders[0]

        print(f"\n处理 Gamma={gamma:.3f}, 实验类型={'2条链' if experiment_type == '2chains' else '单链'}")
        print(f"输出目录: {parent_folder}")
        if len(parent_folders) > 1:
            # All runs are pooled, but only one summary file is written
            print(f"  警告：该组的运行分布在 {len(parent_folders)} 个目录下，汇总文件只写入第一个目录")

        layer_summary = group_data.groupby('layer').agg({
            'avg_js_distance': ['mean', 'std', 'count'],
            'weighted_avg_renyi_entropy': ['mean', 'std', 'count'],
            'node_count': ['mean', 'std'],
            'total_documents': 'mean',
            'run_id': lambda x: ', '.join(sorted(x.unique()))
        }).round(4)

        # Flatten the (column, statistic) MultiIndex into "column_statistic"
        layer_summary.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in layer_summary.columns]
        layer_summary = layer_summary.reset_index().rename(columns=column_mapping)

        layer_summary.insert(0, 'gamma', gamma)
        layer_summary.insert(1, 'experiment_type', experiment_type)

        # Written next to the run folders
        output_filename = f'gamma_{gamma:.3f}_{experiment_type}_layer_summary.csv'
        output_path = os.path.join(parent_folder, output_filename)
        layer_summary.to_csv(output_path, index=False)

        print(f"  保存汇总文件: {output_path}")
        print(f"  包含运行: {layer_summary['included_runs'].iloc[0] if 'included_runs' in layer_summary.columns else 'N/A'}")
        print(f"  层数: {len(layer_summary)}")

        for _, row in layer_summary.iterrows():
            layer_num = int(row['layer'])
            js_mean = row['avg_js_distance_mean']
            js_std = row['avg_js_distance_std']  # NaN when the group has a single run
            entropy_mean = row['weighted_avg_renyi_entropy_mean']
            entropy_std = row['weighted_avg_renyi_entropy_std']
            node_count = row['avg_node_count']
            run_count = int(row['run_count'])

            print(f"    Layer {layer_num}: JS={js_mean:.4f}(±{js_std:.4f}), 熵={entropy_mean:.4f}(±{entropy_std:.4f}), 节点={node_count:.1f}, runs={run_count}")

    print(f"\n" + "=" * 70)
    print("生成总体对比文件")
    print("=" * 70)

    # The cross-gamma comparison only covers single-chain experiments
    single_chain_data = summary_df[summary_df['experiment_type'] == 'single']

    if not single_chain_data.empty:
        overall_summary = single_chain_data.groupby(['gamma', 'layer']).agg({
            'avg_js_distance': ['mean', 'std'],
            'weighted_avg_renyi_entropy': ['mean', 'std'],
            'node_count': ['mean', 'std'],
            'run_id': 'count'
        }).round(4)

        overall_summary.columns = ['_'.join(col).strip() for col in overall_summary.columns]
        overall_summary = overall_summary.reset_index()

        overall_output_path = os.path.join(base_path, 'gamma_layer_comparison.csv')
        overall_summary.to_csv(overall_output_path, index=False)
        print(f"总体对比文件保存到: {overall_output_path}")

        for layer in sorted(single_chain_data['layer'].unique()):
            print(f"\nLayer {int(layer)} 跨Gamma对比:")
            print("Gamma值    JS距离(±std)      加权熵(±std)      节点数(±std)   运行数")
            print("-" * 75)

            layer_data = overall_summary[overall_summary['layer'] == layer]
            for _, row in layer_data.iterrows():
                gamma = row['gamma']
                js_mean = row['avg_js_distance_mean']
                js_std = row['avg_js_distance_std']
                entropy_mean = row['weighted_avg_renyi_entropy_mean']
                entropy_std = row['weighted_avg_renyi_entropy_std']
                node_mean = row['node_count_mean']
                node_std = row['node_count_std']
                run_count = int(row['run_id_count'])

                print(f"{gamma:6.3f}    {js_mean:6.4f}(±{js_std:5.4f})   {entropy_mean:6.4f}(±{entropy_std:5.4f})   {node_mean:6.1f}(±{node_std:4.1f})   {run_count:4d}")

In [8]:
# Driver cell: run the per-gamma summary analysis
base_path = "/Volumes/My Passport/收敛结果/2"
print("=" * 70)
print("开始汇总各Gamma值的层级统计...")
print("=" * 70)
aggregate_layer_statistics_by_gamma(base_path)
print("=" * 70)
print("汇总分析完成！")
print("=" * 70)

开始汇总各Gamma值的层级统计...
各GAMMA值的层级汇总统计

处理 Gamma=0.001, 实验类型=单链
输出目录: /Volumes/My Passport/收敛结果/2/d4_g0001_收敛
  保存汇总文件: /Volumes/My Passport/收敛结果/2/d4_g0001_收敛/gamma_0.001_single_layer_summary.csv
  包含运行: 1, 2, 3
  层数: 4
    Layer 0: JS=0.0000(±0.0000), 熵=7.0208(±0.0282), 节点=1.0, runs=3
    Layer 1: JS=0.4364(±0.0271), 熵=7.1383(±0.3054), 节点=17.0, runs=3
    Layer 2: JS=0.4130(±0.0092), 熵=7.5895(±0.1510), 节点=68.0, runs=3
    Layer 3: JS=0.4722(±0.0105), 熵=7.4100(±0.1589), 节点=157.7, runs=3

处理 Gamma=0.005, 实验类型=单链
输出目录: /Volumes/My Passport/收敛结果/2/d4_g0005_收敛
  保存汇总文件: /Volumes/My Passport/收敛结果/2/d4_g0005_收敛/gamma_0.005_single_layer_summary.csv
  包含运行: 1, 2, 3
  层数: 4
    Layer 0: JS=0.0000(±0.0000), 熵=7.1037(±0.0415), 节点=1.0, runs=3
    Layer 1: JS=0.4444(±0.0457), 熵=7.1893(±0.1358), 节点=16.7, runs=3
    Layer 2: JS=0.4044(±0.0198), 熵=7.5989(±0.1633), 节点=65.0, runs=3
    Layer 3: JS=0.4646(±0.0087), 熵=7.4875(±0.1168), 节点=153.7, runs=3

处理 Gamma=0.010, 实验类型=单链
输出目录: /Volumes/My Passport/收敛结果/

In [3]:
import pandas as pd
import glob
import os

# Merge every result_layers.csv under base_path and average the per-layer
# metrics for each (depth, gamma, eta, alpha) parameter combination.
base_path = "/Volumes/My Passport/收敛结果/4"
pattern = os.path.join(base_path, "**", "result_layers.csv")
files = glob.glob(pattern, recursive=True)

all_rows = []
for file in files:
    df = pd.read_csv(file)
    folder = os.path.dirname(file)
    # Fill in parameter columns the file itself does not carry
    for col in ['depth', 'gamma', 'eta', 'alpha']:
        if col not in df.columns:
            # Parse "<col>_<value>" out of the folder path; None when the
            # token is absent or the value is not numeric
            value = None
            if f"{col}_" in folder:
                try:
                    value = float(folder.split(f"{col}_")[1].split("_")[0])
                except ValueError:
                    value = None
            df[col] = value
    all_rows.append(df)

if not all_rows:
    # pd.concat raises on an empty list, so report the situation instead
    print(f"未在 {base_path} 下找到 result_layers.csv，未生成汇总表")
else:
    merged = pd.concat(all_rows, ignore_index=True)

    # Mean and standard deviation per parameter group and layer.
    # dropna=False keeps groups whose parameter could not be determined;
    # the default would silently drop those rows from the summary.
    group_cols = ['depth', 'gamma', 'eta', 'alpha', 'layer']
    summary = merged.groupby(group_cols, dropna=False).agg({
        'entropy_wavg': ['mean', 'std'],
        'distinctiveness_wavg_jsd': ['mean', 'std'],
        'nodes_in_layer': ['mean', 'std'],
    }).reset_index()

    # Flatten the two-level column names
    summary.columns = ['_'.join(col).strip('_') for col in summary.columns]

    summary.to_csv(os.path.join(base_path, "all_params_layer_mean.csv"), index=False)
    print("已生成每组参数每层的均值表 all_params_layer_mean.csv")
# // filepath: /Volumes/My Passport/收敛结果/3/step1_analysis.ipynb

已生成每组参数每层的均值表 all_params_layer_mean.csv
