## Summary
### we do the same for steps as step2, detailed comment see step2_analysis.ipynb

In [1]:
import pandas as pd
import numpy as np
import os
import glob
from scipy.special import gammaln

def calculate_renyi_entropy_vectorized(node_data, all_words, eta_prior=1.0, renyi_alpha=2.0):
    """
    Vectorized version of Renyi entropy calculation
    
    Parameters:
    node_data: DataFrame, node data containing word and count columns
    all_words: list, complete vocabulary
    eta_prior: float, Dirichlet prior smoothing parameter (obtained from eta value)
    renyi_alpha: float, order parameter for Renyi entropy
    
    Returns:
    tuple: (entropy, nonzero_word_count) Renyi entropy value and non-zero word count
    """
    if len(all_words) == 0:
        return 0.0, 0
    
    # Create word-to-index mapping
    word_to_idx = {word: idx for idx, word in enumerate(all_words)}
    
    # Initialize count vector
    counts = np.zeros(len(all_words))
    
    # Fill actual counts
    for _, row in node_data.iterrows():
        word = row['word']
        if pd.notna(word) and word in word_to_idx:
            counts[word_to_idx[word]] = row['count']
    
    # Count non-zero words (before smoothing)
    nonzero_word_count = np.sum(counts > 0)
    
    # Add eta smoothing
    smoothed_counts = counts + eta_prior
    
    # Calculate probability distribution
    probabilities = smoothed_counts / np.sum(smoothed_counts)
    
    # Calculate Renyi entropy (using natural logarithm)
    if renyi_alpha == 1.0:
        # Shannon entropy (since alpha smoothing makes all probabilities > 0, no need to add small constant)
        entropy = -np.sum(probabilities * np.log(probabilities))
    else:
        # General Renyi entropy
        entropy = (1 / (1 - renyi_alpha)) * np.log(np.sum(probabilities ** renyi_alpha))
    
    return entropy, int(nonzero_word_count)

def process_all_iteration_files_by_alpha(base_path=".", renyi_alpha=2.0):
    """
    Process each iteration_node_word_distributions.csv file separately and save results
    Corrected version: Use fixed eta=0.05 as Dirichlet smoothing parameter (adapted for step3)
    """
    pattern = os.path.join(base_path, "**", "iteration_node_word_distributions.csv")
    files = glob.glob(pattern, recursive=True)
    
    # Remove duplicates, ensure each file is processed only once
    files = list(set(files))
    files.sort()  # Sort for ordered processing
    
    print(f"Found {len(files)} files to process")
    
    for idx, file_path in enumerate(files, 1):
        folder_path = os.path.dirname(file_path)
        folder_name = os.path.basename(folder_path)
        
        # In step3, eta value is fixed at 0.05 (for Dirichlet smoothing)
        eta_prior = 0.05  # Fixed: Use 0.05 as smoothing parameter
        
        # Extract alpha value from folder name (used only for recording folder information)
        alpha = 0.1  # Default value
        if 'alpha_' in folder_name:
            try:
                alpha_part = folder_name.split('alpha_')[1].split('_')[0]
                alpha = float(alpha_part)
            except (IndexError, ValueError) as e:
                # Pattern matching through folder name
                if 'a001' in folder_name:
                    alpha = 0.01
                elif 'a005' in folder_name:
                    alpha = 0.05
                elif 'a02' in folder_name:
                    alpha = 0.2
                elif 'a05' in folder_name and 'a005' not in folder_name:
                    alpha = 0.5
                elif 'a1_' in folder_name or 'a1' in folder_name.split('_')[-1]:
                    alpha = 1.0
                elif 'a01' in folder_name:
                    alpha = 0.1
        else:
            # Pattern matching through folder name
            if 'a001' in folder_name:
                alpha = 0.01
            elif 'a005' in folder_name:
                alpha = 0.05
            elif 'a02' in folder_name:
                alpha = 0.2
            elif 'a05' in folder_name and 'a005' not in folder_name:
                alpha = 0.5
            elif 'a1_' in folder_name or 'a1' in folder_name.split('_')[-1]:
                alpha = 1.0
            elif 'a01' in folder_name:
                alpha = 0.1
        
        print(f"\n[{idx}/{len(files)}] Processing file: {file_path}")
        print(f"Folder: {folder_name}")
        print(f"Extracted alpha value: {alpha} (for recording only)")
        print(f"Using eta smoothing value: {eta_prior}")
        
        try:
            df = pd.read_csv(file_path)
            
            # Clean column names, remove single quotes, double quotes and spaces
            df.columns = [col.strip("'\" ") for col in df.columns]
            
            if 'node_id' not in df.columns:
                print(f"Warning: {file_path} missing node_id column, skipping this file")
                continue
                
            max_iteration = df['iteration'].max()
            last_iteration_data = df[df['iteration'] == max_iteration]
            all_words = list(last_iteration_data['word'].dropna().unique())
            
            print(f"Last iteration: {max_iteration}, vocabulary size: {len(all_words)}, node count: {last_iteration_data['node_id'].nunique()}")
            
            results = []
            for node_id in last_iteration_data['node_id'].unique():
                node_data = last_iteration_data[last_iteration_data['node_id'] == node_id]
                
                # Use fixed eta_prior=0.05 for Dirichlet smoothing
                entropy, nonzero_words = calculate_renyi_entropy_vectorized(
                    node_data, all_words, eta_prior, renyi_alpha  # Use eta_prior
                )
                
                # Calculate sparsity (proportion of non-zero words)
                sparsity_ratio = nonzero_words / len(all_words) if len(all_words) > 0 else 0
                
                results.append({
                    'node_id': node_id,
                    'renyi_entropy_corrected': entropy,
                    'nonzero_word_count': nonzero_words,
                    'total_vocabulary_size': len(all_words),
                    'sparsity_ratio': sparsity_ratio,
                    'eta_used': eta_prior,  # Corrected: Record actually used eta value
                    'alpha_folder': alpha,  # Corrected: Record folder's alpha value
                    'renyi_alpha': renyi_alpha,
                    'iteration': max_iteration
                })
            
            # Save new corrected_renyi_entropy.csv file
            results_df = pd.DataFrame(results)
            output_path = os.path.join(folder_path, 'corrected_renyi_entropy.csv')
            results_df.to_csv(output_path, index=False)
            print(f"✓ Saved corrected Renyi entropy results to: {output_path}")
            
            # Output some statistics
            print(f"Node vocabulary sparsity statistics:")
            print(f"  - Average non-zero word count: {results_df['nonzero_word_count'].mean():.1f}")
            print(f"  - Non-zero word count range: {results_df['nonzero_word_count'].min()}-{results_df['nonzero_word_count'].max()}")
            print(f"  - Average sparsity: {results_df['sparsity_ratio'].mean():.3f}")
            print("=" * 50)
            
        except Exception as e:
            import traceback
            print(f"❌ Error processing file {file_path}: {str(e)}")
            print("Detailed error information:")
            traceback.print_exc()

In [2]:
# Set parameters - adapted for step3
base_path = "/Volumes/My Passport/收敛结果/step3"  # step3 path
renyi_alpha = 2.0  # Renyi entropy order parameter

print("=" * 80)
print("Step3: Starting analysis of Alpha parameter impact on the model")
print("=" * 80)

# 1. Calculate corrected Renyi entropy (automatically adjust prior by alpha value)
print("Starting corrected Renyi entropy calculation...")
process_all_iteration_files_by_alpha(base_path, renyi_alpha)

print("=" * 50)
print("✅ Step3 Renyi entropy calculation completed!")
print("=" * 50)

Step3: Starting analysis of Alpha parameter impact on the model
Starting corrected Renyi entropy calculation...
Found 17 files to process

[1/17] Processing file: /Volumes/My Passport/收敛结果/step3/step3_d3_g005_e005_a001/depth_3_gamma_0.05_eta_0.05_alpha_0.01_run_1/iteration_node_word_distributions.csv
Folder: depth_3_gamma_0.05_eta_0.05_alpha_0.01_run_1
Extracted alpha value: 0.01 (for recording only)
Using eta smoothing value: 0.05
Last iteration: 285, vocabulary size: 1490, node count: 312
✓ Saved corrected Renyi entropy results to: /Volumes/My Passport/收敛结果/step3/step3_d3_g005_e005_a001/depth_3_gamma_0.05_eta_0.05_alpha_0.01_run_1/corrected_renyi_entropy.csv
Node vocabulary sparsity statistics:
  - Average non-zero word count: 60.7
  - Non-zero word count range: 0-829
  - Average sparsity: 0.041

[2/17] Processing file: /Volumes/My Passport/收敛结果/step3/step3_d3_g005_e005_a001/depth_3_gamma_0.05_eta_0.05_alpha_0.01_run_2/iteration_node_word_distributions.csv
Folder: depth_3_gamma_0.05_

In [3]:
def aggregate_layer_statistics_by_alpha(base_path="."):
    """
    Aggregate JS distance and weighted entropy statistics by alpha value, generate summary tables at the same level as run folders
    Corrected version: adapted for alpha parameter in step3 instead of eta parameter
    """
    # Find all layer_average_js_distances.csv files
    pattern = os.path.join(base_path, "**", "layer_average_js_distances.csv")
    files = glob.glob(pattern, recursive=True)
    
    # Store all data and grouping information
    all_data = []
    alpha_groups = {}  # Store parent directory for each alpha combination
    
    for file_path in files:
        folder_path = os.path.dirname(file_path)
        folder_name = os.path.basename(folder_path)
        parent_folder = os.path.dirname(folder_path)  # Parent directory of run folders
        
        # Extract alpha value from folder name (adapted for step3)
        alpha = None
        if 'alpha_' in folder_name:
            try:
                alpha_part = folder_name.split('alpha_')[1].split('_')[0]
                alpha = float(alpha_part)
            except (IndexError, ValueError):
                # Pattern matching through folder name
                if 'a001' in folder_name:
                    alpha = 0.01
                elif 'a005' in folder_name:
                    alpha = 0.05
                elif 'a02' in folder_name:
                    alpha = 0.2
                elif 'a05' in folder_name and 'a005' not in folder_name:
                    alpha = 0.5
                elif 'a1_' in folder_name or 'a1' in folder_name.split('_')[-1]:
                    alpha = 1.0
                elif 'a01' in folder_name:
                    alpha = 0.1
        else:
            # Pattern matching through folder name
            if 'a001' in folder_name:
                alpha = 0.01
            elif 'a005' in folder_name:
                alpha = 0.05
            elif 'a02' in folder_name:
                alpha = 0.2
            elif 'a05' in folder_name and 'a005' not in folder_name:
                alpha = 0.5
            elif 'a1_' in folder_name or 'a1' in folder_name.split('_')[-1]:
                alpha = 1.0
            elif 'a01' in folder_name:
                alpha = 0.1
        
        if alpha is None:
            print(f"Warning: unable to extract alpha value from folder name {folder_name}")
            continue
        
        # Extract run number
        run_match = folder_name.split('_run_')
        if len(run_match) > 1:
            run_id = run_match[1]
        else:
            print(f"Warning: unable to extract run number from folder name {folder_name}")
            continue
        
        # Record parent directory for alpha combination
        if alpha not in alpha_groups:
            alpha_groups[alpha] = parent_folder
        
        try:
            df = pd.read_csv(file_path)
            
            for _, row in df.iterrows():
                all_data.append({
                    'alpha': alpha,  # Corrected: use alpha instead of eta
                    'run_id': run_id,
                    'layer': row['layer'],
                    'node_count': row['node_count'],
                    'avg_js_distance': row['avg_js_distance'],
                    'weighted_avg_renyi_entropy': row['weighted_avg_renyi_entropy'],
                    'total_documents': row['total_documents'],
                    'eta_used': row.get('eta_used', 0.05),  # Record actually used eta value (fixed 0.05)
                    'alpha_folder': row.get('alpha_folder', alpha),  # Record folder's alpha value
                    'parent_folder': parent_folder
                })
                
        except Exception as e:
            print(f"Error reading file {file_path}: {e}")
    
    # Convert to DataFrame
    summary_df = pd.DataFrame(all_data)
    
    if summary_df.empty:
        print("No valid data found")
        return
    
    print("=" * 70)
    print("Layer summary statistics by ALPHA value (Step3)")
    print("=" * 70)
    
    # Generate summary files grouped by alpha
    for alpha, group_data in summary_df.groupby('alpha'):
        parent_folder = group_data['parent_folder'].iloc[0]
        
        print(f"\nProcessing Alpha={alpha}")
        print(f"Output directory: {parent_folder}")
        
        # Calculate summary statistics for each layer
        layer_summary = group_data.groupby('layer').agg({
            'avg_js_distance': ['mean', 'std', 'count'],
            'weighted_avg_renyi_entropy': ['mean', 'std', 'count'],
            'node_count': ['mean', 'std'],
            'total_documents': 'mean',
            'eta_used': 'first',  # Record used eta value
            'run_id': lambda x: ', '.join(sorted(x.unique()))
        }).round(4)
        
        # Flatten column names
        layer_summary.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in layer_summary.columns]
        layer_summary = layer_summary.reset_index()
        
        # Rename columns for clarity
        column_mapping = {
            'avg_js_distance_mean': 'avg_js_distance_mean',
            'avg_js_distance_std': 'avg_js_distance_std', 
            'avg_js_distance_count': 'run_count',
            'weighted_avg_renyi_entropy_mean': 'weighted_avg_renyi_entropy_mean',
            'weighted_avg_renyi_entropy_std': 'weighted_avg_renyi_entropy_std',
            'weighted_avg_renyi_entropy_count': 'entropy_run_count',
            'node_count_mean': 'avg_node_count',
            'node_count_std': 'node_count_std',
            'total_documents_mean': 'avg_total_documents',
            'eta_used_first': 'eta_used',
            'run_id_<lambda>': 'included_runs'
        }
        
        for old_name, new_name in column_mapping.items():
            if old_name in layer_summary.columns:
                layer_summary = layer_summary.rename(columns={old_name: new_name})
        
        # Add alpha information
        layer_summary.insert(0, 'alpha', alpha)
        
        # Save summary results at the same level as run folders
        output_filename = f'alpha_{alpha}_layer_summary.csv'
        output_path = os.path.join(parent_folder, output_filename)
        layer_summary.to_csv(output_path, index=False)
        
        print(f"  Summary file saved: {output_path}")
        print(f"  Included runs: {layer_summary['included_runs'].iloc[0] if 'included_runs' in layer_summary.columns else 'N/A'}")
        print(f"  Number of layers: {len(layer_summary)}")
        print(f"  Used Eta value: {layer_summary.get('eta_used', pd.Series([0.05])).iloc[0]}")
        
        # Display brief statistics
        for _, row in layer_summary.iterrows():
            layer_num = int(row['layer'])
            js_mean = row['avg_js_distance_mean']
            js_std = row['avg_js_distance_std'] if 'avg_js_distance_std' in row else 0
            entropy_mean = row['weighted_avg_renyi_entropy_mean']
            entropy_std = row['weighted_avg_renyi_entropy_std'] if 'weighted_avg_renyi_entropy_std' in row else 0
            node_count = row['avg_node_count']
            run_count = int(row['run_count']) if 'run_count' in row else 0
            
            print(f"    Layer {layer_num}: JS={js_mean:.4f}(±{js_std:.4f}), entropy={entropy_mean:.4f}(±{entropy_std:.4f}), nodes={node_count:.1f}, runs={run_count}")
    
    # Generate overall comparison file (saved under base_path)
    print(f"\n" + "=" * 70)
    print("Generating overall comparison file")
    print("=" * 70)
    
    overall_summary = summary_df.groupby(['alpha', 'layer']).agg({
        'avg_js_distance': ['mean', 'std'],
        'weighted_avg_renyi_entropy': ['mean', 'std'],
        'node_count': ['mean', 'std'],
        'run_id': 'count'
    }).round(4)
    
    # Flatten column names
    overall_summary.columns = ['_'.join(col).strip() for col in overall_summary.columns]
    overall_summary = overall_summary.reset_index()
    
    overall_output_path = os.path.join(base_path, 'alpha_layer_comparison.csv')
    overall_summary.to_csv(overall_output_path, index=False)
    print(f"Overall comparison file saved to: {overall_output_path}")
    
    # Display cross-alpha comparison
    for layer in sorted(summary_df['layer'].unique()):
        print(f"\nLayer {int(layer)} Cross-Alpha Comparison:")
        print("Alpha Value     JS Distance(±std)      Weighted Entropy(±std)      Node Count(±std)   Run Count")
        print("-" * 75)
        
        layer_data = overall_summary[overall_summary['layer'] == layer]
        for _, row in layer_data.iterrows():
            alpha = row['alpha']
            js_mean = row['avg_js_distance_mean']
            js_std = row['avg_js_distance_std']
            entropy_mean = row['weighted_avg_renyi_entropy_mean']
            entropy_std = row['weighted_avg_renyi_entropy_std']
            node_mean = row['node_count_mean']
            node_std = row['node_count_std']
            run_count = int(row['run_id_count'])
            
            print(f"{alpha:7.3f}    {js_mean:6.4f}(±{js_std:5.4f})   {entropy_mean:6.4f}(±{entropy_std:5.4f})   {node_mean:6.1f}(±{node_std:4.1f})   {run_count:4d}")

In [4]:
def calculate_node_document_counts(path_structures_df):
    """
    Aggregate from leaf nodes upward to calculate document counts and hierarchical relationships for each node
    
    Parameters:
    path_structures_df: DataFrame, data from iteration_path_structures.csv (filtered to last iteration)
    
    Returns:
    dict: {node_id: {'document_count': int, 'layer': int, 'parent_id': int, 'child_ids': list}} mapping
    """
    # Get all layer columns - fix regular expression
    layer_columns = [col for col in path_structures_df.columns if col.startswith('layer_') and col.endswith('_node_id')]
    layer_columns.sort()  # Ensure ordered arrangement
    max_layer_idx = len(layer_columns) - 1
    
    print(f"[DEBUG] Found layer columns: {layer_columns}")
    print(f"[DEBUG] Maximum layer index: {max_layer_idx}")
    
    # Initialize node information dictionary
    node_info = {}
    
    # First establish all nodes' layer and parent-child relationships
    for _, row in path_structures_df.iterrows():
        path_nodes = []
        for layer_idx in range(max_layer_idx + 1):
            layer_col = f'layer_{layer_idx}_node_id'
            if layer_col in path_structures_df.columns and pd.notna(row[layer_col]):
                path_nodes.append(row[layer_col])
            else:
                break
        
        # Establish layer and parent-child relationships for each node in the path
        for i, node in enumerate(path_nodes):
            if node not in node_info:
                node_info[node] = {
                    'document_count': 0,
                    'layer': i,
                    'parent_id': None,
                    'child_ids': [],
                    'child_count': 0
                }
            else:
                # Update layer information (ensure consistency)
                node_info[node]['layer'] = i
            
            # Set parent node relationship
            if i > 0:  # Not root node
                parent_node = path_nodes[i-1]
                node_info[node]['parent_id'] = parent_node
                
                # Add current node to parent node's child list
                if parent_node not in node_info:
                    node_info[parent_node] = {
                        'document_count': 0,
                        'layer': i-1,
                        'parent_id': None,
                        'child_ids': [],
                        'child_count': 0
                    }
                
                if node not in node_info[parent_node]['child_ids']:
                    node_info[parent_node]['child_ids'].append(node)
    
    # Then process leaf node document counts - after hierarchical relationships are established
    for _, row in path_structures_df.iterrows():
        leaf_node = row['leaf_node_id']
        if pd.notna(leaf_node) and leaf_node in node_info:
            node_info[leaf_node]['document_count'] += row['document_count']
    
    # Aggregate document counts upward from second-to-last layer
    for layer_idx in range(max_layer_idx - 1, -1, -1):  # From second-to-last layer to layer 0
        layer_col = f'layer_{layer_idx}_node_id'
        
        if layer_col not in path_structures_df.columns:
            continue
            
        # Get all unique nodes in this layer
        layer_nodes = path_structures_df[layer_col].dropna().unique()
        
        for node in layer_nodes:
            if node in node_info and node_info[node]['document_count'] == 0:
                # Calculate document count: sum all child nodes' document counts
                child_doc_count = 0
                for child_id in node_info[node]['child_ids']:
                    if child_id in node_info:
                        child_doc_count += node_info[child_id]['document_count']
                
                # If no child node document count, calculate directly from path structure
                if child_doc_count == 0:
                    total_docs = path_structures_df[path_structures_df[layer_col] == node]['document_count'].sum()
                    node_info[node]['document_count'] = total_docs
                else:
                    node_info[node]['document_count'] = child_doc_count

    # Calculate child count for each node
    for node_id, info in node_info.items():
        info['child_count'] = len(info['child_ids'])
    
    return node_info

def add_document_counts_to_entropy_files(base_path="."):
    """
    Add document counts and layer information to corrected_renyi_entropy.csv files (adapted for step3)
    """
    pattern = os.path.join(base_path, "**", "iteration_path_structures.csv")
    files = glob.glob(pattern, recursive=True)
    
    for file_path in files:
        folder_path = os.path.dirname(file_path)
        folder_name = os.path.basename(folder_path)
        
        print(f"\nProcessing path structure file: {folder_name}")
        
        try:
            # Read path_structures file
            df = pd.read_csv(file_path)
            df.columns = [col.strip("'\" ") for col in df.columns]
            
            # Get last iteration data
            max_iteration = df['iteration'].max()
            last_iteration_data = df[df['iteration'] == max_iteration]
            
            print(f"Last iteration: {max_iteration}, path count: {len(last_iteration_data)}")
            
            # Calculate document counts and hierarchical relationships for each node
            node_info = calculate_node_document_counts(last_iteration_data)
            
            print(f"Calculated information for {len(node_info)} nodes")
            
            # Read corresponding corrected_renyi_entropy.csv
            entropy_file = os.path.join(folder_path, 'corrected_renyi_entropy.csv')
            if os.path.exists(entropy_file):
                entropy_df = pd.read_csv(entropy_file)
                
                # Add new columns - fix child_ids format and child_count calculation
                entropy_df['document_count'] = entropy_df['node_id'].map(lambda x: node_info.get(x, {}).get('document_count', 0))
                entropy_df['layer'] = entropy_df['node_id'].map(lambda x: node_info.get(x, {}).get('layer', -1))
                entropy_df['parent_id'] = entropy_df['node_id'].map(lambda x: node_info.get(x, {}).get('parent_id', None))
                
                # Fix child_ids format: use square brackets instead of commas
                entropy_df['child_ids'] = entropy_df['node_id'].map(
                    lambda x: '[' + ','.join(map(str, node_info.get(x, {}).get('child_ids', []))) + ']' 
                    if node_info.get(x, {}).get('child_ids') else ''
                )
                
                # Fix child_count: directly use list length
                entropy_df['child_count'] = entropy_df['node_id'].map(lambda x: len(node_info.get(x, {}).get('child_ids', [])))

                # Save updated file
                entropy_df.to_csv(entropy_file, index=False)
                print(f"Updated {entropy_file}, added document_count, layer, parent_id, child_ids, child_count columns")
                
                # Display some statistics
                print(f"Node layer statistics:")
                print(f"  - Layer distribution: {entropy_df['layer'].value_counts().sort_index().to_dict()}")
                print(f"  - Document count range: {entropy_df['document_count'].min()}-{entropy_df['document_count'].max()}")
                print(f"  - Root node count: {entropy_df[entropy_df['parent_id'].isna()].shape[0]}")
                print(f"  - Leaf node count: {entropy_df[entropy_df['child_ids'] == ''].shape[0]}")
                print(f"  - Child count distribution: {entropy_df['child_count'].value_counts().sort_index().to_dict()}")
            else:
                print(f"Warning: Corresponding entropy file not found {entropy_file}")
                
        except Exception as e:
            import traceback
            print(f"Error processing file {file_path}: {str(e)}")
            print("Detailed error information:")
            traceback.print_exc()

In [5]:
# Main function: Add document counts and layer information to entropy files (adapted for step3)
import os
import glob
import pandas as pd 

base_path = "/Volumes/My Passport/收敛结果/step3"  # Change to step3 path

print("=" * 50)
print("Step3: Starting to add document counts and layer information to entropy files...")
print("=" * 50)
add_document_counts_to_entropy_files(base_path)
print("=" * 50)
print("Step3: Document counts and layer information addition completed!")
print("=" * 50)

Step3: Starting to add document counts and layer information to entropy files...

Processing path structure file: depth_3_gamma_0.05_eta_0.05_alpha_1_run_3
Last iteration: 265, path count: 242
[DEBUG] Found layer columns: ['layer_0_node_id', 'layer_1_node_id', 'layer_2_node_id']
[DEBUG] Maximum layer index: 2
Calculated information for 307 nodes
Updated /Volumes/My Passport/收敛结果/step3/step3_d3_g005_e005_a1/depth_3_gamma_0.05_eta_0.05_alpha_1_run_3/corrected_renyi_entropy.csv, added document_count, layer, parent_id, child_ids, child_count columns
Node layer statistics:
  - Layer distribution: {0: 1, 1: 64, 2: 242}
  - Document count range: 1-970
  - Root node count: 1
  - Leaf node count: 242
  - Child count distribution: {0: 242, 1: 14, 2: 15, 3: 11, 4: 7, 5: 7, 6: 5, 7: 3, 9: 1, 42: 1, 64: 1}

Processing path structure file: depth_3_gamma_0.05_eta_0.05_alpha_1_run_2
Last iteration: 245, path count: 263
[DEBUG] Found layer columns: ['layer_0_node_id', 'layer_1_node_id', 'layer_2_node_i

In [6]:
def jensen_shannon_distance(p, q):
    """
    Calculate Jensen-Shannon distance between two probability distributions
    
    Parameters:
    p, q: array-like, probability distributions (should be normalized)
    
    Returns:
    float: Jensen-Shannon distance
    """
    # Ensure inputs are numpy arrays
    p = np.array(p)
    q = np.array(q)
    
    # Calculate midpoint distribution
    m = 0.5 * (p + q)
    
    # Calculate KL divergence, add small constant to avoid log(0)
    eps = 1e-10
    kl_pm = np.sum(p * np.log((p + eps) / (m + eps)))
    kl_qm = np.sum(q * np.log((q + eps) / (m + eps)))
    
    # Jensen-Shannon divergence
    js_divergence = 0.5 * kl_pm + 0.5 * kl_qm
    
    # Jensen-Shannon distance (square root of divergence)
    js_distance = np.sqrt(js_divergence)
    
    return js_distance

def calculate_jensen_shannon_distances_with_weighted_entropy_by_alpha(base_path="."):
    """
    Revised version: Calculate Jensen-Shannon distances between nodes in each layer and document-weighted average Renyi entropy
    Use fixed eta=0.05 as Dirichlet smoothing parameter, alpha value is only used for folder identification
    
    Note: Renyi entropy calculation uses natural logarithm (loge), unit is nats
    """
    # Find all iteration_node_word_distributions.csv files
    pattern = os.path.join(base_path, "**", "iteration_node_word_distributions.csv")
    files = glob.glob(pattern, recursive=True)
    
    print(f"Found {len(files)} word distribution files to process")
    
    # In step3, eta value is fixed at 0.05 (for Dirichlet smoothing)
    eta = 0.05  # Revised: fixed use of 0.05 as smoothing parameter
    
    # Group files by alpha value for display
    files_by_alpha = {}
    for file_path in files:
        folder_name = os.path.basename(os.path.dirname(file_path))
        alpha = 0.1  # Default value
        if 'alpha_' in folder_name:
            try:
                alpha_part = folder_name.split('alpha_')[1].split('_')[0]
                alpha = float(alpha_part)
            except:
                # Pattern matching through folder name
                if 'a001' in folder_name:
                    alpha = 0.01
                elif 'a005' in folder_name:
                    alpha = 0.05
                elif 'a02' in folder_name:
                    alpha = 0.2
                elif 'a05' in folder_name and 'a005' not in folder_name:
                    alpha = 0.5
                elif 'a1_' in folder_name or 'a1' in folder_name.split('_')[-1]:
                    alpha = 1.0
                elif 'a01' in folder_name:
                    alpha = 0.1
        else:
            # Pattern matching through folder name
            if 'a001' in folder_name:
                alpha = 0.01
            elif 'a005' in folder_name:
                alpha = 0.05
            elif 'a02' in folder_name:
                alpha = 0.2
            elif 'a05' in folder_name and 'a005' not in folder_name:
                alpha = 0.5
            elif 'a1_' in folder_name or 'a1' in folder_name.split('_')[-1]:
                alpha = 1.0
            elif 'a01' in folder_name:
                alpha = 0.1
        
        if alpha not in files_by_alpha:
            files_by_alpha[alpha] = []
        files_by_alpha[alpha].append(file_path)
    
    print("File distribution:")
    for alpha in sorted(files_by_alpha.keys()):
        print(f"  Alpha {alpha}: {len(files_by_alpha[alpha])} files")
    print(f"Using fixed Eta value: {eta}")
    print()
    
    # Process each file
    for idx, file_path in enumerate(files, 1):
        folder_path = os.path.dirname(file_path)
        folder_name = os.path.basename(folder_path)
        
        # Extract alpha value and run information from folder name (for recording only)
        alpha = 0.1  # Default value
        run_id = "unknown"
        
        if 'alpha_' in folder_name:
            try:
                alpha_part = folder_name.split('alpha_')[1].split('_')[0]
                alpha = float(alpha_part)
            except:
                # Pattern matching through folder name
                if 'a001' in folder_name:
                    alpha = 0.01
                elif 'a005' in folder_name:
                    alpha = 0.05
                elif 'a02' in folder_name:
                    alpha = 0.2
                elif 'a05' in folder_name and 'a005' not in folder_name:
                    alpha = 0.5
                elif 'a1_' in folder_name or 'a1' in folder_name.split('_')[-1]:
                    alpha = 1.0
                elif 'a01' in folder_name:
                    alpha = 0.1
        else:
            # Pattern matching through folder name
            if 'a001' in folder_name:
                alpha = 0.01
            elif 'a005' in folder_name:
                alpha = 0.05
            elif 'a02' in folder_name:
                alpha = 0.2
            elif 'a05' in folder_name and 'a005' not in folder_name:
                alpha = 0.5
            elif 'a1_' in folder_name or 'a1' in folder_name.split('_')[-1]:
                alpha = 1.0
            elif 'a01' in folder_name:
                alpha = 0.1
        
        if '_run_' in folder_name:
            try:
                run_id = folder_name.split('_run_')[1]
            except:
                pass
        
        print("=" * 80)
        print(f"[{idx}/{len(files)}] Processing Alpha={alpha}, Run={run_id}")
        print(f"Using fixed Eta={eta} for Dirichlet smoothing")
        print("=" * 80)
        
        try:
            # Read word distribution data
            word_df = pd.read_csv(file_path)
            word_df.columns = [col.strip("'\" ") for col in word_df.columns]
            
            # Get last iteration data
            max_iteration = word_df['iteration'].max()
            last_iteration_data = word_df[word_df['iteration'] == max_iteration]
            
            # Get complete vocabulary
            all_words = sorted(list(last_iteration_data['word'].dropna().unique()))
            
            # Read entropy file to get layer information
            entropy_file = os.path.join(folder_path, 'corrected_renyi_entropy.csv')
            if not os.path.exists(entropy_file):
                print(f"⚠️  Entropy file not found, skipping this file")
                continue
                
            entropy_df = pd.read_csv(entropy_file)
            
            # Basic information
            print(f"📊 Basic Information:")
            print(f"   Vocabulary size: {len(all_words)}")
            print(f"   Last iteration: {max_iteration}")
            
            # Group nodes by layer
            layers = entropy_df.groupby('layer')['node_id'].apply(list).to_dict()
            print(f"   Layer distribution: {[(layer, len(nodes)) for layer, nodes in layers.items()]}")
            
            # Build probability distributions for each node
            print(f"🔄 Building probability distributions...")
            node_distributions = {}
            
            for node_id in entropy_df['node_id'].unique():
                # Get word distribution for this node
                node_words = last_iteration_data[last_iteration_data['node_id'] == node_id]
                
                # Initialize count vector
                counts = np.zeros(len(all_words))
                word_to_idx = {word: idx for idx, word in enumerate(all_words)}
                
                # Fill actual counts
                for _, row in node_words.iterrows():
                    word = row['word']
                    if pd.notna(word) and word in word_to_idx:
                        counts[word_to_idx[word]] = row['count']
                
                # Revised: Use fixed eta value for Dirichlet smoothing
                smoothed_counts = counts + eta  # Use eta=0.05 instead of alpha
                
                # Calculate probability distribution
                probabilities = smoothed_counts / np.sum(smoothed_counts)
                node_distributions[node_id] = probabilities
            
            print(f"   ✓ Completed {len(node_distributions)} node probability distributions")
            
            # Calculate JS distances and weighted average entropy within each layer
            all_js_distances = []
            layer_avg_distances = []
            
            print(f"📐 Calculating JS distances...")
            for layer, layer_nodes in layers.items():
                layer_js_distances = []
                n = len(layer_nodes)
                
                # Calculate JS distances for all node pairs within this layer
                for i, node1 in enumerate(layer_nodes):
                    for j, node2 in enumerate(layer_nodes):
                        if i < j:  # Only calculate upper triangular matrix, avoid duplicates and self-comparison
                            if node1 in node_distributions and node2 in node_distributions:
                                p = node_distributions[node1]
                                q = node_distributions[node2]
                                
                                # Calculate Jensen-Shannon distance
                                js_distance = jensen_shannon_distance(p, q)
                                
                                layer_js_distances.append({
                                    'layer': layer,
                                    'node1_id': node1,
                                    'node2_id': node2,
                                    'js_distance': js_distance,
                                    'node1_doc_count': entropy_df[entropy_df['node_id'] == node1]['document_count'].iloc[0] if len(entropy_df[entropy_df['node_id'] == node1]) > 0 else 0,
                                    'node2_doc_count': entropy_df[entropy_df['node_id'] == node2]['document_count'].iloc[0] if len(entropy_df[entropy_df['node_id'] == node2]) > 0 else 0
                                })
                
                all_js_distances.extend(layer_js_distances)
                
                # Calculate average JS distance for this layer
                avg_js_distance = 0.0
                if layer_js_distances and n > 1:
                    total_js_distance = sum(d['js_distance'] for d in layer_js_distances)
                    max_pairs = n * (n - 1) // 2
                    avg_js_distance = total_js_distance / max_pairs
                
                # Calculate document-weighted average Renyi entropy for this layer
                layer_entropy_data = entropy_df[entropy_df['layer'] == layer]
                total_docs = layer_entropy_data['document_count'].sum()
                
                if total_docs > 0:
                    weighted_entropy = (layer_entropy_data['document_count'] * layer_entropy_data['renyi_entropy_corrected']).sum() / total_docs
                else:
                    weighted_entropy = 0.0
                
                layer_avg_distances.append({
                    'layer': layer,
                    'node_count': n,
                    'total_pairs': len(layer_js_distances),
                    'max_pairs': n * (n - 1) // 2 if n > 1 else 0,
                    'sum_js_distance': sum(d['js_distance'] for d in layer_js_distances),
                    'avg_js_distance': avg_js_distance,
                    'total_documents': total_docs,
                    'weighted_avg_renyi_entropy': weighted_entropy,
                    'eta_used': eta,  # Revised: record actually used eta value
                    'alpha_folder': alpha  # Revised: record folder's alpha value
                })
                
                # Concise layer statistics output
                print(f"   Layer {layer}: {n}nodes, JS={avg_js_distance:.4f}, entropy={weighted_entropy:.4f}")
            
            # Save result files
            if all_js_distances:
                js_df = pd.DataFrame(all_js_distances)
                output_path = os.path.join(folder_path, 'jensen_shannon_distances.csv')
                js_df.to_csv(output_path, index=False)
            
            if layer_avg_distances:
                avg_df = pd.DataFrame(layer_avg_distances)
                avg_output_path = os.path.join(folder_path, 'layer_average_js_distances.csv')
                avg_df.to_csv(avg_output_path, index=False)
            
            print(f"💾 Results saved")
            
        except Exception as e:
            print(f"❌ Processing failed: {str(e)}")
    
    print("\n" + "=" * 80)
    print("✅ All files processed!")
    print("=" * 80)

In [7]:
import numpy as np
import os
import glob
import pandas as pd 

# Main function: Calculate Jensen-Shannon distances and weighted average Renyi entropy
base_path = "/Volumes/My Passport/收敛结果/step3"  # Root directory

print("=" * 50)
print("Step3: Starting calculation of Jensen-Shannon distances and weighted average Renyi entropy (automatically adjusted by alpha value)...")
print("=" * 50)
calculate_jensen_shannon_distances_with_weighted_entropy_by_alpha(base_path)

print("=" * 50)
print("Starting aggregation of layer statistics by alpha value...")
print("=" * 50)
# Add the missing aggregation function call
aggregate_layer_statistics_by_alpha(base_path)

print("=" * 50)
print("Step3: Jensen-Shannon distances and weighted average Renyi entropy calculation completed!")
print("=" * 50)

Step3: Starting calculation of Jensen-Shannon distances and weighted average Renyi entropy (automatically adjusted by alpha value)...
Found 17 word distribution files to process
File distribution:
  Alpha 0.01: 3 files
  Alpha 0.05: 3 files
  Alpha 0.1: 3 files
  Alpha 0.2: 3 files
  Alpha 0.5: 3 files
  Alpha 1.0: 2 files
Using fixed Eta value: 0.05

[1/17] Processing Alpha=1.0, Run=3
Using fixed Eta=0.05 for Dirichlet smoothing
📊 Basic Information:
   Vocabulary size: 1490
   Last iteration: 265
   Layer distribution: [(0, 1), (1, 64), (2, 242)]
🔄 Building probability distributions...
   ✓ Completed 307 node probability distributions
📐 Calculating JS distances...
   Layer 0: 1nodes, JS=0.0000, entropy=4.9219
   Layer 1: 64nodes, JS=0.6081, entropy=4.6239
   Layer 2: 242nodes, JS=0.5566, entropy=4.5810
💾 Results saved
[2/17] Processing Alpha=1.0, Run=2
Using fixed Eta=0.05 for Dirichlet smoothing
📊 Basic Information:
   Vocabulary size: 1490
   Last iteration: 245
   Layer distributio

In [8]:
import os, glob, math
import numpy as np
import pandas as pd

# =========================
# 1) JSD (ln basis, returns distance)
# =========================
def jensen_shannon_distance(p, q, eps=1e-12):
    """
    Jensen–Shannon distance (natural logarithm ln basis, returns distance = sqrt(JSD_divergence)).
    p, q: 1D probability vectors (can be non-normalized, function will normalize internally)
    """
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)

    # Normalize + numerical smoothing
    p = np.clip(p, eps, None); p = p / p.sum()
    q = np.clip(q, eps, None); q = q / q.sum()
    m = 0.5 * (p + q)

    def _kl(a, b):
        # ln basis
        return np.sum(a * (np.log(a) - np.log(b)))

    js_div = 0.5 * _kl(p, m) + 0.5 * _kl(q, m)  # JSD (divergence, ln)
    js_dist = math.sqrt(js_div)                 # distance
    return js_dist


# ==============================
# 2) Tools: weighted/unweighted statistical functions
# ==============================
def weighted_mean(values, weights, eps=1e-12):
    v = np.asarray(values, dtype=float)
    w = np.asarray(weights, dtype=float)
    s = w.sum()
    return float((v * w).sum() / max(s, eps))

def dict_to_prob_vector(count_df_for_node, vocab, count_col='count', eta=0.05):
    """
    Convert word counts (or weights) for a specific node to probability vector; supports Dirichlet smoothing eta.
    count_df_for_node: DataFrame (subset) containing only (word, count) rows for this node
    vocab: list[str] unified vocabulary ordering
    count_col: count field name (default 'count')
    eta: Dirichlet smoothing (commonly 0.05)
    """
    word_to_idx = {w:i for i, w in enumerate(vocab)}
    vec = np.zeros(len(vocab), dtype=float)
    for _, row in count_df_for_node.iterrows():
        w = row['word']
        if pd.isna(w): 
            continue
        i = word_to_idx.get(w)
        if i is not None:
            vec[i] = float(row[count_col])
    vec = vec + eta
    s = vec.sum()
    if s <= 0:
        return np.full(len(vocab), 1.0/len(vocab))
    return vec / s


# ===========================================================
# 3) Directory-wise calculation: inter-layer parent-child JSD (weighted & unweighted output)
# ===========================================================
def calculate_inter_layer_jensen_shannon_distances_weighted(
    base_path=".",
    count_filename="iteration_node_word_distributions.csv",
    entropy_filename="corrected_renyi_entropy.csv",
    count_col='count',
    eta=0.05,
    default_alpha=0.1,
    weight_field_candidates=('child_token_count', 'child_doc_count'),  # weight field priority
    output_suffix="_weighted"  # output filename suffix
):
    """
    Calculate parent-child JSD between adjacent layers (inter-layer JSD), output both weighted and unweighted statistics.
    - Weights prioritize the first existing column from weight_field_candidates; if none exist, use equal weights.
    - JSD basis: ln (natural logarithm), returns "distance" (sqrt JSD divergence).
    - Probability construction: Add eta Dirichlet smoothing to node word counts then normalize.
    """
    pattern = os.path.join(base_path, "**", count_filename)
    files = glob.glob(pattern, recursive=True)
    print("="*80)
    print("Starting inter-layer Jensen–Shannon distance calculation (weighted version, ln basis, distance)...")
    print("="*80)
    print(f"Found {len(files)} word distribution files to process\n")

    def parse_alpha_from_folder(folder_name, default_val=default_alpha):
        alpha = default_val
        if 'alpha_' in folder_name:
            try:
                alpha_part = folder_name.split('alpha_')[1].split('_')[0]
                alpha = float(alpha_part)
                return alpha
            except Exception:
                pass
        # Fallback: pattern matching by naming convention
        if 'a001' in folder_name: alpha = 0.01
        elif 'a005' in folder_name: alpha = 0.05
        elif 'a02'  in folder_name: alpha = 0.2
        elif 'a05'  in folder_name and 'a005' not in folder_name: alpha = 0.5
        elif 'a1_'  in folder_name or folder_name.split('_')[-1] == 'a1': alpha = 1.0
        elif 'a01'  in folder_name: alpha = 0.1
        return alpha

    for idx, file_path in enumerate(files, 1):
        folder_path = os.path.dirname(file_path)
        folder_name = os.path.basename(folder_path)

        alpha = parse_alpha_from_folder(folder_name, default_val=default_alpha)

        print("\n" + "="*80)
        print(f"[{idx}/{len(files)}] Calculating inter-layer JSD (weighted version): Alpha={alpha}")
        print("="*80)

        try:
            # 1) Read node-word counts
            word_df = pd.read_csv(file_path)
            word_df.columns = [str(c).strip("'\" ") for c in word_df.columns]

            max_iter = word_df['iteration'].max()
            last_iter_df = word_df[word_df['iteration'] == max_iter].copy()

            # Unified vocabulary
            vocab = sorted(list(last_iter_df['word'].dropna().unique()))
            print(f"📊 Basic Information:\n   Vocabulary size: {len(vocab)}\n   Last iteration: {max_iter}")

            # 2) Read layer and parent-child information
            entropy_path = os.path.join(folder_path, entropy_filename)
            if not os.path.exists(entropy_path):
                print("⚠️ Entropy file not found, skipping")
                continue
            ent = pd.read_csv(entropy_path)
            # Expected fields: node_id, layer, child_ids (format "[1, 5, 9]"), document_count, (optional)token_count
            ent.columns = [str(c).strip() for c in ent.columns]
            print(f"   Node count: {ent.shape[0]}")

            # 3) Construct probability distribution φ for each node (based on last iteration)
            node_ids = ent['node_id'].unique().tolist()
            node_phi = {}
            for nid in node_ids:
                sub = last_iter_df[last_iter_df['node_id'] == nid]
                phi = dict_to_prob_vector(sub, vocab, count_col=count_col, eta=eta)
                node_phi[nid] = phi
            print(f"   ✓ Completed {len(node_phi)} node probability distributions (eta={eta})")

            # 4) Calculate JSD for adjacent layer parent-child edges and construct weights
            layers = sorted([l for l in ent['layer'].unique() if l >= 0])
            print("📐 Calculating inter-layer JSD...")
            print(f"   Available layers: {layers}")

            records = []
            for i in range(len(layers) - 1):
                parent_layer = layers[i]
                child_layer  = layers[i + 1]
                print(f"   Calculating Layer {parent_layer} -> Layer {child_layer}")

                parent_nodes = ent[ent['layer'] == parent_layer]
                child_nodes  = ent[ent['layer'] == child_layer]
                # For quick lookup
                child_meta   = child_nodes.set_index('node_id')

                layer_jsd_vals   = []
                layer_weights    = []

                for _, prow in parent_nodes.iterrows():
                    pid = prow['node_id']
                    if pid not in node_phi: 
                        continue

                    # Parse child_ids
                    child_ids_raw = prow.get('child_ids', '')
                    if pd.isna(child_ids_raw) or str(child_ids_raw).strip() == '':
                        continue
                    try:
                        cid_list = str(child_ids_raw).strip('[]')
                        child_ids = [int(x.strip()) for x in cid_list.split(',') if str(x).strip()!='']
                    except Exception:
                        child_ids = []

                    for cid in child_ids:
                        if cid not in node_phi: 
                            continue
                        # Confirm child node layer is correct
                        if cid in child_meta.index and int(child_meta.loc[cid, 'layer']) != child_layer:
                            continue

                        # Get φ and calculate JSD distance (ln basis)
                        jsd_dist = jensen_shannon_distance(node_phi[pid], node_phi[cid])

                        # Select weight
                        w = 1.0
                        # Prioritize first existing field from candidates
                        if cid in child_meta.index:
                            for wf in weight_field_candidates:
                                if wf in child_meta.columns:
                                    val = child_meta.loc[cid, wf]
                                    if pd.notna(val):
                                        w = float(val)
                                        break
                            # If none exist, try parent/child document_count as approximation
                            if w == 1.0:
                                # child_doc_count priority, then parent_doc_count
                                if 'document_count' in child_meta.columns:
                                    val = child_meta.loc[cid, 'document_count']
                                    if pd.notna(val): 
                                        w = float(val)
                                if (w == 1.0) and ('document_count' in parent_nodes.columns):
                                    pv = prow.get('document_count', np.nan)
                                    if pd.notna(pv):
                                        w = float(pv)

                        # Record
                        rec = {
                            'parent_layer': int(parent_layer),
                            'child_layer':  int(child_layer),
                            'parent_node_id': int(pid),
                            'child_node_id':  int(cid),
                            'js_distance':    float(jsd_dist),
                            'weight_edge':    float(w),
                            'alpha':          float(alpha),
                            'eta_used':       float(eta),
                        }
                        # For convenience, also include doc counts (if exist)
                        if 'document_count' in parent_nodes.columns:
                            rec['parent_doc_count'] = float(prow.get('document_count', np.nan))
                        if cid in child_meta.index and 'document_count' in child_meta.columns:
                            rec['child_doc_count'] = float(child_meta.loc[cid, 'document_count'])
                        if cid in child_meta.index and weight_field_candidates[0] in child_meta.columns:
                            rec[weight_field_candidates[0]] = float(child_meta.loc[cid, weight_field_candidates[0]])
                        layer_jsd_vals.append(jsd_dist)
                        layer_weights.append(w)
                        records.append(rec)

                # Layer-wise statistics
                if layer_jsd_vals:
                    simple_avg = float(np.mean(layer_jsd_vals))
                    weighted_avg = weighted_mean(layer_jsd_vals, layer_weights)
                    std_val = float(np.std(layer_jsd_vals))
                    print(f"     📊 Layer {parent_layer}->{child_layer}: "
                          f"{len(layer_jsd_vals)} pairs, simple mean={simple_avg:.4f}, weighted mean={weighted_avg:.4f}, σ={std_val:.4f}")

            # 5) Output details and layer summary
            if records:
                inter_df = pd.DataFrame(records)
                out_detail = os.path.join(folder_path, f'inter_layer_jensen_shannon_distances{output_suffix}.csv')
                inter_df.to_csv(out_detail, index=False)

                # Layer summary
                sums = []
                for pl in sorted(inter_df['parent_layer'].unique()):
                    cl = pl + 1
                    sub = inter_df[(inter_df['parent_layer']==pl) & (inter_df['child_layer']==cl)]
                    if sub.empty: 
                        continue
                    sa = sub['js_distance'].mean()
                    wa = weighted_mean(sub['js_distance'].values, sub['weight_edge'].values)
                    med = sub['js_distance'].median()
                    std = sub['js_distance'].std()
                    mn = sub['js_distance'].min()
                    mx = sub['js_distance'].max()
                    sums.append({
                        'parent_layer': pl,
                        'child_layer':  cl,
                        'pair_count':   int(len(sub)),
                        'simple_avg_js_distance':   float(sa),
                        'weighted_avg_js_distance': float(wa),
                        'std_js_distance':           float(std),
                        'median_js_distance':        float(med),
                        'min_js_distance':           float(mn),
                        'max_js_distance':           float(mx),
                        'alpha': float(alpha),
                        'eta_used': float(eta),
                    })
                sum_df = pd.DataFrame(sums)
                out_summary = os.path.join(folder_path, f'inter_layer_js_summary{output_suffix}.csv')
                sum_df.to_csv(out_summary, index=False)

                print("💾 Inter-layer JSD results saved:")
                print(f"   Detailed results: {out_detail}")
                print(f"   Summary results: {out_summary}")
                print("📊 Inter-layer JSD layer summary (weighted + unweighted):")
                for _, row in sum_df.iterrows():
                    pl, cl = int(row['parent_layer']), int(row['child_layer'])
                    pc     = int(row['pair_count'])
                    sa     = row['simple_avg_js_distance']
                    wa     = row['weighted_avg_js_distance']
                    sd     = row['std_js_distance']
                    md     = row['median_js_distance']
                    print(f"   L{pl}->{cl}: {pc} pairs, simple mean={sa:.4f}, weighted mean={wa:.4f}, median={md:.4f}, σ={sd:.4f}")

        except Exception as e:
            import traceback
            print(f"❌ Processing failed: {e}")
            traceback.print_exc()

    print("\n✅ Inter-layer JSD calculation completed! (weighted version, ln basis)")


# ===========================================================
# 4) Aggregate by alpha (read *weighted.csv)
# ===========================================================
def aggregate_inter_layer_jsd_by_alpha_weighted(
    base_path=".",
    summary_filename_pattern="inter_layer_js_summary_weighted.csv",
    output_overall='alpha_inter_layer_jsd_comparison_weighted.csv'
):
    """
    Aggregate all inter_layer_js_summary_weighted.csv files from all directories, group by alpha & layer pairs,
    output mean and standard deviation of (simple mean/weighted mean).
    """
    pattern = os.path.join(base_path, "**", summary_filename_pattern)
    files = glob.glob(pattern, recursive=True)
    print("="*80)
    print("Starting aggregation of inter-layer JSD by alpha value (weighted version)...")
    print("="*80)
    print(f"🔍 Found {len(files)} inter-layer JSD summary files (weighted)")

    all_rows = []
    for f in files:
        try:
            df = pd.read_csv(f)
            # Infer alpha & run_id (adjust according to your directory structure)
            folder = os.path.dirname(f)
            folder_name = os.path.basename(folder)
            parent_folder = os.path.dirname(folder)

            # Parse alpha
            alpha = None
            if 'alpha_' in folder_name:
                try:
                    alpha = float(folder_name.split('alpha_')[1].split('_')[0])
                except Exception:
                    pass
            if alpha is None:
                # Fallback
                if 'a001' in folder_name: alpha = 0.01
                elif 'a005' in folder_name: alpha = 0.05
                elif 'a02'  in folder_name: alpha = 0.2
                elif 'a05'  in folder_name and 'a005' not in folder_name: alpha = 0.5
                elif 'a1_'  in folder_name or folder_name.split('_')[-1]=='a1': alpha = 1.0
                elif 'a01'  in folder_name: alpha = 0.1

            # Parse run_id (adjust according to your naming)
            run_id = None
            if '_run_' in folder_name:
                run_id = folder_name.split('_run_')[-1]

            for _, r in df.iterrows():
                all_rows.append({
                    'alpha': alpha,
                    'parent_layer': r['parent_layer'],
                    'child_layer':  r['child_layer'],
                    'pair_count':   r['pair_count'],
                    'simple_avg_js_distance':   r['simple_avg_js_distance'],
                    'weighted_avg_js_distance': r['weighted_avg_js_distance'],
                    'std_js_distance':          r['std_js_distance'],
                    'median_js_distance':       r['median_js_distance'],
                    'min_js_distance':          r['min_js_distance'],
                    'max_js_distance':          r['max_js_distance'],
                    'eta_used':                 r.get('eta_used', np.nan),
                    'run_id': run_id,
                    'parent_folder': parent_folder
                })
        except Exception as e:
            print(f"Error reading {f}: {e}")

    if not all_rows:
        print("No valid data found, ending.")
        return

    big = pd.DataFrame(all_rows)

    print("="*70)
    print("Inter-layer JSD summary by ALPHA (weighted + unweighted)")
    print("="*70)

    # Aggregate by alpha & layer pairs
    grouped = big.groupby(['alpha', 'parent_layer', 'child_layer'])
    summary = grouped.agg({
        'simple_avg_js_distance':   ['mean', 'std', 'count'],
        'weighted_avg_js_distance': ['mean', 'std', 'count'],
        'median_js_distance':       ['mean', 'std'],
        'pair_count':               'mean',
    }).round(6)

    # Flatten column names
    summary.columns = ['_'.join(col) if isinstance(col, tuple) else col for col in summary.columns]
    summary = summary.reset_index()

    # Save
    out_overall = os.path.join(base_path, output_overall)
    summary.to_csv(out_overall, index=False)
    print(f"\nOverall comparison file saved to: {out_overall}")

    # Brief display
    for _, row in summary.iterrows():
        alpha = row['alpha']
        pl, cl = int(row['parent_layer']), int(row['child_layer'])
        sa_m, sa_s, sa_n = row['simple_avg_js_distance_mean'], row['simple_avg_js_distance_std'], int(row['simple_avg_js_distance_count'])
        wa_m, wa_s, wa_n = row['weighted_avg_js_distance_mean'], row['weighted_avg_js_distance_std'], int(row['weighted_avg_js_distance_count'])
        print(f"  α={alpha} | L{pl}->{cl}: "
              f"simple mean={sa_m:.4f}±{(sa_s or 0):.4f} (n={sa_n}), "
              f"weighted mean={wa_m:.4f}±{(wa_s or 0):.4f} (n={wa_n})")

In [9]:
calculate_inter_layer_jensen_shannon_distances_weighted(
    base_path="/Volumes/My Passport/收敛结果/step3",      # Top-level path
    count_filename="iteration_node_word_distributions.csv",
    entropy_filename="corrected_renyi_entropy.csv",
    count_col="count",                    # Your count field
    eta=0.05,                             # Consistent with experiment
    default_alpha=0.1,
    weight_field_candidates=("child_token_count","child_doc_count"),
    output_suffix="_weighted"             # Output filename suffix
)

Starting inter-layer Jensen–Shannon distance calculation (weighted version, ln basis, distance)...
Found 17 word distribution files to process


[1/17] Calculating inter-layer JSD (weighted version): Alpha=1.0
📊 Basic Information:
   Vocabulary size: 1490
   Last iteration: 265
   Node count: 307
   ✓ Completed 307 node probability distributions (eta=0.05)
📐 Calculating inter-layer JSD...
   Available layers: [0, 1, 2]
   Calculating Layer 0 -> Layer 1
     📊 Layer 0->1: 64 pairs, simple mean=0.6598, weighted mean=0.6508, σ=0.0161
   Calculating Layer 1 -> Layer 2
     📊 Layer 1->2: 242 pairs, simple mean=0.6084, weighted mean=0.6265, σ=0.0610
💾 Inter-layer JSD results saved:
   Detailed results: /Volumes/My Passport/收敛结果/step3/step3_d3_g005_e005_a1/depth_3_gamma_0.05_eta_0.05_alpha_1_run_3/inter_layer_jensen_shannon_distances_weighted.csv
   Summary results: /Volumes/My Passport/收敛结果/step3/step3_d3_g005_e005_a1/depth_3_gamma_0.05_eta_0.05_alpha_1_run_3/inter_layer_js_summary_weighted.

In [10]:
aggregate_inter_layer_jsd_by_alpha_weighted(
    base_path="/Volumes/My Passport/收敛结果/step3",
    summary_filename_pattern="inter_layer_js_summary_weighted.csv",
    output_overall="alpha_inter_layer_jsd_comparison_weighted.csv"
)

Starting aggregation of inter-layer JSD by alpha value (weighted version)...
🔍 Found 18 inter-layer JSD summary files (weighted)
Inter-layer JSD summary by ALPHA (weighted + unweighted)

Overall comparison file saved to: /Volumes/My Passport/收敛结果/step3/alpha_inter_layer_jsd_comparison_weighted.csv
  α=0.01 | L0->1: simple mean=0.6275±0.0033 (n=3), weighted mean=0.5373±0.0359 (n=3)
  α=0.01 | L1->2: simple mean=0.5752±0.0083 (n=3), weighted mean=0.6042±0.0049 (n=3)
  α=0.05 | L0->1: simple mean=0.6303±0.0032 (n=3), weighted mean=0.5938±0.0158 (n=3)
  α=0.05 | L1->2: simple mean=0.5758±0.0101 (n=3), weighted mean=0.6119±0.0048 (n=3)
  α=0.1 | L0->1: simple mean=0.6461±0.0031 (n=3), weighted mean=0.6247±0.0163 (n=3)
  α=0.1 | L1->2: simple mean=0.5995±0.0008 (n=3), weighted mean=0.6292±0.0089 (n=3)
  α=0.2 | L0->1: simple mean=0.6536±0.0019 (n=3), weighted mean=0.6277±0.0254 (n=3)
  α=0.2 | L1->2: simple mean=0.6018±0.0015 (n=3), weighted mean=0.6282±0.0194 (n=3)
  α=0.5 | L0->1: simple m

In [11]:
import pandas as pd
import numpy as np
import os
import glob
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary

def calculate_standard_coherence_from_corpus_corrected(corpus, word_distributions_df, top_k=15):
    """
    Corrected version: Calculate comprehensive node-level and global-level coherence metrics (adapted for step3)
    """
    
    print(f"📊 Preparing to calculate standard coherence metrics...")
    print(f"   Corpus document count: {len(corpus)}")
    print(f"   Node count: {word_distributions_df['node_id'].nunique()}")
    
    # 1. Prepare texts and dictionary
    texts = list(corpus.values())
    dictionary = Dictionary(texts)
    
    print(f"   Total documents: {len(texts)}")
    print(f"   Dictionary size: {len(dictionary)}")
    
    # 2. Prepare topics and node mapping
    topics = []
    node_topic_mapping = {}
    node_to_topic_idx = {}  # New: Direct mapping from node ID to topic index
    
    topic_idx = 0
    for node_id in word_distributions_df['node_id'].unique():
        node_data = word_distributions_df[word_distributions_df['node_id'] == node_id]
        top_words = node_data.nlargest(top_k, 'count')['word'].tolist()
        
        valid_words = []
        for word in top_words:
            if pd.notna(word) and word in dictionary.token2id:
                valid_words.append(word)
        
        if len(valid_words) >= 2:
            topics.append(valid_words)
            node_topic_mapping[node_id] = valid_words
            node_to_topic_idx[node_id] = topic_idx  # Direct mapping
            topic_idx += 1
    
    print(f"   Valid topic count: {len(topics)}")
    
    if len(topics) == 0:
        return {}, {}, {}
    
    # 3. Calculate all coherence metrics (global + per-topic)
    coherence_measures = ['c_npmi', 'c_v', 'u_mass']
    global_coherence = {}
    per_topic_coherence = {}
    
    for measure in coherence_measures:
        try:
            print(f"   Calculating {measure}...")
            
            cm = CoherenceModel(
                topics=topics,
                texts=texts,
                dictionary=dictionary,
                coherence=measure,
                processes=1
            )
            
            # Global average coherence
            global_score = cm.get_coherence()
            global_coherence[measure] = global_score
            
            # Per-topic coherence
            per_topic_scores = cm.get_coherence_per_topic()
            per_topic_coherence[measure] = per_topic_scores
            
            print(f"   ✓ {measure}: Global={global_score:.4f}, Range=[{min(per_topic_scores):.4f}, {max(per_topic_scores):.4f}]")
            
        except Exception as e:
            print(f"   ❌ Error calculating {measure}: {e}")
            global_coherence[measure] = 0.0
            per_topic_coherence[measure] = [0.0] * len(topics)
    
    return global_coherence, per_topic_coherence, node_to_topic_idx

def process_coherence_with_original_corpus_corrected_step3(base_path=".", corpus=None, top_k=15):
    """
    Corrected version: Comprehensive calculation of node-level and global-level coherence metrics (adapted for step3 alpha parameters)
    """
    
    if corpus is None:
        print("❌ Must provide original corpus")
        return
    
    pattern = os.path.join(base_path, "**", "iteration_node_word_distributions.csv")
    files = glob.glob(pattern, recursive=True)
    
    print(f"🔍 Found {len(files)} word distribution files to process")
    
    for idx, file_path in enumerate(files, 1):
        folder_path = os.path.dirname(file_path)
        folder_name = os.path.basename(folder_path)
        
        # Extract alpha value from folder name (adapted for step3)
        alpha = 0.1  # Default value
        if 'alpha_' in folder_name:
            try:
                alpha_part = folder_name.split('alpha_')[1].split('_')[0]
                alpha = float(alpha_part)
            except (IndexError, ValueError):
                # Pattern matching through folder name
                if 'a001' in folder_name:
                    alpha = 0.01
                elif 'a005' in folder_name:
                    alpha = 0.05
                elif 'a02' in folder_name:
                    alpha = 0.2
                elif 'a05' in folder_name and 'a005' not in folder_name:
                    alpha = 0.5
                elif 'a1_' in folder_name or 'a1' in folder_name.split('_')[-1]:
                    alpha = 1.0
                elif 'a01' in folder_name:
                    alpha = 0.1
        else:
            # Pattern matching through folder name
            if 'a001' in folder_name:
                alpha = 0.01
            elif 'a005' in folder_name:
                alpha = 0.05
            elif 'a02' in folder_name:
                alpha = 0.2
            elif 'a05' in folder_name and 'a005' not in folder_name:
                alpha = 0.5
            elif 'a1_' in folder_name or 'a1' in folder_name.split('_')[-1]:
                alpha = 1.0
            elif 'a01' in folder_name:
                alpha = 0.1
        
        # Fixed parameters (adapted for step3)
        eta = 0.05  # eta is fixed at 0.05 in step3
        gamma = 0.05  # Fixed value
        depth = 3  # Fixed value
        
        print(f"\n{'='*80}")
        print(f"[{idx}/{len(files)}] Processing file: {folder_name}")
        print(f"Parameters - Alpha: {alpha}, Eta: {eta}, Gamma: {gamma}, Depth: {depth}")
        print(f"{'='*80}")
        
        try:
            # Read data
            df = pd.read_csv(file_path)
            df.columns = [col.strip("'\" ") for col in df.columns]
            
            max_iteration = df['iteration'].max()
            last_iteration_data = df[df['iteration'] == max_iteration]
            
            print(f"📈 Last iteration: {max_iteration}")
            print(f"📈 Node count: {last_iteration_data['node_id'].nunique()}")
            
            # Calculate coherence (corrected version)
            global_coherence, per_topic_coherence, node_to_topic_idx = calculate_standard_coherence_from_corpus_corrected(
                corpus, last_iteration_data, top_k=top_k
            )
            
            if not global_coherence:
                print("⚠️ Coherence calculation failed, skipping this file")
                continue
            
            # Prepare data for saving
            results_data = []
            
            for node_id in last_iteration_data['node_id'].unique():
                node_words = last_iteration_data[last_iteration_data['node_id'] == node_id]
                top_words = node_words.nlargest(top_k, 'count')['word'].tolist()
                top_words = [word for word in top_words if pd.notna(word)]
                
                # Get coherence metrics for this node (corrected version)
                node_coherence_scores = {}
                
                if node_id in node_to_topic_idx:
                    # Get metrics directly through index
                    topic_idx = node_to_topic_idx[node_id]
                    
                    for measure in ['c_npmi', 'c_v', 'u_mass']:
                        if measure in per_topic_coherence:
                            measure_name = measure.replace('c_', '') if measure.startswith('c_') else measure
                            node_coherence_scores[f'node_{measure_name}'] = per_topic_coherence[measure][topic_idx]
                        else:
                            measure_name = measure.replace('c_', '') if measure.startswith('c_') else measure
                            node_coherence_scores[f'node_{measure_name}'] = 0.0
                else:
                    # If node is not in mapping, set to 0
                    for measure in ['npmi', 'v', 'u_mass']:
                        node_coherence_scores[f'node_{measure}'] = 0.0
                
                results_data.append({
                    'node_id': node_id,
                    'alpha': alpha,  # Corrected: use alpha instead of eta
                    'eta': eta,  # Record fixed eta value
                    'gamma': gamma, 
                    'depth': depth,
                    'top_k': top_k,
                    'top_words': ', '.join(top_words[:10]),
                    'word_count': len(top_words),
                    
                    # Node-level coherence metrics (corrected)
                    'node_npmi': node_coherence_scores.get('node_npmi', 0.0),
                    'node_c_v': node_coherence_scores.get('node_v', 0.0),
                    'node_u_mass': node_coherence_scores.get('node_u_mass', 0.0),
                    
                    # Global-level coherence metrics
                    'global_npmi': global_coherence.get('c_npmi', 0.0),
                    'global_c_v': global_coherence.get('c_v', 0.0),
                    'global_u_mass': global_coherence.get('u_mass', 0.0),
                    
                    'iteration': max_iteration
                })
            
            # Save results
            results_df = pd.DataFrame(results_data)
            output_path = os.path.join(folder_path, 'standard_coherence.csv')
            results_df.to_csv(output_path, index=False)
            
            print(f"💾 Standard coherence results saved to: {output_path}")
            print(f"📊 Results summary:")
            print(f"   - Global NPMI: {global_coherence.get('c_npmi', 0.0):.4f}")
            print(f"   - Global C_V: {global_coherence.get('c_v', 0.0):.4f}")
            print(f"   - Global U_Mass: {global_coherence.get('u_mass', 0.0):.4f}")
            
            # Display node-level metric ranges
            if len(results_df) > 0:
                print(f"   - Node NPMI range: [{results_df['node_npmi'].min():.4f}, {results_df['node_npmi'].max():.4f}]")
                print(f"   - Node C_V range: [{results_df['node_c_v'].min():.4f}, {results_df['node_c_v'].max():.4f}]")
                print(f"   - Node U_Mass range: [{results_df['node_u_mass'].min():.4f}, {results_df['node_u_mass'].max():.4f}]")
            
        except Exception as e:
            import traceback
            print(f"❌ Error processing file {file_path}: {str(e)}")
            traceback.print_exc()
    
    print(f"\n✅ Standard coherence calculation for all files completed!")

In [12]:
""" 0. Setup part: import necessary libraries and set up environment """

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from collections import Counter, defaultdict
import numpy as np
import math
import copy
import itertools
import matplotlib.pyplot as plt
import matplotlib as mpl

import joblib
from joblib import Parallel, delayed
from threading import Thread

import os
import pickle
import time

import operator
from functools import reduce
import json
import cProfile

# Download NLTK data one time
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('omw-1.4')
# nltk.download('punkt_tab')
# nltk.download('averaged_perceptron_tagger_eng')

# Chinese character support in matplotlib
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'SimHei', 'DejaVu Sans']  
plt.rcParams['axes.unicode_minus'] = False  


""" 1.1 Data Preprocessing: load data, clean text, lemmatization, remove low-frequency words"""

# Map POS tags to WordNet format, Penn Treebank annotation: fine-grained (45 tags), WordNet annotation: coarse-grained (4 tags: a, v, n, r)
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return 'a'  # adjective
    elif treebank_tag.startswith('V'):
        return 'v'  # verb
    elif treebank_tag.startswith('N'):
        return 'n'  # noun
    elif treebank_tag.startswith('R'):
        return 'r'  # adverb
    else:
        return 'n'  # default noun

# Text cleaning and lemmatization preprocessing function
def clean_and_lemmatize(text):
    if pd.isnull(text):
        return []
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)  # Remove non-alphabetic characters using regex
    tokens = word_tokenize(text)
    tokens = [w for w in tokens if w not in stop_words]
    pos_tags = pos_tag(tokens)
    lemmatized = [lemmatizer.lemmatize(w, get_wordnet_pos(pos)) for w, pos in pos_tags]
    return lemmatized  

#-----------------Load data----------------
data = pd.read_excel('/Volumes/My Passport/收敛结果/step2/papers_CM.xlsx', usecols=['PaperID', 'Abstract', 'Keywords', 'Year'])

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Clean and lemmatize the abstracts
data['Lemmatized_Tokens'] = data['Abstract'].apply(clean_and_lemmatize)

# Count word frequencies
all_tokens = [word for tokens in data['Lemmatized_Tokens'] for word in tokens]
word_counts = Counter(all_tokens)

# Set a minimum frequency threshold for valid words
min_freq = 10
valid_words = set([word for word, freq in word_counts.items() if freq >= min_freq])

# Remove rare words based on frequency threshold
def remove_rare_words(tokens):
    return [word for word in tokens if word in valid_words]

data['Filtered_Tokens'] = data['Lemmatized_Tokens'].apply(remove_rare_words)

# Join tokens back into cleaned abstracts
data['Cleaned_Abstract'] = data['Filtered_Tokens'].apply(lambda x: " ".join(x))

# Create a cleaned DataFrame with relevant columns
cleaned_data = data[['PaperID', 'Year', 'Cleaned_Abstract']]
cleaned_data = cleaned_data[~(cleaned_data['PaperID'] == 57188)] # this paper has no abstract
cleaned_data = cleaned_data.reset_index(drop=True) 
cleaned_data.insert(0, 'Document_ID', range(len(cleaned_data))) 
abstract_list = cleaned_data['Cleaned_Abstract'].apply(lambda x: x.split()).tolist()

corpus = {doc_id: abstract_list for doc_id, abstract_list in enumerate(abstract_list)}
# cleaned_data.to_csv('./data/processed/cleaned_data.xlsx', index=False, encoding='utf-8-sig')

In [13]:
# Delete old incomplete files
def clean_old_coherence_files(base_path="."):
    """Delete old incomplete standard coherence files"""
    pattern = os.path.join(base_path, "**", "standard_coherence.csv")
    files = glob.glob(pattern, recursive=True)
    
    deleted_count = 0
    for file_path in files:
        try:
            os.remove(file_path)
            print(f"✓ Deleted old file: {os.path.basename(os.path.dirname(file_path))}")
            deleted_count += 1
        except Exception as e:
            print(f"❌ Deletion failed: {file_path} - {e}")
    
    print(f"🗑️ Total {deleted_count} old coherence files deleted")

# Execute corrected version calculation
base_path = "/Volumes/My Passport/收敛结果/step3"
top_k = 5

print("=" * 80)
print("🗑️ Cleaning old incomplete files...")
print("=" * 80)
clean_old_coherence_files(base_path)

print("\n" + "=" * 80)
print("🔄 Starting recalculation of complete coherence metrics...")
print("=" * 80)

# Use corrected version function
process_coherence_with_original_corpus_corrected_step3(base_path, corpus, top_k)

print("=" * 80)
print("✅ Corrected version coherence calculation completed!")
print("=" * 80)

🗑️ Cleaning old incomplete files...
✓ Deleted old file: depth_3_gamma_0.05_eta_0.05_alpha_1_run_3
✓ Deleted old file: depth_3_gamma_0.05_eta_0.05_alpha_1_run_2
✓ Deleted old file: depth_3_gamma_0.05_eta_0.05_alpha_0.2_run_3
✓ Deleted old file: depth_3_gamma_0.05_eta_0.05_alpha_0.2_run_1
✓ Deleted old file: depth_3_gamma_0.05_eta_0.05_alpha_0.2_run_2
✓ Deleted old file: depth_3_gamma_0.05_eta_0.05_alpha_0.5_run_3
✓ Deleted old file: depth_3_gamma_0.05_eta_0.05_alpha_0.5_run_1
✓ Deleted old file: depth_3_gamma_0.05_eta_0.05_alpha_0.5_run_2
✓ Deleted old file: depth_3_gamma_0.05_eta_0.05_alpha_0.05_run_3
✓ Deleted old file: depth_3_gamma_0.05_eta_0.05_alpha_0.05_run_1
✓ Deleted old file: depth_3_gamma_0.05_eta_0.05_alpha_0.05_run_2
✓ Deleted old file: depth_3_gamma_0.05_eta_0.05_alpha_0.1_run_2
✓ Deleted old file: depth_3_gamma_0.05_eta_0.05_alpha_0.1_run_3
✓ Deleted old file: depth_3_gamma_0.05_eta_0.05_alpha_0.1_run_1
✓ Deleted old file: depth_3_gamma_0.05_eta_0.05_alpha_0.01_run_3
✓ De

In [14]:
import pandas as pd
import os
import glob

def add_layer_and_document_info_to_coherence(base_path="."):
    """
    Add layer and document_count information from corrected_renyi_entropy.csv to standard_coherence.csv
    
    Parameters:
    base_path: str, root directory of result files
    """
    
    # Find all folders containing standard_coherence.csv
    pattern = os.path.join(base_path, "**", "standard_coherence.csv")
    coherence_files = glob.glob(pattern, recursive=True)
    
    print(f"🔍 Found {len(coherence_files)} standard coherence files to process")
    
    for idx, coherence_file_path in enumerate(coherence_files, 1):
        folder_path = os.path.dirname(coherence_file_path)
        folder_name = os.path.basename(folder_path)
        
        print(f"\n{'='*80}")
        print(f"[{idx}/{len(coherence_files)}] Processing folder: {folder_name}")
        print(f"{'='*80}")
        
        # Check if corresponding corrected_renyi_entropy.csv exists
        entropy_file_path = os.path.join(folder_path, 'corrected_renyi_entropy.csv')
        
        if not os.path.exists(entropy_file_path):
            print(f"⚠️  Corresponding entropy file not found: {entropy_file_path}")
            continue
        
        try:
            # Read both files
            print("📖 Reading files...")
            coherence_df = pd.read_csv(coherence_file_path)
            entropy_df = pd.read_csv(entropy_file_path)
            
            print(f"   Coherence file: {len(coherence_df)} rows")
            print(f"   Entropy file: {len(entropy_df)} rows")
            
            # Check if layer and document_count columns already exist
            existing_cols = coherence_df.columns.tolist()
            has_layer = 'layer' in existing_cols
            has_doc_count = 'document_count' in existing_cols
            
            print(f"   Current columns: {existing_cols}")
            print(f"   Has layer column: {has_layer}")
            print(f"   Has document_count column: {has_doc_count}")
            
            # Create node_id to layer and document_count mappings
            node_layer_map = entropy_df.set_index('node_id')['layer'].to_dict()
            node_doc_count_map = entropy_df.set_index('node_id')['document_count'].to_dict()
            
            print(f"   Mappable nodes: {len(node_layer_map)}")
            
            # Add or update layer column
            if not has_layer:
                coherence_df['layer'] = coherence_df['node_id'].map(node_layer_map)
                print("   ✓ Added layer column")
            else:
                coherence_df['layer'] = coherence_df['node_id'].map(node_layer_map)
                print("   ✓ Updated layer column")
            
            # Add or update document_count column
            if not has_doc_count:
                coherence_df['document_count'] = coherence_df['node_id'].map(node_doc_count_map)
                print("   ✓ Added document_count column")
            else:
                coherence_df['document_count'] = coherence_df['node_id'].map(node_doc_count_map)
                print("   ✓ Updated document_count column")
            
            # Check mapping results
            layer_null_count = coherence_df['layer'].isnull().sum()
            doc_count_null_count = coherence_df['document_count'].isnull().sum()
            
            if layer_null_count > 0:
                print(f"   ⚠️  {layer_null_count} nodes missing layer information")
            
            if doc_count_null_count > 0:
                print(f"   ⚠️  {doc_count_null_count} nodes missing document_count information")
            
            # Display layer distribution statistics
            layer_stats = coherence_df['layer'].value_counts().sort_index()
            print(f"   📊 Layer distribution: {layer_stats.to_dict()}")
            
            # Display document count statistics
            doc_stats = coherence_df['document_count'].describe()
            print(f"   📊 Document count statistics:")
            print(f"      Min: {doc_stats['min']:.0f}")
            print(f"      Max: {doc_stats['max']:.0f}")
            print(f"      Mean: {doc_stats['mean']:.1f}")
            
            # Save updated file
            coherence_df.to_csv(coherence_file_path, index=False)
            print(f"💾 Updated and saved: {coherence_file_path}")
            
            # Display updated column structure
            updated_cols = coherence_df.columns.tolist()
            print(f"   Updated columns: {updated_cols}")
            
        except Exception as e:
            import traceback
            print(f"❌ Error processing file {coherence_file_path}: {str(e)}")
            print("Detailed error information:")
            traceback.print_exc()
    
    print(f"\n✅ Layer and document_count information update completed for all standard coherence files!")

def verify_coherence_files_update(base_path="."):
    """
    Verify update status of standard_coherence.csv files (adapted for step3)
    """
    pattern = os.path.join(base_path, "**", "standard_coherence.csv")
    coherence_files = glob.glob(pattern, recursive=True)
    
    print("🔍 Verifying update results (Step3):")
    print("="*80)
    
    all_have_layer = True
    all_have_doc_count = True
    
    for file_path in coherence_files:
        folder_name = os.path.basename(os.path.dirname(file_path))
        
        try:
            df = pd.read_csv(file_path)
            has_layer = 'layer' in df.columns
            has_doc_count = 'document_count' in df.columns
            
            layer_null = df['layer'].isnull().sum() if has_layer else "No column"
            doc_null = df['document_count'].isnull().sum() if has_doc_count else "No column"
            
            status = "✅" if (has_layer and has_doc_count and layer_null == 0 and doc_null == 0) else "⚠️"
            
            print(f"{status} {folder_name}")
            print(f"   Layer column: {'Yes' if has_layer else 'No'} (null values: {layer_null})")
            print(f"   DocCount column: {'Yes' if has_doc_count else 'No'} (null values: {doc_null})")
            
            if not has_layer:
                all_have_layer = False
            if not has_doc_count:
                all_have_doc_count = False
                
        except Exception as e:
            print(f"❌ {folder_name}: Read failed - {e}")
    
    print("="*80)
    print(f"📋 Summary:")
    print(f"   Total files: {len(coherence_files)}")
    print(f"   All have layer column: {'Yes' if all_have_layer else 'No'}")
    print(f"   All have document_count column: {'Yes' if all_have_doc_count else 'No'}")

In [15]:
# Execute update
base_path = "/Volumes/My Passport/收敛结果/step3"

print("=" * 80)
print("Starting to add layer and document_count information to standard_coherence.csv...")
print("=" * 80)

# Add layer and document_count information
add_layer_and_document_info_to_coherence(base_path)

print("\n" + "=" * 80)
print("Verifying update results...")
print("=" * 80)

# Verify update results
verify_coherence_files_update(base_path)

print("\n" + "=" * 80)
print("✅ Layer and document_count information addition completed!")
print("=" * 80)

Starting to add layer and document_count information to standard_coherence.csv...
🔍 Found 17 standard coherence files to process

[1/17] Processing folder: depth_3_gamma_0.05_eta_0.05_alpha_1_run_3
📖 Reading files...
   Coherence file: 307 rows
   Entropy file: 307 rows
   Current columns: ['node_id', 'alpha', 'eta', 'gamma', 'depth', 'top_k', 'top_words', 'word_count', 'node_npmi', 'node_c_v', 'node_u_mass', 'global_npmi', 'global_c_v', 'global_u_mass', 'iteration']
   Has layer column: False
   Has document_count column: False
   Mappable nodes: 307
   ✓ Added layer column
   ✓ Added document_count column
   📊 Layer distribution: {0: 1, 1: 64, 2: 242}
   📊 Document count statistics:
      Min: 1
      Max: 970
      Mean: 9.5
💾 Updated and saved: /Volumes/My Passport/收敛结果/step3/step3_d3_g005_e005_a1/depth_3_gamma_0.05_eta_0.05_alpha_1_run_3/standard_coherence.csv
   Updated columns: ['node_id', 'alpha', 'eta', 'gamma', 'depth', 'top_k', 'top_words', 'word_count', 'node_npmi', 'node_c

In [16]:
def calculate_coherence_layered_analysis(base_path=".", corpus=None, top_k=15):
    """
    Calculate node coherence metrics and perform weighted hierarchical analysis (adapted for step3)
    """
    
    if corpus is None:
        print("❌ Must provide original corpus")
        return
    
    pattern = os.path.join(base_path, "**", "iteration_node_word_distributions.csv")
    files = glob.glob(pattern, recursive=True)
    
    print(f"🔍 Found {len(files)} word distribution files to process (top_k={top_k})")
    
    for idx, file_path in enumerate(files, 1):
        folder_path = os.path.dirname(file_path)
        folder_name = os.path.basename(folder_path)
        
        # Corrected: Adapt step3 parameter extraction (mainly alpha parameter)
        eta = 0.05  # eta is fixed at 0.05 in step3
        gamma = 0.05
        depth = 3
        alpha = 0.1  # Main varying parameter in step3
        
        # Extract alpha value from folder name (adapted for step3)
        if 'alpha_' in folder_name:
            try:
                alpha_part = folder_name.split('alpha_')[1].split('_')[0]
                alpha = float(alpha_part)
            except (IndexError, ValueError):
                # Pattern matching through folder name
                if 'a001' in folder_name:
                    alpha = 0.01
                elif 'a005' in folder_name:
                    alpha = 0.05
                elif 'a02' in folder_name:
                    alpha = 0.2
                elif 'a05' in folder_name and 'a005' not in folder_name:
                    alpha = 0.5
                elif 'a1_' in folder_name or 'a1' in folder_name.split('_')[-1]:
                    alpha = 1.0
                elif 'a01' in folder_name:
                    alpha = 0.1
        else:
            # Pattern matching through folder name
            if 'a001' in folder_name:
                alpha = 0.01
            elif 'a005' in folder_name:
                alpha = 0.05
            elif 'a02' in folder_name:
                alpha = 0.2
            elif 'a05' in folder_name and 'a005' not in folder_name:
                alpha = 0.5
            elif 'a1_' in folder_name or 'a1' in folder_name.split('_')[-1]:
                alpha = 1.0
            elif 'a01' in folder_name:
                alpha = 0.1
        
        print(f"\n{'='*80}")
        print(f"[{idx}/{len(files)}] Processing file: {folder_name} (k={top_k})")
        print(f"Parameters - Alpha: {alpha}, Eta: {eta}, Gamma: {gamma}, Depth: {depth}")  # Corrected: highlight alpha parameter
        print(f"{'='*80}")
        
        try:
            # Read data
            df = pd.read_csv(file_path)
            df.columns = [col.strip("'\" ") for col in df.columns]
            
            max_iteration = df['iteration'].max()
            last_iteration_data = df[df['iteration'] == max_iteration]
            
            # Read layer and document count information
            entropy_file = os.path.join(folder_path, 'corrected_renyi_entropy.csv')
            if not os.path.exists(entropy_file):
                print("⚠️ Entropy file not found, skipping this file")
                continue
                
            entropy_df = pd.read_csv(entropy_file)
            
            print(f"📈 Last iteration: {max_iteration}")
            print(f"📈 Node count: {last_iteration_data['node_id'].nunique()}")
            
            # Calculate node-level coherence (keep only node-level)
            texts = list(corpus.values())
            dictionary = Dictionary(texts)
            
            topics = []
            node_to_topic_idx = {}
            
            topic_idx = 0
            for node_id in last_iteration_data['node_id'].unique():
                node_data = last_iteration_data[last_iteration_data['node_id'] == node_id]
                top_words = node_data.nlargest(top_k, 'count')['word'].tolist()
                
                valid_words = []
                for word in top_words:
                    if pd.notna(word) and word in dictionary.token2id:
                        valid_words.append(word)
                
                if len(valid_words) >= 2:
                    topics.append(valid_words)
                    node_to_topic_idx[node_id] = topic_idx
                    topic_idx += 1
            
            if len(topics) == 0:
                print("⚠️ No valid topics, skipping this file")
                continue
            
            # Calculate various coherence metrics
            coherence_measures = ['c_npmi', 'c_v', 'u_mass']
            per_topic_coherence = {}
            
            for measure in coherence_measures:
                try:
                    print(f"   Calculating {measure}...")
                    
                    cm = CoherenceModel(
                        topics=topics,
                        texts=texts,
                        dictionary=dictionary,
                        coherence=measure,
                        processes=1
                    )
                    
                    per_topic_scores = cm.get_coherence_per_topic()
                    per_topic_coherence[measure] = per_topic_scores
                    
                    print(f"   ✓ {measure}: Range=[{min(per_topic_scores):.4f}, {max(per_topic_scores):.4f}]")
                    
                except Exception as e:
                    print(f"   ❌ Error calculating {measure}: {e}")
                    per_topic_coherence[measure] = [0.0] * len(topics)
            
            # Merge node-level coherence with layer information
            node_coherence_data = []
            
            for node_id in last_iteration_data['node_id'].unique():
                node_words = last_iteration_data[last_iteration_data['node_id'] == node_id]
                top_words = node_words.nlargest(top_k, 'count')['word'].tolist()
                top_words = [word for word in top_words if pd.notna(word)]
                
                # Get layer and document count information
                node_entropy_info = entropy_df[entropy_df['node_id'] == node_id]
                if len(node_entropy_info) > 0:
                    layer = node_entropy_info['layer'].iloc[0]
                    document_count = node_entropy_info['document_count'].iloc[0]
                else:
                    layer = -1
                    document_count = 0
                
                # Get node coherence scores
                node_coherence_scores = {}
                if node_id in node_to_topic_idx:
                    topic_idx = node_to_topic_idx[node_id]
                    for measure in ['c_npmi', 'c_v', 'u_mass']:
                        if measure in per_topic_coherence:
                            measure_name = measure.replace('c_', '') if measure.startswith('c_') else measure
                            node_coherence_scores[f'node_{measure_name}'] = per_topic_coherence[measure][topic_idx]
                        else:
                            measure_name = measure.replace('c_', '') if measure.startswith('c_') else measure
                            node_coherence_scores[f'node_{measure_name}'] = 0.0
                else:
                    for measure in ['npmi', 'v', 'u_mass']:
                        node_coherence_scores[f'node_{measure}'] = 0.0
                
                node_coherence_data.append({
                    'node_id': node_id,
                    'alpha': alpha,  # Corrected: use alpha as main parameter
                    'eta': eta,  # Record fixed eta value
                    'gamma': gamma, 
                    'depth': depth,
                    'layer': layer,
                    'document_count': document_count,
                    'top_k': top_k,
                    'top_words': ', '.join(top_words[:10]),
                    'word_count': len(top_words),
                    
                    # Keep only node-level coherence metrics
                    'node_npmi': node_coherence_scores.get('node_npmi', 0.0),
                    'node_c_v': node_coherence_scores.get('node_v', 0.0),
                    'node_u_mass': node_coherence_scores.get('node_u_mass', 0.0),
                    
                    'iteration': max_iteration
                })
            
            # Save node-level coherence results (with k value)
            coherence_df = pd.DataFrame(node_coherence_data)
            node_output_path = os.path.join(folder_path, f'node_coherence_k{top_k}.csv')
            coherence_df.to_csv(node_output_path, index=False)
            
            # Calculate layer weighted average coherence
            layer_coherence_summary = []
            
            for layer in coherence_df['layer'].unique():
                if layer == -1:  # Skip invalid layers
                    continue
                    
                layer_data = coherence_df[coherence_df['layer'] == layer]
                total_docs = layer_data['document_count'].sum()
                
                if total_docs > 0:
                    # Document count weighted average
                    weighted_npmi = (layer_data['document_count'] * layer_data['node_npmi']).sum() / total_docs
                    weighted_c_v = (layer_data['document_count'] * layer_data['node_c_v']).sum() / total_docs
                    weighted_u_mass = (layer_data['document_count'] * layer_data['node_u_mass']).sum() / total_docs
                    
                    # Simple average (unweighted)
                    simple_npmi = layer_data['node_npmi'].mean()
                    simple_c_v = layer_data['node_c_v'].mean()
                    simple_u_mass = layer_data['node_u_mass'].mean()
                    
                    layer_coherence_summary.append({
                        'layer': layer,
                        'node_count': len(layer_data),
                        'total_documents': total_docs,
                        'avg_documents_per_node': total_docs / len(layer_data),
                        
                        # Document count weighted average coherence
                        'weighted_avg_npmi': weighted_npmi,
                        'weighted_avg_c_v': weighted_c_v,
                        'weighted_avg_u_mass': weighted_u_mass,
                        
                        # Simple average coherence
                        'simple_avg_npmi': simple_npmi,
                        'simple_avg_c_v': simple_c_v,
                        'simple_avg_u_mass': simple_u_mass,
                        
                        # Standard deviation
                        'std_npmi': layer_data['node_npmi'].std(),
                        'std_c_v': layer_data['node_c_v'].std(),
                        'std_u_mass': layer_data['node_u_mass'].std(),
                        
                        'top_k': top_k,  # Add k value record
                        'alpha': alpha,  # Corrected: use alpha
                        'eta': eta,
                        'gamma': gamma,
                        'depth': depth
                    })
            
            # Save layer summary results (with k value)
            if layer_coherence_summary:
                layer_summary_df = pd.DataFrame(layer_coherence_summary)
                layer_output_path = os.path.join(folder_path, f'layer_coherence_summary_k{top_k}.csv')
                layer_summary_df.to_csv(layer_output_path, index=False)
                
                print(f"💾 Node coherence results saved to: {node_output_path}")
                print(f"💾 Layer summary results saved to: {layer_output_path}")
                
                print(f"📊 Layer coherence summary (k={top_k}):")
                for _, row in layer_summary_df.iterrows():
                    layer_num = int(row['layer'])
                    node_count = int(row['node_count'])
                    w_npmi = row['weighted_avg_npmi']
                    w_cv = row['weighted_avg_c_v']
                    w_umass = row['weighted_avg_u_mass']
                    print(f"   Layer {layer_num} ({node_count} nodes): NPMI={w_npmi:.4f}, C_V={w_cv:.4f}, U_Mass={w_umass:.4f}")
            
        except Exception as e:
            import traceback
            print(f"❌ Error processing file {file_path}: {str(e)}")
            traceback.print_exc()
    
    print(f"\n✅ Coherence layered analysis completed for all files! (k={top_k})")

def aggregate_coherence_by_alpha(base_path=".", top_k=15):
    """
    Aggregate coherence statistics by alpha value for each layer (adapted for step3)
    """
    # Find all layer_coherence_summary_k{top_k}.csv files
    pattern = os.path.join(base_path, "**", f"layer_coherence_summary_k{top_k}.csv")
    files = glob.glob(pattern, recursive=True)
    
    print(f"🔍 Search pattern: layer_coherence_summary_k{top_k}.csv")
    print(f"🔍 Found {len(files)} layer summary files")
    
    all_data = []
    alpha_groups = {}
    
    for file_path in files:
        folder_path = os.path.dirname(file_path)
        folder_name = os.path.basename(folder_path)
        parent_folder = os.path.dirname(folder_path)
        
        # Corrected: Extract alpha value (adapted for step3)
        alpha = None
        if 'alpha_' in folder_name:
            try:
                alpha_part = folder_name.split('alpha_')[1].split('_')[0]
                alpha = float(alpha_part)
            except:
                # Pattern matching through folder name
                if 'a001' in folder_name:
                    alpha = 0.01
                elif 'a005' in folder_name:
                    alpha = 0.05
                elif 'a02' in folder_name:
                    alpha = 0.2
                elif 'a05' in folder_name and 'a005' not in folder_name:
                    alpha = 0.5
                elif 'a1_' in folder_name or 'a1' in folder_name.split('_')[-1]:
                    alpha = 1.0
                elif 'a01' in folder_name:
                    alpha = 0.1
        else:
            # Pattern matching through folder name
            if 'a001' in folder_name:
                alpha = 0.01
            elif 'a005' in folder_name:
                alpha = 0.05
            elif 'a02' in folder_name:
                alpha = 0.2
            elif 'a05' in folder_name and 'a005' not in folder_name:
                alpha = 0.5
            elif 'a1_' in folder_name or 'a1' in folder_name.split('_')[-1]:
                alpha = 1.0
            elif 'a01' in folder_name:
                alpha = 0.1
        
        if alpha is None:
            continue
        
        # Extract run number
        run_match = folder_name.split('_run_')
        if len(run_match) > 1:
            run_id = run_match[1]
        else:
            continue
        
        if alpha not in alpha_groups:
            alpha_groups[alpha] = parent_folder
        
        try:
            df = pd.read_csv(file_path)
            
            for _, row in df.iterrows():
                all_data.append({
                    'alpha': alpha,  # Corrected: use alpha
                    'run_id': run_id,
                    'layer': row['layer'],
                    'node_count': row['node_count'],
                    'total_documents': row['total_documents'],
                    'weighted_avg_npmi': row['weighted_avg_npmi'],
                    'weighted_avg_c_v': row['weighted_avg_c_v'],
                    'weighted_avg_u_mass': row['weighted_avg_u_mass'],
                    'simple_avg_npmi': row['simple_avg_npmi'],
                    'simple_avg_c_v': row['simple_avg_c_v'],
                    'simple_avg_u_mass': row['simple_avg_u_mass'],
                    'top_k': top_k,  # Add k value record
                    'parent_folder': parent_folder
                })
                
        except Exception as e:
            print(f"Error reading file {file_path}: {e}")
    
    # Convert to DataFrame and aggregate by alpha groups
    summary_df = pd.DataFrame(all_data)
    
    if summary_df.empty:
        print("No valid data found")
        return
    
    print("=" * 70)
    print(f"Coherence layer summary statistics by ALPHA value (k={top_k})")  # Corrected: display ALPHA
    print("=" * 70)
    
    # Generate summary files grouped by alpha
    for alpha, group_data in summary_df.groupby('alpha'):
        parent_folder = group_data['parent_folder'].iloc[0]
        
        print(f"\nProcessing Alpha={alpha} (k={top_k})")  # Corrected: display Alpha
        
        layer_summary = group_data.groupby('layer').agg({
            'weighted_avg_npmi': ['mean', 'std', 'count'],
            'weighted_avg_c_v': ['mean', 'std', 'count'],
            'weighted_avg_u_mass': ['mean', 'std', 'count'],
            'simple_avg_npmi': ['mean', 'std'],
            'simple_avg_c_v': ['mean', 'std'],
            'simple_avg_u_mass': ['mean', 'std'],
            'node_count': 'mean',
            'total_documents': 'mean',
            'run_id': lambda x: ', '.join(sorted(x.unique()))
        }).round(4)
        
        # Flatten column names
        layer_summary.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in layer_summary.columns]
        layer_summary = layer_summary.reset_index()
        layer_summary.insert(0, 'alpha', alpha)  # Corrected: use alpha
        layer_summary.insert(1, 'top_k', top_k)  # Add k value column
        
        # Save summary results (filename includes k value and alpha)
        output_filename = f'alpha_{alpha}_coherence_layer_summary_k{top_k}.csv'  # Corrected: use alpha
        output_path = os.path.join(parent_folder, output_filename)
        layer_summary.to_csv(output_path, index=False)
        
        print(f"  Summary file saved: {output_path}")
        print(f"  Number of layers: {len(layer_summary)}")
        
        # Display brief statistics
        for _, row in layer_summary.iterrows():
            layer_num = int(row['layer'])
            w_npmi = row['weighted_avg_npmi_mean']
            w_cv = row['weighted_avg_c_v_mean']
            w_umass = row['weighted_avg_u_mass_mean']
            run_count = int(row['weighted_avg_npmi_count'])
            
            print(f"    Layer {layer_num}: W_NPMI={w_npmi:.4f}, W_C_V={w_cv:.4f}, W_U_Mass={w_umass:.4f}, runs={run_count}")
    
    # Generate overall comparison file (filename includes k value and alpha)
    overall_summary = summary_df.groupby(['alpha', 'layer']).agg({
        'weighted_avg_npmi': ['mean', 'std'],
        'weighted_avg_c_v': ['mean', 'std'],
        'weighted_avg_u_mass': ['mean', 'std'],
        'run_id': 'count'
    }).round(4)
    
    overall_summary.columns = ['_'.join(col).strip() for col in overall_summary.columns]
    overall_summary = overall_summary.reset_index()
    overall_summary.insert(2, 'top_k', top_k)  # Add k value column
    
    overall_output_path = os.path.join(base_path, f'alpha_coherence_layer_comparison_k{top_k}.csv')  # Corrected: use alpha
    overall_summary.to_csv(overall_output_path, index=False)
    print(f"\nOverall comparison file saved: {overall_output_path}")

In [17]:
# Execute streamlined coherence hierarchical analysis (filename includes k value)
base_path = "/Volumes/My Passport/收敛结果/step3"
top_k = 5

print("=" * 80)
print(f"Starting node coherence metric calculation and hierarchical analysis (k={top_k})...")
print("=" * 80)

# Calculate node coherence and layer summary
calculate_coherence_layered_analysis(base_path, corpus, top_k)

print("\n" + "=" * 80)
print(f"Starting aggregation of layer coherence statistics by eta value (k={top_k})...")
print("=" * 80)

# Aggregate by eta (pass top_k parameter)
aggregate_coherence_by_alpha(base_path, top_k)

print("=" * 80)
print(f"✅ Coherence hierarchical analysis completed! (k={top_k})")
print("=" * 80)

Starting node coherence metric calculation and hierarchical analysis (k=5)...
🔍 Found 17 word distribution files to process (top_k=5)

[1/17] Processing file: depth_3_gamma_0.05_eta_0.05_alpha_1_run_3 (k=5)
Parameters - Alpha: 1.0, Eta: 0.05, Gamma: 0.05, Depth: 3
📈 Last iteration: 265
📈 Node count: 307
   Calculating c_npmi...
   ✓ c_npmi: Range=[-0.5374, 0.5347]
   Calculating c_v...
   ✓ c_v: Range=[0.1659, 0.9386]
   Calculating u_mass...
   ✓ u_mass: Range=[-13.0335, -0.5808]
💾 Node coherence results saved to: /Volumes/My Passport/收敛结果/step3/step3_d3_g005_e005_a1/depth_3_gamma_0.05_eta_0.05_alpha_1_run_3/node_coherence_k5.csv
💾 Layer summary results saved to: /Volumes/My Passport/收敛结果/step3/step3_d3_g005_e005_a1/depth_3_gamma_0.05_eta_0.05_alpha_1_run_3/layer_coherence_summary_k5.csv
📊 Layer coherence summary (k=5):
   Layer 0 (1 nodes): NPMI=-0.0013, C_V=0.4948, U_Mass=-0.5808
   Layer 2 (242 nodes): NPMI=0.0459, C_V=0.5337, U_Mass=-2.7186
   Layer 1 (64 nodes): NPMI=0.0520, C_V=

In [18]:
# First run complete perplexity calculation (adapted for step3)
from sklearn.model_selection import train_test_split
import math
import pandas as pd
import numpy as np
import os
import glob

def compute_perplexity_with_path_mapping_fixed(word_data, path_mapping_data, corpus, test_doc_ids, eta_smoothing=0.05):
    """
    Corrected version: Calculate perplexity using path mapping
    
    Parameters:
    word_data: DataFrame, node word distribution data
    path_mapping_data: DataFrame, document path mapping data
    corpus: dict, original corpus {doc_id: [words]}
    test_doc_ids: list, test document ID list
    eta_smoothing: float, smoothing parameter
    
    Returns:
    dict: perplexity calculation results
    """
    
    print(f"🔄 Starting perplexity calculation...")
    print(f"   Test document count: {len(test_doc_ids)}")
    print(f"   Path mapping count: {len(path_mapping_data)}")
    print(f"   Smoothing parameter: {eta_smoothing}")
    
    # Get vocabulary
    vocabulary = sorted(list(word_data['word'].dropna().unique()))
    vocab_size = len(vocabulary)
    word_to_idx = {word: idx for idx, word in enumerate(vocabulary)}
    
    print(f"   Vocabulary size: {vocab_size}")
    
    # Build node word distribution dictionary
    node_word_dist = {}
    for node_id in word_data['node_id'].unique():
        node_words = word_data[word_data['node_id'] == node_id]
        
        # Initialize count vector
        counts = np.zeros(vocab_size)
        
        # Fill word counts
        for _, row in node_words.iterrows():
            word = row['word']
            if pd.notna(word) and word in word_to_idx:
                counts[word_to_idx[word]] = row['count']
        
        # Add smoothing
        smoothed_counts = counts + eta_smoothing
        
        # Calculate probability distribution
        probabilities = smoothed_counts / np.sum(smoothed_counts)
        node_word_dist[node_id] = probabilities
    
    print(f"   Built word distributions for {len(node_word_dist)} nodes")
    
    # Calculate perplexity for test documents
    total_log_likelihood = 0.0
    total_words = 0
    valid_docs = 0
    matched_docs = 0
    doc_perplexities = []
    path_lengths = []
    
    for doc_id in test_doc_ids:
        if doc_id not in corpus:
            continue
            
        # Get document path mapping
        doc_path_mapping = path_mapping_data[path_mapping_data['document_id'] == doc_id]
        
        if len(doc_path_mapping) == 0:
            continue
        
        matched_docs += 1
        
        # Get document words
        doc_words = corpus[doc_id]
        
        if len(doc_words) == 0:
            continue
        
        valid_docs += 1
        
        # Get document path (assume taking the first path)
        if len(doc_path_mapping) > 0:
            path_row = doc_path_mapping.iloc[0]
            
            # Get leaf node from path
            leaf_node_id = path_row['leaf_node_id']
            
            # Record path length
            path_length = 0
            layer_cols = [col for col in path_row.index if col.startswith('layer_') and col.endswith('_node_id')]
            for col in layer_cols:
                if pd.notna(path_row[col]):
                    path_length += 1
            path_lengths.append(path_length)
            
            if leaf_node_id in node_word_dist:
                # Use leaf node word distribution to calculate likelihood
                node_probs = node_word_dist[leaf_node_id]
                
                doc_log_likelihood = 0.0
                doc_word_count = 0
                
                for word in doc_words:
                    if word in word_to_idx:
                        word_idx = word_to_idx[word]
                        word_prob = node_probs[word_idx]
                        
                        if word_prob > 0:
                            doc_log_likelihood += np.log(word_prob)
                            doc_word_count += 1
                
                if doc_word_count > 0:
                    total_log_likelihood += doc_log_likelihood
                    total_words += doc_word_count
                    
                    # Calculate document perplexity
                    doc_perplexity = np.exp(-doc_log_likelihood / doc_word_count)
                    doc_perplexities.append(doc_perplexity)
    
    # Calculate overall perplexity
    if total_words > 0:
        perplexity = np.exp(-total_log_likelihood / total_words)
        avg_doc_perplexity = np.mean(doc_perplexities) if doc_perplexities else 0.0
        match_rate = matched_docs / len(test_doc_ids) if len(test_doc_ids) > 0 else 0.0
        avg_path_length = np.mean(path_lengths) if path_lengths else 0.0
        
        results = {
            'perplexity': perplexity,
            'avg_doc_perplexity': avg_doc_perplexity,
            'log_likelihood': total_log_likelihood,
            'total_words': total_words,
            'valid_docs': valid_docs,
            'matched_docs': matched_docs,
            'match_rate': match_rate,
            'avg_path_length': avg_path_length
        }
        
        print(f"✅ Perplexity calculation completed:")
        print(f"   Matched documents: {matched_docs}/{len(test_doc_ids)}")
        print(f"   Valid documents: {valid_docs}")
        print(f"   Total words: {total_words}")
        print(f"   Perplexity: {perplexity:.4f}")
        
        return results
    
    else:
        print("❌ No valid words for perplexity calculation")
        return None

def calculate_hlda_perplexity_with_path_mapping_step3(base_path=".", corpus=None, test_ratio=0.2, random_state=42):
    """
    Step3 version: hLDA perplexity calculation based on iteration_path_document_mapping.csv
    Adapted for step3 alpha parameter instead of eta parameter
    """
    
    if corpus is None:
        print("❌ Must provide original corpus")
        return
    
    # Split training and test sets
    doc_ids = list(corpus.keys())
    train_ids, test_ids = train_test_split(doc_ids, test_size=test_ratio, random_state=random_state)
    
    print(f"📊 Dataset split:")
    print(f"   Total documents: {len(doc_ids)}")
    print(f"   Training set: {len(train_ids)} documents")
    print(f"   Test set: {len(test_ids)} documents")
    
    pattern = os.path.join(base_path, "**", "iteration_node_word_distributions.csv")
    files = glob.glob(pattern, recursive=True)
    
    print(f"🔍 Found {len(files)} model result files to process")
    
    for idx, file_path in enumerate(files, 1):
        folder_path = os.path.dirname(file_path)
        folder_name = os.path.basename(folder_path)
        
        # Step3 parameter extraction (mainly alpha parameter)
        eta = 0.05  # eta is fixed at 0.05 in step3
        gamma = 0.05  # fixed value
        depth = 3  # fixed value
        alpha = 0.1  # main varying parameter
        
        # Extract alpha value from folder name (adapted for step3)
        if 'alpha_' in folder_name:
            try:
                alpha_part = folder_name.split('alpha_')[1].split('_')[0]
                alpha = float(alpha_part)
            except (IndexError, ValueError):
                # Pattern matching through folder name
                if 'a001' in folder_name:
                    alpha = 0.01
                elif 'a005' in folder_name:
                    alpha = 0.05
                elif 'a02' in folder_name:
                    alpha = 0.2
                elif 'a05' in folder_name and 'a005' not in folder_name:
                    alpha = 0.5
                elif 'a1_' in folder_name or 'a1' in folder_name.split('_')[-1]:
                    alpha = 1.0
                elif 'a01' in folder_name:
                    alpha = 0.1
        else:
            # Pattern matching through folder name
            if 'a001' in folder_name:
                alpha = 0.01
            elif 'a005' in folder_name:
                alpha = 0.05
            elif 'a02' in folder_name:
                alpha = 0.2
            elif 'a05' in folder_name and 'a005' not in folder_name:
                alpha = 0.5
            elif 'a1_' in folder_name or 'a1' in folder_name.split('_')[-1]:
                alpha = 1.0
            elif 'a01' in folder_name:
                alpha = 0.1
        
        print(f"\n{'='*80}")
        print(f"[{idx}/{len(files)}] Calculating perplexity: {folder_name}")
        print(f"Parameters - Alpha: {alpha}, Eta: {eta}, Gamma: {gamma}, Depth: {depth}")  # Corrected: highlight alpha parameter
        print(f"{'='*80}")
        
        try:
            # Read word distribution data
            word_df = pd.read_csv(file_path)
            word_df.columns = [col.strip("'\" ") for col in word_df.columns]
            
            # Read path mapping data
            path_mapping_file = os.path.join(folder_path, 'iteration_path_document_mapping.csv')
            if not os.path.exists(path_mapping_file):
                print("⚠️ Path mapping file not found, skipping this file")
                continue
                
            path_mapping_df = pd.read_csv(path_mapping_file)
            path_mapping_df.columns = [col.strip("'\" ") for col in path_mapping_df.columns]
            
            # Get last iteration data
            max_iteration = word_df['iteration'].max()
            last_word_data = word_df[word_df['iteration'] == max_iteration]
            last_path_mapping_data = path_mapping_df[path_mapping_df['iteration'] == max_iteration]
            
            print(f"📈 Last iteration: {max_iteration}")
            print(f"📈 Node count: {last_word_data['node_id'].nunique()}")
            print(f"📈 Path mapping count: {len(last_path_mapping_data)}")
            
            # Use corrected version to calculate perplexity (using fixed eta=0.05 as smoothing parameter)
            perplexity_results = compute_perplexity_with_path_mapping_fixed(
                last_word_data, 
                last_path_mapping_data, 
                corpus, 
                test_ids, 
                eta  # Use fixed eta=0.05 as smoothing parameter
            )
            
            if perplexity_results is not None:
                # Save perplexity results (corrected: use alpha as main parameter)
                perplexity_data = [{
                    'alpha': alpha,  # Corrected: use alpha instead of eta as main parameter
                    'eta': eta,  # Record fixed eta value
                    'gamma': gamma,
                    'depth': depth,
                    'iteration': max_iteration,
                    'test_docs_count': len(test_ids),
                    'valid_test_docs': perplexity_results['valid_docs'],
                    'matched_docs': perplexity_results['matched_docs'],
                    'total_test_words': perplexity_results['total_words'],
                    'log_likelihood': perplexity_results['log_likelihood'],
                    'perplexity': perplexity_results['perplexity'],
                    'avg_doc_perplexity': perplexity_results['avg_doc_perplexity'],
                    'doc_match_rate': perplexity_results['match_rate'],
                    'avg_path_length': perplexity_results['avg_path_length']
                }]
                
                perplexity_df = pd.DataFrame(perplexity_data)
                output_path = os.path.join(folder_path, 'perplexity_results_step3.csv')
                perplexity_df.to_csv(output_path, index=False)
                
                print(f"💾 Perplexity results saved to: {output_path}")
                print(f"📊 Perplexity results:")
                print(f"   - Overall perplexity: {perplexity_results['perplexity']:.4f}")
                print(f"   - Average document perplexity: {perplexity_results['avg_doc_perplexity']:.4f}")
                print(f"   - Document match rate: {perplexity_results['match_rate']:.1%}")
                print(f"   - Average path length: {perplexity_results['avg_path_length']:.1f}")
                print(f"   - Valid test documents: {perplexity_results['valid_docs']}/{len(test_ids)}")
            
        except Exception as e:
            import traceback
            print(f"❌ Error processing file {file_path}: {str(e)}")
            traceback.print_exc()
    
    print(f"\n✅ Perplexity calculation for all files completed! (Step3)")

def aggregate_perplexity_by_alpha_step3(base_path="."):
    """
    Step3 version: Aggregate average perplexity and other metrics by alpha value across multiple runs
    """
    
    # Find all perplexity_results_step3.csv files
    pattern = os.path.join(base_path, "**", "perplexity_results_step3.csv")
    files = glob.glob(pattern, recursive=True)
    
    print(f"🔍 Found {len(files)} Step3 perplexity result files")
    
    if len(files) == 0:
        print("❌ No Step3 perplexity result files found")
        return
    
    all_data = []
    alpha_groups = {}
    
    for file_path in files:
        folder_path = os.path.dirname(file_path)
        folder_name = os.path.basename(folder_path)
        parent_folder = os.path.dirname(folder_path)
        
        # Extract alpha value (adapted for step3)
        alpha = None
        if 'alpha_' in folder_name:
            try:
                alpha_part = folder_name.split('alpha_')[1].split('_')[0]
                alpha = float(alpha_part)
            except:
                # Pattern matching through folder name
                if 'a001' in folder_name:
                    alpha = 0.01
                elif 'a005' in folder_name:
                    alpha = 0.05
                elif 'a02' in folder_name:
                    alpha = 0.2
                elif 'a05' in folder_name and 'a005' not in folder_name:
                    alpha = 0.5
                elif 'a1_' in folder_name or 'a1' in folder_name.split('_')[-1]:
                    alpha = 1.0
                elif 'a01' in folder_name:
                    alpha = 0.1
        else:
            # Pattern matching through folder name
            if 'a001' in folder_name:
                alpha = 0.01
            elif 'a005' in folder_name:
                alpha = 0.05
            elif 'a02' in folder_name:
                alpha = 0.2
            elif 'a05' in folder_name and 'a005' not in folder_name:
                alpha = 0.5
            elif 'a1_' in folder_name or 'a1' in folder_name.split('_')[-1]:
                alpha = 1.0
            elif 'a01' in folder_name:
                alpha = 0.1
        
        if alpha is None:
            print(f"Warning: Cannot extract alpha value from folder name {folder_name}")
            continue
        
        # Extract run number
        run_match = folder_name.split('_run_')
        if len(run_match) > 1:
            run_id = run_match[1]
        else:
            print(f"Warning: Cannot extract run number from folder name {folder_name}")
            run_id = "unknown"
        
        if alpha not in alpha_groups:
            alpha_groups[alpha] = parent_folder
        
        try:
            df = pd.read_csv(file_path)
            print(f"📖 Reading file: {folder_name} - {len(df)} rows of data")
            
            for _, row in df.iterrows():
                # Check if required fields exist
                if 'perplexity' not in row:
                    print(f"Warning: {file_path} missing perplexity column")
                    continue
                    
                all_data.append({
                    'alpha': alpha,  # Corrected: use alpha as main parameter
                    'run_id': run_id,
                    'eta': row.get('eta', 0.05),  # Record fixed eta value
                    'gamma': row.get('gamma', 0.05),
                    'depth': row.get('depth', 3),
                    'perplexity': row.get('perplexity', 0),
                    'avg_doc_perplexity': row.get('avg_doc_perplexity', row.get('perplexity', 0)),
                    'valid_test_docs': row.get('valid_test_docs', 0),
                    'total_test_words': row.get('total_test_words', 0),
                    'doc_match_rate': row.get('doc_match_rate', 0),
                    'avg_path_length': row.get('avg_path_length', 0),
                    'log_likelihood': row.get('log_likelihood', 0),
                    'parent_folder': parent_folder
                })
                
        except Exception as e:
            print(f"Error reading file {file_path}: {e}")
    
    # Convert to DataFrame
    summary_df = pd.DataFrame(all_data)
    
    if summary_df.empty:
        print("No valid data found")
        return
    
    print(f"📊 Data summary:")
    print(f"   Total data rows: {len(summary_df)}")
    print(f"   Unique alpha values: {sorted(summary_df['alpha'].unique())}")
    print(f"   Data count per alpha: {summary_df['alpha'].value_counts().sort_index().to_dict()}")
    
    print("=" * 80)
    print("Perplexity summary statistics by ALPHA value (Step3)")  # Corrected: show ALPHA
    print("=" * 80)
    
    # Generate summary files grouped by alpha
    for alpha, group_data in summary_df.groupby('alpha'):
        parent_folder = group_data['parent_folder'].iloc[0]
        
        print(f"\nProcessing Alpha={alpha}")  # Corrected: show Alpha
        print(f"Output directory: {parent_folder}")
        print(f"Data count for this group: {len(group_data)}")
        
        # Check if group_data is empty
        if len(group_data) == 0:
            print(f"Warning: Alpha={alpha} group has no data, skipping")
            continue
        
        # Build aggregation dictionary
        agg_dict = {}
        
        # Check if each column has valid data
        numeric_cols = ['perplexity', 'avg_doc_perplexity', 'valid_test_docs', 
                       'total_test_words', 'doc_match_rate', 'avg_path_length', 'log_likelihood']
        
        for col in numeric_cols:
            if col in group_data.columns:
                valid_data = group_data[col].dropna()
                if len(valid_data) > 0:
                    if col in ['perplexity', 'avg_doc_perplexity', 'doc_match_rate', 'avg_path_length', 'log_likelihood']:
                        agg_dict[col] = ['mean', 'std', 'min', 'max']
                    else:
                        agg_dict[col] = ['mean', 'std']
                else:
                    print(f"   Warning: {col} column has no valid data")
        
        # Add count
        if 'run_id' in group_data.columns:
            agg_dict['run_id'] = 'count'
        
        # Parameter columns
        for col in ['eta', 'gamma', 'depth']:
            if col in group_data.columns:
                agg_dict[col] = 'first'
        
        if not agg_dict:
            print(f"Warning: Alpha={alpha} group has no aggregatable columns, skipping")
            continue
        
        try:
            # Perform aggregation operation
            print(f"   Aggregation dictionary: {list(agg_dict.keys())}")
            alpha_summary = group_data.agg(agg_dict).round(4)
            
            # Flatten column names
            alpha_summary.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in alpha_summary.columns]
            alpha_summary = alpha_summary.reset_index()
            alpha_summary.insert(0, 'alpha', alpha)  # Corrected: insert alpha column
            
            # Add run_id list
            run_ids = ', '.join(sorted(group_data['run_id'].unique()))
            alpha_summary['run_ids'] = run_ids
            
            # Save summary results (corrected: filename uses alpha)
            output_filename = f'alpha_{alpha}_perplexity_summary.csv'
            output_path = os.path.join(parent_folder, output_filename)
            alpha_summary.to_csv(output_path, index=False)
            
            print(f"  ✓ Saved summary file: {output_path}")
            
            # Show statistics
            if 'run_id_count' in alpha_summary.columns:
                print(f"  Run count: {int(alpha_summary['run_id_count'].iloc[0])}")
            
            if 'perplexity_mean' in alpha_summary.columns:
                mean_perp = alpha_summary['perplexity_mean'].iloc[0]
                std_perp = alpha_summary.get('perplexity_std', pd.Series([0])).iloc[0]
                print(f"  Average perplexity: {mean_perp:.4f} (±{std_perp:.4f})")
            
            print(f"  Included runs: {run_ids}")
            
        except Exception as e:
            print(f"❌ Error processing Alpha={alpha}: {e}")
            import traceback
            traceback.print_exc()
    
    # Generate overall comparison file (corrected: filename uses alpha)
    print(f"\n" + "=" * 80)
    print("Generating overall comparison file")
    print("=" * 80)
    
    try:
        # Build overall aggregation dictionary
        overall_agg_dict = {}
        
        for col in ['perplexity', 'avg_doc_perplexity', 'doc_match_rate', 'avg_path_length', 
                   'valid_test_docs', 'total_test_words', 'log_likelihood']:
            if col in summary_df.columns:
                valid_data = summary_df[col].dropna()
                if len(valid_data) > 0:
                    overall_agg_dict[col] = ['mean', 'std']
                    if col in ['perplexity', 'avg_doc_perplexity']:
                        overall_agg_dict[col].extend(['min', 'max'])
        
        if 'run_id' in summary_df.columns:
            overall_agg_dict['run_id'] = 'count'
        
        if not overall_agg_dict:
            print("Warning: No aggregatable columns for overall comparison")
            return None
        
        overall_summary = summary_df.groupby('alpha').agg(overall_agg_dict).round(4)  # Corrected: group by alpha
        
        # Flatten column names
        overall_summary.columns = ['_'.join(col).strip() for col in overall_summary.columns]
        overall_summary = overall_summary.reset_index()
        
        overall_output_path = os.path.join(base_path, 'alpha_perplexity_comparison.csv')  # Corrected: filename uses alpha
        overall_summary.to_csv(overall_output_path, index=False)
        print(f"✓ Overall comparison file saved to: {overall_output_path}")
        
        # Show cross-alpha comparison
        print(f"\nCross-Alpha Perplexity Comparison:")  # Corrected: show Alpha
        print("Alpha Value     Average Perplexity(±std)     Run Count")
        print("-" * 50)
        
        for _, row in overall_summary.iterrows():
            alpha = row['alpha']  # Corrected: use alpha
            run_count = int(row.get('run_id_count', 0))
            
            if 'perplexity_mean' in row:
                mean_perp = row['perplexity_mean']
                std_perp = row.get('perplexity_std', 0)
                print(f"{alpha:7.3f}    {mean_perp:8.4f}(±{std_perp:6.4f})        {run_count:4d}")
            else:
                print(f"{alpha:7.3f}    Missing data                    {run_count:4d}")
        
        return overall_summary
        
    except Exception as e:
        print(f"❌ Error generating overall comparison: {e}")
        import traceback
        traceback.print_exc()
        return None

# Execute Step3 perplexity calculation and aggregation
base_path = "/Volumes/My Passport/收敛结果/step3"

print("=" * 80)
print("Starting Step3 perplexity calculation...")
print("=" * 80)

# 1. Calculate perplexity (Step3 version)
calculate_hlda_perplexity_with_path_mapping_step3(base_path, corpus, test_ratio=0.2)

print("\n" + "=" * 80)
print("Starting aggregation of perplexity statistics by alpha value...")
print("=" * 80)

# 2. Aggregate by alpha (Step3 version)
overall_summary = aggregate_perplexity_by_alpha_step3(base_path)

print("=" * 80)
print("✅ Step3 perplexity calculation and aggregation completed!")

Starting Step3 perplexity calculation...
📊 Dataset split:
   Total documents: 970
   Training set: 776 documents
   Test set: 194 documents
🔍 Found 17 model result files to process

[1/17] Calculating perplexity: depth_3_gamma_0.05_eta_0.05_alpha_1_run_3
Parameters - Alpha: 1.0, Eta: 0.05, Gamma: 0.05, Depth: 3
📈 Last iteration: 265
📈 Node count: 307
📈 Path mapping count: 970
🔄 Starting perplexity calculation...
   Test document count: 194
   Path mapping count: 970
   Smoothing parameter: 0.05
   Vocabulary size: 1490
   Built word distributions for 307 nodes
✅ Perplexity calculation completed:
   Matched documents: 194/194
   Valid documents: 194
   Total words: 16508
   Perplexity: 1087.6382
💾 Perplexity results saved to: /Volumes/My Passport/收敛结果/step3/step3_d3_g005_e005_a1/depth_3_gamma_0.05_eta_0.05_alpha_1_run_3/perplexity_results_step3.csv
📊 Perplexity results:
   - Overall perplexity: 1087.6382
   - Average document perplexity: 1569.2452
   - Document match rate: 100.0%
   - A

Traceback (most recent call last):
  File "/var/folders/v5/6mdkg5713kxgwg5xs24g8rvr0000gn/T/ipykernel_55394/1702905664.py", line 464, in aggregate_perplexity_by_alpha_step3
    alpha_summary = group_data.agg(agg_dict).round(4)
  File "/Users/wenlinsuniverse/opt/anaconda3/envs/huggingface/lib/python3.8/site-packages/pandas/core/frame.py", line 9342, in aggregate
    result = op.agg()
  File "/Users/wenlinsuniverse/opt/anaconda3/envs/huggingface/lib/python3.8/site-packages/pandas/core/apply.py", line 776, in agg
    result = super().agg()
  File "/Users/wenlinsuniverse/opt/anaconda3/envs/huggingface/lib/python3.8/site-packages/pandas/core/apply.py", line 172, in agg
    return self.agg_dict_like()
  File "/Users/wenlinsuniverse/opt/anaconda3/envs/huggingface/lib/python3.8/site-packages/pandas/core/apply.py", line 504, in agg_dict_like
    results = {
  File "/Users/wenlinsuniverse/opt/anaconda3/envs/huggingface/lib/python3.8/site-packages/pandas/core/apply.py", line 505, in <dictcomp>
  

In [19]:
import pandas as pd
import numpy as np
import os
import glob

def calculate_branching_and_gini_metrics_step3_no_global(base_path="."):
    """
    Step3 version: Calculate branching factor and Gini coefficient metrics for each model (adapted for alpha parameter)
    Only generates layer-level metrics, no global metrics
    """
    pattern = os.path.join(base_path, "**", "corrected_renyi_entropy.csv")
    files = glob.glob(pattern, recursive=True)
    
    print(f"🔍 Found {len(files)} entropy files to process (Step3 - Layer only)")
    
    for idx, file_path in enumerate(files, 1):
        folder_path = os.path.dirname(file_path)
        folder_name = os.path.basename(folder_path)
        
        # Step3 parameter extraction (mainly alpha parameter)
        alpha = 0.1  # Default value
        if 'alpha_' in folder_name:
            try:
                alpha_part = folder_name.split('alpha_')[1].split('_')[0]
                alpha = float(alpha_part)
            except (IndexError, ValueError):
                # Pattern matching through folder name
                if 'a001' in folder_name:
                    alpha = 0.01
                elif 'a005' in folder_name:
                    alpha = 0.05
                elif 'a02' in folder_name:
                    alpha = 0.2
                elif 'a05' in folder_name and 'a005' not in folder_name:
                    alpha = 0.5
                elif 'a1_' in folder_name or 'a1' in folder_name.split('_')[-1]:
                    alpha = 1.0
                elif 'a01' in folder_name:
                    alpha = 0.1
        else:
            # Pattern matching through folder name
            if 'a001' in folder_name:
                alpha = 0.01
            elif 'a005' in folder_name:
                alpha = 0.05
            elif 'a02' in folder_name:
                alpha = 0.2
            elif 'a05' in folder_name and 'a005' not in folder_name:
                alpha = 0.5
            elif 'a1_' in folder_name or 'a1' in folder_name.split('_')[-1]:
                alpha = 1.0
            elif 'a01' in folder_name:
                alpha = 0.1
        
        print(f"\n[{idx}/{len(files)}] Processing folder: {folder_name} (Alpha={alpha})")
        
        try:
            # Read entropy file
            entropy_df = pd.read_csv(file_path)
            
            # Check if necessary columns exist
            required_cols = ['node_id', 'layer', 'document_count', 'child_count']
            missing_cols = [col for col in required_cols if col not in entropy_df.columns]
            
            if missing_cols:
                print(f"⚠️ Missing required columns: {missing_cols}, skipping this file")
                continue
            
            # Calculate layer-wise branching factor and Gini coefficient metrics only
            layer_metrics = []
            
            for layer in entropy_df['layer'].unique():
                if layer == -1:  # Skip invalid layers
                    continue
                    
                layer_nodes = entropy_df[entropy_df['layer'] == layer]
                
                # Basic statistics
                node_count = len(layer_nodes)
                total_documents = layer_nodes['document_count'].sum()
                
                # Branching factor statistics
                child_counts = layer_nodes['child_count'].values
                total_branches = child_counts.sum()
                
                # Non-leaf node statistics
                non_leaf_nodes = (child_counts > 0).sum()
                non_leaf_counts = child_counts[child_counts > 0]
                
                # Branching factor statistics
                if len(non_leaf_counts) > 0:
                    avg_branching_factor = non_leaf_counts.mean()
                    std_branching_factor = non_leaf_counts.std()
                    non_leaf_avg_branching = non_leaf_counts.mean()
                else:
                    avg_branching_factor = 0.0
                    std_branching_factor = 0.0
                    non_leaf_avg_branching = 0.0
                
                # Gini coefficient calculation
                def gini_coefficient(values):
                    """Calculate Gini coefficient"""
                    if len(values) == 0:
                        return 0.0
                    values = np.array(values)
                    values = values[values > 0]  # Only consider positive values
                    if len(values) <= 1:
                        return 0.0
                    
                    values = np.sort(values)
                    n = len(values)
                    cumsum = np.cumsum(values)
                    return (n + 1 - 2 * np.sum(cumsum) / cumsum[-1]) / n
                
                # Document distribution Gini coefficient
                doc_counts = layer_nodes['document_count'].values
                gini_doc_distribution = gini_coefficient(doc_counts)
                
                # Branching distribution Gini coefficient
                gini_branch_distribution = gini_coefficient(child_counts)
                
                layer_metrics.append({
                    'layer': layer,
                    'node_count': node_count,
                    'total_branches': total_branches,
                    'avg_branching_factor': avg_branching_factor,
                    'std_branching_factor': std_branching_factor,
                    'non_leaf_nodes': non_leaf_nodes,
                    'non_leaf_avg_branching': non_leaf_avg_branching,
                    'total_documents': total_documents,
                    'gini_doc_distribution': gini_doc_distribution,
                    'gini_branch_distribution': gini_branch_distribution,
                    'alpha': alpha  # Add alpha parameter record
                })
            
            # Save layer metrics only
            if layer_metrics:
                layer_df = pd.DataFrame(layer_metrics)
                layer_output_path = os.path.join(folder_path, 'layer_branching_gini_metrics.csv')
                layer_df.to_csv(layer_output_path, index=False)
                print(f"✓ Layer metrics saved to: {layer_output_path}")
            
            # Display brief statistics for layers only
            print(f"📊 Layer metrics summary (Alpha={alpha}):")
            total_nodes = len(entropy_df)
            total_layers = len(entropy_df['layer'].unique()) - (1 if -1 in entropy_df['layer'].unique() else 0)
            print(f"   Total nodes: {total_nodes}")
            print(f"   Total layers: {total_layers}")
            
            if layer_metrics:
                for layer_metric in layer_metrics:
                    layer_num = layer_metric['layer']
                    layer_branching = layer_metric['avg_branching_factor']
                    layer_doc_gini = layer_metric['gini_doc_distribution']
                    print(f"   Layer {layer_num}: Avg branching={layer_branching:.2f}, Doc Gini={layer_doc_gini:.4f}")
            
        except Exception as e:
            import traceback
            print(f"❌ Error processing file {file_path}: {str(e)}")
            traceback.print_exc()

def aggregate_branching_gini_by_alpha_step3_layer_only(base_path="."):
    """
    Step3 version: Aggregate branching factor and Gini coefficient statistics by alpha value (layer metrics only)
    """
    
    print("\n" + "=" * 80)
    print("Aggregating layer branching factor and Gini coefficient metrics... (Step3 - Layer only)")
    print("=" * 80)
    
    pattern = os.path.join(base_path, "**", "layer_branching_gini_metrics.csv")
    files = glob.glob(pattern, recursive=True)
    
    print(f"🔍 Found {len(files)} layer metrics files")
    
    all_layer_data = []
    
    for file_path in files:
        folder_path = os.path.dirname(file_path)
        folder_name = os.path.basename(folder_path)
        parent_folder = os.path.dirname(folder_path)
        
        # Extract alpha value (adapted for step3)
        alpha = None
        if 'alpha_' in folder_name:
            try:
                alpha_part = folder_name.split('alpha_')[1].split('_')[0]
                alpha = float(alpha_part)
            except:
                # Pattern matching through folder name
                if 'a001' in folder_name:
                    alpha = 0.01
                elif 'a005' in folder_name:
                    alpha = 0.05
                elif 'a02' in folder_name:
                    alpha = 0.2
                elif 'a05' in folder_name and 'a005' not in folder_name:
                    alpha = 0.5
                elif 'a1_' in folder_name or 'a1' in folder_name.split('_')[-1]:
                    alpha = 1.0
                elif 'a01' in folder_name:
                    alpha = 0.1
        else:
            # Pattern matching through folder name
            if 'a001' in folder_name:
                alpha = 0.01
            elif 'a005' in folder_name:
                alpha = 0.05
            elif 'a02' in folder_name:
                alpha = 0.2
            elif 'a05' in folder_name and 'a005' not in folder_name:
                alpha = 0.5
            elif 'a1_' in folder_name or 'a1' in folder_name.split('_')[-1]:
                alpha = 1.0
            elif 'a01' in folder_name:
                alpha = 0.1
        
        if alpha is None:
            print(f"⚠️ Unable to extract alpha value: {folder_name}")
            continue
        
        # Extract run number
        run_match = folder_name.split('_run_')
        if len(run_match) > 1:
            run_id = run_match[1]
        else:
            print(f"⚠️ Unable to extract run number: {folder_name}")
            continue
        
        try:
            df = pd.read_csv(file_path)
            
            for _, row in df.iterrows():
                all_layer_data.append({
                    'alpha': alpha,
                    'run_id': run_id,
                    'layer': row.get('layer', -1),
                    'node_count': row.get('node_count', 0),
                    'total_branches': row.get('total_branches', 0),
                    'avg_branching_factor': row.get('avg_branching_factor', 0),
                    'std_branching_factor': row.get('std_branching_factor', 0),
                    'non_leaf_nodes': row.get('non_leaf_nodes', 0),
                    'non_leaf_avg_branching': row.get('non_leaf_avg_branching', 0),
                    'total_documents': row.get('total_documents', 0),
                    'gini_doc_distribution': row.get('gini_doc_distribution', 0),
                    'gini_branch_distribution': row.get('gini_branch_distribution', 0),
                    'parent_folder': parent_folder
                })
                
        except Exception as e:
            print(f"❌ Error reading file {file_path}: {e}")
    
    # Convert to DataFrame and aggregate by alpha groups
    if all_layer_data:
        layer_summary_df = pd.DataFrame(all_layer_data)
        
        print(f"\n📊 Layer data check:")
        print(f"   Total data rows: {len(layer_summary_df)}")
        print(f"   Alpha value distribution: {layer_summary_df['alpha'].value_counts().to_dict()}")
        
        print("Layer branching factor and Gini coefficient aggregated statistics by ALPHA value")
        print("=" * 80)
        
        # Generate layer summary files grouped by alpha
        for alpha, group_data in layer_summary_df.groupby('alpha'):
            parent_folder = group_data['parent_folder'].iloc[0]
            
            print(f"\nProcessing Alpha={alpha} layer metrics")
            print(f"   Data count: {len(group_data)}")
            print(f"   Run IDs: {list(group_data['run_id'].unique())}")
            
            try:
                # Calculate layer summary statistics
                layer_summary = group_data.groupby('layer').agg({
                    'avg_branching_factor': ['mean', 'std', 'count'],
                    'gini_doc_distribution': ['mean', 'std', 'count'],
                    'gini_branch_distribution': ['mean', 'std', 'count'],
                    'node_count': ['mean', 'std'],
                    'total_documents': 'mean',
                    'run_id': lambda x: ', '.join(sorted(x.unique()))
                }).round(4)
                
                # Flatten column names
                layer_summary.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in layer_summary.columns]
                layer_summary = layer_summary.reset_index()
                layer_summary.insert(0, 'alpha', alpha)
                
                # Save layer summary results
                output_filename = f'alpha_{alpha}_layer_branching_gini_summary.csv'
                output_path = os.path.join(parent_folder, output_filename)
                layer_summary.to_csv(output_path, index=False)
                
                print(f"  ✅ Saved layer summary file: {output_path}")
                print(f"  Number of layers: {len(layer_summary)}")
                
            except Exception as e:
                print(f"❌ Error processing Alpha={alpha}: {e}")
                import traceback
                traceback.print_exc()
                
        # Generate overall layer comparison file
        overall_layer_summary = layer_summary_df.groupby(['alpha', 'layer']).agg({
            'avg_branching_factor': ['mean', 'std'],
            'gini_doc_distribution': ['mean', 'std'],
            'gini_branch_distribution': ['mean', 'std'],
            'node_count': ['mean', 'std'],
            'run_id': 'count'
        }).round(4)
        
        # Flatten column names
        overall_layer_summary.columns = ['_'.join(col).strip() for col in overall_layer_summary.columns]
        overall_layer_summary = overall_layer_summary.reset_index()
        
        overall_output_path = os.path.join(base_path, 'alpha_layer_branching_gini_comparison.csv')
        overall_layer_summary.to_csv(overall_output_path, index=False)
        print(f"\nOverall layer comparison file saved to: {overall_output_path}")
        
    else:
        print("❌ No valid layer data found")

def display_branching_gini_summary_step3_layer_only(base_path="."):
    """
    Step3 version: Display branching factor and Gini coefficient summary report (layer metrics only)
    """
    print("=" * 100)
    print("Branching Factor and Gini Coefficient Analysis Summary Report (Step3 - Alpha Parameter, Layer Only)")
    print("=" * 100)
    
    # Read layer comparison file only
    layer_comparison_file = os.path.join(base_path, 'alpha_layer_branching_gini_comparison.csv')
    
    if os.path.exists(layer_comparison_file):
        print("\n📊 Layer-wise branching factor and Gini coefficient analysis:")
        print("-" * 60)
        
        df = pd.read_csv(layer_comparison_file)
        
        # Find correct count column name
        count_col = None
        for col in df.columns:
            if 'run_id' in col and ('count' in col or col.endswith('_count')):
                count_col = col
                break
        
        for layer in sorted(df['layer'].unique()):
            print(f"\nLayer {int(layer)} Cross-Alpha Comparison:")
            print("Alpha Value     Avg Branching(±std)     Doc Gini(±std)     Branch Gini(±std)     Run Count")
            print("-" * 75)
            
            layer_data = df[df['layer'] == layer]
            for _, row in layer_data.iterrows():
                alpha = row['alpha']
                avg_branch = row['avg_branching_factor_mean']
                branch_std = row['avg_branching_factor_std']
                doc_gini = row['gini_doc_distribution_mean']
                doc_gini_std = row['gini_doc_distribution_std']
                branch_gini = row['gini_branch_distribution_mean']
                branch_gini_std = row['gini_branch_distribution_std']
                run_count = int(row[count_col]) if count_col else 0
                
                print(f"{alpha:7.3f}    {avg_branch:6.2f}(±{branch_std:4.2f})     {doc_gini:6.4f}(±{doc_gini_std:5.4f})     {branch_gini:6.4f}(±{branch_gini_std:5.4f})     {run_count:4d}")
    else:
        print("⚠️ Layer comparison file not found")
    
    print("\n" + "=" * 100)
    print("✅ Step3 branching factor and Gini coefficient analysis completed! (Layer metrics only)")
    print("=" * 100)

# Execute Step3 branching factor and Gini coefficient analysis (layer metrics only)
base_path = "/Volumes/My Passport/收敛结果/step3"

print("=" * 80)
print("Starting calculation of Step3 branching factor and Gini coefficient metrics (layer only)...")
print("=" * 80)

# 1. Calculate branching factor and Gini coefficient for each model (Step3 version, layer only)
calculate_branching_and_gini_metrics_step3_no_global(base_path)

print("\n" + "=" * 80)
print("Starting aggregation of branching factor and Gini coefficient statistics by alpha value (layer only)...")
print("=" * 80)

# 2. Aggregate by alpha (Step3 version, layer only)
aggregate_branching_gini_by_alpha_step3_layer_only(base_path)

print("\n" + "=" * 80)
print("Displaying Step3 branching factor and Gini coefficient summary report (layer only)...")
print("=" * 80)

# 3. Display summary (Step3 version, layer only)
display_branching_gini_summary_step3_layer_only(base_path)

Starting calculation of Step3 branching factor and Gini coefficient metrics (layer only)...
🔍 Found 17 entropy files to process (Step3 - Layer only)

[1/17] Processing folder: depth_3_gamma_0.05_eta_0.05_alpha_1_run_3 (Alpha=1.0)
✓ Layer metrics saved to: /Volumes/My Passport/收敛结果/step3/step3_d3_g005_e005_a1/depth_3_gamma_0.05_eta_0.05_alpha_1_run_3/layer_branching_gini_metrics.csv
📊 Layer metrics summary (Alpha=1.0):
   Total nodes: 307
   Total layers: 3
   Layer 0: Avg branching=64.00, Doc Gini=0.0000
   Layer 2: Avg branching=0.00, Doc Gini=0.4731
   Layer 1: Avg branching=3.78, Doc Gini=0.5599

[2/17] Processing folder: depth_3_gamma_0.05_eta_0.05_alpha_1_run_2 (Alpha=1.0)
✓ Layer metrics saved to: /Volumes/My Passport/收敛结果/step3/step3_d3_g005_e005_a1/depth_3_gamma_0.05_eta_0.05_alpha_1_run_2/layer_branching_gini_metrics.csv
📊 Layer metrics summary (Alpha=1.0):
   Total nodes: 321
   Total layers: 3
   Layer 0: Avg branching=57.00, Doc Gini=0.0000
   Layer 1: Avg branching=4.61, D