In [1]:
import pandas as pd
import numpy as np
import os
import glob
from scipy.special import gammaln

def calculate_renyi_entropy_vectorized(node_data, all_words, eta_prior=1.0, renyi_alpha=2.0):
    """
    Vectorized Renyi entropy calculation.

    Parameters:
    node_data: DataFrame, node data containing 'word' and 'count' columns
    all_words: list, full vocabulary list
    eta_prior: float, Dirichlet prior smoothing parameter
    renyi_alpha: float, order parameter of Renyi entropy

    Returns:
    tuple: (entropy, nonzero_word_count) Renyi entropy value and number of nonzero words
    """
    if len(all_words) == 0:
        return 0.0, 0

    # Create mapping from vocabulary word to index
    word_to_idx = {word: idx for idx, word in enumerate(all_words)}

    # Initialize count vector
    counts = np.zeros(len(all_words))

    # Fill in actual counts
    for _, row in node_data.iterrows():
        word = row['word']
        if pd.notna(word) and word in word_to_idx:
            counts[word_to_idx[word]] = row['count']

    # Count nonzero words (before smoothing)
    nonzero_word_count = np.sum(counts > 0)

    # Apply eta smoothing
    smoothed_counts = counts + eta_prior

    # Compute probability distribution
    probabilities = smoothed_counts / np.sum(smoothed_counts)

    # Compute Renyi entropy (using natural logarithm)
    if renyi_alpha == 1.0:
        # Shannon entropy (with smoothing all probabilities > 0, no small constant needed)
        entropy = -np.sum(probabilities * np.log(probabilities))
    else:
        # General Renyi entropy
        entropy = (1 / (1 - renyi_alpha)) * np.log(np.sum(probabilities ** renyi_alpha))

    return entropy, int(nonzero_word_count)

def process_all_iteration_files(base_path=".", eta_prior=1.0, renyi_alpha=2.0):
    """
    Process each iteration_node_word_distributions.csv file individually and save results.
    """
    pattern = os.path.join(base_path, "**", "iteration_node_word_distributions.csv")
    files = glob.glob(pattern, recursive=True)

    for file_path in files:
        folder_path = os.path.dirname(file_path)
        print(f"\nProcessing file: {file_path}")

        try:
            df = pd.read_csv(file_path)

            # Clean column names: remove single/double quotes and extra spaces
            df.columns = [col.strip("'\" ") for col in df.columns]

            if 'node_id' not in df.columns:
                print(f"Warning: {file_path} is missing 'node_id' column, skipping this file")
                continue

            max_iteration = df['iteration'].max()
            last_iteration_data = df[df['iteration'] == max_iteration]
            all_words = list(last_iteration_data['word'].dropna().unique())

            print(f"Last iteration: {max_iteration}, vocabulary size: {len(all_words)}, node count: {last_iteration_data['node_id'].nunique()}")

            results = []
            for node_id in last_iteration_data['node_id'].unique():
                node_data = last_iteration_data[last_iteration_data['node_id'] == node_id]

                entropy, nonzero_words = calculate_renyi_entropy_vectorized(
                    node_data, all_words, eta_prior, renyi_alpha
                )

                # Calculate sparsity (proportion of nonzero words)
                sparsity_ratio = nonzero_words / len(all_words) if len(all_words) > 0 else 0

                results.append({
                    'node_id': node_id,
                    'renyi_entropy_corrected': entropy,
                    'nonzero_word_count': nonzero_words,
                    'total_vocabulary_size': len(all_words),
                    'sparsity_ratio': sparsity_ratio,
                    'eta_prior': eta_prior,
                    'renyi_alpha': renyi_alpha,
                    'iteration': max_iteration
                })

            # Save the new corrected_renyi_entropy.csv file
            results_df = pd.DataFrame(results)
            output_path = os.path.join(folder_path, 'corrected_renyi_entropy.csv')
            results_df.to_csv(output_path, index=False)
            print(f"Saved corrected Renyi entropy results to: {output_path}")

            # Print some summary statistics
            print("Node vocabulary sparsity statistics:")
            print(f"  - Average nonzero word count: {results_df['nonzero_word_count'].mean():.1f}")
            print(f"  - Nonzero word count range: {results_df['nonzero_word_count'].min()}-{results_df['nonzero_word_count'].max()}")
            print(f"  - Average sparsity: {results_df['sparsity_ratio'].mean():.3f}")
            print("=" * 50)

        except Exception as e:
            import traceback
            print(f"Error processing file {file_path}: {str(e)}")
            print("Detailed traceback:")
            traceback.print_exc()

In [2]:
# Set parameters
base_path = "/Volumes/My Passport/收敛结果/step1/2"  # root directory
eta_prior = 0.1  # Dirichlet prior smoothing parameter
renyi_alpha = 2.0  # Renyi entropy order parameter

print("=" * 50)
print("Start batch computing corrected Renyi entropy...")
print("=" * 50)
process_all_iteration_files(base_path, eta_prior, renyi_alpha)
print("=" * 50)
print("All processing completed!")
print("=" * 50)

Start batch computing corrected Renyi entropy...

Processing file: /Volumes/My Passport/收敛结果/step1/2/d4_g0005_收敛/depth_4_gamma_0.005_run_1/iteration_node_word_distributions.csv
Last iteration: 115, vocabulary size: 1490, node count: 228
Saved corrected Renyi entropy results to: /Volumes/My Passport/收敛结果/step1/2/d4_g0005_收敛/depth_4_gamma_0.005_run_1/corrected_renyi_entropy.csv
Node vocabulary sparsity statistics:
  - Average nonzero word count: 71.7
  - Nonzero word count range: 0-842
  - Average sparsity: 0.048

Processing file: /Volumes/My Passport/收敛结果/step1/2/d4_g0005_收敛/depth_4_gamma_0.005_run_2/iteration_node_word_distributions.csv
Last iteration: 115, vocabulary size: 1490, node count: 235
Saved corrected Renyi entropy results to: /Volumes/My Passport/收敛结果/step1/2/d4_g0005_收敛/depth_4_gamma_0.005_run_2/corrected_renyi_entropy.csv
Node vocabulary sparsity statistics:
  - Average nonzero word count: 72.6
  - Nonzero word count range: 0-888
  - Average sparsity: 0.049

Processing fil

In [3]:
def calculate_node_document_counts(path_structures_df):
    """
    Aggregate document counts from leaf nodes upward and compute node hierarchy.

    Parameters:
    path_structures_df: DataFrame, data from iteration_path_structures.csv (filtered to last iteration)

    Returns:
    dict: mapping {node_id: {'document_count': int, 'layer': int, 'parent_id': int, 'child_ids': list, 'child_count': int}}
    """
    # Find all layer columns
    layer_columns = [col for col in path_structures_df.columns if col.startswith('layer_') and col.endswith('_node_id')]
    layer_columns.sort()
    max_layer_idx = len(layer_columns) - 1

    print(f"[DEBUG] Found layer columns: {layer_columns}")
    print(f"[DEBUG] Max layer index: {max_layer_idx}")

    # Initialize node info dictionary
    node_info = {}

    # Handle leaf nodes using leaf_node_id column
    for _, row in path_structures_df.iterrows():
        leaf_node = row.get('leaf_node_id')
        if pd.notna(leaf_node):
            if leaf_node not in node_info:
                node_info[leaf_node] = {
                    'document_count': 0,
                    'layer': max_layer_idx,
                    'parent_id': None,
                    'child_ids': [],
                    'child_count': 0
                }
            node_info[leaf_node]['document_count'] += row.get('document_count', 0)

    # Build parent-child relationships and layer info
    for _, row in path_structures_df.iterrows():
        path_nodes = []
        for layer_idx in range(max_layer_idx + 1):
            layer_col = f'layer_{layer_idx}_node_id'
            if layer_col in path_structures_df.columns and pd.notna(row.get(layer_col)):
                path_nodes.append(row[layer_col])
            else:
                break

        # Set layer and parent-child for each node in the path
        for i, node in enumerate(path_nodes):
            if node not in node_info:
                node_info[node] = {
                    'document_count': 0,
                    'layer': i,
                    'parent_id': None,
                    'child_ids': [],
                    'child_count': 0
                }
            else:
                node_info[node]['layer'] = i

            if i > 0:
                parent_node = path_nodes[i - 1]
                node_info[node]['parent_id'] = parent_node

                if parent_node not in node_info:
                    node_info[parent_node] = {
                        'document_count': 0,
                        'layer': i - 1,
                        'parent_id': None,
                        'child_ids': [],
                        'child_count': 0
                    }

                if node not in node_info[parent_node]['child_ids']:
                    node_info[parent_node]['child_ids'].append(node)

    # Aggregate document counts upward from second-last layer to root
    for layer_idx in range(max_layer_idx - 1, -1, -1):
        layer_col = f'layer_{layer_idx}_node_id'
        if layer_col not in path_structures_df.columns:
            continue

        layer_nodes = path_structures_df[layer_col].dropna().unique()
        for node in layer_nodes:
            if node in node_info and node_info[node]['document_count'] == 0:
                total_docs = path_structures_df[path_structures_df[layer_col] == node]['document_count'].sum()
                node_info[node]['document_count'] = int(total_docs)

    # Compute child counts
    for node_id, info in node_info.items():
        info['child_count'] = len(info.get('child_ids', []))

    return node_info

def add_document_counts_to_entropy_files(base_path="."):
    """
    Add document counts and hierarchy info to corrected_renyi_entropy.csv files.
    """
    pattern = os.path.join(base_path, "**", "iteration_path_structures.csv")
    files = glob.glob(pattern, recursive=True)

    for file_path in files:
        folder_path = os.path.dirname(file_path)
        print(f"\nProcessing path structure file: {file_path}")

        try:
            # Read path structures file
            df = pd.read_csv(file_path)
            df.columns = [col.strip("'\" ") for col in df.columns]

            # Get last iteration
            max_iteration = df['iteration'].max()
            last_iteration_data = df[df['iteration'] == max_iteration]

            print(f"Last iteration: {max_iteration}, number of paths: {len(last_iteration_data)}")

            # Compute node document counts and hierarchy
            node_info = calculate_node_document_counts(last_iteration_data)

            print(f"Computed info for {len(node_info)} nodes")

            # Read corresponding corrected_renyi_entropy.csv
            entropy_file = os.path.join(folder_path, 'corrected_renyi_entropy.csv')
            if os.path.exists(entropy_file):
                entropy_df = pd.read_csv(entropy_file)

                # Add new columns: document_count, layer, parent_id, child_ids, child_count
                entropy_df['document_count'] = entropy_df['node_id'].map(lambda x: node_info.get(x, {}).get('document_count', 0))
                entropy_df['layer'] = entropy_df['node_id'].map(lambda x: node_info.get(x, {}).get('layer', -1))
                entropy_df['parent_id'] = entropy_df['node_id'].map(lambda x: node_info.get(x, {}).get('parent_id', None))

                # Format child_ids as bracketed list string (empty string if no children)
                entropy_df['child_ids'] = entropy_df['node_id'].map(
                    lambda x: '[' + ','.join(map(str, node_info.get(x, {}).get('child_ids', []))) + ']'
                    if node_info.get(x, {}).get('child_ids') else ''
                )

                entropy_df['child_count'] = entropy_df['node_id'].map(lambda x: len(node_info.get(x, {}).get('child_ids', [])))

                # Save updated file
                entropy_df.to_csv(entropy_file, index=False)
                print(f"Updated {entropy_file} with columns: document_count, layer, parent_id, child_ids, child_count")

                # Print some statistics
                print("Node hierarchy statistics:")
                print(f"  - Layer distribution: {entropy_df['layer'].value_counts().sort_index().to_dict()}")
                print(f"  - Document count range: {entropy_df['document_count'].min()} - {entropy_df['document_count'].max()}")
                print(f"  - Root node count: {entropy_df[entropy_df['parent_id'].isna()].shape[0]}")
                print(f"  - Leaf node count: {entropy_df[entropy_df['child_ids'] == ''].shape[0]}")
                print(f"  - Child count distribution: {entropy_df['child_count'].value_counts().sort_index().to_dict()}")
            else:
                print(f"Warning: corresponding entropy file not found: {entropy_file}")

        except Exception as e:
            import traceback
            print(f"Error processing file {file_path}: {str(e)}")
            print("Traceback:")
            traceback.print_exc()

In [4]:
# Main function: add document counts and hierarchy info to entropy files
import os
import glob
import pandas as pd

base_path = "/Volumes/My Passport/收敛结果/step1/2"  # root directory

print("=" * 50)
print("Start adding document counts and hierarchy information to entropy files...")
print("=" * 50)
add_document_counts_to_entropy_files(base_path)
print("=" * 50)
print("Document counts and hierarchy information added.")
print("=" * 50)


Start adding document counts and hierarchy information to entropy files...

Processing path structure file: /Volumes/My Passport/收敛结果/step1/2/d4_g0005_收敛/depth_4_gamma_0.005_run_1/iteration_path_structures.csv
Last iteration: 115, number of paths: 141
[DEBUG] Found layer columns: ['layer_0_node_id', 'layer_1_node_id', 'layer_2_node_id', 'layer_3_node_id']
[DEBUG] Max layer index: 3
Computed info for 228 nodes
Updated /Volumes/My Passport/收敛结果/step1/2/d4_g0005_收敛/depth_4_gamma_0.005_run_1/corrected_renyi_entropy.csv with columns: document_count, layer, parent_id, child_ids, child_count
Node hierarchy statistics:
  - Layer distribution: {0: 1, 1: 19, 2: 67, 3: 141}
  - Document count range: 1 - 970
  - Root node count: 1
  - Leaf node count: 141
  - Child count distribution: {0: 141, 1: 37, 2: 28, 3: 8, 4: 9, 5: 1, 7: 1, 19: 2, 24: 1}

Processing path structure file: /Volumes/My Passport/收敛结果/step1/2/d4_g0005_收敛/depth_4_gamma_0.005_run_2/iteration_path_structures.csv
Last iteration: 115,

In [5]:
def calculate_jensen_shannon_distances_with_weighted_entropy(base_path=".", eta=0.1):
    """
    Calculate Jensen-Shannon distances between nodes in each layer and the document-count-weighted average Renyi entropy.
    
    Parameters:
    base_path: str, root directory path
    eta: float, Dirichlet smoothing parameter
    """
    # Find all iteration_node_word_distributions.csv files
    pattern = os.path.join(base_path, "**", "iteration_node_word_distributions.csv")
    files = glob.glob(pattern, recursive=True)
    
    for file_path in files:
        folder_path = os.path.dirname(file_path)
        print(f"\nProcessing file: {file_path}")
        
        try:
            # Read word distribution data
            word_df = pd.read_csv(file_path)
            word_df.columns = [col.strip("'\" ") for col in word_df.columns]
            
            # Get data from the last iteration
            max_iteration = word_df['iteration'].max()
            last_iteration_data = word_df[word_df['iteration'] == max_iteration]
            
            # Get the full vocabulary
            all_words = sorted(list(last_iteration_data['word'].dropna().unique()))
            print(f"Vocabulary size: {len(all_words)}")
            
            # Read the entropy file to get hierarchy information
            entropy_file = os.path.join(folder_path, 'corrected_renyi_entropy.csv')
            if not os.path.exists(entropy_file):
                print(f"Warning: Entropy file not found: {entropy_file}")
                continue
                
            entropy_df = pd.read_csv(entropy_file)
            
            # Group nodes by layer
            layers = entropy_df.groupby('layer')['node_id'].apply(list).to_dict()
            print(f"Layer distribution: {[(layer, len(nodes)) for layer, nodes in layers.items()]}")
            
            # Build probability distribution for each node
            node_distributions = {}
            
            for node_id in entropy_df['node_id'].unique():
                # Get the word distribution for this node
                node_words = last_iteration_data[last_iteration_data['node_id'] == node_id]
                
                # Initialize count vector
                counts = np.zeros(len(all_words))
                word_to_idx = {word: idx for idx, word in enumerate(all_words)}
                
                # Fill in actual counts
                for _, row in node_words.iterrows():
                    word = row['word']
                    if pd.notna(word) and word in word_to_idx:
                        counts[word_to_idx[word]] = row['count']
                
                # Add Dirichlet smoothing
                smoothed_counts = counts + eta
                
                # Calculate probability distribution
                probabilities = smoothed_counts / np.sum(smoothed_counts)
                node_distributions[node_id] = probabilities
            
            # Calculate JS distances and weighted average entropy for nodes within each layer
            all_js_distances = []
            layer_avg_distances = []
            
            for layer, layer_nodes in layers.items():
                print(f"\nCalculating JS distance and weighted average entropy for Layer {layer} ({len(layer_nodes)} nodes)")
                
                layer_js_distances = []
                n = len(layer_nodes)
                
                # Calculate JS distance for all pairs of nodes in this layer
                for i, node1 in enumerate(layer_nodes):
                    for j, node2 in enumerate(layer_nodes):
                        if i < j:  # Only calculate for the upper triangle to avoid duplicates and self-comparison
                            if node1 in node_distributions and node2 in node_distributions:
                                p = node_distributions[node1]
                                q = node_distributions[node2]
                                
                                # Calculate Jensen-Shannon distance
                                js_distance = jensen_shannon_distance(p, q)
                                
                                layer_js_distances.append({
                                    'layer': layer,
                                    'node1_id': node1,
                                    'node2_id': node2,
                                    'js_distance': js_distance,
                                    'node1_doc_count': entropy_df[entropy_df['node_id'] == node1]['document_count'].iloc[0] if len(entropy_df[entropy_df['node_id'] == node1]) > 0 else 0,
                                    'node2_doc_count': entropy_df[entropy_df['node_id'] == node2]['document_count'].iloc[0] if len(entropy_df[entropy_df['node_id'] == node2]) > 0 else 0
                                })
                
                all_js_distances.extend(layer_js_distances)
                
                # Calculate the average JS distance for the layer
                avg_js_distance = 0.0
                if layer_js_distances and n > 1:
                    total_js_distance = sum(d['js_distance'] for d in layer_js_distances)
                    max_pairs = n * (n - 1) // 2  # n*(n-1)/2
                    avg_js_distance = total_js_distance / max_pairs
                
                # Calculate the document-count-weighted average Renyi entropy for the layer
                layer_entropy_data = entropy_df[entropy_df['layer'] == layer]
                total_docs = layer_entropy_data['document_count'].sum()
                
                if total_docs > 0:
                    # Calculate weighted average entropy: sum(doc_count * entropy) / total_doc_count
                    weighted_entropy = (layer_entropy_data['document_count'] * layer_entropy_data['renyi_entropy_corrected']).sum() / total_docs
                else:
                    weighted_entropy = 0.0
                
                layer_avg_distances.append({
                    'layer': layer,
                    'node_count': n,
                    'total_pairs': len(layer_js_distances),
                    'max_pairs': n * (n - 1) // 2 if n > 1 else 0,
                    'sum_js_distance': sum(d['js_distance'] for d in layer_js_distances),
                    'avg_js_distance': avg_js_distance,
                    'total_documents': total_docs,
                    'weighted_avg_renyi_entropy': weighted_entropy
                })
                
                print(f"  - Node count: {n}")
                print(f"  - Calculated node pairs: {len(layer_js_distances)}")
                print(f"  - Theoretical max node pairs: {n * (n - 1) // 2 if n > 1 else 0}")
                print(f"  - Average JS distance: {avg_js_distance:.4f}")
                print(f"  - Total documents: {total_docs}")
                print(f"  - Document-weighted average Renyi entropy: {weighted_entropy:.4f}")
                print("=" * 50)
            
            # Save detailed JS distance results
            if all_js_distances:
                js_df = pd.DataFrame(all_js_distances)
                output_path = os.path.join(folder_path, 'jensen_shannon_distances.csv')
                js_df.to_csv(output_path, index=False)
                print(f"\nSaved detailed JS distance results to: {output_path}")
            
            # Save average JS distance and weighted entropy results for each layer
            if layer_avg_distances:
                avg_df = pd.DataFrame(layer_avg_distances)
                avg_output_path = os.path.join(folder_path, 'layer_average_js_distances.csv')
                avg_df.to_csv(avg_output_path, index=False)
                print(f"Saved layer average JS distance and weighted entropy results to: {avg_output_path}")
                
                # Overall statistics
                print(f"\nOverall Statistics:")
                print(f"  - Total layers: {len(layer_avg_distances)}")
                print(f"  - Layer statistics:")
                for row in layer_avg_distances:
                    print(f"    Layer {row['layer']}: JS distance={row['avg_js_distance']:.4f}, Weighted entropy={row['weighted_avg_renyi_entropy']:.4f} (based on {row['node_count']} nodes, {row['total_documents']} documents)")
            
        except Exception as e:
            import traceback
            print(f"Error processing file {file_path}: {str(e)}")
            print("Detailed traceback:")
            traceback.print_exc()

def jensen_shannon_distance(p, q):
    """
    Calculate the Jensen-Shannon distance between two probability distributions.
    
    Parameters:
    p, q: numpy arrays, probability distributions
    
    Returns:
    float: Jensen-Shannon distance
    """
    # Ensure probability distributions are normalized
    p = p / np.sum(p)
    q = q / np.sum(q)
    
    # Calculate the average distribution
    m = 0.5 * (p + q)
    
    # Calculate KL divergence (using natural logarithm)
    def kl_divergence(x, y):
        # Avoid log(0)
        mask = (x > 0) & (y > 0)
        if np.sum(mask) == 0:
            return 0.0
        return np.sum(x[mask] * np.log(x[mask] / y[mask]))
    
    # Calculate Jensen-Shannon divergence
    js_divergence = 0.5 * kl_divergence(p, m) + 0.5 * kl_divergence(q, m)
    
    # Convert to distance (square root)
    js_distance = np.sqrt(js_divergence)
    
    return js_distance

In [None]:
import numpy as np
import os
import glob
import pandas as pd 
# Main function: Calculate Jensen-Shannon distance and weighted average Renyi entropy
base_path = "/Volumes/My Passport/收敛结果/step1/2"  # Root directory
eta = 0.1  # Dirichlet smoothing parameter

print("=" * 50)
print("Start calculating Jensen-Shannon distances and weighted average Renyi entropy...")
print("=" * 50)
calculate_jensen_shannon_distances_with_weighted_entropy(base_path, eta)
print("=" * 50)
print("Jensen-Shannon distance and weighted average Renyi entropy calculation completed!")
print("=" * 50)

Start calculating Jensen-Shannon distances and weighted average Renyi entropy...

Processing file: /Volumes/My Passport/收敛结果/step1/2/d4_g0005_收敛/depth_4_gamma_0.005_run_1/iteration_node_word_distributions.csv
Vocabulary size: 1490
Layer distribution: [(0, 1), (1, 19), (2, 67), (3, 141)]

Calculating JS distance and weighted average entropy for Layer 0 (1 nodes)
  - Node count: 1
  - Calculated node pairs: 0
  - Theoretical max node pairs: 0
  - Average JS distance: 0.0000
  - Total documents: 970
  - Document-weighted average Renyi entropy: 4.8932

Calculating JS distance and weighted average entropy for Layer 1 (19 nodes)
  - Node count: 19
  - Calculated node pairs: 171
  - Theoretical max node pairs: 171
  - Average JS distance: 0.3923
  - Total documents: 970
  - Document-weighted average Renyi entropy: 5.0848

Calculating JS distance and weighted average entropy for Layer 2 (67 nodes)
  - Node count: 67
  - Calculated node pairs: 2211
  - Theoretical max node pairs: 2211
  - Avera

In [None]:
def aggregate_layer_statistics_by_gamma(base_path="."):
    """
    Aggregate layer-wise JS distance and weighted entropy statistics by gamma value, 
    and generate a summary table at the same level as the run folder.
    """
    # Find all layer_average_js_distances.csv files
    pattern = os.path.join(base_path, "**", "layer_average_js_distances.csv")
    files = glob.glob(pattern, recursive=True)
    
    # Store all data and grouping information
    all_data = []
    gamma_experiment_groups = {}  # Used to store the parent directory for each gamma_experiment combination
    
    for file_path in files:
        folder_path = os.path.dirname(file_path)
        folder_name = os.path.basename(folder_path)
        parent_folder = os.path.dirname(folder_path)  # parent directory of the run folder
        
        # Extract gamma value from the folder name
        if 'gamma_0.001' in folder_name:
            if '2chains' in parent_folder: # Assuming '2条链' is '2chains' in the path
                gamma = 0.001
                experiment_type = '2chains'
            else:
                gamma = 0.001
                experiment_type = 'single'
        elif 'gamma_0.005' in folder_name:
            gamma = 0.005
            experiment_type = 'single'
        elif 'gamma_0.01' in folder_name:
            gamma = 0.01
            experiment_type = 'single'
        elif 'gamma_0.05' in folder_name:
            gamma = 0.05
            experiment_type = 'single'
        elif 'gamma_0.1' in folder_name:
            gamma = 0.1
            experiment_type = 'single'
        else:
            continue
        
        # Extract run number
        run_match = folder_name.split('_run_')
        if len(run_match) > 1:
            run_id = run_match[1]
        else:
            continue
        
        # Record the parent directory for the gamma_experiment combination
        group_key = f"{gamma}_{experiment_type}"
        if group_key not in gamma_experiment_groups:
            gamma_experiment_groups[group_key] = parent_folder
        
        try:
            df = pd.read_csv(file_path)
            
            for _, row in df.iterrows():
                all_data.append({
                    'gamma': gamma,
                    'experiment_type': experiment_type,
                    'run_id': run_id,
                    'layer': row['layer'],
                    'node_count': row['node_count'],
                    'avg_js_distance': row['avg_js_distance'],
                    'weighted_avg_renyi_entropy': row['weighted_avg_renyi_entropy'],
                    'total_documents': row['total_documents'],
                    'parent_folder': parent_folder
                })
                
        except Exception as e:
            print(f"Error reading file {file_path}: {e}")
    
    # Convert to DataFrame
    summary_df = pd.DataFrame(all_data)
    
    if summary_df.empty:
        print("No valid data found")
        return
    
    print("=" * 70)
    print("Summary Statistics for Each GAMMA Value by Layer")
    print("=" * 70)
    
    # Group by gamma, experiment_type, and parent_folder to generate summary files
    for (gamma, experiment_type), group_data in summary_df.groupby(['gamma', 'experiment_type']):
        parent_folder = group_data['parent_folder'].iloc[0]
        
        print(f"\nProcessing Gamma={gamma:.3f}, Experiment Type={'2 chains' if experiment_type == '2chains' else 'single chain'}")
        print(f"Output directory: {parent_folder}")
        
        # Calculate summary statistics for each layer
        layer_summary = group_data.groupby('layer').agg({
            'avg_js_distance': ['mean', 'std', 'count'],
            'weighted_avg_renyi_entropy': ['mean', 'std', 'count'],
            'node_count': ['mean', 'std'],
            'total_documents': 'mean',
            'run_id': lambda x: ', '.join(sorted(x.unique()))
        }).round(4)
        
        # Flatten column names
        layer_summary.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in layer_summary.columns]
        layer_summary = layer_summary.reset_index()
        
        # Rename columns to be more descriptive
        column_mapping = {
            'avg_js_distance_mean': 'avg_js_distance_mean',
            'avg_js_distance_std': 'avg_js_distance_std', 
            'avg_js_distance_count': 'run_count',
            'weighted_avg_renyi_entropy_mean': 'weighted_avg_renyi_entropy_mean',
            'weighted_avg_renyi_entropy_std': 'weighted_avg_renyi_entropy_std',
            'weighted_avg_renyi_entropy_count': 'entropy_run_count',
            'node_count_mean': 'avg_node_count',
            'node_count_std': 'node_count_std',
            'total_documents_mean': 'avg_total_documents',
            'run_id_<lambda>': 'included_runs'
        }
        
        for old_name, new_name in column_mapping.items():
            if old_name in layer_summary.columns:
                layer_summary = layer_summary.rename(columns={old_name: new_name})
        
        # Add gamma and experiment_type information
        layer_summary.insert(0, 'gamma', gamma)
        layer_summary.insert(1, 'experiment_type', experiment_type)
        
        # Save the summary results to the same level as the run folder
        if experiment_type == '2chains':
            output_filename = f'gamma_{gamma:.3f}_2chains_layer_summary.csv'
        else:
            output_filename = f'gamma_{gamma:.3f}_single_layer_summary.csv'
        
        output_path = os.path.join(parent_folder, output_filename)
        layer_summary.to_csv(output_path, index=False)
        
        print(f"  Saved summary file: {output_path}")
        print(f"  Included runs: {layer_summary['included_runs'].iloc[0] if 'included_runs' in layer_summary.columns else 'N/A'}")
        print(f"  Number of layers: {len(layer_summary)}")
        
        # Display brief statistics
        for _, row in layer_summary.iterrows():
            layer_num = int(row['layer'])
            js_mean = row['avg_js_distance_mean']
            js_std = row['avg_js_distance_std'] if 'avg_js_distance_std' in row else 0
            entropy_mean = row['weighted_avg_renyi_entropy_mean']
            entropy_std = row['weighted_avg_renyi_entropy_std'] if 'weighted_avg_renyi_entropy_std' in row else 0
            node_count = row['avg_node_count']
            run_count = int(row['run_count']) if 'run_count' in row else 0
            
            print(f"    Layer {layer_num}: JS={js_mean:.4f}(±{js_std:.4f}), Entropy={entropy_mean:.4f}(±{entropy_std:.4f}), Nodes={node_count:.1f}, runs={run_count}")
    
    # Generate overall comparison file (saved under base_path)
    print(f"\n" + "=" * 70)
    print("Generating overall comparison file")
    print("=" * 70)
    
    # Only analyze the cross-gamma comparison for single-chain experiments
    single_chain_data = summary_df[summary_df['experiment_type'] == 'single']
    
    if not single_chain_data.empty:
        overall_summary = single_chain_data.groupby(['gamma', 'layer']).agg({
            'avg_js_distance': ['mean', 'std'],
            'weighted_avg_renyi_entropy': ['mean', 'std'],
            'node_count': ['mean', 'std'],
            'run_id': 'count'
        }).round(4)
        
        # Flatten column names
        overall_summary.columns = ['_'.join(col).strip() for col in overall_summary.columns]
        overall_summary = overall_summary.reset_index()
        
        overall_output_path = os.path.join(base_path, 'gamma_layer_comparison.csv')
        overall_summary.to_csv(overall_output_path, index=False)
        print(f"Overall comparison file saved to: {overall_output_path}")
        
        # Display cross-gamma comparison
        for layer in sorted(single_chain_data['layer'].unique()):
            print(f"\nLayer {int(layer)} Cross-Gamma Comparison:")
            print("Gamma      JS Distance(±std)   Weighted Entropy(±std)   Node Count(±std)   Runs")
            print("-" * 75)
            
            layer_data = overall_summary[overall_summary['layer'] == layer]
            for _, row in layer_data.iterrows():
                gamma = row['gamma']
                js_mean = row['avg_js_distance_mean']
                js_std = row['avg_js_distance_std']
                entropy_mean = row['weighted_avg_renyi_entropy_mean']
                entropy_std = row['weighted_avg_renyi_entropy_std']
                node_mean = row['node_count_mean']
                node_std = row['node_count_std']
                run_count = int(row['run_id_count'])
                
                print(f"{gamma:6.3f}    {js_mean:6.4f}(±{js_std:5.4f})   {entropy_mean:6.4f}(±{entropy_std:5.4f})   {node_mean:6.1f}(±{node_std:4.1f})   {run_count:4d}")

In [None]:
# Execute summary analysis
base_path = "/Volumes/My Passport/收敛结果/step1/2"
print("=" * 70)
print("Start aggregating layer statistics for each Gamma value...")
print("=" * 70)
aggregate_layer_statistics_by_gamma(base_path)
print("=" * 70)
print("Summary analysis completed!")
print("=" * 70)

Start aggregating layer statistics for each Gamma value...
Summary Statistics for Each GAMMA Value by Layer

Processing Gamma=0.001, Experiment Type=single chain
Output directory: /Volumes/My Passport/收敛结果/step1/3/d3_g0001_2条链收敛
  Saved summary file: /Volumes/My Passport/收敛结果/step1/3/d3_g0001_2条链收敛/gamma_0.001_single_layer_summary.csv
  Included runs: 1, 2, 3
  Number of layers: 3
    Layer 0: JS=0.0000(±0.0000), Entropy=4.9490(±0.0399), Nodes=1.0, runs=5
    Layer 1: JS=0.4870(±0.0201), Entropy=5.0720(±0.1288), Nodes=42.2, runs=5
    Layer 2: JS=0.5069(±0.0121), Entropy=5.0823(±0.1317), Nodes=145.8, runs=5

Processing Gamma=0.005, Experiment Type=single chain
Output directory: /Volumes/My Passport/收敛结果/step1/3/d3_g0005_收敛
  Saved summary file: /Volumes/My Passport/收敛结果/step1/3/d3_g0005_收敛/gamma_0.005_single_layer_summary.csv
  Included runs: 1, 2, 3
  Number of layers: 3
    Layer 0: JS=0.0000(±0.0000), Entropy=4.9424(±0.0206), Nodes=1.0, runs=3
    Layer 1: JS=0.4915(±0.0111), Entrop

In [11]:
import pandas as pd
import glob
import os

base_path = "/Volumes/My Passport/收敛结果/step1/3"
pattern = os.path.join(base_path, "**", "result_layers.csv")
files = glob.glob(pattern, recursive=True)

all_rows = []
for file in files:
    df = pd.read_csv(file)
    # Add parameter information
    for col in ['depth', 'gamma', 'eta', 'alpha']:
        if col not in df.columns:
            # Extract parameters from the file path
            folder = os.path.dirname(file)
            if f"{col}_" in folder:
                try:
                    value = float(folder.split(f"{col}_")[1].split("_")[0])
                except:
                    value = None
                df[col] = value
            else:
                df[col] = None
    all_rows.append(df)

merged = pd.concat(all_rows, ignore_index=True)

# Group by parameter set and layer to calculate mean and standard deviation
group_cols = ['depth', 'gamma', 'eta', 'alpha', 'layer']
summary = merged.groupby(group_cols).agg({
    'entropy_wavg': ['mean', 'std'],
    'distinctiveness_wavg_jsd': ['mean', 'std'],
    'nodes_in_layer': ['mean', 'std'],
}).reset_index()

# Flatten the multi-level column names
summary.columns = ['_'.join(col).strip('_') for col in summary.columns]

summary.to_csv(os.path.join(base_path, "all_params_layer_mean.csv"), index=False)
print("Generated mean values per layer for each parameter set: all_params_layer_mean.csv")

Generated mean values per layer for each parameter set: all_params_layer_mean.csv


In [5]:
def calculate_parent_child_jsd(base_path=".", eta=0.1):
    """
    Calculate Jensen-Shannon distances between parent and child nodes across layers.
    
    Parameters:
    base_path: str, root directory path
    eta: float, Dirichlet smoothing parameter
    """
    # Find all iteration_node_word_distributions.csv files
    pattern = os.path.join(base_path, "**", "iteration_node_word_distributions.csv")
    files = glob.glob(pattern, recursive=True)
    
    for file_path in files:
        folder_path = os.path.dirname(file_path)
        print(f"\nProcessing file: {file_path}")
        
        try:
            # Read word distribution data
            word_df = pd.read_csv(file_path)
            word_df.columns = [col.strip("'\" ") for col in word_df.columns]
            
            # Get data from the last iteration
            max_iteration = word_df['iteration'].max()
            last_iteration_data = word_df[word_df['iteration'] == max_iteration]
            
            # Get the full vocabulary
            all_words = sorted(list(last_iteration_data['word'].dropna().unique()))
            print(f"Vocabulary size: {len(all_words)}")
            
            # Read the entropy file to get hierarchy information
            entropy_file = os.path.join(folder_path, 'corrected_renyi_entropy.csv')
            if not os.path.exists(entropy_file):
                print(f"Warning: Entropy file not found: {entropy_file}")
                continue
                
            entropy_df = pd.read_csv(entropy_file)
            
            # Build probability distribution for each node
            node_distributions = {}
            
            for node_id in entropy_df['node_id'].unique():
                # Get the word distribution for this node
                node_words = last_iteration_data[last_iteration_data['node_id'] == node_id]
                
                # Initialize count vector
                counts = np.zeros(len(all_words))
                word_to_idx = {word: idx for idx, word in enumerate(all_words)}
                
                # Fill in actual counts
                for _, row in node_words.iterrows():
                    word = row['word']
                    if pd.notna(word) and word in word_to_idx:
                        counts[word_to_idx[word]] = row['count']
                
                # Add Dirichlet smoothing
                smoothed_counts = counts + eta
                
                # Calculate probability distribution
                probabilities = smoothed_counts / np.sum(smoothed_counts)
                node_distributions[node_id] = probabilities
            
            # Calculate parent-child JSD
            parent_child_distances = []
            layer_transitions = []
            
            for _, row in entropy_df.iterrows():
                child_id = row['node_id']
                parent_id = row['parent_id']
                
                if pd.notna(parent_id) and parent_id in node_distributions and child_id in node_distributions:
                    parent_id = int(parent_id)
                    
                    # Get parent layer info
                    parent_info = entropy_df[entropy_df['node_id'] == parent_id].iloc[0]
                    parent_layer = parent_info['layer']
                    parent_doc_count = parent_info['document_count']
                    
                    child_layer = row['layer']
                    child_doc_count = row['document_count']
                    
                    # Calculate JSD
                    p_parent = node_distributions[parent_id]
                    p_child = node_distributions[child_id]
                    js_distance = jensen_shannon_distance(p_parent, p_child)
                    
                    parent_child_distances.append({
                        'parent_id': parent_id,
                        'parent_layer': parent_layer,
                        'child_id': child_id,
                        'child_layer': child_layer,
                        'js_distance': js_distance,
                        'parent_doc_count': parent_doc_count,
                        'child_doc_count': child_doc_count
                    })
            
            print(f"Calculated {len(parent_child_distances)} parent-child JSD pairs")
            
            # Calculate layer transition statistics
            if parent_child_distances:
                pc_df = pd.DataFrame(parent_child_distances)
                
                # Group by layer transition (parent_layer -> child_layer)
                transition_stats = pc_df.groupby(['parent_layer', 'child_layer']).agg({
                    'js_distance': ['mean', 'std', 'count'],
                    'parent_doc_count': 'sum',
                    'child_doc_count': 'sum'
                }).round(4)
                
                # Flatten column names
                transition_stats.columns = ['_'.join(col).strip() for col in transition_stats.columns]
                transition_stats = transition_stats.reset_index()
                
                # Rename columns
                transition_stats = transition_stats.rename(columns={
                    'js_distance_mean': 'avg_js_distance',
                    'js_distance_std': 'js_distance_std',
                    'js_distance_count': 'pair_count',
                    'parent_doc_count_sum': 'total_parent_docs',
                    'child_doc_count_sum': 'total_child_docs'
                })
                
                layer_transitions = transition_stats.to_dict('records')
            
            # Save detailed parent-child JSD results
            if parent_child_distances:
                pc_df = pd.DataFrame(parent_child_distances)
                output_path = os.path.join(folder_path, 'parent_child_js_distances.csv')
                pc_df.to_csv(output_path, index=False)
                print(f"Saved detailed parent-child JS distance results to: {output_path}")
            
            # Save layer transition summary
            if layer_transitions:
                lt_df = pd.DataFrame(layer_transitions)
                lt_output_path = os.path.join(folder_path, 'layer_transition_js_distances.csv')
                lt_df.to_csv(lt_output_path, index=False)
                print(f"Saved layer transition JS distance summary to: {lt_output_path}")
                
                # Print transition statistics
                print(f"\nLayer Transition Statistics:")
                for transition in layer_transitions:
                    parent_layer = int(transition['parent_layer'])
                    child_layer = int(transition['child_layer'])
                    avg_jsd = transition['avg_js_distance']
                    std_jsd = transition['js_distance_std']
                    pair_count = int(transition['pair_count'])
                    print(f"  Layer {parent_layer} -> Layer {child_layer}: JSD={avg_jsd:.4f}(±{std_jsd:.4f}), pairs={pair_count}")
            
            print("=" * 50)
            
        except Exception as e:
            import traceback
            print(f"Error processing file {file_path}: {str(e)}")
            print("Detailed traceback:")
            traceback.print_exc()

def jensen_shannon_distance(p, q):
    """
    Calculate the Jensen-Shannon distance between two probability distributions.
    
    Parameters:
    p, q: numpy arrays, probability distributions
    
    Returns:
    float: Jensen-Shannon distance
    """
    # Ensure probability distributions are normalized
    p = p / np.sum(p)
    q = q / np.sum(q)
    
    # Calculate the average distribution
    m = 0.5 * (p + q)
    
    # Calculate KL divergence (using natural logarithm)
    def kl_divergence(x, y):
        # Avoid log(0)
        mask = (x > 0) & (y > 0)
        if np.sum(mask) == 0:
            return 0.0
        return np.sum(x[mask] * np.log(x[mask] / y[mask]))
    
    # Calculate Jensen-Shannon divergence
    js_divergence = 0.5 * kl_divergence(p, m) + 0.5 * kl_divergence(q, m)
    
    # Convert to distance (square root)
    js_distance = np.sqrt(js_divergence)
    
    return js_distance

In [6]:
# Main function: Calculate parent-child Jensen-Shannon distances
import numpy as np
import os
import glob
import pandas as pd

base_path = "/Volumes/My Passport/收敛结果/step1/2"  # Root directory
eta = 0.1  # Dirichlet smoothing parameter

print("=" * 50)
print("Start calculating parent-child Jensen-Shannon distances...")
print("=" * 50)
calculate_parent_child_jsd(base_path, eta)
print("=" * 50)
print("Parent-child Jensen-Shannon distance calculation completed!")
print("=" * 50)

Start calculating parent-child Jensen-Shannon distances...

Processing file: /Volumes/My Passport/收敛结果/step1/2/d4_g0005_收敛/depth_4_gamma_0.005_run_1/iteration_node_word_distributions.csv
Vocabulary size: 1490
Calculated 227 parent-child JSD pairs
Saved detailed parent-child JS distance results to: /Volumes/My Passport/收敛结果/step1/2/d4_g0005_收敛/depth_4_gamma_0.005_run_1/parent_child_js_distances.csv
Saved layer transition JS distance summary to: /Volumes/My Passport/收敛结果/step1/2/d4_g0005_收敛/depth_4_gamma_0.005_run_1/layer_transition_js_distances.csv

Layer Transition Statistics:
  Layer 0 -> Layer 1: JSD=0.6029(±0.0416), pairs=19
  Layer 1 -> Layer 2: JSD=0.4641(±0.1583), pairs=67
  Layer 2 -> Layer 3: JSD=0.4623(±0.1051), pairs=141

Processing file: /Volumes/My Passport/收敛结果/step1/2/d4_g0005_收敛/depth_4_gamma_0.005_run_2/iteration_node_word_distributions.csv
Vocabulary size: 1490
Calculated 234 parent-child JSD pairs
Saved detailed parent-child JS distance results to: /Volumes/My Passpor

In [7]:
def aggregate_parent_child_jsd_by_gamma(base_path="."):
    """
    Aggregate parent-child JS distance data by gamma value across all runs,
    with proper weighted averages by individual parent-child pairs' child document counts relative to total 970 documents.
    """
    # Find all parent_child_js_distances.csv files (detailed data)
    pattern = os.path.join(base_path, "**", "parent_child_js_distances.csv")
    files = glob.glob(pattern, recursive=True)
    
    all_detailed_data = []
    
    # 假设总文档数为970（可以从数据中动态获取）
    TOTAL_DOCUMENTS = 970
    
    for file_path in files:
        folder_path = os.path.dirname(file_path)
        folder_name = os.path.basename(folder_path)
        parent_folder = os.path.dirname(folder_path)
        
        # Extract gamma value from the folder name
        if 'gamma_0.001' in folder_name:
            if '2chains' in parent_folder:
                gamma = 0.001
                experiment_type = '2chains'
            else:
                gamma = 0.001
                experiment_type = 'single'
        elif 'gamma_0.005' in folder_name:
            gamma = 0.005
            experiment_type = 'single'
        elif 'gamma_0.01' in folder_name:
            gamma = 0.01
            experiment_type = 'single'
        elif 'gamma_0.05' in folder_name:
            gamma = 0.05
            experiment_type = 'single'
        elif 'gamma_0.1' in folder_name:
            gamma = 0.1
            experiment_type = 'single'
        else:
            continue
        
        # Extract run number
        run_match = folder_name.split('_run_')
        if len(run_match) > 1:
            run_id = run_match[1]
        else:
            continue
        
        try:
            df = pd.read_csv(file_path)
            
            # Add run and gamma information to each detailed parent-child pair
            for _, row in df.iterrows():
                all_detailed_data.append({
                    'gamma': gamma,
                    'experiment_type': experiment_type,
                    'run_id': run_id,
                    'parent_id': row['parent_id'],
                    'parent_layer': row['parent_layer'],
                    'child_id': row['child_id'],
                    'child_layer': row['child_layer'],
                    'layer_transition': f"{int(row['parent_layer'])}->{int(row['child_layer'])}",
                    'js_distance': row['js_distance'],
                    'parent_doc_count': row['parent_doc_count'],
                    'child_doc_count': row['child_doc_count']
                })
                
        except Exception as e:
            print(f"Error reading file {file_path}: {e}")
    
    # Convert to DataFrame
    detailed_df = pd.DataFrame(all_detailed_data)
    
    if detailed_df.empty:
        print("No valid detailed parent-child JSD data found")
        return
    
    print("=" * 70)
    print("Parent-Child JSD Data Collection by GAMMA Value (weighted by child_doc_count/970)")
    print("=" * 70)
    
    # Calculate properly weighted averages by gamma and layer transition
    weighted_summary = []
    
    for (gamma, experiment_type), gamma_group in detailed_df.groupby(['gamma', 'experiment_type']):
        for transition, transition_group in gamma_group.groupby('layer_transition'):
            # 计算基于全局970文档的加权平均
            # 每个child的权重 = child_doc_count / 970
            weights = transition_group['child_doc_count'] / TOTAL_DOCUMENTS
            
            # 加权平均JSD = sum(js_distance * weight) / sum(weight)
            # 但这等价于：sum(js_distance * child_doc_count) / sum(child_doc_count)
            # 因为970是常数，会被约掉
            
            # 方法1：使用归一化权重
            total_weight = weights.sum()
            if total_weight > 0:
                weighted_avg_jsd_normalized = (transition_group['js_distance'] * weights).sum() / total_weight
            else:
                weighted_avg_jsd_normalized = 0.0
            
            # 方法2：直接使用文档数作为权重（与方法1结果相同）
            total_child_docs = transition_group['child_doc_count'].sum()
            if total_child_docs > 0:
                weighted_avg_jsd_direct = (transition_group['js_distance'] * transition_group['child_doc_count']).sum() / total_child_docs
            else:
                weighted_avg_jsd_direct = 0.0
            
            # Calculate simple average for comparison
            simple_avg_jsd = transition_group['js_distance'].mean()
            
            # Get additional statistics
            parent_layer = transition_group['parent_layer'].iloc[0]
            child_layer = transition_group['child_layer'].iloc[0]
            pair_count = len(transition_group)
            run_count = len(transition_group['run_id'].unique())
            runs_included = ', '.join(sorted(transition_group['run_id'].unique()))
            
            # 计算权重占比（相对于970）
            weight_proportion = total_child_docs / TOTAL_DOCUMENTS
            
            weighted_summary.append({
                'gamma': gamma,
                'experiment_type': experiment_type,
                'layer_transition': transition,
                'parent_layer': parent_layer,
                'child_layer': child_layer,
                'weighted_avg_jsd': weighted_avg_jsd_direct,  # 两种方法结果相同
                'simple_avg_jsd': simple_avg_jsd,
                'total_child_docs': total_child_docs,
                'weight_proportion_of_970': weight_proportion,  # 权重占970的比例
                'total_pairs': pair_count,
                'run_count': run_count,
                'runs_included': runs_included,
                'jsd_std': transition_group['js_distance'].std(),
                'child_doc_count_mean': transition_group['child_doc_count'].mean(),
                'child_doc_count_std': transition_group['child_doc_count'].std()
            })
    
    # Save detailed individual pair data
    detailed_output_path = os.path.join(base_path, 'all_runs_detailed_parent_child_jsd_by_gamma.csv')
    detailed_df.to_csv(detailed_output_path, index=False)
    print(f"Saved complete detailed parent-child JSD data to: {detailed_output_path}")
    
    # Save weighted summary
    weighted_df = pd.DataFrame(weighted_summary)
    weighted_output_path = os.path.join(base_path, 'properly_weighted_parent_child_jsd_by_gamma.csv')
    weighted_df.to_csv(weighted_output_path, index=False)
    print(f"Saved properly weighted parent-child JSD summary to: {weighted_output_path}")
    
    # Print summary statistics for each gamma
    for gamma in sorted(detailed_df['gamma'].unique()):
        gamma_data = detailed_df[detailed_df['gamma'] == gamma]
        experiment_types = gamma_data['experiment_type'].unique()
        
        for exp_type in experiment_types:
            exp_data = gamma_data[gamma_data['experiment_type'] == exp_type]
            run_count = len(exp_data['run_id'].unique())
            transition_count = len(exp_data['layer_transition'].unique())
            total_pairs = len(exp_data)
            
            print(f"\nGamma {gamma:.3f} ({'2 chains' if exp_type == '2chains' else 'single chain'}):")
            print(f"  - Runs: {run_count}")
            print(f"  - Layer transitions: {transition_count}")
            print(f"  - Total parent-child pairs: {total_pairs}")
            
            # Show transition summary with properly weighted averages
            print(f"  - Layer transition summary (weighted by child_doc_count/970):")
            gamma_weighted = weighted_df[(weighted_df['gamma'] == gamma) & (weighted_df['experiment_type'] == exp_type)]
            
            for _, row in gamma_weighted.iterrows():
                transition = row['layer_transition']
                weighted_jsd = row['weighted_avg_jsd']
                simple_jsd = row['simple_avg_jsd']
                total_child_docs = int(row['total_child_docs'])
                weight_prop = row['weight_proportion_of_970']
                total_pairs_trans = int(row['total_pairs'])
                run_count_trans = int(row['run_count'])
                jsd_std = row['jsd_std']
                
                print(f"    {transition}: Weighted JSD={weighted_jsd:.4f}, Simple JSD={simple_jsd:.4f}(±{jsd_std:.4f})")
                print(f"      Child docs={total_child_docs}, Weight prop={weight_prop:.3f}, pairs={total_pairs_trans}, runs={run_count_trans}")
                
                # Show the difference between weighted and simple averages
                diff = abs(weighted_jsd - simple_jsd)
                if diff > 0.0001:  # Only show if there's a meaningful difference
                    print(f"      -> Difference: {diff:.4f} (Weighted vs Simple)")
    
    print(f"\n" + "=" * 70)
    print("Data collection with properly weighted averages (relative to 970 total documents) completed!")
    print("=" * 70)

In [8]:
# Execute parent-child JSD data collection with weighted averages
import pandas as pd
import numpy as np
import os
import glob

base_path = "/Volumes/My Passport/收敛结果/step1/2"
print("=" * 70)
print("Start collecting parent-child JSD data with weighted averages for each Gamma value...")
print("=" * 70)
aggregate_parent_child_jsd_by_gamma(base_path)
print("=" * 70)
print("Parent-child JSD data collection with weighted averages completed!")
print("=" * 70)

Start collecting parent-child JSD data with weighted averages for each Gamma value...
Parent-Child JSD Data Collection by GAMMA Value (weighted by child_doc_count/970)
Saved complete detailed parent-child JSD data to: /Volumes/My Passport/收敛结果/step1/2/all_runs_detailed_parent_child_jsd_by_gamma.csv
Saved properly weighted parent-child JSD summary to: /Volumes/My Passport/收敛结果/step1/2/properly_weighted_parent_child_jsd_by_gamma.csv

Gamma 0.001 (single chain):
  - Runs: 3
  - Layer transitions: 3
  - Total parent-child pairs: 728
  - Layer transition summary (weighted by child_doc_count/970):
    0->1: Weighted JSD=0.4798, Simple JSD=0.6105(±0.0457)
      Child docs=2910, Weight prop=3.000, pairs=51, runs=3
      -> Difference: 0.1308 (Weighted vs Simple)
    1->2: Weighted JSD=0.6074, Simple JSD=0.4998(±0.1316)
      Child docs=2910, Weight prop=3.000, pairs=204, runs=3
      -> Difference: 0.1076 (Weighted vs Simple)
    2->3: Weighted JSD=0.5523, Simple JSD=0.4772(±0.1064)
      Chil