In [1]:
""" 0. set-up part:  import necessary libraries and set up environment """

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from collections import Counter, defaultdict
import numpy as np
import math
import copy
import itertools
import matplotlib.pyplot as plt
import matplotlib as mpl

import joblib
from joblib import Parallel, delayed
from threading import Thread

import os
import pickle
import time

import operator
from functools import reduce
import json
import cProfile

import gensim
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary

# download nltk data once time
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('omw-1.4')
# nltk.download('punkt_tab')
# nltk.download('averaged_perceptron_tagger_eng')

#  chinese character support in matplotlib
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS' 'SimHei' 'DejaVu Sans']  
plt.rcParams['axes.unicode_minus'] = False

In [2]:
""" 1.1 Data Preprocessing: load data, clean text, lemmatization, remove low-frequency words"""

# Map POS tags to WordNet format， Penn Treebank annotation: fine-grained (45 tags), WordNet annotation: coarse-grained (4 tags: a, v, n, r)
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return 'a'  # 形容词
    elif treebank_tag.startswith('V'):
        return 'v'  # 动词
    elif treebank_tag.startswith('N'):
        return 'n'  # 名词
    elif treebank_tag.startswith('R'):
        return 'r'  # 副词
    else:
        return 'n'  # 默认名词

# Text cleaning and lemmatization preprocessing function
def clean_and_lemmatize(text):
    if pd.isnull(text):
        return []
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)  # Remove non-alphabetic characters using regex
    tokens = word_tokenize(text)
    tokens = [w for w in tokens if w not in stop_words]
    pos_tags = pos_tag(tokens)
    lemmatized = [lemmatizer.lemmatize(w, get_wordnet_pos(pos)) for w, pos in pos_tags]
    return lemmatized  

#-----------------Load data----------------
data = pd.read_excel('./data/raw/papers_CM.xlsx', usecols=['PaperID', 'Abstract', 'Keywords', 'Year'])

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# clean and lemmatize the abstracts
data['Lemmatized_Tokens'] = data['Abstract'].apply(clean_and_lemmatize)

# count word frequencies
all_tokens = [word for tokens in data['Lemmatized_Tokens'] for word in tokens]
word_counts = Counter(all_tokens)

# set a minimum frequency threshold for valid words
min_freq = 10
valid_words = set([word for word, freq in word_counts.items() if freq >= min_freq])

# remove rare words based on frequency threshold
def remove_rare_words(tokens):
    return [word for word in tokens if word in valid_words]

data['Filtered_Tokens'] = data['Lemmatized_Tokens'].apply(remove_rare_words)

# join tokens back into cleaned abstracts
data['Cleaned_Abstract'] = data['Filtered_Tokens'].apply(lambda x: " ".join(x))

# create a cleaned DataFrame with relevant columns
cleaned_data = data[['PaperID', 'Year', 'Cleaned_Abstract']]
cleaned_data = cleaned_data[~(cleaned_data['PaperID'] == 57188)] # this paper has no abstract
cleaned_data = cleaned_data.reset_index(drop=True) 
cleaned_data.insert(0, 'Document_ID', range(len(cleaned_data))) 
abstract_list = cleaned_data['Cleaned_Abstract'].apply(lambda x: x.split()).tolist()

corpus = {doc_id: abstract_list for doc_id, abstract_list in enumerate(abstract_list)}
# cleaned_data.to_csv('./data/processed/cleaned_data.xlsx', index=False, encoding='utf-8-sig')

In [3]:
# ===== General Function Library: Applicable to All Topic Models =====

import math, random
import tomotopy as tp
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary

print("🔧 General Topic Model Function Library")
print("=" * 60)

# ===== General Parameter Definitions =====
# These parameters can be shared by all models
ALPHA = 0.1          # Dirichlet prior parameter for document-topic distribution
ETA = 0.05           # Dirichlet prior parameter for topic-word distribution
K_LEAF = 252         # Number of topics (aligned with the number of hLDA leaf nodes)

MAX_ITERS = 1000     # Maximum number of training iterations
INTERVAL = 100       # Interval for checking convergence
PPL_TOL = 0.01       # Perplexity change tolerance (1%)
COH_TOL = 0.005      # Coherence change tolerance
STABLE_K = 5         # Number of consecutive times the early stopping condition is met
B1, B2 = 59, 4       # Branching factor parameters for PAM/hPAM models

print(f"✅ General parameters defined: K={K_LEAF}, α={ALPHA}, η={ETA}")

# ===== Enhanced Coherence Calculation Function (Supports Multiple Metrics Including NPMI) =====
def calculate_multiple_coherence_metrics(mdl, corpus_docs, metrics=['c_v', 'c_npmi'], fast_mode=True, top_n=5):
    """
    Enhanced coherence calculation function - supports multiple coherence metrics including NPMI
    
    Supported coherence metrics:
    - c_v: Vector space-based coherence (default)
    - c_npmi: Normalized Pointwise Mutual Information (NPMI)
    
    Supported model types:
    - Single-layer models: LDA, CTM
    - Hierarchical models: hLDA, PAM, hPAM
    - Non-parametric models: HDP
    
    Returns: dict containing various coherence metrics
    """
    try:
        docs = list(corpus_docs)
        dictionary = Dictionary(docs)
        
        # Check if it is a hierarchical model
        model_type_str = str(type(mdl))
        is_hierarchical = hasattr(mdl, 'depth') or 'HLDA' in model_type_str or 'PAM' in model_type_str
        
        if is_hierarchical:
            # Hierarchical model: calculate coherence by level, weighted average
            metrics_results = {metric: [] for metric in metrics}
            layer_weights = []
            
            try:
                for level in range(getattr(mdl, 'depth', 3)):
                    level_topics = []
                    level_doc_count = 0
                    
                    # Iterate through all nodes of this level
                    for k in range(getattr(mdl, 'k', 100)):
                        try:
                            topic_words = mdl.get_topic_words(k, top_n=top_n)
                            if topic_words:
                                words = [word for word, prob in topic_words]
                                level_topics.append(words)
                                level_doc_count += 1
                        except:
                            continue
                    
                    if level_topics:
                        # Calculate coherence for each metric
                        for metric in metrics:
                            try:
                                cm = CoherenceModel(
                                    topics=level_topics,
                                    texts=docs,
                                    dictionary=dictionary,
                                    coherence=metric,
                                    processes=1
                                )
                                score = cm.get_coherence()
                                if score and not math.isnan(score):
                                    metrics_results[metric].append(score)
                                else:
                                    metrics_results[metric].append(0.1)
                            except Exception as e:
                                print(f"Hierarchical model {metric} calculation warning: {e}")
                                metrics_results[metric].append(0.1)
                        
                        layer_weights.append(level_doc_count)
            except:
                pass
            
            # Calculate weighted average
            final_results = {}
            for metric in metrics:
                if metrics_results[metric] and layer_weights:
                    total_weight = sum(layer_weights)
                    if total_weight > 0:
                        weighted_avg = sum(score * w for score, w in zip(metrics_results[metric], layer_weights)) / total_weight
                        final_results[metric] = weighted_avg
                    else:
                        final_results[metric] = 0.1
                else:
                    final_results[metric] = 0.1
            
            return final_results
        
        # Single-layer model: calculate coherence for all topics
        num_topics = getattr(mdl, 'k', None) or getattr(mdl, 'num_topics', None) or 100
        topics = []
        
        for k in range(num_topics):
            try:
                topic_words = mdl.get_topic_words(k, top_n=top_n)
                if topic_words:
                    words = [word for word, prob in topic_words]
                    topics.append(words)
            except:
                continue
        
        if not topics:
            return {metric: 0.1 for metric in metrics}
        
        # Calculate coherence for each metric
        final_results = {}
        for metric in metrics:
            try:
                cm = CoherenceModel(
                    topics=topics,
                    texts=docs,
                    dictionary=dictionary,
                    coherence=metric,
                    processes=1
                )
                score = cm.get_coherence()
                final_results[metric] = score if score and not math.isnan(score) else 0.1
            except Exception as e:
                print(f"Single-layer model {metric} calculation warning: {e}")
                final_results[metric] = 0.1
        
        return final_results
        
    except Exception as e:
        print(f"Coherence calculation error: {e}")
        return {metric: 0.1 for metric in metrics}

def calculate_gensim_coherence(mdl, corpus_docs, coherence='c_v', fast_mode=True):
    """
    Backward-compatible coherence calculation function
    
    Now internally calls the enhanced version but maintains the original interface
    """
    results = calculate_multiple_coherence_metrics(mdl, corpus_docs, [coherence], fast_mode)
    return results.get(coherence, 0.1)

# ===== General Effective Topic Count Calculation Function =====
def effective_k(mdl, top_n=1, min_prob=1e-12):
    """
    General effective topic count estimation function - applicable to all topic models
    
    Supported model types:
    - Parametric models: LDA, CTM, hLDA, PAM, hPAM (fixed number of topics)
    - Non-parametric models: HDP (dynamic number of topics)
    """
    kmax = getattr(mdl, 'k', None) or getattr(mdl, 'num_topics', None) or 0
    eff = 0
    for k in range(kmax):
        try:
            tw = mdl.get_topic_words(k, top_n=top_n)
            if tw and tw[0][1] > min_prob: 
                eff += 1
        except:
            continue
    return eff

# ===== Enhanced Auto-Training Function (Supports Multiple Coherence Metrics) =====
def auto_train_with_multiple_metrics(mdl, docs, seed=0, max_iters=MAX_ITERS, interval=INTERVAL,
                                   ppl_tol=PPL_TOL, coh_tol=COH_TOL, stable_k=STABLE_K,
                                   coherence_measures=['c_v', 'c_npmi'], name=None, fast_coherence=True):
    """
    Enhanced auto-training function - supports multiple coherence metrics
    
    Features:
    - Early stopping mechanism (based on perplexity and primary coherence stability)
    - Training history logging
    - Multiple coherence metric calculation (C_v, NPMI)
    - Supports custom training parameters
    
    Parameters:
    - coherence_measures: List of coherence metrics, e.g., ['c_v', 'c_npmi']
    
    Returns:
    - model: Trained model
    - history: Training history [(iter, llpw, ppl, coherence_dict), ...]
    """
    random.seed(seed)
    for d in docs: 
        mdl.add_doc(d)

    prev_ppl = prev_coh = None
    hold_ppl = hold_coh = 0
    hist = []  # (iter, ll_per_word, ppl, coherence_dict)
    
    primary_coherence = coherence_measures[0]  # Use the first metric as the criterion for early stopping

    for it in range(interval, max_iters + 1, interval):
        mdl.train(interval)
        llpw = mdl.ll_per_word
        ppl  = math.exp(-llpw)
        
        print(f"[{name or mdl.__class__.__name__}] iter={it:4d} llpw={llpw:.4f} ppl={ppl:.2f} Calculating multiple coherence metrics...")
        
        # Calculate multiple coherence metrics
        coherence_dict = calculate_multiple_coherence_metrics(mdl, docs, coherence_measures, fast_mode=fast_coherence)
        primary_coh = coherence_dict.get(primary_coherence, 0.1)
        
        hist.append((it, llpw, ppl, coherence_dict))
        
        # Print results
        coh_str = ", ".join([f"{metric}={score:.4f}" for metric, score in coherence_dict.items()])
        print(f"[{name or mdl.__class__.__name__}] iter={it:4d} llpw={llpw:.4f} ppl={ppl:.2f} {coh_str}")

        # Early stopping criterion (based on the primary coherence metric)
        if prev_ppl is not None:
            if abs(prev_ppl - ppl)/max(prev_ppl, 1e-12) < ppl_tol:  
                hold_ppl += 1
            else:
                hold_ppl = 0
        if prev_coh is not None:
            if abs(prev_coh - primary_coh) < coh_tol: 
                hold_coh += 1
            else: 
                hold_coh = 0
        prev_ppl, prev_coh = ppl, primary_coh

        if hold_ppl >= stable_k and hold_coh >= stable_k:
            print(f"[{name or mdl.__class__.__name__}] early stop at iter {it}")
            break
    return mdl, hist

def auto_train(mdl, docs, seed=0, max_iters=MAX_ITERS, interval=INTERVAL,
               ppl_tol=PPL_TOL, coh_tol=COH_TOL, stable_k=STABLE_K,
               coherence_measure='c_v', name=None, fast_coherence=True):
    """
    Backward-compatible auto-training function
    
    Now internally calls the enhanced version but maintains the original interface
    """
    enhanced_model, enhanced_hist = auto_train_with_multiple_metrics(
        mdl, docs, seed, max_iters, interval, ppl_tol, coh_tol, stable_k,
        [coherence_measure], name, fast_coherence
    )
    
    # Convert history format for compatibility
    compatible_hist = []
    for it, llpw, ppl, coh_dict in enhanced_hist:
        primary_coh = coh_dict.get(coherence_measure, 0.1)
        compatible_hist.append((it, llpw, ppl, primary_coh))
    
    return enhanced_model, compatible_hist

# ===== General Topic Analysis Function =====
def analyze_model_topics(model, model_name="Model", top_words=5, min_prob=0.01, max_display=10):
    """
    General topic analysis function - applicable to all topic models
    
    Functionality:
    - Extracts active topics
    - Displays topic words and weights
    - Calculates topic activity rate
    """
    print(f"\n🔍 {model_name} Topic Analysis (showing top {top_words} words):")
    print("=" * 80)
    
    active_topics = 0
    topic_info = []
    
    num_topics = getattr(model, 'k', None) or getattr(model, 'num_topics', None) or 0
    
    for k in range(num_topics):
        try:
            topic_words = model.get_topic_words(k, top_n=top_words)
            if topic_words and topic_words[0][1] > min_prob:
                active_topics += 1
                words_str = ", ".join([f"{word}({prob:.3f})" for word, prob in topic_words[:5]])
                topic_info.append((k, topic_words[0][1], words_str))
                
                if active_topics <= max_display:
                    print(f"Topic {k:3d} (weight:{topic_words[0][1]:.3f}): {words_str}")
        except:
            continue
    
    if active_topics > max_display:
        print(f"... (and {active_topics - max_display} more active topics)")
    
    print(f"\n📊 {model_name} Topic Statistics:")
    print(f"   - Active topics: {active_topics}/{num_topics}")
    print(f"   - Topic activity rate: {active_topics/num_topics*100:.1f}%")
    
    return topic_info

# ===== Multiple Coherence Metrics Evaluation Function =====
def evaluate_model_with_multiple_coherence(model, docs, model_name="Model", 
                                         coherence_metrics=['c_v', 'c_npmi'],
                                         top_words_for_coherence=5):
    """
    Evaluate a model using multiple coherence metrics
    
    Parameters:
    - model: Trained topic model
    - docs: List of documents
    - model_name: Name of the model
    - coherence_metrics: List of coherence metrics to calculate
    - top_words_for_coherence: Number of topic words to use for coherence calculation
    
    Returns:
    - dict: A dictionary containing all coherence metrics
    """
    print(f"\n🔍 {model_name} Multiple Coherence Metrics Evaluation:")
    print("=" * 80)
    

    # Calculate multiple coherence metrics
    coherence_results = calculate_multiple_coherence_metrics(
        model, docs, coherence_metrics, top_n=top_words_for_coherence
    )
    
    print(f"📊 Coherence Metrics Results:")
    for metric, score in coherence_results.items():
        metric_desc = {
            'c_v': 'C_v (Vector Space Coherence)',
            'c_npmi': 'NPMI (Normalized Pointwise Mutual Information)'
        }
        description = metric_desc.get(metric, metric)
        print(f"   📈 {description}: {score:.6f}")
    
    # Calculate basic metrics
    effective_topics = effective_k(model)
    total_topics = getattr(model, 'k', None) or getattr(model, 'num_topics', None) or 0
    
    print(f"\n📊 Basic Metrics:")
    print(f"   📈 Effective topics: {effective_topics}")
    print(f"   📈 Total topics: {total_topics}")
    if total_topics > 0:
        print(f"   📈 Topic utilization: {effective_topics/total_topics*100:.1f}%")
    
    # Metric explanations
    print(f"\n💡 Coherence Metric Explanations:")
    print(f"   🎯 C_v: Range [0,1], higher is better, based on word vector similarity")
    print(f"   🎯 NPMI: Range [-1,1], higher is better, Normalized Pointwise Mutual Information")
    
    # Composite score (optional)
    # Normalize each metric and calculate a weighted average
    normalized_scores = {}
    
    # C_v and NPMI: higher is better
    normalized_scores['c_v'] = max(0, coherence_results.get('c_v', 0))
    if 'c_npmi' in coherence_results:
        # NPMI range is [-1,1], convert to [0,1]
        npmi_score = coherence_results['c_npmi']
        normalized_scores['c_npmi'] = (npmi_score + 1) / 2
    
    
    if normalized_scores:
        composite_score = np.mean(list(normalized_scores.values()))
        print(f"\n🏆 Composite Coherence Score: {composite_score:.4f} (average of normalized scores)")
    
    # Return full results
    result = {
        'coherence_metrics': coherence_results,
        'effective_topics': effective_topics,
        'total_topics': total_topics,
        'topic_utilization': effective_topics/total_topics if total_topics > 0 else 0,
        'normalized_scores': normalized_scores,
        'composite_score': composite_score if normalized_scores else 0
    }
    
    return result

🔧 General Topic Model Function Library
✅ General parameters defined: K=252, α=0.1, η=0.05


In [5]:
# ===== Enhanced LDA Model Training (with Multiple Coherence Metrics Support) =====

def train_lda_with_multiple_coherence_metrics(docs, seed=42, max_iters=1000, K=252, detailed_output=True):
    """
    Train an LDA model using multiple coherence metrics.
    
    This function demonstrates how to use multiple coherence metrics (including NPMI)
    during the training process.
    
    Parameters:
    - docs: List of documents
    - seed: Random seed
    - max_iters: Maximum number of training iterations
    - K: Number of topics
    - detailed_output: Whether to output detailed information
    
    Returns:
    - model: Trained LDA model
    - metrics: A dictionary of evaluation metrics including multiple coherence scores
    - history: Training history
    """
    
    print(f"🚀 Training LDA model with multiple coherence metrics (K={K}, seed={seed})")
    print(f"📊 Data Overview:")
    print(f"   - Number of documents: {len(docs)}")
    print(f"   - Average document length: {np.mean([len(doc) for doc in docs]):.1f} words")
    print(f"   - Vocabulary size: {len(set(word for doc in docs for word in doc))}")
    
    print(f"\n🔧 Model Parameters:")
    print(f"   - Number of topics K = {K}")
    print(f"   - Alpha = {ALPHA}")
    print(f"   - Eta = {ETA}")
    print(f"   - Max iterations = {max_iters}")
    print(f"   - Coherence metrics = ['c_v', 'c_npmi']")
    
    # Create LDA model
    print(f"\n🚀 Creating LDA model...")
    lda_model = tp.LDAModel(k=K, alpha=ALPHA, eta=ETA, seed=seed)
    
    # Use the enhanced multi-metric training function
    print(f"\n🎯 Starting training with multiple coherence metrics...")
    start_time = time.time()
    
    trained_model, training_history = auto_train_with_multiple_metrics(
        lda_model, docs, 
        seed=seed, 
        max_iters=max_iters,
        interval=100,  # Check every 100 iterations
        coherence_measures=['c_v', 'c_npmi'],  # Use multiple coherence metrics
        name='LDA-Multi-Coherence'
    )
    
    training_time = time.time() - start_time
    print(f"\n✅ Training complete! Time taken: {training_time:.1f} seconds")
    
    # Extract final metrics
    if training_history:
        final_iter, final_llpw, final_ppl, final_coherence_dict = training_history[-1]
        
        # Calculate additional metrics
        effective_topics = effective_k(trained_model)
        
        # Perform a full evaluation with multiple coherence metrics
        print(f"\n🔍 Performing full evaluation with multiple coherence metrics...")
        full_evaluation = evaluate_model_with_multiple_coherence(
            trained_model, docs, "LDA", 
            coherence_metrics=['c_v', 'c_npmi'],
            top_words_for_coherence=5
        )
        
        # Build the complete evaluation metrics
        metrics = {
            'iterations': final_iter,
            'log_likelihood_per_word': final_llpw,
            'perplexity': final_ppl,
            'effective_topics': effective_topics,
            'training_time_seconds': training_time,
            'convergence_iterations': len(training_history),
            
            # Multiple coherence metrics
            'coherence_c_v': full_evaluation['coherence_metrics'].get('c_v', 0),
            'coherence_npmi': full_evaluation['coherence_metrics'].get('c_npmi', 0),
            
            # Composite score
            'composite_coherence_score': full_evaluation.get('composite_score', 0),
            'topic_utilization': full_evaluation.get('topic_utilization', 0),
            
            # Full evaluation results
            'full_evaluation': full_evaluation
        }
        
        if detailed_output:
            print(f"\n📈 Final Evaluation Results:")
            print(f"   - Training iterations: {final_iter}")
            print(f"   - Log-likelihood per word: {final_llpw:.6f}")
            print(f"   - Perplexity: {final_ppl:.2f}")
            print(f"   - Effective topics: {effective_topics}/{K}")
            print(f"   - Topic utilization: {metrics['topic_utilization']*100:.1f}%")
            
            print(f"\n📊 Multiple Coherence Metrics Comparison:")
            print(f"   - C_v (Vector Space): {metrics['coherence_c_v']:.6f}")
            print(f"   - NPMI (Pointwise Mutual Information): {metrics['coherence_npmi']:.6f}")
            print(f"   - Composite Score: {metrics['composite_coherence_score']:.6f}")
            
            # Display training process overview
            if len(training_history) > 1:
                first_iter, first_llpw, first_ppl, first_coh_dict = training_history[0]
                first_cv = first_coh_dict.get('c_v', 0)
                first_npmi = first_coh_dict.get('c_npmi', 0)
                final_cv = final_coherence_dict.get('c_v', 0)
                final_npmi = final_coherence_dict.get('c_npmi', 0)
                
                print(f"\n📊 Training Improvement:")
                print(f"   - Perplexity improvement: {first_ppl:.2f} → {final_ppl:.2f} (↓{first_ppl-final_ppl:.2f})")
                print(f"   - C_v improvement: {first_cv:.4f} → {final_cv:.4f} (↑{final_cv-first_cv:.4f})")
                print(f"   - NPMI improvement: {first_npmi:.4f} → {final_npmi:.4f} (↑{final_npmi-first_npmi:.4f})")
                print(f"   - Log-likelihood improvement: {first_llpw:.6f} → {final_llpw:.6f} (↑{final_llpw-first_llpw:.6f})")
    else:
        print("❌ No training history recorded")
        metrics = {}
    
    return trained_model, metrics, training_history

print("🎯 Enhanced LDA training function defined")
print("💡 Features:")
print("   - Supports simultaneous calculation of multiple coherence metrics")
print("   - Includes metrics like NPMI, C_v")
print("   - Provides a composite coherence score")
print("   - Detailed training process tracking")
print("   - Maintains compatibility with the original interface")

print(f"\n📋 Usage:")
print(f"   model, metrics, history = train_lda_with_multiple_coherence_metrics(docs, K=252)")
print("=" * 60)

🎯 Enhanced LDA training function defined
💡 Features:
   - Supports simultaneous calculation of multiple coherence metrics
   - Includes metrics like NPMI, C_v
   - Provides a composite coherence score
   - Detailed training process tracking
   - Maintains compatibility with the original interface

📋 Usage:
   model, metrics, history = train_lda_with_multiple_coherence_metrics(docs, K=252)


In [4]:
# ===== Guide to Using NPMI and Multiple Coherence Metrics in hLDA Comparative Studies =====

print("🎯 Guide to Using NPMI and Multiple Coherence Metrics in hLDA Comparative Studies")
print("=" * 80)

def demonstrate_npmi_usage_for_hlda_comparison():
    """
    Demonstrates how to use NPMI and other coherence metrics in hLDA comparative studies.
    """
    
    print("📋 Guide for Comparative Study Based on Your hLDA Parameters:")
    print("=" * 60)
    
    hlda_params = {
        'depth': 3,
        'gamma': 0.05,
        'eta': 0.05,
        'alpha': 0.1,
        'leaf_nodes': 252,
        'branching': 'Layer 0-1: 59, Layer 1-2: 4'
    }
    
    print("🎯 Your hLDA Configuration:")
    for param, value in hlda_params.items():
        print(f"   - {param}: {value}")
    
    print(f"\n📊 Recommended Coherence Metric Comparison Schemes:")
    
    coherence_strategies = {
        'primary_metrics': {
            'description': 'Primary Comparison Metrics',
            'metrics': ['c_v', 'c_npmi'],
            'reason': 'C_v is a standard metric, NPMI provides a pointwise mutual information perspective'
        },
        'comprehensive_metrics': {
            'description': 'Comprehensive Comparison Metrics',
            'metrics': ['c_v', 'c_npmi'],
            'reason': 'Covers coherence measures with different theoretical foundations'
        },
        'research_focused': {
            'description': 'Research-Focused Metrics',
            'metrics': ['c_v', 'c_npmi'],
            'reason': 'Balances computational efficiency and evaluation comprehensiveness'
        }
    }
    
    for strategy_name, details in coherence_strategies.items():
        print(f"\n📈 {details['description']}:")
        print(f"   - Metrics: {details['metrics']}")
        print(f"   - Rationale: {details['reason']}")
    
    print(f"\n🔧 Implementation Recommendations:")
    
    implementation_steps = [
        "1. Evaluate all models (LDA, PAM, CTM) using the same coherence metrics.",
        "2. Focus on the comparative results of C_v and NPMI.",
        "3. Analyze the consistency of model rankings under different metrics.",
        "4. Consider the trade-off between coherence metrics and perplexity.",
        "5. Record the performance differences of each model under different metrics."
    ]
    
    for step in implementation_steps:
        print(f"   {step}")
    
    print(f"\n💡 Comparative Analysis Suggestions:")
    
    analysis_suggestions = [
        "📊 Multi-angle evaluation: Use both C_v and NPMI to avoid single-metric bias.",
        "🎯 Hierarchical comparison: Compare the coherence differences between hLDA's structure and other models.",
        "⚖️  Trade-off analysis: Analyze the relationship between coherence improvement and computational complexity.",
        "📈 Stability check: Use multiple random seeds to verify the stability of coherence metrics.",
        "🔍 Topic quality: Combine with qualitative analysis to validate the practical significance of coherence metrics."
    ]
    
    for suggestion in analysis_suggestions:
        print(f"   {suggestion}")
    
    print(f"\n🎯 Specific Implementation Code Template:")
    
    code_template = '''
# Compare multiple coherence metrics across different models
models_to_compare = ['LDA', 'PAM', 'CTM', 'hLDA']
coherence_results = {}

for model_name in models_to_compare:
    if model_name == 'hLDA':
        # Get hLDA results from step3
        coherence_results[model_name] = get_hlda_coherence_from_step3()
    else:
        # Calculate coherence for other models
        model = globals()[f'{model_name.lower()}_model']
        evaluation = evaluate_model_with_multiple_coherence(
            model, docs, model_name,
            coherence_metrics=['c_v', 'c_npmi']
        )
        coherence_results[model_name] = evaluation['coherence_metrics']

# Generate a comparison report
generate_coherence_comparison_report(coherence_results)
'''
    
    print(code_template)
    
    print(f"\n📋 Research Value:")
    
    research_values = [
        "🔬 Methodological contribution: Multiple coherence metrics provide a more comprehensive evaluation of topic quality.",
        "📊 Result credibility: Metrics like NPMI enhance the persuasiveness of research findings.",
        "🎯 Comparison fairness: A unified evaluation framework ensures fairness in model comparison.",
        "💡 Theoretical depth: Different coherence metrics reflect different theoretical perspectives on topic modeling.",
        "🚀 Practical application: Multi-metric evaluation better guides model selection in practical applications."
    ]
    
    for value in research_values:
        print(f"   {value}")

# Execute demonstration
demonstrate_npmi_usage_for_hlda_comparison()

🎯 Guide to Using NPMI and Multiple Coherence Metrics in hLDA Comparative Studies
📋 Guide for Comparative Study Based on Your hLDA Parameters:
🎯 Your hLDA Configuration:
   - depth: 3
   - gamma: 0.05
   - eta: 0.05
   - alpha: 0.1
   - leaf_nodes: 252
   - branching: Layer 0-1: 59, Layer 1-2: 4

📊 Recommended Coherence Metric Comparison Schemes:

📈 Primary Comparison Metrics:
   - Metrics: ['c_v', 'c_npmi']
   - Rationale: C_v is a standard metric, NPMI provides a pointwise mutual information perspective

📈 Comprehensive Comparison Metrics:
   - Metrics: ['c_v', 'c_npmi']
   - Rationale: Covers coherence measures with different theoretical foundations

📈 Research-Focused Metrics:
   - Metrics: ['c_v', 'c_npmi']
   - Rationale: Balances computational efficiency and evaluation comprehensiveness

🔧 Implementation Recommendations:
   1. Evaluate all models (LDA, PAM, CTM) using the same coherence metrics.
   2. Focus on the comparative results of C_v and NPMI.
   3. Analyze the consistency

In [7]:
# ===== LDA Model Construction and Training =====

print("🎯 Starting LDA model training")
print("=" * 60)

def build_lda(docs, seed=0, max_iters=MAX_ITERS, use_multiple_coherence=True):
    """Build and train an LDA model - supports multiple coherence metrics"""
    print(f"🚀 Creating LDA model (K={K_LEAF}, α={ALPHA}, η={ETA}, seed={seed})")
    mdl = tp.LDAModel(k=K_LEAF, alpha=ALPHA, eta=ETA, seed=seed)
    
    if use_multiple_coherence:
        # Train with multiple coherence metrics
        return auto_train_with_multiple_metrics(
            mdl, docs, seed=seed, max_iters=max_iters, 
            coherence_measures=['c_v', 'c_npmi'], 
            name='LDA'
        )
    else:
        # Train with a traditional single metric
        return auto_train(mdl, docs, seed=seed, max_iters=max_iters, name='LDA')

def train_and_evaluate_lda(docs, seed=42, max_iters=2000, detailed_output=True, use_multiple_coherence=True):
    """
    Train an LDA model and perform a full evaluation - supports multiple coherence metrics
    
    Parameters:
    - docs: List of documents
    - seed: Random seed
    - max_iters: Maximum number of training iterations
    - detailed_output: Whether to output detailed information
    - use_multiple_coherence: Whether to use multiple coherence metrics
    
    Returns:
    - model: Trained LDA model
    - metrics: Dictionary of evaluation metrics
    - history: Training history
    """
    
    print(f"🚀 Starting to train LDA model (seed={seed})")
    print(f"📊 Data Overview:")
    print(f"   - Number of documents: {len(docs)}")
    print(f"   - Average document length: {np.mean([len(doc) for doc in docs]):.1f} words")
    print(f"   - Vocabulary size: {len(set(word for doc in docs for word in doc))}")
    
    print(f"\n🔧 Model Parameters:")
    print(f"   - Number of topics K = {K_LEAF}")
    print(f"   - Alpha = {ALPHA}")
    print(f"   - Eta = {ETA}")
    print(f"   - Max iterations = {max_iters}")
    print(f"   - Check interval = {INTERVAL}")
    if use_multiple_coherence:
        print(f"   - Coherence metrics = ['c_v', 'c_npmi']")
    
    # Train the model
    start_time = time.time()
    lda_model, training_history = build_lda(docs, seed=seed, max_iters=max_iters, 
                                           use_multiple_coherence=use_multiple_coherence)
    training_time = time.time() - start_time
    
    print(f"\n✅ Training complete! Time taken: {training_time:.1f} seconds")
    
    # Extract final metrics
    if training_history:
        if use_multiple_coherence:
            # Process multiple coherence metric results
            final_iter, final_llpw, final_ppl, final_coherence_dict = training_history[-1]
            final_coh_cv = final_coherence_dict.get('c_v', 0)
            final_coh_npmi = final_coherence_dict.get('c_npmi', 0)
        else:
            # Process single coherence metric results
            final_iter, final_llpw, final_ppl, final_coh_cv = training_history[-1]
            final_coh_npmi = 0
        
        # Calculate additional metrics
        effective_topics = effective_k(lda_model)
        
        # Perform a full evaluation with multiple coherence metrics
        if use_multiple_coherence:
            print(f"\n🔍 Performing full evaluation with multiple coherence metrics...")
            full_evaluation = evaluate_model_with_multiple_coherence(
                lda_model, docs, "LDA", 
                coherence_metrics=['c_v', 'c_npmi'],
                top_words_for_coherence=5
            )
            
            metrics = {
                'iterations': final_iter,
                'log_likelihood_per_word': final_llpw,
                'perplexity': final_ppl,
                'coherence_c_v': full_evaluation['coherence_metrics'].get('c_v', final_coh_cv),
                'coherence_npmi': full_evaluation['coherence_metrics'].get('c_npmi', final_coh_npmi),

                'effective_topics': effective_topics,
                'training_time_seconds': training_time,
                'convergence_iterations': len(training_history),
                'composite_score': full_evaluation.get('composite_score', 0),
                'topic_utilization': full_evaluation.get('topic_utilization', 0)
            }
        else:
            metrics = {
                'iterations': final_iter,
                'log_likelihood_per_word': final_llpw,
                'perplexity': final_ppl,
                'coherence_c_v': final_coh_cv,
                'effective_topics': effective_topics,
                'training_time_seconds': training_time,
                'convergence_iterations': len(training_history)
            }
        
        if detailed_output:
            print(f"\n📈 Final Evaluation Results:")
            print(f"   - Training iterations: {final_iter}")
            print(f"   - Log-likelihood per word: {final_llpw:.6f}")
            print(f"   - Perplexity: {final_ppl:.2f}")
            print(f"   - Effective topics: {effective_topics}/{K_LEAF}")
            
            if use_multiple_coherence:
                print(f"\n📊 Multiple Coherence Metrics:")
                print(f"   - C_v (Vector Space): {metrics['coherence_c_v']:.6f}")
                print(f"   - NPMI (Pointwise Mutual Information): {metrics['coherence_npmi']:.6f}")
                print(f"   - Composite Score: {metrics['composite_score']:.6f}")
                print(f"   - Topic Utilization: {metrics['topic_utilization']*100:.1f}%")
            else:
                print(f"   - Coherence (C_v): {metrics['coherence_c_v']:.4f}")
            
            print(f"   - Convergence history length: {len(training_history)}")
            
            # Display training process overview
            if len(training_history) > 1:
                if use_multiple_coherence:
                    first_iter, first_llpw, first_ppl, first_coh_dict = training_history[0]
                    final_iter, final_llpw, final_ppl, final_coh_dict = training_history[-1]
                    
                    first_cv = first_coh_dict.get('c_v', 0)
                    first_npmi = first_coh_dict.get('c_npmi', 0)
                    final_cv = final_coh_dict.get('c_v', 0)
                    final_npmi = final_coh_dict.get('c_npmi', 0)
                    
                    print(f"\n📊 Training Improvement:")
                    print(f"   - Perplexity improvement: {first_ppl:.2f} → {final_ppl:.2f} (↓{first_ppl-final_ppl:.2f})")
                    print(f"   - C_v improvement: {first_cv:.4f} → {final_cv:.4f} (↑{final_cv-first_cv:.4f})")
                    print(f"   - NPMI improvement: {first_npmi:.4f} → {final_npmi:.4f} (↑{final_npmi-first_npmi:.4f})")
                    print(f"   - Log-likelihood improvement: {first_llpw:.6f} → {final_llpw:.6f} (↑{final_llpw-first_llpw:.6f})")
                else:
                    first_iter, first_llpw, first_ppl, first_coh = training_history[0]
                    print(f"\n📊 Training Improvement:")
                    print(f"   - Perplexity improvement: {first_ppl:.2f} → {final_ppl:.2f} (↓{first_ppl-final_ppl:.2f})")
                    print(f"   - Coherence improvement: {first_coh:.4f} → {final_coh_cv:.4f} (↑{final_coh_cv-first_coh:.4f})")
                    print(f"   - Log-likelihood improvement: {first_llpw:.6f} → {final_llpw:.6f} (↑{final_llpw-first_llpw:.6f})")
    else:
        print("❌ No training history recorded")
        metrics = {}
    
    return lda_model, metrics, training_history

# ===== Execute LDA Modeling and Evaluation =====
print("\n🎯 Training LDA model with the full dataset...")

# Use the full corpus
full_docs = list(corpus.values())
print(f"📚 Using full dataset: {len(full_docs)} documents")

# Train and evaluate - using multiple coherence metrics (including NPMI)
lda_model, lda_metrics, lda_history = train_and_evaluate_lda(
    docs=full_docs,
    seed=42,
    max_iters=300,  # 🔧 You can modify this value to control the max iterations
    detailed_output=True,
    use_multiple_coherence=True  # 🆕 Enable multiple coherence metrics (including NPMI)
)

# Topic analysis
if lda_model:
    topic_info = analyze_model_topics(lda_model, model_name="LDA", top_words=5)
    
    print(f"\n🎯 LDA model evaluation complete!")
    print(f"📊 Key Metrics:")
    if lda_metrics:
        for key, value in lda_metrics.items():
            if isinstance(value, float):
                print(f"   - {key}: {value:.4f}")
            else:
                print(f"   - {key}: {value}")
    
    print(f"\n💾 Model is trained and can be used for:")
    print(f"   1. Topic word extraction")
    print(f"   2. Document-topic distribution analysis") 
    print(f"   3. Comparison with other models")
    print(f"   4. Topic evolution analysis")

else:
    print("❌ LDA model training failed")

print("=" * 60)

🎯 Starting LDA model training

🎯 Training LDA model with the full dataset...
📚 Using full dataset: 970 documents
🚀 Starting to train LDA model (seed=42)
📊 Data Overview:
   - Number of documents: 970
   - Average document length: 85.8 words
   - Vocabulary size: 1490

🔧 Model Parameters:
   - Number of topics K = 252
   - Alpha = 0.1
   - Eta = 0.05
   - Max iterations = 300
   - Check interval = 100
   - Coherence metrics = ['c_v', 'c_npmi']
🚀 Creating LDA model (K=252, α=0.1, η=0.05, seed=42)
[LDA] iter= 100 llpw=-7.5244 ppl=1852.68 Calculating multiple coherence metrics...


  mdl.train(interval)


[LDA] iter= 100 llpw=-7.5244 ppl=1852.68 c_v=0.5816, c_npmi=0.0445
[LDA] iter= 200 llpw=-7.3837 ppl=1609.54 Calculating multiple coherence metrics...
[LDA] iter= 200 llpw=-7.3837 ppl=1609.54 c_v=0.5787, c_npmi=0.0561
[LDA] iter= 300 llpw=-7.3257 ppl=1518.89 Calculating multiple coherence metrics...
[LDA] iter= 300 llpw=-7.3257 ppl=1518.89 c_v=0.5757, c_npmi=0.0554

✅ Training complete! Time taken: 25.9 seconds

🔍 Performing full evaluation with multiple coherence metrics...

🔍 LDA Multiple Coherence Metrics Evaluation:
📊 Coherence Metrics Results:
   📈 C_v (Vector Space Coherence): 0.575673
   📈 NPMI (Normalized Pointwise Mutual Information): 0.055425

📊 Basic Metrics:
   📈 Effective topics: 252
   📈 Total topics: 252
   📈 Topic utilization: 100.0%

💡 Coherence Metric Explanations:
   🎯 C_v: Range [0,1], higher is better, based on word vector similarity
   🎯 NPMI: Range [-1,1], higher is better, Normalized Pointwise Mutual Information

🏆 Composite Coherence Score: 0.5517 (average of no

In [5]:
import numpy as np
def get_topic_doc_counts(model, threshold=0.01):
    """
    Get the number of documents covered by each topic (i.e., in how many documents each topic appears)
    """
    num_topics = getattr(model, 'k', None) or getattr(model, 'num_topics', None) or 0
    topic_doc_counts = [0] * num_topics
    for doc in model.docs:
        # Get the topic distribution for this document
        try:
            topic_dist = doc.get_topic_dist()
        except:
            continue
        for k, prob in enumerate(topic_dist):
            if prob > threshold:
                topic_doc_counts[k] += 1
    return topic_doc_counts

def calculate_weighted_renyi_entropy_full_weighted(model, alpha=2):
    """
    Renyi entropy weighted by the number of documents covered by the topic
    """
    entropies = []
    num_topics = getattr(model, 'k', None) or getattr(model, 'num_topics', None) or 0
    for k in range(num_topics):
        try:
            topic_probs = np.array([prob for word, prob in model.get_topic_words(k, top_n=-1)])
            topic_probs = topic_probs / topic_probs.sum()
            if len(topic_probs) > 0:
                renyi = (1/(1-alpha)) * np.log(np.sum(topic_probs**alpha))
                entropies.append(renyi)
            else:
                entropies.append(0)
        except:
            entropies.append(0)
    # Get the document count for each topic
    topic_doc_counts = get_topic_doc_counts(model)
    total = sum(topic_doc_counts)
    if total > 0:
        return np.average(entropies, weights=topic_doc_counts)
    else:
        return np.mean(entropies)
    
from scipy.spatial.distance import jensenshannon

def calculate_weighted_jsd_full(model, top_n=-1):
    """
    Calculate the pairwise JSD distance for all topic distributions (unweighted), return the average JSD
    - model: topic model object (e.g., tomotopy LDA/CTM/PAM/hLDA)
    - top_n: how many words to take for each topic (-1 for all)
    """
    num_topics = getattr(model, 'k', None) or getattr(model, 'num_topics', None) or 0
    topic_distributions = []
    for k in range(num_topics):
        try:
            topic_words = model.get_topic_words(k, top_n=top_n)
            probs = np.array([prob for word, prob in topic_words])
            probs = probs / probs.sum() if probs.sum() > 0 else np.ones_like(probs) / len(probs)
            topic_distributions.append(probs)
        except:
            continue
    # Calculate pairwise JSD
    jsd_values = []
    for i in range(len(topic_distributions)):
        for j in range(i+1, len(topic_distributions)):
            # Align length
            p = topic_distributions[i]
            q = topic_distributions[j]
            min_len = min(len(p), len(q))
            jsd = jensenshannon(p[:min_len], q[:min_len])
            jsd_values.append(jsd)
    if jsd_values:
        return float(np.mean(jsd_values))
    else:
        return 0.0
    
def calculate_renyi_entropy_unweighted(model, alpha=2):
    """
    Calculate the Renyi entropy for all topics (unweighted version, direct average)
    - model: topic model object (e.g., tomotopy LDA/CTM/PAM/hLDA)
    - alpha: order of Renyi entropy (commonly 2)
    Returns: the average of Renyi entropies for all topics
    """
    import numpy as np
    entropies = []
    num_topics = getattr(model, 'k', None) or getattr(model, 'num_topics', None) or 0
    for k in range(num_topics):
        try:
            topic_probs = np.array([prob for word, prob in model.get_topic_words(k, top_n=-1)])
            topic_probs = topic_probs / topic_probs.sum()
            if len(topic_probs) > 0:
                renyi = (1/(1-alpha)) * np.log(np.sum(topic_probs**alpha))
                entropies.append(renyi)
            else:
                entropies.append(0)
        except:
            entropies.append(0)
    if entropies:
        return float(np.mean(entropies))
    else:
        return 0.0

In [9]:
# Called after LDA model training and evaluation
if lda_model:
    renyi_entropy = calculate_weighted_renyi_entropy_full_weighted(lda_model, alpha=2)
    jsd_distance = calculate_weighted_jsd_full(lda_model)
    print(f"\n🔬 Renyi entropy weighted by document count: {renyi_entropy:.4f}")
    print(f"🔬 Average JSD (full distribution): {jsd_distance:.4f}")
renyi_entropy_unweighted = calculate_renyi_entropy_unweighted(lda_model, alpha=2)
print(f"Unweighted Renyi entropy: {renyi_entropy_unweighted:.4f}")


🔬 Renyi entropy weighted by document count: 3.6679
🔬 Average JSD (full distribution): 0.2599
Unweighted Renyi entropy: 4.1623


In [None]:
# # ===== CTM Model Construction and Training =====

# print("\n🎯 Starting CTM model training")
# print("=" * 60)

# def build_ctm(docs, seed=0, max_iters=2000, K_LEAF=252, use_multiple_coherence=True):
#     """Build and train a CTM model - supports multiple coherence metrics"""
#     print(f"🚀 Creating CTM model (K={K_LEAF}, α={ALPHA}, η={ETA}, seed={seed})")
#     # The correct class name is CTModel
#     mdl = tp.CTModel(k=K_LEAF, smoothing_alpha=ALPHA, eta=ETA, seed=seed)
    
#     if use_multiple_coherence:
#         # Train with multiple coherence metrics
#         return auto_train_with_multiple_metrics(
#             mdl, docs, seed=seed, max_iters=max_iters, 
#             coherence_measures=['c_v', 'c_npmi'], 
#             name='CTM'
#         )
#     else:
#         # Train with a traditional single metric
#         return auto_train(mdl, docs, seed=seed, max_iters=max_iters, name='CTM')   

# def train_and_evaluate_ctm(docs, seed=42, max_iters=2000, K_LEAF=252, detailed_output=True, use_multiple_coherence=True):
#     """
#     Train a CTM model and perform a full evaluation - supports multiple coherence metrics
    
#     Parameters:
#     - docs: List of documents
#     - seed: Random seed
#     - max_iters: Maximum number of training iterations
#     - detailed_output: Whether to output detailed information
#     - use_multiple_coherence: Whether to use multiple coherence metrics
    
#     Returns:
#     - model: Trained CTM model
#     - metrics: Dictionary of evaluation metrics
#     - history: Training history
#     """
    
#     print(f"🚀 Starting to train CTM model (seed={seed})")
#     print(f"📊 Data Overview:")
#     print(f"   - Number of documents: {len(docs)}")
#     print(f"   - Average document length: {np.mean([len(doc) for doc in docs]):.1f} words")
#     print(f"   - Vocabulary size: {len(set(word for doc in docs for word in doc))}")
    
#     print(f"\n🔧 Model Parameters:")
#     print(f"   - Number of topics K = {K_LEAF}")
#     print(f"   - Alpha = {ALPHA}")
#     print(f"   - Eta = {ETA}")
#     print(f"   - Max iterations = {max_iters}")
#     print(f"   - Check interval = {INTERVAL}")
#     if use_multiple_coherence:
#         print(f"   - Multiple coherence metrics: Enabled (C_v, NPMI)")
    
#     # Train the model
#     start_time = time.time()
#     if use_multiple_coherence:
#         # Train with multiple coherence metrics
#         ctm_model, training_history = build_ctm(docs, seed=seed, max_iters=max_iters, 
#                                               K_LEAF=K_LEAF, use_multiple_coherence=True)
#     else:
#         # Train with a traditional single metric
#         ctm_model, training_history = build_ctm(docs, seed=seed, max_iters=max_iters, 
#                                               K_LEAF=K_LEAF, use_multiple_coherence=False)
    
#     training_time = time.time() - start_time
    
#     print(f"\n✅ Training complete! Time taken: {training_time:.1f} seconds")
    
#     # Extract final metrics
#     if training_history:
#         if use_multiple_coherence and isinstance(training_history[-1], dict):
#             # Multiple coherence metrics mode
#             final_metrics = training_history[-1]
#             final_iter = final_metrics['iteration']
#             final_llpw = final_metrics['log_likelihood']
#             final_ppl = final_metrics.get('perplexity', 0)
            
#             # Calculate additional metrics
#             effective_topics = effective_k(ctm_model)
            
#             metrics = {
#                 'iterations': final_iter,
#                 'log_likelihood_per_word': final_llpw,
#                 'perplexity': final_ppl,
#                 'coherence_c_v': final_metrics.get('c_v', 0),
#                 'coherence_c_npmi': final_metrics.get('c_npmi', 0),
#                 'effective_topics': effective_topics,
#                 'training_time_seconds': training_time,
#                 'convergence_iterations': len(training_history)
#             }
            
#             if detailed_output:
#                 print(f"\n📈 Final Evaluation Results:")
#                 print(f"   - Training iterations: {final_iter}")
#                 print(f"   - Log-likelihood per word: {final_llpw:.6f}")
#                 print(f"   - Perplexity: {final_ppl:.2f}")
#                 print(f"   - C_v Coherence: {final_metrics.get('c_v', 'N/A')}")
#                 print(f"   - NPMI Coherence: {final_metrics.get('c_npmi', 'N/A')}")
#                 print(f"   - Effective topics: {effective_topics}/{K_LEAF}")
#                 print(f"   - Convergence history length: {len(training_history)}")
#         else:
#             # Traditional single metric mode
#             final_iter, final_llpw, final_ppl, final_coh = training_history[-1]
            
#             # Calculate additional metrics
#             effective_topics = effective_k(ctm_model)
            
#             metrics = {
#                 'iterations': final_iter,
#                 'log_likelihood_per_word': final_llpw,
#                 'perplexity': final_ppl,
#                 'coherence_c_v': final_coh,
#                 'effective_topics': effective_topics,
#                 'training_time_seconds': training_time,
#                 'convergence_iterations': len(training_history)
#             }
            
#             if detailed_output:
#                 print(f"\n📈 Final Evaluation Results:")
#                 print(f"   - Training iterations: {final_iter}")
#                 print(f"   - Log-likelihood per word: {final_llpw:.6f}")
#                 print(f"   - Perplexity: {final_ppl:.2f}")
#                 print(f"   - Coherence (C_v): {final_coh.get('c_v', 0):.4f}")
#                 print(f"   - Effective topics: {effective_topics}/{K_LEAF}")
#                 print(f"   - Convergence history length: {len(training_history)}")
                
#                 # Display training process overview
#                 if len(training_history) > 1:
#                     first_iter, first_llpw, first_ppl, first_coh = training_history[0]
#                     print(f"\n📊 Training Improvement:")
#                     print(f"   - Perplexity improvement: {first_ppl:.2f} → {final_ppl:.2f} (↓{first_ppl-final_ppl:.2f})")
#                     print(f"   - Coherence improvement: {first_coh:.4f} → {final_coh:.4f} (↑{final_coh-first_coh:.4f})")
#                     print(f"   - Log-likelihood improvement: {first_llpw:.6f} → {final_llpw:.6f} (↑{final_llpw-first_llpw:.6f})")
#     else:
#         print("❌ No training history recorded")
#         metrics = {}
    
#     # If multiple coherence is enabled, perform a full evaluation
#     if use_multiple_coherence:
#         evaluation_results = evaluate_model_with_multiple_coherence(
#             ctm_model, docs, model_name="CTM"
#         )
#         metrics.update(evaluation_results)
    
#     return ctm_model, metrics, training_history

# # ===== Execute CTM Modeling and Evaluation =====
# print("\n🎯 Training CTM model with the full dataset...")

# # Use the full corpus
# full_docs = list(corpus.values())
# print(f"📚 Using full dataset: {len(full_docs)} documents")

# # Train and evaluate CTM
# ctm_model, ctm_metrics, ctm_history = train_and_evaluate_ctm(
#     docs=full_docs,
#     seed=42,
#     max_iters=100,  # 🔧 You can modify this value to control the max iterations
#     K_LEAF=20,      # 🔧 You can modify the number of topics
#     detailed_output=True
# )

# # Topic analysis
# if ctm_model:
#     ctm_topic_info = analyze_model_topics(ctm_model, model_name="CTM", top_words=5)
    
#     print(f"\n🎯 CTM model evaluation complete!")
#     print(f"📊 Key Metrics:")
#     if ctm_metrics:
#         for key, value in ctm_metrics.items():
#             if isinstance(value, float):
#                 print(f"   - {key}: {value:.4f}")
#             else:
#                 print(f"   - {key}: {value}")
    
#     print(f"\n💾 CTM model is trained, features:")
#     print(f"   1. Correlated Topic Model")
#     print(f"   2. Models correlations between topics")
#     print(f"   3. More realistic topic relationships")
#     print(f"   4. Suitable for complex document collections")

# else:
#     print("❌ CTM model training failed")

# # Called after CTM model training and evaluation
# if ctm_model:
#     renyi_entropy = calculate_weighted_renyi_entropy_full_weighted(ctm_model, alpha=2)
#     jsd_distance = calculate_weighted_jsd_full(ctm_model)
#     renyi_entropy_unweighted = calculate_renyi_entropy_unweighted(ctm_model, alpha=2)
#     print(f"\n🔬 Renyi entropy weighted by document count (CTM): {renyi_entropy:.4f}")
#     print(f"🔬 Average JSD (full distribution) (CTM): {jsd_distance:.4f}")
#     print(f"🔬 Unweighted Renyi entropy (CTM): {renyi_entropy_unweighted:.4f}")


🎯 Starting CTM model training

🎯 Training CTM model with the full dataset...
📚 Using full dataset: 970 documents
🚀 Starting to train CTM model (seed=42)
📊 Data Overview:
   - Number of documents: 970
   - Average document length: 85.8 words
   - Vocabulary size: 1490

🔧 Model Parameters:
   - Number of topics K = 20
   - Alpha = 0.1
   - Eta = 0.05
   - Max iterations = 100
   - Check interval = 100
   - Multiple coherence metrics: Enabled (C_v, NPMI)
🚀 Creating CTM model (K=20, α=0.1, η=0.05, seed=42)


  mdl.train(interval)


[CTM] iter= 100 llpw=-5.5186 ppl=249.29 Calculating multiple coherence metrics...
[CTM] iter= 100 llpw=-5.5186 ppl=249.29 c_v=0.5187, c_npmi=0.0089

✅ Training complete! Time taken: 7.6 seconds

📈 Final Evaluation Results:
   - Training iterations: 100
   - Log-likelihood per word: -5.518621
   - Perplexity: 249.29
   - Coherence (C_v): 0.5187
   - Effective topics: 20/20
   - Convergence history length: 1

🔍 CTM Multiple Coherence Metrics Evaluation:
📊 Coherence Metrics Results:
   📈 C_v (Vector Space Coherence): 0.518657
   📈 NPMI (Normalized Pointwise Mutual Information): 0.008891

📊 Basic Metrics:
   📈 Effective topics: 20
   📈 Total topics: 20
   📈 Topic utilization: 100.0%

💡 Coherence Metric Explanations:
   🎯 C_v: Range [0,1], higher is better, based on word vector similarity
   🎯 NPMI: Range [-1,1], higher is better, Normalized Pointwise Mutual Information

🏆 Composite Coherence Score: 0.5116 (average of normalized scores)

🔍 CTM Topic Analysis (showing top 5 words):
Topic   0

In [9]:
# ===== PAM Model Construction and Training (Aligned with hLDA) =====

print("\n🎯 Starting PAM model training - Aligned with hLDA")
print("=" * 60)

# Define PAM-specific parameters aligned with hLDA branching factors
PAM_K1 = B1  # 59, from hLDA branching factor
PAM_K2 = B2  # 4, from hLDA branching factor
PAM_TOTAL_K = PAM_K1 * PAM_K2 # 236 topics, close to hLDA's 252 leaf nodes

print(f"🔧 PAM parameters aligned with hLDA: K1={PAM_K1}, K2={PAM_K2}, Total K={PAM_TOTAL_K}")

def build_pam(docs, seed=0, max_iters=2000, use_multiple_coherence=True):
    """Build and train a PAM model - supports multiple coherence metrics"""
    print(f"🚀 Creating PAM model (K1={PAM_K1}, K2={PAM_K2}, α={ALPHA}, η={ETA}, seed={seed})")
    # PAModel uses k1 and k2 for its two-level topic structure
    mdl = tp.PAModel(k1=PAM_K1, k2=PAM_K2, alpha=ALPHA, subalpha=ALPHA, eta=ETA, seed=seed)
    
    if use_multiple_coherence:
        # Train with multiple coherence metrics
        return auto_train_with_multiple_metrics(
            mdl, docs, seed=seed, max_iters=max_iters, 
            coherence_measures=['c_v', 'c_npmi'], 
            name='PAM'
        )
    else:
        # Train with a traditional single metric
        return auto_train(mdl, docs, seed=seed, max_iters=max_iters, name='PAM')

def train_and_evaluate_pam(docs, seed=42, max_iters=2000, detailed_output=True, use_multiple_coherence=True):
    """
    Train a PAM model and perform a full evaluation - supports multiple coherence metrics
    """
    
    print(f"🚀 Starting to train PAM model (seed={seed})")
    print(f"📊 Data Overview:")
    print(f"   - Number of documents: {len(docs)}")
    print(f"   - Average document length: {np.mean([len(doc) for doc in docs]):.1f} words")
    print(f"   - Vocabulary size: {len(set(word for doc in docs for word in doc))}")
    
    print(f"\n🔧 Model Parameters:")
    print(f"   - Super-topics K1 = {PAM_K1}")
    print(f"   - Sub-topics K2 = {PAM_K2}")
    print(f"   - Total topics = {PAM_TOTAL_K}")
    print(f"   - Alpha = {ALPHA}")
    print(f"   - Eta = {ETA}")
    print(f"   - Max iterations = {max_iters}")
    if use_multiple_coherence:
        print(f"   - Coherence metrics = ['c_v', 'c_npmi']")
    
    # Train the model
    start_time = time.time()
    pam_model, training_history = build_pam(docs, seed=seed, max_iters=max_iters, 
                                           use_multiple_coherence=use_multiple_coherence)
    training_time = time.time() - start_time
    
    print(f"\n✅ Training complete! Time taken: {training_time:.1f} seconds")
    
    # Extract final metrics
    if training_history:
        final_iter, final_llpw, final_ppl, final_coherence_dict = training_history[-1]
        
        # Perform a full evaluation with multiple coherence metrics
        print(f"\n🔍 Performing full evaluation with multiple coherence metrics...")
        full_evaluation = evaluate_model_with_multiple_coherence(
            pam_model, docs, "PAM", 
            coherence_metrics=['c_v', 'c_npmi'],
            top_words_for_coherence=5
        )
        
        metrics = {
            'iterations': final_iter,
            'log_likelihood_per_word': final_llpw,
            'perplexity': final_ppl,
            'coherence_c_v': full_evaluation['coherence_metrics'].get('c_v', 0),
            'coherence_npmi': full_evaluation['coherence_metrics'].get('c_npmi', 0),
            'effective_topics': full_evaluation.get('effective_topics', 0),
            'training_time_seconds': training_time,
            'convergence_iterations': len(training_history),
            'composite_score': full_evaluation.get('composite_score', 0),
            'topic_utilization': full_evaluation.get('topic_utilization', 0)
        }
        
        if detailed_output:
            print(f"\n📈 Final Evaluation Results:")
            print(f"   - Training iterations: {final_iter}")
            print(f"   - Log-likelihood per word: {final_llpw:.6f}")
            print(f"   - Perplexity: {final_ppl:.2f}")
            print(f"   - Effective topics: {metrics['effective_topics']}/{PAM_TOTAL_K}")
            print(f"\n📊 Multiple Coherence Metrics:")
            print(f"   - C_v (Vector Space): {metrics['coherence_c_v']:.6f}")
            print(f"   - NPMI (Pointwise Mutual Information): {metrics['coherence_npmi']:.6f}")
            print(f"   - Composite Score: {metrics['composite_score']:.6f}")
            print(f"   - Topic Utilization: {metrics['topic_utilization']*100:.1f}%")
    else:
        print("❌ No training history recorded")
        metrics = {}
    
    return pam_model, metrics, training_history

# ===== Execute PAM Modeling and Evaluation =====
print("\n🎯 Training PAM model with the full dataset...")

# Use the full corpus
full_docs = list(corpus.values())
print(f"📚 Using full dataset: {len(full_docs)} documents")

# Train and evaluate PAM
pam_model, pam_metrics, pam_history = train_and_evaluate_pam(
    docs=full_docs,
    seed=42,
    max_iters=20000,
    detailed_output=True,
    use_multiple_coherence=True
)

# Topic analysis
if pam_model:
    pam_topic_info = analyze_model_topics(pam_model, model_name="PAM", top_words=5)
    
    print(f"\n🎯 PAM model evaluation complete!")
    print(f"📊 Key Metrics:")
    if pam_metrics:
        for key, value in pam_metrics.items():
            if isinstance(value, float):
                print(f"   - {key}: {value:.4f}")
            else:
                print(f"   - {key}: {value}")
    
    print(f"\n💾 PAM model is trained, features:")
    print(f"   1. Two-level hierarchical model (Pseudo-document-based)")
    print(f"   2. Aligned with hLDA structure for fair comparison")
    print(f"   3. Models super-topics and sub-topics")

else:
    print("❌ PAM model training failed")

# Called after PAM model training and evaluation
if pam_model:
    renyi_entropy = calculate_weighted_renyi_entropy_full_weighted(pam_model, alpha=2)
    jsd_distance = calculate_weighted_jsd_full(pam_model)
    renyi_entropy_unweighted = calculate_renyi_entropy_unweighted(pam_model, alpha=2)
    print(f"\n🔬 Renyi entropy weighted by document count (PAM): {renyi_entropy:.4f}")
    print(f"🔬 Average JSD (full distribution) (PAM): {jsd_distance:.4f}")
    print(f"🔬 Unweighted Renyi entropy (PAM): {renyi_entropy_unweighted:.4f}")

print("=" * 60)


🎯 Starting PAM model training - Aligned with hLDA
🔧 PAM parameters aligned with hLDA: K1=59, K2=4, Total K=236

🎯 Training PAM model with the full dataset...
📚 Using full dataset: 970 documents
🚀 Starting to train PAM model (seed=42)
📊 Data Overview:
   - Number of documents: 970
   - Average document length: 85.8 words
   - Vocabulary size: 1490

🔧 Model Parameters:
   - Super-topics K1 = 59
   - Sub-topics K2 = 4
   - Total topics = 236
   - Alpha = 0.1
   - Eta = 0.05
   - Max iterations = 20000
   - Coherence metrics = ['c_v', 'c_npmi']
🚀 Creating PAM model (K1=59, K2=4, α=0.1, η=0.05, seed=42)


  mdl.train(interval)


[PAM] iter= 100 llpw=-9.8150 ppl=18305.57 Calculating multiple coherence metrics...
[PAM] iter= 100 llpw=-9.8150 ppl=18305.57 c_v=0.5660, c_npmi=0.0225
[PAM] iter= 200 llpw=-9.7506 ppl=17165.08 Calculating multiple coherence metrics...
[PAM] iter= 200 llpw=-9.7506 ppl=17165.08 c_v=0.5332, c_npmi=0.0179
[PAM] iter= 300 llpw=-9.6222 ppl=15096.52 Calculating multiple coherence metrics...
[PAM] iter= 300 llpw=-9.6222 ppl=15096.52 c_v=0.5549, c_npmi=0.0263
[PAM] iter= 400 llpw=-9.5659 ppl=14269.76 Calculating multiple coherence metrics...
[PAM] iter= 400 llpw=-9.5659 ppl=14269.76 c_v=0.5276, c_npmi=0.0285
[PAM] iter= 500 llpw=-9.4510 ppl=12720.30 Calculating multiple coherence metrics...
[PAM] iter= 500 llpw=-9.4510 ppl=12720.30 c_v=0.5730, c_npmi=0.0430
[PAM] iter= 600 llpw=-9.4075 ppl=12179.32 Calculating multiple coherence metrics...
[PAM] iter= 600 llpw=-9.4075 ppl=12179.32 c_v=0.5689, c_npmi=0.0416
[PAM] iter= 700 llpw=-9.3116 ppl=11065.65 Calculating multiple coherence metrics...
[PAM

In [None]:
# ===== HDP Model Construction and Training (Optimized for hLDA Comparison) =====

print("\n🎯 Starting HDP model training - Optimized for hLDA comparison")
print("=" * 60)

def build_hdp_for_hlda_comparison(docs, seed=0, max_iters=1500, target_topics=252):
    """
    Build and train an HDP model - specifically optimized for comparison with hLDA
    
    Adjust hyperparameters to approach the target number of topics (252)
    """
    print(f"🚀 Creating HDP model (target topics≈{target_topics}, seed={seed})")
    
    # Calculate parameters related to data size
    num_docs = len(docs)
    vocab_size = len(set(word for doc in docs for word in doc))
    
    # Adjust gamma parameter based on data size and target number of topics
    # The smaller the gamma, the more topics
    # Empirical formula: gamma ≈ target_topics / (num_docs * adjustment_factor)
    adjustment_factor = 0.3  # Adjustment factor, can be fine-tuned
    gamma = target_topics / (num_docs * adjustment_factor)
    
    # Limit the range of gamma to avoid extreme values
    gamma = max(0.1, min(gamma, 10.0))
    
    print(f"📊 Parameter Adjustment Strategy:")
    print(f"   - Number of documents: {num_docs}")
    print(f"   - Vocabulary size: {vocab_size}")
    print(f"   - Target number of topics: {target_topics}")
    print(f"   - Calculated gamma: {gamma:.3f}")
    
    mdl = tp.HDPModel(
        initial_k=target_topics,    # Set a large initial number of topics
        alpha=ALPHA,                # Document-topic concentration parameter
        eta=ETA,                   # Topic-word parameter
        gamma=gamma,               # Topic count control parameter (key!)
        seed=seed
    )
    
    # Train with multiple coherence metrics (C_v + NPMI) to calculate and print NPMI in real-time during training
    return auto_train_with_multiple_metrics(
        mdl, docs, seed=seed, max_iters=max_iters,
        interval=50, coherence_measures=['c_v', 'c_npmi'], name='HDP', fast_coherence=True
    )

def iterative_hdp_tuning(docs, target_topics=252, seed=42, max_attempts=3):
    """
    Iteratively adjust HDP parameters to try to approach the target number of topics
    """
    print(f"🔄 Iteratively adjusting HDP parameters, target number of topics: {target_topics}")
    
    best_model = None
    best_metrics = None
    best_history = None
    best_diff = float('inf')
    
    gamma_values = [0.5, 1.0, 2.0]  # Different gamma values to try
    
    for attempt, gamma in enumerate(gamma_values, 1):
        print(f"\n--- Attempt {attempt}/{len(gamma_values)}: gamma={gamma} ---")
        
        try:
            # Create model
            mdl = tp.HDPModel(
                initial_k=target_topics,
                alpha=ALPHA,
                eta=ETA,
                gamma=gamma,
                seed=seed
            )
            
            # Train model (use multiple coherence metrics to include both C_v and NPMI in each printout)
            model, history = auto_train_with_multiple_metrics(
                mdl, docs, seed=seed, max_iters=1000,
                interval=50, coherence_measures=['c_v', 'c_npmi'], name=f'HDP-{attempt}', fast_coherence=True
            )
            
            # Evaluate results
            effective_topics = effective_k(model)
            total_topics = getattr(model, 'k', 0)
            diff = abs(effective_topics - target_topics)
            
            print(f"📊 Attempt {attempt} Results:")
            print(f"   - Effective topics: {effective_topics}")
            print(f"   - Total topics: {total_topics}")
            print(f"   - Difference from target: {diff}")
            
            # Update best result
            if diff < best_diff:
                best_model = model
                best_history = history
                best_diff = diff
                
                if history:
                    final_iter, final_llpw, final_ppl, final_coh = history[-1]
                    best_metrics = {
                        'iterations': final_iter,
                        'log_likelihood_per_word': final_llpw,
                        'perplexity': final_ppl,
                        'coherence_c_v': final_coh,
                        'effective_topics': effective_topics,
                        'total_topics': total_topics,
                        'target_diff': diff,
                        'gamma_used': gamma
                    }
            
        except Exception as e:
            print(f"❌ Attempt {attempt} failed: {e}")
            continue
    
    return best_model, best_metrics, best_history

def train_and_evaluate_hdp_for_comparison(docs, seed=42, target_topics=252, detailed_output=True):
    """
    Train an HDP model and perform a full evaluation - specifically for comparison with hLDA
    """
    
    print(f"🚀 Starting to train HDP model for comparison with hLDA (seed={seed})")
    print(f"📊 Data Overview:")
    print(f"   - Number of documents: {len(docs)}")
    print(f"   - Average document length: {np.mean([len(doc) for doc in docs]):.1f} words")
    print(f"   - Vocabulary size: {len(set(word for doc in docs for word in doc))}")
    print(f"   - Target number of topics: {target_topics} (aligned with hLDA leaf nodes)")
    
    # Train the model
    start_time = time.time()
    hdp_model, hdp_metrics, hdp_history = iterative_hdp_tuning(
        docs, target_topics=target_topics, seed=seed
    )
    training_time = time.time() - start_time
    
    print(f"\n✅ Training complete! Time taken: {training_time:.1f} seconds")
    
    # Update metrics
    if hdp_metrics:
        hdp_metrics['training_time_seconds'] = training_time
        hdp_metrics['convergence_iterations'] = len(hdp_history) if hdp_history else 0

        # New: Add NPMI evaluation
        full_eval = evaluate_model_with_multiple_coherence(
            hdp_model, docs, model_name="HDP", coherence_metrics=['c_v', 'c_npmi'], top_words_for_coherence=5
        )
        hdp_metrics['coherence_c_v'] = full_eval['coherence_metrics'].get('c_v', 0)
        hdp_metrics['coherence_c_npmi'] = full_eval['coherence_metrics'].get('c_npmi', 0)
        hdp_metrics['composite_score'] = full_eval.get('composite_score', 0)
        hdp_metrics['topic_utilization'] = full_eval.get('topic_utilization', 0)

        if detailed_output:
            print(f"\n📈 Final Evaluation Results:")
            print(f"   - Training iterations: {hdp_metrics.get('iterations', 'N/A')}")
            print(f"   - Log-likelihood per word: {hdp_metrics.get('log_likelihood_per_word', 0):.6f}")
            print(f"   - Perplexity: {hdp_metrics.get('perplexity', 0):.2f}")
            print(f"   - Coherence (C_v): {hdp_metrics.get('coherence_c_v', 0):.4f}")
            print(f"   - Coherence (NPMI): {hdp_metrics.get('coherence_c_npmi', 0):.4f}")  # New output
            print(f"   - Effective topics: {hdp_metrics.get('effective_topics', 0)}")
            print(f"   - Total topics: {hdp_metrics.get('total_topics', 0)}")
            print(f"   - Target difference: {hdp_metrics.get('target_diff', 0)}")
            print(f"   - Gamma used: {hdp_metrics.get('gamma_used', 0):.3f}")
            
            # Comparison with target
            target_diff = hdp_metrics.get('target_diff', float('inf'))
            effective_topics = hdp_metrics.get('effective_topics', 0)
            
            print(f"\n🎯 hLDA Comparison Analysis:")
            print(f"   - hLDA leaf nodes: {target_topics}")
            print(f"   - HDP effective topics: {effective_topics}")
            print(f"   - Topic count difference: {target_diff}")
            
            if target_diff <= 50:
                print(f"   ✅ Topic count is close to hLDA (difference ≤ 50)")
            elif target_diff <= 100:
                print(f"   ⚠️  Moderate deviation in topic count (difference ≤ 100)")
            else:
                print(f"   ❌ Large difference in topic count (difference > 100)")
    
    return hdp_model, hdp_metrics, hdp_history

# ===== Execute HDP Modeling and Evaluation =====
print("\n🎯 Training HDP model with the full dataset (for hLDA comparison)...")

# Use the full corpus
full_docs = list(corpus.values())
print(f"📚 Using full dataset: {len(full_docs)} documents")

# Train and evaluate HDP - aiming for ~252 topics
hdp_model, hdp_metrics, hdp_history = train_and_evaluate_hdp_for_comparison(
    docs=full_docs,
    seed=24,
    target_topics=100,  # 252 topics, aligned with hLDA9
    detailed_output=True
)

# Topic analysis
if hdp_model:
    hdp_topic_info = analyze_model_topics(hdp_model, model_name="HDP", top_words=5)
    
    print(f"\n🎯 HDP model evaluation complete!")
    print(f"📊 Key Metrics:")
    if hdp_metrics:
        for key, value in hdp_metrics.items():
            if isinstance(value, float):
                print(f"   - {key}: {value:.4f}")
            else:
                print(f"   - {key}: {value}")
            
            
    
    print(f"\n💾 HDP model is trained, features:")
    print(f"   1. Non-parametric model (dynamic topic count, adjusted to be close to {K_LEAF})")
    print(f"   2. Hierarchical structure based on Dirichlet Process")
    print(f"   3. Topic count optimized via gamma parameter")
    print(f"   4. Suitable for fair comparison with hLDA")
    
    # Generate comparison preparation info
    print(f"\n📋 hLDA Comparison Preparation:")
    print(f"   - hLDA leaf nodes: {K_LEAF} topics")
    print(f"   - HDP effective topics: {hdp_metrics.get('effective_topics', 'N/A')} topics")
    print(f"   - Topic count difference: {hdp_metrics.get('target_diff', 'N/A')}")
    print(f"   - Both models use the same dataset and preprocessing")
    print(f"   - Perplexity and coherence can be directly compared")

else:
    print("❌ HDP model training failed")

# ===== Optional: Plot Training Curves =====
if hdp_history:
    print(f"\n📈 Do you want to view the HDP training curves?")
    # plot_training_curves(hdp_history, model_name="HDP", save_plot=True)

print("=" * 60)

# Called after HDP model training and evaluation
if hdp_model:
    renyi_entropy = calculate_weighted_renyi_entropy_full_weighted(hdp_model, alpha=2)
    jsd_distance = calculate_weighted_jsd_full(hdp_model)
    renyi_entropy_unweighted = calculate_renyi_entropy_unweighted(hdp_model, alpha=2)
    print(f"\n🔬 Renyi entropy weighted by document count (HDP): {renyi_entropy:.4f}")
    print(f"🔬 Average JSD (full distribution) (HDP): {jsd_distance:.4f}")
    print(f"🔬 Unweighted Renyi entropy (HDP): {renyi_entropy_unweighted:.4f}")


🎯 Starting HDP model training - Optimized for hLDA comparison

🎯 Training HDP model with the full dataset (for hLDA comparison)...
📚 Using full dataset: 970 documents
🚀 Starting to train HDP model for comparison with hLDA (seed=24)
📊 Data Overview:
   - Number of documents: 970
   - Average document length: 85.8 words
   - Vocabulary size: 1490
   - Target number of topics: 100 (aligned with hLDA leaf nodes)
🔄 Iteratively adjusting HDP parameters, target number of topics: 100

--- Attempt 1/3: gamma=0.5 ---
[HDP-1] iter=  50 llpw=-6.6605 ppl=780.98 Calculating multiple coherence metrics...


  mdl.train(interval)


[HDP-1] iter=  50 llpw=-6.6605 ppl=780.98 c_v=0.5494, c_npmi=0.0549
[HDP-1] iter= 100 llpw=-6.6421 ppl=766.73 Calculating multiple coherence metrics...
[HDP-1] iter= 100 llpw=-6.6421 ppl=766.73 c_v=0.5431, c_npmi=0.0530
[HDP-1] iter= 150 llpw=-6.6326 ppl=759.47 Calculating multiple coherence metrics...
[HDP-1] iter= 150 llpw=-6.6326 ppl=759.47 c_v=0.5409, c_npmi=0.0519
[HDP-1] iter= 200 llpw=-6.6250 ppl=753.72 Calculating multiple coherence metrics...
[HDP-1] iter= 200 llpw=-6.6250 ppl=753.72 c_v=0.5482, c_npmi=0.0546
[HDP-1] iter= 250 llpw=-6.6237 ppl=752.71 Calculating multiple coherence metrics...
[HDP-1] iter= 250 llpw=-6.6237 ppl=752.71 c_v=0.5512, c_npmi=0.0590
[HDP-1] iter= 300 llpw=-6.6192 ppl=749.33 Calculating multiple coherence metrics...
[HDP-1] iter= 300 llpw=-6.6192 ppl=749.33 c_v=0.5546, c_npmi=0.0625
[HDP-1] iter= 350 llpw=-6.6186 ppl=748.89 Calculating multiple coherence metrics...
[HDP-1] iter= 350 llpw=-6.6186 ppl=748.89 c_v=0.5600, c_npmi=0.0732
[HDP-1] iter= 400 ll