Evaluate non-BERTopic topic models using topic coherence and topic diversity scores. The models evaluated are LDA, NMF, LSI and HDP.

Preliminaries


In [None]:
import os
import pandas as pd
from octis.dataset.dataset import Dataset
from octis.models.LDA import LDA
from octis.models.HDP import HDP
from octis.models.LSI import LSI
from octis.models.NMF import NMF
from octis.models.NMF_scikit import NMF_scikit
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
import time
# Running count of completed (model, k) evaluations; incremented once per
# iteration of the training loop further down.
current_iteration = 0

from octis.dataset.dataset import Dataset


In [None]:
# Load the custom dataset from its folder on disk
dataset = Dataset()
dataset.load_custom_dataset_from_folder('./travels_tales')

# Fetch the corpus once: it is reused for the sanity check below and as the
# reference texts of both coherence metrics.
corpus = dataset.get_corpus()

# Perform a basic operation to verify dataset loading
print("Number of documents:", len(corpus))

# Numbers of topics to try for the models that take a fixed k
k_values = [200, 218, 236]

# Initialize the DataFrame to store results. "Time" is declared up front so
# the empty frame has the same columns as the rows appended to it later.
results_df = pd.DataFrame(
    columns=[
        "Model",
        "k",
        "Topic Coherence_npmi",
        "Topic Coherence_c_v",
        "Topic Diversity",
        "Time",
    ]
)

# Initialize metrics: topic diversity plus two coherence measures (NPMI, C_v),
# all configured with topk=10
topic_diversity_metric = TopicDiversity(topk=10)
coherence_metric = Coherence(texts=corpus, topk=10, measure='c_npmi')
cv_metric = Coherence(texts=corpus, topk=10, measure='c_v')


In [None]:
# Models that take a fixed number of topics (HDP is evaluated separately).
# Built once here rather than on every pass of the outer loop.
fixed_k_models = [("LDA", LDA), ("NMF", NMF), ("NMF-Scikit", NMF_scikit), ("LSI", LSI)]

for k in k_values:
    for model_name, Model in fixed_k_models:
        # perf_counter() is a monotonic clock meant for measuring intervals;
        # time.time() can jump if the system clock is adjusted mid-run.
        start_time = time.perf_counter()

        # Training the model
        print(f"Training {model_name} with k={k}...")
        model = Model(num_topics=k)
        output = model.train_model(dataset)

        # Calculate scores
        topic_diversity_score = topic_diversity_metric.score(output)
        coherence_score = coherence_metric.score(output)
        cv_score = cv_metric.score(output)

        # Elapsed time covers training *and* scoring, formatted as HH:MM:SS
        elapsed_seconds = time.perf_counter() - start_time
        elapsed_time = time.strftime("%H:%M:%S", time.gmtime(elapsed_seconds))

        # Append results to the DataFrame right away, so that completed rows
        # are kept even if a later (model, k) combination fails.
        new_row = pd.DataFrame([{
            "Model": model_name,
            "k": k,
            "Topic Coherence_npmi": coherence_score,
            "Topic Coherence_c_v": cv_score,
            "Topic Diversity": topic_diversity_score,
            "Time": elapsed_time,
        }])
        current_iteration += 1
        results_df = pd.concat([results_df, new_row], ignore_index=True)

# Last expression of the cell: displays the results collected so far
results_df

HDP does not take a fixed number of topics k, so we run it separately from the other models, without specifying k.

In [None]:

# For HDP, run without specifying k
hdp_model = HDP()

# perf_counter() is a monotonic clock meant for measuring intervals.
start_time = time.perf_counter()
hdp_output = hdp_model.train_model(dataset)

# The number of topics is read back from the trained model's output
hdp_topics = hdp_output['topics']
hdp_k = len(hdp_topics)

hdp_coherence_score = coherence_metric.score(hdp_output)
hdp_cv_score = cv_metric.score(hdp_output)
hdp_topic_diversity_score = topic_diversity_metric.score(hdp_output)

# Stop the clock only after scoring, so that "Time" covers training plus
# scoring exactly as it does for the fixed-k models and rows stay comparable.
elapsed_seconds = time.perf_counter() - start_time
elapsed_time = time.strftime("%H:%M:%S", time.gmtime(elapsed_seconds))

# Append HDP results to DataFrame
hdp_row = pd.DataFrame([{
    "Model": "HDP",
    "k": hdp_k,  # HDP does not use a fixed k; this is the number of topics it produced
    "Topic Coherence_npmi": hdp_coherence_score,
    "Topic Coherence_c_v": hdp_cv_score,
    "Topic Diversity": hdp_topic_diversity_score,
    "Time": elapsed_time,
}])

results_df = pd.concat([results_df, hdp_row], ignore_index=True)


In [None]:

# Save the results to CSV, then display them. The bare expression has to be
# the last statement of the cell for the notebook to render the DataFrame.
results_df.to_csv('./results.csv', index=False)

results_df