Evaluate each BERTopic model using the same metrics that were used for the other models in OCTIS.

Preliminaries.

In [None]:
import pandas as pd
from bertopic import BERTopic
from gensim.models.coherencemodel import CoherenceModel
from gensim import corpora
import numpy as np
import os

Define a function that computes topic diversity.

In [None]:

# Function to calculate topic diversity
def calculate_topic_diversity(df, top_words_column):
    """Return the fraction of unique words among all listed top words.

    Every entry of ``df[top_words_column]`` is split on ``' - '`` and the
    resulting words are pooled; the score is
    ``number of distinct words / total number of words``.

    Args:
        df: Any mapping-like table (e.g. a pandas DataFrame) whose
            ``top_words_column`` entry is iterable.
        top_words_column: Name of the column holding the ' - '-separated
            top-word strings.

    Returns:
        The diversity ratio as a float in (0, 1], or 0 when no words were
        found at all.
    """
    unique_words = set()
    total_words = 0

    for top_words in df[top_words_column]:
        # Empty CSV cells come back from pandas as NaN (a float), which has no
        # .split(); skip anything that is not text instead of crashing.
        if not isinstance(top_words, str):
            continue
        # Adjust the separator here if the word lists are joined differently.
        # Blank tokens (e.g. from padded word lists) are not real words.
        words = [word.strip() for word in top_words.split(' - ')]
        words = [word for word in words if word]
        unique_words.update(words)
        total_words += len(words)

    if total_words == 0:
        return 0  # Avoid division by zero
    return len(unique_words) / total_words


Define a function that evaluates a single model: coherence (C_V and NPMI) and topic diversity.

In [None]:

# Function to process each model
def process_model(model_dir, model_name):
    """Compute coherence and diversity metrics for one saved BERTopic model.

    The function expects three artefacts to exist:
    ``<model_dir>/<model_name>/<model_name>_topic_model``,
    ``<model_dir>/<model_name>/<model_name>_embeddings.npy`` and
    ``./doc_topic_assignments/<model_name>.csv``. The CSV must provide the
    columns 'Document', 'Topic' and 'Top_n_words'.

    Args:
        model_dir: Directory that contains one sub-directory per model.
        model_name: Name of the model (and of its sub-directory).

    Returns:
        A dict with the keys "Model", "Number of Documents",
        "Number of Topics", "C_V Coherence", "NPMI Coherence" and
        "Topic Diversity", or ``None`` (after printing a message) when any of
        the three artefacts is missing. The coherence values are ``None``
        when the model has no non-outlier topic with words.
    """
    model_path = os.path.join(model_dir, model_name, f"{model_name}_topic_model")
    embeddings_path = os.path.join(model_dir, model_name, f"{model_name}_embeddings.npy")
    csv_file_path = os.path.join('./doc_topic_assignments', f"{model_name}.csv")

    # Check if the paths exist. The embeddings file is still required so that
    # incomplete model directories are skipped as before, but it is no longer
    # loaded: none of the metrics below use the embeddings.
    required_paths = (model_path, embeddings_path, csv_file_path)
    if not all(os.path.exists(path) for path in required_paths):
        print(f"Files not found for model {model_name}")
        return None

    # Load the topic model and the per-document topic assignments
    topic_model = BERTopic.load(os.path.abspath(model_path))
    analyzer = topic_model.vectorizer_model.build_analyzer()
    documents_df = pd.read_csv(csv_file_path)

    # Convert all entries in 'Document' column to strings (NaN -> 'nan') so
    # that the ' '.join below cannot fail on non-text cells
    documents_df['Document'] = documents_df['Document'].astype(str)

    # Reference texts for coherence: one concatenated, tokenized text per topic
    documents_per_topic = documents_df.groupby('Topic').agg({'Document': ' '.join})
    tokens = [analyzer(doc) for doc in documents_per_topic['Document'].values]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in tokens]

    # Get all valid topics from the model, excluding -1 (outliers)
    all_topics = topic_model.get_topics() or {}
    topic_ids = [topic for topic in all_topics if topic != -1]

    topic_words = []
    for topic in topic_ids:
        # NOTE(review): BERTopic appears to pad short topics with empty-string
        # words; those are dropped because they are not vocabulary terms.
        words = [word for word, _ in topic_model.get_topic(topic) if word]
        if words:
            topic_words.append(words)

    # Coherence calculation (left as None when there is nothing to score)
    coherence = {'c_v': None, 'c_npmi': None}
    if topic_words:
        for measure in coherence:
            coherence[measure] = CoherenceModel(
                topics=topic_words,
                texts=tokens,
                corpus=corpus,
                dictionary=dictionary,
                coherence=measure,
            ).get_coherence()

    # Calculate Topic Diversity from the 'Top_n_words' column.
    # The CSV has one row per document, so a topic's word list is presumably
    # repeated on every document of that topic. Keep a single row per topic
    # (and drop the outlier topic, as for coherence); otherwise the total word
    # count scales with the number of documents and the score is meaningless.
    topic_rows = documents_df[documents_df['Topic'] != -1].drop_duplicates(subset='Topic')
    topic_diversity = calculate_topic_diversity(topic_rows, 'Top_n_words')

    return {
        "Model": model_name,
        "Number of Documents": len(documents_df),
        # Count the non-outlier topics directly rather than assuming that an
        # outlier topic (-1) is always present.
        "Number of Topics": len(topic_ids),
        "C_V Coherence": coherence['c_v'],
        "NPMI Coherence": coherence['c_npmi'],
        "Topic Diversity": topic_diversity
    }


Iterate over the models, collect the results, and save them to a CSV file.

In [None]:

# Iterate over models and collect results.
# Every sub-directory of metrics_dir is treated as one model; models for which
# process_model() returns nothing (missing files) are left out of the table.
results = []
metrics_dir = './metrics_by_model'
# os.listdir() returns entries in arbitrary order; sort them so that the row
# order of the results table (and of the saved CSV) is reproducible.
for model_name in sorted(os.listdir(metrics_dir)):
    model_dir = os.path.join(metrics_dir, model_name)
    if not os.path.isdir(model_dir):
        continue
    model_result = process_model(metrics_dir, model_name)
    if model_result:
        results.append(model_result)

# Compile results into a DataFrame
results_df = pd.DataFrame(results)
results_df

# Save the metrics table (the DataFrame index is written as the first column)
results_df.to_csv('./model_coherence_diversity.csv')