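Create several alternative topic-label representations (c-TF-IDF, KeyBERT-inspired on raw and on lemmatized text, part-of-speech, and MMR) for each saved topic model, and save them side by side in one CSV per model.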

In [None]:
import os
import pandas as pd
import numpy as np
import dacy
from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance, KeyBERTInspired, PartOfSpeech
from copy import deepcopy

# Build one CSV per saved BERTopic model that lists, for every topic, the
# label produced by several representation models side by side:
# the model's stored c-TF-IDF label, KeyBERTInspired (raw and lemmatized
# text), PartOfSpeech (lemmatized text) and MaximalMarginalRelevance.

# Set up the directory paths
models_dir = './metrics_by_model'
datasets_dir = './complete_datasets'
output_dir = './multi_rep_topiclabels'

# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# DaCy pipeline: used below to lemmatize the documents and also handed to the
# PartOfSpeech representation model.
nlp = dacy.load("da_dacy_medium_trf-0.2.0")

# Iterate over each subdirectory in models_dir (sorted so that runs are
# reproducible; os.listdir returns entries in arbitrary order).
for subdir in sorted(os.listdir(models_dir)):
    subdir_path = os.path.join(models_dir, subdir)

    # Only directories hold models; skip stray files.
    if not os.path.isdir(subdir_path):
        continue

    model_path = os.path.join(subdir_path, f"{subdir}_topic_model")
    dataset_path = os.path.join(datasets_dir, f"{subdir}.csv")

    # Check if model and dataset files exist
    if not os.path.exists(model_path) or not os.path.exists(dataset_path):
        print(f"Model or dataset file not found for {subdir}")
        continue

    # Load model and dataset
    topic_model = BERTopic.load(model_path)
    text_data = pd.read_csv(dataset_path, index_col=0)

    # Hand BERTopic a plain list instead of a Series so nothing downstream
    # depends on the index that was read from the CSV.
    docs = text_data['text'].tolist()

    # Lemmatize with nlp.pipe so the pipeline processes documents in batches
    # instead of being invoked once per document.
    lem_docs = [
        " ".join(token.lemma_ for token in doc)
        for doc in nlp.pipe(docs)
    ]

    # Original c-TF-IDF Representation (as stored in the loaded model)
    original_ctfidf_info = topic_model.get_topic_info()

    # Define the different representation models
    keybert_model = KeyBERTInspired()
    pos_model = PartOfSpeech(model=nlp)
    mmr_model = MaximalMarginalRelevance(diversity=0.3)

    # Create copies of the original model for each representation, because
    # update_topics is applied to each copy separately below.
    topic_model_keybert = deepcopy(topic_model)
    topic_model_keybertlem = deepcopy(topic_model)
    topic_model_pos = deepcopy(topic_model)
    topic_model_mmr = deepcopy(topic_model)

    # Update copies with respective representation models.
    # NOTE(review): no vectorizer_model is passed here; confirm against the
    # BERTopic docs whether update_topics then falls back to a default
    # CountVectorizer instead of the one the model was fitted with.
    topic_model_keybert.update_topics(docs, representation_model=keybert_model)
    topic_model_keybertlem.update_topics(lem_docs, representation_model=keybert_model)
    topic_model_pos.update_topics(lem_docs, representation_model=pos_model)
    topic_model_mmr.update_topics(docs, representation_model=mmr_model)

    # Extract topic information from each model
    keybert_info = topic_model_keybert.get_topic_info()
    keybertlem_info = topic_model_keybertlem.get_topic_info()
    pos_info = topic_model_pos.get_topic_info()
    mmr_info = topic_model_mmr.get_topic_info()

    # Combine all representations into a single DataFrame. Each label column
    # is looked up by topic id rather than by row position, so a label can
    # never be attached to the wrong topic if the frames' row order differs.
    representation_infos = {
        'KeyBert': keybert_info,
        'KeyBert_lem': keybertlem_info,
        'DaCY_PoS': pos_info,
        'MMR': mmr_info,
    }
    combined_df = original_ctfidf_info[['Topic', 'Count', 'Name']].rename(
        columns={'Name': 'cTFIDF'}
    )
    for column_name, info in representation_infos.items():
        names_by_topic = info.set_index('Topic')['Name']
        combined_df[column_name] = combined_df['Topic'].map(names_by_topic)

    # Save the combined DataFrame as a CSV file
    output_file_path = os.path.join(output_dir, f"{subdir}_combined_representations.csv")
    combined_df.to_csv(output_file_path, index=False)

    print(f"Saved combined representations for {subdir} to {output_file_path}")
