In [1]:
'''
Check improved topic representation for selected model
'''

'\nCheck improved topic representation for selected model\n'

In [2]:
#import libraries
import numpy as np
import os
from bertopic import BERTopic
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#specify path to data files
csv_sample_file = "/home/tom/Documents/code/GitHub/dutch-elections/DATA/Parties_and_Leaders_74128_20251027_143525.csv"

#load data
print('load data')
telegram_df = pd.read_csv(csv_sample_file)

docs = list(telegram_df['text']) #get list of messages
sample_size = print('number of docs', len(docs))

#load embeddings
embedding = np.load("/home/tom/Documents/code/GitHub/dutch-elections/embeddings/paraphrase-multilingual-mpnet-base-v2.npy")

  from .autonotebook import tqdm as notebook_tqdm


load data
number of docs 4530


In [4]:
model_folder = "BERTopic_models"

for model_file in tqdm(os.listdir(model_folder)):

    if not model_file.endswith('reduced_outliers'):
        continue

    #load model
    print('load topic model')
    topic_model = BERTopic.load(os.path.join(model_folder, model_file), embedding_model = embedding)


    #set parameters for topic labels
    vectorizer_model = CountVectorizer(ngram_range=(1, 2), max_df = 0.6)
    topic_model.update_topics(docs, vectorizer_model=vectorizer_model)

    topic_labels = topic_model.generate_topic_labels(nr_words=20,
                                                    topic_prefix=True,
                                                    word_length=50,
                                                    separator=", ")

    print('update topic model labels')
    topic_model.set_topic_labels(topic_labels)

    #save the updated topic model and topic model information
    print('save updated topic model and topic model information')
    updated_topic_model_directory = os.path.join(model_folder, model_file + '_updated_keywords')
    if not os.path.isdir(updated_topic_model_directory):
        print('create topic model folder:', updated_topic_model_directory)
        os.mkdir(updated_topic_model_directory)

    print('save updated topic model')
    embedding_model = SentenceTransformer("")
    topic_model.save(updated_topic_model_directory, serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model)

    #check new topics
    topic_model_info = topic_model.get_topic_info()

    #store topic model info for inspection
    print('store info')
    topic_model_info.to_csv(os.path.join(model_folder, model_file + 'reduced_outliers_updated_keywords.csv'))


  0%|          | 0/20 [00:00<?, ?it/s]

load topic model
update topic model labels
save updated topic model and topic model information
save updated topic model


  5%|‚ñå         | 1/20 [00:04<01:20,  4.21s/it]

store info
load topic model
update topic model labels
save updated topic model and topic model information
create topic model folder: BERTopic_models/paraphrase-multilingual-mpnet-base-v2_topic_model_90_reduced_outliers_updated_keywords
save updated topic model


 30%|‚ñà‚ñà‚ñà       | 6/20 [00:08<00:17,  1.26s/it]

store info
load topic model
update topic model labels
save updated topic model and topic model information
create topic model folder: BERTopic_models/paraphrase-multilingual-mpnet-base-v2_topic_model_70_reduced_outliers_updated_keywords
save updated topic model


 55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 11/20 [00:13<00:09,  1.07s/it]

store info
load topic model
update topic model labels
save updated topic model and topic model information
create topic model folder: BERTopic_models/paraphrase-multilingual-mpnet-base-v2_topic_model_50_reduced_outliers_updated_keywords
save updated topic model


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 12/20 [00:17<00:12,  1.56s/it]

store info
load topic model
update topic model labels
save updated topic model and topic model information
create topic model folder: BERTopic_models/paraphrase-multilingual-mpnet-base-v2_topic_model_100_reduced_outliers_updated_keywords
save updated topic model


 65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 13/20 [00:21<00:14,  2.00s/it]

store info
load topic model
update topic model labels
save updated topic model and topic model information
create topic model folder: BERTopic_models/paraphrase-multilingual-mpnet-base-v2_topic_model_60_reduced_outliers_updated_keywords
save updated topic model


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:26<00:00,  1.31s/it]

store info





In [None]:
#inspect the topic model
import numpy as np
from bertopic import BERTopic

print('load topic model')
embedding = np.load("/home/tom/Documents/code/GitHub/dutch-elections/embeddings/paraphrase-multilingual-mpnet-base-v2.npy")
topic_model = BERTopic.load("home/tom/Documents/code/GitHub/dutch-elections/BERTopic_models/paraphrase-multilingual-mpnet-base-v2_topic_model_70_reduced_outliers_updated_keywords", embedding_model = embedding)
topic_info = topic_model.get_topic_info()

load topic model


In [None]:
#fit topics to documents
print('fit docs to topics')
topics = topic_model.topics_  
doc_to_topic_df = pd.DataFrame({"document": docs, "topic": topics})
doc_to_topic_df.to_csv('doc_to_topic.csv')

In [24]:
doc_to_topic_df

Unnamed: 0,document,topic
0,–í –ø–æ–¥–¥–µ—Ä–∂–∫—É –î—É—Ä–æ–≤–∞ –≤ Telegram –∑–∞–ø—É—Å—Ç–∏–ª–∏ —Ö–µ—à—Ç–µ–≥...,5
1,ü§∑üèª‚Äç‚ôÇÔ∏èüì¢ –ü—Ä—è–º–æ–π —ç—Ñ–∏—Ä - –ø–æ–¥–ø–∏—Å–∞—Ç—å—Å—è,433
2,‚ö°Ô∏è –û—Å–Ω–æ–≤–∞—Ç–µ–ª—å Telegram –ü–∞–≤–µ–ª –î—É—Ä–æ–≤ –±—ã–ª –æ–±—ä—è–≤–ª–µ...,5
3,‚ö°Ô∏è–ú–æ—à–µ–Ω–Ω–∏–∫–∏ –æ—Ç –∏–º–µ–Ω–∏ –î—É—Ä–æ–≤–∞ —É–∂–µ –∞–∫—Ç–∏–≤–∏–∑–∏—Ä–æ–≤–∞–ª–∏...,5
4,"–ü–æ –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏–∏ –°–ú–ò, –≤–æ –§—Ä–∞–Ω—Ü–∏–∏ –∑–∞–¥–µ—Ä–∂–∞–Ω –ü–∞–≤–µ–ª –î...",5
...,...,...
334466,‚òÄÔ∏è –í –î–µ–Ω—å –†–æ—Å—Å–∏–∏ –≤ –í–æ—Ä–æ–Ω–µ–∂–µ –æ–∂–∏–¥–∞–µ—Ç—Å—è —Ç–µ–º–ø–µ—Ä–∞—Ç...,40
334467,üò≥ –ñ–∏—Ç–µ–ª–∏ –ñ–ö ¬´–¶–≤–µ—Ç–Ω–æ–π –±—É–ª—å–≤–∞—Ä¬ª —Ä–µ—à–∏–ª–∏ —É—Å—Ç—Ä–æ–∏—Ç—å ...,331
334468,üöò ¬´–ê–≤—Ç–æ–í–ê–ó¬ª –ø–ª–∞–Ω–∏—Ä—É–µ—Ç –≤ —Å–µ–Ω—Ç—è–±—Ä–µ —Å–Ω–æ–≤–∞ –ø–æ–¥–Ω—è—Ç...,43
334469,üß± ¬´–°—é—Ä–ø—Ä–∏–∑¬ª –≤ –≤–∏–¥–µ –∫–∏—Ä–ø–∏—á–∞ –ø—Ä–∏–ª–µ—Ç–µ–ª –≤ –ª–æ–±–æ–≤–æ–µ ...,12
