In [None]:
'''
Pipeline for training BERTopic models for different minimum topic size thresholds as well as for different precomputed embedding models.
https://huggingface.co/cointegrated/rubert-tiny2 
https://huggingface.co/ai-forever/sbert_large_mt_nlu_ru

'''

#import libraries
import numpy as np
import os
import pandas as pd
from bertopic import BERTopic

def embedding_name_from_file(embedding_file):
    """Return the embedding model name encoded in an embedding file name.

    Only the final extension is removed, so model names that themselves
    contain dots (for example ``"gte-base-v1.5.npy"``) are kept intact
    instead of being cut at the first dot.

    Args:
        embedding_file: File name (or path) of a saved embedding matrix.

    Returns:
        The base file name without its last extension.
    """
    return os.path.splitext(os.path.basename(embedding_file))[0]


if __name__ == "__main__":

    # path to the CSV file holding the messages
    csv_sample_file = "/home/tom/Documents/code/GitHub/dutch-elections/DATA/Parties_and_Leaders_74128_20251027_143525.csv"

    # folder holding one precomputed embedding matrix per embedding model
    embeddings_folder = "embeddings"

    # load data
    print('load data')
    telegram_df = pd.read_csv(csv_sample_file)

    # NOTE(review): assumes the 'text' column has no missing values and that
    # its rows line up with the rows of every embedding matrix - confirm.
    docs = list(telegram_df['text'])
    print('number of docs', len(docs))

    # train one topic model per (embedding model, minimum topic size) pair
    min_topic_sizes = [50, 60, 70, 80, 90, 100]

    # Embedding files form the outer loop so that each matrix is read from
    # disk once instead of once per topic size; sorting makes the processing
    # order reproducible (os.listdir returns entries in arbitrary order).
    for embedding_file in sorted(os.listdir(embeddings_folder)):
        embedding_path = os.path.join(embeddings_folder, embedding_file)

        # sub-directories cannot be loaded as an embedding matrix
        if not os.path.isfile(embedding_path):
            continue

        embedding_name = embedding_name_from_file(embedding_file)

        print('load embeddings')
        embedding = np.load(embedding_path)

        for topic_size in min_topic_sizes:

            # make sure the output folder exists; makedirs also creates the
            # parent "BERTopic_models" folder on a first run
            topic_model_folder = f"BERTopic_models/{embedding_name}_topic_model_{topic_size}"
            if not os.path.isdir(topic_model_folder):
                print('create topic model folder')
                os.makedirs(topic_model_folder, exist_ok=True)

            # fit the model to the messages using the precomputed embeddings
            print('fit topic model with min size', str(topic_size), 'and model', embedding_name)
            topic_model = BERTopic(min_topic_size=topic_size, verbose=True)
            topics, probabilities = topic_model.fit_transform(docs, embedding)

            print('save topic model')
            topic_model.save(topic_model_folder, serialization="safetensors", save_ctfidf=True)

  from .autonotebook import tqdm as notebook_tqdm
2025-10-27 16:59:34,629 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


load data
number of docs 4530
create topic model folder
load embeddings
fit topic model with min size 50 and model paraphrase-multilingual-mpnet-base-v2


2025-10-27 16:59:48,649 - BERTopic - Dimensionality - Completed ✓
2025-10-27 16:59:48,649 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-10-27 16:59:48,760 - BERTopic - Cluster - Completed ✓
2025-10-27 16:59:48,765 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-10-27 16:59:49,178 - BERTopic - Representation - Completed ✓
2025-10-27 16:59:49,387 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


save topic model
create topic model folder
load embeddings
fit topic model with min size 60 and model paraphrase-multilingual-mpnet-base-v2


2025-10-27 16:59:50,243 - BERTopic - Dimensionality - Completed ✓
2025-10-27 16:59:50,243 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-10-27 16:59:50,359 - BERTopic - Cluster - Completed ✓
2025-10-27 16:59:50,360 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-10-27 16:59:50,784 - BERTopic - Representation - Completed ✓
2025-10-27 16:59:50,940 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


save topic model
create topic model folder
load embeddings
fit topic model with min size 70 and model paraphrase-multilingual-mpnet-base-v2


2025-10-27 16:59:51,745 - BERTopic - Dimensionality - Completed ✓
2025-10-27 16:59:51,745 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-10-27 16:59:51,869 - BERTopic - Cluster - Completed ✓
2025-10-27 16:59:51,871 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-10-27 16:59:52,287 - BERTopic - Representation - Completed ✓
2025-10-27 16:59:52,444 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


save topic model
create topic model folder
load embeddings
fit topic model with min size 80 and model paraphrase-multilingual-mpnet-base-v2


2025-10-27 16:59:53,268 - BERTopic - Dimensionality - Completed ✓
2025-10-27 16:59:53,268 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-10-27 16:59:53,407 - BERTopic - Cluster - Completed ✓
2025-10-27 16:59:53,408 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-10-27 16:59:53,833 - BERTopic - Representation - Completed ✓
2025-10-27 16:59:54,015 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


save topic model
create topic model folder
load embeddings
fit topic model with min size 90 and model paraphrase-multilingual-mpnet-base-v2


2025-10-27 16:59:54,814 - BERTopic - Dimensionality - Completed ✓
2025-10-27 16:59:54,815 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-10-27 16:59:54,975 - BERTopic - Cluster - Completed ✓
2025-10-27 16:59:54,977 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-10-27 16:59:55,405 - BERTopic - Representation - Completed ✓
2025-10-27 16:59:55,591 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


save topic model
create topic model folder
load embeddings
fit topic model with min size 100 and model paraphrase-multilingual-mpnet-base-v2


2025-10-27 16:59:56,388 - BERTopic - Dimensionality - Completed ✓
2025-10-27 16:59:56,389 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-10-27 16:59:56,539 - BERTopic - Cluster - Completed ✓
2025-10-27 16:59:56,541 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-10-27 16:59:56,945 - BERTopic - Representation - Completed ✓


save topic model
