In [5]:
'''
Inspect topic models and reduce outliers
'''

'\nInspect topic models and reduce outliers\n'

In [6]:
#import libraries
import numpy as np
import os
from bertopic import BERTopic
import pandas as pd
from tqdm import tqdm

#specify path to data files
csv_sample_file = "/home/tom/Documents/code/GitHub/dutch-elections/DATA/Parties_and_LeadersV2.csv"

#load data
print('load data')
telegram_df = pd.read_csv(csv_sample_file)

docs = list(telegram_df['text']) #get list of messages
sample_size = print('number of docs', len(docs))

#load embeddings
embeddings = np.load("/home/tom/Documents/code/GitHub/dutch-elections/embeddings/paraphrase-multilingual-mpnet-base-v2.npy")

load data
number of docs 4614


In [7]:
#loop over model folders, load model, reduce outliers, store info for inspection + store updated model

model_folder = "BERTopic_models/elections"

for model_file in tqdm(os.listdir(model_folder)):

    if model_file.endswith('.csv'): #skip csvs with info, only parse folders
        continue

    if model_file.endswith('reduced_outliers'):
        continue

    #load model
    print('load topic model')
    print(model_file)
    topic_model = BERTopic.load(os.path.join(model_folder, model_file), embedding_model = embeddings)

    #refine the topic model: reduce outliers
    old_topics = topic_model.topics_

    print('reduce outliers')
    new_topics = topic_model.reduce_outliers(docs, old_topics)

    print('update topics')
    topic_model.update_topics(docs, topics = new_topics)

    #check new topics
    topic_model_info = topic_model.get_topic_info()

    #store topic model info for inspection
    print('store info')
    topic_model_info.to_csv(os.path.join(model_folder, model_file + '_info.csv'))

    #save the updated model
    print('save updated topic model')
    updated_topic_model_filename = model_file + "_reduced_outliers"
    updated_model_path = os.path.join(model_folder, updated_topic_model_filename)
    if not os.path.isdir(updated_model_path):
        os.mkdir(updated_model_path)
    topic_model.save(updated_model_path, serialization="safetensors", save_ctfidf=True)

  0%|          | 0/3 [00:00<?, ?it/s]

load topic model
paraphrase-multilingual-mpnet-base-v2_topic_model_30
reduce outliers


100%|██████████| 3/3 [00:02<00:00,  1.28it/s]


update topics


 33%|███▎      | 1/3 [00:04<00:08,  4.28s/it]

store info
save updated topic model
load topic model
paraphrase-multilingual-mpnet-base-v2_topic_model_10
reduce outliers


100%|██████████| 2/2 [00:01<00:00,  1.09it/s]


update topics


 67%|██████▋   | 2/3 [00:07<00:03,  3.88s/it]

store info
save updated topic model
load topic model
paraphrase-multilingual-mpnet-base-v2_topic_model_20
reduce outliers


100%|██████████| 2/2 [00:02<00:00,  1.10s/it]


update topics


100%|██████████| 3/3 [00:11<00:00,  3.95s/it]

store info
save updated topic model



