In [1]:
import numpy as np
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
import torch
from tqdm import tqdm
from src.utils import ExperimentLogger, ResourceTracker, set_global_seed
logger = ExperimentLogger(experiment_name="topic_modeling_bertopic")
set_global_seed(42)

  from .autonotebook import tqdm as notebook_tqdm


Logging experiments to: reports\topic_modeling_bertopic_20260125_175532.json
Global seed set to: 42


In [2]:
# 1. Configuration
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
DEVICE = "cuda" # Forces GPU usage

print(f"CUDA Available: {torch.cuda.is_available()}")
print(f"Device Name: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")

#set seeds for reproducibility
np.random.seed(42)
torch.manual_seed(42)

CUDA Available: True
Device Name: NVIDIA GeForce RTX 3060 Laptop GPU


<torch._C.Generator at 0x1fd799ab2d0>

In [3]:
def perform_topic_modeling(df: pd.DataFrame, nr_topics: int = 13):
    docs = df['transcript_filtered'].dropna().tolist()

    # We add specific ones: mr, president, russia, applause, etc
    custom_stop_words = list(CountVectorizer(stop_words="english").get_stop_words())
    custom_stop_words += [
    "mr", "president", "russia", "russian", "federation",
    "putin", "state", "year", "years", "time", "today",
    "work", "people", "country", "applause", "translation",
    "question", "answer", "think", "know", "want", "thank","like","need",
    "grate", "good", "new"
    ]

    vectorizer_model = CountVectorizer(stop_words=custom_stop_words, ngram_range=(1, 3)) #ngrams up to 3 words
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2", device="cuda")

    topic_model = BERTopic(
        embedding_model=embedding_model,
        vectorizer_model=vectorizer_model, # Use our custom cleaner
        nr_topics=nr_topics+1,                      #  + outlier
        verbose=True,
    )

    print("Training model...")
    topics, probs = topic_model.fit_transform(docs)

    #outliers reassignment
    print(f"Original outlier count: {topics.count(-1)}")
    new_topics = topic_model.reduce_outliers(docs, topics, strategy="embeddings")
    topic_model.update_topics(docs, topics=new_topics, vectorizer_model=vectorizer_model)
    print("Outliers reassigned")

    #aggregate results
    print("Aggregating results...")
    df['found_topic_id'] = new_topics
    topic_name_map = {}
    for topic in set(new_topics):
        keywords = [word for word, _ in topic_model.get_topic(topic)][:5]
        topic_name_map[topic] = ", ".join(keywords)

    # Map the keywords into a new column
    df['found_topic_keywords'] = df['found_topic_id'].map(topic_name_map)
    print(df[['found_topic_id','found_topic_keywords']].value_counts())
    return df

def map_topic_names(df: pd.DataFrame, id_to_name_map: dict) -> pd.DataFrame:
    if 'found_topic_id' in df.columns:
        df['topic_name'] = df['found_topic_id'].map(id_to_name_map)
    else:
        print("Error: 'found_topic_id' column not found. Please check your CSV.")
    return df


In [4]:
df = pd.read_csv("../data/putins_talks_prepared.csv",encoding='utf-8')

with ResourceTracker("Topic Modeling") as tracker:

    df_topcs = perform_topic_modeling(df, nr_topics=13)
logger.log_operation("Topic Modeling", tracker.duration, tracker.peak_memory_mb)

2026-01-25 17:55:37,115 - BERTopic - Embedding - Transforming documents to embeddings.


Training model...


Batches: 100%|██████████| 159/159 [00:20<00:00,  7.93it/s]
2026-01-25 17:55:58,561 - BERTopic - Embedding - Completed ✓
2026-01-25 17:55:58,563 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-25 17:56:42,103 - BERTopic - Dimensionality - Completed ✓
2026-01-25 17:56:42,103 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-25 17:56:42,635 - BERTopic - Cluster - Completed ✓
2026-01-25 17:56:42,635 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2026-01-25 17:57:47,419 - BERTopic - Representation - Completed ✓
2026-01-25 17:57:47,482 - BERTopic - Topic reduction - Reducing number of topics
2026-01-25 17:57:47,576 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-25 17:58:55,817 - BERTopic - Representation - Completed ✓
2026-01-25 17:58:55,891 - BERTopic - Topic reduction - Reduced number of topics from 118 to 14


Original outlier count: 1272




Outliers reassigned
Aggregating results...
found_topic_id  found_topic_keywords                                    
0               cooperation, relations, countries, economic, trade          1445
1               development, important, just, course, government            1031
2               important, great, world, friends, war                        788
3               law, rights, important, citizens, colleagues                 292
5               sports, sport, olympic, athletes, world                      265
4               defence, military, forces, navy, equipment                   233
6               economy, percent, economic, government, situation            202
9               percent, industry, agricultural, production, support         200
10              medical, healthcare, regions, government, important          171
7               syria, syrian, relations, countries, international           170
8               bank, economy, financial, investment, central bank        

In [5]:
id_to_name_map = {
    0:  "International Relations & Trade", # cooperation, relations, economic...
    1:  "Global Politics & History",       # great, important, world, war...
    2:  "Energy & Industrial Dev",         # development, energy, gas, industry...
    3:  "Economy & Budget",                # percent, economy, government, budget...
    4:  "Sports & Olympics",               # sports, sport, olympic...
    5:  "Defense & Military",              # defence, military, forces...
    6:  "Healthcare & Regions",            # medical, regions, situation...
    7:  "Science & Education",             # research, science, education...
    8:  "Law, Rights & Judiciary",         # law, rights, important, court...
    9:  "Syria Conflict",                  # syria, syrian, military...
    10: "Business & Info Dev",             # important, business, development...
    11: "Housing & Construction",          # housing, government, regions...
    12: "Customs & Oversight"              # service, customs, accounts chamber...
}
df_topics = map_topic_names(df_topcs, id_to_name_map)
output_file = "../data/putins_talks_with_topics"
df_topics.to_csv(output_file, index=False)
print(f"\nSuccess! Labeled data saved to '{output_file}'")


Success! Labeled data saved to '../data/putins_talks_with_topics'


## Lets focus on International Relations & Trade

In [6]:
df_picked_topic = df_topcs[df_topcs['found_topic_id'] == 0]
df_subtopics = perform_topic_modeling(df_picked_topic, nr_topics=6)

'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: 527920d8-10ff-4219-80fd-6ae2669cefff)')' thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 1s [Retry 1/5].
2026-01-25 18:00:37,313 - BERTopic - Embedding - Transforming documents to embeddings.


Training model...


Batches: 100%|██████████| 46/46 [00:04<00:00, 10.40it/s]
2026-01-25 18:00:41,944 - BERTopic - Embedding - Completed ✓
2026-01-25 18:00:41,945 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-25 18:00:50,680 - BERTopic - Dimensionality - Completed ✓
2026-01-25 18:00:50,680 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-25 18:00:50,719 - BERTopic - Cluster - Completed ✓
2026-01-25 18:00:50,720 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2026-01-25 18:00:55,469 - BERTopic - Representation - Completed ✓
2026-01-25 18:00:55,485 - BERTopic - Topic reduction - Reducing number of topics
2026-01-25 18:00:55,493 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-25 18:01:00,459 - BERTopic - Representation - Completed ✓
2026-01-25 18:01:00,477 - BERTopic - Topic reduction - Reduced number of topics from 43 to 7


Original outlier count: 139




Outliers reassigned
Aggregating results...
found_topic_id  found_topic_keywords                                  
0               cooperation, relations, countries, economic, trade        831
1               relations, cooperation, european, economic, trade         403
3               economic, union, trade, eurasian, integration              64
2               brics, countries, cooperation, brazil, brics countries     51
4               sco, csto, organisation, cooperation, states               51
5               argentina, cooperation, relations, latin, countries        45
Name: count, dtype: int64


In [7]:
id_to_name_map = {
    0: "International Relations & Trade",       # cooperation, relations, economic... (Generic)
    1: "National Development & Policy",         # development, important, course... (Internal)
    2: "Economic Unions & Trade Blocs",         # economic, countries, union... (EAEU context)
    3: "Ukraine & Crimea Crisis",               # ukraine, crimea, sevastopol...
    4: "Latin American Relations",              # argentina, cooperation, brazil...
    5: "Security Alliances (SCO/CSTO)"          # sco, cooperation, csto...
}

df_subtopics = map_topic_names(df_subtopics, id_to_name_map)
subtopics_list = df_subtopics['topic_name'].tolist()

In [8]:
final_subtopics = []
for i in range(len(df_topics)):
    if df_topics.loc[i, 'found_topic_id'] == 0:
        final_subtopics.append(subtopics_list.pop(0))
    else:
        final_subtopics.append(df_topics.loc[i, 'topic_name'])

df_topics['detailed_topic_name'] = final_subtopics
output_file = "../data/putins_talks_with_detailed_topics"
df_topics.to_csv(output_file, index=False)