In [1]:
import numpy as np
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 1. Configuration
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
DEVICE = "cuda" # Forces GPU usage

print(f"CUDA Available: {torch.cuda.is_available()}")
print(f"Device Name: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")

#set seeds for reproducibility
np.random.seed(42)
torch.manual_seed(42)

CUDA Available: True
Device Name: NVIDIA GeForce RTX 3060 Laptop GPU


<torch._C.Generator at 0x27c9528ac10>

In [3]:
def perform_topic_modeling(df: pd.DataFrame, nr_topics: int = 13):
    docs = df['transcript_filtered'].dropna().tolist()

    # We add specific ones: mr, president, russia, applause, etc
    custom_stop_words = list(CountVectorizer(stop_words="english").get_stop_words())
    custom_stop_words += [
    "mr", "president", "russia", "russian", "federation",
    "putin", "state", "year", "years", "time", "today",
    "work", "people", "country", "applause", "translation",
    "question", "answer", "think", "know", "want", "thank","like","need",
    "grate", "good", "new"
    ]

    vectorizer_model = CountVectorizer(stop_words=custom_stop_words, ngram_range=(1, 3)) #ngrams up to 3 words
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2", device="cuda")

    topic_model = BERTopic(
        embedding_model=embedding_model,
        vectorizer_model=vectorizer_model, # Use our custom cleaner
        nr_topics=nr_topics+1,                      #  + outlier
        verbose=True,
    )

    print("Training model...")
    topics, probs = topic_model.fit_transform(docs)

    #outliers reassignment
    print(f"Original outlier count: {topics.count(-1)}")
    new_topics = topic_model.reduce_outliers(docs, topics, strategy="embeddings")
    topic_model.update_topics(docs, topics=new_topics, vectorizer_model=vectorizer_model)
    print("Outliers reassigned")

    #aggregate results
    print("Aggregating results...")
    df['found_topic_id'] = new_topics
    topic_name_map = {}
    for topic in set(new_topics):
        keywords = [word for word, _ in topic_model.get_topic(topic)][:5]
        topic_name_map[topic] = ", ".join(keywords)

    # Map the keywords into a new column
    df['found_topic_keywords'] = df['found_topic_id'].map(topic_name_map)
    print(df[['found_topic_id','found_topic_keywords']].value_counts())
    return df

def map_topic_names(df: pd.DataFrame, id_to_name_map: dict) -> pd.DataFrame:
    if 'found_topic_id' in df.columns:
        df['topic_name'] = df['found_topic_id'].map(id_to_name_map)
    else:
        print("Error: 'found_topic_id' column not found. Please check your CSV.")
    return df


In [4]:
df = pd.read_csv("../data/putins_talks_prepared.csv",encoding='utf-8')
df_topcs = perform_topic_modeling(df, nr_topics=13)

2026-01-18 22:56:59,116 - BERTopic - Embedding - Transforming documents to embeddings.


Training model...


Batches: 100%|██████████| 159/159 [00:14<00:00, 10.86it/s]
2026-01-18 22:57:14,761 - BERTopic - Embedding - Completed ✓
2026-01-18 22:57:14,762 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-18 22:57:40,613 - BERTopic - Dimensionality - Completed ✓
2026-01-18 22:57:40,615 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-18 22:57:40,799 - BERTopic - Cluster - Completed ✓
2026-01-18 22:57:40,800 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2026-01-18 22:58:08,375 - BERTopic - Representation - Completed ✓
2026-01-18 22:58:08,456 - BERTopic - Topic reduction - Reducing number of topics
2026-01-18 22:58:08,489 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-18 22:58:36,672 - BERTopic - Representation - Completed ✓
2026-01-18 22:58:36,753 - BERTopic - Topic reduction - Reduced number of topics from 120 to 14


Original outlier count: 1219




Outliers reassigned
Aggregating results...
found_topic_id  found_topic_keywords                                        
0               cooperation, relations, economic, countries, trade              2100
1               great, important, world, war, friends                            646
2               development, energy, gas, industry, percent                      616
3               percent, economy, government, economic, budget                   279
6               medical, regions, situation, healthcare, government              256
4               sports, sport, olympic, athletes, world                          255
5               defence, military, forces, navy, equipment                       232
7               research, science, education, university, important              189
8               law, rights, important, court, society                           182
9               syria, syrian, military, forces, international                   115
11              housing, gover

In [5]:
id_to_name_map = {
    0:  "International Relations & Trade", # cooperation, relations, economic...
    1:  "Global Politics & History",       # great, important, world, war...
    2:  "Energy & Industrial Dev",         # development, energy, gas, industry...
    3:  "Economy & Budget",                # percent, economy, government, budget...
    4:  "Sports & Olympics",               # sports, sport, olympic...
    5:  "Defense & Military",              # defence, military, forces...
    6:  "Healthcare & Regions",            # medical, regions, situation...
    7:  "Science & Education",             # research, science, education...
    8:  "Law, Rights & Judiciary",         # law, rights, important, court...
    9:  "Syria Conflict",                  # syria, syrian, military...
    10: "Business & Info Dev",             # important, business, development...
    11: "Housing & Construction",          # housing, government, regions...
    12: "Customs & Oversight"              # service, customs, accounts chamber...
}
df_topics = map_topic_names(df_topcs, id_to_name_map)
output_file = "../data/putins_talks_with_topics"
df_topics.to_csv(output_file, index=False)
print(f"\nSuccess! Labeled data saved to '{output_file}'")


Success! Labeled data saved to '../data/putins_talks_with_topics'


## Lets focus on International Relations & Trade

In [6]:
df_picked_topic = df_topcs[df_topcs['found_topic_id'] == 0]
df_subtopics = perform_topic_modeling(df_picked_topic, nr_topics=6)

2026-01-18 23:02:10,833 - BERTopic - Embedding - Transforming documents to embeddings.


Training model...


Batches: 100%|██████████| 66/66 [00:05<00:00, 11.95it/s]
2026-01-18 23:02:16,786 - BERTopic - Embedding - Completed ✓
2026-01-18 23:02:16,787 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-18 23:02:27,355 - BERTopic - Dimensionality - Completed ✓
2026-01-18 23:02:27,357 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-18 23:02:27,412 - BERTopic - Cluster - Completed ✓
2026-01-18 23:02:27,413 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2026-01-18 23:02:38,238 - BERTopic - Representation - Completed ✓
2026-01-18 23:02:38,271 - BERTopic - Topic reduction - Reducing number of topics
2026-01-18 23:02:38,287 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-18 23:02:49,383 - BERTopic - Representation - Completed ✓
2026-01-18 23:02:49,433 - BERTopic - Topic reduction - Reduced number of topics from 59 to 7


Original outlier count: 244




Outliers reassigned
Aggregating results...
found_topic_id  found_topic_keywords                              
0               cooperation, relations, countries, economic, trade    1064
1               development, important, course, government, just       465
2               economic, countries, trade, cooperation, union         317
3               ukraine, crimea, ukrainian, sevastopol, situation      136
4               argentina, cooperation, brazil, relations, latin        60
5               sco, cooperation, csto, organisation, countries         58
Name: count, dtype: int64


In [7]:
id_to_name_map = {
    0: "International Relations & Trade",       # cooperation, relations, economic... (Generic)
    1: "National Development & Policy",         # development, important, course... (Internal)
    2: "Economic Unions & Trade Blocs",         # economic, countries, union... (EAEU context)
    3: "Ukraine & Crimea Crisis",               # ukraine, crimea, sevastopol...
    4: "Latin American Relations",              # argentina, cooperation, brazil...
    5: "Security Alliances (SCO/CSTO)"          # sco, cooperation, csto...
}

df_subtopics = map_topic_names(df_subtopics, id_to_name_map)
subtopics_list = df_subtopics['topic_name'].tolist()

In [8]:
final_subtopics = []
for i in range(len(df_topics)):
    if df_topics.loc[i, 'found_topic_id'] == 0:
        final_subtopics.append(subtopics_list.pop(0))
    else:
        final_subtopics.append(df_topics.loc[i, 'topic_name'])

df_topics['detailed_topic_name'] = final_subtopics
output_file = "../data/putins_talks_with_detailed_topics"
df_topics.to_csv(output_file, index=False)