In [42]:
from bertopic import BERTopic
import pandas as pd
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech
import openai

In [22]:
questions_embeddings_df = pd.read_parquet("../data/intermediate/question_analysis/question_embeddings.parquet")

In [20]:
questions_df = pd.read_parquet("../data/intermediate/cleaned_arena_data/questions.parquet")

In [28]:
questions_and_embedding_df = questions_embeddings_df.merge(questions_df[['question_id', 'user_query']], on='question_id')

In [29]:
embedding_cols = [x for x in questions_and_embedding_df.columns if x.startswith("embedding_dim")]

In [32]:
embeddings = questions_and_embedding_df[embedding_cols].values
queries = questions_and_embedding_df['user_query'].values

In [73]:
# Pre-calculate embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=150, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))

In [94]:
prompt = """
I have a topic that contains the following documents:
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short but highly descriptive topic label of at most 8 words. Make sure it is in the following format:
topic: <topic label>
"""
client = openai.OpenAI()
openai_model = OpenAI(client, model="gpt-4.1", exponential_backoff=True, chat=True, prompt=prompt)

In [95]:
keybert_model = KeyBERTInspired()
mmr_model = MaximalMarginalRelevance(diversity=0.3)
representation_model = {
    "KeyBERT": keybert_model,
    "OpenAI": openai_model,  # Uncomment if you will use OpenAI
    "MMR": mmr_model,
}

In [96]:
topic_model = BERTopic(

  # Pipeline models
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  vectorizer_model=vectorizer_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=10,
  calculate_probabilities=True,
  verbose=True
)

In [97]:
topics, probs = topic_model.fit_transform(queries, embeddings)

2025-07-03 23:18:18,555 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-03 23:18:24,835 - BERTopic - Dimensionality - Completed ✓
2025-07-03 23:18:24,836 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-03 23:18:25,836 - BERTopic - Cluster - Completed ✓
2025-07-03 23:18:25,838 - BERTopic - Representation - Fine-tuning topics using representation models.
100%|███████████████████████████████████████████████████████████████████| 11/11 [00:06<00:00,  1.60it/s]
2025-07-03 23:18:33,779 - BERTopic - Representation - Completed ✓


In [169]:
probs.sum(axis=1)

array([0.49861135, 0.54808982, 0.46392351, ..., 0.85211886, 0.84177638,
       0.85857721], shape=(16200,))

In [98]:
topic_info = topic_model.get_topic_info()

In [99]:
topic_info

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,OpenAI,MMR,Representative_Docs
0,-1,8817,-1_write_battle_use_https,"[write, battle, use, https, com, make, like, c...","[ideas, whispers, style, write, template, sugg...",[Vintage Horror Story Idea Generation by Style],"[write, battle, like, characters, mario, list,...",[write 10 story ideas in style Charles Beaumon...
1,0,3499,0_ai_best_llm_use,"[ai, best, llm, use, model, using, models, cod...","[ai, ai model, models, model, google, api, sea...",[Best Free AI Models and Their Uses],"[ai, llm, models, python, file, search, data, ...","[help me find the best AI models, and where to..."
2,1,1091,1_price_stock_market_tariffs,"[price, stock, market, tariffs, money, china, ...","[stock price, stock market, stock, stocks, mar...",[Stock Market Prices and Daily Volatility],"[market, tariffs, china, bank, trading, compan...","[How is the stock market today?, why was the s..."
3,2,599,2_food_water_disease_does,"[food, water, disease, does, effects, eat, pro...","[healthy, fat, diabetes, supplements, vegan, f...",[Nutrition Myths and Disease Risk Factors],"[water, disease, effects, protein, supplements...",[fact checking the claim :\nif u don't eat ani...
4,3,463,3_news_latest news_latest_today,"[news, latest news, latest, today, tell, news ...","[news latest, latest news, news today, today n...",[Latest World News and Events Today],"[news today, city, london, events, tel aviv, 2...","[what is the latest news, what is the latest n..."
5,4,382,4_match_team_won_win,"[match, team, won, win, final, 2025, outcome, ...","[matches, final, tournament, south america, te...",[Europe vs South America World Cup Finals],"[won, final, 2025, teams, league, europe, matc...",[Please find out for me and list all the FIFA ...
6,5,370,5_anime_tell_did_breaking,"[anime, tell, did, breaking, artist, does, fam...","[software engineer, engineer, graduated, caree...",[Biographical Inquiries and Anomalous Media Do...,"[anime, breaking, artist, famous, female, did ...",[Who is Andrei Topala software engineer that g...
7,6,364,6_vs_battles_style_eden,"[vs, battles, style, eden, powers, personaliti...","[battle, battles, combat, strongest, vs, abili...",[Fictional Character Battle Analysis and Compa...,"[vs, battles, eden, ap, abilities, speed, dc, ...","[vs battles appearances, personalities, simila..."
8,7,222,7_book_orthodox_god_summary,"[book, orthodox, god, summary, chapter, st, qu...","[book, chapter, chapters, summary, summary lat...",[Cross Creek Book Summary and Analysis],"[book, orthodox, cross, quotes, internet artic...",[What is the main argument of the book Cross C...
9,8,217,8_song_lyrics_irish_okey,"[song, lyrics, irish, okey, songs, stop, music...","[lyrics, song, testimony, plead, songs, son, m...",[Identifying Songs from Provided Lyrics],"[song, lyrics, irish, okey, songs, testimony, ...","[What song do these lyrics come from\n""Residen..."


In [100]:
new_topics = topic_model.reduce_outliers(queries, topics, strategy='embeddings', embeddings=embeddings)

In [101]:
pd.Series(new_topics).value_counts()

 0    5285
 1    2019
 3    1707
 7    1629
 5    1498
 2    1231
 6     980
 4     729
 8     620
 9     498
-1       4
Name: count, dtype: int64

In [102]:
topic_model.update_topics(queries, topics=new_topics)



In [104]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,OpenAI,MMR,Representative_Docs
0,-1,4,-1_go_condenser_pressures_don,"[go, condenser, pressures, don, cooled, operat...","[ideas, whispers, style, write, template, sugg...",[Vintage Horror Story Idea Generation by Style],"[write, battle, like, characters, mario, list,...",[write 10 story ideas in style Charles Beaumon...
1,0,5285,0_to_the_and_for,"[to, the, and, for, is, in, what, of, it, that]","[ai, ai model, models, model, google, api, sea...",[Best Free AI Models and Their Uses],"[ai, llm, models, python, file, search, data, ...","[help me find the best AI models, and where to..."
2,1,2019,1_the_to_of_in,"[the, to, of, in, and, for, what, is, on, how]","[stock price, stock market, stock, stocks, mar...",[Stock Market Prices and Daily Volatility],"[market, tariffs, china, bank, trading, compan...","[How is the stock market today?, why was the s..."
3,2,1231,2_the_of_to_and,"[the, of, to, and, in, is, for, what, that, or]","[healthy, fat, diabetes, supplements, vegan, f...",[Nutrition Myths and Disease Risk Factors],"[water, disease, effects, protein, supplements...",[fact checking the claim :\nif u don't eat ani...
4,3,1707,3_the_in_what_is,"[the, in, what, is, and, of, news, to, me, on]","[news latest, latest news, news today, today n...",[Latest World News and Events Today],"[news today, city, london, events, tel aviv, 2...","[what is the latest news, what is the latest n..."
5,4,729,4_the_match_in_team,"[the, match, in, team, who, what, of, last, wo...","[matches, final, tournament, south america, te...",[Europe vs South America World Cup Finals],"[won, final, 2025, teams, league, europe, matc...",[Please find out for me and list all the FIFA ...
6,5,1498,5_who_is_the_of,"[who, is, the, of, in, you, and, name, what, her]","[software engineer, engineer, graduated, caree...",[Biographical Inquiries and Anomalous Media Do...,"[anime, breaking, artist, famous, female, did ...",[Who is Andrei Topala software engineer that g...
7,6,980,6_battle_the_characters_and,"[battle, the, characters, and, in, of, vs, cha...","[battle, battles, combat, strongest, vs, abili...",[Fictional Character Battle Analysis and Compa...,"[vs, battles, eden, ap, abilities, speed, dc, ...","[vs battles appearances, personalities, simila..."
8,7,1629,7_the_of_write_and,"[the, of, write, and, in, satirical, to, this,...","[book, chapter, chapters, summary, summary lat...",[Cross Creek Book Summary and Analysis],"[book, orthodox, cross, quotes, internet artic...",[What is the main argument of the book Cross C...
9,8,620,8_the_you_song_to,"[the, you, song, to, youtube, and, it, of, in,...","[lyrics, song, testimony, plead, songs, son, m...",[Identifying Songs from Provided Lyrics],"[song, lyrics, irish, okey, songs, testimony, ...","[What song do these lyrics come from\n""Residen..."


In [129]:
topic_distr, _ = topic_model.approximate_distribution(queries)

100%|███████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 26.07it/s]


In [141]:
pd.Series(topic_distr.sum(axis=1)).value_counts()

1.0    8378
0.0    4384
1.0    1818
1.0    1040
1.0     531
1.0      46
1.0       3
Name: count, dtype: int64

In [None]:
td
for td in topic_distr:
    

In [185]:
topic_distr[9]

array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0.])

In [184]:
queries[9]

'how did Hajduk play its last match'

In [105]:
topic_model.get_top

Unnamed: 0,Topic,Count
0,0,5285
6,1,2019
3,3,1707
8,7,1629
5,5,1498
2,2,1231
7,6,980
1,4,729
9,8,620
4,9,498


In [91]:
topic_model.visualize_hierarchy(custom_labels=True)

In [56]:
topic_info.iloc[0]['MMR']

['write',
 'battle',
 'like',
 'characters',
 'mario',
 'list',
 'satirical',
 'search',
 'https www',
 'did']

In [61]:
topic_model.visualize_hierarchy(custom_labels=True)

In [171]:
question_topic_df = pd.read_parquet("../data/intermediate/question_analysis/question_topics.parquet")

In [174]:
topic_distribution_df = pd.read_parquet("../data/intermediate/question_analysis/question_topic_probabilities.parquet")

In [173]:
question_topic_df.topic.value_counts()

topic
 0    5285
 1    2019
 3    1707
 7    1629
 5    1498
 2    1231
 6     980
 4     729
 8     620
 9     498
-1       4
Name: count, dtype: int64

In [178]:
topic_cols = topic_distribution_df.columns[1:]

In [181]:
topic_distribution_df[topic_cols].sum(axis=1).value_counts()

1.0    8037
0.0    4384
1.0    1867
1.0    1208
1.0     639
1.0      51
1.0      12
1.0       2
Name: count, dtype: int64

In [183]:
topic_distribution_df[topic_cols]

Unnamed: 0,topic_0_prob,topic_1_prob,topic_2_prob,topic_3_prob,topic_4_prob,topic_5_prob,topic_6_prob,topic_7_prob,topic_8_prob,topic_9_prob
0,0.149179,0.122304,0.10589,0.133161,0.087395,0.127015,0.07539,0.086747,0.112919,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.083978,0.096802,0.087397,0.352823,0.087865,0.077692,0.070845,0.074743,0.067856,0.0
4,0.121943,0.113099,0.113612,0.199826,0.100722,0.112318,0.022483,0.078666,0.071368,0.065964
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6,0.0,0.063811,0.128698,0.139608,0.069787,0.059946,0.410158,0.066456,0.0,0.061537
7,0.0,0.063811,0.128698,0.139608,0.069787,0.059946,0.410158,0.066456,0.0,0.061537
8,0.114779,0.097524,0.092425,0.19245,0.060717,0.119093,0.060074,0.102628,0.131744,0.028565
9,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [186]:
topic_distribution_df.sort_values(by=['topic_4_prob'])

Unnamed: 0,question_id,topic_0_prob,topic_1_prob,topic_2_prob,topic_3_prob,topic_4_prob,topic_5_prob,topic_6_prob,topic_7_prob,topic_8_prob,topic_9_prob
8099,q_00016337,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
7917,q_00016018,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,0.0,0.0
7918,q_00016019,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,0.0,0.0
14129,q_00028703,0.225395,0.0,0.0,0.0,0.0,0.0,0.0,0.774605,0.0,0.0
14127,q_00028701,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
12205,q_00024838,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.000000,0.0,0.0
3657,q_00007209,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.000000,0.0,0.0
14663,q_00029605,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.000000,0.0,0.0
760,q_00001505,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.000000,0.0,0.0


In [190]:
queries[3657]

'Because last time you said yes'