In [1]:
import pandas as pd
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.backend import BaseEmbedder
from sentence_transformers import SentenceTransformer

# Step 1: Load data
gold_df = pd.read_csv("gold-dataset-sinha-khandait.csv")
headlines = gold_df["News"].dropna().astype(str).tolist()

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# class CustomEmbedder(BaseEmbedder):
#     def __init__(self, model_name):
#         self.model = SentenceTransformer(model_name)
#     def embed(self, documents):  # <--- accepts extra arguments now
#         return self.model.encode(documents, show_progress_bar=True)

In [None]:
# You can pass custom embedding model if you like (e.g., all-MiniLM-L6-v2)
# embedder = CustomEmbedder("all-MiniLM-L6-v2")
# topic_model = BERTopic(embedding_model=embedder, calculate_probabilities=True, verbose=True)

from umap import UMAP
from hdbscan import HDBSCAN

hdbscan_model = HDBSCAN(min_cluster_size=60)  # Try increasing to 30, 50, etc.

topic_model = BERTopic(hdbscan_model=hdbscan_model)

In [4]:
topics, probs = topic_model.fit_transform(headlines)

# Step 3: Save topics back to dataframe
gold_df_filtered = gold_df.loc[gold_df["News"].notna()].copy()
gold_df_filtered["Topic"] = topics

# Step 4: View top 10 topics
print(topic_model.get_topic_info().head(10))

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


   Topic  Count                       Name  \
0     -1   3418           -1_gold_to_on_at   
1      0   1466  0_futures_gold_cues_close   
2      1   1063    1_dollar_prices_as_ends   
3      2    502      2_is_the_charts_angel   
4      3    415          3_dec_or_at_ounce   
5      4    401  4_india_imports_import_in   
6      5    378     5_data_after_us_report   
7      6    367         6_fed_rate_hike_as   
8      7    329     7_asia_china_in_prices   
9      8    291        8_rs_cues_silver_by   

                                      Representation  \
0  [gold, to, on, at, in, as, silver, trade, up, ...   
1  [futures, gold, cues, close, global, an, ounce...   
2  [dollar, prices, as, ends, week, gold, for, bu...   
3  [is, the, charts, angel, gold, and, commoditie...   
4  [dec, or, at, ounce, an, settles, up, down, to...   
5  [india, imports, import, in, duty, indias, to,...   
6  [data, after, us, report, jobs, holds, claims,...   
7  [fed, rate, hike, as, minutes, ahead, meet

In [5]:
sample_topic = 0  # change this to see different clusters
print(f"\n--- Sample Headlines from Topic {sample_topic} ---")
print(gold_df_filtered[gold_df_filtered["Topic"] == sample_topic]["News"].head(5))

# Step 6: Visualize
topic_model.visualize_topics().show()


--- Sample Headlines from Topic 0 ---
2        Gold futures edge up after two-session decline
7     Gold futures fall for the session, but gain fo...
24    gold futures down over $20/oz ahead of settlement
32         gold futures at highest since early december
33                    gold futures close narrowly lower
Name: News, dtype: object


In [6]:
topic_model.visualize_barchart(top_n_topics=10).show()

In [7]:
topic_model.visualize_hierarchy().show()