In [24]:
import pandas as pd
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.backend import BaseEmbedder
from sentence_transformers import SentenceTransformer

from umap import UMAP
from hdbscan import HDBSCAN
from keybert import KeyBERT


# Step 1: Load data
gold_df = pd.read_csv("gold-dataset-sinha-khandait.csv")
headlines = gold_df["News"].dropna().astype(str).tolist()

In [None]:
vectorizer = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 3),  # uptil trigrams
    min_df=10,  # ignore words in less than 10 headlines (0.1%)
    max_df=0.5,  # ignore words in more than 50% headlines
    max_features=5_000,
    token_pattern=r"(?u)\b[\w\-]+\b",  # Keep hyphenated phrases (e.g., "AI-driven")
)

In [None]:
hdbscan_model = HDBSCAN(
    min_cluster_size=60,  # Test values between 30-100
    min_samples=10,  # Avoids micro-clusters (10-30% of min_cluster_size)
    cluster_selection_epsilon=0.1,  # Merges nearby clusters
)

topic_model = BERTopic(
    embedding_model="all-mpnet-base-v2",
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer,
    verbose=True,
)

In [27]:
topics, probs = topic_model.fit_transform(headlines)

2025-04-10 17:55:20,137 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 331/331 [00:26<00:00, 12.64it/s]
2025-04-10 17:55:47,130 - BERTopic - Embedding - Completed ✓
2025-04-10 17:55:47,130 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-10 17:55:48,641 - BERTopic - Dimensionality - Completed ✓
2025-04-10 17:55:48,642 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-10 17:55:48,775 - BERTopic - Cluster - Completed ✓
2025-04-10 17:55:48,777 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-10 17:55:48,941 - BERTopic - Representation - Completed ✓


In [28]:
# Step 3: Save topics back to dataframe
gold_df_filtered = gold_df.loc[gold_df["News"].notna()].copy()
gold_df_filtered["Topic"] = topics

In [29]:
# Step 4: View top 10 topics
print(topic_model.get_topic_info().head(10))

   Topic  Count                                       Name  \
0     -1   2235  -1_closes_gold silver_oz gold_gold closes   
1      0   1297          0_data_data gold_holds_gold holds   
2      1    680              1_gold s_gold gold_price_time   
3      2    558           2_gold demand_rupee_demand_price   
4      3    423           3_metals_shares_cues_global cues   
5      4    297          4_demand_gold silver_global_slips   
6      5    214    5_dec gold_dec_settles 1_gold settles 1   
7      6    206            6_feb_february gold_february_90   
8      7    205            7_weekly_loss_biggest_gold ends   
9      8    185                   8_rs_global cues_cues_31   

                                      Representation  \
0  [closes, gold silver, oz gold, gold closes, se...   
1  [data, data gold, holds, gold holds, steady, u...   
2  [gold s, gold gold, price, time, market, gold ...   
3  [gold demand, rupee, demand, price, gold price...   
4  [metals, shares, cues, global cues

In [30]:
sample_topic = 0  # change this to see different clusters
print(f"\n--- Sample Headlines from Topic {sample_topic} ---")
print(gold_df_filtered[gold_df_filtered["Topic"] == sample_topic]["News"].head(5))

# Step 6: Visualize
topic_model.visualize_topics().show()


--- Sample Headlines from Topic 0 ---
19    gold prices slip lower as dollar remains suppo...
25    Gold holds modest gains, up $3.10, or 0.3%, at...
29                       Gold little-changed after data
49    Gold holds near 3-1/2 week low as investors op...
53    gold edges down as dollar gains on fed rate hi...
Name: News, dtype: object


In [31]:
topic_model.visualize_barchart(top_n_topics=10).show()

In [32]:
topic_model.visualize_hierarchy().show()

In [None]:
topic_model.visualize_heatmap()

In [34]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import itertools

In [None]:
# Step 1: Get topic embeddings and IDs
topic_ids = topic_model.get_topic_info().Topic.tolist()
# Filter out -1 (outliers)
topic_ids = [t for t in topic_ids if t != -1]

In [None]:
embeddings = topic_model.topic_embeddings_
# Only keep embeddings for the selected topic IDs
topic_idx_map = {i: topic_ids.index(i) for i in topic_ids}
filtered_embeddings = np.array([embeddings[i] for i in topic_ids])

In [37]:
# Step 2: Compute cosine similarity matrix
cosine_sim = cosine_similarity(filtered_embeddings)

# Step 3: Find topic pairs with high similarity (excluding diagonal)
threshold = 0.85
highly_similar_pairs = []
for i, j in itertools.combinations(range(len(topic_ids)), 2):
    if cosine_sim[i, j] >= threshold:
        highly_similar_pairs.append((topic_ids[i], topic_ids[j]))

In [38]:
len(highly_similar_pairs)

38

In [39]:
highly_similar_pairs

[(0, 1),
 (0, 2),
 (0, 4),
 (0, 5),
 (0, 8),
 (0, 16),
 (0, 24),
 (0, 25),
 (0, 28),
 (1, 5),
 (1, 24),
 (1, 25),
 (1, 43),
 (4, 33),
 (4, 46),
 (5, 28),
 (6, 18),
 (6, 31),
 (6, 41),
 (6, 50),
 (9, 17),
 (11, 16),
 (11, 36),
 (18, 19),
 (18, 50),
 (19, 32),
 (19, 41),
 (20, 32),
 (20, 39),
 (22, 49),
 (25, 28),
 (29, 47),
 (32, 44),
 (33, 46),
 (33, 47),
 (36, 47),
 (36, 49),
 (42, 43)]

In [49]:
len(topic_ids)

51

In [55]:
unique_topics = gold_df_filtered.Topic.unique()

In [None]:
# Step 1: Build Union-Find to track connected components
class UnionFind:
    def __init__(self):
        self.parent = {}

    def find(self, x):
        if x != self.parent.setdefault(x, x):
            self.parent[x] = self.find(self.parent[x])
        return self.parent[x]

    def union(self, x, y):
        self.parent[self.find(y)] = self.find(x)


uf = UnionFind()
for a, b in highly_similar_pairs:
    uf.union(min(a, b), max(a, b))  # Always union to the smaller ID

# Step 2: Build final topic mapping to lowest ID in each group
# Also apply to all unique topics (including those not in pairs)
final_mapping = {}
for topic in unique_topics:
    if topic == -1:
        final_mapping[topic] = -1
    else:
        final_mapping[topic] = uf.find(topic)

# Step 3: Apply the mapping to the dataframe
gold_df_filtered["Merged_Topic"] = gold_df_filtered["Topic"].map(final_mapping)

In [59]:
# remove the noise data
noise_data = gold_df_filtered[gold_df_filtered.Merged_Topic == -1]

gold_df_filtered = gold_df_filtered[gold_df_filtered.Merged_Topic != -1]

In [62]:
noise_data.shape

(2235, 12)

In [63]:
gold_df_filtered.shape

(8335, 12)

In [61]:
len(gold_df_filtered["Merged_Topic"].unique())

22

In [64]:
gold_df_filtered["Merged_Topic"].value_counts()

Merged_Topic
42    4555
20    1040
3      423
9      301
7      205
10     166
12     159
13     151
14     144
15     133
21     121
23     115
26     112
27     102
30      90
34      83
35      77
37      75
38      75
40      73
45      68
48      67
Name: count, dtype: int64

In [None]:
gold_df_filtered[gold_df_filtered.Merged_Topic == 48]["News"]
# seems to be downward price movement

473           gold futures drop $16 to settle at $1,713.80
477         gold futures down $10.70 at $1,590.40 an ounce
918              gold futures down 10 cents at $1187.60/oz
1457             gold futures down 0.5% at $1,371 an ounce
1609              gold futures fall to settle at $1,744.70
                               ...                        
10203    Gold futures close down $2.50, or 0.2%, at $1,...
10243               Gold futures off 0.4% at $1,313.60/oz.
10247            gold futures down 40c at $876.80 an ounce
10389     december gold futures off $45.70 to $1,762.40 oz
10413        dec. gold futures down $1.70 at $594.20/ounce
Name: News, Length: 67, dtype: object


In [None]:
gold_df_filtered[gold_df_filtered.Merged_Topic == 45]["News"]
# macroeconomic events

505      in the seconds before jobs report, e-mini futu...
701        gold futures rally after u.s. employment report
706      Gold futures up $10 from the settlement as inv...
906      gold futures fall from settlement level after ...
953      Gold prices settle sharply higher ahead of FOM...
                               ...                        
10164     gold futures drop as fed policy comes into focus
10265    Gold futures finish lower ahead of Fed interes...
10331    Gold futures off the highs after U.S. consumer...
10339    gold futures pare slight gain after comments f...
10459    gold futures add to gains after ism, housing data
Name: News, Length: 68, dtype: object

In [None]:
gold_df_filtered[gold_df_filtered.Merged_Topic == 20]["News"].head(10)
# bullish movement

9     april gold holds slight gain, up $2.50, or 0.2...
10    feb. gold ends up $9.60, or 1.1%, at $901.60 a...
26    april gold up $1.40 to end the day at $554.40/...
31       august gold ends up $1.60, or 0.2%, at $936.20
33                    gold futures close narrowly lower
35    feb gold falls 10c to $452.20/oz in morning ny...
39        feb. gold down $1.20 to close at $1,596.70/oz
50    gold ends higher, logs biggest weekly gain sin...
51              August gold rises 0.2% at $1,257.50/oz.
60    Gold jumps 0.3% from settlement price after Fe...
Name: News, dtype: object

In [None]:
gold_df_filtered[gold_df_filtered.Merged_Topic == 3]["News"].head(10)
# broader market conditions

2         Gold futures edge up after two-session decline
24     gold futures down over $20/oz ahead of settlement
66     metals shares turn higher; gold futures still ...
74             gold futures open higher, as dollar drops
84          metals shares fall, but gold futures edge up
112               gold futures fall 0.24% on global cues
117                  gold issues, futures prices inch up
137            gold prices gain along with stock futures
167             gold futures rise, as dollar edges lower
170    rjo futures : gold versus silver: q2 to benefi...
Name: News, dtype: object