In [33]:
import pandas as pd
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.backend import BaseEmbedder
from sentence_transformers import SentenceTransformer

from umap import UMAP
from hdbscan import HDBSCAN
from keybert import KeyBERT


# Step 1: Load data
gold_df = pd.read_csv("gold-dataset-sinha-khandait.csv")
headlines = gold_df["News"].dropna().astype(str).tolist()

In [34]:
headlines

['april gold down 20 cents to settle at $1,116.10/oz',
 'gold suffers third straight daily decline',
 'Gold futures edge up after two-session decline',
 "dent research : is gold's day in the sun coming soon?",
 'Gold snaps three-day rally as Trump, lawmakers reach debt-ceiling deal',
 'Dec. gold climbs $9.40, or 0.7%, to settle at $1,356.90/oz',
 'gold falls by rs 25 on sluggish demand, global cues',
 'Gold futures fall for the session, but gain for the week',
 'Gold struggles; silver slides, base metals falter',
 'april gold holds slight gain, up $2.50, or 0.2%, at $1320.20/oz.',
 'feb. gold ends up $9.60, or 1.1%, at $901.60 an ounce',
 'gold trades in red in early trade; eyes near-term range at rs 28,300-28,600',
 'gold loses 1.3%, but logs monthly gain of 6.3%',
 "gold recovery? here's one way to play it",
 'gold prices rebound rs 350 on global cues, weak rupee',
 'can investment in gold, sensex & ppfs give the same returns?',
 'gold rush spooks economy as trade deficit surges to $

In [None]:
import re
import numpy as np

headlines_lower = [h.lower() for h in headlines]

months = r"\b(january|february|march|april|may|june|july|august|september|october|november|december|jan|feb|mar|apr|jun|jul|aug|sep|oct|nov|dec)\b"
directions = r"\b(up|down|higher|lower|rise|rises|fall|falls|gain|gains|loses|loss|rebound|slip|climb|surge|drop|drops|edged|edges|recover|recovery|recovers|flat)\b"
numbers = r"[\d\.,]+[%$]?|\d{1,3}(,\d{3})*(\.\d+)?|\d+"
symbols = r"\/oz|rs|bn|usd|\$|%|oz"

cleaned_headlines = []
for h in headlines_lower:
    h_clean = re.sub(months, "", h, flags=re.IGNORECASE)
    h_clean = re.sub(directions, "", h_clean, flags=re.IGNORECASE)
    h_clean = re.sub(numbers, "", h_clean)
    h_clean = re.sub(symbols, "", h_clean, flags=re.IGNORECASE)
    h_clean = re.sub(r"[^\w\s]", "", h_clean)  # remove punctuation
    h_clean = re.sub(r"\s+", " ", h_clean).strip()  # clean up spaces
    cleaned_headlines.append(h_clean.lower())

In [36]:
vectorizer = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 3),  # uptil trigrams
    min_df=10,  # ignore words in less than 10 headlines (0.1%)
    max_df=0.5,  # ignore words in more than 50% headlines
    max_features=5_000,
    token_pattern=r"(?u)\b[\w\-]+\b",  # Keep hyphenated phrases (e.g., "AI-driven")
)

In [37]:
hdbscan_model = HDBSCAN(
    min_cluster_size=60,  # Test values between 30-100
    min_samples=10,  # Avoids micro-clusters (10-30% of min_cluster_size)
    cluster_selection_epsilon=0.1,  # Merges nearby clusters
)

topic_model = BERTopic(
    embedding_model="all-mpnet-base-v2",
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer,
    verbose=True,
)

In [38]:
topics, probs = topic_model.fit_transform(cleaned_headlines)

2025-04-10 21:29:36,712 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 331/331 [00:16<00:00, 20.33it/s]
2025-04-10 21:29:57,772 - BERTopic - Embedding - Completed ✓
2025-04-10 21:29:57,772 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-10 21:30:00,135 - BERTopic - Dimensionality - Completed ✓
2025-04-10 21:30:00,136 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-10 21:30:00,318 - BERTopic - Cluster - Completed ✓
2025-04-10 21:30:00,320 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-10 21:30:00,483 - BERTopic - Representation - Completed ✓


In [44]:
cleaned_headlines

['gold cents to settle at',
 'gold suffe third straight daily decline',
 'gold futures edge after twosession decline',
 'dent research is golds day in the sun coming soon',
 'gold snaps threeday rally as trump lawmake reach debtceiling deal',
 'gold climbs or to settle at',
 'gold by on sluggish demand global cues',
 'gold futures for the session but for the week',
 'gold struggles silver slides base metals falter',
 'gold holds slight or at',
 'gold ends or at an ounce',
 'gold trades in red in early trade eyes nearterm range at',
 'gold but logs monthly of',
 'gold heres one way to play it',
 'gold prices on global cues weak rupee',
 'can investment in gold sensex ppfs give the same returns',
 'gold rush spooks economy as trade deficit surges to',
 'gold futures at',
 'state street gold likely to back to an ounce by the end of',
 'gold prices as dollar remains supported',
 'gold to trade in range achiieve equities',
 'gold holds firm near on dovish fed',
 'gold prices finish at a mor

In [7]:
# Step 3: Save topics back to dataframe
gold_df_filtered = gold_df.loc[gold_df["News"].notna()].copy()
gold_df_filtered["Topic"] = topics

In [8]:
# Step 4: View top 10 topics
print(topic_model.get_topic_info().head(10))

   Topic  Count                                   Name  \
0     -1   2656               -1_ounce_fit_cents_today   
1      0   1758       0_global_demand_global cues_cues   
2      1    710                  1_low_month_week_hits   
3      2    639           2_china_ease_support_investo   
4      3    482                3_fed_minutes_rate_hike   
5      4    220  4_week gold_session_session gold_week   
6      5    216       5_highest_high gold_record_level   
7      6    213                6_demand_rates_market_q   
8      7    192     7_dollar gold_steady_remain_weaker   
9      8    189           8_hike_cut_likely_price gold   

                                      Representation  \
0  [ounce, fit, cents, today, day, session, weekl...   
1  [global, demand, global cues, cues, buy, deman...   
2  [low, month, week, hits, week high, lows, high...   
3  [china, ease, support, investo, demand, slight...   
4  [fed, minutes, rate, hike, rates, steady, gold...   
5  [week gold, session, s

In [9]:
sample_topic = 0  # change this to see different clusters
print(f"\n--- Sample Headlines from Topic {sample_topic} ---")
print(gold_df_filtered[gold_df_filtered["Topic"] == sample_topic]["News"].head(5))

# Step 6: Visualize
topic_model.visualize_topics().show()


--- Sample Headlines from Topic 0 ---
3     dent research : is gold's day in the sun comin...
6     gold falls by rs 25 on sluggish demand, global...
8     Gold struggles; silver slides, base metals falter
9     april gold holds slight gain, up $2.50, or 0.2...
15    can investment in gold, sensex & ppfs give the...
Name: News, dtype: object


In [10]:
topic_model.visualize_barchart(top_n_topics=10).show()

In [11]:
topic_model.visualize_hierarchy().show()

In [12]:
topic_model.visualize_heatmap()

In [13]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import itertools

In [14]:
# Step 1: Get topic embeddings and IDs
topic_ids = topic_model.get_topic_info().Topic.tolist()
# Filter out -1 (outliers)
topic_ids = [t for t in topic_ids if t != -1]

In [15]:
embeddings = topic_model.topic_embeddings_
# Only keep embeddings for the selected topic IDs
topic_idx_map = {i: topic_ids.index(i) for i in topic_ids}
filtered_embeddings = np.array([embeddings[i] for i in topic_ids])

In [16]:
# Step 2: Compute cosine similarity matrix
cosine_sim = cosine_similarity(filtered_embeddings)

# Step 3: Find topic pairs with high similarity (excluding diagonal)
threshold = 0.85
highly_similar_pairs = []
for i, j in itertools.combinations(range(len(topic_ids)), 2):
    if cosine_sim[i, j] >= threshold:
        highly_similar_pairs.append((topic_ids[i], topic_ids[j]))

In [17]:
len(highly_similar_pairs)

34

In [18]:
highly_similar_pairs

[(0, 1),
 (0, 2),
 (0, 3),
 (0, 4),
 (0, 5),
 (0, 8),
 (0, 15),
 (0, 18),
 (0, 28),
 (0, 31),
 (0, 32),
 (0, 37),
 (1, 3),
 (1, 15),
 (1, 31),
 (1, 37),
 (2, 6),
 (3, 4),
 (3, 8),
 (4, 8),
 (5, 16),
 (5, 32),
 (14, 25),
 (15, 21),
 (16, 18),
 (16, 32),
 (18, 28),
 (18, 32),
 (18, 35),
 (18, 39),
 (28, 32),
 (30, 37),
 (33, 36),
 (34, 37)]

In [19]:
len(topic_ids)

41

In [20]:
unique_topics = gold_df_filtered.Topic.unique()

In [21]:
gold_df_filtered

Unnamed: 0,Dates,URL,News,Price Direction Up,Price Direction Constant,Price Direction Down,Asset Comparision,Past Information,Future Information,Price Sentiment,Topic
0,28-01-2016,http://www.marketwatch.com/story/april-gold-do...,"april gold down 20 cents to settle at $1,116.1...",0,0,1,0,1,0,negative,-1
1,13-09-2017,http://www.marketwatch.com/story/gold-prices-s...,gold suffers third straight daily decline,0,0,1,0,1,0,negative,18
2,26-07-2016,http://www.marketwatch.com/story/gold-futures-...,Gold futures edge up after two-session decline,1,0,0,0,1,0,positive,31
3,28-02-2018,https://www.metalsdaily.com/link/277199/dent-r...,dent research : is gold's day in the sun comin...,0,0,0,0,0,1,none,0
4,06-09-2017,http://www.marketwatch.com/story/gold-steadies...,"Gold snaps three-day rally as Trump, lawmakers...",0,0,1,0,1,0,negative,-1
...,...,...,...,...,...,...,...,...,...,...,...
10565,07-01-2013,https://www.moneycontrol.com/news/business/mar...,gold seen falling from 3-week high this week,0,0,1,0,1,0,negative,1
10566,27-09-2018,https://www.metalsdaily.com/link/284468/domini...,dominic frisby : now looks like a good time to...,1,0,0,0,0,1,positive,0
10567,03-03-2017,https://www.thehindubusinessline.com/markets/g...,Gold heading for worst week since November on ...,0,0,1,0,1,0,negative,1
10568,11-06-2008,http://www.marketwatch.com/story/august-gold-u...,august gold up $7.60 at $878.80 an ounce on nymex,1,0,0,0,1,0,positive,-1


In [22]:
# Step 1: Build Union-Find to track connected components
class UnionFind:
    def __init__(self):
        self.parent = {}

    def find(self, x):
        if x != self.parent.setdefault(x, x):
            self.parent[x] = self.find(self.parent[x])
        return self.parent[x]

    def union(self, x, y):
        self.parent[self.find(y)] = self.find(x)


uf = UnionFind()
for a, b in highly_similar_pairs:
    uf.union(min(a, b), max(a, b))  # Always union to the smaller ID

# Step 2: Build final topic mapping to lowest ID in each group
# Also apply to all unique topics (including those not in pairs)
final_mapping = {}
for topic in unique_topics:
    if topic == -1:
        final_mapping[topic] = -1
    else:
        final_mapping[topic] = uf.find(topic)

# Step 3: Apply the mapping to the dataframe
gold_df_filtered["Merged_Topic"] = gold_df_filtered["Topic"].map(final_mapping)

In [23]:
# remove the noise data
noise_data = gold_df_filtered[gold_df_filtered.Merged_Topic == -1]

gold_df_filtered = gold_df_filtered[gold_df_filtered.Merged_Topic != -1]

In [24]:
noise_data.shape

(2656, 12)

In [25]:
gold_df_filtered.shape

(7914, 12)

In [26]:
len(gold_df_filtered["Merged_Topic"].unique())

20

In [27]:
gold_df_filtered["Merged_Topic"].value_counts()

Merged_Topic
34    5488
14     237
7      192
9      186
10     176
11     173
12     162
33     148
13     146
17     117
19     105
20     102
22     100
23      96
24      95
26      91
27      89
29      84
38      66
40      61
Name: count, dtype: int64

In [28]:
gold_df_filtered[gold_df_filtered.Merged_Topic == 39]["News"]
# seems to be downward price movement

Series([], Name: News, dtype: object)

In [29]:
gold_df_filtered[gold_df_filtered.Merged_Topic == 38]["News"]
# macroeconomic events

112                 gold futures fall 0.24% on global cues
224               gold futures rebound on firm global cues
242       gold rises rs 67 in futures trade on global cues
334                 gold futures give up europe-deal gains
629                 gold futures rise rs 34 on global cues
                               ...                        
9402         Gold futures down at Rs 30,800 on global cues
9571     Gold, silver futures rise with U.S. data, Ukra...
9667                  gold futures rise as japan mulls buy
10152         gold futures up by rs 91 on firm global cues
10187    Gold Rate Today: Gold rises in futures trade; ...
Name: News, Length: 66, dtype: object

In [30]:
gold_df_filtered[gold_df_filtered.Merged_Topic == 13]["News"].head(10)
# bullish movement

10     feb. gold ends up $9.60, or 1.1%, at $901.60 a...
45          gold ends 0.5% higher, at $1,159.60 an ounce
207           gold futures up $12.20 at $954.70 an ounce
213    june gold futures fall $11.60, or 0.9%, to set...
245          gold futures up $8.50 at $1,005.20 an ounce
401    gold ends floor trade up $9.8 at $1191.7 an ounce
471               gold futures up $3.40 at $653 an ounce
477       gold futures down $10.70 at $1,590.40 an ounce
518         gold futures up $12.20 at $1,212.40 an ounce
533         gold futures up $11.40 at $1,198.70 an ounce
Name: News, dtype: object

In [31]:
gold_df_filtered[gold_df_filtered.Merged_Topic == 28]["News"].head(10)
# broader market conditions

Series([], Name: News, dtype: object)