In [None]:
import pandas as pd
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.backend import BaseEmbedder
from sentence_transformers import SentenceTransformer

from umap import UMAP
from hdbscan import HDBSCAN
from keybert import KeyBERT
import hdbscan


# Step 1: Load data
gold_df = pd.read_csv("gold-dataset-sinha-khandait.csv")
headlines = gold_df["News"].dropna().astype(str).tolist()

In [188]:
headlines

['april gold down 20 cents to settle at $1,116.10/oz',
 'gold suffers third straight daily decline',
 'Gold futures edge up after two-session decline',
 "dent research : is gold's day in the sun coming soon?",
 'Gold snaps three-day rally as Trump, lawmakers reach debt-ceiling deal',
 'Dec. gold climbs $9.40, or 0.7%, to settle at $1,356.90/oz',
 'gold falls by rs 25 on sluggish demand, global cues',
 'Gold futures fall for the session, but gain for the week',
 'Gold struggles; silver slides, base metals falter',
 'april gold holds slight gain, up $2.50, or 0.2%, at $1320.20/oz.',
 'feb. gold ends up $9.60, or 1.1%, at $901.60 an ounce',
 'gold trades in red in early trade; eyes near-term range at rs 28,300-28,600',
 'gold loses 1.3%, but logs monthly gain of 6.3%',
 "gold recovery? here's one way to play it",
 'gold prices rebound rs 350 on global cues, weak rupee',
 'can investment in gold, sensex & ppfs give the same returns?',
 'gold rush spooks economy as trade deficit surges to $

In [189]:
import re
import numpy as np

headlines_lower = [h.lower() for h in headlines]

months = r"\b(january|february|march|april|may|june|july|august|september|october|november|december|jan|feb|mar|apr|jun|jul|aug|sep|oct|nov|dec)\b"
directions = r"\b(up|down|higher|lower|rise|rises|fall|falls|gain|gains|loses|loss|rebound|slip|climb|surge|drop|drops|edged|edges|recover|recovery|recovers|flat)\b"
numbers = r"[\d\.,]+[%$]?|\d{1,3}(,\d{3})*(\.\d+)?|\d+"
symbols = r"\/oz|rs|bn|usd|\$|%|oz"

cleaned_headlines = []
for h in headlines_lower:
    h_clean = re.sub(months, "", h, flags=re.IGNORECASE)
    h_clean = re.sub(directions, "", h_clean, flags=re.IGNORECASE)
    h_clean = re.sub(numbers, "", h_clean)
    h_clean = re.sub(symbols, "", h_clean, flags=re.IGNORECASE)
    h_clean = re.sub(r"[^\w\s]", "", h_clean)  # remove punctuation
    h_clean = re.sub(r"\s+", " ", h_clean).strip()  # clean up spaces
    cleaned_headlines.append(h_clean.lower())

In [None]:
embedding_model = SentenceTransformer("all-mpnet-base-v2")

In [191]:
vectorizer = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 3),  # uptil trigrams
    min_df=10,  # ignore words in less than 10 headlines (0.1%)
    max_df=0.5,  # ignore words in more than 50% headlines
    max_features=5_000,
    token_pattern=r"(?u)\b[\w\-]+\b",  # Keep hyphenated phrases (e.g., "AI-driven")
)

In [None]:
hdbscan_model = HDBSCAN(
    min_cluster_size=60,  # Test values between 30-100
    min_samples=10,  # Avoids micro-clusters (10-30% of min_cluster_size)
    cluster_selection_epsilon=0.1,  # Merges nearby clusters
    prediction_data=True,  # for soft clustering
)

topic_model = BERTopic(
    embedding_model="all-mpnet-base-v2",
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer,
    verbose=True,
)

In [193]:
topics, probs = topic_model.fit_transform(cleaned_headlines)

2025-04-12 12:58:07,235 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 331/331 [00:14<00:00, 22.47it/s]
2025-04-12 12:58:23,054 - BERTopic - Embedding - Completed ✓
2025-04-12 12:58:23,054 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-12 12:58:24,863 - BERTopic - Dimensionality - Completed ✓
2025-04-12 12:58:24,863 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-12 12:58:25,038 - BERTopic - Cluster - Completed ✓
2025-04-12 12:58:25,040 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-12 12:58:25,168 - BERTopic - Representation - Completed ✓


In [194]:
topic_model.hdbscan_model.probabilities_

array([1.        , 0.55478093, 1.        , ..., 1.        , 0.        ,
       0.        ])

In [195]:
(probs == topic_model.hdbscan_model.probabilities_).all()

True

In [196]:
prob_matrix = np.array(hdbscan.all_points_membership_vectors(topic_model.hdbscan_model))

In [197]:
prob_matrix

array([[8.58923919e-309, 1.50322002e-308, 1.49600445e-308, ...,
        2.29645126e-308, 2.07576989e-308, 2.13872452e-308],
       [2.88044999e-003, 4.38422344e-003, 2.67845296e-003, ...,
        1.45931938e-002, 1.24105209e-002, 1.21733072e-002],
       [8.57322910e-309, 1.62224124e-308, 8.35480154e-309, ...,
        7.89861204e-308, 5.37253983e-308, 5.17751800e-308],
       ...,
       [2.24197258e-003, 3.64260385e-003, 2.05412774e-003, ...,
        2.57005835e-002, 2.11023993e-002, 2.03454744e-002],
       [3.58802234e-004, 4.44653207e-004, 3.96751949e-004, ...,
        9.46120439e-004, 9.13195379e-004, 9.17652478e-004],
       [3.58807603e-004, 4.44657326e-004, 3.96761629e-004, ...,
        9.46117583e-004, 9.13196381e-004, 9.17653267e-004]])

In [198]:
normalized_prob = prob_matrix / prob_matrix.sum(axis=1, keepdims=True)

In [199]:
normalized_prob[1]

array([0.00519205, 0.00790262, 0.00482795, 0.0075443 , 0.0059417 ,
       0.00745636, 0.00505384, 0.00744744, 0.0090925 , 0.00806509,
       0.00905552, 0.01046981, 0.01013724, 0.03899505, 0.03452448,
       0.01275697, 0.0158701 , 0.01408243, 0.01955297, 0.02626033,
       0.03132747, 0.04280084, 0.02458003, 0.01553566, 0.01419409,
       0.01313178, 0.02357973, 0.01865766, 0.02596097, 0.03127887,
       0.1368678 , 0.0655756 , 0.06684495, 0.0785185 , 0.02129558,
       0.01602758, 0.02015936, 0.02281768, 0.02630443, 0.02237013,
       0.02194255])

In [None]:
def get_hdbscan_probabilities(topic_model, documents):
    """Get full probability distributions for HDBSCAN using soft clustering"""
    # First ensure we have prediction data
    if not hasattr(topic_model.hdbscan_model, "prediction_data_"):
        raise ValueError("HDBSCAN needs to be initialized with prediction_data=True")

    # Get embeddings
    embeddings = topic_model._extract_embeddings(documents)

    # Get all cluster probabilities using HDBSCAN's soft clustering
    soft_clusters = hdbscan.all_points_membership_vectors(topic_model.hdbscan_model)

    # Convert to numpy array and normalize
    prob_matrix = np.array(soft_clusters)
    normalized_probs = prob_matrix / prob_matrix.sum(axis=1, keepdims=True)

    return normalized_probs
    # return prob_matrix


# Usage:
hdbscan_probs = get_hdbscan_probabilities(topic_model, cleaned_headlines)

In [215]:
hdbscan_probs

array([[8.58923919e-309, 1.50322002e-308, 1.49600445e-308, ...,
        2.29645126e-308, 2.07576989e-308, 2.13872452e-308],
       [5.19204940e-003, 7.90262103e-003, 4.82794707e-003, ...,
        2.63044259e-002, 2.23701288e-002, 2.19425481e-002],
       [8.57322910e-309, 1.62224124e-308, 8.35480154e-309, ...,
        7.89861204e-308, 5.37253983e-308, 5.17751800e-308],
       ...,
       [2.24197258e-003, 3.64260385e-003, 2.05412774e-003, ...,
        2.57005835e-002, 2.11023993e-002, 2.03454744e-002],
       [9.49999826e-003, 1.17730724e-002, 1.05047920e-002, ...,
        2.50504085e-002, 2.41786524e-002, 2.42966629e-002],
       [9.50014042e-003, 1.17731815e-002, 1.05050483e-002, ...,
        2.50503329e-002, 2.41786790e-002, 2.42966838e-002]])

In [None]:
hdbscan_probs[1] == max(hdbscan_probs[1])

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False])

In [None]:
n_topics = hdbscan_probs.shape[1]
column_names = [str(i) for i in range(0, n_topics)]

prob_df = pd.DataFrame(hdbscan_probs, columns=column_names)

In [218]:
prob_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,40
0,8.589239e-309,1.503220e-308,1.496004e-308,1.140164e-308,1.578125e-308,1.000000e+00,1.555295e-308,1.372121e-308,1.257603e-308,1.191428e-308,...,2.121962e-308,2.005975e-308,2.166437e-308,2.306150e-308,2.466393e-308,2.378495e-308,2.045697e-308,2.296451e-308,2.075770e-308,2.138725e-308
1,5.192049e-03,7.902621e-03,4.827947e-03,7.544303e-03,5.941698e-03,7.456359e-03,5.053840e-03,7.447437e-03,9.092501e-03,8.065091e-03,...,6.557560e-02,6.684495e-02,7.851850e-02,2.129558e-02,1.602758e-02,2.015936e-02,2.281768e-02,2.630443e-02,2.237013e-02,2.194255e-02
2,8.573229e-309,1.622241e-308,8.354802e-309,1.541071e-308,1.299980e-308,1.543615e-308,9.593716e-309,1.396190e-308,1.345224e-308,1.193671e-308,...,5.450672e-308,6.192047e-308,6.832456e-308,4.869175e-308,3.699464e-308,4.842260e-308,6.150733e-308,7.898612e-308,5.372540e-308,5.177518e-308
3,9.650967e-309,1.202450e-308,8.846196e-309,1.180524e-308,1.209378e-308,1.531821e-308,1.066724e-308,1.430961e-308,1.311128e-308,1.189907e-308,...,4.225774e-308,3.876879e-308,5.461558e-308,8.705192e-308,6.039904e-308,7.412515e-308,5.165401e-308,7.534285e-308,4.773825e-308,5.508687e-308
4,2.742042e-03,4.401532e-03,2.500162e-03,3.829295e-03,3.493929e-03,4.522222e-03,3.430458e-03,4.637355e-03,3.561937e-03,3.239807e-03,...,1.591686e-02,1.692443e-02,3.984786e-02,4.708938e-02,2.656008e-02,4.766212e-02,4.554680e-02,6.400161e-02,1.178481e-01,2.360679e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10565,9.426836e-309,1.469341e-308,8.536621e-309,1.304352e-308,1.083522e-308,1.416576e-308,9.657266e-309,1.331608e-308,1.414652e-308,1.275946e-308,...,1.174922e-307,1.543265e-307,1.000000e+00,5.739901e-308,3.930653e-308,5.341981e-308,6.186378e-308,7.796784e-308,5.957625e-308,5.920302e-308
10566,9.191453e-309,1.194494e-308,8.791769e-309,1.186662e-308,1.265201e-308,1.569297e-308,1.105220e-308,1.543940e-308,1.255348e-308,1.135163e-308,...,3.716828e-308,3.396310e-308,4.747328e-308,8.636894e-308,6.444790e-308,7.729433e-308,5.006954e-308,7.400081e-308,4.696810e-308,5.494090e-308
10567,2.241973e-03,3.642604e-03,2.054128e-03,3.092702e-03,2.642334e-03,3.537484e-03,2.477179e-03,3.363962e-03,3.156756e-03,2.862047e-03,...,2.199654e-02,2.607039e-02,5.655973e-01,1.732922e-02,1.116500e-02,1.642001e-02,1.881175e-02,2.570058e-02,2.110240e-02,2.034547e-02
10568,9.499998e-03,1.177307e-02,1.050479e-02,2.000309e-02,2.322737e-02,1.557614e-02,1.363080e-02,3.596998e-02,2.241062e-02,1.975961e-02,...,2.375694e-02,2.332802e-02,2.382808e-02,2.461636e-02,2.374105e-02,2.434717e-02,2.470985e-02,2.505041e-02,2.417865e-02,2.429666e-02


In [None]:
topic_columns = [col for col in prob_df.columns]

prob_df["dominant_topic"] = prob_df[topic_columns].idxmax(axis=1)

In [None]:
prob_df["topic"] = topics

In [221]:
# BERTopic's topic mapping
topic_mapping = topic_model.topic_mapper_.get_mappings()
print(topic_mapping)

{-1: -1, 0: 14, 1: 18, 2: 21, 3: 22, 4: 25, 5: 40, 6: 28, 7: 26, 8: 34, 9: 29, 10: 6, 11: 38, 12: 24, 13: 23, 14: 37, 15: 35, 16: 32, 17: 27, 18: 13, 19: 31, 20: 19, 21: 2, 22: 17, 23: 7, 24: 12, 25: 11, 26: 20, 27: 33, 28: 0, 29: 15, 30: 3, 31: 10, 32: 39, 33: 1, 34: 30, 35: 8, 36: 36, 37: 4, 38: 9, 39: 16, 40: 5}


In [222]:
prob_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,dominant_topic,topic
0,8.589239e-309,1.503220e-308,1.496004e-308,1.140164e-308,1.578125e-308,1.000000e+00,1.555295e-308,1.372121e-308,1.257603e-308,1.191428e-308,...,2.166437e-308,2.306150e-308,2.466393e-308,2.378495e-308,2.045697e-308,2.296451e-308,2.075770e-308,2.138725e-308,5,40
1,5.192049e-03,7.902621e-03,4.827947e-03,7.544303e-03,5.941698e-03,7.456359e-03,5.053840e-03,7.447437e-03,9.092501e-03,8.065091e-03,...,7.851850e-02,2.129558e-02,1.602758e-02,2.015936e-02,2.281768e-02,2.630443e-02,2.237013e-02,2.194255e-02,30,3
2,8.573229e-309,1.622241e-308,8.354802e-309,1.541071e-308,1.299980e-308,1.543615e-308,9.593716e-309,1.396190e-308,1.345224e-308,1.193671e-308,...,6.832456e-308,4.869175e-308,3.699464e-308,4.842260e-308,6.150733e-308,7.898612e-308,5.372540e-308,5.177518e-308,21,2
3,9.650967e-309,1.202450e-308,8.846196e-309,1.180524e-308,1.209378e-308,1.531821e-308,1.066724e-308,1.430961e-308,1.311128e-308,1.189907e-308,...,5.461558e-308,8.705192e-308,6.039904e-308,7.412515e-308,5.165401e-308,7.534285e-308,4.773825e-308,5.508687e-308,28,0
4,2.742042e-03,4.401532e-03,2.500162e-03,3.829295e-03,3.493929e-03,4.522222e-03,3.430458e-03,4.637355e-03,3.561937e-03,3.239807e-03,...,3.984786e-02,4.708938e-02,2.656008e-02,4.766212e-02,4.554680e-02,6.400161e-02,1.178481e-01,2.360679e-01,40,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10565,9.426836e-309,1.469341e-308,8.536621e-309,1.304352e-308,1.083522e-308,1.416576e-308,9.657266e-309,1.331608e-308,1.414652e-308,1.275946e-308,...,1.000000e+00,5.739901e-308,3.930653e-308,5.341981e-308,6.186378e-308,7.796784e-308,5.957625e-308,5.920302e-308,33,1
10566,9.191453e-309,1.194494e-308,8.791769e-309,1.186662e-308,1.265201e-308,1.569297e-308,1.105220e-308,1.543940e-308,1.255348e-308,1.135163e-308,...,4.747328e-308,8.636894e-308,6.444790e-308,7.729433e-308,5.006954e-308,7.400081e-308,4.696810e-308,5.494090e-308,28,0
10567,2.241973e-03,3.642604e-03,2.054128e-03,3.092702e-03,2.642334e-03,3.537484e-03,2.477179e-03,3.363962e-03,3.156756e-03,2.862047e-03,...,5.655973e-01,1.732922e-02,1.116500e-02,1.642001e-02,1.881175e-02,2.570058e-02,2.110240e-02,2.034547e-02,33,1
10568,9.499998e-03,1.177307e-02,1.050479e-02,2.000309e-02,2.322737e-02,1.557614e-02,1.363080e-02,3.596998e-02,2.241062e-02,1.975961e-02,...,2.382808e-02,2.461636e-02,2.374105e-02,2.434717e-02,2.470985e-02,2.505041e-02,2.417865e-02,2.429666e-02,11,-1


In [None]:
prob_df["topic"] = (
    pd.to_numeric(prob_df["topic"], errors="coerce").fillna(-1).astype(int)
)
prob_df["dominant_topic"] = (
    pd.to_numeric(prob_df["dominant_topic"], errors="coerce").fillna(-1).astype(int)
)
prob_df["mapped_topic"] = prob_df["dominant_topic"].map(topic_mapping)
prob_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,34,35,36,37,38,39,40,dominant_topic,topic,mapped_topic
0,8.589239e-309,1.503220e-308,1.496004e-308,1.140164e-308,1.578125e-308,1.000000e+00,1.555295e-308,1.372121e-308,1.257603e-308,1.191428e-308,...,2.306150e-308,2.466393e-308,2.378495e-308,2.045697e-308,2.296451e-308,2.075770e-308,2.138725e-308,5,40,40
1,5.192049e-03,7.902621e-03,4.827947e-03,7.544303e-03,5.941698e-03,7.456359e-03,5.053840e-03,7.447437e-03,9.092501e-03,8.065091e-03,...,2.129558e-02,1.602758e-02,2.015936e-02,2.281768e-02,2.630443e-02,2.237013e-02,2.194255e-02,30,3,3
2,8.573229e-309,1.622241e-308,8.354802e-309,1.541071e-308,1.299980e-308,1.543615e-308,9.593716e-309,1.396190e-308,1.345224e-308,1.193671e-308,...,4.869175e-308,3.699464e-308,4.842260e-308,6.150733e-308,7.898612e-308,5.372540e-308,5.177518e-308,21,2,2
3,9.650967e-309,1.202450e-308,8.846196e-309,1.180524e-308,1.209378e-308,1.531821e-308,1.066724e-308,1.430961e-308,1.311128e-308,1.189907e-308,...,8.705192e-308,6.039904e-308,7.412515e-308,5.165401e-308,7.534285e-308,4.773825e-308,5.508687e-308,28,0,0
4,2.742042e-03,4.401532e-03,2.500162e-03,3.829295e-03,3.493929e-03,4.522222e-03,3.430458e-03,4.637355e-03,3.561937e-03,3.239807e-03,...,4.708938e-02,2.656008e-02,4.766212e-02,4.554680e-02,6.400161e-02,1.178481e-01,2.360679e-01,40,-1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10565,9.426836e-309,1.469341e-308,8.536621e-309,1.304352e-308,1.083522e-308,1.416576e-308,9.657266e-309,1.331608e-308,1.414652e-308,1.275946e-308,...,5.739901e-308,3.930653e-308,5.341981e-308,6.186378e-308,7.796784e-308,5.957625e-308,5.920302e-308,33,1,1
10566,9.191453e-309,1.194494e-308,8.791769e-309,1.186662e-308,1.265201e-308,1.569297e-308,1.105220e-308,1.543940e-308,1.255348e-308,1.135163e-308,...,8.636894e-308,6.444790e-308,7.729433e-308,5.006954e-308,7.400081e-308,4.696810e-308,5.494090e-308,28,0,0
10567,2.241973e-03,3.642604e-03,2.054128e-03,3.092702e-03,2.642334e-03,3.537484e-03,2.477179e-03,3.363962e-03,3.156756e-03,2.862047e-03,...,1.732922e-02,1.116500e-02,1.642001e-02,1.881175e-02,2.570058e-02,2.110240e-02,2.034547e-02,33,1,1
10568,9.499998e-03,1.177307e-02,1.050479e-02,2.000309e-02,2.322737e-02,1.557614e-02,1.363080e-02,3.596998e-02,2.241062e-02,1.975961e-02,...,2.461636e-02,2.374105e-02,2.434717e-02,2.470985e-02,2.505041e-02,2.417865e-02,2.429666e-02,11,-1,38


In [None]:
prob_df[prob_df["topic"] == -1]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,35,36,37,38,39,40,dominant_topic,topic,mapped_topic,matching
4,0.002742,0.004402,0.002500,0.003829,0.003494,0.004522,0.003430,0.004637,0.003562,0.003240,...,0.026560,0.047662,0.045547,0.064002,0.117848,0.236068,40,-1,5,False
20,0.010883,0.011366,0.010805,0.011377,0.019296,0.025356,0.017355,0.014322,0.012985,0.012303,...,0.034601,0.031145,0.025710,0.027440,0.023928,0.025419,25,-1,11,False
28,0.003521,0.004547,0.003293,0.004658,0.004973,0.005885,0.004101,0.005736,0.004843,0.004368,...,0.029489,0.034507,0.025419,0.035284,0.022269,0.025829,28,-1,0,False
30,0.002912,0.004017,0.002586,0.003626,0.003709,0.004853,0.003645,0.004477,0.003509,0.003215,...,0.054931,0.139131,0.029597,0.050231,0.026753,0.036416,34,-1,30,False
31,0.005959,0.006973,0.005301,0.007297,0.006672,0.008030,0.005595,0.008003,0.009658,0.008660,...,0.025849,0.030449,0.028603,0.033525,0.026811,0.028496,28,-1,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10555,0.006240,0.006974,0.005499,0.007036,0.006739,0.008472,0.005855,0.007896,0.009265,0.008427,...,0.027376,0.031411,0.027636,0.033324,0.025969,0.027945,28,-1,0,False
10556,0.006997,0.007384,0.006073,0.008466,0.008272,0.008868,0.007143,0.011226,0.010662,0.009598,...,0.027564,0.030556,0.027930,0.031007,0.026756,0.028458,28,-1,0,False
10559,0.008820,0.015125,0.014672,0.014116,0.013862,0.020284,0.015976,0.032942,0.019546,0.017828,...,0.025668,0.026882,0.024995,0.027248,0.028326,0.027784,17,-1,27,False
10568,0.009500,0.011773,0.010505,0.020003,0.023227,0.015576,0.013631,0.035970,0.022411,0.019760,...,0.023741,0.024347,0.024710,0.025050,0.024179,0.024297,11,-1,38,False


In [None]:
prob_df["matching"] = prob_df["topic"] == prob_df["mapped_topic"]
prob_df["matching"].sum()

7621

In [None]:
7621 / prob_df.shape[0]

0.7210028382213812

In [227]:
2818 + 7621 == prob_df.shape[0]

False

In [None]:
prob_df[prob_df["matching"] == False]["topic"].value_counts()

topic
-1     2818
 7       41
 3       39
 1       19
 10      14
 5        7
 6        5
 12       2
 16       1
 8        1
 31       1
 17       1
Name: count, dtype: int64

In [212]:
prob_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,36,37,38,39,40,dominant_topic,topic,topic_int,dominant_topic_int,mapped_topic
0,8.589239e-309,1.503220e-308,1.496004e-308,1.140164e-308,1.578125e-308,1.000000e+00,1.555295e-308,1.372121e-308,1.257603e-308,1.191428e-308,...,2.378495e-308,2.045697e-308,2.296451e-308,2.075770e-308,2.138725e-308,5,40,40,5,40
1,5.192049e-03,7.902621e-03,4.827947e-03,7.544303e-03,5.941698e-03,7.456359e-03,5.053840e-03,7.447437e-03,9.092501e-03,8.065091e-03,...,2.015936e-02,2.281768e-02,2.630443e-02,2.237013e-02,2.194255e-02,30,3,3,30,3
2,8.573229e-309,1.622241e-308,8.354802e-309,1.541071e-308,1.299980e-308,1.543615e-308,9.593716e-309,1.396190e-308,1.345224e-308,1.193671e-308,...,4.842260e-308,6.150733e-308,7.898612e-308,5.372540e-308,5.177518e-308,21,2,2,21,2
3,9.650967e-309,1.202450e-308,8.846196e-309,1.180524e-308,1.209378e-308,1.531821e-308,1.066724e-308,1.430961e-308,1.311128e-308,1.189907e-308,...,7.412515e-308,5.165401e-308,7.534285e-308,4.773825e-308,5.508687e-308,28,0,0,28,0
4,2.742042e-03,4.401532e-03,2.500162e-03,3.829295e-03,3.493929e-03,4.522222e-03,3.430458e-03,4.637355e-03,3.561937e-03,3.239807e-03,...,4.766212e-02,4.554680e-02,6.400161e-02,1.178481e-01,2.360679e-01,40,-1,-1,40,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10565,9.426836e-309,1.469341e-308,8.536621e-309,1.304352e-308,1.083522e-308,1.416576e-308,9.657266e-309,1.331608e-308,1.414652e-308,1.275946e-308,...,5.341981e-308,6.186378e-308,7.796784e-308,5.957625e-308,5.920302e-308,33,1,1,33,1
10566,9.191453e-309,1.194494e-308,8.791769e-309,1.186662e-308,1.265201e-308,1.569297e-308,1.105220e-308,1.543940e-308,1.255348e-308,1.135163e-308,...,7.729433e-308,5.006954e-308,7.400081e-308,4.696810e-308,5.494090e-308,28,0,0,28,0
10567,2.241973e-03,3.642604e-03,2.054128e-03,3.092702e-03,2.642334e-03,3.537484e-03,2.477179e-03,3.363962e-03,3.156756e-03,2.862047e-03,...,1.642001e-02,1.881175e-02,2.570058e-02,2.110240e-02,2.034547e-02,33,1,1,33,1
10568,9.499998e-03,1.177307e-02,1.050479e-02,2.000309e-02,2.322737e-02,1.557614e-02,1.363080e-02,3.596998e-02,2.241062e-02,1.975961e-02,...,2.434717e-02,2.470985e-02,2.505041e-02,2.417865e-02,2.429666e-02,11,-1,-1,11,38


In [236]:
# Step 3: Save topics back to dataframe
gold_df_filtered = gold_df.loc[gold_df["News"].notna()].copy()

In [None]:
gold_df_filtered = pd.concat([gold_df_filtered, prob_df], axis=1)

In [238]:
# Step 4: View top 10 topics
print(topic_model.get_topic_info().head(10))

   Topic  Count                                 Name  \
0     -1   2818    -1_ends_cents_gold ends_spot gold   
1      0   1576          0_global_demand_cues_buying   
2      1    593                1_low_month_near_week   
3      2    571        2_end_week gold_session_close   
4      3    382          3_weekly_week gold_ends_day   
5      4    341  4_data gold_report_data_report gold   
6      5    241            5_fed_rate_rate hike_hike   
7      6    239       6_trades_marginally_sell_slips   
8      7    223                       7_q_vs_cut_tax   
9      8    223           8_china_focus_slightly_fed   

                                      Representation  \
0  [ends, cents, gold ends, spot gold, spot, stoc...   
1  [global, demand, cues, buying, commodities, bu...   
2  [low, month, near, week, hits, low gold, lows,...   
3  [end, week gold, session, close, session gold,...   
4  [weekly, week gold, ends, day, week, gold ends...   
5  [data gold, report, data, report gold, jobs,

In [None]:
sample_topic = 0  # change this to see different clusters
print(f"\n--- Sample Headlines from Topic {sample_topic} ---")
print(gold_df_filtered[gold_df_filtered["Topic"] == sample_topic]["News"].head(5))

# Step 6: Visualize
topic_model.visualize_topics().show()

In [None]:
topic_model.visualize_barchart(top_n_topics=10).show()

In [None]:
topic_model.visualize_hierarchy().show()

In [None]:
topic_model.visualize_heatmap()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import itertools

In [None]:
# Step 1: Get topic embeddings and IDs
topic_ids = topic_model.get_topic_info().Topic.tolist()
# Filter out -1 (outliers)
topic_ids = [t for t in topic_ids if t != -1]

In [None]:
embeddings = topic_model.topic_embeddings_
# Only keep embeddings for the selected topic IDs
topic_idx_map = {i: topic_ids.index(i) for i in topic_ids}
filtered_embeddings = np.array([embeddings[i] for i in topic_ids])

In [None]:
# Step 2: Compute cosine similarity matrix
cosine_sim = cosine_similarity(filtered_embeddings)

# Step 3: Find topic pairs with high similarity (excluding diagonal)
threshold = 0.85
highly_similar_pairs = []
for i, j in itertools.combinations(range(len(topic_ids)), 2):
    if cosine_sim[i, j] >= threshold:
        highly_similar_pairs.append((topic_ids[i], topic_ids[j]))

In [None]:
len(highly_similar_pairs)

In [None]:
highly_similar_pairs

In [None]:
len(topic_ids)

In [None]:
unique_topics = gold_df_filtered.Topic.unique()

In [None]:
gold_df_filtered

In [None]:
# Step 1: Build Union-Find to track connected components
class UnionFind:
    def __init__(self):
        self.parent = {}

    def find(self, x):
        if x != self.parent.setdefault(x, x):
            self.parent[x] = self.find(self.parent[x])
        return self.parent[x]

    def union(self, x, y):
        self.parent[self.find(y)] = self.find(x)


uf = UnionFind()
for a, b in highly_similar_pairs:
    uf.union(min(a, b), max(a, b))  # Always union to the smaller ID

# Step 2: Build final topic mapping to lowest ID in each group
# Also apply to all unique topics (including those not in pairs)
final_mapping = {}
for topic in unique_topics:
    if topic == -1:
        final_mapping[topic] = -1
    else:
        final_mapping[topic] = uf.find(topic)

# Step 3: Apply the mapping to the dataframe
gold_df_filtered["Merged_Topic"] = gold_df_filtered["Topic"].map(final_mapping)

In [None]:
# remove the noise data
noise_data = gold_df_filtered[gold_df_filtered.Merged_Topic == -1]

gold_df_filtered = gold_df_filtered[gold_df_filtered.Merged_Topic != -1]

In [None]:
noise_data.shape

In [None]:
gold_df_filtered.shape

In [None]:
len(gold_df_filtered["Merged_Topic"].unique())

In [None]:
gold_df_filtered["Merged_Topic"].value_counts()

In [None]:
gold_df_filtered[gold_df_filtered.Merged_Topic == 39]["News"]
# seems to be downward price movement

In [None]:
gold_df_filtered[gold_df_filtered.Merged_Topic == 38]["News"]
# macroeconomic events

In [None]:
gold_df_filtered[gold_df_filtered.Merged_Topic == 13]["News"].head(10)
# bullish movement

In [None]:
gold_df_filtered[gold_df_filtered.Merged_Topic == 28]["News"].head(10)
# broader market conditions