In [None]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from bertopic import BERTopic
from hdbscan import HDBSCAN, all_points_membership_vectors
from keybert import KeyBERT
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import fcluster
from scipy.spatial.distance import squareform
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics.pairwise import cosine_similarity
from umap import UMAP

## Load & Preprocess News

In [None]:
# Read the raw headline table; rows without a "News" value are skipped when
# building the plain-string list that is fed to the topic model.
gold_df = pd.read_csv("gold-dataset-sinha-khandait.csv")
news_column = gold_df["News"].dropna()
headlines = news_column.astype(str).tolist()

In [None]:
# Regex fragments for tokens that are stripped from every headline before
# embedding: month names, price-direction words, numeric values and
# currency/unit markers.
months = r"\b(january|february|march|april|may|june|july|august|september|october|november|december|jan|feb|mar|apr|jun|jul|aug|sep|oct|nov|dec)\b"
directions = r"\b(up|down|higher|lower|rise|rises|fall|falls|gain|gains|loses|loss|rebound|slip|climb|surge|drop|drops|edged|edges|recover|recovery|recovers|flat)\b"
numbers = r"[\d\.,]+[%$]?|\d{1,3}(,\d{3})*(\.\d+)?|\d+"
# The alphabetic markers are anchored on word boundaries so they are only removed
# as standalone tokens; without the anchors "rs" was cut out of words such as
# "investors" and "oz" out of "dozen".
symbols = r"/oz\b|\b(?:rs|bn|usd|oz)\b|\$|%"


def clean_headline(text):
    """Return a lower-cased headline with dates, directions, numbers and units removed.

    Parameters
    ----------
    text : str
        Raw headline.

    Returns
    -------
    str
        The headline after removing every match of `months`, `directions`,
        `numbers` and `symbols` (in that order), dropping remaining
        punctuation and collapsing runs of whitespace.
    """
    text = text.lower()
    for pattern in (months, directions, numbers, symbols):
        text = re.sub(pattern, "", text)
    text = re.sub(r"[^\w\s]", "", text)  # drop leftover punctuation
    return re.sub(r"\s+", " ", text).strip()


cleaned_headlines = [clean_headline(headline) for headline in headlines]

## Set Models: Embedding, Vectorizer, UMAP, HDBSCAN

In [None]:
# Sentence-embedding backbone, loaded by model name.
EMBEDDING_MODEL_NAME = "all-mpnet-base-v2"
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [None]:
# Term-extraction settings: English stop words removed, uni- to tri-grams,
# tokens may contain hyphens, vocabulary bounded by document frequency
# (min 10, max 50%) and capped at 5000 features.
vectorizer_options = {
    "stop_words": "english",
    "ngram_range": (1, 3),
    "min_df": 10,
    "max_df": 0.5,
    "max_features": 5000,
    "token_pattern": r"(?u)\b[\w\-]+\b",
}
vectorizer = CountVectorizer(**vectorizer_options)

In [None]:
# Dimensionality reduction to 5 components using cosine distance; the random
# state is pinned so repeated runs use the same seed.
umap_settings = dict(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    random_state=42,
)
umap_model = UMAP(**umap_settings)

In [None]:
# Density-based clustering configuration.
# prediction_data=True is presumably what allows the later
# all_points_membership_vectors call on this model -- confirm against hdbscan docs.
hdbscan_settings = dict(
    min_cluster_size=60,
    min_samples=10,
    cluster_selection_epsilon=0.1,
    prediction_data=True,
)
hdbscan_model = HDBSCAN(**hdbscan_settings)

## Fit BERTopic

In [None]:
# Assemble the BERTopic pipeline from the components configured above.
# The already-loaded `embedding_model` instance is passed in (rather than the
# model name) so the sentence-transformer is not loaded a second time.
topic_model = BERTopic(
    embedding_model=embedding_model,
    hdbscan_model=hdbscan_model,
    umap_model=umap_model,
    vectorizer_model=vectorizer,
    verbose=True,
)

# `topics` holds one topic id per cleaned headline, in input order.
topics, probs = topic_model.fit_transform(cleaned_headlines)

## Soft Cluster Probabilities + DataFrame

In [None]:
# Soft membership of every training point in every HDBSCAN cluster.
prob_matrix = np.array(all_points_membership_vectors(topic_model.hdbscan_model))

# Rescale each row so that the memberships of one headline sum to 1.
row_totals = prob_matrix.sum(axis=1, keepdims=True)
normalized_prob = prob_matrix / row_totals

# One column per cluster (named "0", "1", ...), then the strongest cluster per
# row and the topic id returned by fit_transform.
cluster_columns = [str(idx) for idx in range(normalized_prob.shape[1])]
prob_df = pd.DataFrame(normalized_prob, columns=cluster_columns)
prob_df = prob_df.assign(dominant_topic=prob_df.idxmax(axis=1), topic=topics)

## Map BERTopic's Internal Topic ID

In [None]:
# Lookup table exposed by BERTopic's topic mapper.
topic_mapping = topic_model.topic_mapper_.get_mappings()

# Force both label columns to plain integers; anything that cannot be parsed
# becomes -1.
for label_column in ("topic", "dominant_topic"):
    prob_df[label_column] = (
        pd.to_numeric(prob_df[label_column], errors="coerce").fillna(-1).astype(int)
    )

# Translate the strongest cluster of each row through the mapper.
prob_df["mapped_topic"] = prob_df["dominant_topic"].map(topic_mapping)

## Agglomerative Merging of Similar Topics

In [None]:
# NOTE: this cell is disabled. It merges topics into 15 groups with
# AgglomerativeClustering on a precomputed cosine-distance matrix and is kept
# for reference only.
# NOTE(review): davies_bouldin_score below receives the distance matrix in the
# position of the feature matrix -- confirm that is intended before re-enabling.
# topic_embeddings = topic_model.topic_embeddings_
# similarity_matrix = cosine_similarity(topic_embeddings)
# distance_matrix = 1 - similarity_matrix

# agg_cluster = AgglomerativeClustering(
#     n_clusters=15, metric="precomputed", linkage="average"
# )
# topic_groups = agg_cluster.fit_predict(distance_matrix)
# print("Davies-Bouldin Score:", davies_bouldin_score(distance_matrix, topic_groups))
# topic_group_map = pd.DataFrame(
#     {"Original_Topic": range(len(topic_groups)), "New_Group": topic_groups}
# )

# prob_df["merged_group"] = prob_df["mapped_topic"].map(
#     topic_group_map.set_index("Original_Topic")["New_Group"]
# )

## Merge Metadata Back to News Data

In [None]:
# NOTE: this cell is disabled. It would keep only rows with a non-null "News"
# value and attach the columns of prob_df next to them.
# NOTE(review): the filtered frame keeps its original index while prob_df is
# reset to 0..n-1, so the column-wise concat may misalign if any rows were
# dropped -- reset the filtered frame's index too before re-enabling.
# gold_df_filtered = gold_df.loc[gold_df["News"].notna()].copy()
# gold_df_filtered = pd.concat([gold_df_filtered, prob_df.reset_index(drop=True)], axis=1)

## Visualize Topics

In [None]:
# Render the built-in BERTopic figures one after another; each figure is built
# and shown before the next one is requested.
figure_builders = (
    (topic_model.visualize_topics, {}),
    (topic_model.visualize_barchart, {"top_n_topics": 10}),
    (topic_model.visualize_hierarchy, {}),
)
for build_figure, options in figure_builders:
    build_figure(**options).show()

# The heatmap is left as the cell's last expression so it renders as the cell output.
topic_model.visualize_heatmap()

## Distance-Based Cluster Merging

In [None]:
# BERTopic stores one embedding row per topic in sorted topic-id order, so when
# outliers exist the first row belongs to the outlier topic (-1). That row is
# dropped here so that row i of `embeddings` (and of every matrix derived from
# it) describes topic i, which is what the positional topic numbering used
# further down relies on.
topic_ids = sorted(set(topic_model.topics_))
all_topic_embeddings = np.asarray(topic_model.topic_embeddings_)
has_outlier_row = (
    bool(topic_ids)
    and topic_ids[0] == -1
    # Only drop the row when the matrix really has one row per topic id.
    and len(all_topic_embeddings) == len(topic_ids)
)
embeddings = all_topic_embeddings[1:] if has_outlier_row else all_topic_embeddings

# Pairwise cosine similarity between topics, converted to a distance.
similarity = cosine_similarity(embeddings)
distance = 1 - similarity  # for clustering

In [None]:
# `distance` is a square topic-by-topic matrix. SciPy's `linkage` interprets a
# 2-D array as raw observation vectors, not as distances, so the matrix is
# converted to the condensed (upper-triangle) form it expects.
# Values are clipped at 0 because 1 - cosine similarity can come out as a tiny
# negative number through floating-point error; checks=False skips the exact
# zero-diagonal/symmetry validation for the same reason.
condensed_distance = squareform(
    np.clip(np.asarray(distance, dtype=float), 0.0, None), checks=False
)

Z = linkage(condensed_distance, method="average")  # average linkage on cosine distance

In [None]:
# Cut the linkage tree so that at most MAX_MACRO_GROUPS flat clusters remain.
MAX_MACRO_GROUPS = 20
macro_labels = fcluster(Z, criterion="maxclust", t=MAX_MACRO_GROUPS)

In [None]:
# Disabled alternative: cut the tree at a distance threshold of 0.2 instead of
# requesting a fixed maximum number of clusters.
# macro_labels = fcluster(Z, t=0.2, criterion="distance")

In [None]:
# Lookup table: the i-th macro label is recorded against topic number i.
# NOTE(review): this assumes the i-th clustered embedding row belongs to topic
# id i -- confirm against how the embedding matrix was built.
n_clustered_topics = len(macro_labels)
topic_group_map = pd.DataFrame(
    {
        "Original_Topic": np.arange(n_clustered_topics),
        "Macro_Group": macro_labels,
    }
)

In [None]:
# Inspect the topic -> macro-group lookup table.
topic_group_map

In [None]:
# How many topics were merged into each macro group.
ax = topic_group_map["Macro_Group"].value_counts().plot(kind="bar")
# Label the figure so it can be read on its own; the trailing semicolon keeps
# the return value of `set` out of the cell output.
ax.set(
    title="Topics per macro group",
    xlabel="Macro group",
    ylabel="Number of topics",
);

In [None]:
# Attach the macro group of each headline's topic to the source table.
# `topics` only covers rows whose "News" value is present (missing rows were
# dropped before modelling), so the labels are aligned on those rows' index
# instead of being assigned positionally, which fails with a length mismatch
# as soon as one headline is missing.
has_news = gold_df["News"].notna()
topic_to_macro = topic_group_map.set_index("Original_Topic")["Macro_Group"]

gold_df["Macro_Group"] = (
    pd.Series(topics, index=gold_df.index[has_news])
    .map(topic_to_macro)  # vectorised lookup instead of one table scan per headline
    .fillna(-1)  # topics without a macro group (e.g. outliers) -> -1
    .astype(int)
    .reindex(gold_df.index, fill_value=-1)  # rows without a headline -> -1
)

In [None]:
# Inspect the headline table, which now carries the Macro_Group column.
gold_df

In [None]:
# Align topic-to-group mapping with the shape of the probability matrix
aligned_macro_group_map = topic_group_map.loc[
    topic_group_map["Original_Topic"] < prob_matrix.shape[1]
]

# Compute macro group probabilities (via matrix multiplication)
macro_group_prob = (
    prob_matrix @ pd.get_dummies(aligned_macro_group_map["Macro_Group"]).values
)

# Store as list in a single column
prob_df["all_probabilities"] = macro_group_prob.tolist()

In [None]:
# Small epsilon to avoid log(0)
epsilon = 1e-12

# Convert the macro-group memberships to log-space.
log_macro_group_prob = np.log(macro_group_prob + epsilon)

# Overwrite the linear-space lists stored earlier with their log-space
# counterparts (one list per headline).
prob_df["all_probabilities"] = log_macro_group_prob.tolist()

In [None]:
# Inspect the per-headline probability table; "all_probabilities" holds the
# log-space macro-group values at this point.
prob_df

In [None]:
# `prob_df` and `topics` only cover rows whose "News" value is present, so both
# are aligned on the index of those rows. Plain positional/index assignment
# would shift or reject the values as soon as one headline is missing.
has_news = gold_df["News"].notna()
news_index = gold_df.index[has_news]

# Add the probability vector as a new column (rows without a headline get NaN)
gold_df["Probabilities"] = pd.Series(
    prob_df["all_probabilities"].to_numpy(), index=news_index
)

# Add the Macro_Group column (-1 for topics without a group and for rows
# without a headline)
topic_to_macro = topic_group_map.set_index("Original_Topic")["Macro_Group"]
gold_df["Macro_Group"] = (
    pd.Series(topics, index=news_index)
    .map(topic_to_macro)
    .fillna(-1)
    .astype(int)
    .reindex(gold_df.index, fill_value=-1)
)

In [None]:
# Inspect the headline table with the Probabilities and Macro_Group columns.
gold_df

In [None]:
# How many headlines fall into each macro group (-1 = no group assigned).
ax = gold_df["Macro_Group"].value_counts().plot(kind="bar")
# Label the figure so it can be read on its own; the trailing semicolon keeps
# the return value of `set` out of the cell output.
ax.set(
    title="Headlines per macro group",
    xlabel="Macro group",
    ylabel="Number of headlines",
);

## Test on a New Headline

In [None]:
test_sentence = "Gold falls down 2 perc as silver rises"

# Apply the same cleaning steps, in the same order, as for the training headlines.
cleaned = test_sentence.lower()
for pattern in (months, directions, numbers, symbols, r"[^\w\s]"):
    cleaned = re.sub(pattern, "", cleaned)
cleaned = re.sub(r"\s+", " ", cleaned).strip()

topic, prob = topic_model.transform([cleaned])

# Look up the macro group of the predicted topic. A topic that is not in the
# lookup table (for example the outlier topic -1) yields -1 instead of raising
# an IndexError on an empty selection.
group_matches = topic_group_map.loc[
    topic_group_map["Original_Topic"] == topic[0], "Macro_Group"
]
merged_group = group_matches.iloc[0] if len(group_matches) else -1

In [None]:
# Topic id(s) predicted for the test headline.
topic

In [None]:
# Macro group looked up for the predicted topic.
merged_group