In [None]:
import pandas as pd
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.backend import BaseEmbedder
from sentence_transformers import SentenceTransformer

from umap import UMAP
from hdbscan import HDBSCAN
from keybert import KeyBERT
import hdbscan


# Step 1: Read the gold-news CSV and keep every non-null "News" entry as a string.
gold_df = pd.read_csv("gold-dataset-sinha-khandait.csv")
news_column = gold_df["News"]
headlines = news_column[news_column.notna()].astype(str).tolist()

In [None]:
# Preview the first few headlines rather than echoing the entire list into the output.
headlines[:10]

In [None]:
import re
import numpy as np

# Lower-case once up front; every pattern below is written in lower case.
headlines_lower = [h.lower() for h in headlines]

# Vocabulary removed before topic modelling (presumably so clusters form around
# subject matter rather than dates, price direction and amounts).
months = r"\b(january|february|march|april|may|june|july|august|september|october|november|december|jan|feb|mar|apr|jun|jul|aug|sep|oct|nov|dec)\b"
directions = r"\b(up|down|higher|lower|rise|rises|fall|falls|gain|gains|loses|loss|rebound|slip|climb|surge|drop|drops|edged|edges|recover|recovery|recovers|flat)\b"
numbers = r"[\d\.,]+[%$]?|\d{1,3}(,\d{3})*(\.\d+)?|\d+"
# Currency/unit markers. The alphabetic ones are anchored on word boundaries;
# without the anchors "rs" is cut out of words such as "traders" (-> "trade")
# and "oz" out of "dozen".
symbols = r"\/oz\b|\b(?:rs|bn|usd|oz)\b|\$|%"


def clean_headline(text):
    """Strip months, direction words, numbers, unit markers and punctuation.

    Args:
        text: A single headline (any casing).

    Returns:
        The lower-cased headline with the noise vocabulary removed and runs of
        whitespace collapsed to single spaces.
    """
    cleaned = text.lower()
    # Order matters: numbers are removed before symbols so that "500bn" leaves
    # a bare "bn" for the word-boundary-anchored symbol pattern to catch.
    for pattern in (months, directions, numbers, symbols):
        cleaned = re.sub(pattern, "", cleaned)
    cleaned = re.sub(r"[^\w\s]", "", cleaned)  # remove punctuation
    return re.sub(r"\s+", " ", cleaned).strip()  # collapse whitespace


cleaned_headlines = [clean_headline(h) for h in headlines_lower]

In [None]:
embedding_model = SentenceTransformer("all-mpnet-base-v2")

In [None]:
# Bag-of-words model handed to BERTopic as its vectorizer_model.
# NOTE(review): BERTopic appears to fit this vectorizer on one concatenated
# document per topic (c-TF-IDF), so min_df / max_df would be counted in topics
# rather than in headlines — confirm against the BERTopic docs before tuning.
vectorizer = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 3),  # unigrams up to trigrams
    min_df=10,  # drop terms whose document frequency is below 10
    max_df=0.5,  # drop terms present in more than 50% of the documents
    max_features=5_000,
    # The token pattern allows hyphens inside a token. The cleaning loop removes
    # every non-word, non-space character first, so hyphens never actually
    # reach this vectorizer.
    token_pattern=r"(?u)\b[\w\-]+\b",
)

In [None]:
# Clustering model passed to BERTopic as hdbscan_model.
hdbscan_model = HDBSCAN(
    min_cluster_size=60,  # smallest group accepted as a cluster; try values between 30-100
    min_samples=10,  # avoids micro-clusters (rule of thumb: 10-30% of min_cluster_size)
    cluster_selection_epsilon=0.1,  # merges clusters that lie closer than this distance
    prediction_data=True,  # needed for the soft-clustering membership vectors computed later
)
from umap import UMAP

# Dimensionality-reduction model passed to BERTopic as umap_model.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,  # size of the reduced space
    min_dist=0.0,
    metric="cosine",
    random_state=42,  # fixed seed so repeated runs produce the same projection
)

# Assemble the BERTopic pipeline from the components configured earlier.
# The already-loaded SentenceTransformer instance is passed in directly, so the
# "all-mpnet-base-v2" weights are not loaded a second time from the model name.
# Reproducibility comes from random_state on the UMAP model; no seed is passed
# to BERTopic itself.
topic_model = BERTopic(
    embedding_model=embedding_model,
    hdbscan_model=hdbscan_model,
    umap_model=umap_model,
    vectorizer_model=vectorizer,
    verbose=True,
)

In [None]:
topics, probs = topic_model.fit_transform(cleaned_headlines)

In [None]:
topic_model.hdbscan_model.probabilities_

In [None]:
(probs == topic_model.hdbscan_model.probabilities_).all()

In [None]:
prob_matrix = np.array(hdbscan.all_points_membership_vectors(topic_model.hdbscan_model))

In [None]:
prob_matrix

In [None]:
normalized_prob = prob_matrix / prob_matrix.sum(axis=1, keepdims=True)

In [None]:
normalized_prob[1]

In [None]:
def get_hdbscan_probabilities(topic_model, documents):
    """Return row-normalised soft-cluster memberships for the fitted documents.

    The memberships come from ``hdbscan.all_points_membership_vectors``, which
    is called on the fitted clusterer only, so the result always describes the
    points the model was trained on.

    Args:
        topic_model: Fitted model exposing the HDBSCAN clusterer as
            ``hdbscan_model``.
        documents: Kept for backward compatibility; not used. The membership
            vectors do not depend on it, so the documents are no longer
            re-embedded just to be thrown away.

    Returns:
        A 2-D float array with one row per training point and one column per
        cluster. Each row sums to 1, except rows with no membership in any
        cluster, which are returned as all zeros instead of NaN.

    Raises:
        ValueError: If the clusterer has no ``prediction_data_`` attribute.
    """
    clusterer = topic_model.hdbscan_model
    if not hasattr(clusterer, "prediction_data_"):
        raise ValueError("HDBSCAN needs to be initialized with prediction_data=True")

    membership = np.asarray(
        hdbscan.all_points_membership_vectors(clusterer), dtype=float
    )

    # Divide only where the row total is positive; a plain division would turn
    # all-zero rows into NaN and poison the idxmax taken on this matrix later.
    row_totals = membership.sum(axis=1, keepdims=True)
    return np.divide(
        membership,
        row_totals,
        out=np.zeros_like(membership),
        where=row_totals > 0,
    )


# Usage:
hdbscan_probs = get_hdbscan_probabilities(topic_model, cleaned_headlines)

In [None]:
hdbscan_probs

In [None]:
hdbscan_probs[1] == max(hdbscan_probs[1])

In [None]:
# One column per soft-cluster index, labelled "0", "1", ... as strings.
n_topics = hdbscan_probs.shape[1]
column_names = list(map(str, range(n_topics)))

prob_df = pd.DataFrame(data=hdbscan_probs, columns=column_names)

In [None]:
prob_df

In [None]:
# For every document, pick the membership column with the highest probability.
# Only the original probability columns are searched (not every column that
# happens to be in prob_df), so re-running this cell after "dominant_topic",
# "topic" or other derived columns were added still gives the same result.
topic_columns = list(column_names)

prob_df["dominant_topic"] = prob_df[topic_columns].idxmax(axis=1)

In [None]:
prob_df["topic"] = topics

In [None]:
# BERTopic's topic mapping
topic_mapping = topic_model.topic_mapper_.get_mappings()
print(topic_mapping)

In [None]:
prob_df

In [None]:
# Coerce both label columns to plain integers; anything non-numeric becomes -1.
for label_col in ("topic", "dominant_topic"):
    as_number = pd.to_numeric(prob_df[label_col], errors="coerce")
    prob_df[label_col] = as_number.fillna(-1).astype(int)

# Look each dominant_topic up in topic_mapping (BERTopic's topic_mapper_ mappings).
prob_df["mapped_topic"] = prob_df["dominant_topic"].map(topic_mapping)
prob_df

In [None]:
prob_df[prob_df["topic"] == -1]

In [None]:
prob_df["matching"] = prob_df["topic"] == prob_df["mapped_topic"]
prob_df["matching"].sum()

In [None]:
# Share of documents whose assigned topic equals the mapped soft-cluster topic,
# taken from the "matching" column instead of a count copied from an earlier run.
prob_df["matching"].mean()

In [None]:
2818 + 7621 == prob_df.shape[0]

In [None]:
(2818 + 7621) / 10570

In [None]:
# Assigned-topic counts for the documents where "matching" is False.
mismatched_rows = prob_df.loc[~prob_df["matching"]]
mismatched_rows["topic"].value_counts()

In [None]:
prob_df

In [None]:
topic_model.topic_embeddings_

In [None]:
# Get topic embeddings (dimensions: n_topics x embedding_size)
topic_embeddings = topic_model.topic_embeddings_
topic_embeddings.shape

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute similarity matrix (n_topics x n_topics)
similarity_matrix = cosine_similarity(topic_embeddings)
similarity_matrix

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Merge the topics into a fixed number of coarser groups with average-linkage
# agglomerative clustering on pairwise cosine distance.
# NOTE(review): similarity_matrix is built from every row of
# topic_model.topic_embeddings_; if that array contains a row for the outlier
# topic (-1), the outlier topic is grouped as well — confirm this is intended.
n_target_clusters = 10
distance_matrix = 1 - similarity_matrix  # cosine similarity -> cosine distance
agg_cluster = AgglomerativeClustering(
    n_clusters=n_target_clusters,
    metric="precomputed",  # distance_matrix already holds the pairwise distances
    linkage="average",
)
topic_groups = agg_cluster.fit_predict(distance_matrix)

In [None]:
# Map every BERTopic topic id to the merged group it was assigned to.
# topic_groups has one entry per row of topic_model.topic_embeddings_, and those
# rows follow the sorted topic ids, with the outlier topic -1 first when it
# exists. Labelling rows 0..n-1 would therefore shift every group by one topic,
# so the real topic ids are used as labels.
topic_ids = sorted(topic_model.get_topics())
if len(topic_ids) != len(topic_groups):
    raise ValueError(
        f"Expected one group per topic, got {len(topic_groups)} groups "
        f"for {len(topic_ids)} topics"
    )

topic_group_map = pd.DataFrame(
    {"Original_Topic": topic_ids, "New_Group": topic_groups}
)

In [None]:
topic_group_map

In [None]:
topic_groups

In [None]:
# Look up each document's merged group from its mapped topic id.
group_by_topic = topic_group_map.set_index("Original_Topic")["New_Group"]
prob_df["merged_group"] = prob_df["mapped_topic"].map(group_by_topic)

In [None]:
prob_df

In [None]:
# Step 3: Keep only the rows that have a headline, as an independent copy of gold_df.
gold_df_filtered = gold_df.dropna(subset=["News"]).copy()

In [None]:
# Attach the per-document topic columns to the filtered news rows.
# gold_df_filtered keeps gold_df's original row labels while prob_df is numbered
# 0..n-1, and concat(axis=1) aligns on labels. Any row dropped for a missing
# headline would therefore pair headlines with the wrong topics and add NaN
# rows. Both frames are renumbered positionally before joining (this resets
# gold_df_filtered's index to 0..n-1).
if len(gold_df_filtered) != len(prob_df):
    raise ValueError(
        f"Row count mismatch: {len(gold_df_filtered)} headlines vs "
        f"{len(prob_df)} topic rows"
    )

gold_df_filtered = pd.concat(
    [
        # Drop columns left over from a previous run of this cell so that
        # re-running does not create duplicate column names.
        gold_df_filtered.drop(columns=prob_df.columns, errors="ignore").reset_index(
            drop=True
        ),
        prob_df.reset_index(drop=True),
    ],
    axis=1,
)

In [None]:
# Step 4: View the first 10 rows of the topic overview.
# Left as the cell's last expression so the notebook renders the DataFrame as a
# table instead of the truncated plain text that print() produces.
topic_model.get_topic_info().head(10)

In [None]:
sample_topic = 0  # change this to see different clusters
print(f"\n--- Sample Headlines from Topic {sample_topic} ---")
in_sample_topic = gold_df_filtered["topic"] == sample_topic
print(gold_df_filtered.loc[in_sample_topic, "News"].head(5))

# Step 6: Visualize the topics
topic_model.visualize_topics().show()

In [None]:
topic_model.visualize_barchart(top_n_topics=10).show()

In [None]:
topic_model.visualize_hierarchy().show()

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform
import matplotlib.pyplot as plt

# linkage() interprets a 2-D array as raw observation vectors, not as pairwise
# distances, so the square distance matrix is converted to condensed form first.
square_distances = np.clip(np.array(distance_matrix, dtype=float), 0.0, None)
np.fill_diagonal(square_distances, 0.0)  # remove floating-point residue on the diagonal
condensed_distances = squareform(square_distances, checks=False)
Z = linkage(condensed_distances, "average")

# Label leaves with the real topic ids: rows of topic_embeddings_ follow the
# sorted topic ids, with the outlier topic -1 first when present. If the counts
# do not line up, fall back to plain row positions.
leaf_topic_ids = sorted(topic_model.get_topics())
if len(leaf_topic_ids) != len(topic_embeddings):
    leaf_topic_ids = list(range(len(topic_embeddings)))

plt.figure(figsize=(12, 6))
dendrogram(Z, labels=[f"Topic {i}" for i in leaf_topic_ids])
plt.xticks(rotation=90)
plt.title("Hierarchical Clustering Dendrogram")
plt.show()

In [None]:
topic_model.visualize_heatmap()

In [None]:
gold_df_filtered

In [None]:
gold_df_filtered["merged_group"].value_counts()

In [None]:
gold_df_filtered[gold_df_filtered["merged_group"] == 9]["News"][:30]

## Testing: assign a new headline to a topic and merged group

In [None]:
new_sentence = "Gold falls down 2 perc as silver rises"

# Apply the same substitutions as for the training headlines, in the same order.
cleaned_sentence = new_sentence.lower()
for noise_pattern in (months, directions, numbers, symbols, r"[^\w\s]"):
    cleaned_sentence = re.sub(noise_pattern, "", cleaned_sentence)
cleaned_sentence = re.sub(r"\s+", " ", cleaned_sentence).strip()

cleaned_sentence

In [None]:
topic, prob = topic_model.transform([cleaned_sentence])

In [None]:
topic

In [None]:
dominant_topic = topic[0]
dominant_topic

In [None]:
prob

In [None]:
# Translate the predicted topic id into its merged group.
# A topic id with no row in topic_group_map (for example the outlier topic -1)
# yields None instead of an IndexError from indexing an empty result.
group_lookup = topic_group_map.set_index("Original_Topic")["New_Group"]
merged_group = group_lookup.get(dominant_topic)

In [None]:
merged_group

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import itertools

In [None]:
# # Step 1: Get topic embeddings and IDs
# topic_ids = topic_model.get_topic_info().Topic.tolist()
# # Filter out -1 (outliers)
# topic_ids = [t for t in topic_ids if t != -1]

In [None]:
# embeddings = topic_model.topic_embeddings_
# # Only keep embeddings for the selected topic IDs
# topic_idx_map = {i: topic_ids.index(i) for i in topic_ids}
# filtered_embeddings = np.array([embeddings[i] for i in topic_ids])

In [None]:
# Step 2: Compute cosine similarity matrix
# cosine_sim = cosine_similarity(filtered_embeddings)

# # Step 3: Find topic pairs with high similarity (excluding diagonal)
# threshold = 0.85
# highly_similar_pairs = []
# for i, j in itertools.combinations(range(len(topic_ids)), 2):
#     if cosine_sim[i, j] >= threshold:
#         highly_similar_pairs.append((topic_ids[i], topic_ids[j]))

In [None]:
# unique_topics = gold_df_filtered.Topic.unique()

In [None]:
# # Step 1: Build Union-Find to track connected components
# class UnionFind:
#     def __init__(self):
#         self.parent = {}

#     def find(self, x):
#         if x != self.parent.setdefault(x, x):
#             self.parent[x] = self.find(self.parent[x])
#         return self.parent[x]

#     def union(self, x, y):
#         self.parent[self.find(y)] = self.find(x)


# uf = UnionFind()
# for a, b in highly_similar_pairs:
#     uf.union(min(a, b), max(a, b))  # Always union to the smaller ID

# # Step 2: Build final topic mapping to lowest ID in each group
# # Also apply to all unique topics (including those not in pairs)
# final_mapping = {}
# for topic in unique_topics:
#     if topic == -1:
#         final_mapping[topic] = -1
#     else:
#         final_mapping[topic] = uf.find(topic)

# # Step 3: Apply the mapping to the dataframe
# gold_df_filtered["Merged_Topic"] = gold_df_filtered["Topic"].map(final_mapping)

In [None]:
# # remove the noise data
# noise_data = gold_df_filtered[gold_df_filtered.Merged_Topic == -1]

# gold_df_filtered = gold_df_filtered[gold_df_filtered.Merged_Topic != -1]

In [None]:
# noise_data.shape

In [None]:
# gold_df_filtered.shape

In [None]:
# len(gold_df_filtered["Merged_Topic"].unique())

In [None]:
# gold_df_filtered["Merged_Topic"].value_counts()

In [None]:
# gold_df_filtered[gold_df_filtered.Merged_Topic == 39]["News"]
# seems to be downward price movement

In [None]:
# gold_df_filtered[gold_df_filtered.Merged_Topic == 38]["News"]
# macroeconomic events

In [None]:
# gold_df_filtered[gold_df_filtered.Merged_Topic == 13]["News"].head(10)
# bullish movement

In [None]:
# gold_df_filtered[gold_df_filtered.Merged_Topic == 28]["News"].head(10)
# broader market conditions