In [1]:
import pandas as pd
import numpy as np
import re

from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer

from umap import UMAP
from hdbscan import HDBSCAN, all_points_membership_vectors
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

import matplotlib.pyplot as plt
from sklearn.metrics import davies_bouldin_score

  from .autonotebook import tqdm as notebook_tqdm


## Load & Preprocess Headlines

In [2]:
gold_df = pd.read_csv("gold-dataset-sinha-khandait.csv")
headlines = gold_df["News"].dropna().astype(str).tolist()

# Custom cleaning rules
months = r"\b(january|february|march|april|may|june|july|august|september|october|november|december|jan|feb|mar|apr|jun|jul|aug|sep|oct|nov|dec)\b"
directions = r"\b(up|down|higher|lower|rise|rises|fall|falls|gain|gains|loses|loss|rebound|slip|climb|surge|drop|drops|edged|edges|recover|recovery|recovers|flat)\b"
numbers = r"[\d\.,]+[%$]?|\d{1,3}(,\d{3})*(\.\d+)?|\d+"
symbols = r"\/oz|rs|bn|usd|\$|%|oz"


# Clean each headline
def clean_text(text):
    text = text.lower()
    text = re.sub(months, "", text)
    text = re.sub(directions, "", text)
    text = re.sub(numbers, "", text)
    text = re.sub(symbols, "", text)
    text = re.sub(r"[^\w\s]", "", text)
    return re.sub(r"\s+", " ", text).strip()


cleaned_headlines = [clean_text(h) for h in headlines]

## Model Setup (Embedding, Vectorizer, UMAP, HDBSCAN)

In [3]:
embedding_model = SentenceTransformer("all-mpnet-base-v2")

In [4]:
vectorizer = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 3),
    min_df=10,
    max_df=0.5,
    max_features=5000,
    token_pattern=r"(?u)\b[\w\-]+\b",
)

In [5]:
umap_model = UMAP(
    n_neighbors=15, n_components=5, min_dist=0.0, metric="cosine", random_state=42
)

In [6]:
hdbscan_model = HDBSCAN(
    min_cluster_size=60,
    min_samples=10,
    cluster_selection_epsilon=0.1,
    prediction_data=True,
)

## Fit BERTopic

In [7]:
topic_model = BERTopic(
    embedding_model=embedding_model,  # use object, not string
    hdbscan_model=hdbscan_model,
    umap_model=umap_model,
    vectorizer_model=vectorizer,
    verbose=True,
)

topics, probs = topic_model.fit_transform(cleaned_headlines)

2025-04-13 21:58:10,114 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 331/331 [00:16<00:00, 19.73it/s]
2025-04-13 21:58:26,953 - BERTopic - Embedding - Completed ✓
2025-04-13 21:58:26,954 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2025-04-13 21:58:39,271 - BERTopic - Dimensionality - Completed ✓
2025-04-13 21:58:39,272 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-13 21:58:39,455 - BERTopic - Cluster - Completed ✓
2025-04-13 21:58:39,457 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-13 21:58:39,567 - BERTopic - Representation - Completed ✓


In [8]:
topic_model.visualize_topics().show()
topic_model.visualize_barchart(top_n_topics=10).show()
topic_model.visualize_hierarchy().show()
topic_model.visualize_heatmap()

## Create Probability DataFrame

In [9]:
prob_matrix = np.array(all_points_membership_vectors(topic_model.hdbscan_model))
normalized_prob = prob_matrix / prob_matrix.sum(axis=1, keepdims=True)

prob_df = pd.DataFrame(
    normalized_prob, columns=[str(i) for i in range(normalized_prob.shape[1])]
)
prob_df["dominant_topic"] = prob_df.idxmax(axis=1).astype(int)
prob_df["topic"] = topics

## Distance-Based Macro Topic Merging

In [10]:
embeddings = topic_model.topic_embeddings_
similarity = cosine_similarity(embeddings)
distance = 1 - similarity

In [11]:
from sklearn.metrics import davies_bouldin_score

# Evaluate DB score for different cluster counts
best_score = float("inf")
best_t = None
Z = linkage(distance, method="average")
print("Davies-Bouldin Scores by Number of Macro Groups:")
for t in range(10, 20, 2):  # Try 10 to 50 macro groups
    cluster_labels = fcluster(Z, t=t, criterion="maxclust")
    try:
        db_score = davies_bouldin_score(distance, cluster_labels)
        print(f"{t} clusters → DB Score: {db_score:.4f}")
        if db_score < best_score:
            best_score = db_score
            best_t = t
    except Exception as e:
        print(f"{t} clusters → DB Score: [ERROR] {str(e)}")

print(f"\n✅ Best number of clusters (lowest DB): {best_t} with score {best_score:.4f}")

Davies-Bouldin Scores by Number of Macro Groups:
10 clusters → DB Score: 0.8922
12 clusters → DB Score: 0.9024
14 clusters → DB Score: 0.8250
16 clusters → DB Score: 0.7500
18 clusters → DB Score: 0.6421

✅ Best number of clusters (lowest DB): 18 with score 0.6421


In [12]:
macro_labels = fcluster(Z, t=best_t, criterion="maxclust")

In [13]:
topic_group_map = pd.DataFrame(
    {"Original_Topic": np.arange(len(macro_labels)), "Macro_Group": macro_labels}
)

In [14]:
topic_group_map

Unnamed: 0,Original_Topic,Macro_Group
0,0,15
1,1,3
2,2,3
3,3,3
4,4,3
5,5,13
6,6,3
7,7,6
8,8,3
9,9,9


## Assign Macro Groups and Compute Log Probabilities

In [15]:
# Step 1: Create topic → macro group mapping (aligned by topic index order)
macro_group_map_vector = (
    topic_group_map.set_index("Original_Topic")
    .loc[list(range(prob_matrix.shape[1]))]["Macro_Group"]
    .values
)

# Step 2: Convert to one-hot (n_topics x n_macro_groups)
macro_group_onehot = pd.get_dummies(macro_group_map_vector)

# Step 3: Matrix multiplication
macro_group_prob = prob_matrix @ macro_group_onehot.values

# Step 4: Convert to log-space safely
epsilon = 1e-12
log_macro_group_prob = np.log(macro_group_prob + epsilon)

# Step 5: Store in DataFrame
prob_df["all_probabilities"] = log_macro_group_prob.tolist()

## Merge Probabilities and Groups Back to Data

In [16]:
gold_df_filtered = gold_df.loc[gold_df["News"].notna()].copy()

gold_df_filtered["Topic"] = topics
gold_df_filtered["Macro_Group"] = [
    (
        topic_group_map.loc[
            topic_group_map["Original_Topic"] == t, "Macro_Group"
        ].values[0]
        if t in topic_group_map["Original_Topic"].values
        else -1
    )
    for t in topics
]
gold_df_filtered["Probabilities"] = prob_df["all_probabilities"]

In [17]:
# Convert list of log-probs back to array
log_probs_array = np.array(gold_df_filtered["Probabilities"].tolist())

# Get argmax (highest log-probability → dominant macro group index)
dominant_macro_groups = log_probs_array.argmax(axis=1)

# Add it as a new column
gold_df_filtered["Dominant_Macro_Group"] = dominant_macro_groups

In [18]:
gold_df_filtered.head()

Unnamed: 0,Dates,URL,News,Price Direction Up,Price Direction Constant,Price Direction Down,Asset Comparision,Past Information,Future Information,Price Sentiment,Topic,Macro_Group,Probabilities,Dominant_Macro_Group
0,28-01-2016,http://www.marketwatch.com/story/april-gold-do...,"april gold down 20 cents to settle at $1,116.1...",0,0,1,0,1,0,negative,33,16,"[-27.631021115928547, -27.631021115928547, 1.0...",2
1,13-09-2017,http://www.marketwatch.com/story/gold-prices-s...,gold suffers third straight daily decline,0,0,1,0,1,0,negative,4,3,"[-27.631021115928547, -27.631021115928547, 1.0...",2
2,26-07-2016,http://www.marketwatch.com/story/gold-futures-...,Gold futures edge up after two-session decline,1,0,0,0,1,0,positive,28,7,"[-27.631021115928547, -27.631021115928547, -27...",5
3,28-02-2018,https://www.metalsdaily.com/link/277199/dent-r...,dent research : is gold's day in the sun comin...,0,0,0,0,0,1,none,23,11,"[-27.631021115928547, -27.631021115928547, -27...",9
4,06-09-2017,http://www.marketwatch.com/story/gold-steadies...,"Gold snaps three-day rally as Trump, lawmakers...",0,0,1,0,1,0,negative,-1,-1,"[-4.196897111170408, -5.228338252285433, -2.99...",15


In [19]:
matching_rows = (
    gold_df_filtered["Dominant_Macro_Group"] == gold_df_filtered["Macro_Group"]
)
matching_count = matching_rows.sum()

print(f"Number of matching rows: {matching_count}")

Number of matching rows: 307


In [20]:
gold_df_filtered.Macro_Group.value_counts().sort_index()

Macro_Group
-1     3298
 1      357
 2      196
 3     2811
 4      247
 5       69
 6      879
 7       90
 8       81
 9      201
 10     158
 11     105
 12     486
 13     295
 14     100
 15     715
 16     146
 17     149
 18     187
Name: count, dtype: int64

In [21]:
gold_df_filtered.Dominant_Macro_Group.value_counts().sort_index()

Dominant_Macro_Group
0      196
1       97
2     3141
3      202
4      428
5     1671
6       73
7      302
8       68
9      410
10     660
11    2084
12     189
13     150
14     104
15     547
16     183
17      65
Name: count, dtype: int64

## Group Headlines by Macro Group

In [22]:
import google.generativeai as genai
import random
import time
from tqdm import tqdm

In [23]:
# Assuming gold_df contains the cleaned headlines and 'Macro_Group'
grouped_headlines = (
    gold_df_filtered.groupby("Macro_Group")["News"].apply(list).to_dict()
)

In [None]:
# Get top keywords for each topic
topic_keywords = {
    topic_num: [word for word, _ in topic_model.get_topic(topic_num)]
    for topic_num in topic_model.get_topic_info()["Topic"]
    if topic_num != -1
}

# Map macro group to all topics it contains
macro_to_topics = (
    topic_group_map.groupby("Macro_Group")["Original_Topic"].apply(list).to_dict()
)

# For each macro group, collect top N keywords from each associated topic
macro_group_keywords = {}
for macro_id, topic_ids in macro_to_topics.items():
    keywords = []
    for tid in topic_ids:
        # 5 keys words from each macro topic that get added into the keywords
        keywords.extend(topic_keywords.get(tid, [])[:5])
    # Keep top 25 most common keywords
    macro_group_keywords[macro_id] = list(dict.fromkeys(keywords))[:25]

In [None]:
all_keywords_prompt = f"""You are helping identify economic themes.

Below is a dictionary where each key represents a macro group ID, and the value is a list of keywords extracted from clustered gold-related news headlines.

Your task is to generate a short (3–6 word) **unique** theme label summarizing the main idea of each keyword group. Ensure no two groups share the same theme label.

macro_group_keywords = {macro_group_keywords}

Return your response in the following format:

{{ 
  group_id_1: "Theme Label 1", 
  group_id_2: "Theme Label 2", 
  ...
}}
"""

In [None]:
# Step 1: Configure Gemini
genai.configure(api_key="your_key")  # replace with your actual API key
model = genai.GenerativeModel("models/gemini-2.0-flash")

In [None]:
# Generate all theme labels in one go
try:
    response = model.generate_content(all_keywords_prompt)
    response_text = response.text.strip()

    # Safely evaluate the returned dictionary (assuming it's Python-style dict)
    import ast

    # If response was a stringified JSON
    cleaned = response_text.strip().replace("\\n", "").replace("\n", "")
    cleaned = re.sub(r"^```json|^```|```$", "", cleaned.strip(), flags=re.MULTILINE)
    macro_group_labels = ast.literal_eval(cleaned)

except Exception as e:
    macro_group_labels = {"error": f"[ERROR] {str(e)}"}

In [29]:
macro_group_labels

{1: 'Gold Price Support Levels',
 2: 'Market Rates & Gold Demand',
 3: 'Global Investment & Rate Impact',
 4: 'Gold Settlement Price Correction',
 5: 'Time Sensitive Prediction',
 6: 'Economic Data & Gold Performance',
 7: 'Gold Trading Momentum Continues',
 8: 'Early Gold Trading Activity',
 9: 'Gold Spot Price Outlook',
 10: 'Comex Gold Closing Prices',
 11: 'Gold Market Turning Points',
 12: 'Global Cues and Gold Stability',
 13: 'China, Fed & Gold Focus',
 14: 'Gold Prices & Metal Shares',
 15: 'Gold Near Monthly Lows',
 16: 'Gold Climbs in Trading',
 17: 'Gold Holds as Data Releases',
 18: 'Rate Hikes/Cuts Price Impact'}

In [None]:
gold_df_filtered["Topic_Label"] = gold_df_filtered["Macro_Group"].map(
    macro_group_labels
)

In [None]:
gold_df_filtered["Topic_Label"] = np.where(
    gold_df_filtered["Macro_Group"] == -1, "noise", gold_df_filtered["Topic_Label"]
)

In [48]:
gold_df_filtered

Unnamed: 0,Dates,URL,News,Price Direction Up,Price Direction Constant,Price Direction Down,Asset Comparision,Past Information,Future Information,Price Sentiment,Topic,Macro_Group,Probabilities,Dominant_Macro_Group,Topic_Label
0,28-01-2016,http://www.marketwatch.com/story/april-gold-do...,"april gold down 20 cents to settle at $1,116.1...",0,0,1,0,1,0,negative,33,16,"[-27.631021115928547, -27.631021115928547, 1.0...",2,Gold Climbs in Trading
1,13-09-2017,http://www.marketwatch.com/story/gold-prices-s...,gold suffers third straight daily decline,0,0,1,0,1,0,negative,4,3,"[-27.631021115928547, -27.631021115928547, 1.0...",2,Global Investment & Rate Impact
2,26-07-2016,http://www.marketwatch.com/story/gold-futures-...,Gold futures edge up after two-session decline,1,0,0,0,1,0,positive,28,7,"[-27.631021115928547, -27.631021115928547, -27...",5,Gold Trading Momentum Continues
3,28-02-2018,https://www.metalsdaily.com/link/277199/dent-r...,dent research : is gold's day in the sun comin...,0,0,0,0,0,1,none,23,11,"[-27.631021115928547, -27.631021115928547, -27...",9,Gold Market Turning Points
4,06-09-2017,http://www.marketwatch.com/story/gold-steadies...,"Gold snaps three-day rally as Trump, lawmakers...",0,0,1,0,1,0,negative,-1,-1,"[-4.196897111170408, -5.228338252285433, -2.99...",15,noise
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10565,07-01-2013,https://www.moneycontrol.com/news/business/mar...,gold seen falling from 3-week high this week,0,0,1,0,1,0,negative,0,15,"[-27.631021115928547, -27.631021115928547, -27...",10,Gold Near Monthly Lows
10566,27-09-2018,https://www.metalsdaily.com/link/284468/domini...,dominic frisby : now looks like a good time to...,1,0,0,0,0,1,positive,1,3,"[-4.0906712565573615, -5.106674554783079, -2.5...",4,Global Investment & Rate Impact
10567,03-03-2017,https://www.thehindubusinessline.com/markets/g...,Gold heading for worst week since November on ...,0,0,1,0,1,0,negative,0,15,"[-3.7496462953932133, -4.533892448909454, -2.3...",10,Gold Near Monthly Lows
10568,11-06-2008,http://www.marketwatch.com/story/august-gold-u...,august gold up $7.60 at $878.80 an ounce on nymex,1,0,0,0,1,0,positive,21,3,"[-3.4146375360315675, -4.1595826528082815, -1....",2,Global Investment & Rate Impact


In [50]:
gold_df_filtered.to_csv("gold-dataset-clustered-v4.csv")

## Test on new headline

In [None]:
test_sentence = "april gold down 20 cents to settle at $1,116.1"
cleaned = re.sub(months, "", test_sentence.lower())
cleaned = re.sub(directions, "", cleaned)
cleaned = re.sub(numbers, "", cleaned)
cleaned = re.sub(symbols, "", cleaned)
cleaned = re.sub(r"[^\w\s]", "", cleaned)
cleaned = re.sub(r"\s+", " ", cleaned).strip()

topic, prob = topic_model.transform([cleaned])
if topic == -1:
    print("noise")
else:
    merged_group = topic_group_map.loc[
        topic_group_map["Original_Topic"] == topic[0], "Macro_Group"
    ].values[0]
    print(merged_group)

Batches: 100%|██████████| 1/1 [00:00<00:00, 16.77it/s]
2025-04-13 22:08:45,333 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-04-13 22:08:45,335 - BERTopic - Dimensionality - Completed ✓
2025-04-13 22:08:45,335 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-04-13 22:08:45,335 - BERTopic - Cluster - Completed ✓


16
