In [9]:
!pip install bertopic

Collecting bertopic
  Downloading bertopic-0.17.3-py3-none-any.whl.metadata (24 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.40-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap_learn-0.5.9.post2-py3-none-any.whl.metadata (25 kB)
Collecting plotly>=4.7.0 (from bertopic)
  Downloading plotly-6.5.0-py3-none-any.whl.metadata (8.5 kB)
Collecting scikit-learn>=1.0 (from bertopic)
  Downloading scikit_learn-1.7.2-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting sentence-transformers>=0.4.1 (from bertopic)
  Downloading sentence_transformers-5.1.2-py3-none-any.whl.metadata (16 kB)
Collecting tqdm>=4.41.1 (from bertopic)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting llvmlite>0.36.0 (from bertopic)
  Downloading llvmlite-0.45.1-cp312-cp312-win_amd64.whl.metadata (5.0 kB)
Collecting scipy>=1.0 (from hdbscan>=0.8.29->bertopic)
  Downloading scipy-1.16.3-cp312-cp312-win_amd64.whl.meta


[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
#Connect to MongoDB
import os
from dotenv import load_dotenv
from pymongo import MongoClient
from pymongo.server_api import ServerApi

load_dotenv()

MONGODB_CONNECTION_STRING = os.getenv("MONGODB_CONNECTION_STRING")

client = MongoClient(MONGODB_CONNECTION_STRING, server_api=ServerApi('1'))
# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


Implementation for Topic Modeling

In [2]:
#Data cleaning for Topic Modeling
import pandas as pd

db = client["finance_news_db"]
collection = db["numerous_articles"]

#DataFrame
df = pd.DataFrame(list(collection.find()))
df.head()

Unnamed: 0,_id,title,publisher,tickers,link,authors,time_published,content
0,692af2e04a7fefe22f224c5f,Rising star catch-up: Chad Tredway is back at ...,Business Insider,"[{'symbol': 'JPM', 'change': None}]",https://finance.yahoo.com/news/rising-star-cat...,dgeiger@businessinsider.com (Daniel Geiger),"Sat, November 29, 2025 at 7:20 PM GMT+7",Chad Tredway was a rising star at JPMorgan Cha...
1,692af2e14a7fefe22f224c60,Want Steady Income in Retirement? These Overlo...,Investopedia,[],https://finance.yahoo.com/news/want-steady-inc...,Jonathan Ponciano,"Sat, November 29, 2025 at 7:08 PM GMT+7",Klaus Vedfelt / Getty Images\n93% of workers w...
2,692af2e24a7fefe22f224c61,The hottest new AI company is…Google?,CNN Business,[],https://finance.yahoo.com/news/hottest-ai-comp...,"Analysis by Lisa Eadicicco, CNN","Sat, November 29, 2025 at 7:01 PM GMT+7",Google just threw another twist in the fast-ch...
3,692af2e34a7fefe22f224c62,Medicare Advantage woos seniors with plan perk...,Yahoo Finance,"[{'symbol': 'HUM', 'change': None}]",https://finance.yahoo.com/news/medicare-advant...,Kerry Hannon · Senior Columnist,"Sat, November 29, 2025 at 6:57 PM GMT+7",Seniors have embraced Medicare Advantage plans...
4,692af2e44a7fefe22f224c63,Why Waiting for a Housing Crash Could Be Costi...,Investopedia,[],https://finance.yahoo.com/news/why-waiting-hou...,Isabel O'Brien,"Sat, November 29, 2025 at 6:39 PM GMT+7",Fact checked by Suzanne Kvilhaug\nDrs Producoe...


In [4]:
#Data preparation for Topic Modeling
df['content'] = df['content'].astype(str).str.replace('\n', ' ').str.replace('\r', ' ')
input_corpus = df['content'].tolist()
print(df[['content']][0:2])
print(input_corpus)

                                             content
0  Chad Tredway was a rising star at JPMorgan Cha...
1  Klaus Vedfelt / Getty Images 93% of workers wa...


In [20]:
from umap import UMAP
import hdbscan
import numpy as np

#Setup Topic Modeling parameters
top_words = 5;
topic_size = 5;
np.random.seed(67)

umap_model = UMAP(n_neighbors=15, 
                  n_components=15, 
                  min_dist=0.0, 
                  metric='cosine', 
                  random_state=67)

hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=15, 
                                metric='euclidean', 
                                cluster_selection_method='eom', 
                                prediction_data=True)

In [21]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired 

#Model Training
topic_model = BERTopic(language="english", 
                       calculate_probabilities=True, 
                       verbose=True,
                       representation_model=KeyBERTInspired(),
                       top_n_words=top_words,
                       min_topic_size=topic_size,
                       umap_model=umap_model,          
                       hdbscan_model=hdbscan_model)

topics, probs = topic_model.fit_transform(input_corpus)

2025-12-02 14:25:19,575 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 25/25 [00:15<00:00,  1.66it/s]
2025-12-02 14:25:38,822 - BERTopic - Embedding - Completed ✓
2025-12-02 14:25:38,823 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-12-02 14:25:40,716 - BERTopic - Dimensionality - Completed ✓
2025-12-02 14:25:40,717 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-12-02 14:25:40,762 - BERTopic - Cluster - Completed ✓
2025-12-02 14:25:40,764 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-12-02 14:25:41,542 - BERTopic - Representation - Completed ✓


In [22]:
#Topic Representation
topic_info = topic_model.get_topic_info()
#print(topic_info)
#index -1 = noise

#Display popular words in each topic
for topic in topic_info['Topic'].tolist():
    if topic == -1:
        continue
    #print(f"Topic {topic}: {topic_model.get_topic(topic)}")
    print(f"Topic {topic_info['Name'][topic_info['Topic'] == topic].values[0]}: {topic_info['Representation'][topic_info['Topic'] == topic].values[0]}")

Topic 0_investors_interest_bank_markets: ['investors', 'interest', 'bank', 'markets', 'reserve', 'assets', 'market', 'fed', 'rates', 'crypto']
Topic 1_savings_financial_rates_rate: ['savings', 'financial', 'rates', 'rate', 'cash', 'mortgage', 'interest', 'income', 'average', 'earn']
Topic 2_ceo_investors_market_bubble: ['ceo', 'investors', 'market', 'bubble', 'companies', 'ai', 'billion', 'apple', 'stock', 'business']
Topic 3_investments_cash_revenue_profitable: ['investments', 'cash', 'revenue', 'profitable', 'market', 'demand', 'valuation', 'stocks', 'business', 'companies']
Topic 4_market_prices_assets_2026: ['market', 'prices', 'assets', '2026', 'million', 'production', 'traders', 'capacity', 'exports', 'bid']
Topic 5_prices_sales_dollars_interest: ['prices', 'sales', 'dollars', 'interest', 'dollar', 'gambling', 'cents', 'dinar', 'peso', 'futures']
Topic 6_revenue_marketing_business_revenues: ['revenue', 'marketing', 'business', 'revenues', 'stock', 'share', 'analysts', 'million', 

In [25]:
#Most Popular Words per Topic Visualization 
topic_model.visualize_barchart(
    top_n_topics=11,
    n_words=5, 
    title=f"Top {5} Most Popular Words per Topic"
)

In [26]:
#Visualization
topic_model.visualize_documents(
    input_corpus, 
    title="News Topic in Latent Space"
)