In [9]:
!pip install bertopic

Collecting bertopic
  Downloading bertopic-0.17.3-py3-none-any.whl.metadata (24 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.40-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap_learn-0.5.9.post2-py3-none-any.whl.metadata (25 kB)
Collecting plotly>=4.7.0 (from bertopic)
  Downloading plotly-6.5.0-py3-none-any.whl.metadata (8.5 kB)
Collecting scikit-learn>=1.0 (from bertopic)
  Downloading scikit_learn-1.7.2-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting sentence-transformers>=0.4.1 (from bertopic)
  Downloading sentence_transformers-5.1.2-py3-none-any.whl.metadata (16 kB)
Collecting tqdm>=4.41.1 (from bertopic)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting llvmlite>0.36.0 (from bertopic)
  Downloading llvmlite-0.45.1-cp312-cp312-win_amd64.whl.metadata (5.0 kB)
Collecting scipy>=1.0 (from hdbscan>=0.8.29->bertopic)
  Downloading scipy-1.16.3-cp312-cp312-win_amd64.whl.meta


[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
#Connect to MongoDB
import os
from dotenv import load_dotenv
from pymongo import MongoClient
from pymongo.server_api import ServerApi

load_dotenv()

MONGODB_CONNECTION_STRING = os.getenv("MONGODB_CONNECTION_STRING")

client = MongoClient(MONGODB_CONNECTION_STRING, server_api=ServerApi('1'))
# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


Implementation for Topic Modeling

In [5]:
#Data cleaning for Topic Modeling
import pandas as pd
import pprint as pp
db = client["finance_news_db"]
collection = db["news_articles"]
data = list(collection.find({}))

#pp.pprint (collection.find_one())
# Turn into pandas DataFrame
df = pd.DataFrame(list(collection.find()))
df.head()





Unnamed: 0,_id,title,publisher,tickers,link,authors,time_published,content
0,692ae48a4a7fefe22f224b8a,Medicare Advantage woos seniors with plan perk...,Yahoo Finance,"[{'symbol': 'HUM', 'change': '-0.09%'}]",https://finance.yahoo.com/news/medicare-advant...,Kerry Hannon · Senior Columnist,"Sat, November 29, 2025 at 6:57 PM GMT+7",Seniors have embraced Medicare Advantage plans...
1,692ae48b4a7fefe22f224b8b,BofA Tracked Credit and Debit Spending By Gene...,Investopedia,"[{'symbol': 'BAC', 'change': None}]",https://finance.yahoo.com/news/bofa-tracked-cr...,Adam Hayes,"Sat, November 29, 2025 at 6:08 PM GMT+7",PeopleImages / Getty Images\nU.S. card spendin...
2,692ae48c4a7fefe22f224b8c,Palantir uses the '5 Whys' approach to problem...,Business Insider,"[{'symbol': 'PLTR', 'change': None}, {'symbol'...",https://finance.yahoo.com/news/palantir-uses-5...,Brent D. Griffiths,"Sat, November 29, 2025 at 6:01 PM GMT+7",Palantir CEO Alex Karp swears by a method that...
3,692ae48c4a7fefe22f224b8d,"HELOC rates today, November 29, 2025: Rates fa...",Yahoo Personal Finance,[],https://finance.yahoo.com/personal-finance/mor...,"Hal Bundrick, CFP® · Senior Writer Laur...","Sat, November 29, 2025 at 6:00 PM GMT+7",The national average HELOC rate remains under ...
4,692ae48d4a7fefe22f224b8e,Why the World’s Top Coffee Producer is Switchi...,Bloomberg,[],https://finance.yahoo.com/news/why-world-top-c...,Renata Carlos Daou,"Sat, November 29, 2025 at 6:00 PM GMT+7",A worker packs coffee cherries during a harves...


In [6]:
#Change time_published to Datetime Object
df['time_published'] = pd.to_datetime(df['time_published'], errors='coerce')

for col in df.columns:
    # Check if the column is of object type (usually strings in pandas)
    if df[col].dtype == 'object':
        # Apply .str.strip() if the column contains strings
        # Use errors='ignore' to handle non-string elements gracefully if any
        try:
            df[col] = df[col].astype(str).str.strip()
        except AttributeError:
            # Handle cases where .str might not be applicable (e.g., lists in 'tickers' column)
            pass

# Special handling for 'tickers' column if it contains list of dicts that need stripping
# This assumes the user wants to strip string values within the dictionaries as well
def strip_ticker_dict(ticker_list):
    if isinstance(ticker_list, list):
        cleaned_list = []
        for item in ticker_list:
            if isinstance(item, dict):
                cleaned_item = {k.strip() if isinstance(k, str) else k: v.strip() if isinstance(v, str) else v for k, v in item.items()}
                cleaned_list.append(cleaned_item)
            else:
                cleaned_list.append(item.strip() if isinstance(item, str) else item)
        return cleaned_list
    return ticker_list

df['tickers'] = df['tickers'].apply(strip_ticker_dict)

print("DataFrame after stripping spaces from string attributes:")

display(df.head())


DataFrame after stripping spaces from string attributes:


  df['time_published'] = pd.to_datetime(df['time_published'], errors='coerce')


Unnamed: 0,_id,title,publisher,tickers,link,authors,time_published,content
0,692ae48a4a7fefe22f224b8a,Medicare Advantage woos seniors with plan perk...,Yahoo Finance,"[{'symbol': 'HUM', 'change': '-0.09%'}]",https://finance.yahoo.com/news/medicare-advant...,Kerry Hannon · Senior Columnist,2025-11-29 18:57:00-07:00,Seniors have embraced Medicare Advantage plans...
1,692ae48b4a7fefe22f224b8b,BofA Tracked Credit and Debit Spending By Gene...,Investopedia,"[{'symbol': 'BAC', 'change': None}]",https://finance.yahoo.com/news/bofa-tracked-cr...,Adam Hayes,2025-11-29 18:08:00-07:00,PeopleImages / Getty Images\nU.S. card spendin...
2,692ae48c4a7fefe22f224b8c,Palantir uses the '5 Whys' approach to problem...,Business Insider,"[{'symbol': 'PLTR', 'change': None}, {'symbol'...",https://finance.yahoo.com/news/palantir-uses-5...,Brent D. Griffiths,2025-11-29 18:01:00-07:00,Palantir CEO Alex Karp swears by a method that...
3,692ae48c4a7fefe22f224b8d,"HELOC rates today, November 29, 2025: Rates fa...",Yahoo Personal Finance,[],https://finance.yahoo.com/personal-finance/mor...,"Hal Bundrick, CFP® · Senior Writer Laur...",2025-11-29 18:00:00-07:00,The national average HELOC rate remains under ...
4,692ae48d4a7fefe22f224b8e,Why the World’s Top Coffee Producer is Switchi...,Bloomberg,[],https://finance.yahoo.com/news/why-world-top-c...,Renata Carlos Daou,2025-11-29 18:00:00-07:00,A worker packs coffee cherries during a harves...


In [7]:
#Data preparation for Topic Modeling
df['content'] = df['content'].astype(str).str.replace('\n', ' ').str.replace('\r', ' ')
input_corpus = df['content'].tolist()
print(df[['content']][0:2])
print(input_corpus)

                                             content
0  Seniors have embraced Medicare Advantage plans...
1  PeopleImages / Getty Images U.S. card spending...


In [41]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired 

#Model Training
topic_model = BERTopic(language="english", 
                       calculate_probabilities=True, 
                       verbose=True,
                       representation_model=KeyBERTInspired(),
                       top_n_words=10,
                       nr_topics=10,
                       min_topic_size=4)

topics, probs = topic_model.fit_transform(input_corpus)


2025-12-01 19:58:15,764 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 4/4 [00:01<00:00,  2.13it/s]
2025-12-01 19:58:21,331 - BERTopic - Embedding - Completed ✓
2025-12-01 19:58:21,332 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-12-01 19:58:21,490 - BERTopic - Dimensionality - Completed ✓
2025-12-01 19:58:21,490 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-12-01 19:58:21,497 - BERTopic - Cluster - Completed ✓
2025-12-01 19:58:21,497 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-12-01 19:58:21,537 - BERTopic - Representation - Completed ✓
2025-12-01 19:58:21,538 - BERTopic - Topic reduction - Reducing number of topics
2025-12-01 19:58:21,538 - BERTopic - Topic reduction - Number of topics (10) is equal or higher than the clustered topics(8).
2025-12-01 19:58:21,539 - BERTopic - Representation - Fine-tuning topics using representation models.
2

In [42]:
freq = topic_model.get_topic_info()
print(freq.head())

   Topic  Count                                       Name  \
0     -1     15  -1_bankruptcies_markets_economy_financial   
1      0     11       0_shopping_retailers_shoppers_retail   
2      1     18               1_prices_futures_cocoa_wheat   
3      2      5                   2_fed_feds_federal_trump   
4      3     21                  3_stocks_stock_nasdaq_dow   

                                      Representation  \
0  [bankruptcies, markets, economy, financial, in...   
1  [shopping, retailers, shoppers, retail, sales,...   
2  [prices, futures, cocoa, wheat, cents, tariffs...   
3  [fed, feds, federal, trump, inflation, dollar,...   
4  [stocks, stock, nasdaq, dow, markets, market, ...   

                                 Representative_Docs  
0  [A Brazilian fintech firm, Tanssi, is rolling ...  
1  [Preliminary Black Friday sales data won't sta...  
2  [Lean hog futures were up 17 cents in the Dece...  
3  [As President Donald Trump prepares to nominat...  
4  [Stocks gain

In [44]:
#Visualization
topic_model.visualize_documents(
    input_corpus, 
    custom_labels=True, 
    title="News Articles in Latent Space"
)