<a href="https://colab.research.google.com/github/tournemonde/topic_modeling/blob/main/Topics_and_Trends_on_News.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Topic Modeling with BERTopic and Gemma**



# **Instructions**
You need a Hugging Face API token with access to the selected model. Store the token in the Colab secrets section under the name `HF_TOKEN`, or enter it at the Hugging Face login prompt.


# Environment Installation

#### Package Installation

In [1]:
# NOTE(review): only torch/torchvision are version-pinned below; every other package
# installs whatever release is current, so a fresh run may not reproduce this environment.

# Install core libraries including BERTopic and others
!pip install bertopic datasets adjustText

# Install HuggingFace and related libraries
!pip install --upgrade transformers accelerate bitsandbytes

# Install pytorch
!pip install torch==2.1.0 torchvision==0.16.0 --index-url https://download.pytorch.org/whl/cu118

# Install sentence-transformers for embedding model
!pip install sentence-transformers

# Install Kagglehub for dataset loading
!pip install kagglehub

# Install safetensors for saving the BERTopic model
!pip install safetensors

# Necessary for transformers
!pip install xformers

Collecting bertopic
  Downloading bertopic-0.17.0-py3-none-any.whl.metadata (23 kB)
Collecting adjustText
  Downloading adjustText-1.3.0-py3-none-any.whl.metadata (3.1 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cubl

# Data Engine

## General Imports

In [2]:
import re
import pandas as pd
import numpy as np

## Functions

In [3]:
# Clean the topics found by gemma into something more human
def extract_clean_label(label_list: list[str]) -> str:
    """Turn a raw LLM topic label into a short, human-readable string.

    Only the first element of ``label_list`` is used. The cleaning steps are:

    1. Prefer markdown-bold content (``**...**``) unless the bold part is
       chat filler ("Sure", "Please"); in that case keep the text before it.
    2. Without bold markup, if the text contains "Sure", keep what follows
       the first colon.
    3. Drop "Please" and everything after it.
    4. Strip markup residue, punctuation and known boilerplate phrases.
    5. Collapse whitespace and cap the result at 70 characters.

    Filler words are matched as whole words, so labels that merely contain
    them ("Insurance", "Measure", "Pleased") are left intact.

    Args:
        label_list: List whose first item is the generated label text.

    Returns:
        The cleaned label, or ``""`` when the input is empty, is not a list,
        or its first element is not a string.
    """
    if not label_list or not isinstance(label_list, list):
        return ""

    original_text = label_list[0]
    if not isinstance(original_text, str):
        return ""
    text = original_text

    # Check for md bold pattern
    bold_match = re.search(r"\*\*(.*?)\*\*", text)

    if bold_match:
        bold_content = bold_match.group(1)
        if re.search(r"\b(?:Sure|Please)\b", bold_content, re.IGNORECASE):
            # The bold part is chat filler: keep whatever precedes it.
            text = original_text.split("**")[0]
        else:
            text = bold_content
    elif re.search(r"\bSure\b", text, re.IGNORECASE):
        # "Sure, here is the label: X" -> keep what follows the first colon.
        colon_index = text.find(":")
        if colon_index != -1:
            text = text[colon_index + 1:]

    # Remove "Please" and everything after it
    please_match = re.search(r"\bPlease\b", text, re.IGNORECASE)
    if please_match:
        text = text[:please_match.start()]

    # Remove specific substrings and characters. Both the opening and closing
    # tag are stripped here, before "/" is removed, so "</strong>" cannot
    # degrade into a stray "<strong>".
    text = re.sub(r"</?strong>", "", text, flags=re.IGNORECASE)
    text = re.sub(r"\$bn", "", text, flags=re.IGNORECASE)
    text = re.sub(r"& FY", "", text, flags=re.IGNORECASE)
    for char in ("*", ":", ".", "/", "#"):
        text = text.replace(char, "")
    # Remove "(too much text provided..." and anything after
    text = re.sub(r"\(too much text provided.*", "", text, flags=re.IGNORECASE)
    # Remove "these prompts..." and anything after
    text = re.sub(r"these prompts.*", "", text, flags=re.IGNORECASE)

    # A lone quote character is unbalanced noise; paired quotes are kept.
    if text.count("'") == 1:
        text = text.replace("'", "")
    if text.count('"') == 1:
        text = text.replace('"', "")

    # Remove trailing ampersand if it appears at the end
    text = re.sub(r"\s*&\s*$", "", text)

    # Remove multiple spaces and strip leading/trailing whitespace
    text = re.sub(r"\s+", " ", text).strip()

    # Truncate to a maximum of 70 characters; the cut may land right after a
    # space, so trim again.
    if len(text) > 70:
        text = text[:70].rstrip()

    return text

In [4]:
# Generate random dates for each article to plot topics over time
def create_random_dates(
    df: pd.DataFrame,
    start: str = '2012-12-31',
    end: str = '2021-01-01',
    seed: int = 42,
) -> pd.DataFrame:
    """Add a reproducible pseudo-random ``date`` column to ``df``.

    Each row gets a date drawn uniformly from ``[start, end)`` at day
    resolution. The draw uses a private ``RandomState`` so the global NumPy
    random state is left untouched; with the default seed the values are the
    same as seeding the global generator with 42.

    Args:
        df: Frame to annotate. It is modified in place.
        start: First possible date (inclusive).
        end: End of the range (exclusive). Must be later than ``start``.
        seed: Seed for the private random generator.

    Returns:
        The same ``df`` object, now carrying a ``date`` column.

    Raises:
        ValueError: If ``end`` is not at least one day after ``start``.
    """
    start_date = pd.to_datetime(start)
    end_date = pd.to_datetime(end)

    days_in_range = (end_date - start_date).days
    if days_in_range <= 0:
        raise ValueError("end must be at least one day after start")

    # Legacy RandomState yields the same stream as np.random.seed(seed)
    # followed by np.random.randint, without reseeding the global generator.
    rng = np.random.RandomState(seed)
    random_days = rng.randint(0, days_in_range, size=len(df))

    df['date'] = start_date + pd.to_timedelta(random_days, unit='D')
    return df



## Load data


In [5]:
# Load the AG News training CSV directly from GitHub. The file is read with header=None,
# so the columns come back labelled with the integers 0, 1 and 2.
news_df = pd.read_csv("https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/train.csv", header = None)



In [6]:
# Give the positional CSV columns readable names: column 1 becomes the heading,
# column 2 the article text, and the numeric class id in column 0 is translated
# into its news-type name. The original integer-labelled columns are then dropped.
news_df = (
    news_df
    .assign(
        Heading=news_df[1],
        Article=news_df[2],
        NewsType=news_df[0].replace({1: 'World', 2: 'Sports', 3: 'Business', 4: 'Sci/Tech'}),
    )
    .drop(columns=[0, 1, 2])
)

# Persist the reshaped articles, then show how many rows each news type has.
news_df.to_parquet("news_articles.parquet")
news_df['NewsType'].value_counts()

Unnamed: 0_level_0,count
NewsType,Unnamed: 1_level_1
Business,30000
Sci/Tech,30000
Sports,30000
World,30000


In [7]:
# Preview the first rows of the reshaped frame.
news_df.head()

Unnamed: 0,Heading,Article,NewsType
0,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli...",Business
1,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...,Business
2,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...,Business
3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...,Business
4,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco...",Business


In [8]:
# One-line size summary followed by the per-column dtype / non-null report.
print(f"Dataset length. rows: {news_df.shape[0]}  columns: {news_df.shape[1]} ({', '.join(news_df.columns)})")
# DataFrame.info() writes its report itself and returns None, so it must be
# called as a statement; embedding it in an f-string prints a stray "None".
print("Description")
news_df.info()

Dataset length. rows: 120000  columns: 3 (Heading, Article, NewsType)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   Heading   120000 non-null  object
 1   Article   120000 non-null  object
 2   NewsType  120000 non-null  object
dtypes: object(3)
memory usage: 2.7+ MB
Description
None


### Basic transforms

In [9]:
# If the source data ships a raw "Date" column, convert it to midnight-normalised
# timestamps under the lowercase name "date" and drop the original.
# The guard checks the column that is actually read ("Date"), so this cell is a
# no-op when only the generated lowercase "date" column exists.
if "Date" in news_df.columns:
    news_df["date"] = pd.to_datetime(news_df["Date"]).dt.normalize()
    news_df = news_df.drop(columns="Date")

In [10]:
# Trim leading and trailing whitespace from every article text.
news_df["Article"] = news_df.Article.str.strip()

In [11]:
# Remove everything from the start of the text through the last "strong>" found before
# the first line break (the match is greedy and "." does not cross newlines).
news_df["Article"] = news_df["Article"].str.replace(r"^.*strong>", "", regex=True)

In [12]:
# Show one sample value per column, all taken from the row at position 1000.
for col, values in news_df.items():
    example = values.iloc[1000]
    print(f"\n'{col}' example:\n{example}")


'Heading' example:
European Union Extends Microsoft-Time Warner Review

'Article' example:
BRUSSELS, Belgium (AP) -- European antitrust regulators said Monday they have extended their review of a deal between Microsoft Corp. (MSFT) and Time Warner Inc...

'NewsType' example:
Sci/Tech


### Login to HuggingFace for inference with Gemma LLM

In [13]:
# True: authenticate with the token stored in the Colab secret `HF_TOKEN`.
# False: fall back to the interactive notebook login widget.
WITH_SECRET = True

from google.colab import userdata
from huggingface_hub import login
from huggingface_hub import notebook_login

if WITH_SECRET:
    hf_token = userdata.get('HF_TOKEN')
    # Pass the token explicitly; a bare login() ignores the secret and
    # prompts for a token instead.
    login(token=hf_token)
else:
    notebook_login()



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Models configuration

In [14]:
from torch import cuda

# Hugging Face repo id of the model loaded below for topic labelling.
model_id = "google/gemma-7b-it"
# Use the currently selected CUDA device when a GPU is available, otherwise the CPU.
# NOTE(review): `device` is only printed in the cells visible here; the model itself is
# placed via device_map="auto".
device = f"cuda:{cuda.current_device()}" if cuda.is_available() else "cpu"

print(device)

cuda:0


In [15]:
# set quantization configuration - less GPU memory

import transformers
from torch import bfloat16

# 4-bit quantization settings handed to from_pretrained when the model is loaded.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,               # load the weights in 4-bit form
    bnb_4bit_quant_type="nf4",       # use the "nf4" 4-bit quantization type
    # bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16  # run computations in bfloat16
)

### Tokenizer and Model creation

In [16]:
# Tokenizer matching the selected model.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)

# Load the causal LM with the 4-bit quantization config; device_map="auto" leaves
# device placement to the library.
# NOTE(review): trust_remote_code=True permits running Python code shipped inside the
# model repository — confirm it is actually required for this model.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map="auto",
)
# Switch to evaluation mode; as the last expression this also prints the module tree.
model.eval()

tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

  warn(


model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.11G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 3072, padding_idx=0)
    (layers): ModuleList(
      (0-27): 28 x GemmaDecoderLayer(
        (self_attn): GemmaAttention(
          (q_proj): Linear4bit(in_features=3072, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=3072, bias=False)
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=24576, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=24576, bias=False)
          (down_proj): Linear4bit(in_features=24576, out_features=3072, bias=False)
          (act_fn): GELUActivation()
        )
        (input_layernorm): GemmaRMSNorm((3072,), eps=1e-06)
        (post_attention_layernorm): GemmaRMSNorm((3072,), eps=1e-06)
      )
    )
    

In [17]:
# Text generator
# Wraps the loaded model and tokenizer in a text-generation pipeline. At most 20 new
# tokens are generated per call, and repetition is penalised (repetition_penalty=1.1).
# NOTE(review): `temperature` normally only takes effect when sampling is enabled
# (do_sample=True), which is not set here — confirm which decoding mode is intended.
generator = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    task="text-generation",
    temperature=0.1,
    max_new_tokens=20,
    repetition_penalty=1.1
)

Device set to use cuda:0


### Prompt Template to query the LLM model

In [18]:
# System prompt describes information given to all conversations
# NOTE(review): the <s>[INST] / <<SYS>> markers look like the Llama-2 chat format, while
# model_id points at a Gemma model — confirm this markup is what the model expects.
system_prompt = """
<s>[INST] <<SYS>>
<</SYS>>
You are a helpful assistant for labeling topics in news articles. Your goal is find the facts, and possibly opinions, of the article and create a short label that describes the topic of that article.
"""

In [19]:
# Example prompt describes how to proceed
# Three worked examples (few-shot): each lists sample documents and keywords, repeats the
# labelling instruction, and ends with the expected short label after [/INST].
example_prompt = """
I have a topic containing the following documents:
- Renewable energy sources such as solar and wind are becoming more cost-effective and widespread.
- Investing in renewable energy can significantly reduce greenhouse gas emissions.
- Governments are increasingly subsidizing renewable energy projects to fight climate change.

The topic is described by the keywords: 'renewable, energy, solar, wind, emissions, climate, investment, subsidy'.

Based on the information about the topic above, please create a concise label for this topic in 1 to 5 words. The label should be neutral, clearly descriptive, and summarize the core theme of the documents. Return only the label and nothing more.

[/INST] Growth of renewable energy

I have a topic containing the following documents:
- The new smartphone model features advanced AI capabilities and improved camera quality.
- Tech companies compete fiercely to dominate the market with innovative smartphone designs.
- Smartphone sales have significantly impacted global consumer electronics markets.

The topic is described by the keywords: 'smartphone, technology, AI, market, camera, sales, electronics, consumer'.

Based on the information about the topic above, please create a concise label for this topic in 1 to 5 words. The label should be neutral, clearly descriptive, and summarize the core theme of the documents. Return only the label and nothing more.

[/INST] Innovations in smartphone technology

I have a topic containing the following documents:
- Eating habits are shifting toward plant-based diets due to health and environmental concerns.
- The environmental impact of industrial meat production is prompting people to reconsider their dietary choices.
- Processed meat consumption has been linked to various health risks.

The topic is described by the keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.

Based on the information about the topic above, please create a concise label for this topic in 1 to 5 words. The label should be neutral, clearly descriptive, and summarize the core theme of the documents. Return only the label and nothing more.

[/INST] Environmental impacts of meat consumption
"""

In [20]:
# Main prompt with documents ([DOCUMENTS]) and keywords ([KEYWORDS]) tags
# The two bracketed tags are placeholders; the template itself is passed unchanged to
# the TextGeneration representation model as part of `prompt`.
main_prompt = """
[INST]
I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: '[KEYWORDS]'.

Based on the information about the topic above, please create a concise label for this topic in 1 to 5 words. The label should be neutral, clearly descriptive, and summarize the core theme of the documents. Return only the label and nothing more.
[/INST]
"""

In [21]:
# Full prompt template: system instructions, then the worked examples, then the
# per-topic request containing the [DOCUMENTS] and [KEYWORDS] placeholders.
prompt = system_prompt + example_prompt + main_prompt

##  Topic Modeling

#### Prepare embeddings


In [22]:
from sentence_transformers import SentenceTransformer

# Pre-calculate embeddings
# The vectors are computed once here and reused later for both the 2-D UMAP projection
# and the BERTopic fit.
embedding_model = SentenceTransformer("BAAI/bge-small-en")
embeddings = embedding_model.encode(news_df.Article, show_progress_bar=True)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/3750 [00:00<?, ?it/s]

### Submodels for dimensionality reduction and clustering


In [23]:
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.cluster import KMeans

# Dimensionality reduction handed to BERTopic: 5 components, cosine metric, fixed seed.
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric="cosine", random_state=777)
# Clustering model handed to BERTopic; clusters need at least 150 members.
hdbscan_cluster = HDBSCAN(min_cluster_size=150, metric="euclidean", cluster_selection_method="eom", prediction_data=True)
# NOTE(review): kmeans_cluster is not referenced by any other cell visible here and has
# no random_state — confirm whether it is still needed.
kmeans_cluster = KMeans(n_clusters = 25)

In [24]:
# Reduce embeddings for visualization purposes
# Same UMAP settings and seed as umap_model, except the projection has 2 components.
reduced_embeddings = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric="cosine", random_state=777).fit_transform(embeddings)

  warn(


### Text Representation Models


In [25]:
from bertopic.representation import KeyBERTInspired, TextGeneration

# Keyword-style topic representation
keybert = KeyBERTInspired()

# LLM-written topic label: the Gemma pipeline driven by the assembled prompt template
gemma = TextGeneration(generator, prompt=prompt)

# Both representations are computed for every topic, under these names
representation_model = dict(KeyBERT=keybert, Gemma=gemma)

## Train model


In [26]:
from bertopic import BERTopic

# Assemble BERTopic from the sub-models configured above: the sentence embedder, the
# 5-component UMAP, the HDBSCAN clusterer and the KeyBERT/Gemma representations.
topic_model = BERTopic(
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model= hdbscan_cluster,
  representation_model=representation_model,
  top_n_words=12,
  verbose=True
)

# Fit on the article texts, passing the pre-computed embeddings alongside them.
# `topics` and `probs` are the two values returned by fit_transform.
topics, probs = topic_model.fit_transform(news_df.Article, embeddings)

2025-05-18 19:19:09,755 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-18 19:21:46,250 - BERTopic - Dimensionality - Completed ✓
2025-05-18 19:21:46,255 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-18 19:22:08,470 - BERTopic - Cluster - Completed ✓
2025-05-18 19:22:08,515 - BERTopic - Representation - Fine-tuning topics using representation models.
  9%|▉         | 10/108 [01:06<11:27,  7.02s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 108/108 [12:25<00:00,  6.90s/it]
2025-05-18 19:34:41,852 - BERTopic - Representation - Completed ✓


### Checks and output cleansing

In [27]:
# Inspect the first two rows of the topic table, transposed so each field is a row.
topic_model.get_topic_info().head(2).T

Unnamed: 0,0,1
Topic,-1,0
Count,39310,5784
Name,-1_to_of_and_the,0_drug_scientists_of_researchers
Representation,"[to, of, and, the, in, its, for, on, said, tha...","[drug, scientists, of, researchers, that, the,..."
KeyBERT,"[reuters, news, business, companies, technolog...","[vioxx, merck, pfizer, drug, arthritis, health..."
Gemma,"[Mergers And Acquisitions News, , , , , , , , , ]","[Drug side effects, , , , , , , , , ]"
Representative_Docs,[LOS ANGELES (Reuters) - Texas Instruments Inc...,[A clinical trial of the blockbuster arthritis...


In [28]:
# Keep the topic table in a DataFrame so cleaned labels can be added as a column.
topics_info_df = topic_model.get_topic_info()

In [29]:
# Derive a cleaned, human-readable label from each topic's raw Gemma output.
topics_info_df["topics"] = topics_info_df["Gemma"].apply(extract_clean_label)

In [30]:
# Register the cleaned labels as custom topic labels and list them.
# A dedicated name is used so the `topics` value returned by fit_transform is not
# overwritten by this list of label strings.
topic_labels = topics_info_df["topics"].tolist()
topic_model.set_topic_labels(topic_labels)
print(f"Identified Topics: {len(topic_labels)}")
for topic_label in topic_labels:
    print(topic_label)

Identified Topics: 108
Mergers And Acquisitions News
Drug side effects
Football News
Game coverage & Baseball News
College Football Game Recaps
Football highlights from around league
Merger & Acquisition News
Political campaigns & elections
Sports coverage during Olympiad events
Wal-mart Sales Performance Analysis
NBA games & highlights
DeltaS potential Bankruptcy Threat
Golfing events featuringTigerWoods
Market fluctuations
Gaming industry trends
AppleS dominance overmusic distribution
Oil price fluctuations
Peace negotiations & conflicts In Dar Fur
Tennis tournaments & player performance
Search applications & developments
Auto industry struggles amidst rising fuel costs
OracleS attempted acquisition of Peoplesoft
Nuclear standoff between US &Iran
Natural disasters affectingJapan
Formula Racing News
EU relations & potential accession negotiations between turkey&eu
Processor advancements
International conflict involving violence & human rights violations
Diplomacy efforts amidst tensio

In [31]:
# Save files for streamlit
import pickle

# Tabular artefacts: the topic table (with cleaned labels) and the articles.
topics_info_df.to_parquet("topics_info_df.parquet", index=False)
news_df.to_parquet("news_articles.parquet", index=False)

# Representative documents per topic.
with open("rep_docs.pickle", "wb") as handle:
    pickle.dump(topic_model.representative_docs_, handle, protocol=pickle.HIGHEST_PROTOCOL)

# 2-D projection used by the document scatter plot.
with open("reduced_embeddings.pickle", "wb") as handle:
    pickle.dump(reduced_embeddings, handle, protocol=pickle.HIGHEST_PROTOCOL)

# With safetensors serialization, save_embedding_model is expected to be the
# model *name* (a string) so the embedder can be re-created on load; passing the
# SentenceTransformer object itself leaves no usable reference in the saved config.
# The name must match the one used to build `embedding_model`.
topic_model.save("topic_model",
                 serialization="safetensors",
                 save_ctfidf=True,
                 save_embedding_model="BAAI/bge-small-en"
                )

# News Dataset Analysis

In [32]:
# Attach a synthetic, seeded-random 'date' column so topics can be plotted over time.
news_df = create_random_dates(news_df)

In [33]:
# Overwrite the earlier export so the saved articles include the new 'date' column.
news_df.to_parquet("news_articles.parquet")

## Topics Over Time

In [34]:
# Timestamps are passed as "YYYY-MM" strings, i.e. one value per calendar month.
topics_over_time = topic_model.topics_over_time(news_df.Article, news_df.date.dt.strftime("%Y-%m"))
# Plot the result for 7 topics (top_n_topics=7), using the cleaned custom labels.
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=7, custom_labels=True,)

97it [01:20,  1.21it/s]


## Visualize Documents

In [35]:
# Document plot using the pre-computed 2-D embeddings; each document is passed as its
# heading, with annotations hidden and document hover left enabled.
topic_model.visualize_documents(news_df.Heading, reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True)

## Document Heatmap

In [44]:
# Heatmap limited to 10 topics (top_n_topics=10), shown with the custom labels.
topic_model.visualize_heatmap(custom_labels=True, top_n_topics=10)

## Topic Visualization

In [37]:
# Topic overview plot, shown with the custom labels.
topic_model.visualize_topics(custom_labels=True)

## Topic Hierarchy

In [38]:
# Hierarchy plot limited to 25 topics (top_n_topics=25), shown with the custom labels.
topic_model.visualize_hierarchy(custom_labels=True, top_n_topics=25)

## Topics per News Type

In [39]:
# The news type of each article is the class used to split the topics.
genres = news_df["NewsType"]

# Compute topic representations per news type, then plot 25 topics with custom labels.
topics_per_class = topic_model.topics_per_class(news_df["Article"], classes = genres)
topic_model.visualize_topics_per_class(topics_per_class, top_n_topics = 25, custom_labels=True)

4it [00:02,  1.71it/s]


In [40]:
# Bar charts for 12 topics. custom_labels=True titles each chart with the cleaned
# label, consistent with every other visualization in this notebook.
topic_model.visualize_barchart(top_n_topics=12, custom_labels=True)

# Content output

In [41]:
%%capture
# Recursively archive /content into /content/topic_modeling.zip; %%capture hides the
# command's output.
# NOTE(review): the archive is written inside the directory being zipped — confirm that
# this is intended.
!zip -r /content/topic_modeling.zip /content

In [42]:
# Record the installed package versions of this runtime in requirements.txt.
!pip freeze > requirements.txt


In [43]:
# Print the Python version of the runtime (uncomment the next line to also list requirements.txt).
# !cat requirements.txt
!python --version



Python 3.11.12
