# BERTopic Exploratory Notebook

This notebook walks through loading a CSV, preprocessing text, training a BERTopic model, and visualizing topics. Adjust paths and parameters as needed.

In [3]:
!pip install pandas bertopic umap-learn hdbscan nltk

Collecting bertopic
  Downloading bertopic-0.17.0-py3-none-any.whl.metadata (23 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloa

In [4]:
import os
os.makedirs('output', exist_ok=True)
import pandas as pd
import nltk
from bertopic import BERTopic

# Download NLTK data
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
# Load your data
# Place your CSV (with a 'text' column) in the same directory as this notebook

df = pd.read_csv('input.csv')
df.head()

Unnamed: 0,blocks,bot_id,bot_profile__app_id,bot_profile__deleted,bot_profile__id,bot_profile__name,bot_profile__team_id,bot_profile__updated,channel_id,client_msg_id,...,text,thread_ts,topic,ts,type,unread_count,upload,user,user_team,username
0,,BDG87ESBD,A8GBNUWU8,,BDG87ESBD,GitHub,T7GMC60JX,2018-10-17 15:42:58,CFGAL9Q1J,,...,,1599934000.0,,2020-09-12 18:10:32,message,,,UDFNJM85P,,
1,,,,,,,,,C01B6H6U8S2,,...,<@U01B4H1FQTS> has joined the channel,1601448000.0,,2020-09-30 06:41:53,message,,,U01B4H1FQTS,,
2,,,,,,,,,C01CPKS98EM,,...,<@U01880WS0EA> has joined the channel,1604009000.0,,2020-10-29 22:03:16,message,,,U01880WS0EA,,
3,"[{""type"": ""rich_text""}]",,,,,,,,C016B3LG5B4,,...,*Dynamic Filtering*\nConsider the dashboard ca...,1594982000.0,,2020-07-17 10:26:05,message,,,U017A55PE9E,,
4,"[{""type"": ""rich_text""}]",,,,,,,,C014LS99C1K,428eaca9-c247-418a-9c54-42990eb192da,...,for the other stuf I think having workers in a...,1593714000.0,,2020-07-02 18:19:50,message,,,U0144UAHXKJ,,


In [7]:
# Preprocessing
def clean(text):
    import re
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text.lower()

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

STOP = set(stopwords.words('english'))

print(f'Stopwords: {STOP}')

def tokenize(text):
    return [w for w in word_tokenize(clean(text)) if w not in STOP and len(w) > 2]

docs = df['text'].dropna().tolist()
tokenized = [tokenize(doc) for doc in docs]

Stopwords: {'whom', 'being', 'd', "we're", 'theirs', "we've", "hasn't", 'through', "weren't", 'of', 'these', 'a', 'about', "she'll", 'for', 'most', 'own', "shan't", 'is', 'doing', 'that', 'your', 'their', 'him', 'had', 'down', 'as', "she'd", 'what', 'why', "you're", 'few', 'don', 'm', 'now', "mustn't", 'not', 'her', 'myself', 'won', 'doesn', 'were', 'shouldn', "should've", 'how', 'against', "don't", 'an', 'll', 'further', "that'll", 'until', 's', "didn't", 'some', "they're", 'me', 'aren', "she's", "they've", 'those', 'i', 'isn', 'be', 'such', 'then', 'up', 'are', 'very', "won't", 'but', 'at', 'he', 'who', "isn't", 'hadn', 'out', "you'll", "he'll", 'by', 'do', 'so', 'before', "aren't", 'nor', 'over', 'should', 'himself', 'there', 'has', 't', 'when', 'the', 'below', 'off', 'been', "haven't", 'during', "he'd", 'yourselves', 'yourself', 'no', "i'd", 're', 'weren', "i've", 'having', 'other', 'they', "it's", 'into', "i'll", 'mustn', 'just', 'will', 'once', 'couldn', "doesn't", 'herself', "mi

In [8]:
# Train BERTopic model
model = BERTopic(nr_topics=20)
topics, probs = model.fit_transform(docs)
print(f'Trained with {len(set(topics))} topics')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Trained with 3 topics


In [10]:
# Assign topics back to DataFrame
# Create a new DataFrame with the 'text' column and corresponding topics
topics_df = pd.DataFrame({'text': docs, 'topic': topics})

# Instead of merging, directly assign the 'topic' column to the original df
# This ensures the 'topic' column is added correctly to the df DataFrame.
df['topic'] = topics_df['topic']

# Save results
export_path = 'output/topics.csv'
df[['text','topic']].to_csv(export_path, index=False)
print(f'Topic assignments saved to {export_path}')

Topic assignments saved to output/topics.csv


In [11]:
# Explore topics information
model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,28,0_u01880ws0ea_joined_channel_has,"[u01880ws0ea, joined, channel, has, the, u80ef...","[<@U01880WS0EA> has joined the channel, <@U018..."
1,1,1398,1_to_the_in_and,"[to, the, in, and, is, superset, for, of, on, it]","[Hi Everyone,\nI have a requirement in which s..."
2,2,263,2_channel_joined_has_the,"[channel, joined, has, the, u014d0q95k5, um4aw...","[<@U014D0Q95K5> has joined the channel, <@U014..."


In [32]:
# Visualize topics
fig = model.visualize_topics()
fig.write_html('output/bertopic_visualization.html')
fig.show()
print('Visualization saved to output/bertopic_visualization.html')

Visualization saved to output/bertopic_visualization.html


## Next steps
- Tweak `nr_topics` to change the number of discovered topics
- Use `model.get_topic(topic_id)` to view the keywords for a specific topic
- Integrate this notebook into a pipeline or wrap in a web app as needed

In [33]:
# Specify the topic ID you want to explore
topic_id = 5  # Replace with the desired topic ID

# Get the keywords for the specified topic
topic_keywords = model.get_topic(topic_id)

# Print the topic keywords
print(f"Keywords for topic {topic_id}:")
for keyword, probability in topic_keywords:
    print(f"- {keyword}: {probability:.4f}")

Keywords for topic 5:
- deleted: 0.9187
- message: 0.8834
- was: 0.6884
- this: 0.4364
- ci: 0.0300
- screenshot: 0.0222
- its: 0.0151
- our: 0.0124
- not: 0.0088
- of: 0.0070
