In [1]:
# install packages
!pip install bertopic


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bertopic
  Downloading bertopic-0.12.0-py2.py3-none-any.whl (90 kB)
[K     |████████████████████████████████| 90 kB 4.9 MB/s 
[?25hCollecting umap-learn>=0.5.0
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[K     |████████████████████████████████| 88 kB 7.2 MB/s 
[?25hCollecting sentence-transformers>=0.4.1
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 5.3 MB/s 
Collecting hdbscan>=0.8.28
  Downloading hdbscan-0.8.29.tar.gz (5.2 MB)
[K     |████████████████████████████████| 5.2 MB 65.5 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting pyyaml<6.0
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 58.3 MB/s 
Collecti

In [2]:
# manage imports
import pandas as pd
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
import re
from nltk import tokenize
import nltk
# Side effect at import time: downloads the 'punkt' tokenizer data (network access).
# Presumably required by tokenize.sent_tokenize, which the pre-processing cell calls.
nltk.download('punkt')
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [51]:
# Location of the dataset: a semicolon-separated CSV ("2019.csv") hosted in the
# xrtze/BERTopic-Topic-Modeling GitHub repository.
data_url = 'https://raw.githubusercontent.com/xrtze/BERTopic-Topic-Modeling/main/2019.csv'

# Load only the text-bearing columns that the topic model consumes.
text_columns = ['title', 'selftext', 'comments']
data = pd.read_csv(data_url, sep=';', usecols=text_columns)

In [4]:
# Preview the first five rows of the loaded table
data.head(n=5)

Unnamed: 0,title,selftext,comments
0,For privacy newbies - understand your threat m...,[removed],Removed: >Please don’t fuel conspiracy thinki...
1,NLST THE STOCK THAT WILL GO TO JUPITER AND BACK.,"Guys please take a read, this is better than G...","This submission is flaired as ""DD."" If you do ..."
2,How big is “big data?”,"Admittedly, “big data” is a phrase I hear most...","It's not about the size of the data, but how y..."
3,Nokia (NOK) underrated stock deserves more att...,[deleted],This is way better dd than the guy last month ...
4,"$PASO Due Diligence, Rockets Inside!","Hello, and thank you for reading my due dilig...","This submission is flaired as ""DD."" If you do ..."


In [37]:
# Combine documents and pre-process data
# Missing cells become empty strings so the concatenation below never embeds "nan".
data = data.fillna('')
data["documents"] = data["title"].astype('str') + ' ' + data["selftext"].astype('str') + ' ' + data["comments"].astype('str')


def _clean_sentence(sentence: str) -> str:
    """Normalise one tokenized sentence for topic modelling.

    Lower-cases the text, turns newlines into spaces, removes the placeholder
    tokens listed below, replaces '#' with a space, and trims surrounding
    whitespace.

    Returns:
        The cleaned sentence; an empty string when nothing but placeholders
        or whitespace was present.
    """
    text = sentence.lower().replace('\n', ' ')
    # Order matters:
    #  * the full '&#x200b;' entity goes first, otherwise removing only 'x200b'
    #    and '#' leaves a stray '& ;' behind;
    #  * the bracketed placeholders go before the bare word 'removed', otherwise
    #    '[removed]' would degrade to '[]'.
    # NOTE(review): the bare 'removed' replacement also deletes the word inside
    # ordinary sentences -- kept as in the original pipeline; confirm it is intended.
    for junk in ('&#x200b;', '[removed]', '[deleted]', 'removed', 'x200b'):
        text = text.replace(junk, '')
    return text.replace('#', ' ').strip()


# One document per sentence: tokenize every combined row, then flatten.
sentences = [
    sentence
    for row_text in data['documents']
    for sentence in tokenize.sent_tokenize(str(row_text))
]

# Drop sentences that are empty after cleaning (e.g. a sentence that consisted only
# of '[removed]'); empty strings carry no signal and would still be embedded.
documents = [cleaned for cleaned in map(_clean_sentence, sentences) if cleaned]

In [38]:
# Print the first 20 pre-processed sentences as a sanity check
preview = documents[:20]
print(preview)

['for privacy newbies - understand your threat model first.', ' :  >please don’t fuel conspiracy thinking here.', 'don’t try to spread fud, especially against reliable privacy-enhancing software.', 'extraordinary claims require extraordinary evidence.', 'show credible sources. most of these are about anonymity rather than privacy.', 'very cool post nonetheless i find it hard to take this post seriously when you say that firefox is a  copycat of chrome.', 'what the fuck are you on about?', 'you do realise tails uses firefox, right? please explain how firefox is an issue...?  no firefox?', 'even with all the security add-ons?', 'why?', "not trying to argue, i actually don't know why?", 'o thought it was pretty secure with the add-ons.', 'btw excellent post. a good post op.', 'i slightly disagree.', 'for newbies i’d suggest keeping an modest on line identity for convenience e.g.', 'a gmail account and a few google apps or similar on apple.', 'much convenience and little risk.', 'even gmai

In [None]:
# Train the topic model
# Embedding checkpoint in use: 'all-mpnet-base-v2'.
# 'all-MiniLM-L6-v2' is the other checkpoint noted for this cell.
model_name = SentenceTransformer('all-mpnet-base-v2')

# Count-based vectorizer configured with the built-in English stop-word list
vectorizer_model = CountVectorizer(stop_words="english")

model = BERTopic(
    embedding_model=model_name,
    vectorizer_model=vectorizer_model,
    min_topic_size=25,
    nr_topics="auto",
    diversity=0.4,
    verbose=True,
)

# fit_transform yields two values; only the first (kept as reddit_topics) is used
reddit_topics, _ = model.fit_transform(documents)

In [47]:
# Bar charts for 20 topics, 10 words each
barchart_fig = model.visualize_barchart(n_words=10, top_n_topics=20)
barchart_fig

In [53]:
# Topic overview figure for the fitted model
topics_fig = model.visualize_topics()
topics_fig

In [54]:
# Hierarchy figure restricted to 20 topics
hierarchy_fig = model.visualize_hierarchy(top_n_topics=20)
hierarchy_fig