In [1]:
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import json
import nltk
nltk.download('stopwords')

# Load the scraped posts. Each entry is indexed by 'title' below, so the file
# is expected to hold a list of dicts that carry that key.
with open("../../data/sports.json", 'r', encoding="utf-8") as f:
    data = json.load(f)

# Only the post titles are used as documents for topic modelling.
data_text = [post['title'] for post in data]

# Dimensionality reduction applied to the sentence embeddings before clustering.
# NOTE(review): no random_state is set, so UMAP (and therefore the topics) may
# differ between runs -- pass random_state=<int> if reproducibility matters.
umap_model = UMAP(n_neighbors=3, n_components=3, min_dist=0.05)

# Density-based clustering of the reduced embeddings.
# NOTE(review): prediction_data=True is presumably kept because
# calculate_probabilities=True below needs it -- confirm against BERTopic docs.
hdbscan_model = HDBSCAN(min_cluster_size=40, min_samples=30,
                        prediction_data=True, gen_min_span_tree=True)

# Keep the stop-word list under its own name: rebinding `stopwords` would
# shadow the imported nltk corpus module and make this cell fail with an
# AttributeError the second time it is executed.
custom_stop_words = stopwords.words('english') + ['http', 'https', 'amp', 'com']

embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
# Stop words are removed at the vectorizer stage so they cannot pollute the
# topic representations; unigrams and bigrams are both considered.
vectorizer_model = CountVectorizer(ngram_range=(1, 2),
                                   stop_words=custom_stop_words)

model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    top_n_words=5,
    language='english',
    calculate_probabilities=True,
    verbose=True
)

# Fit the model on the titles; `topics` and `probs` are the two values
# returned by fit_transform for the input documents.
topics, probs = model.fit_transform(data_text)

# Last expression of the cell, so the notebook displays the topic overview.
model.get_topic_info()

  from .autonotebook import tqdm as notebook_tqdm
2023-01-05 19:17:39.967336: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-05 19:17:40.043918: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-01-05 19:17:40.350282: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-01-05 19:17:40.350318: W tensorflow/co

FileNotFoundError: [Errno 2] No such file or directory: '../../../data/sports.json'

In [None]:
# Show BERTopic's bar-chart figure for `model` (built in the first cell).
# NOTE(review): per the BERTopic docs this plots the top terms per topic --
# the model must have been fitted successfully before this cell is run.
model.visualize_barchart()

: 

In [None]:
# Show BERTopic's topic-overview figure for `model` (built in the first cell).
# NOTE(review): per the BERTopic docs this is the inter-topic distance map --
# confirm it renders with the number of topics actually found.
model.visualize_topics()

: 

In [None]:
# Show BERTopic's heatmap figure for `model` at 700x700.
# NOTE(review): per the BERTopic docs this is a topic-similarity matrix and
# n_clusters groups similar topics together -- confirm n_clusters=3 is valid
# for the number of topics the fitted model produced.
model.visualize_heatmap(n_clusters=3, width=700, height=700)

: 

In [None]:
# Show BERTopic's term-rank figure for `model` (built in the first cell).
# NOTE(review): per the BERTopic docs this plots how term scores decline with
# rank for each topic -- requires the model to have been fitted.
model.visualize_term_rank()

: 