In [1]:
!pip install bertopic

Collecting bertopic
  Downloading bertopic-0.16.3-py3-none-any.whl.metadata (23 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.37-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting sentence-transformers>=0.4.1 (from bertopic)
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting cython<3,>=0.27 (from hdbscan>=0.8.29->bertopic)
  Downloading Cython-0.29.37-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (3.1 kB)
Downloading bertopic-0.16.3-py3-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading hdbscan-0.8.37-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m44.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sentence_transformers-3.0.

In [2]:
import os
import warnings

import spacy
import nltk
import pandas as pd
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.representation import MaximalMarginalRelevance
from bertopic.representation import TextGeneration
from bertopic.vectorizers import ClassTfidfTransformer
# Changed to CPU supported modules due to unavailability 
#from cuml.cluster import HDBSCAN
#from cuml.manifold import UMAP
from hdbscan import HDBSCAN
import umap
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from transformers import AutoTokenizer
from transformers import pipeline, set_seed

warnings.filterwarnings("ignore", category=DeprecationWarning)      
set_seed(42)

2024-07-29 01:42:45.713451: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-29 01:42:45.713566: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-29 01:42:45.842591: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
class BertopicModel:

    def __init__(self, nr_topics):

        self.nr_topics = nr_topics
        # Prepare stopwords list
        nltk.download('stopwords')
        self.stop_words = set(stopwords.words('english'))
        self.vectorizer_model = CountVectorizer(ngram_range=(1, 1), stop_words=list(
            self.stop_words))  # max_df=0.90, min_df=0.005) #percentage threshold to remove words based on occurence in documents
        self.tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-large-v2')

    def clean(self, df, column):
        df = df.loc[df[column].notnull(), :]
        documents = df[column].to_list()
        return df, documents

    def get_representation_model(self):

        prompt = """I have a topic described by the following keywords: [KEYWORDS] and  [Documents]

                    Based on the previous keywords, what is this topic about?"""

        # Create your representation model
        generator = pipeline('text2text-generation',
                                model='google/flan-t5-large')
        representation_model_text_generation = TextGeneration(
            generator, prompt=prompt)
        representation_model_keybert = KeyBERTInspired()
        representation_model_mmm = MaximalMarginalRelevance(diversity=0.25)
        representation_model = [representation_model_mmm, representation_model_keybert,
                                representation_model_text_generation]
        return representation_model

    def topic_model(self):
        ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
        # embedding = tokenizer(documents.to_list(), padding=True, truncation=True, max_length=1024, return_tensors='pt')
        # Create instances of GPU-accelerated UMAP and HDBSCAN
        umap_model = umap.UMAP(n_components=5, n_neighbors=15, min_dist=0.0)
        # Optimize UMAP parameters for memory usage
        #umap_model = umap.UMAP(n_components=2, n_neighbors=10, min_dist=0.0, low_memory=True)
        hdbscan_model = HDBSCAN(
            min_samples=10, gen_min_span_tree=True, prediction_data=True)

        topic_model = BERTopic("english",
                                embedding_model=self.tokenizer,
                                verbose=True,
                                nr_topics=self.nr_topics,  # check
                                top_n_words=25,
                                representation_model=self.get_representation_model(),
                                vectorizer_model=self.vectorizer_model,
                                ctfidf_model=ctfidf_model,
                                umap_model=umap_model,
                                hdbscan_model=hdbscan_model
                                )
        return topic_model


    def run(self, input_path, output_path1, output_path2, column, reduce_outliers=True,
        strategy="embeddings"):  # reduce_outliers #optional
        df = pd.read_csv(input_path)
        df, documents = self.clean(df=df, column=column)
        model = self.topic_model()
        # documents_list = documents.to_list()
        topics, probs = model.fit_transform(documents=documents)
        if reduce_outliers:
            # Reduce outliers using the embeddings strategy
            print("Running outlier reduction")
            reduced_topics = model.reduce_outliers(
                documents, topics, strategy=strategy)
            model.update_topics(
                documents, topics=reduced_topics, vectorizer_model=self.vectorizer_model)
        topic_info = model.get_topic_info()
        topic_info.to_csv(output_path1, index=False)  
        document_info = model.get_document_info(documents)
        document_info.to_csv(output_path2, index=False)


In [4]:
# define hyperparameters
nr_topics = 15
input_data = '/kaggle/input/checkpoint3/checkpoint3.csv'
output_path1 = "title_topics.csv"   
output_path2 = "title_clustered.csv" 
column = "title"
        

# run topic modelling
topic_model = BertopicModel(nr_topics=nr_topics)  # get input from get_topic function
topic_model.run(input_path=input_data, output_path1=output_path1, output_path2=output_path2, column=column)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

2024-07-29 01:43:27,188 - BERTopic - Embedding - Transforming documents to embeddings.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/3125 [00:00<?, ?it/s]

2024-07-29 01:43:58,618 - BERTopic - Embedding - Completed ✓
2024-07-29 01:43:58,619 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-29 01:47:21,916 - BERTopic - Dimensionality - Completed ✓
2024-07-29 01:47:21,920 - BERTopic - Cluster - Start clustering the reduced embeddings
  pid = os.fork()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used.

Running outlier reduction




In [5]:
# define hyperparameters
nr_topics = 15
input_data = '/kaggle/input/checkpoint3/checkpoint3.csv'
output_path1 = "text_topics.csv"   
output_path2 = "text_clustered.csv" 
column = "text"
        

# run topic modelling
topic_model = BertopicModel(nr_topics=nr_topics)  # get input from get_topic function
topic_model.run(input_path=input_data, output_path1=output_path1, output_path2=output_path2, column=column)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


2024-07-29 02:55:16,766 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3125 [00:00<?, ?it/s]

2024-07-29 02:56:01,239 - BERTopic - Embedding - Completed ✓
2024-07-29 02:56:01,241 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-29 02:57:12,915 - BERTopic - Dimensionality - Completed ✓
2024-07-29 02:57:12,918 - BERTopic - Cluster - Start clustering the reduced embeddings
  pid = os.fork()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used.

Running outlier reduction


