In [None]:
import os
import warnings

import spacy
import nltk
import pandas as pd
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.representation import MaximalMarginalRelevance
from bertopic.representation import TextGeneration
from bertopic.vectorizers import ClassTfidfTransformer
# Changed to CPU supported modules due to unavailability of CUDA pip wheel in GCR environment
# from cuml.cluster import HDBSCAN
# from cuml.manifold import UMAP
from hdbscan import HDBSCAN
import umap
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from transformers import AutoTokenizer
from transformers import pipeline

warnings.filterwarnings("ignore", category=DeprecationWarning)

class BertopicModel:
    """End-to-end BERTopic pipeline: clean a CSV column, fit topics, export CSVs.

    The pipeline lemmatizes the text with spaCy, fits a BERTopic model that
    uses CPU UMAP + HDBSCAN, optionally reassigns outlier documents, and
    writes the topic table and the per-document table to disk.
    """

    def __init__(self, nr_topics):
        """Prepare the stop-word list, vectorizer and tokenizer.

        Args:
            nr_topics: Value forwarded to ``BERTopic(nr_topics=...)``.
        """
        self.nr_topics = nr_topics
        # The stop-word corpus must be present locally before it can be read.
        nltk.download('stopwords')
        self.stop_words = set(stopwords.words('english'))
        # Unigram counts with English stop words removed. max_df / min_df
        # thresholds were tried previously (0.90 / 0.005) and left disabled.
        self.vectorizer_model = CountVectorizer(
            ngram_range=(1, 1), stop_words=list(self.stop_words))
        self.tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-large-v2')

    def clean(self, df, column):
        """Drop rows with a missing ``column`` value and lemmatize the rest.

        Args:
            df: DataFrame holding the raw text.
            column: Name of the text column to process.

        Returns:
            A ``(filtered_df, documents)`` tuple where ``documents`` is a list
            of lemmatized strings, one per remaining row, in row order.
        """
        df = df.loc[df[column].notnull(), :]
        nlp = spacy.load('en_core_web_sm')
        # Cast to str so a stray non-text cell cannot crash spaCy, and let
        # nlp.pipe batch the texts instead of calling nlp() once per row.
        texts = df[column].astype(str)
        documents = [
            " ".join(token.lemma_ for token in doc)
            for doc in nlp.pipe(texts)
        ]
        return df, documents

    def get_representation_model(self):
        """Build the chain of representation models passed to BERTopic.

        Returns:
            A list ``[MaximalMarginalRelevance, KeyBERTInspired,
            TextGeneration]``; the text generator is a ``flan-t5-large``
            text2text pipeline driven by a keyword/document prompt.
        """
        # The placeholder tags are case-sensitive: "[Documents]" was passed to
        # the generator literally, so the upper-case tag is used instead.
        prompt = """I have a topic described by the following keywords: [KEYWORDS] and  [DOCUMENTS]

                    Based on the previous keywords, what is this topic about?"""

        generator = pipeline('text2text-generation',
                             model='google/flan-t5-large')
        representation_model_text_generation = TextGeneration(
            generator, prompt=prompt)
        representation_model_keybert = KeyBERTInspired()
        representation_model_mmm = MaximalMarginalRelevance(diversity=0.25)
        return [
            representation_model_mmm,
            representation_model_keybert,
            representation_model_text_generation,
        ]

    def topic_model(self):
        """Assemble an unfitted ``BERTopic`` instance from the configured parts.

        Returns:
            A ``BERTopic`` object wired with the class's vectorizer, a
            c-TF-IDF transformer, the representation chain, and CPU UMAP /
            HDBSCAN models.
        """
        ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
        # CPU implementations: the cuML (GPU) variants are not installable in
        # the target environment (see the note next to the imports).
        umap_model = umap.UMAP(n_components=5, n_neighbors=15, min_dist=0.0)
        hdbscan_model = HDBSCAN(
            min_samples=10, gen_min_span_tree=True, prediction_data=True)

        # NOTE(review): embedding_model receives a tokenizer, not an embedding
        # model. BERTopic presumably does not recognise it and falls back to
        # its default English embedder -- confirm, and pass the intended model
        # (e.g. the model name) if e5-large-v2 embeddings are wanted.
        topic_model = BERTopic("english",
                               embedding_model=self.tokenizer,
                               verbose=True,
                               nr_topics=self.nr_topics,  # check
                               top_n_words=25,
                               representation_model=self.get_representation_model(),
                               vectorizer_model=self.vectorizer_model,
                               ctfidf_model=ctfidf_model,
                               umap_model=umap_model,
                               hdbscan_model=hdbscan_model
                               )
        return topic_model

    def run(self, input_path, output_path1, output_path2, column, reduce_outliers=True,
            strategy="embeddings"):
        """Read the input CSV, fit the topic model and write both result CSVs.

        Args:
            input_path: Path of the CSV file to read.
            output_path1: Destination CSV for ``get_topic_info()``.
            output_path2: Destination CSV for the per-document table.
            column: Name of the text column to model.
            reduce_outliers: When true, reassign outlier documents and refresh
                the topic representations before exporting.
            strategy: Strategy name forwarded to ``reduce_outliers``.
        """
        df = pd.read_csv(input_path)
        df, documents = self.clean(df=df, column=column)
        model = self.topic_model()
        topics, _ = model.fit_transform(documents=documents)
        if reduce_outliers:
            print("Running outlier reduction")
            reduced_topics = model.reduce_outliers(
                documents, topics, strategy=strategy)
            # NOTE(review): only the vectorizer is forwarded here; the custom
            # c-TF-IDF and representation models may not be re-applied by
            # update_topics -- confirm against the BERTopic documentation.
            model.update_topics(
                documents, topics=reduced_topics, vectorizer_model=self.vectorizer_model)
        topic_info = model.get_topic_info()
        topic_info.to_csv(output_path1, index=False)
        # BERTopic exposes the per-document table as get_document_info(docs);
        # there is no get_documents() method.
        document_info = model.get_document_info(documents)
        document_info.to_csv(output_path2, index=False)
        
       

In [None]:
# define hyperparameters
nr_topics = 10
input_data = ""  # TODO: set to the path of the input CSV file
output_path1 = "topics.csv"  # receives the topic-level table
output_path2 = "documents_custered.csv"  # receives the document-level table
column = ""  # TODO: set to the name of the text column to model

# run topic modelling
topic_model = BertopicModel(nr_topics=nr_topics)  # get input from get_topic function
topic_model.run(
    input_path=input_data,
    output_path1=output_path1,
    output_path2=output_path2,
    column=column,
)