In [None]:
import pandas as pd
from bertopic import BERTopic

# Report the installed pandas version
print(f"pandas version: {pd.__version__}")

# Smoke test: construct a default BERTopic instance, then confirm it worked
topic_model = BERTopic()
print("BERTopic initialized successfully")


In [None]:
# import pandas as pd
# from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

# Load the data
file_path = '/data_path'  # Update with the correct file path
df = pd.read_csv(file_path)

# `df_pos` is the frame the topic model is trained on. It must be bound here:
# nothing else in the notebook defines it, so relying on it would raise
# NameError after a kernel restart.
# TODO(review): if the CSV also holds rows that should be excluded (the name
# suggests a "positive" subset), replace this with the appropriate filter.
df_pos = df

# Extract the "clean_message" column, dropping missing values so that only
# real messages are passed to the topic model.
train_messages = df_pos['clean_message'].dropna().tolist()
print("for Pos: ", df_pos.shape)



In [None]:
from bertopic.representation import KeyBERTInspired
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, CountVectorizer

# Sentence-embedding backbone for the topic model: the `stsb-bert-large`
# checkpoint from the sentence-transformers namespace.
EMBEDDING_MODEL_NAME = 'sentence-transformers/stsb-bert-large'
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)


# Define custom stop words and vectorizer.
# Domain-specific noise terms that are removed in addition to scikit-learn's
# built-in English stop-word list. This is the single definition of the list.
custom_stop_words = [
    'themehosttest', 'testholothemes', 'version', 'run', 'cpu', 'x86', 'usb',
    'revert', 'not', 'for', 'do', 'type', 'default', 'Move', 'string', 'ff',
    'work', 'pulled', 'variable', 'frame', 'branch', 'option', 'git', 'pack',
    'window', 'docs', 'support', 'doc', 'documentation', 'log', 'IPv4LL',
    'assignments', 'KEY', 'man', 'message', 'label', 'doesn', 'state', 'Dell',
    'core', 'sd', 'let', 'data', 'properties', 't', 'empty', 'example',
    'RUNTIME', 'DIR', 'proof', 'value', 'makes', 'files', 'dhcp', 'client',
    'allow', "don", 'transaction', 'restart', 'service', 'until', 'need',
    'udev', 'path', 'journald', 'jounral', 'systemd', 'added', 'close', 'IMP',
    'odoo', 'bzr', 'revid', 'PR', 'E', 'am', 'update', 'URL', 'call', 'fixe',
    'differential', 'revision', 'summary', 'time', 'instead', 'fbshipit',
    'source', 'Revision', 'thread', 'now', 'cache', 'code', 'using', 'build',
    'make', 'user', 'may', 'read', 'driver', 'file', 'email', 'E mail', 'mail',
    'changes', 'event', 'nougat', 'lollipop', 'mr1', 'tests', 'used',
    'marshmallow', 'values', 'size', 'set', 'device', 'reviewed', 'function',
    'new', 'case', 's', 'check', 'S', 'kernel', 'cc', 'test', 'CTS', 'dev',
    'cts', 'use', 'ID', 'use', 'id', 'add', 'will', 'remove', 'commit',
    'change', 'changeid', 'merge', 'pull', 'request', 'from', 'bhack', 'by',
    'commitid', 'signed',
]

# CountVectorizer lowercases the text before tokenizing (lowercase=True is the
# default) and compares the resulting tokens against stop_words. Mixed-case
# entries such as 'URL' or 'Move' would therefore never match, and a
# multi-word entry such as 'E mail' can never equal a single token.
# Lowercase every entry and split it on whitespace so each term is effective;
# the set also removes duplicates.
normalized_custom_stop_words = {
    part
    for entry in custom_stop_words
    for part in entry.lower().split()
}

# Combine custom stop words with English stop words (sorted for a
# deterministic list).
stop_words = sorted(set(ENGLISH_STOP_WORDS) | normalized_custom_stop_words)
vectorizer_model = CountVectorizer(stop_words=stop_words, ngram_range=(1, 3))


In [None]:

# KeyBERTInspired representation model for fine-tuning topic representations
representation_model = KeyBERTInspired()

# Collect the BERTopic configuration in one place, then build the model from it
bertopic_settings = {
    "embedding_model": embedding_model,
    "vectorizer_model": vectorizer_model,
    "representation_model": representation_model,
    "min_topic_size": 10,  # Adjust based on your data
    "nr_topics": 10,       # Adjust to desired number of topics
}
topic_model = BERTopic(**bertopic_settings)



from pathlib import Path

# Fit the model on the data
# NOTE(review): no random seed is set anywhere in this notebook, so topic
# assignments may differ between runs — confirm whether that is acceptable.
topics, probabilities = topic_model.fit_transform(train_messages)

# Save the model. Create the target directory first so a fresh checkout does
# not fail with FileNotFoundError after the expensive fit has completed.
model_path = "./saved_topic_models/bertopic_model"
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
topic_model.save(model_path)

# Get the topic information
topic_info = topic_model.get_topic_info()

# Display the first 10 rows of the topic table
top_10_topics = topic_info.head(10)
print(top_10_topics)



In [None]:
import pandas as pd
from bertopic import BERTopic

# Restore the BERTopic model from its on-disk location.
# NOTE(review): the file was presumably written with BERTopic's default
# serialization; only load model files from a trusted source — confirm.
saved_model_path = "./saved_topic_models/bertopic_model"
topic_model = BERTopic.load(saved_model_path)

# Function to generate topic names
def generate_topic_names_and_counts(topic_model, top_n_words: int = 10):
    """Build a readable name and a document count for every regular topic.

    Args:
        topic_model: Object exposing ``get_topic_info()`` (a table with
            ``Topic`` and ``Count`` columns) and ``get_topic(topic_id)``
            (a sequence of ``(word, score)`` pairs, or a falsy value when
            the topic is unknown).
        top_n_words: Maximum number of leading words used to form a name.

    Returns:
        A ``(topic_names, topic_counts)`` tuple of dicts keyed by topic id.
        ``topic_names`` maps each id to its top words joined by single
        spaces; ``topic_counts`` maps each id to its ``Count`` value.
        Topic ``-1`` (BERTopic's outlier bucket) and topics for which
        ``get_topic`` returns nothing are left out of both dicts.
    """
    topic_info = topic_model.get_topic_info()
    topic_names = {}
    topic_counts = {}
    for topic_id, count in zip(topic_info['Topic'], topic_info['Count']):
        if topic_id == -1:  # Avoid processing the -1 topic
            continue
        topic = topic_model.get_topic(topic_id)
        if not topic:
            continue
        # Topic representations can contain empty-string placeholder words;
        # drop them so the joined name has no stray or trailing spaces.
        top_words = [word for word, _ in topic[:top_n_words] if word]
        topic_names[topic_id] = " ".join(top_words)
        topic_counts[topic_id] = count
    return topic_names, topic_counts

# Build the per-topic names and document counts from the loaded model
topic_names, topic_counts = generate_topic_names_and_counts(topic_model)

# Report one line per topic: id, joined top words, and document count
print("Topic Names and Counts:")
for topic_id, topic_name in topic_names.items():
    print(f"Topic {topic_id}: {topic_name} (Count: {topic_counts[topic_id]})")

   