In [143]:
dataset_input = 'dataset/docs_saved.txt';
# 3. Set up storage directories
import os

output_dir = "results/NepaliBERT_AllDoc"
os.makedirs(output_dir, exist_ok=True)

In [144]:
# Preprocessing steps
# Load Nepali stopwords
# You may need to download a Nepali stopword list
import spacy
import re
import json

# Extract stop words for nepali texts
nepali_stopwords = []
with open('dataset/non-potential-topic-word-list.txt', 'r', encoding='utf-8') as f:
    nepali_stopwords = [line.strip() for line in f]  # Use a set for efficient lookup
    
from sklearn.feature_extraction import text

# Combine with built-in English stopwords
combined_stopwords = list(text.ENGLISH_STOP_WORDS.union(nepali_stopwords))

import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

In [145]:

# Assuming you have your custom_nepali_tokenizer defined as shown previously
from nepalikit.tokenization import Tokenizer

nepali_tokenizer = Tokenizer()
def custom_nepali_tokenizer(text):
    return nepali_tokenizer.tokenize(text, level='word')

In [146]:
# Extract stop words for nepali texts

    
# Stem the extra words
def nepali_stemming(word):

    suffixes = []
    with open('dataset/stemming_words.txt', 'r', encoding='utf-8') as f:
        suffixes = [line.strip() for line in f]  # Use a set for efficient lookup
    
    # Define basic stemming rules
    
    word = word.replace('हरू', '')
    word = word.replace('हरु', '')
    for suffix in suffixes:
        if word.endswith(suffix):
            word = word[: -len(suffix)]
            
    if len(word.strip()) > 1 and word != "र":
        return word
    return ''


In [147]:

# Custom preprocessor to remove numbers and punctuation, keeping only words
def remove_numbers_and_punctuation(text):
    # Remove numbers
    text = re.sub(r'\d+', '', text)  # Remove all digits
    # Optionally, you can also remove punctuation here if needed
    text = re.sub(r'[^\w\s\u0900-\u097F]', '', text)  # Keep only words and Nepali characters
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()  # Replace multiple spaces with single space and strip leading/trailing spaces
    text = text.replace('।','')
    
    text = nepali_stemming(text)
    return text



In [148]:
# Open and read the content of data.txt
with open(dataset_input, 'r', encoding='utf-8') as file:
    content = file.read()

# If the content is already a valid JSON list, just load it
import json

try:
    docs = json.loads(content)
except json.JSONDecodeError:
    # If not, try to manually split paragraphs (fallback)
    docs = [para.strip() for para in content.split('\n') if para.strip()]

# Print the list or process it further
print(docs[0:10])

['यस रिट निवेदन मिसिल संलग्न कागजात अध्ययन गरी निवेदक उपस्थित हुनुभए विद्वान् अधिवक्ता श्री अनन्तराज लुइँटेल श्री बल्लभराज बस्नेत श्री विकास भट्टराई श्री धनेन्द्र श्रेष्ठ श्री श्रवनकुमार चौधरी श्री टेकबहादुर पौडेल श्री सन्तोष भण्डारी श्री गो सुवेदी श्री सपना सुवेदी श्री उज्ज्वल घिमिरे श्री लक्ष्मी थापा निवेदक स्वयं पूर्ण राजवंशी गर्नु बहस सुनियो यस के कसो हो निवेदक मागबमोजिम आदेश किन जारी हुनु नपर्ने हो मागबमोजिम आदेश जारी हुनु नपर्ने कुनै आधार कारण भए सबुद प्रमाण म्याद सूचना पाए मिति बाटा म्यादबाहेक दिन विपक्षी महान्यायाधिवक्ता कार्यालय लिखित जवाफ पेस गर्नु भनी आदेश निवेदन प्रतिलिपि साथै राखी विपक्षी नाम म्याद सूचना जारी गरी लिखित जवाफ परे वा अवधि नाघेपछि नियमानुसार गरी पेस गर्नु प्रस्तुत विषय व्यवस्थापकीय पक्ष अन्तरिम आदेश माग सम्बन्ध विचार गर्दा निवेदक जिकिर नाजायज पनि देखिदैन भने सरकार पनि संवेदनशील नै रहे देखिन्छ निवेदक उठाए कोभिड कोरोना भाइरस सङ्क्रमण भयावह कुनै एक भूगोल मात्र सीमित रहे विषय नरहे स्पष्ट यस जोगिन सङ्क्रमण देश भित्रिनै नदिन विपपूर्व जोखिम व्यवस्थापन समयमै हुन अति आ

In [149]:
def split_by_danda(text):
    return [s.strip() + "।" for s in text.split("।") if s.strip()]

def chunk_sentences(sentences, max_words=700):
    chunks, chunk, count = [], "", 0
    for s in sentences:
        wc = len(s.split())
        if count + wc <= max_words:
            chunk += s + " "
            count += wc
        else:
            chunks.append(chunk.strip())
            chunk = s + " "
            count = wc
    if chunk:
        chunks.append(chunk.strip())
    return chunks

'''chunked_docs = []
for doc in docs:
    sentences = split_by_danda(doc)
    if(sentences != ''):
       chunked_docs.extend(chunk_sentences(sentences))

docs = chunked_docs
print(len(docs))

'''
#print(docs)

"chunked_docs = []\nfor doc in docs:\n    sentences = split_by_danda(doc)\n    if(sentences != ''):\n       chunked_docs.extend(chunk_sentences(sentences))\n\ndocs = chunked_docs\nprint(len(docs))\n\n"

In [150]:
# !git lfs install
# !git clone https://huggingface.co/Rajan/NepaliBERT

from transformers import BertForMaskedLM

from transformers import BertTokenizer
import numpy
import torch 

# At a point where you think memory can be freed, e.g., after training before eval
torch.cuda.empty_cache()


#vocab_file_dir = './NepaliBERT/'
# tokenizer = BertTokenizer.from_pretrained(vocab_file_dir,
#                                         strip_accents=False,
#                                          clean_text=False )

# model = BertForMaskedLM.from_pretrained('./NepaliBERT')


In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
tokenizer = AutoTokenizer.from_pretrained("Shushant/nepaliBERT")
model = AutoModelForMaskedLM.from_pretrained("Shushant/nepaliBERT")

from transformers import pipeline

fill_mask = pipeline( "fill-mask", model=model, tokenizer=tokenizer ) 

In [None]:
import torch

# 2. Define the custom embedding function
def generate_embeddings(documents):
    # Ensure inputs is a list of texts 
    if isinstance(documents, str): 
        documents = [documents]
    
    inputs = tokenizer(documents, padding=True, truncation=True, return_tensors='pt',batch_size = 10)
    #inputs = inputs.to('cuda')
    with torch.no_grad():
        outputs = model(**inputs)
        
    embeddings = outputs.logits[:, 0,:] # [CLS] token embeddings
    return embeddings.cpu().numpy()

In [None]:
# Compute Coherence Score

from gensim.models.coherencemodel import CoherenceModel
from sklearn.feature_extraction.text import CountVectorizer
from gensim.corpora import Dictionary


# Tokenizer and cleaner
def tokenize(text):
    # Remove unwanted characters
    text = re.sub(r'[^\u0900-\u097F\s]', '', text)  # Keep only Devanagari characters
    text = re.sub(r'\s+', ' ', text).strip()        # Normalize whitespace
    
    # Tokenize
    tokens = text.split()
    
    # Remove stopwords and short tokens
    tokens = [token for token in tokens if token not in combined_stopwords and len(token) > 1]
    return tokens

# evaluate coherence score
def get_CoherenceScore(topics_dict,docs):


    # Convert topics to a format suitable for coherence calculation
        
    # Get top-k words per topic
    top_words_per_topic = [
        [word for word, _ in words[:10]] 
        for topic_id, words in topics_dict.items() 
        if topic_id != -1
    ]
    
    # Apply to all documents
    docs_tokenized = [tokenize(doc) for doc in docs]

    # Create a dictionary and corpus from the tokenized documents
    dictionary = Dictionary(docs_tokenized)

    coherence_model = CoherenceModel(
        topics=top_words_per_topic,
        texts=docs_tokenized,          # your tokenized documents
        dictionary=dictionary,
        coherence='c_v',               # or 'c_npmi', 'u_mass', etc.
    )

    coherence_score = coherence_model.get_coherence()
    print(f"Coherence Score: {coherence_score}")
    
    return coherence_score


# evaluation with topic diversity
def get_TopicDiversity(topics_dict):

    # Extract the top words from all topics
    topic_words = [word for topic in topics_dict.values() for word, _ in topic]

    # Calculate the number of unique words and total words
    unique_words = len(set(topic_words))
    total_words = len(topic_words)

    # Compute Topic Diversity
    topic_diversity = unique_words / total_words if total_words > 0 else 0

    print(f"Topic Diversity: {topic_diversity:.2f}")
    return topic_diversity

In [None]:
# Model initialization
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

from bertopic.representation import MaximalMarginalRelevance


from umap import UMAP
from hdbscan import HDBSCAN
import time

import logging
import sys

# ... rest of your code


# 2. Define the parameter grid
param_grid = {
    'n_neighbors': [3, 5, 7,15],
    'n_components': [5, 10, 15],
    'min_topic_size': [5, 10,15],
    'umap_kwargs': [{'metric': 'cosine'}],
    'hdbscan_kwargs': [{'min_cluster_size': 5}],
    'nr_topics': ['auto', 10,15],
    'min_cluster_size': [2, 3],        # HDBSCAN parameter
    'n_gram_range': [(1, 1), (1, 2)],      # CountVectorizer parameter
                      # UMAP parameter (keeping fixed for this example)
   
}


# --- Setup Logging ---
# Configure logging to write to both a file and the console
log_file = output_dir + "/bertopic_run_with_topics.log"
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    handlers=[
        logging.FileHandler(log_file, mode="w", encoding='utf-8'), # 'w' overwrites the log file each run
        logging.StreamHandler(sys.stdout)
    ],
    force=True # Necessary to reconfigure logging if run multiple times in the same session
)
logger = logging.getLogger("BERTopicRunWithTopics")
logger.info("Starting BERTopic script...")


2025-11-13 16:04:30,278 - INFO - Starting BERTopic script...


In [None]:

count = 0

results = []

bestResults = {}
best_cv = -10
best_diversity = -10

min_clust = 2 #5 #3 #7
n_gram = (1,2)
n_neigh = 10 #10 #8 #10 #5
n_comp = 7 #5
min_topic_size = 5
min_df = 0.02 # 0.02  #0.01 #0.02  #0.1
max_df = 0.6  #0.6    #0.8
min_dist = 0.25 #0.1
min_samples = 8 #2
nr_topics = "auto" #128
top_n_words = 10
mmr_diversity = 0.9

min_clust = 4
n_neigh = 7
n_comp = 8
min_topic_size = 5
min_df = 0.001
max_df = 0.6
min_dist = 0.25
min_samples = 3
mmr_diversity = 1.0

run_start_time = time.time()

# Initialize CountVectorizer with custom tokenizer
vectorizer_model = CountVectorizer(preprocessor=remove_numbers_and_punctuation, 
                                    tokenizer=custom_nepali_tokenizer,
                                    stop_words= combined_stopwords,
                                    ngram_range=n_gram,
                                    min_df=min_df,
                                    max_df=max_df)


umap_model = UMAP(n_neighbors=n_neigh,
                    n_components=n_comp, 
                    min_dist=min_dist, 
                    metric='cosine',
                    random_state=42)

hdbscan_model = HDBSCAN(min_cluster_size=min_clust, 
                        min_samples=min_samples,
                        metric='euclidean',
                        cluster_selection_method='eom',
                        prediction_data=True)

# Define the MaximalMarginalRelevance model
# Start with a diversity value around 0.1 to 0.3 and adjust.
# A higher value means more diverse words, but potentially less relevant.
mmr_model = MaximalMarginalRelevance(diversity= mmr_diversity) # You can play with this value

import torch
torch.cuda.empty_cache()

#topic_model = BERTopic(embedding_model=fill_mask, verbose=True)
topic_model = BERTopic(embedding_model=generate_embeddings, 
                        umap_model = umap_model,
                        hdbscan_model=hdbscan_model,
                        vectorizer_model=vectorizer_model,
                        min_topic_size=min_topic_size,  # Increase minimum topic size
                        top_n_words=top_n_words,
                        calculate_probabilities=True,
                        nr_topics=nr_topics ,
                        representation_model=mmr_model, # <--- Add the MMR model here
                        verbose=True)


import torch
torch.cuda.empty_cache()

#embeddings = generate_embeddings(docs)

topics, probabilities = topic_model.fit_transform(docs)

#if(len(topics)> 50):
    #topic_model.reduce_topics(docs, nr_topics=50)

# 5. Print the topics
print(topic_model.get_topics())


2025-11-13 17:52:27,892 - BERTopic - Embedding - Transforming documents to embeddings.


2025-11-13 17:52:27,897 - INFO - Use pytorch device_name: cpu
2025-11-13 17:52:27,898 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


Batches: 100%|██████████| 45/45 [00:53<00:00,  1.18s/it]
2025-11-13 17:53:24,794 - BERTopic - Embedding - Completed ✓
2025-11-13 17:53:24,796 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-13 17:53:27,381 - BERTopic - Dimensionality - Completed ✓
2025-11-13 17:53:27,383 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-13 17:53:27,468 - BERTopic - Cluster - Completed ✓
2025-11-13 17:53:27,469 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-11-13 17:53:29,158 - BERTopic - Representation - Completed ✓
2025-11-13 17:53:29,161 - BERTopic - Topic reduction - Reducing number of topics
2025-11-13 17:53:29,165 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-13 17:53:31,051 - BERTopic - Representation - Completed ✓
2025-11-13 17:53:31,062 - BERTopic - Topic reduction - Reduced number of topics from 4 to 4


{0: [('जग्गा', np.float64(0.01380544386738667)), ('कसुर', np.float64(0.010367141407244151)), ('आधार', np.float64(0.009164639384111503)), ('उल्लेख', np.float64(0.008990559351728248)), ('घटना', np.float64(0.007716409055383194)), ('बकपत्र', np.float64(0.007471084405416882)), ('महल', np.float64(0.0065594975912421005)), ('प्रस्तुत मुद्दा', np.float64(0.006199319895299649)), ('ने', np.float64(0.006188023757773742)), ('विषय', np.float64(0.006117890399696634))], 1: [('संकेत', np.float64(0.08640229454618037)), ('छुट्टे अभियोगपत्र', np.float64(0.07121511301676278)), ('दाबी नाबालक', np.float64(0.07121511301676278)), ('हक छुट्टे', np.float64(0.07121511301676278)), ('दायर व्यहोरा', np.float64(0.06874279061200045)), ('धिमाल साँठगांठ', np.float64(0.06874279061200045)), ('बरामद केजी', np.float64(0.06874279061200045)), ('साँठगांठ', np.float64(0.06874279061200045)), ('गाडे डुँड', np.float64(0.06874279061200045)), ('मसँगे', np.float64(0.06698865744893603))], 2: [('गरिदिएँ', np.float64(0.12984850273882384

In [None]:

# logger.info("Model training complete.")
logger.info(f"Number of topics found: {len(topic_model.get_topic_info())}")

# # # --- Get Topic Information ---
# logger.info("Extracting topic information...")
# topic_info_df = topic_model.get_topic_info()
# logger.info("Top 5 Topic Frequencies:\n" + topic_info_df.to_string())


# # Get detailed topic words and scores
all_topics = topic_model.get_topics() # Returns a dict {topic_id: [(word, score), ...]}
logger.info(f"Extracted details for {len(all_topics)} topics (including outlier topic -1 if present).")

# # --- Save Topic Information ---
# topic_info_csv_path = output_dir + "/"  + "_topic_info.csv"
# topic_words_json_path = output_dir + "/"  +  "_topic_words.json"

# logger.info(f"Saving topic information table to: {topic_info_csv_path}")
# topic_info_df.to_csv(topic_info_csv_path, index=False, encoding='utf-8') # Save DataFrame to CSV

# logger.info(f"Saving detailed topic words and scores to: {topic_words_json_path}")
# with open(topic_words_json_path, 'w', encoding='utf-8') as f:
#     json.dump(all_topics, f,  ensure_ascii=False, indent=4) # Save dictionary to JSON

2025-11-13 17:53:31,490 - INFO - Number of topics found: 4
2025-11-13 17:53:31,492 - INFO - Extracted details for 4 topics (including outlier topic -1 if present).


In [None]:

# Visualizations
import plotly.io as pio
fig = topic_model.visualize_barchart(top_n_topics=15)
fig.show()
fig.write_html(output_dir + '/barchart.html')


In [None]:
fig = topic_model.visualize_topics()
fig.show()
fig.write_html(output_dir + '/topics.html')

coherence_score_cv = get_CoherenceScore(topic_model.get_topics(),docs)
topic_diversity = get_TopicDiversity(topic_model.get_topics())

#logger.info(f"Coherence (c_v): {coherence_score_cv:.4f}")
logger.info(f"topic diversity : {topic_diversity:.4f}")


2025-11-13 17:53:45,877 - INFO - adding document #0 to Dictionary<0 unique tokens: []>
2025-11-13 17:53:46,089 - INFO - built Dictionary<30258 unique tokens: ['अग्राधिकार', 'अत्यावश्यक', 'अधिवक्ता', 'अध्ययन', 'अनन्तराज']...> from 1416 documents (total 494224 corpus positions)
2025-11-13 17:53:46,090 - INFO - Dictionary lifecycle event {'msg': "built Dictionary<30258 unique tokens: ['अग्राधिकार', 'अत्यावश्यक', 'अधिवक्ता', 'अध्ययन', 'अनन्तराज']...> from 1416 documents (total 494224 corpus positions)", 'datetime': '2025-11-13T17:53:46.090805', 'gensim': '4.4.0', 'python': '3.11.9 (tags/v3.11.9:de54cf5, Apr  2 2024, 10:12:12) [MSC v.1938 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.26100-SP0', 'event': 'created'}
2025-11-13 17:53:46,093 - INFO - using ParallelWordOccurrenceAccumulator<processes=11, batch_size=64> to estimate probabilities from sliding windows
2025-11-13 17:53:56,365 - INFO - 1 batches submitted to accumulate stats from 64 documents (13311 virtual)
2025-11-13 17:53:56,369

In [None]:
#topic_model = topic_model.reduce_topics(docs, nr_topics=100)

fig = topic_model.visualize_topics()
fig.show()
fig.write_html(output_dir + '/topics.html')

#fig = topic_model.visualize_term_rank()
#fig.show()
#fig.write_html(output_dir + '/termrank.html')



In [None]:


# num_topics_found = len(topic_model.get_topic_info()) -1 # Subtract outlier topic (-1)

# # --- Calculate Coherence Score (c_v) ---
# if num_topics_found > 0:
#     #logger.info(f"Found {num_topics_found} topics (excluding -1). Calculating coherence...")
    
#     # Prepare topics for Gensim CoherenceModel
#     # Get top N words for each topic
#     bertopic_topics_words = []
    
#     for topic_id in range(num_topics_found): # Iterate from topic 0 upwards
#         # topic_model.get_topic(topic_id) returns list of (word, score)
#         # We need only the words (first element of tuple)
        
#         topic_content = topic_model.get_topic(topic_id)
        
#         if topic_content: # Check if topic actually has words
#             bertopic_topics_words.append([word for word, score in topic_content])
#         else:
#             # Handle cases where a topic ID might exist but has no words (unlikely for 0 to num_topics_found-1 with default settings)
#             bertopic_topics_words.append([])


#     if bertopic_topics_words and any(bertopic_topics_words): # Ensure there are actual topic words
        
#         # Filter out empty lists if any topic had no words
#         bertopic_topics_words_filtered = [t for t in bertopic_topics_words if t]
        
#         if bertopic_topics_words_filtered:
            
#             coherence_score_cv = get_CoherenceScore(topic_model.get_topics(),docs)
#             topic_diversity = get_TopicDiversity(topic_model.get_topics())
            
#             #logger.info(f"Coherence (c_v): {coherence_score_cv:.4f}")
#             logger.info(f"topic diversity : {topic_diversity:.4f}")
 
# # At a point where you think memory can be freed, e.g., after training before eval
# torch.cuda.empty_cache()

In [None]:
coherence_score_cv = get_CoherenceScore(topic_model.get_topics(),docs)
topic_diversity = get_TopicDiversity(topic_model.get_topics())

#logger.info(f"Coherence (c_v): {coherence_score_cv:.4f}")
logger.info(f"topic diversity : {topic_diversity:.4f}")

2025-11-13 17:54:14,347 - INFO - adding document #0 to Dictionary<0 unique tokens: []>
2025-11-13 17:54:14,573 - INFO - built Dictionary<30258 unique tokens: ['अग्राधिकार', 'अत्यावश्यक', 'अधिवक्ता', 'अध्ययन', 'अनन्तराज']...> from 1416 documents (total 494224 corpus positions)
2025-11-13 17:54:14,574 - INFO - Dictionary lifecycle event {'msg': "built Dictionary<30258 unique tokens: ['अग्राधिकार', 'अत्यावश्यक', 'अधिवक्ता', 'अध्ययन', 'अनन्तराज']...> from 1416 documents (total 494224 corpus positions)", 'datetime': '2025-11-13T17:54:14.574279', 'gensim': '4.4.0', 'python': '3.11.9 (tags/v3.11.9:de54cf5, Apr  2 2024, 10:12:12) [MSC v.1938 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.26100-SP0', 'event': 'created'}
2025-11-13 17:54:14,577 - INFO - using ParallelWordOccurrenceAccumulator<processes=11, batch_size=64> to estimate probabilities from sliding windows
2025-11-13 17:54:24,672 - INFO - 1 batches submitted to accumulate stats from 64 documents (13311 virtual)
2025-11-13 17:54:24,676

In [None]:


'''
            logger.info("Topic information saved.")

            # --- Saving the Model (as before) ---
            # Method 1: Safetensors
            embedding_model_name = "NepaliBERT"
            save_dir_st = "my_bertopic_model_safetensors"
            logger.info(f"Saving model using safetensors to: {save_dir_st}")
            topic_model.save(save_dir_st,
                            serialization="safetensors",
                            save_ctfidf=True, # Good to save this for topic representations
                            save_embedding_model=embedding_model_name)
            logger.info("Safetensors model saved.")

            # Method 2: Pickle
            save_path_pickle = output_dir + "/"+ "_my_bertopic_model.pkl"
            logger.info(f"Saving model using pickle to: {save_path_pickle}")
            topic_model.save(save_path_pickle, serialization="pickle")
            logger.info("Pickle model saved.")

            # --- Loading the Model (as before) ---
            # (Loading code remains the same as the previous example)
            logger.info("--- Loading Models ---")

            # Load from Safetensors directory
            logger.info(f"Loading model from safetensors directory: {save_dir_st}")
            loaded_model_st = BERTopic.load(save_dir_st)
            logger.info("Safetensors model loaded.")

            # Load from Pickle file
            logger.info(f"Loading model from pickle file: {save_path_pickle}")
            loaded_model_pkl = BERTopic.load(save_path_pickle)
            logger.info("Pickle model loaded.")
'''            

'\n            logger.info("Topic information saved.")\n\n            # --- Saving the Model (as before) ---\n            # Method 1: Safetensors\n            embedding_model_name = "NepaliBERT"\n            save_dir_st = "my_bertopic_model_safetensors"\n            logger.info(f"Saving model using safetensors to: {save_dir_st}")\n            topic_model.save(save_dir_st,\n                            serialization="safetensors",\n                            save_ctfidf=True, # Good to save this for topic representations\n                            save_embedding_model=embedding_model_name)\n            logger.info("Safetensors model saved.")\n\n            # Method 2: Pickle\n            save_path_pickle = output_dir + "/"+ "_my_bertopic_model.pkl"\n            logger.info(f"Saving model using pickle to: {save_path_pickle}")\n            topic_model.save(save_path_pickle, serialization="pickle")\n            logger.info("Pickle model saved.")\n\n            # --- Loading the Model (as

In [None]:

#coherence_score_cv = get_CoherenceScore(topic_model.get_topics(),docs)
print(f"Coherence Score: {coherence_score_cv:.4f}")

print(f"topic diversity : {topic_diversity:.4f}")

Coherence Score: 0.3733
topic diversity : 1.0000


In [None]:
fig = topic_model.visualize_heatmap()
fig.show()

In [None]:
# # Reduce to fewer, more distinct topics (e.g., 70–80)
topic_model = topic_model.reduce_topics(docs, nr_topics=75)

# Optional: visualize again
topic_model.visualize_topics()

2025-11-13 17:54:25,833 - BERTopic - Topic reduction - Reducing number of topics
2025-11-13 17:54:25,834 - BERTopic - Topic reduction - Number of topics (75) is equal or higher than the clustered topics(4).
2025-11-13 17:54:25,836 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-13 17:54:28,059 - BERTopic - Representation - Completed ✓


In [None]:
coherence_score_cv = get_CoherenceScore(topic_model.get_topics(),docs)
topic_diversity = get_TopicDiversity(topic_model.get_topics())

#logger.info(f"Coherence (c_v): {coherence_score_cv:.4f}")
logger.info(f"topic diversity : {topic_diversity:.4f}")

2025-11-13 17:54:43,326 - INFO - adding document #0 to Dictionary<0 unique tokens: []>
2025-11-13 17:54:43,548 - INFO - built Dictionary<30258 unique tokens: ['अग्राधिकार', 'अत्यावश्यक', 'अधिवक्ता', 'अध्ययन', 'अनन्तराज']...> from 1416 documents (total 494224 corpus positions)
2025-11-13 17:54:43,549 - INFO - Dictionary lifecycle event {'msg': "built Dictionary<30258 unique tokens: ['अग्राधिकार', 'अत्यावश्यक', 'अधिवक्ता', 'अध्ययन', 'अनन्तराज']...> from 1416 documents (total 494224 corpus positions)", 'datetime': '2025-11-13T17:54:43.549963', 'gensim': '4.4.0', 'python': '3.11.9 (tags/v3.11.9:de54cf5, Apr  2 2024, 10:12:12) [MSC v.1938 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.26100-SP0', 'event': 'created'}
2025-11-13 17:54:43,554 - INFO - using ParallelWordOccurrenceAccumulator<processes=11, batch_size=64> to estimate probabilities from sliding windows
2025-11-13 17:54:53,653 - INFO - 1 batches submitted to accumulate stats from 64 documents (13311 virtual)
2025-11-13 17:54:53,656

In [None]:
fig = topic_model.visualize_heatmap()
fig.show()