In [1]:
# Preprocessing steps
# Load Nepali stopwords
# You may need to download a Nepali stopword list
import spacy
import re
import json

# Extract stop words for nepali texts
nepali_stopwords = []
with open('dataset/non-potential-topic-word-list.txt', 'r', encoding='utf-8') as f:
    nepali_stopwords = [line.strip() for line in f]  # Use a set for efficient lookup
    
from sklearn.feature_extraction import text

# Combine with built-in English stopwords
combined_stopwords = list(text.ENGLISH_STOP_WORDS.union(nepali_stopwords))

In [2]:
output_dir = 'results/LDA/'

# Open and read the content of data.txt
with open('dataset/docs_saved_new.txt', 'r', encoding='utf-8') as file:
    content = file.read()

# If the content is already a valid JSON list, just load it
import json

try:
    docs = json.loads(content)
except json.JSONDecodeError:
    # If not, try to manually split paragraphs (fallback)
    docs = [para.strip() for para in content.split('\n') if para.strip()]

# Print the list or process it further
print(docs[0:10])

FileNotFoundError: [Errno 2] No such file or directory: 'dataset/docs_saved_new.txt'

In [None]:
import gensim
from gensim import corpora
import nltk


# Tokenize and clean
texts = [
    [word for word in doc.split() if word not in combined_stopwords and len(word) > 2]
    for doc in docs
]

In [None]:
# Create dictionary and corpus
dictionary = corpora.Dictionary(texts)

# Filter extremes (optional, tuneable)
dictionary.filter_extremes(no_below=5, no_above=0.5)

corpus = [dictionary.doc2bow(text) for text in texts]

In [None]:
from gensim.models.ldamodel import LdaModel

# Train LDA model
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=50,          # tune this
    random_state=100,
    chunksize=200,          # larger chunks stabilize learning
    passes=20,              # increase passes for better convergence
    alpha='symmetric',      # or 'auto' for self-tuning
    eta='auto',             # improves word-topic distribution balance
    per_word_topics=True
)

In [None]:

# Print top 10 words from each topic
topics = lda_model.print_topics(num_words=10)
for i, topic in topics:
    print(f"Topic {i}: {topic}")

Topic 23: 0.000*"भवितव्य" + 0.000*"मुसलमान" + 0.000*"जगाउने" + 0.000*"झुठ्ठा" + 0.000*"पूर्वरिसइवी" + 0.000*"फर्किआए" + 0.000*"मुक्" + 0.000*"एकआपस" + 0.000*"हिर्काए" + 0.000*"भूमिहार"
Topic 90: 0.000*"भवितव्य" + 0.000*"मुसलमान" + 0.000*"जगाउने" + 0.000*"झुठ्ठा" + 0.000*"पूर्वरिसइवी" + 0.000*"फर्किआए" + 0.000*"मुक्" + 0.000*"एकआपस" + 0.000*"हिर्काए" + 0.000*"भूमिहार"
Topic 2: 0.000*"भवितव्य" + 0.000*"मुसलमान" + 0.000*"जगाउने" + 0.000*"झुठ्ठा" + 0.000*"पूर्वरिसइवी" + 0.000*"फर्किआए" + 0.000*"मुक्" + 0.000*"एकआपस" + 0.000*"हिर्काए" + 0.000*"भूमिहार"
Topic 5: 0.000*"भवितव्य" + 0.000*"मुसलमान" + 0.000*"जगाउने" + 0.000*"झुठ्ठा" + 0.000*"पूर्वरिसइवी" + 0.000*"फर्किआए" + 0.000*"मुक्" + 0.000*"एकआपस" + 0.000*"हिर्काए" + 0.000*"भूमिहार"
Topic 9: 0.000*"भवितव्य" + 0.000*"मुसलमान" + 0.000*"जगाउने" + 0.000*"झुठ्ठा" + 0.000*"पूर्वरिसइवी" + 0.000*"फर्किआए" + 0.000*"मुक्" + 0.000*"एकआपस" + 0.000*"हिर्काए" + 0.000*"भूमिहार"
Topic 56: 0.000*"भवितव्य" + 0.000*"मुसलमान" + 0.000*"जगाउने" + 0.000*"झुठ्ठा" 

In [None]:
from gensim.models import CoherenceModel

coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=texts,
    dictionary=dictionary,
    coherence='c_v'
)


In [None]:
import pandas as pd

topic_words = []
for i, topic in lda_model.show_topics( formatted=False):
    words = [word for word, prob in topic]
    topic_words.append({"Topic": i, "Words": ", ".join(words)})

df = pd.DataFrame(topic_words)
df.to_csv(output_dir + "lda_topics.csv", index=False)

In [None]:
def calculate_topic_diversity(lda_model, top_k=10):
    topic_words = lda_model.show_topics(num_topics=-1, num_words=top_k, formatted=False)

    all_words = []
    for topic in topic_words:
        words = [word for word, _ in topic[1]]
        all_words.extend(words)

    unique_words = set(all_words)
    diversity_score = len(unique_words) / len(all_words)
    return diversity_score

In [None]:

coherence_score = coherence_model_lda.get_coherence()
print(f"Coherence Score: {coherence_score:.4f}")

diversity = calculate_topic_diversity(lda_model, top_k=10)
print(f"Topic Diversity: {diversity:.4f}")

Coherence Score: 0.4302
Topic Diversity: 0.5710


In [None]:
topics[0]

(23,
 '0.000*"भवितव्य" + 0.000*"मुसलमान" + 0.000*"जगाउने" + 0.000*"झुठ्ठा" + 0.000*"पूर्वरिसइवी" + 0.000*"फर्किआए" + 0.000*"मुक्" + 0.000*"एकआपस" + 0.000*"हिर्काए" + 0.000*"भूमिहार"')