In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation


In [2]:
df = pd.read_csv("../data/processed/zerodha_kite_reviews_clean.csv")
df["review_date"] = pd.to_datetime(df["review_date"])

texts = df["clean_text"].tolist()
print("Documents:", len(texts))


Documents: 2271


In [12]:
# Domain-specific stopwords to reduce generic noise
domain_stopwords = [
    "app", "apps", "use", "using",
    "good", "nice", "best", "great",
    "ok", "okay"
]

In [13]:
vectorizer = TfidfVectorizer(
    max_df=0.9,
    min_df=20,
    ngram_range=(1, 2),
    stop_words=domain_stopwords
)

X = vectorizer.fit_transform(texts)
print("TF-IDF shape:", X.shape)

TF-IDF shape: (2271, 196)


In [14]:
n_topics = 6

lda = LatentDirichletAllocation(
    n_components=n_topics,
    random_state=42,
    learning_method="batch"
)

lda.fit(X)

0,1,2
,"n_components  n_components: int, default=10 Number of topics. .. versionchanged:: 0.19  ``n_topics`` was renamed to ``n_components``",6
,"doc_topic_prior  doc_topic_prior: float, default=None Prior of document topic distribution `theta`. If the value is None, defaults to `1 / n_components`. In [1]_, this is called `alpha`.",
,"topic_word_prior  topic_word_prior: float, default=None Prior of topic word distribution `beta`. If the value is None, defaults to `1 / n_components`. In [1]_, this is called `eta`.",
,"learning_method  learning_method: {'batch', 'online'}, default='batch' Method used to update `_component`. Only used in :meth:`fit` method. In general, if the data size is large, the online update will be much faster than the batch update. Valid options: - 'batch': Batch variational Bayes method. Use all training data in each EM  update. Old `components_` will be overwritten in each iteration. - 'online': Online variational Bayes method. In each EM update, use mini-batch  of training data to update the ``components_`` variable incrementally. The  learning rate is controlled by the ``learning_decay`` and the  ``learning_offset`` parameters. .. versionchanged:: 0.20  The default learning method is now ``""batch""``.",'batch'
,"learning_decay  learning_decay: float, default=0.7 It is a parameter that control learning rate in the online learning method. The value should be set between (0.5, 1.0] to guarantee asymptotic convergence. When the value is 0.0 and batch_size is ``n_samples``, the update method is same as batch learning. In the literature, this is called kappa.",0.7
,"learning_offset  learning_offset: float, default=10.0 A (positive) parameter that downweights early iterations in online learning. It should be greater than 1.0. In the literature, this is called tau_0.",10.0
,"max_iter  max_iter: int, default=10 The maximum number of passes over the training data (aka epochs). It only impacts the behavior in the :meth:`fit` method, and not the :meth:`partial_fit` method.",10
,"batch_size  batch_size: int, default=128 Number of documents to use in each EM iteration. Only used in online learning.",128
,"evaluate_every  evaluate_every: int, default=-1 How often to evaluate perplexity. Only used in `fit` method. set it to 0 or negative number to not evaluate perplexity in training at all. Evaluating perplexity can help you check convergence in training process, but it will also increase total training time. Evaluating perplexity in every iteration might increase training time up to two-fold.",-1
,"total_samples  total_samples: int, default=1e6 Total number of documents. Only used in the :meth:`partial_fit` method.",1000000.0


In [15]:
feature_names = vectorizer.get_feature_names_out()

def print_topics(model, feature_names, top_n=10):
    for idx, topic in enumerate(model.components_):
        print(f"\nTopic {idx}:")
        print(", ".join(
            feature_names[i]
            for i in topic.argsort()[:-top_n - 1:-1]
        ))

print_topics(lda, feature_names)



Topic 0:
not, issue, time, account, problem, chart, zerodha, always, much, market

Topic 1:
service, excellent, experience, trading, customer, zerodha, support, customer support, customer service, platform

Topic 2:
easy, user, friendly, broker, response, user friendly, india, care, thanks, quick

Topic 3:
option, please, add, like, update, chart, need, one, stocks, feature

Topic 4:
support, not, simple, working, happy, work, helpful, not working, login, zerodha

Topic 5:
application, charges, investment, interface, hai, platform, thank, overall, smooth, fast


In [16]:
topic_probs = lda.transform(X)
df["dominant_topic"] = topic_probs.argmax(axis=1)


In [17]:
df["dominant_topic"].value_counts().sort_index()


dominant_topic
0    677
1    298
2    269
3    546
4    286
5    195
Name: count, dtype: int64

In [18]:
topic_time = (
    df
    .set_index("review_date")
    .groupby([pd.Grouper(freq="W"), "dominant_topic"])
    .size()
    .unstack(fill_value=0)
)

topic_time.head()


dominant_topic,0,1,2,3,4,5
review_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2025-08-24,13,15,8,18,8,6
2025-08-31,29,18,17,17,11,10
2025-09-07,61,19,14,28,25,11
2025-09-14,49,19,23,44,18,14
2025-09-21,40,12,16,42,13,15


In [19]:
df.to_csv("../data/processed/zerodha_kite_reviews_with_topics.csv", index=False)
